From 9e55d798036d646cbc27cd0bb2c849ed2e48ce85 Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Fri, 24 Jan 2025 09:35:35 +0100 Subject: [PATCH 01/72] Reapply "pageserver: revert flush backpressure" (#10270) (#10402) This reapplies #10135. Just removing this flush backpressure without further mitigations caused read amp increases during bulk ingestion (predictably), so it was reverted. We will replace it by compaction-based backpressure. ## Problem In #8550, we made the flush loop wait for uploads after every layer. This was to avoid unbounded buildup of uploads, and to reduce compaction debt. However, the approach has several problems: * It prevents upload parallelism. * It prevents flush and upload pipelining. * It slows down ingestion even when there is no need to backpressure. * It does not directly backpressure based on compaction debt and read amplification. We will instead implement compaction-based backpressure in a PR immediately following this removal (#5415). Touches #5415. Touches #10095. ## Summary of changes Remove waiting on the upload queue in the flush loop. --- pageserver/src/metrics.rs | 25 +---------- pageserver/src/tenant/timeline.rs | 38 ++++------------- test_runner/fixtures/metrics.py | 1 - test_runner/regress/test_branching.py | 13 ++---- test_runner/regress/test_remote_storage.py | 48 ---------------------- 5 files changed, 13 insertions(+), 112 deletions(-) diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index 02467cb6f7..985614b6cf 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -3,7 +3,7 @@ use metrics::{ register_counter_vec, register_gauge_vec, register_histogram, register_histogram_vec, register_int_counter, register_int_counter_pair_vec, register_int_counter_vec, register_int_gauge, register_int_gauge_vec, register_uint_gauge, register_uint_gauge_vec, - Counter, CounterVec, Gauge, GaugeVec, Histogram, HistogramVec, IntCounter, IntCounterPair, + Counter, CounterVec, GaugeVec, Histogram, HistogramVec, IntCounter, IntCounterPair, IntCounterPairVec, IntCounterVec, IntGauge, IntGaugeVec, UIntGauge, UIntGaugeVec, }; use once_cell::sync::Lazy; @@ -395,15 +395,6 @@ pub(crate) static WAIT_LSN_TIME: Lazy = Lazy::new(|| { .expect("failed to define a metric") }); -static FLUSH_WAIT_UPLOAD_TIME: Lazy = Lazy::new(|| { - register_gauge_vec!( - "pageserver_flush_wait_upload_seconds", - "Time spent waiting for preceding uploads during layer flush", - &["tenant_id", "shard_id", "timeline_id"] - ) - .expect("failed to define a metric") -}); - static LAST_RECORD_LSN: Lazy = Lazy::new(|| { register_int_gauge_vec!( "pageserver_last_record_lsn", @@ -2575,7 +2566,6 @@ pub(crate) struct TimelineMetrics { shard_id: String, timeline_id: String, pub flush_time_histo: StorageTimeMetrics, - pub flush_wait_upload_time_gauge: Gauge, pub compact_time_histo: StorageTimeMetrics, pub create_images_time_histo: StorageTimeMetrics, pub logical_size_histo: StorageTimeMetrics, @@ -2621,9 +2611,6 @@ impl TimelineMetrics { &shard_id, &timeline_id, ); - let flush_wait_upload_time_gauge = FLUSH_WAIT_UPLOAD_TIME - .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id]) - .unwrap(); let compact_time_histo = StorageTimeMetrics::new( StorageTimeOperation::Compact, &tenant_id, @@ -2769,7 +2756,6 @@ impl TimelineMetrics { shard_id, timeline_id, flush_time_histo, - flush_wait_upload_time_gauge, compact_time_histo, create_images_time_histo, logical_size_histo, @@ -2819,14 +2805,6 @@ impl TimelineMetrics { self.resident_physical_size_gauge.get() } - 
pub(crate) fn flush_wait_upload_time_gauge_add(&self, duration: f64) { - self.flush_wait_upload_time_gauge.add(duration); - crate::metrics::FLUSH_WAIT_UPLOAD_TIME - .get_metric_with_label_values(&[&self.tenant_id, &self.shard_id, &self.timeline_id]) - .unwrap() - .add(duration); - } - pub(crate) fn shutdown(&self) { let was_shutdown = self .shutdown @@ -2844,7 +2822,6 @@ impl TimelineMetrics { let shard_id = &self.shard_id; let _ = LAST_RECORD_LSN.remove_label_values(&[tenant_id, shard_id, timeline_id]); let _ = DISK_CONSISTENT_LSN.remove_label_values(&[tenant_id, shard_id, timeline_id]); - let _ = FLUSH_WAIT_UPLOAD_TIME.remove_label_values(&[tenant_id, shard_id, timeline_id]); let _ = STANDBY_HORIZON.remove_label_values(&[tenant_id, shard_id, timeline_id]); { RESIDENT_PHYSICAL_SIZE_GLOBAL.sub(self.resident_physical_size_get()); diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 5d348ac474..56f61abc45 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -150,19 +150,15 @@ use self::layer_manager::LayerManager; use self::logical_size::LogicalSize; use self::walreceiver::{WalReceiver, WalReceiverConf}; +use super::config::TenantConf; +use super::remote_timeline_client::index::IndexPart; +use super::remote_timeline_client::RemoteTimelineClient; +use super::secondary::heatmap::{HeatMapLayer, HeatMapTimeline}; +use super::storage_layer::{LayerFringe, LayerVisibilityHint, ReadableLayer}; +use super::upload_queue::NotInitialized; +use super::GcError; use super::{ - config::TenantConf, storage_layer::LayerVisibilityHint, upload_queue::NotInitialized, - MaybeOffloaded, -}; -use super::{debug_assert_current_span_has_tenant_and_timeline_id, AttachedTenantConf}; -use super::{remote_timeline_client::index::IndexPart, storage_layer::LayerFringe}; -use super::{ - remote_timeline_client::RemoteTimelineClient, remote_timeline_client::WaitCompletionError, - storage_layer::ReadableLayer, -}; -use super::{ - secondary::heatmap::{HeatMapLayer, HeatMapTimeline}, - GcError, + debug_assert_current_span_has_tenant_and_timeline_id, AttachedTenantConf, MaybeOffloaded, }; #[cfg(test)] @@ -3886,24 +3882,6 @@ impl Timeline { // release lock on 'layers' }; - // Backpressure mechanism: wait with continuation of the flush loop until we have uploaded all layer files. - // This makes us refuse ingest until the new layers have been persisted to the remote - let start = Instant::now(); - self.remote_client - .wait_completion() - .await - .map_err(|e| match e { - WaitCompletionError::UploadQueueShutDownOrStopped - | WaitCompletionError::NotInitialized( - NotInitialized::ShuttingDown | NotInitialized::Stopped, - ) => FlushLayerError::Cancelled, - WaitCompletionError::NotInitialized(NotInitialized::Uninitialized) => { - FlushLayerError::Other(anyhow!(e).into()) - } - })?; - let duration = start.elapsed().as_secs_f64(); - self.metrics.flush_wait_upload_time_gauge_add(duration); - // FIXME: between create_delta_layer and the scheduling of the upload in `update_metadata_file`, // a compaction can delete the file and then it won't be available for uploads any more. // We still schedule the upload, resulting in an error, but ideally we'd somehow avoid this diff --git a/test_runner/fixtures/metrics.py b/test_runner/fixtures/metrics.py index fd7e193778..37859901d4 100644 --- a/test_runner/fixtures/metrics.py +++ b/test_runner/fixtures/metrics.py @@ -165,7 +165,6 @@ PAGESERVER_PER_TENANT_METRICS: tuple[str, ...] 
= ( "pageserver_evictions_with_low_residence_duration_total", "pageserver_aux_file_estimated_size", "pageserver_valid_lsn_lease_count", - "pageserver_flush_wait_upload_seconds", counter("pageserver_tenant_throttling_count_accounted_start"), counter("pageserver_tenant_throttling_count_accounted_finish"), counter("pageserver_tenant_throttling_wait_usecs_sum"), diff --git a/test_runner/regress/test_branching.py b/test_runner/regress/test_branching.py index 34e4e994cb..a4056404f0 100644 --- a/test_runner/regress/test_branching.py +++ b/test_runner/regress/test_branching.py @@ -19,6 +19,7 @@ from fixtures.pageserver.utils import wait_until_tenant_active from fixtures.utils import query_scalar from performance.test_perf_pgbench import get_scales_matrix from requests import RequestException +from requests.exceptions import RetryError # Test branch creation @@ -176,11 +177,8 @@ def test_cannot_create_endpoint_on_non_uploaded_timeline(neon_env_builder: NeonE env.neon_cli.mappings_map_branch(initial_branch, env.initial_tenant, env.initial_timeline) - with pytest.raises(RuntimeError, match="ERROR: Not found: Timeline"): - env.endpoints.create_start( - initial_branch, tenant_id=env.initial_tenant, basebackup_request_tries=2 - ) - ps_http.configure_failpoints(("before-upload-index-pausable", "off")) + with pytest.raises(RuntimeError, match="is not active, state: Loading"): + env.endpoints.create_start(initial_branch, tenant_id=env.initial_tenant) finally: env.pageserver.stop(immediate=True) @@ -221,10 +219,7 @@ def test_cannot_branch_from_non_uploaded_branch(neon_env_builder: NeonEnvBuilder branch_id = TimelineId.generate() - with pytest.raises( - PageserverApiException, - match="Cannot branch off the timeline that's not present in pageserver", - ): + with pytest.raises(RetryError, match="too many 503 error responses"): ps_http.timeline_create( env.pg_version, env.initial_tenant, diff --git a/test_runner/regress/test_remote_storage.py b/test_runner/regress/test_remote_storage.py index 76a42ef4a2..52b6b254aa 100644 --- a/test_runner/regress/test_remote_storage.py +++ b/test_runner/regress/test_remote_storage.py @@ -784,54 +784,6 @@ def test_empty_branch_remote_storage_upload_on_restart(neon_env_builder: NeonEnv create_thread.join() -def test_paused_upload_stalls_checkpoint( - neon_env_builder: NeonEnvBuilder, -): - """ - This test checks that checkpoints block on uploads to remote storage. - """ - neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS) - - env = neon_env_builder.init_start( - initial_tenant_conf={ - # Set a small compaction threshold - "compaction_threshold": "3", - # Disable GC - "gc_period": "0s", - # disable PITR - "pitr_interval": "0s", - } - ) - - env.pageserver.allowed_errors.append( - f".*PUT.* path=/v1/tenant/{env.initial_tenant}/timeline.* request was dropped before completing" - ) - - tenant_id = env.initial_tenant - timeline_id = env.initial_timeline - - client = env.pageserver.http_client() - layers_at_creation = client.layer_map_info(tenant_id, timeline_id) - deltas_at_creation = len(layers_at_creation.delta_layers()) - assert ( - deltas_at_creation == 1 - ), "are you fixing #5863? make sure we end up with 2 deltas at the end of endpoint lifecycle" - - # Make new layer uploads get stuck. - # Note that timeline creation waits for the initial layers to reach remote storage. - # So at this point, the `layers_at_creation` are in remote storage. 
- client.configure_failpoints(("before-upload-layer-pausable", "pause")) - - with env.endpoints.create_start("main", tenant_id=tenant_id) as endpoint: - # Build two tables with some data inside - endpoint.safe_psql("CREATE TABLE foo AS SELECT x FROM generate_series(1, 10000) g(x)") - wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id) - - with pytest.raises(ReadTimeout): - client.timeline_checkpoint(tenant_id, timeline_id, timeout=5) - client.configure_failpoints(("before-upload-layer-pausable", "off")) - - def wait_upload_queue_empty( client: PageserverHttpClient, tenant_id: TenantId, timeline_id: TimelineId ): From ddb9ae1214a1ab19300514e5487569471b8a551a Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Fri, 24 Jan 2025 10:47:28 +0100 Subject: [PATCH 02/72] pageserver: add compaction backpressure for layer flushes (#10405) ## Problem There is no direct backpressure for compaction and L0 read amplification. This allows a large buildup of compaction debt and read amplification. Resolves #5415. Requires #10402. ## Summary of changes Delay layer flushes based on the number of level 0 delta layers: * `l0_flush_delay_threshold`: delay flushes such that they take 2x as long (default `2 * compaction_threshold`). * `l0_flush_stall_threshold`: stall flushes until level 0 delta layers drop below threshold (default `4 * compaction_threshold`). If either threshold is reached, ephemeral layer rolls also synchronously wait for layer flushes to propagate this backpressure up into WAL ingestion. This will bound the number of frozen layers to 1 once backpressure kicks in, since all other frozen layers must flush before the rolled layer. ## Analysis This will significantly change the compute backpressure characteristics. Recall the three compute backpressure knobs: * `max_replication_write_lag`: 500 MB (based on Pageserver `last_received_lsn`). * `max_replication_flush_lag`: 10 GB (based on Pageserver `disk_consistent_lsn`). * `max_replication_apply_lag`: disabled (based on Pageserver `remote_consistent_lsn`). Previously, the Pageserver would keep ingesting WAL and build up ephemeral layers and L0 layers until the compute hit `max_replication_flush_lag` at 10 GB and began backpressuring. Now, once we delay/stall WAL ingestion, the compute will begin backpressuring after `max_replication_write_lag`, i.e. 500 MB. This is probably a good thing (we're not building up a ton of compaction debt), but we should consider tuning these settings. `max_replication_flush_lag` probably doesn't serve a purpose anymore, and we should consider removing it. Furthermore, the removal of the upload barrier in #10402 will mean that we no longer backpressure flushes based on S3 uploads, since `max_replication_apply_lag` is disabled. We should consider enabling this as well. ### When and what do we compact? Default compaction settings: * `compaction_threshold`: 10 L0 delta layers. * `compaction_period`: 20 seconds (between each compaction loop check). * `checkpoint_distance`: 256 MB (size of L0 delta layers). * `l0_flush_delay_threshold`: 20 L0 delta layers. * `l0_flush_stall_threshold`: 40 L0 delta layers. Compaction characteristics: * Minimum compaction volume: 10 layers * 256 MB = 2.5 GB. * Additional compaction volume (assuming 128 MB/s WAL): 128 MB/s * 20 seconds = 2.5 GB (10 L0 layers). * Required compaction bandwidth: 5.0 GB / 20 seconds = 256 MB/s. ### When do we hit `max_replication_write_lag`? 
Depending on how fast compaction and flushes happens, the compute will backpressure somewhere between `l0_flush_delay_threshold` or `l0_flush_stall_threshold` + `max_replication_write_lag`. * Minimum compute backpressure lag: 20 layers * 256 MB + 500 MB = 5.6 GB * Maximum compute backpressure lag: 40 layers * 256 MB + 500 MB = 10.0 GB This seems like a reasonable range to me. --- control_plane/src/pageserver.rs | 10 + libs/pageserver_api/src/config.rs | 13 +- libs/pageserver_api/src/models.rs | 16 ++ pageserver/src/metrics.rs | 12 +- pageserver/src/tenant.rs | 2 + pageserver/src/tenant/config.rs | 26 ++ pageserver/src/tenant/timeline.rs | 257 +++++++++++++++--- .../fixtures/pageserver/allowed_errors.py | 7 +- test_runner/performance/test_layer_map.py | 2 + .../regress/test_attach_tenant_config.py | 2 + test_runner/regress/test_branch_and_gc.py | 2 + test_runner/regress/test_compatibility.py | 2 +- test_runner/regress/test_recovery.py | 5 +- test_runner/regress/test_remote_storage.py | 2 + test_runner/regress/test_timeline_size.py | 2 +- test_runner/regress/test_vm_bits.py | 3 + 16 files changed, 311 insertions(+), 52 deletions(-) diff --git a/control_plane/src/pageserver.rs b/control_plane/src/pageserver.rs index b33b2877b3..967810ee06 100644 --- a/control_plane/src/pageserver.rs +++ b/control_plane/src/pageserver.rs @@ -352,6 +352,16 @@ impl PageServerNode { .map(serde_json::from_str) .transpose() .context("Failed to parse 'compaction_algorithm' json")?, + l0_flush_delay_threshold: settings + .remove("l0_flush_delay_threshold") + .map(|x| x.parse::()) + .transpose() + .context("Failed to parse 'l0_flush_delay_threshold' as an integer")?, + l0_flush_stall_threshold: settings + .remove("l0_flush_stall_threshold") + .map(|x| x.parse::()) + .transpose() + .context("Failed to parse 'l0_flush_stall_threshold' as an integer")?, gc_horizon: settings .remove("gc_horizon") .map(|x| x.parse::()) diff --git a/libs/pageserver_api/src/config.rs b/libs/pageserver_api/src/config.rs index 4982c6233d..5866145690 100644 --- a/libs/pageserver_api/src/config.rs +++ b/libs/pageserver_api/src/config.rs @@ -254,9 +254,18 @@ pub struct TenantConfigToml { // Duration::ZERO means automatic compaction is disabled. #[serde(with = "humantime_serde")] pub compaction_period: Duration, - // Level0 delta layer threshold for compaction. + /// Level0 delta layer threshold for compaction. pub compaction_threshold: usize, pub compaction_algorithm: crate::models::CompactionAlgorithmSettings, + /// Level0 delta layer threshold at which to delay layer flushes for compaction backpressure, + /// such that they take 2x as long, and start waiting for layer flushes during ephemeral layer + /// rolls. This helps compaction keep up with WAL ingestion, and avoids read amplification + /// blowing up. Should be >compaction_threshold. If None, defaults to 2 * compaction_threshold. + /// 0 to disable. + pub l0_flush_delay_threshold: Option, + /// Level0 delta layer threshold at which to stall layer flushes. 0 to disable. If None, + /// defaults to 4 * compaction_threshold. Must be >compaction_threshold to avoid deadlock. + pub l0_flush_stall_threshold: Option, // Determines how much history is retained, to allow // branching and read replicas at an older point in time. // The unit is #of bytes of WAL. 
@@ -552,6 +561,8 @@ impl Default for TenantConfigToml { compaction_algorithm: crate::models::CompactionAlgorithmSettings { kind: DEFAULT_COMPACTION_ALGORITHM, }, + l0_flush_delay_threshold: None, + l0_flush_stall_threshold: None, gc_horizon: DEFAULT_GC_HORIZON, gc_period: humantime::parse_duration(DEFAULT_GC_PERIOD) .expect("cannot parse default gc period"), diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index fd4879087f..16473415b4 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -462,6 +462,10 @@ pub struct TenantConfigPatch { #[serde(skip_serializing_if = "FieldPatch::is_noop")] pub compaction_algorithm: FieldPatch, #[serde(skip_serializing_if = "FieldPatch::is_noop")] + pub l0_flush_delay_threshold: FieldPatch, + #[serde(skip_serializing_if = "FieldPatch::is_noop")] + pub l0_flush_stall_threshold: FieldPatch, + #[serde(skip_serializing_if = "FieldPatch::is_noop")] pub gc_horizon: FieldPatch, #[serde(skip_serializing_if = "FieldPatch::is_noop")] pub gc_period: FieldPatch, @@ -518,6 +522,8 @@ pub struct TenantConfig { pub compaction_threshold: Option, // defer parsing compaction_algorithm, like eviction_policy pub compaction_algorithm: Option, + pub l0_flush_delay_threshold: Option, + pub l0_flush_stall_threshold: Option, pub gc_horizon: Option, pub gc_period: Option, pub image_creation_threshold: Option, @@ -551,6 +557,8 @@ impl TenantConfig { mut compaction_period, mut compaction_threshold, mut compaction_algorithm, + mut l0_flush_delay_threshold, + mut l0_flush_stall_threshold, mut gc_horizon, mut gc_period, mut image_creation_threshold, @@ -583,6 +591,12 @@ impl TenantConfig { patch.compaction_period.apply(&mut compaction_period); patch.compaction_threshold.apply(&mut compaction_threshold); patch.compaction_algorithm.apply(&mut compaction_algorithm); + patch + .l0_flush_delay_threshold + .apply(&mut l0_flush_delay_threshold); + patch + .l0_flush_stall_threshold + .apply(&mut l0_flush_stall_threshold); patch.gc_horizon.apply(&mut gc_horizon); patch.gc_period.apply(&mut gc_period); patch @@ -635,6 +649,8 @@ impl TenantConfig { compaction_period, compaction_threshold, compaction_algorithm, + l0_flush_delay_threshold, + l0_flush_stall_threshold, gc_horizon, gc_period, image_creation_threshold, diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index 985614b6cf..5247a4a2ac 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -38,6 +38,9 @@ pub(crate) enum StorageTimeOperation { #[strum(serialize = "layer flush")] LayerFlush, + #[strum(serialize = "layer flush delay")] + LayerFlushDelay, + #[strum(serialize = "compact")] Compact, @@ -2508,7 +2511,6 @@ impl Drop for AlwaysRecordingStorageTimeMetricsTimer { impl AlwaysRecordingStorageTimeMetricsTimer { /// Returns the elapsed duration of the timer. 
- #[allow(unused)] pub fn elapsed(&self) -> Duration { self.0.as_ref().expect("not dropped yet").elapsed() } @@ -2566,6 +2568,7 @@ pub(crate) struct TimelineMetrics { shard_id: String, timeline_id: String, pub flush_time_histo: StorageTimeMetrics, + pub flush_delay_histo: StorageTimeMetrics, pub compact_time_histo: StorageTimeMetrics, pub create_images_time_histo: StorageTimeMetrics, pub logical_size_histo: StorageTimeMetrics, @@ -2611,6 +2614,12 @@ impl TimelineMetrics { &shard_id, &timeline_id, ); + let flush_delay_histo = StorageTimeMetrics::new( + StorageTimeOperation::LayerFlushDelay, + &tenant_id, + &shard_id, + &timeline_id, + ); let compact_time_histo = StorageTimeMetrics::new( StorageTimeOperation::Compact, &tenant_id, @@ -2756,6 +2765,7 @@ impl TimelineMetrics { shard_id, timeline_id, flush_time_histo, + flush_delay_histo, compact_time_histo, create_images_time_histo, logical_size_histo, diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index a273ef5d01..efe89cb982 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -5453,6 +5453,8 @@ pub(crate) mod harness { compaction_period: Some(tenant_conf.compaction_period), compaction_threshold: Some(tenant_conf.compaction_threshold), compaction_algorithm: Some(tenant_conf.compaction_algorithm), + l0_flush_delay_threshold: tenant_conf.l0_flush_delay_threshold, + l0_flush_stall_threshold: tenant_conf.l0_flush_stall_threshold, gc_horizon: Some(tenant_conf.gc_horizon), gc_period: Some(tenant_conf.gc_period), image_creation_threshold: Some(tenant_conf.image_creation_threshold), diff --git a/pageserver/src/tenant/config.rs b/pageserver/src/tenant/config.rs index 3db1445f6e..c870ca97b8 100644 --- a/pageserver/src/tenant/config.rs +++ b/pageserver/src/tenant/config.rs @@ -281,6 +281,14 @@ pub struct TenantConfOpt { #[serde(default)] pub compaction_algorithm: Option, + #[serde(skip_serializing_if = "Option::is_none")] + #[serde(default)] + pub l0_flush_delay_threshold: Option, + + #[serde(skip_serializing_if = "Option::is_none")] + #[serde(default)] + pub l0_flush_stall_threshold: Option, + #[serde(skip_serializing_if = "Option::is_none")] #[serde(default)] pub gc_horizon: Option, @@ -394,6 +402,12 @@ impl TenantConfOpt { .as_ref() .unwrap_or(&global_conf.compaction_algorithm) .clone(), + l0_flush_delay_threshold: self + .l0_flush_delay_threshold + .or(global_conf.l0_flush_delay_threshold), + l0_flush_stall_threshold: self + .l0_flush_stall_threshold + .or(global_conf.l0_flush_stall_threshold), gc_horizon: self.gc_horizon.unwrap_or(global_conf.gc_horizon), gc_period: self.gc_period.unwrap_or(global_conf.gc_period), image_creation_threshold: self @@ -458,6 +472,8 @@ impl TenantConfOpt { mut compaction_period, mut compaction_threshold, mut compaction_algorithm, + mut l0_flush_delay_threshold, + mut l0_flush_stall_threshold, mut gc_horizon, mut gc_period, mut image_creation_threshold, @@ -496,6 +512,12 @@ impl TenantConfOpt { .apply(&mut compaction_period); patch.compaction_threshold.apply(&mut compaction_threshold); patch.compaction_algorithm.apply(&mut compaction_algorithm); + patch + .l0_flush_delay_threshold + .apply(&mut l0_flush_delay_threshold); + patch + .l0_flush_stall_threshold + .apply(&mut l0_flush_stall_threshold); patch.gc_horizon.apply(&mut gc_horizon); patch .gc_period @@ -566,6 +588,8 @@ impl TenantConfOpt { compaction_period, compaction_threshold, compaction_algorithm, + l0_flush_delay_threshold, + l0_flush_stall_threshold, gc_horizon, gc_period, image_creation_threshold, @@ -623,6 +647,8 @@ impl 
From for models::TenantConfig { compaction_target_size: value.compaction_target_size, compaction_period: value.compaction_period.map(humantime), compaction_threshold: value.compaction_threshold, + l0_flush_delay_threshold: value.l0_flush_delay_threshold, + l0_flush_stall_threshold: value.l0_flush_stall_threshold, gc_horizon: value.gc_horizon, gc_period: value.gc_period.map(humantime), image_creation_threshold: value.image_creation_threshold, diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 56f61abc45..fffa2c8e2b 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -22,11 +22,11 @@ use enumset::EnumSet; use fail::fail_point; use futures::{stream::FuturesUnordered, StreamExt}; use handle::ShardTimelineId; +use layer_manager::Shutdown; use offload::OffloadError; use once_cell::sync::Lazy; use pageserver_api::models::PageTraceEvent; use pageserver_api::{ - config::tenant_conf_defaults::DEFAULT_COMPACTION_THRESHOLD, key::{ KEY_SIZE, METADATA_KEY_BEGIN_PREFIX, METADATA_KEY_END_PREFIX, NON_INHERITED_RANGE, SPARSE_RANGE, @@ -60,20 +60,14 @@ use utils::{ }; use wal_decoder::serialized_batch::{SerializedValueBatch, ValueMeta}; -use std::sync::atomic::Ordering as AtomicOrdering; -use std::sync::OnceLock; -use std::sync::{Arc, Mutex, RwLock, Weak}; +use std::array; +use std::cmp::{max, min}; +use std::collections::btree_map::Entry; +use std::collections::{BTreeMap, HashMap, HashSet}; +use std::ops::{ControlFlow, Deref, Range}; +use std::sync::atomic::{AtomicBool, AtomicU64, Ordering as AtomicOrdering}; +use std::sync::{Arc, Mutex, OnceLock, RwLock, Weak}; use std::time::{Duration, Instant, SystemTime}; -use std::{ - array, - collections::{BTreeMap, HashMap, HashSet}, - sync::atomic::AtomicU64, -}; -use std::{cmp::min, ops::ControlFlow}; -use std::{ - collections::btree_map::Entry, - ops::{Deref, Range}, -}; use crate::l0_flush::{self, L0FlushGlobalState}; use crate::{ @@ -404,6 +398,9 @@ pub struct Timeline { /// Timeline deletion will acquire both compaction and gc locks in whatever order. compaction_lock: tokio::sync::Mutex<()>, + /// If true, the last compaction failed. + compaction_failed: AtomicBool, + /// Make sure we only have one running gc at a time. /// /// Must only be taken in two places: @@ -1698,13 +1695,25 @@ impl Timeline { return Ok(false); } - match self.get_compaction_algorithm_settings().kind { + let result = match self.get_compaction_algorithm_settings().kind { CompactionAlgorithm::Tiered => { self.compact_tiered(cancel, ctx).await?; Ok(false) } CompactionAlgorithm::Legacy => self.compact_legacy(cancel, options, ctx).await, - } + }; + + // Signal compaction failure to avoid L0 flush stalls when it's broken. + let compaction_failed = match result { + Ok(_) => false, + Err(CompactionError::Offload(_)) => false, // doesn't halt compaction + Err(CompactionError::ShuttingDown) => false, // not a failure + Err(CompactionError::Other(_)) => true, + }; + self.compaction_failed + .store(compaction_failed, AtomicOrdering::Relaxed); + + result } /// Mutate the timeline with a [`TimelineWriter`]. 
@@ -2133,6 +2142,13 @@ impl Timeline { .unwrap_or(self.conf.default_tenant_conf.checkpoint_timeout) } + fn get_compaction_period(&self) -> Duration { + let tenant_conf = self.tenant_conf.load().tenant_conf.clone(); + tenant_conf + .compaction_period + .unwrap_or(self.conf.default_tenant_conf.compaction_period) + } + fn get_compaction_target_size(&self) -> u64 { let tenant_conf = self.tenant_conf.load(); tenant_conf @@ -2149,6 +2165,84 @@ impl Timeline { .unwrap_or(self.conf.default_tenant_conf.compaction_threshold) } + fn get_l0_flush_delay_threshold(&self) -> Option { + // Default to delay L0 flushes at 2x compaction threshold. + const DEFAULT_L0_FLUSH_DELAY_FACTOR: usize = 2; + + // If compaction is disabled, don't delay. + if self.get_compaction_period() == Duration::ZERO { + return None; + } + + let compaction_threshold = self.get_compaction_threshold(); + let tenant_conf = self.tenant_conf.load(); + let l0_flush_delay_threshold = tenant_conf + .tenant_conf + .l0_flush_delay_threshold + .or(self.conf.default_tenant_conf.l0_flush_delay_threshold) + .unwrap_or(DEFAULT_L0_FLUSH_DELAY_FACTOR * compaction_threshold); + + // 0 disables backpressure. + if l0_flush_delay_threshold == 0 { + return None; + } + + // Clamp the flush delay threshold to the compaction threshold; it doesn't make sense to + // backpressure flushes below this. + // TODO: the tenant config should have validation to prevent this instead. + debug_assert!(l0_flush_delay_threshold >= compaction_threshold); + Some(max(l0_flush_delay_threshold, compaction_threshold)) + } + + fn get_l0_flush_stall_threshold(&self) -> Option { + // Default to stall L0 flushes at 4x compaction threshold. + const DEFAULT_L0_FLUSH_STALL_FACTOR: usize = 4; + + // If compaction is disabled, don't stall. + if self.get_compaction_period() == Duration::ZERO { + return None; + } + + // If compaction is failing, don't stall and try to keep the tenant alive. This may not be a + // good idea: read amp can grow unbounded, leading to terrible performance, and we may take + // on unbounded compaction debt that can take a long time to fix once compaction comes back + // online. At least we'll delay flushes, slowing down the growth and buying some time. + if self.compaction_failed.load(AtomicOrdering::Relaxed) { + return None; + } + + let compaction_threshold = self.get_compaction_threshold(); + let tenant_conf = self.tenant_conf.load(); + let l0_flush_stall_threshold = tenant_conf + .tenant_conf + .l0_flush_stall_threshold + .or(self.conf.default_tenant_conf.l0_flush_stall_threshold); + + // Tests sometimes set compaction_threshold=1 to generate lots of layer files, and don't + // handle the 20-second compaction delay. Some (e.g. `test_backward_compatibility`) can't + // easily adjust the L0 backpressure settings, so just disable stalls in this case. + if cfg!(feature = "testing") + && compaction_threshold == 1 + && l0_flush_stall_threshold.is_none() + { + return None; + } + + let l0_flush_stall_threshold = l0_flush_stall_threshold + .unwrap_or(DEFAULT_L0_FLUSH_STALL_FACTOR * compaction_threshold); + + // 0 disables backpressure. + if l0_flush_stall_threshold == 0 { + return None; + } + + // Clamp the flush stall threshold to the compaction threshold; it doesn't make sense to + // backpressure flushes below this. + // TODO: the tenant config should have validation to prevent this instead. 
+ debug_assert!(l0_flush_stall_threshold >= compaction_threshold); + Some(max(l0_flush_stall_threshold, compaction_threshold)) + } + fn get_image_creation_threshold(&self) -> usize { let tenant_conf = self.tenant_conf.load(); tenant_conf @@ -2385,6 +2479,7 @@ impl Timeline { gate: Gate::default(), compaction_lock: tokio::sync::Mutex::default(), + compaction_failed: AtomicBool::default(), gc_lock: tokio::sync::Mutex::default(), standby_horizon: AtomicLsn::new(0), @@ -3600,6 +3695,12 @@ impl Timeline { mut layer_flush_start_rx: tokio::sync::watch::Receiver<(u64, Lsn)>, ctx: &RequestContext, ) { + // Subscribe to L0 delta layer updates, for compaction backpressure. + let mut watch_l0 = match self.layers.read().await.layer_map() { + Ok(lm) => lm.watch_level0_deltas(), + Err(Shutdown) => return, + }; + info!("started flush loop"); loop { tokio::select! { @@ -3630,43 +3731,62 @@ impl Timeline { break Ok(()); } - let timer = self.metrics.flush_time_histo.start_timer(); - - let num_frozen_layers; - let frozen_layer_total_size; - let layer_to_flush = { - let guard = self.layers.read().await; - let Ok(lm) = guard.layer_map() else { + // Fetch the next layer to flush, if any. + let (layer, l0_count, frozen_count, frozen_size) = { + let layers = self.layers.read().await; + let Ok(lm) = layers.layer_map() else { info!("dropping out of flush loop for timeline shutdown"); return; }; - num_frozen_layers = lm.frozen_layers.len(); - frozen_layer_total_size = lm + let l0_count = lm.level0_deltas().len(); + let frozen_count = lm.frozen_layers.len(); + let frozen_size: u64 = lm .frozen_layers .iter() .map(|l| l.estimated_in_mem_size()) - .sum::(); - lm.frozen_layers.front().cloned() - // drop 'layers' lock to allow concurrent reads and writes + .sum(); + let layer = lm.frozen_layers.front().cloned(); + (layer, l0_count, frozen_count, frozen_size) + // drop 'layers' lock }; - let Some(layer_to_flush) = layer_to_flush else { + let Some(layer) = layer else { break Ok(()); }; - if num_frozen_layers - > std::cmp::max( - self.get_compaction_threshold(), - DEFAULT_COMPACTION_THRESHOLD, - ) - && frozen_layer_total_size >= /* 128 MB */ 128000000 - { - tracing::warn!( - "too many frozen layers: {num_frozen_layers} layers with estimated in-mem size of {frozen_layer_total_size} bytes", - ); - } - match self.flush_frozen_layer(layer_to_flush, ctx).await { - Ok(this_layer_to_lsn) => { - flushed_to_lsn = std::cmp::max(flushed_to_lsn, this_layer_to_lsn); + + // Stall flushes to backpressure if compaction can't keep up. This is propagated up + // to WAL ingestion by having ephemeral layer rolls wait for flushes. + // + // NB: the compaction loop only checks `compaction_threshold` every 20 seconds, so + // we can end up stalling before compaction even starts. Consider making it more + // responsive (e.g. via `watch_level0_deltas`). + if let Some(stall_threshold) = self.get_l0_flush_stall_threshold() { + if l0_count >= stall_threshold { + warn!( + "stalling layer flushes for compaction backpressure at {l0_count} \ + L0 layers ({frozen_count} frozen layers with {frozen_size} bytes)" + ); + let stall_timer = self + .metrics + .flush_delay_histo + .start_timer() + .record_on_drop(); + tokio::select! { + result = watch_l0.wait_for(|l0| *l0 < stall_threshold) => { + if let Ok(l0) = result.as_deref() { + let delay = stall_timer.elapsed().as_secs_f64(); + info!("resuming layer flushes at {l0} L0 layers after {delay:.3}s"); + } + }, + _ = self.cancel.cancelled() => {}, + } + continue; // check again } + } + + // Flush the layer. 
+ let flush_timer = self.metrics.flush_time_histo.start_timer(); + match self.flush_frozen_layer(layer, ctx).await { + Ok(layer_lsn) => flushed_to_lsn = max(flushed_to_lsn, layer_lsn), Err(FlushLayerError::Cancelled) => { info!("dropping out of flush loop for timeline shutdown"); return; @@ -3680,7 +3800,30 @@ impl Timeline { break err.map(|_| ()); } } - timer.stop_and_record(); + let flush_duration = flush_timer.stop_and_record(); + + // Delay the next flush to backpressure if compaction can't keep up. We delay by the + // flush duration such that the flush takes 2x as long. This is propagated up to WAL + // ingestion by having ephemeral layer rolls wait for flushes. + if let Some(delay_threshold) = self.get_l0_flush_delay_threshold() { + if l0_count >= delay_threshold { + let delay = flush_duration.as_secs_f64(); + info!( + "delaying layer flush by {delay:.3}s for compaction backpressure at \ + {l0_count} L0 layers ({frozen_count} frozen layers with {frozen_size} bytes)" + ); + let _delay_timer = self + .metrics + .flush_delay_histo + .start_timer() + .record_on_drop(); + tokio::select! { + _ = tokio::time::sleep(flush_duration) => {}, + _ = watch_l0.wait_for(|l0| *l0 < delay_threshold) => {}, + _ = self.cancel.cancelled() => {}, + } + } + } }; // Unsharded tenants should never advance their LSN beyond the end of the @@ -5910,13 +6053,37 @@ impl TimelineWriter<'_> { async fn roll_layer(&mut self, freeze_at: Lsn) -> Result<(), FlushLayerError> { let current_size = self.write_guard.as_ref().unwrap().current_size; + // If layer flushes are backpressured due to compaction not keeping up, wait for the flush + // to propagate the backpressure up into WAL ingestion. + let l0_count = self + .tl + .layers + .read() + .await + .layer_map()? + .level0_deltas() + .len(); + let wait_thresholds = [ + self.get_l0_flush_delay_threshold(), + self.get_l0_flush_stall_threshold(), + ]; + let wait_threshold = wait_thresholds.into_iter().flatten().min(); + // self.write_guard will be taken by the freezing - self.tl + let flush_id = self + .tl .freeze_inmem_layer_at(freeze_at, &mut self.write_guard) .await?; assert!(self.write_guard.is_none()); + if let Some(wait_threshold) = wait_threshold { + if l0_count >= wait_threshold { + info!("layer roll waiting for flush due to compaction backpressure at {l0_count} L0 layers"); + self.tl.wait_flush_completion(flush_id).await?; + } + } + if current_size >= self.get_checkpoint_distance() * 2 { warn!("Flushed oversized open layer with size {}", current_size) } diff --git a/test_runner/fixtures/pageserver/allowed_errors.py b/test_runner/fixtures/pageserver/allowed_errors.py index 5059039678..748ac0d569 100755 --- a/test_runner/fixtures/pageserver/allowed_errors.py +++ b/test_runner/fixtures/pageserver/allowed_errors.py @@ -99,8 +99,11 @@ DEFAULT_PAGESERVER_ALLOWED_ERRORS = ( ".*WARN.*path=/v1/utilization .*request was dropped before completing", # Can happen during shutdown ".*scheduling deletion on drop failed: queue is in state Stopped.*", - # Too many frozen layers error is normal during intensive benchmarks - ".*too many frozen layers.*", + # L0 flush backpressure delays are expected under heavy ingest load. We want to exercise + # this backpressure in tests. 
+ ".*delaying layer flush by \\S+ for compaction backpressure.*", + ".*stalling layer flushes for compaction backpressure.*", + ".*layer roll waiting for flush due to compaction backpressure.*", ) diff --git a/test_runner/performance/test_layer_map.py b/test_runner/performance/test_layer_map.py index 8a4ad2d399..9b159c5fcf 100644 --- a/test_runner/performance/test_layer_map.py +++ b/test_runner/performance/test_layer_map.py @@ -23,6 +23,8 @@ def test_layer_map(neon_env_builder: NeonEnvBuilder, zenbenchmark): "checkpoint_distance": "16384", "compaction_period": "1 s", "compaction_threshold": "1", + "l0_flush_delay_threshold": "0", + "l0_flush_stall_threshold": "0", "compaction_target_size": "16384", } ) diff --git a/test_runner/regress/test_attach_tenant_config.py b/test_runner/regress/test_attach_tenant_config.py index b8d47346a3..1fdba223ad 100644 --- a/test_runner/regress/test_attach_tenant_config.py +++ b/test_runner/regress/test_attach_tenant_config.py @@ -139,6 +139,8 @@ def test_fully_custom_config(positive_env: NeonEnv): fully_custom_config = { "compaction_period": "1h", "compaction_threshold": 13, + "l0_flush_delay_threshold": 25, + "l0_flush_stall_threshold": 42, "compaction_target_size": 1048576, "checkpoint_distance": 10000, "checkpoint_timeout": "13m", diff --git a/test_runner/regress/test_branch_and_gc.py b/test_runner/regress/test_branch_and_gc.py index fccfbc7f09..0e28231a86 100644 --- a/test_runner/regress/test_branch_and_gc.py +++ b/test_runner/regress/test_branch_and_gc.py @@ -64,6 +64,8 @@ def test_branch_and_gc(neon_simple_env: NeonEnv): # tweak the default settings to allow quickly create image layers and L1 layers "compaction_period": "1 s", "compaction_threshold": "2", + "l0_flush_delay_threshold": "20", + "l0_flush_stall_threshold": "40", "image_creation_threshold": "1", # Disable PITR, this test will set an explicit space-based GC limit "pitr_interval": "0 s", diff --git a/test_runner/regress/test_compatibility.py b/test_runner/regress/test_compatibility.py index ac44630d30..cdc6c0053d 100644 --- a/test_runner/regress/test_compatibility.py +++ b/test_runner/regress/test_compatibility.py @@ -143,7 +143,7 @@ def test_create_snapshot( env = neon_env_builder.init_start( initial_tenant_conf={ - # Miniature layers to enable generating non-trivial layer map without writing lots of data + # Miniature layers to enable generating non-trivial layer map without writing lots of data. "checkpoint_distance": f"{128 * 1024}", "compaction_threshold": "1", "compaction_target_size": f"{128 * 1024}", diff --git a/test_runner/regress/test_recovery.py b/test_runner/regress/test_recovery.py index b43a443149..dab01fcd1a 100644 --- a/test_runner/regress/test_recovery.py +++ b/test_runner/regress/test_recovery.py @@ -11,10 +11,13 @@ from fixtures.neon_fixtures import NeonEnvBuilder # Test pageserver recovery after crash # def test_pageserver_recovery(neon_env_builder: NeonEnvBuilder): - # Override default checkpointer settings to run it more often + # Override default checkpointer settings to run it more often. + # This also creates a bunch more L0 layers, so disable backpressure. 
env = neon_env_builder.init_start( initial_tenant_conf={ "checkpoint_distance": "1048576", + "l0_flush_delay_threshold": "0", + "l0_flush_stall_threshold": "0", } ) env.pageserver.is_testing_enabled_or_skip() diff --git a/test_runner/regress/test_remote_storage.py b/test_runner/regress/test_remote_storage.py index 52b6b254aa..f6bc6f6f41 100644 --- a/test_runner/regress/test_remote_storage.py +++ b/test_runner/regress/test_remote_storage.py @@ -539,6 +539,8 @@ def test_timeline_deletion_with_files_stuck_in_upload_queue( # small checkpointing and compaction targets to ensure we generate many operations "checkpoint_distance": f"{64 * 1024}", "compaction_threshold": "1", + "l0_flush_delay_threshold": "0", + "l0_flush_stall_threshold": "0", "compaction_target_size": f"{64 * 1024}", # large horizon to avoid automatic GC (our assert on gc_result below relies on that) "gc_horizon": f"{1024 ** 4}", diff --git a/test_runner/regress/test_timeline_size.py b/test_runner/regress/test_timeline_size.py index 95bf9106cd..e2fdacdbfc 100644 --- a/test_runner/regress/test_timeline_size.py +++ b/test_runner/regress/test_timeline_size.py @@ -440,7 +440,7 @@ def test_timeline_physical_size_post_compaction(neon_env_builder: NeonEnvBuilder env = neon_env_builder.init_start( initial_tenant_conf={ "checkpoint_distance": "100000", - "compaction_period": "10m", + "compaction_period": "0s", } ) pageserver_http = env.pageserver.http_client() diff --git a/test_runner/regress/test_vm_bits.py b/test_runner/regress/test_vm_bits.py index d9e59c71f4..4865178ca8 100644 --- a/test_runner/regress/test_vm_bits.py +++ b/test_runner/regress/test_vm_bits.py @@ -203,6 +203,9 @@ def test_vm_bit_clear_on_heap_lock_blackbox(neon_env_builder: NeonEnvBuilder): "checkpoint_distance": f"{128 * 1024}", "compaction_target_size": f"{128 * 1024}", "compaction_threshold": "1", + # disable L0 backpressure + "l0_flush_delay_threshold": "0", + "l0_flush_stall_threshold": "0", # create image layers eagerly, so that GC can remove some layers "image_creation_threshold": "1", # set PITR interval to be small, so we can do GC From de8276488d10f386166b7063e1d894cbfe8a450f Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Fri, 24 Jan 2025 10:34:57 +0000 Subject: [PATCH 03/72] tests: enable wal reader fanout in tests (#10301) Note: this has to merge after the release is cut on `2025-01-17` for compat tests to start passing. ## Problem SK wal reader fan-out is not enabled in tests by default. ## Summary of changes Enable it. --- test_runner/fixtures/neon_fixtures.py | 1 + 1 file changed, 1 insertion(+) diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 388c1eb046..7e3cc19829 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -4390,6 +4390,7 @@ class Safekeeper(LogUtils): "1s", "--eviction-min-resident", "10s", + "--wal-reader-fanout", ] self.extra_opts = extra_opts From c286fea01896aabc0761b5d880246e93fe60ed4d Mon Sep 17 00:00:00 2001 From: a-masterov <72613290+a-masterov@users.noreply.github.com> Date: Fri, 24 Jan 2025 11:44:48 +0100 Subject: [PATCH 04/72] Print logs in extensions test in another step to improve readability (#10483) ## Problem The containers' log output is mixed with the tests' output, so you must scroll up to find the error. ## Summary of changes Printing of containers' logs moved to a separate step. 
--- .github/workflows/build_and_test.yml | 4 ++-- docker-compose/docker-compose.yml | 4 ++-- docker-compose/docker_compose_test.sh | 8 +------- 3 files changed, 5 insertions(+), 11 deletions(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index b1230879d3..81127f7870 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -820,8 +820,8 @@ jobs: - name: Print logs and clean up if: always() run: | - docker compose -f ./docker-compose/docker-compose.yml logs || 0 - docker compose -f ./docker-compose/docker-compose.yml down + docker compose --profile test-extensions -f ./docker-compose/docker-compose.yml logs || true + docker compose --profile test-extensions -f ./docker-compose/docker-compose.yml down promote-images-dev: needs: [ check-permissions, tag, vm-compute-node-image, neon-image ] diff --git a/docker-compose/docker-compose.yml b/docker-compose/docker-compose.yml index 4f0a887c27..95e4b6fde7 100644 --- a/docker-compose/docker-compose.yml +++ b/docker-compose/docker-compose.yml @@ -150,8 +150,8 @@ services: - REPOSITORY=${REPOSITORY:-neondatabase} - COMPUTE_IMAGE=compute-node-v${PG_VERSION:-16} - TAG=${TAG:-latest} - - http_proxy=$http_proxy - - https_proxy=$https_proxy + - http_proxy=${http_proxy:-} + - https_proxy=${https_proxy:-} environment: - PG_VERSION=${PG_VERSION:-16} #- RUST_BACKTRACE=1 diff --git a/docker-compose/docker_compose_test.sh b/docker-compose/docker_compose_test.sh index 4f1ae64873..f42aca673b 100755 --- a/docker-compose/docker_compose_test.sh +++ b/docker-compose/docker_compose_test.sh @@ -22,7 +22,6 @@ PSQL_OPTION="-h localhost -U cloud_admin -p 55433 -d postgres" cleanup() { echo "show container information" docker ps - docker compose --profile test-extensions -f $COMPOSE_FILE logs echo "stop containers..." docker compose --profile test-extensions -f $COMPOSE_FILE down } @@ -41,7 +40,6 @@ for pg_version in ${TEST_VERSION_ONLY-14 15 16 17}; do cnt=`expr $cnt + 3` if [ $cnt -gt 60 ]; then echo "timeout before the compute is ready." - cleanup exit 1 fi if docker compose --profile test-extensions -f $COMPOSE_FILE logs "compute_is_ready" | grep -q "accepting connections"; then @@ -63,11 +61,9 @@ for pg_version in ${TEST_VERSION_ONLY-14 15 16 17}; do docker cp $TMPDIR/data $COMPUTE_CONTAINER_NAME:/ext-src/pg_hint_plan-src/ rm -rf $TMPDIR # We are running tests now - if docker exec -e SKIP=timescaledb-src,rdkit-src,postgis-src,pgx_ulid-src,pgtap-src,pg_tiktoken-src,pg_jsonschema-src,pg_graphql-src,kq_imcx-src,wal2json_2_5-src \ + if ! docker exec -e SKIP=timescaledb-src,rdkit-src,postgis-src,pgx_ulid-src,pgtap-src,pg_tiktoken-src,pg_jsonschema-src,pg_graphql-src,kq_imcx-src,wal2json_2_5-src \ $TEST_CONTAINER_NAME /run-tests.sh | tee testout.txt then - cleanup - else FAILED=$(tail -1 testout.txt) for d in $FAILED do @@ -77,9 +73,7 @@ for pg_version in ${TEST_VERSION_ONLY-14 15 16 17}; do cat $d/regression.out $d/regression.diffs || true done rm -rf $FAILED - cleanup exit 1 fi fi - cleanup done From dcc437da1dc2225c3d724bbeb44b53ebeab84089 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?JC=20Gr=C3=BCnhage?= Date: Fri, 24 Jan 2025 12:03:39 +0100 Subject: [PATCH 05/72] Make promote-images-prod depend on promote-images-dev (#10494) ## Problem After talking about it again with @bayandin again this should replace the changes from https://github.com/neondatabase/neon/pull/10475. 
While the previous changes worked, they are less visually clear in what happens, and we might end up in a situation where we update `latest`, but don't actually have the tagged image pushed that contains the same changes. The latter would result in potentially hard to debug situations. ## Summary of changes Revert c283aaaf8d66dd04ce463733cf6545269f70f4c9 and make promote-images-prod depend on promote-images-dev instead. --- .github/workflows/build_and_test.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 81127f7870..32b99d9c38 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -859,7 +859,7 @@ jobs: done promote-images-prod: - needs: [ check-permissions, tag, test-images, vm-compute-node-image ] + needs: [ check-permissions, tag, test-images, promote-images-dev ] runs-on: ubuntu-22.04 if: github.ref_name == 'main' || github.ref_name == 'release' || github.ref_name == 'release-proxy' || github.ref_name == 'release-compute' @@ -892,14 +892,14 @@ jobs: run: | for repo in neondatabase 369495373322.dkr.ecr.eu-central-1.amazonaws.com; do docker buildx imagetools create -t $repo/neon:latest \ - neondatabase/neon:${{ needs.tag.outputs.build-tag }} + $repo/neon:${{ needs.tag.outputs.build-tag }} for version in ${VERSIONS}; do docker buildx imagetools create -t $repo/compute-node-${version}:latest \ - neondatabase/compute-node-${version}:${{ needs.tag.outputs.build-tag }} + $repo/compute-node-${version}:${{ needs.tag.outputs.build-tag }} docker buildx imagetools create -t $repo/vm-compute-node-${version}:latest \ - neondatabase/vm-compute-node-${version}:${{ needs.tag.outputs.build-tag }} + $repo/vm-compute-node-${version}:${{ needs.tag.outputs.build-tag }} done done docker buildx imagetools create -t neondatabase/neon-test-extensions-v16:latest \ From d8ab6ddb0fee7912eafa7012435bcdd014543663 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Fri, 24 Jan 2025 14:43:52 +0200 Subject: [PATCH 06/72] Check if relation has storage in calculate_relation_size (#10477) ## Problem Parent of partitioned table has no storage, it relfilelocator is zero. It cab be incorrectly hashed and produce wrong results. See https://github.com/neondatabase/postgres/pull/518 ## Summary of changes This problem is already addressed in pg17. Add the same check for all other PG versions. 
Postgres PRs: https://github.com/neondatabase/postgres/pull/566 https://github.com/neondatabase/postgres/pull/565 https://github.com/neondatabase/postgres/pull/564 Co-authored-by: Konstantin Knizhnik --- vendor/postgres-v14 | 2 +- vendor/postgres-v15 | 2 +- vendor/postgres-v16 | 2 +- vendor/revisions.json | 6 +++--- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/vendor/postgres-v14 b/vendor/postgres-v14 index 5f3b3afdd7..c0aedfd3ca 160000 --- a/vendor/postgres-v14 +++ b/vendor/postgres-v14 @@ -1 +1 @@ -Subproject commit 5f3b3afdd7c24b4a0fd63ecb3288fab472fcc633 +Subproject commit c0aedfd3cac447510a2db843b561f0c52901b679 diff --git a/vendor/postgres-v15 b/vendor/postgres-v15 index 935292e883..355a7c69d3 160000 --- a/vendor/postgres-v15 +++ b/vendor/postgres-v15 @@ -1 +1 @@ -Subproject commit 935292e883298187f112db6e9c7f765037ddcf64 +Subproject commit 355a7c69d3f907f3612eb406cc7b9c2f55d59b59 diff --git a/vendor/postgres-v16 b/vendor/postgres-v16 index 061d563779..3cf7ce1afa 160000 --- a/vendor/postgres-v16 +++ b/vendor/postgres-v16 @@ -1 +1 @@ -Subproject commit 061d56377961ba56998e41b7d5d5e975919ad301 +Subproject commit 3cf7ce1afab75027716d14223f95ddb300754162 diff --git a/vendor/revisions.json b/vendor/revisions.json index a104be8ae0..dba0e67fb4 100644 --- a/vendor/revisions.json +++ b/vendor/revisions.json @@ -5,14 +5,14 @@ ], "v16": [ "16.6", - "061d56377961ba56998e41b7d5d5e975919ad301" + "3cf7ce1afab75027716d14223f95ddb300754162" ], "v15": [ "15.10", - "935292e883298187f112db6e9c7f765037ddcf64" + "355a7c69d3f907f3612eb406cc7b9c2f55d59b59" ], "v14": [ "14.15", - "5f3b3afdd7c24b4a0fd63ecb3288fab472fcc633" + "c0aedfd3cac447510a2db843b561f0c52901b679" ] } From ef2a2555b1c001ee6af9bf4bc9896aa6bd09b9ed Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Fri, 24 Jan 2025 14:55:05 +0100 Subject: [PATCH 07/72] pageserver: tighten compaction failure detection (#10502) ## Problem If compaction fails, we disable L0 flush stalls to avoid persistent stalls. However, the logic would unset the failure marker on offload failures or shutdown. This can lead to sudden L0 flush stalls if we try and fail to offload a timeline with compaction failures, or if there is some kind of shutdown race. Touches #10405. ## Summary of changes Don't touch the compaction failure marker on offload failures or shutdown. --- pageserver/src/tenant/timeline.rs | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index fffa2c8e2b..ee43512501 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -1704,14 +1704,16 @@ impl Timeline { }; // Signal compaction failure to avoid L0 flush stalls when it's broken. - let compaction_failed = match result { - Ok(_) => false, - Err(CompactionError::Offload(_)) => false, // doesn't halt compaction - Err(CompactionError::ShuttingDown) => false, // not a failure - Err(CompactionError::Other(_)) => true, + match result { + Ok(_) => self.compaction_failed.store(false, AtomicOrdering::Relaxed), + Err(CompactionError::Other(_)) => { + self.compaction_failed.store(true, AtomicOrdering::Relaxed) + } + // Don't change the current value on offload failure or shutdown. We don't want to + // abruptly stall nor resume L0 flushes in these cases. 
+ Err(CompactionError::Offload(_)) => {} + Err(CompactionError::ShuttingDown) => {} }; - self.compaction_failed - .store(compaction_failed, AtomicOrdering::Relaxed); result } From 7000aaaf75ddb4451236a9dd41e71247864c2095 Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Fri, 24 Jan 2025 14:55:48 +0000 Subject: [PATCH 08/72] chore: fix h2 stubgen (#10491) ## Problem ## Summary of changes --------- Co-authored-by: Alexander Bayandin --- poetry.lock | 190 +++++++++++++++++--- pyproject.toml | 3 +- test_runner/stubs/h2/__init__.pyi | 1 + test_runner/stubs/h2/config.pyi | 25 +-- test_runner/stubs/h2/connection.pyi | 162 +++++++---------- test_runner/stubs/h2/errors.pyi | 30 ++-- test_runner/stubs/h2/events.pyi | 8 +- test_runner/stubs/h2/exceptions.pyi | 11 +- test_runner/stubs/h2/frame_buffer.pyi | 19 +- test_runner/stubs/h2/settings.pyi | 70 ++++---- test_runner/stubs/h2/stream.pyi | 240 ++++++++++---------------- test_runner/stubs/h2/utilities.pyi | 41 +++-- test_runner/stubs/h2/windows.pyi | 11 +- 13 files changed, 426 insertions(+), 385 deletions(-) diff --git a/poetry.lock b/poetry.lock index 2cd2bc6383..c471d3e69c 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 1.8.5 and should not be changed by hand. +# This file is automatically @generated by Poetry 2.0.1 and should not be changed by hand. [[package]] name = "aiohappyeyeballs" @@ -6,6 +6,7 @@ version = "2.3.5" description = "Happy Eyeballs for asyncio" optional = false python-versions = ">=3.8" +groups = ["main"] files = [ {file = "aiohappyeyeballs-2.3.5-py3-none-any.whl", hash = "sha256:4d6dea59215537dbc746e93e779caea8178c866856a721c9c660d7a5a7b8be03"}, {file = "aiohappyeyeballs-2.3.5.tar.gz", hash = "sha256:6fa48b9f1317254f122a07a131a86b71ca6946ca989ce6326fff54a99a920105"}, @@ -17,6 +18,7 @@ version = "3.10.11" description = "Async http client/server framework (asyncio)" optional = false python-versions = ">=3.8" +groups = ["main"] files = [ {file = "aiohttp-3.10.11-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:5077b1a5f40ffa3ba1f40d537d3bec4383988ee51fbba6b74aa8fb1bc466599e"}, {file = "aiohttp-3.10.11-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:8d6a14a4d93b5b3c2891fca94fa9d41b2322a68194422bef0dd5ec1e57d7d298"}, @@ -128,6 +130,7 @@ version = "1.4.0" description = "Postgres integration with asyncio." 
optional = false python-versions = ">=3.7" +groups = ["main"] files = [ {file = "aiopg-1.4.0-py3-none-any.whl", hash = "sha256:aea46e8aff30b039cfa818e6db4752c97656e893fc75e5a5dc57355a9e9dedbd"}, {file = "aiopg-1.4.0.tar.gz", hash = "sha256:116253bef86b4d954116716d181e9a0294037f266718b2e1c9766af995639d71"}, @@ -146,6 +149,7 @@ version = "1.3.1" description = "aiosignal: a list of registered asynchronous callbacks" optional = false python-versions = ">=3.7" +groups = ["main"] files = [ {file = "aiosignal-1.3.1-py3-none-any.whl", hash = "sha256:f8376fb07dd1e86a584e4fcdec80b36b7f81aac666ebc724e2c090300dd83b17"}, {file = "aiosignal-1.3.1.tar.gz", hash = "sha256:54cd96e15e1649b75d6c87526a6ff0b6c1b0dd3459f43d9ca11d48c339b68cfc"}, @@ -160,6 +164,7 @@ version = "2.13.2" description = "Allure pytest integration" optional = false python-versions = "*" +groups = ["main"] files = [ {file = "allure-pytest-2.13.2.tar.gz", hash = "sha256:22243159e8ec81ce2b5254b4013802198821b1b42f118f69d4a289396607c7b3"}, {file = "allure_pytest-2.13.2-py3-none-any.whl", hash = "sha256:17de9dbee7f61c8e66a5b5e818b00e419dbcea44cb55c24319401ba813220690"}, @@ -175,6 +180,7 @@ version = "2.13.2" description = "Common module for integrate allure with python-based frameworks" optional = false python-versions = ">=3.6" +groups = ["main"] files = [ {file = "allure-python-commons-2.13.2.tar.gz", hash = "sha256:8a03681330231b1deadd86b97ff68841c6591320114ae638570f1ed60d7a2033"}, {file = "allure_python_commons-2.13.2-py3-none-any.whl", hash = "sha256:2bb3646ec3fbf5b36d178a5e735002bc130ae9f9ba80f080af97d368ba375051"}, @@ -190,6 +196,7 @@ version = "0.6.0" description = "Reusable constraint types to use with typing.Annotated" optional = false python-versions = ">=3.8" +groups = ["main"] files = [ {file = "annotated_types-0.6.0-py3-none-any.whl", hash = "sha256:0641064de18ba7a25dee8f96403ebc39113d0cb953a01429249d5c7564666a43"}, {file = "annotated_types-0.6.0.tar.gz", hash = "sha256:563339e807e53ffd9c267e99fc6d9ea23eb8443c08f112651963e24e22f84a5d"}, @@ -201,6 +208,7 @@ version = "4.13.1" description = "ANTLR 4.13.1 runtime for Python 3" optional = false python-versions = "*" +groups = ["main"] files = [ {file = "antlr4-python3-runtime-4.13.1.tar.gz", hash = "sha256:3cd282f5ea7cfb841537fe01f143350fdb1c0b1ce7981443a2fa8513fddb6d1a"}, {file = "antlr4_python3_runtime-4.13.1-py3-none-any.whl", hash = "sha256:78ec57aad12c97ac039ca27403ad61cb98aaec8a3f9bb8144f889aa0fa28b943"}, @@ -212,6 +220,7 @@ version = "4.3.0" description = "High level compatibility layer for multiple asynchronous event loop implementations" optional = false python-versions = ">=3.8" +groups = ["main"] files = [ {file = "anyio-4.3.0-py3-none-any.whl", hash = "sha256:048e05d0f6caeed70d731f3db756d35dcc1f35747c8c403364a8332c630441b8"}, {file = "anyio-4.3.0.tar.gz", hash = "sha256:f75253795a87df48568485fd18cdd2a3fa5c4f7c5be8e5e36637733fce06fed6"}, @@ -232,6 +241,7 @@ version = "4.0.3" description = "Timeout context manager for asyncio programs" optional = false python-versions = ">=3.7" +groups = ["main"] files = [ {file = "async-timeout-4.0.3.tar.gz", hash = "sha256:4640d96be84d82d02ed59ea2b7105a0f7b33abe8703703cd0ab0bf87c427522f"}, {file = "async_timeout-4.0.3-py3-none-any.whl", hash = "sha256:7405140ff1230c310e51dc27b3145b9092d659ce68ff733fb0cefe3ee42be028"}, @@ -243,6 +253,7 @@ version = "0.30.0" description = "An asyncio PostgreSQL driver" optional = false python-versions = ">=3.8.0" +groups = ["main"] files = [ {file = "asyncpg-0.30.0-cp310-cp310-macosx_10_9_x86_64.whl", 
hash = "sha256:bfb4dd5ae0699bad2b233672c8fc5ccbd9ad24b89afded02341786887e37927e"}, {file = "asyncpg-0.30.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:dc1f62c792752a49f88b7e6f774c26077091b44caceb1983509edc18a2222ec0"}, @@ -306,6 +317,7 @@ version = "21.4.0" description = "Classes Without Boilerplate" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" +groups = ["main"] files = [ {file = "attrs-21.4.0-py2.py3-none-any.whl", hash = "sha256:2d27e3784d7a565d36ab851fe94887c5eccd6a463168875832a1be79c82828b4"}, {file = "attrs-21.4.0.tar.gz", hash = "sha256:626ba8234211db98e869df76230a137c4c40a12d72445c45d5f5b716f076e2fd"}, @@ -323,6 +335,7 @@ version = "1.88.0" description = "AWS SAM Translator is a library that transform SAM templates into AWS CloudFormation templates" optional = false python-versions = "!=4.0,<=4.0,>=3.8" +groups = ["main"] files = [ {file = "aws_sam_translator-1.88.0-py3-none-any.whl", hash = "sha256:aa93d498d8de3fb3d485c316155b1628144b823bbc176099a20de06df666fcac"}, {file = "aws_sam_translator-1.88.0.tar.gz", hash = "sha256:e77c65f3488566122277accd44a0f1ec018e37403e0d5fe25120d96e537e91a7"}, @@ -343,6 +356,7 @@ version = "2.10.0" description = "The AWS X-Ray SDK for Python (the SDK) enables Python developers to record and emit information from within their applications to the AWS X-Ray service." optional = false python-versions = "*" +groups = ["main"] files = [ {file = "aws-xray-sdk-2.10.0.tar.gz", hash = "sha256:9b14924fd0628cf92936055864655354003f0b1acc3e1c3ffde6403d0799dd7a"}, {file = "aws_xray_sdk-2.10.0-py2.py3-none-any.whl", hash = "sha256:7551e81a796e1a5471ebe84844c40e8edf7c218db33506d046fec61f7495eda4"}, @@ -358,6 +372,7 @@ version = "2.2.1" description = "Function decoration for backoff and retry" optional = false python-versions = ">=3.7,<4.0" +groups = ["main"] files = [ {file = "backoff-2.2.1-py3-none-any.whl", hash = "sha256:63579f9a0628e06278f7e47b7d7d5b6ce20dc65c5e96a6f3ca99a6adca0396e8"}, {file = "backoff-2.2.1.tar.gz", hash = "sha256:03f829f5bb1923180821643f8753b0502c3b682293992485b0eef2807afa5cba"}, @@ -369,6 +384,7 @@ version = "1.34.11" description = "The AWS SDK for Python" optional = false python-versions = ">= 3.8" +groups = ["main"] files = [ {file = "boto3-1.34.11-py3-none-any.whl", hash = "sha256:1af021e0c6e3040e8de66d403e963566476235bb70f9a8e3f6784813ac2d8026"}, {file = "boto3-1.34.11.tar.gz", hash = "sha256:31c130a40ec0631059b77d7e87f67ad03ff1685a5b37638ac0c4687026a3259d"}, @@ -388,6 +404,7 @@ version = "1.26.16" description = "Type annotations for boto3 1.26.16 generated with mypy-boto3-builder 7.11.11" optional = false python-versions = ">=3.7" +groups = ["main"] files = [ {file = "boto3-stubs-1.26.16.tar.gz", hash = "sha256:618253ae19f1480785759bcaee8c8b10ed3fc037027247c26a3461a50f58406d"}, {file = "boto3_stubs-1.26.16-py3-none-any.whl", hash = "sha256:8cf2925bc3e1349c93eb0f49c1061affc5ca314d69eeb335349037969d0787ed"}, @@ -732,6 +749,7 @@ version = "1.34.11" description = "Low-level, data-driven core of boto 3." 
optional = false python-versions = ">= 3.8" +groups = ["main"] files = [ {file = "botocore-1.34.11-py3-none-any.whl", hash = "sha256:1ff1398b6ea670e1c01ac67a33af3da854f8e700d3528289c04f319c330d8250"}, {file = "botocore-1.34.11.tar.gz", hash = "sha256:51905c3d623c60df5dc5794387de7caf886d350180a01a3dfa762e903edb45a9"}, @@ -751,6 +769,7 @@ version = "1.27.38" description = "Type annotations for botocore 1.27.38 generated with mypy-boto3-builder 7.10.1" optional = false python-versions = ">=3.7" +groups = ["main"] files = [ {file = "botocore-stubs-1.27.38.tar.gz", hash = "sha256:408e8b86b5d171b58f81c74ca9d3b5317a5a8e2d3bc2073aa841ac13b8939e56"}, {file = "botocore_stubs-1.27.38-py3-none-any.whl", hash = "sha256:7add7641e9a479a9c8366893bb522fd9ca3d58714201e43662a200a148a1bc38"}, @@ -765,6 +784,7 @@ version = "2024.7.4" description = "Python package for providing Mozilla's CA Bundle." optional = false python-versions = ">=3.6" +groups = ["main"] files = [ {file = "certifi-2024.7.4-py3-none-any.whl", hash = "sha256:c198e21b1289c2ab85ee4e67bb4b4ef3ead0892059901a8d5b622f24a1101e90"}, {file = "certifi-2024.7.4.tar.gz", hash = "sha256:5a1e7645bc0ec61a09e26c36f6106dd4cf40c6db3a1fb6352b0244e7fb057c7b"}, @@ -776,6 +796,7 @@ version = "1.17.1" description = "Foreign Function Interface for Python calling C code." optional = false python-versions = ">=3.8" +groups = ["main"] files = [ {file = "cffi-1.17.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:df8b1c11f177bc2313ec4b2d46baec87a5f3e71fc8b45dab2ee7cae86d9aba14"}, {file = "cffi-1.17.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:8f2cdc858323644ab277e9bb925ad72ae0e67f69e804f4898c070998d50b1a67"}, @@ -855,6 +876,7 @@ version = "0.87.1" description = "Checks CloudFormation templates for practices and behaviour that could potentially be improved" optional = false python-versions = "!=4.0,<=4.0,>=3.8" +groups = ["main"] files = [ {file = "cfn_lint-0.87.1-py3-none-any.whl", hash = "sha256:d450f450635fc223b6f66880ccac52a5fd1a52966fa1705f1ba52b88dfed3071"}, {file = "cfn_lint-0.87.1.tar.gz", hash = "sha256:b3ce9d3e5e0eadcea5d584c8ccaa00bf2a990a36a64d7ffd8683bc60b7e4f06f"}, @@ -878,6 +900,7 @@ version = "2.1.0" description = "The Real First Universal Charset Detector. Open, modern and actively maintained alternative to Chardet." 
optional = false python-versions = ">=3.6.0" +groups = ["main"] files = [ {file = "charset-normalizer-2.1.0.tar.gz", hash = "sha256:575e708016ff3a5e3681541cb9d79312c416835686d054a23accb873b254f413"}, {file = "charset_normalizer-2.1.0-py3-none-any.whl", hash = "sha256:5189b6f22b01957427f35b6a08d9a0bc45b46d3788ef5a92e978433c7a35f8a5"}, @@ -892,6 +915,7 @@ version = "8.1.3" description = "Composable command line interface toolkit" optional = false python-versions = ">=3.7" +groups = ["main"] files = [ {file = "click-8.1.3-py3-none-any.whl", hash = "sha256:bb4d8133cb15a609f44e8213d9b391b0809795062913b383c62be0ee95b1db48"}, {file = "click-8.1.3.tar.gz", hash = "sha256:7682dc8afb30297001674575ea00d1814d808d6a36af415a82bd481d37ba7b8e"}, @@ -906,6 +930,7 @@ version = "0.7.17" description = "ClickHouse Database Core Driver for Python, Pandas, and Superset" optional = false python-versions = "~=3.8" +groups = ["main"] files = [ {file = "clickhouse-connect-0.7.17.tar.gz", hash = "sha256:854f1f9f3e024e7f89ae5d57cd3289d7a4c3dc91a9f24c4d233014f0ea19cb2d"}, {file = "clickhouse_connect-0.7.17-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:aca36f5f28be1ada2981fce87724bbf451f267c918015baec59e527de3c9c882"}, @@ -996,6 +1021,8 @@ version = "0.4.5" description = "Cross-platform colored terminal text." optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" +groups = ["main"] +markers = "sys_platform == \"win32\" or platform_system == \"Windows\"" files = [ {file = "colorama-0.4.5-py2.py3-none-any.whl", hash = "sha256:854bf444933e37f5824ae7bfc1e98d5bce2ebe4160d46b5edf346a89358e99da"}, {file = "colorama-0.4.5.tar.gz", hash = "sha256:e6c6b4334fc50988a639d9b98aa429a0b57da6e17b9a44f0451f930b6967b7a4"}, @@ -1007,6 +1034,7 @@ version = "43.0.1" description = "cryptography is a package which provides cryptographic recipes and primitives to Python developers." optional = false python-versions = ">=3.7" +groups = ["main"] files = [ {file = "cryptography-43.0.1-cp37-abi3-macosx_10_9_universal2.whl", hash = "sha256:8385d98f6a3bf8bb2d65a73e17ed87a3ba84f6991c155691c51112075f9ffc5d"}, {file = "cryptography-43.0.1-cp37-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:27e613d7077ac613e399270253259d9d53872aaf657471473ebfc9a52935c062"}, @@ -1056,6 +1084,7 @@ version = "7.1.0" description = "A Python library for the Docker Engine API." optional = false python-versions = ">=3.8" +groups = ["main"] files = [ {file = "docker-7.1.0-py3-none-any.whl", hash = "sha256:c96b93b7f0a746f9e77d325bcfb87422a3d8bd4f03136ae8a85b37f1898d5fc0"}, {file = "docker-7.1.0.tar.gz", hash = "sha256:ad8c70e6e3f8926cb8a92619b832b4ea5299e2831c14284663184e200546fa6c"}, @@ -1078,6 +1107,7 @@ version = "1.9.0" description = "execnet: rapid multi-Python deployment" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" +groups = ["main"] files = [ {file = "execnet-1.9.0-py2.py3-none-any.whl", hash = "sha256:a295f7cc774947aac58dde7fdc85f4aa00c42adf5d8f5468fc630c1acf30a142"}, {file = "execnet-1.9.0.tar.gz", hash = "sha256:8f694f3ba9cc92cab508b152dcfe322153975c29bda272e2fd7f3f00f36e47c5"}, @@ -1092,6 +1122,7 @@ version = "2.2.5" description = "A simple framework for building complex web applications." 
optional = false python-versions = ">=3.7" +groups = ["main"] files = [ {file = "Flask-2.2.5-py3-none-any.whl", hash = "sha256:58107ed83443e86067e41eff4631b058178191a355886f8e479e347fa1285fdf"}, {file = "Flask-2.2.5.tar.gz", hash = "sha256:edee9b0a7ff26621bd5a8c10ff484ae28737a2410d99b0bb9a6850c7fb977aa0"}, @@ -1113,6 +1144,7 @@ version = "5.0.0" description = "A Flask extension adding a decorator for CORS support" optional = false python-versions = "*" +groups = ["main"] files = [ {file = "Flask_Cors-5.0.0-py2.py3-none-any.whl", hash = "sha256:b9e307d082a9261c100d8fb0ba909eec6a228ed1b60a8315fd85f783d61910bc"}, {file = "flask_cors-5.0.0.tar.gz", hash = "sha256:5aadb4b950c4e93745034594d9f3ea6591f734bb3662e16e255ffbf5e89c88ef"}, @@ -1127,6 +1159,7 @@ version = "1.5.0" description = "A list-like structure which implements collections.abc.MutableSequence" optional = false python-versions = ">=3.8" +groups = ["main"] files = [ {file = "frozenlist-1.5.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:5b6a66c18b5b9dd261ca98dffcb826a525334b2f29e7caa54e182255c5f6a65a"}, {file = "frozenlist-1.5.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:d1b3eb7b05ea246510b43a7e53ed1653e55c2121019a97e60cad7efb881a97bb"}, @@ -1228,6 +1261,7 @@ version = "3.2.1" description = "GraphQL implementation for Python, a port of GraphQL.js, the JavaScript reference implementation for GraphQL." optional = false python-versions = ">=3.6,<4" +groups = ["main"] files = [ {file = "graphql-core-3.2.1.tar.gz", hash = "sha256:9d1bf141427b7d54be944587c8349df791ce60ade2e3cccaf9c56368c133c201"}, {file = "graphql_core-3.2.1-py3-none-any.whl", hash = "sha256:f83c658e4968998eed1923a2e3e3eddd347e005ac0315fbb7ca4d70ea9156323"}, @@ -1239,6 +1273,7 @@ version = "0.14.0" description = "A pure-Python, bring-your-own-I/O implementation of HTTP/1.1" optional = false python-versions = ">=3.7" +groups = ["main"] files = [ {file = "h11-0.14.0-py3-none-any.whl", hash = "sha256:e3fe4ac4b851c468cc8363d500db52c2ead036020723024a109d37346efaa761"}, {file = "h11-0.14.0.tar.gz", hash = "sha256:8f19fbbe99e72420ff35c00b27a34cb9937e902a8b810e2c88300c6f0a3b699d"}, @@ -1247,27 +1282,33 @@ files = [ [[package]] name = "h2" version = "4.1.0" -description = "HTTP/2 State-Machine based protocol implementation" +description = "Pure-Python HTTP/2 protocol implementation" optional = false -python-versions = ">=3.6.1" -files = [ - {file = "h2-4.1.0-py3-none-any.whl", hash = "sha256:03a46bcf682256c95b5fd9e9a99c1323584c3eec6440d379b9903d709476bc6d"}, - {file = "h2-4.1.0.tar.gz", hash = "sha256:a83aca08fbe7aacb79fec788c9c0bac936343560ed9ec18b82a13a12c28d2abb"}, -] +python-versions = ">=3.9" +groups = ["main"] +files = [] +develop = false [package.dependencies] -hpack = ">=4.0,<5" -hyperframe = ">=6.0,<7" +hpack = ">=4.1,<5" +hyperframe = ">=6.1,<7" + +[package.source] +type = "git" +url = "https://github.com/python-hyper/h2" +reference = "HEAD" +resolved_reference = "0b98b244b5fd1fe96100ac14905417a3b70a4286" [[package]] name = "hpack" -version = "4.0.0" -description = "Pure-Python HPACK header compression" +version = "4.1.0" +description = "Pure-Python HPACK header encoding" optional = false -python-versions = ">=3.6.1" +python-versions = ">=3.9" +groups = ["main"] files = [ - {file = "hpack-4.0.0-py3-none-any.whl", hash = "sha256:84a076fad3dc9a9f8063ccb8041ef100867b1878b25ef0ee63847a5d53818a6c"}, - {file = "hpack-4.0.0.tar.gz", hash = "sha256:fc41de0c63e687ebffde81187a948221294896f6bdc0ae2312708df339430095"}, + {file = "hpack-4.1.0-py3-none-any.whl", hash 
= "sha256:157ac792668d995c657d93111f46b4535ed114f0c9c8d672271bbec7eae1b496"}, + {file = "hpack-4.1.0.tar.gz", hash = "sha256:ec5eca154f7056aa06f196a557655c5b009b382873ac8d1e66e79e87535f1dca"}, ] [[package]] @@ -1276,6 +1317,7 @@ version = "1.0.3" description = "A minimal low-level HTTP client." optional = false python-versions = ">=3.8" +groups = ["main"] files = [ {file = "httpcore-1.0.3-py3-none-any.whl", hash = "sha256:9a6a501c3099307d9fd76ac244e08503427679b1e81ceb1d922485e2f2462ad2"}, {file = "httpcore-1.0.3.tar.gz", hash = "sha256:5c0f9546ad17dac4d0772b0808856eb616eb8b48ce94f49ed819fd6982a8a544"}, @@ -1297,6 +1339,7 @@ version = "0.26.0" description = "The next generation HTTP client." optional = false python-versions = ">=3.8" +groups = ["main"] files = [ {file = "httpx-0.26.0-py3-none-any.whl", hash = "sha256:8915f5a3627c4d47b73e8202457cb28f1266982d1159bd5779d86a80c0eab1cd"}, {file = "httpx-0.26.0.tar.gz", hash = "sha256:451b55c30d5185ea6b23c2c793abf9bb237d2a7dfb901ced6ff69ad37ec1dfaf"}, @@ -1318,13 +1361,14 @@ socks = ["socksio (==1.*)"] [[package]] name = "hyperframe" -version = "6.0.1" -description = "HTTP/2 framing layer for Python" +version = "6.1.0" +description = "Pure-Python HTTP/2 framing" optional = false -python-versions = ">=3.6.1" +python-versions = ">=3.9" +groups = ["main"] files = [ - {file = "hyperframe-6.0.1-py3-none-any.whl", hash = "sha256:0ec6bafd80d8ad2195c4f03aacba3a8265e57bc4cff261e802bf39970ed02a15"}, - {file = "hyperframe-6.0.1.tar.gz", hash = "sha256:ae510046231dc8e9ecb1a6586f63d2347bf4c8905914aa84ba585ae85f28a914"}, + {file = "hyperframe-6.1.0-py3-none-any.whl", hash = "sha256:b03380493a519fce58ea5af42e4a42317bf9bd425596f7a0835ffce80f1a42e5"}, + {file = "hyperframe-6.1.0.tar.gz", hash = "sha256:f630908a00854a7adeabd6382b43923a4c4cd4b821fcb527e6ab9e15382a3b08"}, ] [[package]] @@ -1333,6 +1377,7 @@ version = "3.7" description = "Internationalized Domain Names in Applications (IDNA)" optional = false python-versions = ">=3.5" +groups = ["main"] files = [ {file = "idna-3.7-py3-none-any.whl", hash = "sha256:82fee1fc78add43492d3a1898bfa6d8a904cc97d8427f683ed8e798d07761aa0"}, {file = "idna-3.7.tar.gz", hash = "sha256:028ff3aadf0609c1fd278d8ea3089299412a7a8b9bd005dd08b9f8285bcb5cfc"}, @@ -1344,6 +1389,7 @@ version = "1.1.1" description = "iniconfig: brain-dead simple config-ini parsing" optional = false python-versions = "*" +groups = ["main"] files = [ {file = "iniconfig-1.1.1-py2.py3-none-any.whl", hash = "sha256:011e24c64b7f47f6ebd835bb12a743f2fbe9a26d4cecaa7f53bc4f35ee9da8b3"}, {file = "iniconfig-1.1.1.tar.gz", hash = "sha256:bc3af051d7d14b2ee5ef9969666def0cd1a000e121eaea580d4a313df4b37f32"}, @@ -1355,6 +1401,7 @@ version = "2.1.2" description = "Safely pass data to untrusted environments and back." optional = false python-versions = ">=3.7" +groups = ["main"] files = [ {file = "itsdangerous-2.1.2-py3-none-any.whl", hash = "sha256:2c2349112351b88699d8d4b6b075022c0808887cb7ad10069318a8b0bc88db44"}, {file = "itsdangerous-2.1.2.tar.gz", hash = "sha256:5dbbc68b317e5e42f327f9021763545dc3fc3bfe22e6deb96aaf1fc38874156a"}, @@ -1366,6 +1413,7 @@ version = "3.1.5" description = "A very fast and expressive template engine." 
optional = false python-versions = ">=3.7" +groups = ["main"] files = [ {file = "jinja2-3.1.5-py3-none-any.whl", hash = "sha256:aba0f4dc9ed8013c424088f68a5c226f7d6097ed89b246d7749c2ec4175c6adb"}, {file = "jinja2-3.1.5.tar.gz", hash = "sha256:8fefff8dc3034e27bb80d67c671eb8a9bc424c0ef4c0826edbff304cceff43bb"}, @@ -1383,6 +1431,7 @@ version = "1.0.1" description = "JSON Matching Expressions" optional = false python-versions = ">=3.7" +groups = ["main"] files = [ {file = "jmespath-1.0.1-py3-none-any.whl", hash = "sha256:02e2e4cc71b5bcab88332eebf907519190dd9e6e82107fa7f83b1003a6252980"}, {file = "jmespath-1.0.1.tar.gz", hash = "sha256:90261b206d6defd58fdd5e85f478bf633a2901798906be2ad389150c5c60edbe"}, @@ -1394,6 +1443,7 @@ version = "0.9.0" description = "The ultimate Python library for JOSE RFCs, including JWS, JWE, JWK, JWA, JWT" optional = false python-versions = ">=3.8" +groups = ["main"] files = [ {file = "joserfc-0.9.0-py3-none-any.whl", hash = "sha256:4026bdbe2c196cd40574e916fa1e28874d99649412edaab0e373dec3077153fb"}, {file = "joserfc-0.9.0.tar.gz", hash = "sha256:eebca7f587b1761ce43a98ffd5327f2b600b9aa5bb0a77b947687f503ad43bc0"}, @@ -1411,6 +1461,7 @@ version = "1.2.3" description = "Generate source code for Python classes from a JSON schema." optional = false python-versions = ">= 2.7" +groups = ["main"] files = [ {file = "jschema_to_python-1.2.3-py3-none-any.whl", hash = "sha256:8a703ca7604d42d74b2815eecf99a33359a8dccbb80806cce386d5e2dd992b05"}, {file = "jschema_to_python-1.2.3.tar.gz", hash = "sha256:76ff14fe5d304708ccad1284e4b11f96a658949a31ee7faed9e0995279549b91"}, @@ -1427,6 +1478,7 @@ version = "2.0.0" description = "Diff JSON and JSON-like structures in Python" optional = false python-versions = "*" +groups = ["main"] files = [ {file = "jsondiff-2.0.0-py3-none-any.whl", hash = "sha256:689841d66273fc88fc79f7d33f4c074774f4f214b6466e3aff0e5adaf889d1e0"}, {file = "jsondiff-2.0.0.tar.gz", hash = "sha256:2795844ef075ec8a2b8d385c4d59f5ea48b08e7180fce3cb2787be0db00b1fb4"}, @@ -1438,6 +1490,8 @@ version = "0.20.0" description = "Python bindings for Jsonnet - The data templating language" optional = false python-versions = "*" +groups = ["main"] +markers = "python_version < \"3.13\"" files = [ {file = "jsonnet-0.20.0.tar.gz", hash = "sha256:7e770c7bf3a366b97b650a39430450f77612e74406731eb75c5bd59f3f104d4f"}, ] @@ -1448,6 +1502,7 @@ version = "1.32" description = "Apply JSON-Patches (RFC 6902)" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" +groups = ["main"] files = [ {file = "jsonpatch-1.32-py2.py3-none-any.whl", hash = "sha256:26ac385719ac9f54df8a2f0827bb8253aa3ea8ab7b3368457bcdb8c14595a397"}, {file = "jsonpatch-1.32.tar.gz", hash = "sha256:b6ddfe6c3db30d81a96aaeceb6baf916094ffa23d7dd5fa2c13e13f8b6e600c2"}, @@ -1462,6 +1517,7 @@ version = "1.6.1" description = "A final implementation of JSONPath for Python that aims to be standard compliant, including arithmetic and binary comparison operators and providing clear AST for metaprogramming." 
optional = false python-versions = "*" +groups = ["main"] files = [ {file = "jsonpath-ng-1.6.1.tar.gz", hash = "sha256:086c37ba4917304850bd837aeab806670224d3f038fe2833ff593a672ef0a5fa"}, {file = "jsonpath_ng-1.6.1-py3-none-any.whl", hash = "sha256:8f22cd8273d7772eea9aaa84d922e0841aa36fdb8a2c6b7f6c3791a16a9bc0be"}, @@ -1476,6 +1532,7 @@ version = "2.2.0" description = "Python library for serializing any arbitrary object graph into JSON" optional = false python-versions = ">=2.7" +groups = ["main"] files = [ {file = "jsonpickle-2.2.0-py2.py3-none-any.whl", hash = "sha256:de7f2613818aa4f234138ca11243d6359ff83ae528b2185efdd474f62bcf9ae1"}, {file = "jsonpickle-2.2.0.tar.gz", hash = "sha256:7b272918b0554182e53dc340ddd62d9b7f902fec7e7b05620c04f3ccef479a0e"}, @@ -1492,6 +1549,7 @@ version = "2.3" description = "Identify specific nodes in a JSON document (RFC 6901)" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" +groups = ["main"] files = [ {file = "jsonpointer-2.3-py2.py3-none-any.whl", hash = "sha256:51801e558539b4e9cd268638c078c6c5746c9ac96bc38152d443400e4f3793e9"}, {file = "jsonpointer-2.3.tar.gz", hash = "sha256:97cba51526c829282218feb99dab1b1e6bdf8efd1c43dc9d57be093c0d69c99a"}, @@ -1503,6 +1561,7 @@ version = "4.17.3" description = "An implementation of JSON Schema validation for Python" optional = false python-versions = ">=3.7" +groups = ["main"] files = [ {file = "jsonschema-4.17.3-py3-none-any.whl", hash = "sha256:a870ad254da1a8ca84b6a2905cac29d265f805acc57af304784962a2aa6508f6"}, {file = "jsonschema-4.17.3.tar.gz", hash = "sha256:0f864437ab8b6076ba6707453ef8f98a6a0d512a80e93f8abdb676f737ecb60d"}, @@ -1522,6 +1581,7 @@ version = "0.1.6" description = "JSONSchema Spec with object-oriented paths" optional = false python-versions = ">=3.7.0,<4.0.0" +groups = ["main"] files = [ {file = "jsonschema_spec-0.1.6-py3-none-any.whl", hash = "sha256:f2206d18c89d1824c1f775ba14ed039743b41a9167bd2c5bdb774b66b3ca0bbf"}, {file = "jsonschema_spec-0.1.6.tar.gz", hash = "sha256:90215863b56e212086641956b20127ccbf6d8a3a38343dad01d6a74d19482f76"}, @@ -1539,6 +1599,7 @@ version = "1.9" description = "Creates JUnit XML test result documents that can be read by tools such as Jenkins" optional = false python-versions = "*" +groups = ["main"] files = [ {file = "junit-xml-1.9.tar.gz", hash = "sha256:de16a051990d4e25a3982b2dd9e89d671067548718866416faec14d9de56db9f"}, {file = "junit_xml-1.9-py2.py3-none-any.whl", hash = "sha256:ec5ca1a55aefdd76d28fcc0b135251d156c7106fa979686a4b48d62b761b4732"}, @@ -1553,6 +1614,7 @@ version = "1.5.6" description = "Implementation of JOSE Web standards" optional = false python-versions = ">= 3.8" +groups = ["main"] files = [ {file = "jwcrypto-1.5.6-py3-none-any.whl", hash = "sha256:150d2b0ebbdb8f40b77f543fb44ffd2baeff48788be71f67f03566692fd55789"}, {file = "jwcrypto-1.5.6.tar.gz", hash = "sha256:771a87762a0c081ae6166958a954f80848820b2ab066937dc8b8379d65b1b039"}, @@ -1568,6 +1630,7 @@ version = "2.0.2" description = "Pure Python client for Apache Kafka" optional = false python-versions = "*" +groups = ["main"] files = [ {file = "kafka-python-2.0.2.tar.gz", hash = "sha256:04dfe7fea2b63726cd6f3e79a2d86e709d608d74406638c5da33a01d45a9d7e3"}, {file = "kafka_python-2.0.2-py2.py3-none-any.whl", hash = "sha256:2d92418c7cb1c298fa6c7f0fb3519b520d0d7526ac6cb7ae2a4fc65a51a94b6e"}, @@ -1582,6 +1645,7 @@ version = "1.10.0" description = "A fast and thorough lazy object proxy." 
optional = false python-versions = ">=3.8" +groups = ["main"] files = [ {file = "lazy-object-proxy-1.10.0.tar.gz", hash = "sha256:78247b6d45f43a52ef35c25b5581459e85117225408a4128a3daf8bf9648ac69"}, {file = "lazy_object_proxy-1.10.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:855e068b0358ab916454464a884779c7ffa312b8925c6f7401e952dcf3b89977"}, @@ -1628,6 +1692,7 @@ version = "4.3.3" description = "LZ4 Bindings for Python" optional = false python-versions = ">=3.8" +groups = ["main"] files = [ {file = "lz4-4.3.3-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:b891880c187e96339474af2a3b2bfb11a8e4732ff5034be919aa9029484cd201"}, {file = "lz4-4.3.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:222a7e35137d7539c9c33bb53fcbb26510c5748779364014235afc62b0ec797f"}, @@ -1678,6 +1743,7 @@ version = "2.1.1" description = "Safely add untrusted strings to HTML/XML markup." optional = false python-versions = ">=3.7" +groups = ["main"] files = [ {file = "MarkupSafe-2.1.1-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:86b1f75c4e7c2ac2ccdaec2b9022845dbb81880ca318bb7a0a01fbf7813e3812"}, {file = "MarkupSafe-2.1.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:f121a1420d4e173a5d96e47e9a0c0dcff965afdf1626d28de1460815f7c4ee7a"}, @@ -1727,6 +1793,7 @@ version = "5.0.6" description = "" optional = false python-versions = ">=3.8" +groups = ["main"] files = [ {file = "moto-5.0.6-py2.py3-none-any.whl", hash = "sha256:ca1e22831a741733b581ff2ef4d6ae2e1c6db1eab97af1b78b86ca2c6e88c609"}, {file = "moto-5.0.6.tar.gz", hash = "sha256:ad8b23f2b555ad694da8b2432a42b6d96beaaf67a4e7d932196a72193a2eee2c"}, @@ -1786,6 +1853,7 @@ version = "1.3.0" description = "Python library for arbitrary-precision floating-point arithmetic" optional = false python-versions = "*" +groups = ["main"] files = [ {file = "mpmath-1.3.0-py3-none-any.whl", hash = "sha256:a0b2b9fe80bbcd81a6647ff13108738cfb482d481d826cc0e02f5b35e5c88d2c"}, {file = "mpmath-1.3.0.tar.gz", hash = "sha256:7a28eb2a9774d00c7bc92411c19a89209d5da7c4c9a9e227be8330a23a25b91f"}, @@ -1803,6 +1871,7 @@ version = "6.0.5" description = "multidict implementation" optional = false python-versions = ">=3.7" +groups = ["main"] files = [ {file = "multidict-6.0.5-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:228b644ae063c10e7f324ab1ab6b548bdf6f8b47f3ec234fef1093bc2735e5f9"}, {file = "multidict-6.0.5-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:896ebdcf62683551312c30e20614305f53125750803b614e9e6ce74a96232604"}, @@ -1902,6 +1971,7 @@ version = "1.13.0" description = "Optional static typing for Python" optional = false python-versions = ">=3.8" +groups = ["dev"] files = [ {file = "mypy-1.13.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:6607e0f1dd1fb7f0aca14d936d13fd19eba5e17e1cd2a14f808fa5f8f6d8f60a"}, {file = "mypy-1.13.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:8a21be69bd26fa81b1f80a61ee7ab05b076c674d9b18fb56239d72e21d9f4c80"}, @@ -1954,6 +2024,7 @@ version = "1.26.0.post1" description = "Type annotations for boto3.S3 1.26.0 service generated with mypy-boto3-builder 7.11.10" optional = false python-versions = ">=3.7" +groups = ["main"] files = [ {file = "mypy-boto3-s3-1.26.0.post1.tar.gz", hash = "sha256:6d7079f8c739dc993cbedad0736299c413b297814b73795a3855a79169ecc938"}, {file = "mypy_boto3_s3-1.26.0.post1-py3-none-any.whl", hash = "sha256:7de2792ff0cc541b84cd46ff3a6aa2b6e5f267217f2203f27f6e4016bddc644d"}, @@ -1968,6 +2039,7 @@ version = "1.0.0" description = "Type system extensions for programs checked with the mypy type checker." 
optional = false python-versions = ">=3.5" +groups = ["dev"] files = [ {file = "mypy_extensions-1.0.0-py3-none-any.whl", hash = "sha256:4392f6c0eb8a5668a69e23d168ffa70f0be9ccfd32b5cc2d26a34ae5b844552d"}, {file = "mypy_extensions-1.0.0.tar.gz", hash = "sha256:75dbf8955dc00442a438fc4d0666508a9a97b6bd41aa2f0ffe9d2f2725af0782"}, @@ -1979,6 +2051,7 @@ version = "2.8.5" description = "Python package for creating and manipulating graphs and networks" optional = false python-versions = ">=3.8" +groups = ["main"] files = [ {file = "networkx-2.8.5-py3-none-any.whl", hash = "sha256:a762f4b385692d9c3a6f2912d058d76d29a827deaedf9e63ed14d397b8030687"}, {file = "networkx-2.8.5.tar.gz", hash = "sha256:15a7b81a360791c458c55a417418ea136c13378cfdc06a2dcdc12bd2f9cf09c1"}, @@ -1997,6 +2070,7 @@ version = "0.4.4" description = "OpenAPI schema validation for Python" optional = false python-versions = ">=3.7.0,<4.0.0" +groups = ["main"] files = [ {file = "openapi_schema_validator-0.4.4-py3-none-any.whl", hash = "sha256:79f37f38ef9fd5206b924ed7a6f382cea7b649b3b56383c47f1906082b7b9015"}, {file = "openapi_schema_validator-0.4.4.tar.gz", hash = "sha256:c573e2be2c783abae56c5a1486ab716ca96e09d1c3eab56020d1dc680aa57bf8"}, @@ -2015,6 +2089,7 @@ version = "0.5.7" description = "OpenAPI 2.0 (aka Swagger) and OpenAPI 3 spec validator" optional = false python-versions = ">=3.7.0,<4.0.0" +groups = ["main"] files = [ {file = "openapi_spec_validator-0.5.7-py3-none-any.whl", hash = "sha256:8712d2879db7692974ef89c47a3ebfc79436442921ec3a826ac0ce80cde8c549"}, {file = "openapi_spec_validator-0.5.7.tar.gz", hash = "sha256:6c2d42180045a80fd6314de848b94310bdb0fa4949f4b099578b69f79d9fa5ac"}, @@ -2032,6 +2107,7 @@ version = "24.2" description = "Core utilities for Python packages" optional = false python-versions = ">=3.8" +groups = ["main"] files = [ {file = "packaging-24.2-py3-none-any.whl", hash = "sha256:09abb1bccd265c01f4a3aa3f7a7db064b36514d2cba19a2f694fe6150451a759"}, {file = "packaging-24.2.tar.gz", hash = "sha256:c228a6dc5e932d346bc5739379109d49e8853dd8223571c7c5b55260edc0b97f"}, @@ -2043,6 +2119,7 @@ version = "0.4.3" description = "Object-oriented paths" optional = false python-versions = ">=3.7.0,<4.0.0" +groups = ["main"] files = [ {file = "pathable-0.4.3-py3-none-any.whl", hash = "sha256:cdd7b1f9d7d5c8b8d3315dbf5a86b2596053ae845f056f57d97c0eefff84da14"}, {file = "pathable-0.4.3.tar.gz", hash = "sha256:5c869d315be50776cc8a993f3af43e0c60dc01506b399643f919034ebf4cdcab"}, @@ -2054,6 +2131,7 @@ version = "5.9.0" description = "Python Build Reasonableness" optional = false python-versions = ">=2.6" +groups = ["main"] files = [ {file = "pbr-5.9.0-py2.py3-none-any.whl", hash = "sha256:e547125940bcc052856ded43be8e101f63828c2d94239ffbe2b327ba3d5ccf0a"}, {file = "pbr-5.9.0.tar.gz", hash = "sha256:e8dca2f4b43560edef58813969f52a56cef023146cbb8931626db80e6c1c4308"}, @@ -2065,6 +2143,7 @@ version = "1.0.0" description = "plugin and hook calling mechanisms for python" optional = false python-versions = ">=3.6" +groups = ["main"] files = [ {file = "pluggy-1.0.0-py2.py3-none-any.whl", hash = "sha256:74134bbf457f031a36d68416e1509f34bd5ccc019f0bcc952c7b909d06b37bd3"}, {file = "pluggy-1.0.0.tar.gz", hash = "sha256:4224373bacce55f955a878bf9cfa763c1e360858e330072059e10bad68531159"}, @@ -2080,6 +2159,7 @@ version = "3.11" description = "Python Lex & Yacc" optional = false python-versions = "*" +groups = ["main"] files = [ {file = "ply-3.11-py2.py3-none-any.whl", hash = "sha256:096f9b8350b65ebd2fd1346b12452efe5b9607f7482813ffca50c22722a807ce"}, 
{file = "ply-3.11.tar.gz", hash = "sha256:00c7c1aaa88358b9c765b6d3000c6eec0ba42abca5351b095321aef446081da3"}, @@ -2091,6 +2171,7 @@ version = "0.14.1" description = "Python client for the Prometheus monitoring system." optional = false python-versions = ">=3.6" +groups = ["main"] files = [ {file = "prometheus_client-0.14.1-py3-none-any.whl", hash = "sha256:522fded625282822a89e2773452f42df14b5a8e84a86433e3f8a189c1d54dc01"}, {file = "prometheus_client-0.14.1.tar.gz", hash = "sha256:5459c427624961076277fdc6dc50540e2bacb98eebde99886e59ec55ed92093a"}, @@ -2105,6 +2186,7 @@ version = "0.2.0" description = "Accelerated property cache" optional = false python-versions = ">=3.8" +groups = ["main"] files = [ {file = "propcache-0.2.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:c5869b8fd70b81835a6f187c5fdbe67917a04d7e52b6e7cc4e5fe39d55c39d58"}, {file = "propcache-0.2.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:952e0d9d07609d9c5be361f33b0d6d650cd2bae393aabb11d9b719364521984b"}, @@ -2212,6 +2294,7 @@ version = "5.9.4" description = "Cross-platform lib for process and system monitoring in Python." optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" +groups = ["main"] files = [ {file = "psutil-5.9.4-cp27-cp27m-macosx_10_9_x86_64.whl", hash = "sha256:c1ca331af862803a42677c120aff8a814a804e09832f166f226bfd22b56feee8"}, {file = "psutil-5.9.4-cp27-cp27m-manylinux2010_i686.whl", hash = "sha256:68908971daf802203f3d37e78d3f8831b6d1014864d7a85937941bb35f09aefe"}, @@ -2238,6 +2321,7 @@ version = "2.9.10" description = "psycopg2 - Python-PostgreSQL Database Adapter" optional = false python-versions = ">=3.8" +groups = ["main"] files = [ {file = "psycopg2-binary-2.9.10.tar.gz", hash = "sha256:4b3df0e6990aa98acda57d983942eff13d824135fe2250e6522edaa782a06de2"}, {file = "psycopg2_binary-2.9.10-cp310-cp310-macosx_12_0_x86_64.whl", hash = "sha256:0ea8e3d0ae83564f2fc554955d327fa081d065c8ca5cc6d2abb643e2c9c1200f"}, @@ -2286,6 +2370,7 @@ files = [ {file = "psycopg2_binary-2.9.10-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:bb89f0a835bcfc1d42ccd5f41f04870c1b936d8507c6df12b7737febc40f0909"}, {file = "psycopg2_binary-2.9.10-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:f0c2d907a1e102526dd2986df638343388b94c33860ff3bbe1384130828714b1"}, {file = "psycopg2_binary-2.9.10-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:f8157bed2f51db683f31306aa497311b560f2265998122abe1dce6428bd86567"}, + {file = "psycopg2_binary-2.9.10-cp313-cp313-win_amd64.whl", hash = "sha256:27422aa5f11fbcd9b18da48373eb67081243662f9b46e6fd07c3eb46e4535142"}, {file = "psycopg2_binary-2.9.10-cp38-cp38-macosx_12_0_x86_64.whl", hash = "sha256:eb09aa7f9cecb45027683bb55aebaaf45a0df8bf6de68801a6afdc7947bb09d4"}, {file = "psycopg2_binary-2.9.10-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b73d6d7f0ccdad7bc43e6d34273f70d587ef62f824d7261c4ae9b8b1b6af90e8"}, {file = "psycopg2_binary-2.9.10-cp38-cp38-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ce5ab4bf46a211a8e924d307c1b1fcda82368586a19d0a24f8ae166f5c784864"}, @@ -2314,6 +2399,7 @@ version = "0.5.4" description = "Pure Python PartiQL Parser" optional = false python-versions = "*" +groups = ["main"] files = [ {file = "py_partiql_parser-0.5.4-py2.py3-none-any.whl", hash = "sha256:3dc4295a47da9587681a96b35c6e151886fdbd0a4acbe0d97c4c68e5f689d315"}, {file = "py_partiql_parser-0.5.4.tar.gz", hash = "sha256:72e043919538fa63edae72fb59afc7e3fd93adbde656718a7d2b4666f23dd114"}, @@ -2328,6 +2414,7 @@ version = "2.21" 
description = "C parser in Python" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" +groups = ["main"] files = [ {file = "pycparser-2.21-py2.py3-none-any.whl", hash = "sha256:8ee45429555515e1f6b185e78100aea234072576aa43ab53aefcae078162fca9"}, {file = "pycparser-2.21.tar.gz", hash = "sha256:e644fdec12f7872f86c58ff790da456218b10f863970249516d60a5eaca77206"}, @@ -2339,6 +2426,7 @@ version = "2.10.4" description = "Data validation using Python type hints" optional = false python-versions = ">=3.8" +groups = ["main"] files = [ {file = "pydantic-2.10.4-py3-none-any.whl", hash = "sha256:597e135ea68be3a37552fb524bc7d0d66dcf93d395acd93a00682f1efcb8ee3d"}, {file = "pydantic-2.10.4.tar.gz", hash = "sha256:82f12e9723da6de4fe2ba888b5971157b3be7ad914267dea8f05f82b28254f06"}, @@ -2359,6 +2447,7 @@ version = "2.27.2" description = "Core functionality for Pydantic validation and serialization" optional = false python-versions = ">=3.8" +groups = ["main"] files = [ {file = "pydantic_core-2.27.2-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:2d367ca20b2f14095a8f4fa1210f5a7b78b8a20009ecced6b12818f455b1e9fa"}, {file = "pydantic_core-2.27.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:491a2b73db93fab69731eaee494f320faa4e093dbed776be1a829c2eb222c34c"}, @@ -2471,6 +2560,7 @@ version = "2.4.0" description = "JSON Web Token implementation in Python" optional = false python-versions = ">=3.6" +groups = ["main"] files = [ {file = "PyJWT-2.4.0-py3-none-any.whl", hash = "sha256:72d1d253f32dbd4f5c88eaf1fdc62f3a19f676ccbadb9dbc5d07e951b2b26daf"}, {file = "PyJWT-2.4.0.tar.gz", hash = "sha256:d42908208c699b3b973cbeb01a969ba6a96c821eefb1c5bfe4c390c01d67abba"}, @@ -2491,6 +2581,7 @@ version = "3.0.9" description = "pyparsing module - Classes and methods to define and execute parsing grammars" optional = false python-versions = ">=3.6.8" +groups = ["main"] files = [ {file = "pyparsing-3.0.9-py3-none-any.whl", hash = "sha256:5026bae9a10eeaefb61dab2f09052b9f4307d44aee4eda64b309723d8d206bbc"}, {file = "pyparsing-3.0.9.tar.gz", hash = "sha256:2b020ecf7d21b687f219b71ecad3631f644a47f01403fa1d1036b0c6416d70fb"}, @@ -2505,6 +2596,7 @@ version = "0.18.1" description = "Persistent/Functional/Immutable data structures" optional = false python-versions = ">=3.7" +groups = ["main"] files = [ {file = "pyrsistent-0.18.1-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:df46c854f490f81210870e509818b729db4488e1f30f2a1ce1698b2295a878d1"}, {file = "pyrsistent-0.18.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5d45866ececf4a5fff8742c25722da6d4c9e180daa7b405dc0a2a2790d668c26"}, @@ -2535,6 +2627,7 @@ version = "7.4.4" description = "pytest: simple powerful testing with Python" optional = false python-versions = ">=3.7" +groups = ["main"] files = [ {file = "pytest-7.4.4-py3-none-any.whl", hash = "sha256:b090cdf5ed60bf4c45261be03239c2c1c22df034fbffe691abe93cd80cea01d8"}, {file = "pytest-7.4.4.tar.gz", hash = "sha256:2cf0005922c6ace4a3e2ec8b4080eb0d9753fdc93107415332f50ce9e7994280"}, @@ -2555,6 +2648,7 @@ version = "0.21.0" description = "Pytest support for asyncio" optional = false python-versions = ">=3.7" +groups = ["main"] files = [ {file = "pytest-asyncio-0.21.0.tar.gz", hash = "sha256:2b38a496aef56f56b0e87557ec313e11e1ab9276fc3863f6a7be0f1d0e415e1b"}, {file = "pytest_asyncio-0.21.0-py3-none-any.whl", hash = "sha256:f2b3366b7cd501a4056858bd39349d5af19742aed2d81660b7998b6341c7eb9c"}, @@ -2573,6 +2667,7 @@ version = "1.0.8" description = "pytest-httpserver is a 
httpserver for pytest" optional = false python-versions = ">=3.8,<4.0" +groups = ["main"] files = [ {file = "pytest_httpserver-1.0.8-py3-none-any.whl", hash = "sha256:24cd3d9f6a0b927c7bfc400d0b3fda7442721b8267ce29942bf307b190f0bb09"}, {file = "pytest_httpserver-1.0.8.tar.gz", hash = "sha256:e052f69bc8a9073db02484681e8e47004dd1fb3763b0ae833bd899e5895c559a"}, @@ -2587,6 +2682,7 @@ version = "0.6.3" description = "It helps to use fixtures in pytest.mark.parametrize" optional = false python-versions = "*" +groups = ["main"] files = [ {file = "pytest-lazy-fixture-0.6.3.tar.gz", hash = "sha256:0e7d0c7f74ba33e6e80905e9bfd81f9d15ef9a790de97993e34213deb5ad10ac"}, {file = "pytest_lazy_fixture-0.6.3-py3-none-any.whl", hash = "sha256:e0b379f38299ff27a653f03eaa69b08a6fd4484e46fd1c9907d984b9f9daeda6"}, @@ -2601,6 +2697,7 @@ version = "1.1.0" description = "pytest plugin to run your tests in a specific order" optional = false python-versions = ">=3.6" +groups = ["main"] files = [ {file = "pytest-order-1.1.0.tar.gz", hash = "sha256:139d25b30826b78eebb42722f747eab14c44b88059d7a71d4f79d14a057269a5"}, {file = "pytest_order-1.1.0-py3-none-any.whl", hash = "sha256:3b3730969c97900fa5cd31ecff80847680ed56b2490954565c14949ba60d9371"}, @@ -2615,6 +2712,7 @@ version = "0.9.3" description = "pytest plugin for repeating tests" optional = false python-versions = ">=3.7" +groups = ["main"] files = [ {file = "pytest_repeat-0.9.3-py3-none-any.whl", hash = "sha256:26ab2df18226af9d5ce441c858f273121e92ff55f5bb311d25755b8d7abdd8ed"}, {file = "pytest_repeat-0.9.3.tar.gz", hash = "sha256:ffd3836dfcd67bb270bec648b330e20be37d2966448c4148c4092d1e8aba8185"}, @@ -2629,6 +2727,7 @@ version = "15.0" description = "pytest plugin to re-run tests to eliminate flaky failures" optional = false python-versions = ">=3.9" +groups = ["main"] files = [ {file = "pytest-rerunfailures-15.0.tar.gz", hash = "sha256:2d9ac7baf59f4c13ac730b47f6fa80e755d1ba0581da45ce30b72fb3542b4474"}, {file = "pytest_rerunfailures-15.0-py3-none-any.whl", hash = "sha256:dd150c4795c229ef44320adc9a0c0532c51b78bb7a6843a8c53556b9a611df1a"}, @@ -2644,6 +2743,7 @@ version = "0.8.1" description = "Pytest plugin which splits the test suite to equally sized sub suites based on test execution time." 
optional = false python-versions = ">=3.7.1,<4.0" +groups = ["main"] files = [ {file = "pytest_split-0.8.1-py3-none-any.whl", hash = "sha256:74b110ea091bd147cc1c5f9665a59506e5cedfa66f96a89fb03e4ab447c2c168"}, {file = "pytest_split-0.8.1.tar.gz", hash = "sha256:2d88bd3dc528689a7a3f58fc12ea165c3aa62e90795e420dfad920afe5612d6d"}, @@ -2658,6 +2758,7 @@ version = "2.1.0" description = "pytest plugin to abort hanging tests" optional = false python-versions = ">=3.6" +groups = ["main"] files = [ {file = "pytest-timeout-2.1.0.tar.gz", hash = "sha256:c07ca07404c612f8abbe22294b23c368e2e5104b521c1790195561f37e1ac3d9"}, {file = "pytest_timeout-2.1.0-py3-none-any.whl", hash = "sha256:f6f50101443ce70ad325ceb4473c4255e9d74e3c7cd0ef827309dfa4c0d975c6"}, @@ -2672,6 +2773,7 @@ version = "3.3.1" description = "pytest xdist plugin for distributed testing, most importantly across multiple CPUs" optional = false python-versions = ">=3.7" +groups = ["main"] files = [ {file = "pytest-xdist-3.3.1.tar.gz", hash = "sha256:d5ee0520eb1b7bcca50a60a518ab7a7707992812c578198f8b44fdfac78e8c93"}, {file = "pytest_xdist-3.3.1-py3-none-any.whl", hash = "sha256:ff9daa7793569e6a68544850fd3927cd257cc03a7ef76c95e86915355e82b5f2"}, @@ -2692,6 +2794,7 @@ version = "2.8.2" description = "Extensions to the standard Python datetime module" optional = false python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,>=2.7" +groups = ["main"] files = [ {file = "python-dateutil-2.8.2.tar.gz", hash = "sha256:0123cacc1627ae19ddf3c27a5de5bd67ee4586fbdd6440d9748f8abb483d3e86"}, {file = "python_dateutil-2.8.2-py2.py3-none-any.whl", hash = "sha256:961d03dc3453ebbc59dbdea9e4e11c5651520a876d0f4db161e8674aae935da9"}, @@ -2706,6 +2809,7 @@ version = "1.0.1" description = "Read key-value pairs from a .env file and set them as environment variables" optional = false python-versions = ">=3.8" +groups = ["main"] files = [ {file = "python-dotenv-1.0.1.tar.gz", hash = "sha256:e324ee90a023d808f1959c46bcbc04446a10ced277783dc6ee09987c37ec10ca"}, {file = "python_dotenv-1.0.1-py3-none-any.whl", hash = "sha256:f7b63ef50f1b690dddf550d03497b66d609393b40b564ed0d674909a68ebf16a"}, @@ -2720,6 +2824,7 @@ version = "2024.1" description = "World timezone definitions, modern and historical" optional = false python-versions = "*" +groups = ["main"] files = [ {file = "pytz-2024.1-py2.py3-none-any.whl", hash = "sha256:328171f4e3623139da4983451950b28e95ac706e13f3f2630a879749e7a8b319"}, {file = "pytz-2024.1.tar.gz", hash = "sha256:2a29735ea9c18baf14b448846bde5a48030ed267578472d8955cd0e7443a9812"}, @@ -2731,6 +2836,8 @@ version = "308" description = "Python for Window Extensions" optional = false python-versions = "*" +groups = ["main"] +markers = "sys_platform == \"win32\"" files = [ {file = "pywin32-308-cp310-cp310-win32.whl", hash = "sha256:796ff4426437896550d2981b9c2ac0ffd75238ad9ea2d3bfa67a1abd546d262e"}, {file = "pywin32-308-cp310-cp310-win_amd64.whl", hash = "sha256:4fc888c59b3c0bef905ce7eb7e2106a07712015ea1c8234b703a088d46110e8e"}, @@ -2758,6 +2865,7 @@ version = "6.0.2" description = "YAML parser and emitter for Python" optional = false python-versions = ">=3.8" +groups = ["main"] files = [ {file = "PyYAML-6.0.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:0a9a2848a5b7feac301353437eb7d5957887edbf81d56e903999a75a3d743086"}, {file = "PyYAML-6.0.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:29717114e51c84ddfba879543fb232a6ed60086602313ca38cce623c1d62cfbf"}, @@ -2820,6 +2928,7 @@ version = "2024.4.28" description = "Alternative regular expression module, to replace re." 
optional = false python-versions = ">=3.8" +groups = ["main"] files = [ {file = "regex-2024.4.28-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:cd196d056b40af073d95a2879678585f0b74ad35190fac04ca67954c582c6b61"}, {file = "regex-2024.4.28-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:8bb381f777351bd534462f63e1c6afb10a7caa9fa2a421ae22c26e796fe31b1f"}, @@ -2908,6 +3017,7 @@ version = "2.32.3" description = "Python HTTP for Humans." optional = false python-versions = ">=3.8" +groups = ["main"] files = [ {file = "requests-2.32.3-py3-none-any.whl", hash = "sha256:70761cfe03c773ceb22aa2f671b4757976145175cdfca038c02654d061d6dcc6"}, {file = "requests-2.32.3.tar.gz", hash = "sha256:55365417734eb18255590a9ff9eb97e9e1da868d4ccd6402399eaf68af20a760"}, @@ -2929,6 +3039,7 @@ version = "0.25.3" description = "A utility library for mocking out the `requests` Python library." optional = false python-versions = ">=3.8" +groups = ["main"] files = [ {file = "responses-0.25.3-py3-none-any.whl", hash = "sha256:521efcbc82081ab8daa588e08f7e8a64ce79b91c39f6e62199b19159bea7dbcb"}, {file = "responses-0.25.3.tar.gz", hash = "sha256:617b9247abd9ae28313d57a75880422d55ec63c29d33d629697590a034358dba"}, @@ -2948,6 +3059,7 @@ version = "0.1.4" description = "A pure python RFC3339 validator" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" +groups = ["main"] files = [ {file = "rfc3339_validator-0.1.4-py2.py3-none-any.whl", hash = "sha256:24f6ec1eda14ef823da9e36ec7113124b39c04d50a4d3d3a3c2859577e7791fa"}, {file = "rfc3339_validator-0.1.4.tar.gz", hash = "sha256:138a2abdf93304ad60530167e51d2dfb9549521a836871b88d7f4695d0022f6b"}, @@ -2962,6 +3074,7 @@ version = "0.7.0" description = "An extremely fast Python linter and code formatter, written in Rust." optional = false python-versions = ">=3.7" +groups = ["dev"] files = [ {file = "ruff-0.7.0-py3-none-linux_armv6l.whl", hash = "sha256:0cdf20c2b6ff98e37df47b2b0bd3a34aaa155f59a11182c1303cce79be715628"}, {file = "ruff-0.7.0-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:496494d350c7fdeb36ca4ef1c9f21d80d182423718782222c29b3e72b3512737"}, @@ -2989,6 +3102,7 @@ version = "0.10.0" description = "An Amazon S3 Transfer Manager" optional = false python-versions = ">= 3.8" +groups = ["main"] files = [ {file = "s3transfer-0.10.0-py3-none-any.whl", hash = "sha256:3cdb40f5cfa6966e812209d0994f2a4709b561c88e90cf00c2696d2df4e56b2e"}, {file = "s3transfer-0.10.0.tar.gz", hash = "sha256:d0c8bbf672d5eebbe4e57945e23b972d963f07d82f661cabf678a5c88831595b"}, @@ -3006,6 +3120,7 @@ version = "1.0.4" description = "Classes implementing the SARIF 2.1.0 object model." 
optional = false python-versions = ">= 2.7" +groups = ["main"] files = [ {file = "sarif_om-1.0.4-py3-none-any.whl", hash = "sha256:539ef47a662329b1c8502388ad92457425e95dc0aaaf995fe46f4984c4771911"}, {file = "sarif_om-1.0.4.tar.gz", hash = "sha256:cd5f416b3083e00d402a92e449a7ff67af46f11241073eea0461802a3b5aef98"}, @@ -3021,6 +3136,7 @@ version = "70.0.0" description = "Easily download, build, install, upgrade, and uninstall Python packages" optional = false python-versions = ">=3.8" +groups = ["main"] files = [ {file = "setuptools-70.0.0-py3-none-any.whl", hash = "sha256:54faa7f2e8d2d11bcd2c07bed282eef1046b5c080d1c32add737d7b5817b1ad4"}, {file = "setuptools-70.0.0.tar.gz", hash = "sha256:f211a66637b8fa059bb28183da127d4e86396c991a942b028c6650d4319c3fd0"}, @@ -3036,6 +3152,7 @@ version = "1.16.0" description = "Python 2 and 3 compatibility utilities" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*" +groups = ["main"] files = [ {file = "six-1.16.0-py2.py3-none-any.whl", hash = "sha256:8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254"}, {file = "six-1.16.0.tar.gz", hash = "sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926"}, @@ -3047,6 +3164,7 @@ version = "1.3.0" description = "Sniff out which async library your code is running under" optional = false python-versions = ">=3.7" +groups = ["main"] files = [ {file = "sniffio-1.3.0-py3-none-any.whl", hash = "sha256:eecefdce1e5bbfb7ad2eeaabf7c1eeb404d7757c379bd1f7e5cce9d8bf425384"}, {file = "sniffio-1.3.0.tar.gz", hash = "sha256:e60305c5e5d314f5389259b7f22aaa33d8f7dee49763119234af3755c55b9101"}, @@ -3058,6 +3176,7 @@ version = "1.12" description = "Computer algebra system (CAS) in Python" optional = false python-versions = ">=3.8" +groups = ["main"] files = [ {file = "sympy-1.12-py3-none-any.whl", hash = "sha256:c3588cd4295d0c0f603d0f2ae780587e64e2efeedb3521e46b9bb1d08d184fa5"}, {file = "sympy-1.12.tar.gz", hash = "sha256:ebf595c8dac3e0fdc4152c51878b498396ec7f30e7a914d6071e674d49420fb8"}, @@ -3072,6 +3191,7 @@ version = "4.9.0" description = "Python library for throwaway instances of anything that can run in a Docker container" optional = false python-versions = "<4.0,>=3.9" +groups = ["main"] files = [ {file = "testcontainers-4.9.0-py3-none-any.whl", hash = "sha256:c6fee929990972c40bf6b91b7072c94064ff3649b405a14fde0274c8b2479d32"}, {file = "testcontainers-4.9.0.tar.gz", hash = "sha256:2cd6af070109ff68c1ab5389dc89c86c2dc3ab30a21ca734b2cb8f0f80ad479e"}, @@ -3125,6 +3245,7 @@ version = "0.10.2" description = "Python Library for Tom's Obvious, Minimal Language" optional = false python-versions = ">=2.6, !=3.0.*, !=3.1.*, !=3.2.*" +groups = ["main"] files = [ {file = "toml-0.10.2-py2.py3-none-any.whl", hash = "sha256:806143ae5bfb6a3c6e736a764057db0e6a0e05e338b5630894a5f779cabb4f9b"}, {file = "toml-0.10.2.tar.gz", hash = "sha256:b3bda1d108d5dd99f4a20d24d9c348e91c4db7ab1b749200bded2f839ccbe68f"}, @@ -3136,6 +3257,7 @@ version = "1.5.0.20240925" description = "Typing stubs for jwcrypto" optional = false python-versions = ">=3.8" +groups = ["main"] files = [ {file = "types-jwcrypto-1.5.0.20240925.tar.gz", hash = "sha256:50e17b790378c96239344476c7bd13b52d0c7eeb6d16c2d53723e48cc6bbf4fe"}, {file = "types_jwcrypto-1.5.0.20240925-py3-none-any.whl", hash = "sha256:2d12a2d528240d326075e896aafec7056b9136bf3207fa6ccf3fcb8fbf9e11a1"}, @@ -3150,6 +3272,7 @@ version = "5.9.5.12" description = "Typing stubs for psutil" optional = false python-versions = "*" +groups = ["main"] files = [ {file = 
"types-psutil-5.9.5.12.tar.gz", hash = "sha256:61a91679d3fe737250013b624dca09375e7cc3ad77dcc734553746c429c02aca"}, {file = "types_psutil-5.9.5.12-py3-none-any.whl", hash = "sha256:e9a147b8561235c6afcce5aa1adb973fad9ab2c50cf89820697687f53510358f"}, @@ -3161,6 +3284,7 @@ version = "2.9.21.20241019" description = "Typing stubs for psycopg2" optional = false python-versions = ">=3.8" +groups = ["main"] files = [ {file = "types-psycopg2-2.9.21.20241019.tar.gz", hash = "sha256:bca89b988d2ebd19bcd08b177d22a877ea8b841decb10ed130afcf39404612fa"}, {file = "types_psycopg2-2.9.21.20241019-py3-none-any.whl", hash = "sha256:44d091e67732d16a941baae48cd7b53bf91911bc36888652447cf1ef0c1fb3f6"}, @@ -3172,6 +3296,7 @@ version = "0.6.3.3" description = "Typing stubs for pytest-lazy-fixture" optional = false python-versions = "*" +groups = ["main"] files = [ {file = "types-pytest-lazy-fixture-0.6.3.3.tar.gz", hash = "sha256:2ef79d66bcde0e50acdac8dc55074b9ae0d4cfaeabdd638f5522f4cac7c8a2c7"}, {file = "types_pytest_lazy_fixture-0.6.3.3-py3-none-any.whl", hash = "sha256:a56a55649147ff960ff79d4b2c781a4f769351abc1876873f3116d0bd0c96353"}, @@ -3183,6 +3308,7 @@ version = "6.0.12.20240917" description = "Typing stubs for PyYAML" optional = false python-versions = ">=3.8" +groups = ["main"] files = [ {file = "types-PyYAML-6.0.12.20240917.tar.gz", hash = "sha256:d1405a86f9576682234ef83bcb4e6fff7c9305c8b1fbad5e0bcd4f7dbdc9c587"}, {file = "types_PyYAML-6.0.12.20240917-py3-none-any.whl", hash = "sha256:392b267f1c0fe6022952462bf5d6523f31e37f6cea49b14cee7ad634b6301570"}, @@ -3194,6 +3320,7 @@ version = "2.31.0.0" description = "Typing stubs for requests" optional = false python-versions = "*" +groups = ["main"] files = [ {file = "types-requests-2.31.0.0.tar.gz", hash = "sha256:c1c29d20ab8d84dff468d7febfe8e0cb0b4664543221b386605e14672b44ea25"}, {file = "types_requests-2.31.0.0-py3-none-any.whl", hash = "sha256:7c5cea7940f8e92ec560bbc468f65bf684aa3dcf0554a6f8c4710f5f708dc598"}, @@ -3208,6 +3335,7 @@ version = "0.6.0.post3" description = "Type annotations and code completion for s3transfer" optional = false python-versions = ">=3.7,<4.0" +groups = ["main"] files = [ {file = "types-s3transfer-0.6.0.post3.tar.gz", hash = "sha256:92c3704e5d041202bfb5ddb79d083fd1a02de2c5dfec6a91576823e6b5c93993"}, {file = "types_s3transfer-0.6.0.post3-py3-none-any.whl", hash = "sha256:eedc5117275565b3c83662c0ccc81662a34da5dda8bd502b89d296b6d5cb091d"}, @@ -3219,6 +3347,7 @@ version = "0.10.8.6" description = "Typing stubs for toml" optional = false python-versions = "*" +groups = ["main"] files = [ {file = "types-toml-0.10.8.6.tar.gz", hash = "sha256:6d3ac79e36c9ee593c5d4fb33a50cca0e3adceb6ef5cff8b8e5aef67b4c4aaf2"}, {file = "types_toml-0.10.8.6-py3-none-any.whl", hash = "sha256:de7b2bb1831d6f7a4b554671ffe5875e729753496961b3e9b202745e4955dafa"}, @@ -3230,6 +3359,7 @@ version = "1.26.17" description = "Typing stubs for urllib3" optional = false python-versions = "*" +groups = ["main"] files = [ {file = "types-urllib3-1.26.17.tar.gz", hash = "sha256:73fd274524c3fc7cd8cd9ceb0cb67ed99b45f9cb2831013e46d50c1451044800"}, {file = "types_urllib3-1.26.17-py3-none-any.whl", hash = "sha256:0d027fcd27dbb3cb532453b4d977e05bc1e13aefd70519866af211b3003d895d"}, @@ -3241,6 +3371,7 @@ version = "4.12.2" description = "Backported and Experimental Type Hints for Python 3.8+" optional = false python-versions = ">=3.8" +groups = ["main", "dev"] files = [ {file = "typing_extensions-4.12.2-py3-none-any.whl", hash = 
"sha256:04e5ca0351e0f3f85c6853954072df659d0d13fac324d0072316b67d7794700d"}, {file = "typing_extensions-4.12.2.tar.gz", hash = "sha256:1a7ead55c7e559dd4dee8856e3a88b41225abfe1ce8df57b7c13915fe121ffb8"}, @@ -3252,6 +3383,7 @@ version = "1.26.19" description = "HTTP library with thread-safe connection pooling, file post, and more." optional = false python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,>=2.7" +groups = ["main"] files = [ {file = "urllib3-1.26.19-py2.py3-none-any.whl", hash = "sha256:37a0344459b199fce0e80b0d3569837ec6b6937435c5244e7fd73fa6006830f3"}, {file = "urllib3-1.26.19.tar.gz", hash = "sha256:3e3d753a8618b86d7de333b4223005f68720bcd6a7d2bcb9fbd2229ec7c1e429"}, @@ -3268,6 +3400,7 @@ version = "12.0" description = "An implementation of the WebSocket Protocol (RFC 6455 & 7692)" optional = false python-versions = ">=3.8" +groups = ["main"] files = [ {file = "websockets-12.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:d554236b2a2006e0ce16315c16eaa0d628dab009c33b63ea03f41c6107958374"}, {file = "websockets-12.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:2d225bb6886591b1746b17c0573e29804619c8f755b5598d875bb4235ea639be"}, @@ -3349,6 +3482,7 @@ version = "3.0.6" description = "The comprehensive WSGI web application library." optional = false python-versions = ">=3.8" +groups = ["main"] files = [ {file = "werkzeug-3.0.6-py3-none-any.whl", hash = "sha256:1bc0c2310d2fbb07b1dd1105eba2f7af72f322e1e455f2f93c993bee8c8a5f17"}, {file = "werkzeug-3.0.6.tar.gz", hash = "sha256:a8dd59d4de28ca70471a34cba79bed5f7ef2e036a76b3ab0835474246eb41f8d"}, @@ -3366,6 +3500,7 @@ version = "1.14.1" description = "Module for decorators, wrappers and monkey patching." optional = false python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,>=2.7" +groups = ["main"] files = [ {file = "wrapt-1.14.1-cp27-cp27m-macosx_10_9_x86_64.whl", hash = "sha256:1b376b3f4896e7930f1f772ac4b064ac12598d1c38d04907e696cc4d794b43d3"}, {file = "wrapt-1.14.1-cp27-cp27m-manylinux1_i686.whl", hash = "sha256:903500616422a40a98a5a3c4ff4ed9d0066f3b4c951fa286018ecdf0750194ef"}, @@ -3386,6 +3521,16 @@ files = [ {file = "wrapt-1.14.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:8ad85f7f4e20964db4daadcab70b47ab05c7c1cf2a7c1e51087bfaa83831854c"}, {file = "wrapt-1.14.1-cp310-cp310-win32.whl", hash = "sha256:a9a52172be0b5aae932bef82a79ec0a0ce87288c7d132946d645eba03f0ad8a8"}, {file = "wrapt-1.14.1-cp310-cp310-win_amd64.whl", hash = "sha256:6d323e1554b3d22cfc03cd3243b5bb815a51f5249fdcbb86fda4bf62bab9e164"}, + {file = "wrapt-1.14.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:ecee4132c6cd2ce5308e21672015ddfed1ff975ad0ac8d27168ea82e71413f55"}, + {file = "wrapt-1.14.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:2020f391008ef874c6d9e208b24f28e31bcb85ccff4f335f15a3251d222b92d9"}, + {file = "wrapt-1.14.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2feecf86e1f7a86517cab34ae6c2f081fd2d0dac860cb0c0ded96d799d20b335"}, + {file = "wrapt-1.14.1-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:240b1686f38ae665d1b15475966fe0472f78e71b1b4903c143a842659c8e4cb9"}, + {file = "wrapt-1.14.1-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a9008dad07d71f68487c91e96579c8567c98ca4c3881b9b113bc7b33e9fd78b8"}, + {file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:6447e9f3ba72f8e2b985a1da758767698efa72723d5b59accefd716e9e8272bf"}, + 
{file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:acae32e13a4153809db37405f5eba5bac5fbe2e2ba61ab227926a22901051c0a"}, + {file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:49ef582b7a1152ae2766557f0550a9fcbf7bbd76f43fbdc94dd3bf07cc7168be"}, + {file = "wrapt-1.14.1-cp311-cp311-win32.whl", hash = "sha256:358fe87cc899c6bb0ddc185bf3dbfa4ba646f05b1b0b9b5a27c2cb92c2cea204"}, + {file = "wrapt-1.14.1-cp311-cp311-win_amd64.whl", hash = "sha256:26046cd03936ae745a502abf44dac702a5e6880b2b01c29aea8ddf3353b68224"}, {file = "wrapt-1.14.1-cp35-cp35m-manylinux1_i686.whl", hash = "sha256:43ca3bbbe97af00f49efb06e352eae40434ca9d915906f77def219b88e85d907"}, {file = "wrapt-1.14.1-cp35-cp35m-manylinux1_x86_64.whl", hash = "sha256:6b1a564e6cb69922c7fe3a678b9f9a3c54e72b469875aa8018f18b4d1dd1adf3"}, {file = "wrapt-1.14.1-cp35-cp35m-manylinux2010_i686.whl", hash = "sha256:00b6d4ea20a906c0ca56d84f93065b398ab74b927a7a3dbd470f6fc503f95dc3"}, @@ -3439,6 +3584,7 @@ version = "0.13.0" description = "Makes working with XML feel like you are working with JSON" optional = false python-versions = ">=3.4" +groups = ["main"] files = [ {file = "xmltodict-0.13.0-py2.py3-none-any.whl", hash = "sha256:aa89e8fd76320154a40d19a0df04a4695fb9dc5ba977cbb68ab3e4eb225e7852"}, {file = "xmltodict-0.13.0.tar.gz", hash = "sha256:341595a488e3e01a85a9d8911d8912fd922ede5fecc4dce437eb4b6c8d037e56"}, @@ -3450,6 +3596,7 @@ version = "1.17.2" description = "Yet another URL library" optional = false python-versions = ">=3.9" +groups = ["main"] files = [ {file = "yarl-1.17.2-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:93771146ef048b34201bfa382c2bf74c524980870bb278e6df515efaf93699ff"}, {file = "yarl-1.17.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:8281db240a1616af2f9c5f71d355057e73a1409c4648c8949901396dc0a3c151"}, @@ -3546,6 +3693,7 @@ version = "0.23.0" description = "Zstandard bindings for Python" optional = false python-versions = ">=3.8" +groups = ["main"] files = [ {file = "zstandard-0.23.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:bf0a05b6059c0528477fba9054d09179beb63744355cab9f38059548fedd46a9"}, {file = "zstandard-0.23.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:fc9ca1c9718cb3b06634c7c8dec57d24e9438b2aa9a0f02b8bb36bf478538880"}, @@ -3653,6 +3801,6 @@ cffi = {version = ">=1.11", markers = "platform_python_implementation == \"PyPy\ cffi = ["cffi (>=1.11)"] [metadata] -lock-version = "2.0" +lock-version = "2.1" python-versions = "^3.11" -content-hash = "e6904aca09abc6c805604b21a5702a97e0056406f9ec7469b091d35ee10a6b16" +content-hash = "4dc3165fe22c0e0f7a030ea0f8a680ae2ff74561d8658c393abbe9112caaf5d7" diff --git a/pyproject.toml b/pyproject.toml index 735d12d756..e299c421e9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -43,7 +43,7 @@ websockets = "^12.0" clickhouse-connect = "^0.7.16" kafka-python = "^2.0.2" jwcrypto = "^1.5.6" -h2 = "^4.1.0" +h2 = {git = "https://github.com/python-hyper/h2"} types-jwcrypto = "^1.5.0.20240925" pyyaml = "^6.0.2" types-pyyaml = "^6.0.12.20240917" @@ -94,6 +94,7 @@ target-version = "py311" extend-exclude = [ "vendor/", "target/", + "test_runner/stubs/", # Autogenerated by mypy's stubgen ] line-length = 100 # this setting is rather guidance, it won't fail if it can't make the shorter diff --git a/test_runner/stubs/h2/__init__.pyi b/test_runner/stubs/h2/__init__.pyi index e69de29bb2..bda5b5a7f4 100644 --- a/test_runner/stubs/h2/__init__.pyi +++ b/test_runner/stubs/h2/__init__.pyi @@ -0,0 +1 @@ +__version__: str diff --git 
a/test_runner/stubs/h2/config.pyi b/test_runner/stubs/h2/config.pyi index 710005db69..422344b981 100644 --- a/test_runner/stubs/h2/config.pyi +++ b/test_runner/stubs/h2/config.pyi @@ -1,11 +1,12 @@ from _typeshed import Incomplete +from typing import Any class _BooleanConfigOption: name: Incomplete attr_name: Incomplete - def __init__(self, name) -> None: ... - def __get__(self, instance, owner): ... - def __set__(self, instance, value) -> None: ... + def __init__(self, name: str) -> None: ... + def __get__(self, instance: Any, owner: Any) -> bool: ... + def __set__(self, instance: Any, value: bool) -> None: ... class DummyLogger: def __init__(self, *vargs) -> None: ... @@ -15,7 +16,7 @@ class DummyLogger: class OutputLogger: file: Incomplete trace_level: Incomplete - def __init__(self, file: Incomplete | None = ..., trace_level: bool = ...) -> None: ... + def __init__(self, file: Incomplete | None = None, trace_level: bool = False) -> None: ... def debug(self, fmtstr, *args) -> None: ... def trace(self, fmtstr, *args) -> None: ... @@ -23,20 +24,12 @@ class H2Configuration: client_side: Incomplete validate_outbound_headers: Incomplete normalize_outbound_headers: Incomplete + split_outbound_cookies: Incomplete validate_inbound_headers: Incomplete normalize_inbound_headers: Incomplete logger: Incomplete - def __init__( - self, - client_side: bool = ..., - header_encoding: Incomplete | None = ..., - validate_outbound_headers: bool = ..., - normalize_outbound_headers: bool = ..., - validate_inbound_headers: bool = ..., - normalize_inbound_headers: bool = ..., - logger: Incomplete | None = ..., - ) -> None: ... + def __init__(self, client_side: bool = True, header_encoding: bool | str | None = None, validate_outbound_headers: bool = True, normalize_outbound_headers: bool = True, split_outbound_cookies: bool = False, validate_inbound_headers: bool = True, normalize_inbound_headers: bool = True, logger: DummyLogger | OutputLogger | None = None) -> None: ... @property - def header_encoding(self): ... + def header_encoding(self) -> bool | str | None: ... @header_encoding.setter - def header_encoding(self, value) -> None: ... + def header_encoding(self, value: bool | str | None) -> None: ... 
diff --git a/test_runner/stubs/h2/connection.pyi b/test_runner/stubs/h2/connection.pyi index 04be18ca74..f7ec78a997 100644 --- a/test_runner/stubs/h2/connection.pyi +++ b/test_runner/stubs/h2/connection.pyi @@ -1,72 +1,55 @@ -from enum import Enum, IntEnum - -from _typeshed import Incomplete - from .config import H2Configuration as H2Configuration from .errors import ErrorCodes as ErrorCodes -from .events import AlternativeServiceAvailable as AlternativeServiceAvailable -from .events import ConnectionTerminated as ConnectionTerminated -from .events import PingAckReceived as PingAckReceived -from .events import PingReceived as PingReceived -from .events import PriorityUpdated as PriorityUpdated -from .events import RemoteSettingsChanged as RemoteSettingsChanged -from .events import SettingsAcknowledged as SettingsAcknowledged -from .events import UnknownFrameReceived as UnknownFrameReceived -from .events import WindowUpdated as WindowUpdated -from .exceptions import DenialOfServiceError as DenialOfServiceError -from .exceptions import FlowControlError as FlowControlError -from .exceptions import FrameTooLargeError as FrameTooLargeError -from .exceptions import NoAvailableStreamIDError as NoAvailableStreamIDError -from .exceptions import NoSuchStreamError as NoSuchStreamError -from .exceptions import ProtocolError as ProtocolError -from .exceptions import RFC1122Error as RFC1122Error -from .exceptions import StreamClosedError as StreamClosedError -from .exceptions import StreamIDTooLowError as StreamIDTooLowError -from .exceptions import TooManyStreamsError as TooManyStreamsError +from .events import AlternativeServiceAvailable as AlternativeServiceAvailable, ConnectionTerminated as ConnectionTerminated, Event as Event, InformationalResponseReceived as InformationalResponseReceived, PingAckReceived as PingAckReceived, PingReceived as PingReceived, PriorityUpdated as PriorityUpdated, RemoteSettingsChanged as RemoteSettingsChanged, RequestReceived as RequestReceived, ResponseReceived as ResponseReceived, SettingsAcknowledged as SettingsAcknowledged, TrailersReceived as TrailersReceived, UnknownFrameReceived as UnknownFrameReceived, WindowUpdated as WindowUpdated +from .exceptions import DenialOfServiceError as DenialOfServiceError, FlowControlError as FlowControlError, FrameTooLargeError as FrameTooLargeError, NoAvailableStreamIDError as NoAvailableStreamIDError, NoSuchStreamError as NoSuchStreamError, ProtocolError as ProtocolError, RFC1122Error as RFC1122Error, StreamClosedError as StreamClosedError, StreamIDTooLowError as StreamIDTooLowError, TooManyStreamsError as TooManyStreamsError from .frame_buffer import FrameBuffer as FrameBuffer -from .settings import SettingCodes as SettingCodes -from .settings import Settings as Settings -from .stream import H2Stream as H2Stream -from .stream import StreamClosedBy as StreamClosedBy -from .utilities import guard_increment_window as guard_increment_window +from .settings import ChangedSetting as ChangedSetting, SettingCodes as SettingCodes, Settings as Settings +from .stream import H2Stream as H2Stream, StreamClosedBy as StreamClosedBy +from .utilities import SizeLimitDict as SizeLimitDict, guard_increment_window as guard_increment_window from .windows import WindowManager as WindowManager +from _typeshed import Incomplete +from collections.abc import Iterable +from enum import Enum, IntEnum +from hpack.struct import Header as Header, HeaderWeaklyTyped as HeaderWeaklyTyped +from hyperframe.frame import Frame as Frame +from typing import Any class 
ConnectionState(Enum): - IDLE: int - CLIENT_OPEN: int - SERVER_OPEN: int - CLOSED: int + IDLE = 0 + CLIENT_OPEN = 1 + SERVER_OPEN = 2 + CLOSED = 3 class ConnectionInputs(Enum): - SEND_HEADERS: int - SEND_PUSH_PROMISE: int - SEND_DATA: int - SEND_GOAWAY: int - SEND_WINDOW_UPDATE: int - SEND_PING: int - SEND_SETTINGS: int - SEND_RST_STREAM: int - SEND_PRIORITY: int - RECV_HEADERS: int - RECV_PUSH_PROMISE: int - RECV_DATA: int - RECV_GOAWAY: int - RECV_WINDOW_UPDATE: int - RECV_PING: int - RECV_SETTINGS: int - RECV_RST_STREAM: int - RECV_PRIORITY: int - SEND_ALTERNATIVE_SERVICE: int - RECV_ALTERNATIVE_SERVICE: int + SEND_HEADERS = 0 + SEND_PUSH_PROMISE = 1 + SEND_DATA = 2 + SEND_GOAWAY = 3 + SEND_WINDOW_UPDATE = 4 + SEND_PING = 5 + SEND_SETTINGS = 6 + SEND_RST_STREAM = 7 + SEND_PRIORITY = 8 + RECV_HEADERS = 9 + RECV_PUSH_PROMISE = 10 + RECV_DATA = 11 + RECV_GOAWAY = 12 + RECV_WINDOW_UPDATE = 13 + RECV_PING = 14 + RECV_SETTINGS = 15 + RECV_RST_STREAM = 16 + RECV_PRIORITY = 17 + SEND_ALTERNATIVE_SERVICE = 18 + RECV_ALTERNATIVE_SERVICE = 19 class AllowedStreamIDs(IntEnum): - EVEN: int - ODD: int + EVEN = 0 + ODD = 1 class H2ConnectionStateMachine: state: Incomplete def __init__(self) -> None: ... - def process_input(self, input_): ... + def process_input(self, input_: ConnectionInputs) -> list[Event]: ... class H2Connection: DEFAULT_MAX_OUTBOUND_FRAME_SIZE: int @@ -88,55 +71,30 @@ class H2Connection: max_outbound_frame_size: Incomplete max_inbound_frame_size: Incomplete incoming_buffer: Incomplete - def __init__(self, config: Incomplete | None = ...) -> None: ... + def __init__(self, config: H2Configuration | None = None) -> None: ... @property - def open_outbound_streams(self): ... + def open_outbound_streams(self) -> int: ... @property - def open_inbound_streams(self): ... + def open_inbound_streams(self) -> int: ... @property - def inbound_flow_control_window(self): ... + def inbound_flow_control_window(self) -> int: ... def initiate_connection(self) -> None: ... - def initiate_upgrade_connection(self, settings_header: Incomplete | None = ...): ... - def get_next_available_stream_id(self): ... - def send_headers( - self, - stream_id, - headers, - end_stream: bool = ..., - priority_weight: Incomplete | None = ..., - priority_depends_on: Incomplete | None = ..., - priority_exclusive: Incomplete | None = ..., - ) -> None: ... - def send_data( - self, stream_id, data, end_stream: bool = ..., pad_length: Incomplete | None = ... - ) -> None: ... - def end_stream(self, stream_id) -> None: ... - def increment_flow_control_window( - self, increment, stream_id: Incomplete | None = ... - ) -> None: ... - def push_stream(self, stream_id, promised_stream_id, request_headers) -> None: ... - def ping(self, opaque_data) -> None: ... - def reset_stream(self, stream_id, error_code: int = ...) -> None: ... - def close_connection( - self, - error_code: int = ..., - additional_data: Incomplete | None = ..., - last_stream_id: Incomplete | None = ..., - ) -> None: ... - def update_settings(self, new_settings) -> None: ... - def advertise_alternative_service( - self, field_value, origin: Incomplete | None = ..., stream_id: Incomplete | None = ... - ) -> None: ... - def prioritize( - self, - stream_id, - weight: Incomplete | None = ..., - depends_on: Incomplete | None = ..., - exclusive: Incomplete | None = ..., - ) -> None: ... - def local_flow_control_window(self, stream_id): ... - def remote_flow_control_window(self, stream_id): ... - def acknowledge_received_data(self, acknowledged_size, stream_id) -> None: ... 
- def data_to_send(self, amount: Incomplete | None = ...): ... + def initiate_upgrade_connection(self, settings_header: bytes | None = None) -> bytes | None: ... + def get_next_available_stream_id(self) -> int: ... + def send_headers(self, stream_id: int, headers: Iterable[HeaderWeaklyTyped], end_stream: bool = False, priority_weight: int | None = None, priority_depends_on: int | None = None, priority_exclusive: bool | None = None) -> None: ... + def send_data(self, stream_id: int, data: bytes | memoryview, end_stream: bool = False, pad_length: Any = None) -> None: ... + def end_stream(self, stream_id: int) -> None: ... + def increment_flow_control_window(self, increment: int, stream_id: int | None = None) -> None: ... + def push_stream(self, stream_id: int, promised_stream_id: int, request_headers: Iterable[HeaderWeaklyTyped]) -> None: ... + def ping(self, opaque_data: bytes | str) -> None: ... + def reset_stream(self, stream_id: int, error_code: ErrorCodes | int = 0) -> None: ... + def close_connection(self, error_code: ErrorCodes | int = 0, additional_data: bytes | None = None, last_stream_id: int | None = None) -> None: ... + def update_settings(self, new_settings: dict[SettingCodes | int, int]) -> None: ... + def advertise_alternative_service(self, field_value: bytes | str, origin: bytes | None = None, stream_id: int | None = None) -> None: ... + def prioritize(self, stream_id: int, weight: int | None = None, depends_on: int | None = None, exclusive: bool | None = None) -> None: ... + def local_flow_control_window(self, stream_id: int) -> int: ... + def remote_flow_control_window(self, stream_id: int) -> int: ... + def acknowledge_received_data(self, acknowledged_size: int, stream_id: int) -> None: ... + def data_to_send(self, amount: int | None = None) -> bytes: ... def clear_outbound_data_buffer(self) -> None: ... - def receive_data(self, data): ... + def receive_data(self, data: bytes) -> list[Event]: ... diff --git a/test_runner/stubs/h2/errors.pyi b/test_runner/stubs/h2/errors.pyi index b70c632f8c..7cf77bd833 100644 --- a/test_runner/stubs/h2/errors.pyi +++ b/test_runner/stubs/h2/errors.pyi @@ -1,17 +1,19 @@ import enum +__all__ = ['ErrorCodes'] + class ErrorCodes(enum.IntEnum): - NO_ERROR: int - PROTOCOL_ERROR: int - INTERNAL_ERROR: int - FLOW_CONTROL_ERROR: int - SETTINGS_TIMEOUT: int - STREAM_CLOSED: int - FRAME_SIZE_ERROR: int - REFUSED_STREAM: int - CANCEL: int - COMPRESSION_ERROR: int - CONNECT_ERROR: int - ENHANCE_YOUR_CALM: int - INADEQUATE_SECURITY: int - HTTP_1_1_REQUIRED: int + NO_ERROR = 0 + PROTOCOL_ERROR = 1 + INTERNAL_ERROR = 2 + FLOW_CONTROL_ERROR = 3 + SETTINGS_TIMEOUT = 4 + STREAM_CLOSED = 5 + FRAME_SIZE_ERROR = 6 + REFUSED_STREAM = 7 + CANCEL = 8 + COMPRESSION_ERROR = 9 + CONNECT_ERROR = 10 + ENHANCE_YOUR_CALM = 11 + INADEQUATE_SECURITY = 12 + HTTP_1_1_REQUIRED = 13 diff --git a/test_runner/stubs/h2/events.pyi b/test_runner/stubs/h2/events.pyi index 75d0a9e53b..a086db38b3 100644 --- a/test_runner/stubs/h2/events.pyi +++ b/test_runner/stubs/h2/events.pyi @@ -1,6 +1,8 @@ +from .errors import ErrorCodes as ErrorCodes +from .settings import ChangedSetting as ChangedSetting, SettingCodes as SettingCodes, Settings as Settings from _typeshed import Incomplete - -from .settings import ChangedSetting as ChangedSetting +from hpack import HeaderTuple as HeaderTuple +from hyperframe.frame import Frame as Frame class Event: ... @@ -53,7 +55,7 @@ class RemoteSettingsChanged(Event): changed_settings: Incomplete def __init__(self) -> None: ... 
@classmethod - def from_settings(cls, old_settings, new_settings): ... + def from_settings(cls, old_settings: Settings | dict[int, int], new_settings: dict[int, int]) -> RemoteSettingsChanged: ... class PingReceived(Event): ping_data: Incomplete diff --git a/test_runner/stubs/h2/exceptions.pyi b/test_runner/stubs/h2/exceptions.pyi index 82019d5ec1..7149b46521 100644 --- a/test_runner/stubs/h2/exceptions.pyi +++ b/test_runner/stubs/h2/exceptions.pyi @@ -1,3 +1,4 @@ +from .errors import ErrorCodes as ErrorCodes from _typeshed import Incomplete class H2Error(Exception): ... @@ -19,27 +20,27 @@ class FlowControlError(ProtocolError): class StreamIDTooLowError(ProtocolError): stream_id: Incomplete max_stream_id: Incomplete - def __init__(self, stream_id, max_stream_id) -> None: ... + def __init__(self, stream_id: int, max_stream_id: int) -> None: ... class NoAvailableStreamIDError(ProtocolError): ... class NoSuchStreamError(ProtocolError): stream_id: Incomplete - def __init__(self, stream_id) -> None: ... + def __init__(self, stream_id: int) -> None: ... class StreamClosedError(NoSuchStreamError): stream_id: Incomplete error_code: Incomplete - def __init__(self, stream_id) -> None: ... + def __init__(self, stream_id: int) -> None: ... class InvalidSettingsValueError(ProtocolError, ValueError): error_code: Incomplete - def __init__(self, msg, error_code) -> None: ... + def __init__(self, msg: str, error_code: ErrorCodes) -> None: ... class InvalidBodyLengthError(ProtocolError): expected_length: Incomplete actual_length: Incomplete - def __init__(self, expected, actual) -> None: ... + def __init__(self, expected: int, actual: int) -> None: ... class UnsupportedFrameError(ProtocolError): ... class RFC1122Error(H2Error): ... diff --git a/test_runner/stubs/h2/frame_buffer.pyi b/test_runner/stubs/h2/frame_buffer.pyi index f47adab704..90746f63c1 100644 --- a/test_runner/stubs/h2/frame_buffer.pyi +++ b/test_runner/stubs/h2/frame_buffer.pyi @@ -1,19 +1,12 @@ -from .exceptions import ( - FrameDataMissingError as FrameDataMissingError, -) -from .exceptions import ( - FrameTooLargeError as FrameTooLargeError, -) -from .exceptions import ( - ProtocolError as ProtocolError, -) +from .exceptions import FrameDataMissingError as FrameDataMissingError, FrameTooLargeError as FrameTooLargeError, ProtocolError as ProtocolError +from hyperframe.frame import Frame CONTINUATION_BACKLOG: int class FrameBuffer: data: bytes max_frame_size: int - def __init__(self, server: bool = ...) -> None: ... - def add_data(self, data) -> None: ... - def __iter__(self): ... - def __next__(self): ... + def __init__(self, server: bool = False) -> None: ... + def add_data(self, data: bytes) -> None: ... + def __iter__(self) -> FrameBuffer: ... + def __next__(self) -> Frame: ... 
diff --git a/test_runner/stubs/h2/settings.pyi b/test_runner/stubs/h2/settings.pyi index a352abe53e..c3920f9969 100644 --- a/test_runner/stubs/h2/settings.pyi +++ b/test_runner/stubs/h2/settings.pyi @@ -1,61 +1,59 @@ import enum -from collections.abc import MutableMapping -from typing import Any - +from .errors import ErrorCodes as ErrorCodes +from .exceptions import InvalidSettingsValueError as InvalidSettingsValueError from _typeshed import Incomplete -from h2.errors import ErrorCodes as ErrorCodes -from h2.exceptions import InvalidSettingsValueError as InvalidSettingsValueError +from collections.abc import Iterator, MutableMapping class SettingCodes(enum.IntEnum): - HEADER_TABLE_SIZE: Incomplete - ENABLE_PUSH: Incomplete - MAX_CONCURRENT_STREAMS: Incomplete - INITIAL_WINDOW_SIZE: Incomplete - MAX_FRAME_SIZE: Incomplete - MAX_HEADER_LIST_SIZE: Incomplete - ENABLE_CONNECT_PROTOCOL: Incomplete + HEADER_TABLE_SIZE = ... + ENABLE_PUSH = ... + MAX_CONCURRENT_STREAMS = ... + INITIAL_WINDOW_SIZE = ... + MAX_FRAME_SIZE = ... + MAX_HEADER_LIST_SIZE = ... + ENABLE_CONNECT_PROTOCOL = ... class ChangedSetting: setting: Incomplete original_value: Incomplete new_value: Incomplete - def __init__(self, setting, original_value, new_value) -> None: ... + def __init__(self, setting: SettingCodes | int, original_value: int | None, new_value: int) -> None: ... -class Settings(MutableMapping[str, Any]): - def __init__(self, client: bool = ..., initial_values: Incomplete | None = ...) -> None: ... - def acknowledge(self): ... +class Settings(MutableMapping[SettingCodes | int, int]): + def __init__(self, client: bool = True, initial_values: dict[SettingCodes, int] | None = None) -> None: ... + def acknowledge(self) -> dict[SettingCodes | int, ChangedSetting]: ... @property - def header_table_size(self): ... + def header_table_size(self) -> int: ... @header_table_size.setter - def header_table_size(self, value) -> None: ... + def header_table_size(self, value: int) -> None: ... @property - def enable_push(self): ... + def enable_push(self) -> int: ... @enable_push.setter - def enable_push(self, value) -> None: ... + def enable_push(self, value: int) -> None: ... @property - def initial_window_size(self): ... + def initial_window_size(self) -> int: ... @initial_window_size.setter - def initial_window_size(self, value) -> None: ... + def initial_window_size(self, value: int) -> None: ... @property - def max_frame_size(self): ... + def max_frame_size(self) -> int: ... @max_frame_size.setter - def max_frame_size(self, value) -> None: ... + def max_frame_size(self, value: int) -> None: ... @property - def max_concurrent_streams(self): ... + def max_concurrent_streams(self) -> int: ... @max_concurrent_streams.setter - def max_concurrent_streams(self, value) -> None: ... + def max_concurrent_streams(self, value: int) -> None: ... @property - def max_header_list_size(self): ... + def max_header_list_size(self) -> int | None: ... @max_header_list_size.setter - def max_header_list_size(self, value) -> None: ... + def max_header_list_size(self, value: int) -> None: ... @property - def enable_connect_protocol(self): ... + def enable_connect_protocol(self) -> int: ... @enable_connect_protocol.setter - def enable_connect_protocol(self, value) -> None: ... - def __getitem__(self, key): ... - def __setitem__(self, key, value) -> None: ... - def __delitem__(self, key) -> None: ... - def __iter__(self): ... + def enable_connect_protocol(self, value: int) -> None: ... + def __getitem__(self, key: SettingCodes | int) -> int: ... 
+ def __setitem__(self, key: SettingCodes | int, value: int) -> None: ... + def __delitem__(self, key: SettingCodes | int) -> None: ... + def __iter__(self) -> Iterator[SettingCodes | int]: ... def __len__(self) -> int: ... - def __eq__(self, other): ... - def __ne__(self, other): ... + def __eq__(self, other: object) -> bool: ... + def __ne__(self, other: object) -> bool: ... diff --git a/test_runner/stubs/h2/stream.pyi b/test_runner/stubs/h2/stream.pyi index d52ab8e72b..89171da981 100644 --- a/test_runner/stubs/h2/stream.pyi +++ b/test_runner/stubs/h2/stream.pyi @@ -1,114 +1,52 @@ -from enum import Enum, IntEnum - -from _typeshed import Incomplete - +from .config import H2Configuration as H2Configuration from .errors import ErrorCodes as ErrorCodes -from .events import ( - AlternativeServiceAvailable as AlternativeServiceAvailable, -) -from .events import ( - DataReceived as DataReceived, -) -from .events import ( - InformationalResponseReceived as InformationalResponseReceived, -) -from .events import ( - PushedStreamReceived as PushedStreamReceived, -) -from .events import ( - RequestReceived as RequestReceived, -) -from .events import ( - ResponseReceived as ResponseReceived, -) -from .events import ( - StreamEnded as StreamEnded, -) -from .events import ( - StreamReset as StreamReset, -) -from .events import ( - TrailersReceived as TrailersReceived, -) -from .events import ( - WindowUpdated as WindowUpdated, -) -from .exceptions import ( - FlowControlError as FlowControlError, -) -from .exceptions import ( - InvalidBodyLengthError as InvalidBodyLengthError, -) -from .exceptions import ( - ProtocolError as ProtocolError, -) -from .exceptions import ( - StreamClosedError as StreamClosedError, -) -from .utilities import ( - HeaderValidationFlags as HeaderValidationFlags, -) -from .utilities import ( - authority_from_headers as authority_from_headers, -) -from .utilities import ( - extract_method_header as extract_method_header, -) -from .utilities import ( - guard_increment_window as guard_increment_window, -) -from .utilities import ( - is_informational_response as is_informational_response, -) -from .utilities import ( - normalize_inbound_headers as normalize_inbound_headers, -) -from .utilities import ( - normalize_outbound_headers as normalize_outbound_headers, -) -from .utilities import ( - validate_headers as validate_headers, -) -from .utilities import ( - validate_outbound_headers as validate_outbound_headers, -) +from .events import AlternativeServiceAvailable as AlternativeServiceAvailable, DataReceived as DataReceived, Event as Event, InformationalResponseReceived as InformationalResponseReceived, PushedStreamReceived as PushedStreamReceived, RequestReceived as RequestReceived, ResponseReceived as ResponseReceived, StreamEnded as StreamEnded, StreamReset as StreamReset, TrailersReceived as TrailersReceived, WindowUpdated as WindowUpdated +from .exceptions import FlowControlError as FlowControlError, InvalidBodyLengthError as InvalidBodyLengthError, ProtocolError as ProtocolError, StreamClosedError as StreamClosedError +from .utilities import HeaderValidationFlags as HeaderValidationFlags, authority_from_headers as authority_from_headers, extract_method_header as extract_method_header, guard_increment_window as guard_increment_window, is_informational_response as is_informational_response, normalize_inbound_headers as normalize_inbound_headers, normalize_outbound_headers as normalize_outbound_headers, utf8_encode_headers as utf8_encode_headers, validate_headers as 
validate_headers, validate_outbound_headers as validate_outbound_headers from .windows import WindowManager as WindowManager +from _typeshed import Incomplete +from collections.abc import Iterable +from enum import Enum, IntEnum +from hpack.hpack import Encoder as Encoder +from hpack.struct import Header as Header, HeaderWeaklyTyped as HeaderWeaklyTyped +from hyperframe.frame import AltSvcFrame, ContinuationFrame, Frame as Frame, HeadersFrame, PushPromiseFrame, RstStreamFrame +from typing import Any class StreamState(IntEnum): - IDLE: int - RESERVED_REMOTE: int - RESERVED_LOCAL: int - OPEN: int - HALF_CLOSED_REMOTE: int - HALF_CLOSED_LOCAL: int - CLOSED: int + IDLE = 0 + RESERVED_REMOTE = 1 + RESERVED_LOCAL = 2 + OPEN = 3 + HALF_CLOSED_REMOTE = 4 + HALF_CLOSED_LOCAL = 5 + CLOSED = 6 class StreamInputs(Enum): - SEND_HEADERS: int - SEND_PUSH_PROMISE: int - SEND_RST_STREAM: int - SEND_DATA: int - SEND_WINDOW_UPDATE: int - SEND_END_STREAM: int - RECV_HEADERS: int - RECV_PUSH_PROMISE: int - RECV_RST_STREAM: int - RECV_DATA: int - RECV_WINDOW_UPDATE: int - RECV_END_STREAM: int - RECV_CONTINUATION: int - SEND_INFORMATIONAL_HEADERS: int - RECV_INFORMATIONAL_HEADERS: int - SEND_ALTERNATIVE_SERVICE: int - RECV_ALTERNATIVE_SERVICE: int - UPGRADE_CLIENT: int - UPGRADE_SERVER: int + SEND_HEADERS = 0 + SEND_PUSH_PROMISE = 1 + SEND_RST_STREAM = 2 + SEND_DATA = 3 + SEND_WINDOW_UPDATE = 4 + SEND_END_STREAM = 5 + RECV_HEADERS = 6 + RECV_PUSH_PROMISE = 7 + RECV_RST_STREAM = 8 + RECV_DATA = 9 + RECV_WINDOW_UPDATE = 10 + RECV_END_STREAM = 11 + RECV_CONTINUATION = 12 + SEND_INFORMATIONAL_HEADERS = 13 + RECV_INFORMATIONAL_HEADERS = 14 + SEND_ALTERNATIVE_SERVICE = 15 + RECV_ALTERNATIVE_SERVICE = 16 + UPGRADE_CLIENT = 17 + UPGRADE_SERVER = 18 class StreamClosedBy(Enum): - SEND_END_STREAM: int - RECV_END_STREAM: int - SEND_RST_STREAM: int - RECV_RST_STREAM: int + SEND_END_STREAM = 0 + RECV_END_STREAM = 1 + SEND_RST_STREAM = 2 + RECV_RST_STREAM = 3 STREAM_OPEN: Incomplete @@ -121,32 +59,32 @@ class H2StreamStateMachine: headers_received: Incomplete trailers_received: Incomplete stream_closed_by: Incomplete - def __init__(self, stream_id) -> None: ... - def process_input(self, input_): ... - def request_sent(self, previous_state): ... - def response_sent(self, previous_state): ... - def request_received(self, previous_state): ... - def response_received(self, previous_state): ... - def data_received(self, previous_state): ... - def window_updated(self, previous_state): ... - def stream_half_closed(self, previous_state): ... - def stream_ended(self, previous_state): ... - def stream_reset(self, previous_state): ... - def send_new_pushed_stream(self, previous_state): ... - def recv_new_pushed_stream(self, previous_state): ... - def send_push_promise(self, previous_state): ... - def recv_push_promise(self, previous_state): ... - def send_end_stream(self, previous_state) -> None: ... - def send_reset_stream(self, previous_state) -> None: ... - def reset_stream_on_error(self, previous_state) -> None: ... - def recv_on_closed_stream(self, previous_state) -> None: ... - def send_on_closed_stream(self, previous_state) -> None: ... - def recv_push_on_closed_stream(self, previous_state) -> None: ... - def send_push_on_closed_stream(self, previous_state) -> None: ... - def send_informational_response(self, previous_state): ... - def recv_informational_response(self, previous_state): ... - def recv_alt_svc(self, previous_state): ... - def send_alt_svc(self, previous_state) -> None: ... 
+ def __init__(self, stream_id: int) -> None: ... + def process_input(self, input_: StreamInputs) -> Any: ... + def request_sent(self, previous_state: StreamState) -> list[Event]: ... + def response_sent(self, previous_state: StreamState) -> list[Event]: ... + def request_received(self, previous_state: StreamState) -> list[Event]: ... + def response_received(self, previous_state: StreamState) -> list[Event]: ... + def data_received(self, previous_state: StreamState) -> list[Event]: ... + def window_updated(self, previous_state: StreamState) -> list[Event]: ... + def stream_half_closed(self, previous_state: StreamState) -> list[Event]: ... + def stream_ended(self, previous_state: StreamState) -> list[Event]: ... + def stream_reset(self, previous_state: StreamState) -> list[Event]: ... + def send_new_pushed_stream(self, previous_state: StreamState) -> list[Event]: ... + def recv_new_pushed_stream(self, previous_state: StreamState) -> list[Event]: ... + def send_push_promise(self, previous_state: StreamState) -> list[Event]: ... + def recv_push_promise(self, previous_state: StreamState) -> list[Event]: ... + def send_end_stream(self, previous_state: StreamState) -> None: ... + def send_reset_stream(self, previous_state: StreamState) -> None: ... + def reset_stream_on_error(self, previous_state: StreamState) -> None: ... + def recv_on_closed_stream(self, previous_state: StreamState) -> None: ... + def send_on_closed_stream(self, previous_state: StreamState) -> None: ... + def recv_push_on_closed_stream(self, previous_state: StreamState) -> None: ... + def send_push_on_closed_stream(self, previous_state: StreamState) -> None: ... + def send_informational_response(self, previous_state: StreamState) -> list[Event]: ... + def recv_informational_response(self, previous_state: StreamState) -> list[Event]: ... + def recv_alt_svc(self, previous_state: StreamState) -> list[Event]: ... + def send_alt_svc(self, previous_state: StreamState) -> None: ... class H2Stream: state_machine: Incomplete @@ -155,30 +93,30 @@ class H2Stream: request_method: Incomplete outbound_flow_control_window: Incomplete config: Incomplete - def __init__(self, stream_id, config, inbound_window_size, outbound_window_size) -> None: ... + def __init__(self, stream_id: int, config: H2Configuration, inbound_window_size: int, outbound_window_size: int) -> None: ... @property - def inbound_flow_control_window(self): ... + def inbound_flow_control_window(self) -> int: ... @property - def open(self): ... + def open(self) -> bool: ... @property - def closed(self): ... + def closed(self) -> bool: ... @property - def closed_by(self): ... - def upgrade(self, client_side) -> None: ... - def send_headers(self, headers, encoder, end_stream: bool = ...): ... - def push_stream_in_band(self, related_stream_id, headers, encoder): ... - def locally_pushed(self): ... - def send_data(self, data, end_stream: bool = ..., pad_length: Incomplete | None = ...): ... - def end_stream(self): ... - def advertise_alternative_service(self, field_value): ... - def increase_flow_control_window(self, increment): ... - def receive_push_promise_in_band(self, promised_stream_id, headers, header_encoding): ... - def remotely_pushed(self, pushed_headers): ... - def receive_headers(self, headers, end_stream, header_encoding): ... - def receive_data(self, data, end_stream, flow_control_len): ... - def receive_window_update(self, increment): ... + def closed_by(self) -> StreamClosedBy | None: ... + def upgrade(self, client_side: bool) -> None: ... 
+ def send_headers(self, headers: Iterable[HeaderWeaklyTyped], encoder: Encoder, end_stream: bool = False) -> list[HeadersFrame | ContinuationFrame | PushPromiseFrame]: ... + def push_stream_in_band(self, related_stream_id: int, headers: Iterable[HeaderWeaklyTyped], encoder: Encoder) -> list[HeadersFrame | ContinuationFrame | PushPromiseFrame]: ... + def locally_pushed(self) -> list[Frame]: ... + def send_data(self, data: bytes | memoryview, end_stream: bool = False, pad_length: int | None = None) -> list[Frame]: ... + def end_stream(self) -> list[Frame]: ... + def advertise_alternative_service(self, field_value: bytes) -> list[Frame]: ... + def increase_flow_control_window(self, increment: int) -> list[Frame]: ... + def receive_push_promise_in_band(self, promised_stream_id: int, headers: Iterable[Header], header_encoding: bool | str | None) -> tuple[list[Frame], list[Event]]: ... + def remotely_pushed(self, pushed_headers: Iterable[Header]) -> tuple[list[Frame], list[Event]]: ... + def receive_headers(self, headers: Iterable[Header], end_stream: bool, header_encoding: bool | str | None) -> tuple[list[Frame], list[Event]]: ... + def receive_data(self, data: bytes, end_stream: bool, flow_control_len: int) -> tuple[list[Frame], list[Event]]: ... + def receive_window_update(self, increment: int) -> tuple[list[Frame], list[Event]]: ... def receive_continuation(self) -> None: ... - def receive_alt_svc(self, frame): ... - def reset_stream(self, error_code: int = ...): ... - def stream_reset(self, frame): ... - def acknowledge_received_data(self, acknowledged_size): ... + def receive_alt_svc(self, frame: AltSvcFrame) -> tuple[list[Frame], list[Event]]: ... + def reset_stream(self, error_code: ErrorCodes | int = 0) -> list[Frame]: ... + def stream_reset(self, frame: RstStreamFrame) -> tuple[list[Frame], list[Event]]: ... + def acknowledge_received_data(self, acknowledged_size: int) -> list[Frame]: ... diff --git a/test_runner/stubs/h2/utilities.pyi b/test_runner/stubs/h2/utilities.pyi index e0a8d55d1d..8802087e4c 100644 --- a/test_runner/stubs/h2/utilities.pyi +++ b/test_runner/stubs/h2/utilities.pyi @@ -1,25 +1,32 @@ -from typing import NamedTuple - +import collections +from .exceptions import FlowControlError as FlowControlError, ProtocolError as ProtocolError from _typeshed import Incomplete - -from .exceptions import FlowControlError as FlowControlError -from .exceptions import ProtocolError as ProtocolError +from collections.abc import Generator, Iterable +from hpack.struct import Header as Header, HeaderWeaklyTyped as HeaderWeaklyTyped +from typing import Any, NamedTuple UPPER_RE: Incomplete +SIGIL: Incomplete +INFORMATIONAL_START: Incomplete CONNECTION_HEADERS: Incomplete -def extract_method_header(headers): ... -def is_informational_response(headers): ... -def guard_increment_window(current, increment): ... -def authority_from_headers(headers): ... +def extract_method_header(headers: Iterable[Header]) -> bytes | None: ... +def is_informational_response(headers: Iterable[Header]) -> bool: ... +def guard_increment_window(current: int, increment: int) -> int: ... +def authority_from_headers(headers: Iterable[Header]) -> bytes | None: ... class HeaderValidationFlags(NamedTuple): - is_client: Incomplete - is_trailer: Incomplete - is_response_header: Incomplete - is_push_promise: Incomplete + is_client: bool + is_trailer: bool + is_response_header: bool + is_push_promise: bool -def validate_headers(headers, hdr_validation_flags): ... 
-def normalize_outbound_headers(headers, hdr_validation_flags): ...
-def normalize_inbound_headers(headers, hdr_validation_flags): ...
-def validate_outbound_headers(headers, hdr_validation_flags): ...
+def validate_headers(headers: Iterable[Header], hdr_validation_flags: HeaderValidationFlags) -> Iterable[Header]: ...
+def utf8_encode_headers(headers: Iterable[HeaderWeaklyTyped]) -> list[Header]: ...
+def normalize_outbound_headers(headers: Iterable[Header], hdr_validation_flags: HeaderValidationFlags | None, should_split_outbound_cookies: bool = False) -> Generator[Header, None, None]: ...
+def normalize_inbound_headers(headers: Iterable[Header], hdr_validation_flags: HeaderValidationFlags) -> Generator[Header, None, None]: ...
+def validate_outbound_headers(headers: Iterable[Header], hdr_validation_flags: HeaderValidationFlags) -> Generator[Header, None, None]: ...
+
+class SizeLimitDict(collections.OrderedDict[int, Any]):
+    def __init__(self, *args: dict[int, int], **kwargs: Any) -> None: ...
+    def __setitem__(self, key: int, value: Any | int) -> None: ...
diff --git a/test_runner/stubs/h2/windows.pyi b/test_runner/stubs/h2/windows.pyi
index 7dc78e431c..b132ee610c 100644
--- a/test_runner/stubs/h2/windows.pyi
+++ b/test_runner/stubs/h2/windows.pyi
@@ -1,13 +1,12 @@
-from _typeshed import Incomplete
-
 from .exceptions import FlowControlError as FlowControlError
+from _typeshed import Incomplete

 LARGEST_FLOW_CONTROL_WINDOW: Incomplete

 class WindowManager:
     max_window_size: Incomplete
     current_window_size: Incomplete
-    def __init__(self, max_window_size) -> None: ...
-    def window_consumed(self, size) -> None: ...
-    def window_opened(self, size) -> None: ...
-    def process_bytes(self, size): ...
+    def __init__(self, max_window_size: int) -> None: ...
+    def window_consumed(self, size: int) -> None: ...
+    def window_opened(self, size: int) -> None: ...
+    def process_bytes(self, size: int) -> int | None: ...

From 9f1408fdf3c4cc79ad3e1810c4e4c76a80a695a9 Mon Sep 17 00:00:00 2001
From: Konstantin Knizhnik
Date: Fri, 24 Jan 2025 16:57:32 +0200
Subject: [PATCH 09/72] Do not assign max(lsn) to maxLastWrittenLsn in SetLastWrittenLSNForBlockv (#10474)

## Problem

See https://github.com/neondatabase/neon/issues/10281

`SetLastWrittenLSNForBlockv` is assigning max(lsn) to `maxLastWrittenLsn`,
while it should contain only the max LSN of blocks not present in the LwLSN
cache. This causes unnecessary waits in the PS.

## Summary of changes

Restore the status quo for pg17.
Related Postgres PR: https://github.com/neondatabase/postgres/pull/563 --------- Co-authored-by: Konstantin Knizhnik --- vendor/postgres-v17 | 2 +- vendor/revisions.json | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/vendor/postgres-v17 b/vendor/postgres-v17 index 4276717f6e..46f9b96555 160000 --- a/vendor/postgres-v17 +++ b/vendor/postgres-v17 @@ -1 +1 @@ -Subproject commit 4276717f6e91023e504de355f4f21d4824074de8 +Subproject commit 46f9b96555e084c35dd975da9485996db9e86181 diff --git a/vendor/revisions.json b/vendor/revisions.json index dba0e67fb4..3aa42d22c5 100644 --- a/vendor/revisions.json +++ b/vendor/revisions.json @@ -1,7 +1,7 @@ { "v17": [ "17.2", - "4276717f6e91023e504de355f4f21d4824074de8" + "46f9b96555e084c35dd975da9485996db9e86181" ], "v16": [ "16.6", From be718ed12194421b9867f1d8357fd60b6cec2da6 Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Sat, 25 Jan 2025 17:51:54 +0100 Subject: [PATCH 10/72] pageserver: disable L0 flush stalls, tune delay threshold (#10507) ## Problem In ingest benchmarks, we see L0 compaction delays of over 10 minutes due to image compaction. We can't stall L0 flushes for that long. ## Summary of changes Disable L0 flush stalls, and bump the default L0 flush delay threshold from 20 to 30 L0 layers. --- pageserver/src/tenant/timeline.rs | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index ee43512501..07689b5e76 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -2168,8 +2168,8 @@ impl Timeline { } fn get_l0_flush_delay_threshold(&self) -> Option { - // Default to delay L0 flushes at 2x compaction threshold. - const DEFAULT_L0_FLUSH_DELAY_FACTOR: usize = 2; + // Default to delay L0 flushes at 3x compaction threshold. + const DEFAULT_L0_FLUSH_DELAY_FACTOR: usize = 3; // If compaction is disabled, don't delay. if self.get_compaction_period() == Duration::ZERO { @@ -2197,8 +2197,10 @@ impl Timeline { } fn get_l0_flush_stall_threshold(&self) -> Option { - // Default to stall L0 flushes at 4x compaction threshold. - const DEFAULT_L0_FLUSH_STALL_FACTOR: usize = 4; + // Default to stall L0 flushes at 5x compaction threshold. + // TODO: stalls are temporarily disabled by default, see below. + #[allow(unused)] + const DEFAULT_L0_FLUSH_STALL_FACTOR: usize = 5; // If compaction is disabled, don't stall. if self.get_compaction_period() == Duration::ZERO { @@ -2230,8 +2232,13 @@ impl Timeline { return None; } - let l0_flush_stall_threshold = l0_flush_stall_threshold - .unwrap_or(DEFAULT_L0_FLUSH_STALL_FACTOR * compaction_threshold); + // Disable stalls by default. In ingest benchmarks, we see image compaction take >10 + // minutes, blocking L0 compaction, and we can't stall L0 flushes for that long. + // + // TODO: fix this. + // let l0_flush_stall_threshold = l0_flush_stall_threshold + // .unwrap_or(DEFAULT_L0_FLUSH_STALL_FACTOR * compaction_threshold); + let l0_flush_stall_threshold = l0_flush_stall_threshold?; // 0 disables backpressure. 
if l0_flush_stall_threshold == 0 { From 4dd4096f11b4dd214b143c1a4bda739a37a9abdb Mon Sep 17 00:00:00 2001 From: Mikhail Kot Date: Mon, 27 Jan 2025 14:09:21 +0000 Subject: [PATCH 11/72] Pgbouncer exporter in compute image (#10503) https://github.com/neondatabase/cloud/issues/19081 Include pgbouncer_exporter in compute image and run it at port 9127 --- compute/compute-node.Dockerfile | 4 +++- compute/etc/pgbouncer.ini | 2 ++ compute/vm-image-spec-bookworm.yaml | 4 ++++ compute/vm-image-spec-bullseye.yaml | 4 ++++ 4 files changed, 13 insertions(+), 1 deletion(-) diff --git a/compute/compute-node.Dockerfile b/compute/compute-node.Dockerfile index a80c701b45..539135470e 100644 --- a/compute/compute-node.Dockerfile +++ b/compute/compute-node.Dockerfile @@ -1266,11 +1266,12 @@ RUN set -e \ ######################################################################################### # -# Layers "postgres-exporter" and "sql-exporter" +# Layers "postgres-exporter", "pgbouncer-exporter", and "sql-exporter" # ######################################################################################### FROM quay.io/prometheuscommunity/postgres-exporter:v0.16.0 AS postgres-exporter +FROM quay.io/prometheuscommunity/pgbouncer-exporter:v0.10.2 AS pgbouncer-exporter # Keep the version the same as in build-tools.Dockerfile and # test_runner/regress/test_compute_metrics.py. @@ -1402,6 +1403,7 @@ RUN mkdir -p /etc/local_proxy && chown postgres:postgres /etc/local_proxy # Metrics exporter binaries and configuration files COPY --from=postgres-exporter /bin/postgres_exporter /bin/postgres_exporter +COPY --from=pgbouncer-exporter /bin/pgbouncer_exporter /bin/pgbouncer_exporter COPY --from=sql-exporter /bin/sql_exporter /bin/sql_exporter COPY --chown=postgres compute/etc/postgres_exporter.yml /etc/postgres_exporter.yml diff --git a/compute/etc/pgbouncer.ini b/compute/etc/pgbouncer.ini index 604b4e41ea..9d68cbb8d5 100644 --- a/compute/etc/pgbouncer.ini +++ b/compute/etc/pgbouncer.ini @@ -19,6 +19,8 @@ max_prepared_statements=0 admin_users=postgres unix_socket_dir=/tmp/ unix_socket_mode=0777 +; required for pgbouncer_exporter +ignore_startup_parameters=extra_float_digits ;; Disable connection logging. It produces a lot of logs that no one looks at, ;; and we can get similar log entries from the proxy too. 
We had incidents in diff --git a/compute/vm-image-spec-bookworm.yaml b/compute/vm-image-spec-bookworm.yaml index ac9f5c6904..005143fff3 100644 --- a/compute/vm-image-spec-bookworm.yaml +++ b/compute/vm-image-spec-bookworm.yaml @@ -27,6 +27,10 @@ commands: user: nobody sysvInitAction: respawn shell: 'DATA_SOURCE_NAME="user=cloud_admin sslmode=disable dbname=postgres application_name=postgres-exporter" /bin/postgres_exporter --config.file=/etc/postgres_exporter.yml' + - name: pgbouncer-exporter + user: postgres + sysvInitAction: respawn + shell: '/bin/pgbouncer_exporter --pgBouncer.connectionString="postgres:///pgbouncer?host=/tmp&port=6432&dbname=pgbouncer&user=pgbouncer"' - name: sql-exporter user: nobody sysvInitAction: respawn diff --git a/compute/vm-image-spec-bullseye.yaml b/compute/vm-image-spec-bullseye.yaml index 0d178e1c24..2fe50c3a45 100644 --- a/compute/vm-image-spec-bullseye.yaml +++ b/compute/vm-image-spec-bullseye.yaml @@ -27,6 +27,10 @@ commands: user: nobody sysvInitAction: respawn shell: 'DATA_SOURCE_NAME="user=cloud_admin sslmode=disable dbname=postgres application_name=postgres-exporter" /bin/postgres_exporter --config.file=/etc/postgres_exporter.yml' + - name: pgbouncer-exporter + user: postgres + sysvInitAction: respawn + shell: '/bin/pgbouncer_exporter --pgBouncer.connectionString="postgres:///pgbouncer?host=/tmp&port=6432&dbname=pgbouncer&user=pgbouncer"' - name: sql-exporter user: nobody sysvInitAction: respawn From b0b4b7dd8f0dc6a6e780b7ed880b26f70d9016a1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Mon, 27 Jan 2025 15:25:11 +0100 Subject: [PATCH 12/72] storcon: switch to diesel-async and tokio-postgres (#10280) Switches the storcon away from using diesel's synchronous APIs in favour of `diesel-async`. Advantages: * less C dependencies, especially no openssl, which might be behind the bug: https://github.com/neondatabase/cloud/issues/21010 * Better to only have async than mix of async plus `spawn_blocking` We had to turn off usage of the connection pool for migrations, as diesel migrations don't support async APIs. Thus we still use `spawn_blocking` in that one place. But this is explicitly done in one of the `diesel-async` examples. 
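
For illustration, a minimal sketch of that migration pattern, adapted from the
`persistence.rs` change below and the `diesel-async` wrapper example (the
standalone `run_migrations` helper and the use of `anyhow` for error handling
are placeholders, not the exact storcon code):

```rust
use diesel_async::async_connection_wrapper::AsyncConnectionWrapper;
use diesel_async::{AsyncConnection, AsyncPgConnection};
use diesel_migrations::{embed_migrations, EmbeddedMigrations, MigrationHarness};

const MIGRATIONS: EmbeddedMigrations = embed_migrations!("./migrations");

/// Run diesel's synchronous migration harness against an async connection.
async fn run_migrations(database_url: &str) -> anyhow::Result<()> {
    // Use a dedicated connection rather than one borrowed from the bb8 pool.
    let conn = AsyncPgConnection::establish(database_url).await?;

    // The wrapper exposes the async connection through diesel's blocking
    // `Connection` trait, which is what the migration harness needs.
    let mut wrapper: AsyncConnectionWrapper<AsyncPgConnection> =
        AsyncConnectionWrapper::from(conn);

    // Migrations are blocking, so run them on a dedicated blocking thread.
    tokio::task::spawn_blocking(move || {
        wrapper
            .run_pending_migrations(MIGRATIONS)
            .map(|_| ())
            .map_err(|e| anyhow::anyhow!(e.to_string()))
    })
    .await??;

    Ok(())
}
```

All other queries keep using the async bb8 pool directly, without any
`spawn_blocking`.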
--- Cargo.lock | 159 ++++-- Makefile | 2 - storage_controller/Cargo.toml | 5 +- storage_controller/src/main.rs | 2 +- storage_controller/src/persistence.rs | 783 +++++++++++++++----------- 5 files changed, 561 insertions(+), 390 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 1f090a27e4..a201a6abae 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -942,6 +942,18 @@ version = "1.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8c3c1a368f70d6cf7302d78f8f7093da241fb8e8807c05cc9e51a125895a6d5b" +[[package]] +name = "bb8" +version = "0.8.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d89aabfae550a5c44b43ab941844ffcd2e993cb6900b342debf59e9ea74acdb8" +dependencies = [ + "async-trait", + "futures-util", + "parking_lot 0.12.1", + "tokio", +] + [[package]] name = "bcder" version = "0.7.4" @@ -1301,7 +1313,7 @@ dependencies = [ "tar", "thiserror", "tokio", - "tokio-postgres", + "tokio-postgres 0.7.7", "tokio-stream", "tokio-util", "tower 0.5.2", @@ -1410,7 +1422,7 @@ dependencies = [ "storage_broker", "thiserror", "tokio", - "tokio-postgres", + "tokio-postgres 0.7.7", "tokio-util", "toml", "toml_edit", @@ -1786,11 +1798,24 @@ dependencies = [ "chrono", "diesel_derives", "itoa", - "pq-sys", - "r2d2", "serde_json", ] +[[package]] +name = "diesel-async" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "51a307ac00f7c23f526a04a77761a0519b9f0eb2838ebf5b905a58580095bdcb" +dependencies = [ + "async-trait", + "bb8", + "diesel", + "futures-util", + "scoped-futures", + "tokio", + "tokio-postgres 0.7.12", +] + [[package]] name = "diesel_derives" version = "2.2.1" @@ -4042,8 +4067,8 @@ dependencies = [ "pageserver_compaction", "pin-project-lite", "postgres", - "postgres-protocol", - "postgres-types", + "postgres-protocol 0.6.4", + "postgres-types 0.2.4", "postgres_backend", "postgres_connection", "postgres_ffi", @@ -4074,7 +4099,7 @@ dependencies = [ "tokio", "tokio-epoll-uring", "tokio-io-timeout", - "tokio-postgres", + "tokio-postgres 0.7.7", "tokio-stream", "tokio-tar", "tokio-util", @@ -4132,7 +4157,7 @@ dependencies = [ "serde", "thiserror", "tokio", - "tokio-postgres", + "tokio-postgres 0.7.7", "tokio-stream", "tokio-util", "utils", @@ -4438,7 +4463,7 @@ dependencies = [ "futures-util", "log", "tokio", - "tokio-postgres", + "tokio-postgres 0.7.7", ] [[package]] @@ -4459,6 +4484,24 @@ dependencies = [ "stringprep", ] +[[package]] +name = "postgres-protocol" +version = "0.6.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "acda0ebdebc28befa84bee35e651e4c5f09073d668c7aed4cf7e23c3cda84b23" +dependencies = [ + "base64 0.22.1", + "byteorder", + "bytes", + "fallible-iterator", + "hmac", + "md-5", + "memchr", + "rand 0.8.5", + "sha2", + "stringprep", +] + [[package]] name = "postgres-protocol2" version = "0.1.0" @@ -4482,7 +4525,18 @@ source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#511f dependencies = [ "bytes", "fallible-iterator", - "postgres-protocol", + "postgres-protocol 0.6.4", +] + +[[package]] +name = "postgres-types" +version = "0.2.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f66ea23a2d0e5734297357705193335e0a957696f34bed2f2faefacb2fec336f" +dependencies = [ + "bytes", + "fallible-iterator", + "postgres-protocol 0.6.7", ] [[package]] @@ -4507,7 +4561,7 @@ dependencies = [ "serde", "thiserror", "tokio", - "tokio-postgres", + "tokio-postgres 0.7.7", "tokio-postgres-rustls", "tokio-rustls 0.26.0", 
"tokio-util", @@ -4522,7 +4576,7 @@ dependencies = [ "itertools 0.10.5", "once_cell", "postgres", - "tokio-postgres", + "tokio-postgres 0.7.7", "url", ] @@ -4609,15 +4663,6 @@ version = "0.2.17" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5b40af805b3121feab8a3c29f04d8ad262fa8e0561883e7653e024ae4479e6de" -[[package]] -name = "pq-sys" -version = "0.6.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f6cc05d7ea95200187117196eee9edd0644424911821aeb28a18ce60ea0b8793" -dependencies = [ - "vcpkg", -] - [[package]] name = "pq_proto" version = "0.1.0" @@ -4625,7 +4670,7 @@ dependencies = [ "byteorder", "bytes", "itertools 0.10.5", - "postgres-protocol", + "postgres-protocol 0.6.4", "rand 0.8.5", "serde", "thiserror", @@ -4873,7 +4918,7 @@ dependencies = [ "tikv-jemalloc-ctl", "tikv-jemallocator", "tokio", - "tokio-postgres", + "tokio-postgres 0.7.7", "tokio-postgres2", "tokio-rustls 0.26.0", "tokio-tungstenite 0.21.0", @@ -4930,17 +4975,6 @@ dependencies = [ "proc-macro2", ] -[[package]] -name = "r2d2" -version = "0.8.10" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "51de85fb3fb6524929c8a2eb85e6b6d363de4e8c48f9e2c2eac4944abc181c93" -dependencies = [ - "log", - "parking_lot 0.12.1", - "scheduled-thread-pool", -] - [[package]] name = "rand" version = "0.7.3" @@ -5672,7 +5706,7 @@ dependencies = [ "pageserver_api", "parking_lot 0.12.1", "postgres", - "postgres-protocol", + "postgres-protocol 0.6.4", "postgres_backend", "postgres_ffi", "pprof", @@ -5696,7 +5730,7 @@ dependencies = [ "tikv-jemallocator", "tokio", "tokio-io-timeout", - "tokio-postgres", + "tokio-postgres 0.7.7", "tokio-stream", "tokio-tar", "tokio-util", @@ -5755,12 +5789,12 @@ dependencies = [ ] [[package]] -name = "scheduled-thread-pool" -version = "0.2.7" +name = "scoped-futures" +version = "0.1.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3cbc66816425a074528352f5789333ecff06ca41b36b0b0efdfbb29edc391a19" +checksum = "1b24aae2d0636530f359e9d5ef0c04669d11c5e756699b27a6a6d845d8329091" dependencies = [ - "parking_lot 0.12.1", + "pin-project-lite", ] [[package]] @@ -6295,6 +6329,7 @@ dependencies = [ "clap", "control_plane", "diesel", + "diesel-async", "diesel_migrations", "fail", "futures", @@ -6309,10 +6344,10 @@ dependencies = [ "pageserver_api", "pageserver_client", "postgres_connection", - "r2d2", "rand 0.8.5", "reqwest", "routerify", + "scoped-futures", "scopeguard", "serde", "serde_json", @@ -6365,7 +6400,7 @@ dependencies = [ "serde_json", "storage_controller_client", "tokio", - "tokio-postgres", + "tokio-postgres 0.7.7", "tokio-postgres-rustls", "tokio-stream", "tokio-util", @@ -6824,13 +6859,39 @@ dependencies = [ "percent-encoding", "phf", "pin-project-lite", - "postgres-protocol", - "postgres-types", + "postgres-protocol 0.6.4", + "postgres-types 0.2.4", "socket2", "tokio", "tokio-util", ] +[[package]] +name = "tokio-postgres" +version = "0.7.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3b5d3742945bc7d7f210693b0c58ae542c6fd47b17adbbda0885f3dcb34a6bdb" +dependencies = [ + "async-trait", + "byteorder", + "bytes", + "fallible-iterator", + "futures-channel", + "futures-util", + "log", + "parking_lot 0.12.1", + "percent-encoding", + "phf", + "pin-project-lite", + "postgres-protocol 0.6.7", + "postgres-types 0.2.8", + "rand 0.8.5", + "socket2", + "tokio", + "tokio-util", + "whoami", +] + [[package]] name = "tokio-postgres-rustls" version = "0.12.0" @@ -6840,7 
+6901,7 @@ dependencies = [ "ring", "rustls 0.23.18", "tokio", - "tokio-postgres", + "tokio-postgres 0.7.7", "tokio-rustls 0.26.0", "x509-certificate", ] @@ -7498,12 +7559,6 @@ version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "830b7e5d4d90034032940e4ace0d9a9a057e7a45cd94e6c007832e39edb82f6d" -[[package]] -name = "vcpkg" -version = "0.2.15" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "accd4ea62f7bb7a82fe23066fb0957d48ef677f6eeb8215f372f52e48bb32426" - [[package]] name = "version_check" version = "0.9.4" @@ -7523,7 +7578,7 @@ dependencies = [ "serde_json", "sysinfo", "tokio", - "tokio-postgres", + "tokio-postgres 0.7.7", "tokio-util", "tracing", "tracing-subscriber", diff --git a/Makefile b/Makefile index 22ebfea7d5..d1238caebf 100644 --- a/Makefile +++ b/Makefile @@ -64,8 +64,6 @@ CARGO_BUILD_FLAGS += $(filter -j1,$(MAKEFLAGS)) CARGO_CMD_PREFIX += $(if $(filter n,$(MAKEFLAGS)),,+) # Force cargo not to print progress bar CARGO_CMD_PREFIX += CARGO_TERM_PROGRESS_WHEN=never CI=1 -# Set PQ_LIB_DIR to make sure `storage_controller` get linked with bundled libpq (through diesel) -CARGO_CMD_PREFIX += PQ_LIB_DIR=$(POSTGRES_INSTALL_DIR)/v16/lib CACHEDIR_TAG_CONTENTS := "Signature: 8a477f597d28d172789f06886806bc55" diff --git a/storage_controller/Cargo.toml b/storage_controller/Cargo.toml index caaa22d0a5..9860bd5d0e 100644 --- a/storage_controller/Cargo.toml +++ b/storage_controller/Cargo.toml @@ -45,12 +45,11 @@ strum_macros.workspace = true diesel = { version = "2.2.6", features = [ "serde_json", - "postgres", - "r2d2", "chrono", ] } +diesel-async = { version = "0.5.2", features = ["postgres", "bb8", "async-connection-wrapper"] } diesel_migrations = { version = "2.2.0" } -r2d2 = { version = "0.8.10" } +scoped-futures = "0.1.4" utils = { path = "../libs/utils/" } metrics = { path = "../libs/metrics/" } diff --git a/storage_controller/src/main.rs b/storage_controller/src/main.rs index 801409d612..659c088d51 100644 --- a/storage_controller/src/main.rs +++ b/storage_controller/src/main.rs @@ -308,7 +308,7 @@ async fn async_main() -> anyhow::Result<()> { // Validate that we can connect to the database Persistence::await_connection(&secrets.database_url, args.db_connect_timeout.into()).await?; - let persistence = Arc::new(Persistence::new(secrets.database_url)); + let persistence = Arc::new(Persistence::new(secrets.database_url).await); let service = Service::spawn(config, persistence.clone()).await?; diff --git a/storage_controller/src/persistence.rs b/storage_controller/src/persistence.rs index 37bfaf1139..35eb15b297 100644 --- a/storage_controller/src/persistence.rs +++ b/storage_controller/src/persistence.rs @@ -5,9 +5,12 @@ use std::time::Duration; use std::time::Instant; use self::split_state::SplitState; -use diesel::pg::PgConnection; use diesel::prelude::*; -use diesel::Connection; +use diesel_async::async_connection_wrapper::AsyncConnectionWrapper; +use diesel_async::pooled_connection::bb8::Pool; +use diesel_async::pooled_connection::AsyncDieselConnectionManager; +use diesel_async::RunQueryDsl; +use diesel_async::{AsyncConnection, AsyncPgConnection}; use itertools::Itertools; use pageserver_api::controller_api::AvailabilityZone; use pageserver_api::controller_api::MetadataHealthRecord; @@ -20,6 +23,7 @@ use pageserver_api::shard::ShardConfigError; use pageserver_api::shard::ShardIdentity; use pageserver_api::shard::ShardStripeSize; use pageserver_api::shard::{ShardCount, ShardNumber, TenantShardId}; +use 
scoped_futures::ScopedBoxFuture; use serde::{Deserialize, Serialize}; use utils::generation::Generation; use utils::id::{NodeId, TenantId}; @@ -60,7 +64,7 @@ const MIGRATIONS: EmbeddedMigrations = embed_migrations!("./migrations"); /// updated, and reads of nodes are always from memory, not the database. We only require that /// we can UPDATE a node's scheduling mode reasonably quickly to mark a bad node offline. pub struct Persistence { - connection_pool: diesel::r2d2::Pool>, + connection_pool: Pool, } /// Legacy format, for use in JSON compat objects in test environment @@ -76,7 +80,7 @@ pub(crate) enum DatabaseError { #[error(transparent)] Connection(#[from] diesel::result::ConnectionError), #[error(transparent)] - ConnectionPool(#[from] r2d2::Error), + ConnectionPool(#[from] diesel_async::pooled_connection::bb8::RunError), #[error("Logical error: {0}")] Logical(String), #[error("Migration error: {0}")] @@ -124,6 +128,7 @@ pub(crate) enum AbortShardSplitStatus { pub(crate) type DatabaseResult = Result; /// Some methods can operate on either a whole tenant or a single shard +#[derive(Clone)] pub(crate) enum TenantFilter { Tenant(TenantId), Shard(TenantShardId), @@ -136,6 +141,11 @@ pub(crate) struct ShardGenerationState { pub(crate) generation_pageserver: Option, } +// A generous allowance for how many times we may retry serializable transactions +// before giving up. This is not expected to be hit: it is a defensive measure in case we +// somehow engineer a situation where duelling transactions might otherwise live-lock. +const MAX_RETRIES: usize = 128; + impl Persistence { // The default postgres connection limit is 100. We use up to 99, to leave one free for a human admin under // normal circumstances. This assumes we have exclusive use of the database cluster to which we connect. @@ -145,12 +155,12 @@ impl Persistence { const IDLE_CONNECTION_TIMEOUT: Duration = Duration::from_secs(10); const MAX_CONNECTION_LIFETIME: Duration = Duration::from_secs(60); - pub fn new(database_url: String) -> Self { - let manager = diesel::r2d2::ConnectionManager::::new(database_url); + pub async fn new(database_url: String) -> Self { + let manager = AsyncDieselConnectionManager::::new(database_url); // We will use a connection pool: this is primarily to _limit_ our connection count, rather than to optimize time // to execute queries (database queries are not generally on latency-sensitive paths). 
- let connection_pool = diesel::r2d2::Pool::builder() + let connection_pool = Pool::builder() .max_size(Self::MAX_CONNECTIONS) .max_lifetime(Some(Self::MAX_CONNECTION_LIFETIME)) .idle_timeout(Some(Self::IDLE_CONNECTION_TIMEOUT)) @@ -158,6 +168,7 @@ impl Persistence { .min_idle(Some(1)) .test_on_check_out(true) .build(manager) + .await .expect("Could not build connection pool"); Self { connection_pool } @@ -171,7 +182,7 @@ impl Persistence { ) -> Result<(), diesel::ConnectionError> { let started_at = Instant::now(); loop { - match PgConnection::establish(database_url) { + match AsyncPgConnection::establish(database_url).await { Ok(_) => { tracing::info!("Connected to database."); return Ok(()); @@ -192,57 +203,22 @@ impl Persistence { pub(crate) async fn migration_run(&self) -> DatabaseResult<()> { use diesel_migrations::{HarnessWithOutput, MigrationHarness}; - self.with_conn(move |conn| -> DatabaseResult<()> { - HarnessWithOutput::write_to_stdout(conn) - .run_pending_migrations(MIGRATIONS) - .map(|_| ()) - .map_err(|e| DatabaseError::Migration(e.to_string())) - }) - .await - } - - /// Wraps `with_conn` in order to collect latency and error metrics - async fn with_measured_conn(&self, op: DatabaseOperation, func: F) -> DatabaseResult - where - F: Fn(&mut PgConnection) -> DatabaseResult + Send + 'static, - R: Send + 'static, - { - let latency = &METRICS_REGISTRY - .metrics_group - .storage_controller_database_query_latency; - let _timer = latency.start_timer(DatabaseQueryLatencyLabelGroup { operation: op }); - - let res = self.with_conn(func).await; - - if let Err(err) = &res { - let error_counter = &METRICS_REGISTRY - .metrics_group - .storage_controller_database_query_error; - error_counter.inc(DatabaseQueryErrorLabelGroup { - error_type: err.error_label(), - operation: op, - }) - } - - res - } - - /// Call the provided function in a tokio blocking thread, with a Diesel database connection. - async fn with_conn(&self, func: F) -> DatabaseResult - where - F: Fn(&mut PgConnection) -> DatabaseResult + Send + 'static, - R: Send + 'static, - { - // A generous allowance for how many times we may retry serializable transactions - // before giving up. This is not expected to be hit: it is a defensive measure in case we - // somehow engineer a situation where duelling transactions might otherwise live-lock. - const MAX_RETRIES: usize = 128; - - let mut conn = self.connection_pool.get()?; - tokio::task::spawn_blocking(move || -> DatabaseResult { + // Can't use self.with_conn here as we do spawn_blocking which requires static. 
+ let conn = self + .connection_pool + .dedicated_connection() + .await + .map_err(|e| DatabaseError::Migration(e.to_string()))?; + let mut async_wrapper: AsyncConnectionWrapper = + AsyncConnectionWrapper::from(conn); + tokio::task::spawn_blocking(move || { let mut retry_count = 0; loop { - match conn.build_transaction().serializable().run(|c| func(c)) { + let result = HarnessWithOutput::write_to_stdout(&mut async_wrapper) + .run_pending_migrations(MIGRATIONS) + .map(|_| ()) + .map_err(|e| DatabaseError::Migration(e.to_string())); + match result { Ok(r) => break Ok(r), Err( err @ DatabaseError::Query(diesel::result::Error::DatabaseError( @@ -271,33 +247,112 @@ impl Persistence { } }) .await - .expect("Task panic") + .map_err(|e| DatabaseError::Migration(e.to_string()))??; + Ok(()) + } + + /// Wraps `with_conn` in order to collect latency and error metrics + async fn with_measured_conn<'a, 'b, F, R>( + &self, + op: DatabaseOperation, + func: F, + ) -> DatabaseResult + where + F: for<'r> Fn(&'r mut AsyncPgConnection) -> ScopedBoxFuture<'b, 'r, DatabaseResult> + + Send + + std::marker::Sync + + 'a, + R: Send + 'b, + { + let latency = &METRICS_REGISTRY + .metrics_group + .storage_controller_database_query_latency; + let _timer = latency.start_timer(DatabaseQueryLatencyLabelGroup { operation: op }); + + let res = self.with_conn(func).await; + + if let Err(err) = &res { + let error_counter = &METRICS_REGISTRY + .metrics_group + .storage_controller_database_query_error; + error_counter.inc(DatabaseQueryErrorLabelGroup { + error_type: err.error_label(), + operation: op, + }) + } + + res + } + + /// Call the provided function with a Diesel database connection in a retry loop + async fn with_conn<'a, 'b, F, R>(&self, func: F) -> DatabaseResult + where + F: for<'r> Fn(&'r mut AsyncPgConnection) -> ScopedBoxFuture<'b, 'r, DatabaseResult> + + Send + + std::marker::Sync + + 'a, + R: Send + 'b, + { + let mut retry_count = 0; + loop { + let mut conn = self.connection_pool.get().await?; + match conn + .build_transaction() + .serializable() + .run(|c| func(c)) + .await + { + Ok(r) => break Ok(r), + Err( + err @ DatabaseError::Query(diesel::result::Error::DatabaseError( + diesel::result::DatabaseErrorKind::SerializationFailure, + _, + )), + ) => { + retry_count += 1; + if retry_count > MAX_RETRIES { + tracing::error!( + "Exceeded max retries on SerializationFailure errors: {err:?}" + ); + break Err(err); + } else { + // Retry on serialization errors: these are expected, because even though our + // transactions don't fight for the same rows, they will occasionally collide + // on index pages (e.g. 
increment_generation for unrelated shards can collide) + tracing::debug!("Retrying transaction on serialization failure {err:?}"); + continue; + } + } + Err(e) => break Err(e), + } + } } /// When a node is first registered, persist it before using it for anything pub(crate) async fn insert_node(&self, node: &Node) -> DatabaseResult<()> { - let np = node.to_persistent(); - self.with_measured_conn( - DatabaseOperation::InsertNode, - move |conn| -> DatabaseResult<()> { + let np = &node.to_persistent(); + self.with_measured_conn(DatabaseOperation::InsertNode, move |conn| { + Box::pin(async move { diesel::insert_into(crate::schema::nodes::table) - .values(&np) - .execute(conn)?; + .values(np) + .execute(conn) + .await?; Ok(()) - }, - ) + }) + }) .await } /// At startup, populate the list of nodes which our shards may be placed on pub(crate) async fn list_nodes(&self) -> DatabaseResult> { let nodes: Vec = self - .with_measured_conn( - DatabaseOperation::ListNodes, - move |conn| -> DatabaseResult<_> { - Ok(crate::schema::nodes::table.load::(conn)?) - }, - ) + .with_measured_conn(DatabaseOperation::ListNodes, move |conn| { + Box::pin(async move { + Ok(crate::schema::nodes::table + .load::(conn) + .await?) + }) + }) .await?; tracing::info!("list_nodes: loaded {} nodes", nodes.len()); @@ -313,11 +368,14 @@ impl Persistence { use crate::schema::nodes::dsl::*; let updated = self .with_measured_conn(DatabaseOperation::UpdateNode, move |conn| { - let updated = diesel::update(nodes) - .filter(node_id.eq(input_node_id.0 as i64)) - .set((scheduling_policy.eq(String::from(input_scheduling)),)) - .execute(conn)?; - Ok(updated) + Box::pin(async move { + let updated = diesel::update(nodes) + .filter(node_id.eq(input_node_id.0 as i64)) + .set((scheduling_policy.eq(String::from(input_scheduling)),)) + .execute(conn) + .await?; + Ok(updated) + }) }) .await?; @@ -339,17 +397,16 @@ impl Persistence { &self, ) -> DatabaseResult> { use crate::schema::tenant_shards::dsl::*; - self.with_measured_conn( - DatabaseOperation::ListTenantShards, - move |conn| -> DatabaseResult<_> { + self.with_measured_conn(DatabaseOperation::ListTenantShards, move |conn| { + Box::pin(async move { let query = tenant_shards.filter( placement_policy.ne(serde_json::to_string(&PlacementPolicy::Detached).unwrap()), ); - let result = query.load::(conn)?; + let result = query.load::(conn).await?; Ok(result) - }, - ) + }) + }) .await } @@ -359,15 +416,14 @@ impl Persistence { filter_tenant_id: TenantId, ) -> DatabaseResult> { use crate::schema::tenant_shards::dsl::*; - self.with_measured_conn( - DatabaseOperation::LoadTenant, - move |conn| -> DatabaseResult<_> { + self.with_measured_conn(DatabaseOperation::LoadTenant, move |conn| { + Box::pin(async move { let query = tenant_shards.filter(tenant_id.eq(filter_tenant_id.to_string())); - let result = query.load::(conn)?; + let result = query.load::(conn).await?; Ok(result) - }, - ) + }) + }) .await } @@ -393,19 +449,22 @@ impl Persistence { }) .collect::>(); - self.with_measured_conn( - DatabaseOperation::InsertTenantShards, - move |conn| -> DatabaseResult<()> { + let shards = &shards; + let metadata_health_records = &metadata_health_records; + self.with_measured_conn(DatabaseOperation::InsertTenantShards, move |conn| { + Box::pin(async move { diesel::insert_into(tenant_shards::table) - .values(&shards) - .execute(conn)?; + .values(shards) + .execute(conn) + .await?; diesel::insert_into(metadata_health::table) - .values(&metadata_health_records) - .execute(conn)?; + .values(metadata_health_records) + 
.execute(conn) + .await?; Ok(()) - }, - ) + }) + }) .await } @@ -413,31 +472,31 @@ impl Persistence { /// the tenant from memory on this server. pub(crate) async fn delete_tenant(&self, del_tenant_id: TenantId) -> DatabaseResult<()> { use crate::schema::tenant_shards::dsl::*; - self.with_measured_conn( - DatabaseOperation::DeleteTenant, - move |conn| -> DatabaseResult<()> { + self.with_measured_conn(DatabaseOperation::DeleteTenant, move |conn| { + Box::pin(async move { // `metadata_health` status (if exists) is also deleted based on the cascade behavior. diesel::delete(tenant_shards) .filter(tenant_id.eq(del_tenant_id.to_string())) - .execute(conn)?; + .execute(conn) + .await?; Ok(()) - }, - ) + }) + }) .await } pub(crate) async fn delete_node(&self, del_node_id: NodeId) -> DatabaseResult<()> { use crate::schema::nodes::dsl::*; - self.with_measured_conn( - DatabaseOperation::DeleteNode, - move |conn| -> DatabaseResult<()> { + self.with_measured_conn(DatabaseOperation::DeleteNode, move |conn| { + Box::pin(async move { diesel::delete(nodes) .filter(node_id.eq(del_node_id.0 as i64)) - .execute(conn)?; + .execute(conn) + .await?; Ok(()) - }, - ) + }) + }) .await } @@ -454,34 +513,41 @@ impl Persistence { use crate::schema::tenant_shards::dsl::*; let updated = self .with_measured_conn(DatabaseOperation::ReAttach, move |conn| { - let rows_updated = diesel::update(tenant_shards) - .filter(generation_pageserver.eq(input_node_id.0 as i64)) - .set(generation.eq(generation + 1)) - .execute(conn)?; + Box::pin(async move { + let rows_updated = diesel::update(tenant_shards) + .filter(generation_pageserver.eq(input_node_id.0 as i64)) + .set(generation.eq(generation + 1)) + .execute(conn) + .await?; - tracing::info!("Incremented {} tenants' generations", rows_updated); + tracing::info!("Incremented {} tenants' generations", rows_updated); - // TODO: UPDATE+SELECT in one query + // TODO: UPDATE+SELECT in one query - let updated = tenant_shards - .filter(generation_pageserver.eq(input_node_id.0 as i64)) - .select(TenantShardPersistence::as_select()) - .load(conn)?; + let updated = tenant_shards + .filter(generation_pageserver.eq(input_node_id.0 as i64)) + .select(TenantShardPersistence::as_select()) + .load(conn) + .await?; - // If the node went through a drain and restart phase before re-attaching, - // then reset it's node scheduling policy to active. - diesel::update(nodes) - .filter(node_id.eq(input_node_id.0 as i64)) - .filter( - scheduling_policy - .eq(String::from(NodeSchedulingPolicy::PauseForRestart)) - .or(scheduling_policy.eq(String::from(NodeSchedulingPolicy::Draining))) - .or(scheduling_policy.eq(String::from(NodeSchedulingPolicy::Filling))), - ) - .set(scheduling_policy.eq(String::from(NodeSchedulingPolicy::Active))) - .execute(conn)?; + // If the node went through a drain and restart phase before re-attaching, + // then reset it's node scheduling policy to active. 
+ diesel::update(nodes) + .filter(node_id.eq(input_node_id.0 as i64)) + .filter( + scheduling_policy + .eq(String::from(NodeSchedulingPolicy::PauseForRestart)) + .or(scheduling_policy + .eq(String::from(NodeSchedulingPolicy::Draining))) + .or(scheduling_policy + .eq(String::from(NodeSchedulingPolicy::Filling))), + ) + .set(scheduling_policy.eq(String::from(NodeSchedulingPolicy::Active))) + .execute(conn) + .await?; - Ok(updated) + Ok(updated) + }) }) .await?; @@ -518,19 +584,22 @@ impl Persistence { use crate::schema::tenant_shards::dsl::*; let updated = self .with_measured_conn(DatabaseOperation::IncrementGeneration, move |conn| { - let updated = diesel::update(tenant_shards) - .filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string())) - .filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32)) - .filter(shard_count.eq(tenant_shard_id.shard_count.literal() as i32)) - .set(( - generation.eq(generation + 1), - generation_pageserver.eq(node_id.0 as i64), - )) - // TODO: only returning() the generation column - .returning(TenantShardPersistence::as_returning()) - .get_result(conn)?; + Box::pin(async move { + let updated = diesel::update(tenant_shards) + .filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string())) + .filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32)) + .filter(shard_count.eq(tenant_shard_id.shard_count.literal() as i32)) + .set(( + generation.eq(generation + 1), + generation_pageserver.eq(node_id.0 as i64), + )) + // TODO: only returning() the generation column + .returning(TenantShardPersistence::as_returning()) + .get_result(conn) + .await?; - Ok(updated) + Ok(updated) + }) }) .await?; @@ -562,12 +631,15 @@ impl Persistence { use crate::schema::tenant_shards::dsl::*; let rows = self .with_measured_conn(DatabaseOperation::TenantGenerations, move |conn| { - let result = tenant_shards - .filter(tenant_id.eq(filter_tenant_id.to_string())) - .select(TenantShardPersistence::as_select()) - .order(shard_number) - .load(conn)?; - Ok(result) + Box::pin(async move { + let result = tenant_shards + .filter(tenant_id.eq(filter_tenant_id.to_string())) + .select(TenantShardPersistence::as_select()) + .order(shard_number) + .load(conn) + .await?; + Ok(result) + }) }) .await?; @@ -615,15 +687,18 @@ impl Persistence { break; } + let in_clause = &in_clause; let chunk_rows = self .with_measured_conn(DatabaseOperation::ShardGenerations, move |conn| { - // diesel doesn't support multi-column IN queries, so we compose raw SQL. No escaping is required because - // the inputs are strongly typed and cannot carry any user-supplied raw string content. - let result : Vec = diesel::sql_query( - format!("SELECT * from tenant_shards where (tenant_id, shard_number, shard_count) in ({in_clause});").as_str() - ).load(conn)?; + Box::pin(async move { + // diesel doesn't support multi-column IN queries, so we compose raw SQL. No escaping is required because + // the inputs are strongly typed and cannot carry any user-supplied raw string content. 
+ let result : Vec = diesel::sql_query( + format!("SELECT * from tenant_shards where (tenant_id, shard_number, shard_count) in ({in_clause});").as_str() + ).load(conn).await?; - Ok(result) + Ok(result) + }) }) .await?; rows.extend(chunk_rows.into_iter()) @@ -657,51 +732,58 @@ impl Persistence { ) -> DatabaseResult<()> { use crate::schema::tenant_shards::dsl::*; + let tenant = &tenant; + let input_placement_policy = &input_placement_policy; + let input_config = &input_config; + let input_generation = &input_generation; + let input_scheduling_policy = &input_scheduling_policy; self.with_measured_conn(DatabaseOperation::UpdateTenantShard, move |conn| { - let query = match tenant { - TenantFilter::Shard(tenant_shard_id) => diesel::update(tenant_shards) - .filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string())) - .filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32)) - .filter(shard_count.eq(tenant_shard_id.shard_count.literal() as i32)) - .into_boxed(), - TenantFilter::Tenant(input_tenant_id) => diesel::update(tenant_shards) - .filter(tenant_id.eq(input_tenant_id.to_string())) - .into_boxed(), - }; + Box::pin(async move { + let query = match tenant { + TenantFilter::Shard(tenant_shard_id) => diesel::update(tenant_shards) + .filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string())) + .filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32)) + .filter(shard_count.eq(tenant_shard_id.shard_count.literal() as i32)) + .into_boxed(), + TenantFilter::Tenant(input_tenant_id) => diesel::update(tenant_shards) + .filter(tenant_id.eq(input_tenant_id.to_string())) + .into_boxed(), + }; - // Clear generation_pageserver if we are moving into a state where we won't have - // any attached pageservers. - let input_generation_pageserver = match input_placement_policy { - None | Some(PlacementPolicy::Attached(_)) => None, - Some(PlacementPolicy::Detached | PlacementPolicy::Secondary) => Some(None), - }; + // Clear generation_pageserver if we are moving into a state where we won't have + // any attached pageservers. 
+ let input_generation_pageserver = match input_placement_policy { + None | Some(PlacementPolicy::Attached(_)) => None, + Some(PlacementPolicy::Detached | PlacementPolicy::Secondary) => Some(None), + }; - #[derive(AsChangeset)] - #[diesel(table_name = crate::schema::tenant_shards)] - struct ShardUpdate { - generation: Option, - placement_policy: Option, - config: Option, - scheduling_policy: Option, - generation_pageserver: Option>, - } + #[derive(AsChangeset)] + #[diesel(table_name = crate::schema::tenant_shards)] + struct ShardUpdate { + generation: Option, + placement_policy: Option, + config: Option, + scheduling_policy: Option, + generation_pageserver: Option>, + } - let update = ShardUpdate { - generation: input_generation.map(|g| g.into().unwrap() as i32), - placement_policy: input_placement_policy - .as_ref() - .map(|p| serde_json::to_string(&p).unwrap()), - config: input_config - .as_ref() - .map(|c| serde_json::to_string(&c).unwrap()), - scheduling_policy: input_scheduling_policy - .map(|p| serde_json::to_string(&p).unwrap()), - generation_pageserver: input_generation_pageserver, - }; + let update = ShardUpdate { + generation: input_generation.map(|g| g.into().unwrap() as i32), + placement_policy: input_placement_policy + .as_ref() + .map(|p| serde_json::to_string(&p).unwrap()), + config: input_config + .as_ref() + .map(|c| serde_json::to_string(&c).unwrap()), + scheduling_policy: input_scheduling_policy + .map(|p| serde_json::to_string(&p).unwrap()), + generation_pageserver: input_generation_pageserver, + }; - query.set(update).execute(conn)?; + query.set(update).execute(conn).await?; - Ok(()) + Ok(()) + }) }) .await?; @@ -715,23 +797,27 @@ impl Persistence { ) -> DatabaseResult)>> { use crate::schema::tenant_shards::dsl::*; + let preferred_azs = preferred_azs.as_slice(); self.with_measured_conn(DatabaseOperation::SetPreferredAzs, move |conn| { - let mut shards_updated = Vec::default(); + Box::pin(async move { + let mut shards_updated = Vec::default(); - for (tenant_shard_id, preferred_az) in preferred_azs.iter() { - let updated = diesel::update(tenant_shards) - .filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string())) - .filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32)) - .filter(shard_count.eq(tenant_shard_id.shard_count.literal() as i32)) - .set(preferred_az_id.eq(preferred_az.as_ref().map(|az| az.0.clone()))) - .execute(conn)?; + for (tenant_shard_id, preferred_az) in preferred_azs.iter() { + let updated = diesel::update(tenant_shards) + .filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string())) + .filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32)) + .filter(shard_count.eq(tenant_shard_id.shard_count.literal() as i32)) + .set(preferred_az_id.eq(preferred_az.as_ref().map(|az| az.0.clone()))) + .execute(conn) + .await?; - if updated == 1 { - shards_updated.push((*tenant_shard_id, preferred_az.clone())); + if updated == 1 { + shards_updated.push((*tenant_shard_id, preferred_az.clone())); + } } - } - Ok(shards_updated) + Ok(shards_updated) + }) }) .await } @@ -739,17 +825,21 @@ impl Persistence { pub(crate) async fn detach(&self, tenant_shard_id: TenantShardId) -> anyhow::Result<()> { use crate::schema::tenant_shards::dsl::*; self.with_measured_conn(DatabaseOperation::Detach, move |conn| { - let updated = diesel::update(tenant_shards) - .filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string())) - .filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32)) - .filter(shard_count.eq(tenant_shard_id.shard_count.literal() as i32)) - .set(( - 
generation_pageserver.eq(Option::::None), - placement_policy.eq(serde_json::to_string(&PlacementPolicy::Detached).unwrap()), - )) - .execute(conn)?; + Box::pin(async move { + let updated = diesel::update(tenant_shards) + .filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string())) + .filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32)) + .filter(shard_count.eq(tenant_shard_id.shard_count.literal() as i32)) + .set(( + generation_pageserver.eq(Option::::None), + placement_policy + .eq(serde_json::to_string(&PlacementPolicy::Detached).unwrap()), + )) + .execute(conn) + .await?; - Ok(updated) + Ok(updated) + }) }) .await?; @@ -768,14 +858,16 @@ impl Persistence { parent_to_children: Vec<(TenantShardId, Vec)>, ) -> DatabaseResult<()> { use crate::schema::tenant_shards::dsl::*; - self.with_measured_conn(DatabaseOperation::BeginShardSplit, move |conn| -> DatabaseResult<()> { + let parent_to_children = parent_to_children.as_slice(); + self.with_measured_conn(DatabaseOperation::BeginShardSplit, move |conn| { + Box::pin(async move { // Mark parent shards as splitting let updated = diesel::update(tenant_shards) .filter(tenant_id.eq(split_tenant_id.to_string())) .filter(shard_count.eq(old_shard_count.literal() as i32)) .set((splitting.eq(1),)) - .execute(conn)?; + .execute(conn).await?; if u8::try_from(updated) .map_err(|_| DatabaseError::Logical( format!("Overflow existing shard count {} while splitting", updated)) @@ -788,7 +880,7 @@ impl Persistence { } // FIXME: spurious clone to sidestep closure move rules - let parent_to_children = parent_to_children.clone(); + let parent_to_children = parent_to_children.to_vec(); // Insert child shards for (parent_shard_id, children) in parent_to_children { @@ -796,7 +888,7 @@ impl Persistence { .filter(tenant_id.eq(parent_shard_id.tenant_id.to_string())) .filter(shard_number.eq(parent_shard_id.shard_number.0 as i32)) .filter(shard_count.eq(parent_shard_id.shard_count.literal() as i32)) - .load::(conn)?; + .load::(conn).await?; let parent = if parent.len() != 1 { return Err(DatabaseError::Logical(format!( "Parent shard {parent_shard_id} not found" @@ -811,12 +903,13 @@ impl Persistence { debug_assert!(shard.splitting == SplitState::Splitting); diesel::insert_into(tenant_shards) .values(shard) - .execute(conn)?; + .execute(conn).await?; } } Ok(()) }) + }) .await } @@ -828,25 +921,26 @@ impl Persistence { old_shard_count: ShardCount, ) -> DatabaseResult<()> { use crate::schema::tenant_shards::dsl::*; - self.with_measured_conn( - DatabaseOperation::CompleteShardSplit, - move |conn| -> DatabaseResult<()> { + self.with_measured_conn(DatabaseOperation::CompleteShardSplit, move |conn| { + Box::pin(async move { // Drop parent shards diesel::delete(tenant_shards) .filter(tenant_id.eq(split_tenant_id.to_string())) .filter(shard_count.eq(old_shard_count.literal() as i32)) - .execute(conn)?; + .execute(conn) + .await?; // Clear sharding flag let updated = diesel::update(tenant_shards) .filter(tenant_id.eq(split_tenant_id.to_string())) .set((splitting.eq(0),)) - .execute(conn)?; + .execute(conn) + .await?; debug_assert!(updated > 0); Ok(()) - }, - ) + }) + }) .await } @@ -858,15 +952,15 @@ impl Persistence { new_shard_count: ShardCount, ) -> DatabaseResult { use crate::schema::tenant_shards::dsl::*; - self.with_measured_conn( - DatabaseOperation::AbortShardSplit, - move |conn| -> DatabaseResult { + self.with_measured_conn(DatabaseOperation::AbortShardSplit, move |conn| { + Box::pin(async move { // Clear the splitting state on parent shards let updated = 
diesel::update(tenant_shards) .filter(tenant_id.eq(split_tenant_id.to_string())) .filter(shard_count.ne(new_shard_count.literal() as i32)) .set((splitting.eq(0),)) - .execute(conn)?; + .execute(conn) + .await?; // Parent shards are already gone: we cannot abort. if updated == 0 { @@ -886,11 +980,12 @@ impl Persistence { diesel::delete(tenant_shards) .filter(tenant_id.eq(split_tenant_id.to_string())) .filter(shard_count.eq(new_shard_count.literal() as i32)) - .execute(conn)?; + .execute(conn) + .await?; Ok(AbortShardSplitStatus::Aborted) - }, - ) + }) + }) .await } @@ -906,25 +1001,28 @@ impl Persistence { ) -> DatabaseResult<()> { use crate::schema::metadata_health::dsl::*; - self.with_measured_conn( - DatabaseOperation::UpdateMetadataHealth, - move |conn| -> DatabaseResult<_> { + let healthy_records = healthy_records.as_slice(); + let unhealthy_records = unhealthy_records.as_slice(); + self.with_measured_conn(DatabaseOperation::UpdateMetadataHealth, move |conn| { + Box::pin(async move { diesel::insert_into(metadata_health) - .values(&healthy_records) + .values(healthy_records) .on_conflict((tenant_id, shard_number, shard_count)) .do_update() .set((healthy.eq(true), last_scrubbed_at.eq(now))) - .execute(conn)?; + .execute(conn) + .await?; diesel::insert_into(metadata_health) - .values(&unhealthy_records) + .values(unhealthy_records) .on_conflict((tenant_id, shard_number, shard_count)) .do_update() .set((healthy.eq(false), last_scrubbed_at.eq(now))) - .execute(conn)?; + .execute(conn) + .await?; Ok(()) - }, - ) + }) + }) .await } @@ -933,15 +1031,13 @@ impl Persistence { pub(crate) async fn list_metadata_health_records( &self, ) -> DatabaseResult> { - self.with_measured_conn( - DatabaseOperation::ListMetadataHealth, - move |conn| -> DatabaseResult<_> { - Ok( - crate::schema::metadata_health::table - .load::(conn)?, - ) - }, - ) + self.with_measured_conn(DatabaseOperation::ListMetadataHealth, move |conn| { + Box::pin(async { + Ok(crate::schema::metadata_health::table + .load::(conn) + .await?) + }) + }) .await } @@ -953,10 +1049,15 @@ impl Persistence { use crate::schema::metadata_health::dsl::*; self.with_measured_conn( DatabaseOperation::ListMetadataHealthUnhealthy, - move |conn| -> DatabaseResult<_> { - Ok(crate::schema::metadata_health::table - .filter(healthy.eq(false)) - .load::(conn)?) + move |conn| { + Box::pin(async { + DatabaseResult::Ok( + crate::schema::metadata_health::table + .filter(healthy.eq(false)) + .load::(conn) + .await?, + ) + }) }, ) .await @@ -970,15 +1071,14 @@ impl Persistence { ) -> DatabaseResult> { use crate::schema::metadata_health::dsl::*; - self.with_measured_conn( - DatabaseOperation::ListMetadataHealthOutdated, - move |conn| -> DatabaseResult<_> { + self.with_measured_conn(DatabaseOperation::ListMetadataHealthOutdated, move |conn| { + Box::pin(async move { let query = metadata_health.filter(last_scrubbed_at.lt(earlier)); - let res = query.load::(conn)?; + let res = query.load::(conn).await?; Ok(res) - }, - ) + }) + }) .await } @@ -986,12 +1086,13 @@ impl Persistence { /// It is an error for the table to contain more than one entry. pub(crate) async fn get_leader(&self) -> DatabaseResult> { let mut leader: Vec = self - .with_measured_conn( - DatabaseOperation::GetLeader, - move |conn| -> DatabaseResult<_> { - Ok(crate::schema::controllers::table.load::(conn)?) - }, - ) + .with_measured_conn(DatabaseOperation::GetLeader, move |conn| { + Box::pin(async move { + Ok(crate::schema::controllers::table + .load::(conn) + .await?) 
+ }) + }) .await?; if leader.len() > 1 { @@ -1014,26 +1115,33 @@ impl Persistence { use crate::schema::controllers::dsl::*; let updated = self - .with_measured_conn( - DatabaseOperation::UpdateLeader, - move |conn| -> DatabaseResult { + .with_measured_conn(DatabaseOperation::UpdateLeader, move |conn| { + let prev = prev.clone(); + let new = new.clone(); + Box::pin(async move { let updated = match &prev { - Some(prev) => diesel::update(controllers) - .filter(address.eq(prev.address.clone())) - .filter(started_at.eq(prev.started_at)) - .set(( - address.eq(new.address.clone()), - started_at.eq(new.started_at), - )) - .execute(conn)?, - None => diesel::insert_into(controllers) - .values(new.clone()) - .execute(conn)?, + Some(prev) => { + diesel::update(controllers) + .filter(address.eq(prev.address.clone())) + .filter(started_at.eq(prev.started_at)) + .set(( + address.eq(new.address.clone()), + started_at.eq(new.started_at), + )) + .execute(conn) + .await? + } + None => { + diesel::insert_into(controllers) + .values(new.clone()) + .execute(conn) + .await? + } }; Ok(updated) - }, - ) + }) + }) .await?; if updated == 0 { @@ -1048,12 +1156,13 @@ impl Persistence { /// At startup, populate the list of nodes which our shards may be placed on pub(crate) async fn list_safekeepers(&self) -> DatabaseResult> { let safekeepers: Vec = self - .with_measured_conn( - DatabaseOperation::ListNodes, - move |conn| -> DatabaseResult<_> { - Ok(crate::schema::safekeepers::table.load::(conn)?) - }, - ) + .with_measured_conn(DatabaseOperation::ListNodes, move |conn| { + Box::pin(async move { + Ok(crate::schema::safekeepers::table + .load::(conn) + .await?) + }) + }) .await?; tracing::info!("list_safekeepers: loaded {} nodes", safekeepers.len()); @@ -1066,11 +1175,14 @@ impl Persistence { id: i64, ) -> Result { use crate::schema::safekeepers::dsl::{id as id_column, safekeepers}; - self.with_conn(move |conn| -> DatabaseResult { - Ok(safekeepers - .filter(id_column.eq(&id)) - .select(SafekeeperPersistence::as_select()) - .get_result(conn)?) + self.with_conn(move |conn| { + Box::pin(async move { + Ok(safekeepers + .filter(id_column.eq(&id)) + .select(SafekeeperPersistence::as_select()) + .get_result(conn) + .await?) 
+ }) }) .await } @@ -1081,26 +1193,30 @@ impl Persistence { ) -> Result<(), DatabaseError> { use crate::schema::safekeepers::dsl::*; - self.with_conn(move |conn| -> DatabaseResult<()> { - let bind = record - .as_insert_or_update() - .map_err(|e| DatabaseError::Logical(format!("{e}")))?; + self.with_conn(move |conn| { + let record = record.clone(); + Box::pin(async move { + let bind = record + .as_insert_or_update() + .map_err(|e| DatabaseError::Logical(format!("{e}")))?; - let inserted_updated = diesel::insert_into(safekeepers) - .values(&bind) - .on_conflict(id) - .do_update() - .set(&bind) - .execute(conn)?; + let inserted_updated = diesel::insert_into(safekeepers) + .values(&bind) + .on_conflict(id) + .do_update() + .set(&bind) + .execute(conn) + .await?; - if inserted_updated != 1 { - return Err(DatabaseError::Logical(format!( - "unexpected number of rows ({})", - inserted_updated - ))); - } + if inserted_updated != 1 { + return Err(DatabaseError::Logical(format!( + "unexpected number of rows ({})", + inserted_updated + ))); + } - Ok(()) + Ok(()) + }) }) .await } @@ -1112,26 +1228,29 @@ impl Persistence { ) -> Result<(), DatabaseError> { use crate::schema::safekeepers::dsl::*; - self.with_conn(move |conn| -> DatabaseResult<()> { - #[derive(Insertable, AsChangeset)] - #[diesel(table_name = crate::schema::safekeepers)] - struct UpdateSkSchedulingPolicy<'a> { - id: i64, - scheduling_policy: &'a str, - } - let scheduling_policy_ = String::from(scheduling_policy_); + self.with_conn(move |conn| { + Box::pin(async move { + #[derive(Insertable, AsChangeset)] + #[diesel(table_name = crate::schema::safekeepers)] + struct UpdateSkSchedulingPolicy<'a> { + id: i64, + scheduling_policy: &'a str, + } + let scheduling_policy_ = String::from(scheduling_policy_); - let rows_affected = diesel::update(safekeepers.filter(id.eq(id_))) - .set(scheduling_policy.eq(scheduling_policy_)) - .execute(conn)?; + let rows_affected = diesel::update(safekeepers.filter(id.eq(id_))) + .set(scheduling_policy.eq(scheduling_policy_)) + .execute(conn) + .await?; - if rows_affected != 1 { - return Err(DatabaseError::Logical(format!( - "unexpected number of rows ({rows_affected})", - ))); - } + if rows_affected != 1 { + return Err(DatabaseError::Logical(format!( + "unexpected number of rows ({rows_affected})", + ))); + } - Ok(()) + Ok(()) + }) }) .await } From aec92bfc34a2e0bf86a47b5664746dc67a82dcf3 Mon Sep 17 00:00:00 2001 From: John Spray Date: Mon, 27 Jan 2025 17:03:32 +0000 Subject: [PATCH 13/72] pageserver: decrease utilization MAX_SHARDS (#10489) ## Problem The intent of this parameter is to have pageservers consider themselves "full" if they've got lots of shards, even if they have plenty of capacity. It works, but because we typically successfully oversubscribe capacity up to 200%, the MAX_SHARDS limit is effectively doubled, so this 20,000 value ends up meaning 40,000, whereas the original intent was to limit nodes to ~10000 shards. ## Summary of changes - Change MAX_SHARDS to 5000, so that a node with 5000 will get a 100% utilization, which is equivalent in practice to being considered "half full" by the storage controller in capacity terms. This is all a bit subtle and indiret. Originally the limit was baked into the pageserver with the idea that the pageserver knows better what its own resources tolerate than the storage controller does, but in practice it would be probably be easier to understand all this if we just did it controller-side. So there's scope to refactor here in future. 
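To make the arithmetic above concrete, here is a minimal, self-contained sketch of how a static shard cap turns into a utilization percentage and why 200% oversubscription doubles the effective limit. This is illustrative only and is not the real `PageserverUtilization` code; the helper name and the percentage formula are assumptions.

```rust
/// Illustrative only: report utilization as a percentage of a static shard cap.
fn shard_utilization_pct(shard_count: u32, max_shards: u32) -> u32 {
    shard_count * 100 / max_shards
}

fn main() {
    // Old cap: a node only reports 100% at 20,000 shards, so a controller
    // willing to oversubscribe to 200% packs in roughly 40,000 shards.
    assert_eq!(shard_utilization_pct(20_000, 20_000), 100);
    assert_eq!(shard_utilization_pct(40_000, 20_000), 200);

    // New cap: 100% at 5,000 shards, so 200% oversubscription tops out around
    // 10,000 shards, matching the original ~10k-per-node intent.
    assert_eq!(shard_utilization_pct(5_000, 5_000), 100);
    assert_eq!(shard_utilization_pct(10_000, 5_000), 200);
}
```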
--- pageserver/src/utilization.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pageserver/src/utilization.rs b/pageserver/src/utilization.rs index a0223f3bce..093a944777 100644 --- a/pageserver/src/utilization.rs +++ b/pageserver/src/utilization.rs @@ -49,7 +49,7 @@ pub(crate) fn regenerate( }; // Express a static value for how many shards we may schedule on one node - const MAX_SHARDS: u32 = 20000; + const MAX_SHARDS: u32 = 5000; let mut doc = PageserverUtilization { disk_usage_bytes: used, From aabf455dfbeb3efa00673c069249aaa90004f793 Mon Sep 17 00:00:00 2001 From: John Spray Date: Mon, 27 Jan 2025 17:24:42 +0000 Subject: [PATCH 14/72] README: clarify that neon_local is a dev/test tool (#10512) ## Problem From time to time, folks discover our `control_plane/` folder and make the (reasonable) mistake of thinking it's a tool for running full-sized Neon systems, whereas in reality it is a tool for dev/test. ## Summary of changes - Change control_plane's readme title to "Local Development Control Plane (`neon_local`)` - Change "Running local installation" to "Running a local development environment" in the main readme --- README.md | 6 ++++-- control_plane/README.md | 8 ++++++-- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 1417d6b9e7..4453904346 100644 --- a/README.md +++ b/README.md @@ -21,8 +21,10 @@ The Neon storage engine consists of two major components: See developer documentation in [SUMMARY.md](/docs/SUMMARY.md) for more information. -## Running local installation +## Running a local development environment +Neon can be run on a workstation for small experiments and to test code changes, by +following these instructions. #### Installing dependencies on Linux 1. Install build dependencies and other applicable packages @@ -238,7 +240,7 @@ postgres=# select * from t; > cargo neon stop ``` -More advanced usages can be found at [Control Plane and Neon Local](./control_plane/README.md). +More advanced usages can be found at [Local Development Control Plane (`neon_local`))](./control_plane/README.md). #### Handling build failures diff --git a/control_plane/README.md b/control_plane/README.md index 827aba5c1f..aa6f935e27 100644 --- a/control_plane/README.md +++ b/control_plane/README.md @@ -1,6 +1,10 @@ -# Control Plane and Neon Local +# Local Development Control Plane (`neon_local`) -This crate contains tools to start a Neon development environment locally. This utility can be used with the `cargo neon` command. +This crate contains tools to start a Neon development environment locally. This utility can be used with the `cargo neon` command. This is a convenience to invoke +the `neon_local` binary. + +**Note**: this is a dev/test tool -- a minimal control plane suitable for testing +code changes locally, but not suitable for running production systems. ## Example: Start with Postgres 16 From ebf44210ba2fdd66de25f2689182dcb1c4a6d3f7 Mon Sep 17 00:00:00 2001 From: John Spray Date: Mon, 27 Jan 2025 17:44:18 +0000 Subject: [PATCH 15/72] remote_storage: less sensitive timeout logging in ABS listings (#10518) ## Problem We were logging a warning after a single request timeout, while listing objects. Closes: https://github.com/neondatabase/neon/issues/10166 ## Summary of changes - These timeouts are a pretty normal part of life, so back it off to only log a warning after two in a row. 
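To spell out the intended behaviour of the changed condition: the try counter is reset to 1 after a successful request, so with `> 2` the warning only fires once at least two consecutive attempts have timed out, whereas `>= 2` warned after a single timeout. The snippet below is a simplified, self-contained model of that counter logic, not the actual `azure_blob.rs` listing code; the function name and the boolean-slice encoding of attempt outcomes are assumptions for illustration.

```rust
/// Illustrative model: `attempts` encodes successive tries, where `false` means
/// "timed out" and `true` means "succeeded". Returns whether a warning would be
/// logged when the first success arrives.
fn warns_on_success(attempts: &[bool]) -> bool {
    let mut timeout_try_cnt = 1u32;
    for &succeeded in attempts {
        if succeeded {
            // The patched condition: strictly greater than 2.
            return timeout_try_cnt > 2;
        }
        timeout_try_cnt += 1;
    }
    false
}

fn main() {
    assert!(!warns_on_success(&[false, true])); // one timeout, then success: stay quiet
    assert!(warns_on_success(&[false, false, true])); // two timeouts in a row: warn
}
```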
--- libs/remote_storage/src/azure_blob.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/libs/remote_storage/src/azure_blob.rs b/libs/remote_storage/src/azure_blob.rs index c89f50ef2b..9027a8bf55 100644 --- a/libs/remote_storage/src/azure_blob.rs +++ b/libs/remote_storage/src/azure_blob.rs @@ -377,7 +377,8 @@ impl RemoteStorage for AzureBlobStorage { let next_item = next_item?; - if timeout_try_cnt >= 2 { + // Log a warning if we saw two timeouts in a row before a successful request + if timeout_try_cnt > 2 { tracing::warn!("Azure Blob Storage list timed out and succeeded after {} tries", timeout_try_cnt); } timeout_try_cnt = 1; From 3d36dfe5337504d108ecb475ab4f983ccbde77cc Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Mon, 27 Jan 2025 20:19:55 +0100 Subject: [PATCH 16/72] fix: noisy `broker subscription failed` error during storage broker deploys (#10521) During broker deploys, pageservers log this noisy WARN en masse. I can trivially reproduce the WARN message in neon_local by SIGKILLing broker during e.g. `pgbench -i`. I don't understand why tonic is not detecting the error as `Code::Unavailable`. Until we find time to understand that / fix upstream, this PR adds the error message to the existing list of known error messages that get demoted to INFO level. Refs: - refs https://github.com/neondatabase/neon/issues/9562 --- .../src/tenant/timeline/walreceiver/connection_manager.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs index 583d6309ab..d8ea32cd45 100644 --- a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs +++ b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs @@ -164,9 +164,10 @@ pub(super) async fn connection_manager_loop_step( Ok(Some(broker_update)) => connection_manager_state.register_timeline_update(broker_update), Err(status) => { match status.code() { - Code::Unknown if status.message().contains("stream closed because of a broken pipe") || status.message().contains("connection reset") => { + Code::Unknown if status.message().contains("stream closed because of a broken pipe") || status.message().contains("connection reset") || status.message().contains("error reading a body from connection") => { // tonic's error handling doesn't provide a clear code for disconnections: we get // "h2 protocol error: error reading a body from connection: stream closed because of a broken pipe" + // => https://github.com/neondatabase/neon/issues/9562 info!("broker disconnected: {status}"); }, _ => { From eb9832d8460a80d9111b20124863444866d93d14 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Mon, 27 Jan 2025 20:38:18 +0100 Subject: [PATCH 17/72] Remove PQ_LIB_DIR env var (#10526) We now don't need libpq any more for the build of the storage controller, as we use `diesel-async` since #10280. Therefore, we remove the env var that gave cargo/rustc the location for libpq. 
Follow-up of #10280 --- .github/workflows/_build-and-test-locally.yml | 4 ---- .github/workflows/build-macos.yml | 2 +- .github/workflows/neon_extra_builds.yml | 2 +- Dockerfile | 2 +- 4 files changed, 3 insertions(+), 7 deletions(-) diff --git a/.github/workflows/_build-and-test-locally.yml b/.github/workflows/_build-and-test-locally.yml index 2daed90386..f97402a90b 100644 --- a/.github/workflows/_build-and-test-locally.yml +++ b/.github/workflows/_build-and-test-locally.yml @@ -158,8 +158,6 @@ jobs: - name: Run cargo build run: | - PQ_LIB_DIR=$(pwd)/pg_install/v16/lib - export PQ_LIB_DIR ${cov_prefix} mold -run cargo build $CARGO_FLAGS $CARGO_FEATURES --bins --tests # Do install *before* running rust tests because they might recompile the @@ -217,8 +215,6 @@ jobs: env: NEXTEST_RETRIES: 3 run: | - PQ_LIB_DIR=$(pwd)/pg_install/v16/lib - export PQ_LIB_DIR LD_LIBRARY_PATH=$(pwd)/pg_install/v17/lib export LD_LIBRARY_PATH diff --git a/.github/workflows/build-macos.yml b/.github/workflows/build-macos.yml index 01d82a1ed2..347a511e98 100644 --- a/.github/workflows/build-macos.yml +++ b/.github/workflows/build-macos.yml @@ -235,7 +235,7 @@ jobs: echo 'CPPFLAGS=-I/usr/local/opt/openssl@3/include' >> $GITHUB_ENV - name: Run cargo build (only for v17) - run: PQ_LIB_DIR=$(pwd)/pg_install/v17/lib cargo build --all --release -j$(sysctl -n hw.ncpu) + run: cargo build --all --release -j$(sysctl -n hw.ncpu) - name: Check that no warnings are produced (only for v17) run: ./run_clippy.sh diff --git a/.github/workflows/neon_extra_builds.yml b/.github/workflows/neon_extra_builds.yml index 5b5910badf..f077e04d1c 100644 --- a/.github/workflows/neon_extra_builds.yml +++ b/.github/workflows/neon_extra_builds.yml @@ -114,7 +114,7 @@ jobs: run: make walproposer-lib -j$(nproc) - name: Produce the build stats - run: PQ_LIB_DIR=$(pwd)/pg_install/v17/lib cargo build --all --release --timings -j$(nproc) + run: cargo build --all --release --timings -j$(nproc) - name: Configure AWS credentials uses: aws-actions/configure-aws-credentials@v4 diff --git a/Dockerfile b/Dockerfile index 2e4f8e5546..a8f7ae0a62 100644 --- a/Dockerfile +++ b/Dockerfile @@ -45,7 +45,7 @@ COPY --chown=nonroot . . ARG ADDITIONAL_RUSTFLAGS RUN set -e \ - && PQ_LIB_DIR=$(pwd)/pg_install/v${STABLE_PG_VERSION}/lib RUSTFLAGS="-Clinker=clang -Clink-arg=-fuse-ld=mold -Clink-arg=-Wl,--no-rosegment -Cforce-frame-pointers=yes ${ADDITIONAL_RUSTFLAGS}" cargo build \ + && RUSTFLAGS="-Clinker=clang -Clink-arg=-fuse-ld=mold -Clink-arg=-Wl,--no-rosegment -Cforce-frame-pointers=yes ${ADDITIONAL_RUSTFLAGS}" cargo build \ --bin pg_sni_router \ --bin pageserver \ --bin pagectl \ From 5477d7db93f47112e0aa3d6625997c06db99d23c Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Mon, 27 Jan 2025 11:47:49 -0800 Subject: [PATCH 18/72] fast_import: fixes for Postgres v17 (#10414) Now that the tests are run on v17, they're also run in debug mode, which is slow. Increase statement_timeout in the test to work around that. 
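For context on the v17 part of this change: the importer decides which Postgres major version a data directory belongs to by matching the catalog version number from the control file. Below is a standalone sketch of that mapping, with the catalog version constants taken from the diff in this patch; the helper name and error type are illustrative, not the real `importbucket_client.rs` signature.

```rust
/// Illustrative sketch: map a pg_control catalog version to a Postgres major version.
fn pg_major_from_catversion(catversion: u32) -> Result<u32, String> {
    match catversion {
        202107181 => Ok(14),
        202209061 => Ok(15),
        202307071 => Ok(16),
        202406281 => Ok(17), // the arm this patch adds, enabling v17 imports
        other => Err(format!("unrecognized catalog version {other}")),
    }
}

fn main() {
    assert_eq!(pg_major_from_catversion(202406281), Ok(17));
}
```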
--- pageserver/src/tenant/timeline/import_pgdata.rs | 2 +- .../import_pgdata/importbucket_client.rs | 2 +- test_runner/regress/test_import_pgdata.py | 16 ++++------------ 3 files changed, 6 insertions(+), 14 deletions(-) diff --git a/pageserver/src/tenant/timeline/import_pgdata.rs b/pageserver/src/tenant/timeline/import_pgdata.rs index de56468580..6940179ae9 100644 --- a/pageserver/src/tenant/timeline/import_pgdata.rs +++ b/pageserver/src/tenant/timeline/import_pgdata.rs @@ -113,7 +113,7 @@ pub async fn doit( match res { Ok(_) => break, Err(err) => { - info!(?err, "indefintely waiting for pgdata to finish"); + info!(?err, "indefinitely waiting for pgdata to finish"); if tokio::time::timeout(std::time::Duration::from_secs(10), cancel.cancelled()) .await .is_ok() diff --git a/pageserver/src/tenant/timeline/import_pgdata/importbucket_client.rs b/pageserver/src/tenant/timeline/import_pgdata/importbucket_client.rs index bc4d148a29..68937e535d 100644 --- a/pageserver/src/tenant/timeline/import_pgdata/importbucket_client.rs +++ b/pageserver/src/tenant/timeline/import_pgdata/importbucket_client.rs @@ -308,7 +308,7 @@ impl ControlFile { 202107181 => 14, 202209061 => 15, 202307071 => 16, - /* XXX pg17 */ + 202406281 => 17, catversion => { anyhow::bail!("unrecognized catalog version {catversion}") } diff --git a/test_runner/regress/test_import_pgdata.py b/test_runner/regress/test_import_pgdata.py index d02a9d19db..182f715b0e 100644 --- a/test_runner/regress/test_import_pgdata.py +++ b/test_runner/regress/test_import_pgdata.py @@ -14,10 +14,8 @@ from fixtures.pageserver.http import ( ImportPgdataIdemptencyKey, PageserverApiException, ) -from fixtures.pg_version import PgVersion from fixtures.port_distributor import PortDistributor from fixtures.remote_storage import RemoteStorageKind -from fixtures.utils import run_only_on_postgres from pytest_httpserver import HTTPServer from werkzeug.wrappers.request import Request from werkzeug.wrappers.response import Response @@ -39,10 +37,6 @@ smoke_params = [ ] -@run_only_on_postgres( - [PgVersion.V14, PgVersion.V15, PgVersion.V16], - "newer control file catalog version and struct format isn't supported", -) @pytest.mark.parametrize("shard_count,stripe_size,rel_block_size", smoke_params) def test_pgdata_import_smoke( vanilla_pg: VanillaPostgres, @@ -117,13 +111,15 @@ def test_pgdata_import_smoke( # TODO: would be nicer to just compare pgdump # Enable IO concurrency for batching on large sequential scan, to avoid making - # this test unnecessarily onerous on CPU + # this test unnecessarily onerous on CPU. Especially on debug mode, it's still + # pretty onerous though, so increase statement_timeout to avoid timeouts. assert ep.safe_psql_many( [ "set effective_io_concurrency=32;", + "SET statement_timeout='300s';", "select count(*), sum(data::bigint)::bigint from t", ] - ) == [[], [(expect_nrows, expect_sum)]] + ) == [[], [], [(expect_nrows, expect_sum)]] validate_vanilla_equivalence(vanilla_pg) @@ -317,10 +313,6 @@ def test_pgdata_import_smoke( br_initdb_endpoint.safe_psql("select * from othertable") -@run_only_on_postgres( - [PgVersion.V14, PgVersion.V15, PgVersion.V16], - "newer control file catalog version and struct format isn't supported", -) def test_fast_import_binary( test_output_dir, vanilla_pg: VanillaPostgres, From d73f4a6470b022a4ca5a163adaf8efc79f6b7e15 Mon Sep 17 00:00:00 2001 From: John Spray Date: Mon, 27 Jan 2025 21:02:25 +0000 Subject: [PATCH 19/72] pageserver: retry wrapper on manifest upload (#10524) ## Problem On remote storage errors (e.g. 
I/O timeout) uploading tenant manifest, all of compaction could fail. This is a problem IRL because we shouldn't abort compaction on a single IO error, and in tests because it generates spurious failures. Related: https://github.com/orgs/neondatabase/projects/51/views/2?sliceBy%5Bvalue%5D=jcsp&pane=issue&itemId=93692919&issue=neondatabase%7Cneon%7C10389 ## Summary of changes - Use `backoff::retry` when uploading tenant manifest --- pageserver/src/tenant.rs | 46 +++++++++++++++++++++++++--------------- 1 file changed, 29 insertions(+), 17 deletions(-) diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index efe89cb982..d359270be4 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -37,6 +37,8 @@ use remote_timeline_client::manifest::{ OffloadedTimelineManifest, TenantManifest, LATEST_TENANT_MANIFEST_VERSION, }; use remote_timeline_client::UploadQueueNotReadyError; +use remote_timeline_client::FAILED_REMOTE_OP_RETRIES; +use remote_timeline_client::FAILED_UPLOAD_WARN_THRESHOLD; use std::collections::BTreeMap; use std::fmt; use std::future::Future; @@ -5308,27 +5310,37 @@ impl Tenant { return Ok(()); } - upload_tenant_manifest( - &self.remote_storage, - &self.tenant_shard_id, - self.generation, - &manifest, + // Remote storage does no retries internally, so wrap it + match backoff::retry( + || async { + upload_tenant_manifest( + &self.remote_storage, + &self.tenant_shard_id, + self.generation, + &manifest, + &self.cancel, + ) + .await + }, + |_e| self.cancel.is_cancelled(), + FAILED_UPLOAD_WARN_THRESHOLD, + FAILED_REMOTE_OP_RETRIES, + "uploading tenant manifest", &self.cancel, ) .await - .map_err(|e| { - if self.cancel.is_cancelled() { - TenantManifestError::Cancelled - } else { - TenantManifestError::RemoteStorage(e) + { + None => Err(TenantManifestError::Cancelled), + Some(Err(_)) if self.cancel.is_cancelled() => Err(TenantManifestError::Cancelled), + Some(Err(e)) => Err(TenantManifestError::RemoteStorage(e)), + Some(Ok(_)) => { + // Store the successfully uploaded manifest, so that future callers can avoid + // re-uploading the same thing. + *guard = Some(manifest); + + Ok(()) } - })?; - - // Store the successfully uploaded manifest, so that future callers can avoid - // re-uploading the same thing. - *guard = Some(manifest); - - Ok(()) + } } } From c8fbbb9b65587d25b9dbd3c8f21266ce07159d02 Mon Sep 17 00:00:00 2001 From: Peter Bendel Date: Mon, 27 Jan 2025 22:06:05 +0100 Subject: [PATCH 20/72] Test ingest_benchmark with different stripe size and also PostgreSQL version 17 (#10510) We want to verify if pageserver stripe size has an impact on ingest performance. We want to verify if ingest performance has improved or regressed with postgres version 17. 
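As a quick reference for the stripe sizes exercised below: the value is specified in 8 KiB pages, so the two pre-sharded variants correspond to 16 MiB and 256 MiB stripes. A tiny sanity check of that conversion (illustrative helper; the constants come from the workflow matrix in this change):

```rust
/// Illustrative only: convert a stripe size given in 8 KiB pages to MiB.
fn stripe_size_mib(stripe_size_pages: u64) -> u64 {
    stripe_size_pages * 8 / 1024
}

fn main() {
    assert_eq!(stripe_size_mib(2048), 16); // 2048 pages  -> 16 MiB stripes
    assert_eq!(stripe_size_mib(32768), 256); // 32768 pages -> 256 MiB stripes
}
```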
## Summary of changes - Allow to create new project with different postgres versions - allow to pre-shard new project with different stripe sizes instead of relying on storage manager to shard_split the project once a threshold is exceeded Replaces https://github.com/neondatabase/neon/pull/10509 Test run https://github.com/neondatabase/neon/actions/runs/12986410381 --- .../actions/neon-project-create/action.yml | 48 +++++++++++++++++++ .github/workflows/ingest_benchmark.yml | 33 ++++++++++--- 2 files changed, 75 insertions(+), 6 deletions(-) diff --git a/.github/actions/neon-project-create/action.yml b/.github/actions/neon-project-create/action.yml index f4a194639f..11f46bce8e 100644 --- a/.github/actions/neon-project-create/action.yml +++ b/.github/actions/neon-project-create/action.yml @@ -17,6 +17,31 @@ inputs: compute_units: description: '[Min, Max] compute units' default: '[1, 1]' + # settings below only needed if you want the project to be sharded from the beginning + shard_split_project: + description: 'by default new projects are not shard-split, specify true to shard-split' + required: false + default: 'false' + admin_api_key: + description: 'Admin API Key needed for shard-splitting. Must be specified if shard_split_project is true' + required: false + shard_count: + description: 'Number of shards to split the project into, only applies if shard_split_project is true' + required: false + default: '8' + stripe_size: + description: 'Stripe size, optional, in 8kiB pages. e.g. set 2048 for 16MB stripes. Default is 128 MiB, only applies if shard_split_project is true' + required: false + default: '32768' + psql_path: + description: 'Path to psql binary - it is caller responsibility to provision the psql binary' + required: false + default: '/tmp/neon/pg_install/v16/bin/psql' + libpq_lib_path: + description: 'Path to directory containing libpq library - it is caller responsibility to provision the libpq library' + required: false + default: '/tmp/neon/pg_install/v16/lib' + outputs: dsn: @@ -63,6 +88,23 @@ runs: echo "project_id=${project_id}" >> $GITHUB_OUTPUT echo "Project ${project_id} has been created" + + if [ "${SHARD_SPLIT_PROJECT}" = "true" ]; then + # determine tenant ID + TENANT_ID=`${PSQL} ${dsn} -t -A -c "SHOW neon.tenant_id"` + + echo "Splitting project ${project_id} with tenant_id ${TENANT_ID} into $((SHARD_COUNT)) shards with stripe size $((STRIPE_SIZE))" + + echo "Sending PUT request to https://${API_HOST}/regions/${REGION_ID}/api/v1/admin/storage/proxy/control/v1/tenant/${TENANT_ID}/shard_split" + echo "with body {\"new_shard_count\": $((SHARD_COUNT)), \"new_stripe_size\": $((STRIPE_SIZE))}" + + # we need an ADMIN API KEY to invoke storage controller API for shard splitting (bash -u above checks that the variable is set) + curl -X PUT \ + "https://${API_HOST}/regions/${REGION_ID}/api/v1/admin/storage/proxy/control/v1/tenant/${TENANT_ID}/shard_split" \ + -H "Accept: application/json" -H "Content-Type: application/json" -H "Authorization: Bearer ${ADMIN_API_KEY}" \ + -d "{\"new_shard_count\": $SHARD_COUNT, \"new_stripe_size\": $STRIPE_SIZE}" + fi + env: API_HOST: ${{ inputs.api_host }} API_KEY: ${{ inputs.api_key }} @@ -70,3 +112,9 @@ runs: POSTGRES_VERSION: ${{ inputs.postgres_version }} MIN_CU: ${{ fromJSON(inputs.compute_units)[0] }} MAX_CU: ${{ fromJSON(inputs.compute_units)[1] }} + SHARD_SPLIT_PROJECT: ${{ inputs.shard_split_project }} + ADMIN_API_KEY: ${{ inputs.admin_api_key }} + SHARD_COUNT: ${{ inputs.shard_count }} + STRIPE_SIZE: ${{ inputs.stripe_size }} + PSQL: 
${{ inputs.psql_path }} + LD_LIBRARY_PATH: ${{ inputs.libpq_lib_path }} diff --git a/.github/workflows/ingest_benchmark.yml b/.github/workflows/ingest_benchmark.yml index fc33c0a980..7b303fa37a 100644 --- a/.github/workflows/ingest_benchmark.yml +++ b/.github/workflows/ingest_benchmark.yml @@ -28,7 +28,24 @@ jobs: strategy: fail-fast: false # allow other variants to continue even if one fails matrix: - target_project: [new_empty_project, large_existing_project] + include: + - target_project: new_empty_project_stripe_size_2048 + stripe_size: 2048 # 16 MiB + postgres_version: 16 + - target_project: new_empty_project_stripe_size_32768 + stripe_size: 32768 # 256 MiB # note that this is different from null because using null will shard_split the project only if it reaches the threshold + # while here it is sharded from the beginning with a shard size of 256 MiB + postgres_version: 16 + - target_project: new_empty_project + stripe_size: null # run with neon defaults which will shard split only when reaching the threshold + postgres_version: 16 + - target_project: new_empty_project + stripe_size: null # run with neon defaults which will shard split only when reaching the threshold + postgres_version: 17 + - target_project: large_existing_project + stripe_size: null # cannot re-shared or choose different stripe size for existing, already sharded project + postgres_version: 16 + max-parallel: 1 # we want to run each stripe size sequentially to be able to compare the results permissions: contents: write statuses: write @@ -67,17 +84,21 @@ jobs: aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} - name: Create Neon Project - if: ${{ matrix.target_project == 'new_empty_project' }} + if: ${{ startsWith(matrix.target_project, 'new_empty_project') }} id: create-neon-project-ingest-target uses: ./.github/actions/neon-project-create with: region_id: aws-us-east-2 - postgres_version: 16 + postgres_version: ${{ matrix.postgres_version }} compute_units: '[7, 7]' # we want to test large compute here to avoid compute-side bottleneck api_key: ${{ secrets.NEON_STAGING_API_KEY }} + shard_split_project: ${{ matrix.stripe_size != null && 'true' || 'false' }} + admin_api_key: ${{ secrets.NEON_STAGING_ADMIN_API_KEY }} + shard_count: 8 + stripe_size: ${{ matrix.stripe_size }} - name: Initialize Neon project - if: ${{ matrix.target_project == 'new_empty_project' }} + if: ${{ startsWith(matrix.target_project, 'new_empty_project') }} env: BENCHMARK_INGEST_TARGET_CONNSTR: ${{ steps.create-neon-project-ingest-target.outputs.dsn }} NEW_PROJECT_ID: ${{ steps.create-neon-project-ingest-target.outputs.project_id }} @@ -130,7 +151,7 @@ jobs: test_selection: performance/test_perf_ingest_using_pgcopydb.py run_in_parallel: false extra_params: -s -m remote_cluster --timeout 86400 -k test_ingest_performance_using_pgcopydb - pg_version: v16 + pg_version: v${{ matrix.postgres_version }} save_perf_report: true aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} env: @@ -146,7 +167,7 @@ jobs: ${PSQL} "${BENCHMARK_INGEST_TARGET_CONNSTR}" -c "\dt+" - name: Delete Neon Project - if: ${{ always() && matrix.target_project == 'new_empty_project' }} + if: ${{ always() && startsWith(matrix.target_project, 'new_empty_project') }} uses: ./.github/actions/neon-project-delete with: project_id: ${{ steps.create-neon-project-ingest-target.outputs.project_id }} From 62a717a2ca8218bacf90a36cc84ebb6aa2976711 Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Tue, 28 Jan 2025 13:11:51 +0000 Subject: [PATCH 21/72] pageserver: use PS node id for SK appname 
(#10522) ## Problem This one is fairly embarrassing. Safekeeper node id was used in the pageserver application name when connecting to safekeepers. ## Summary of changes Use the right node id. Closes https://github.com/neondatabase/neon/issues/10461 --- .../tenant/timeline/walreceiver/walreceiver_connection.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs index 01c272633c..d69e7dbd32 100644 --- a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs +++ b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs @@ -118,7 +118,7 @@ pub(super) async fn handle_walreceiver_connection( cancellation: CancellationToken, connect_timeout: Duration, ctx: RequestContext, - node: NodeId, + safekeeper_node: NodeId, ingest_batch_size: u64, ) -> Result<(), WalReceiverError> { debug_assert_current_span_has_tenant_and_timeline_id(); @@ -140,7 +140,7 @@ pub(super) async fn handle_walreceiver_connection( let (replication_client, connection) = { let mut config = wal_source_connconf.to_tokio_postgres_config(); - config.application_name(format!("pageserver-{}", node.0).as_str()); + config.application_name(format!("pageserver-{}", timeline.conf.id.0).as_str()); config.replication_mode(tokio_postgres::config::ReplicationMode::Physical); match time::timeout(connect_timeout, config.connect(postgres::NoTls)).await { Ok(client_and_conn) => client_and_conn?, @@ -162,7 +162,7 @@ pub(super) async fn handle_walreceiver_connection( latest_wal_update: Utc::now().naive_utc(), streaming_lsn: None, commit_lsn: None, - node, + node: safekeeper_node, }; if let Err(e) = events_sender.send(TaskStateUpdate::Progress(connection_status)) { warn!("Wal connection event listener dropped right after connection init, aborting the connection: {e}"); From ed942b05f7af31082e9e8c893910b757e3de76f3 Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Tue, 28 Jan 2025 14:33:58 +0100 Subject: [PATCH 22/72] Revert "pageserver: revert flush backpressure" (#10402)" (#10533) This reverts commit 9e55d798036d646cbc27cd0bb2c849ed2e48ce85. We'll still need this until we can tune L0 flush backpressure and compaction. I'll add a setting to disable this separately. 
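For readers skimming the reinstated diff: the mechanism boils down to the layer flush loop blocking on the remote upload queue after each flush, and recording how long it waited in a metric. The sketch below is a simplified, synchronous model of that shape; the real code is async and lives on `Timeline`, and the type and method names here are placeholders rather than the actual pageserver API.

```rust
use std::time::Instant;

/// Placeholder stand-in for the remote timeline client.
struct RemoteClient;

impl RemoteClient {
    /// Pretend to wait until every previously scheduled upload has completed.
    fn wait_completion(&self) -> Result<(), String> {
        Ok(())
    }
}

/// Simplified model of one flush-loop iteration with upload backpressure.
fn flush_one_layer(remote_client: &RemoteClient, wait_upload_seconds: &mut f64) -> Result<(), String> {
    // ... write the frozen in-memory layer to disk and schedule its upload ...

    // Backpressure: do not proceed (and therefore do not accept more ingest)
    // until the scheduled uploads have been persisted to remote storage.
    let start = Instant::now();
    remote_client.wait_completion()?;
    *wait_upload_seconds += start.elapsed().as_secs_f64();
    Ok(())
}

fn main() {
    let mut waited = 0.0;
    flush_one_layer(&RemoteClient, &mut waited).unwrap();
    println!("waited {waited:.3}s for uploads");
}
```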
--- pageserver/src/metrics.rs | 25 ++++++++++- pageserver/src/tenant/timeline.rs | 38 +++++++++++++---- test_runner/fixtures/metrics.py | 1 + test_runner/regress/test_branching.py | 13 ++++-- test_runner/regress/test_remote_storage.py | 48 ++++++++++++++++++++++ 5 files changed, 112 insertions(+), 13 deletions(-) diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index 5247a4a2ac..d2c778276d 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -3,7 +3,7 @@ use metrics::{ register_counter_vec, register_gauge_vec, register_histogram, register_histogram_vec, register_int_counter, register_int_counter_pair_vec, register_int_counter_vec, register_int_gauge, register_int_gauge_vec, register_uint_gauge, register_uint_gauge_vec, - Counter, CounterVec, GaugeVec, Histogram, HistogramVec, IntCounter, IntCounterPair, + Counter, CounterVec, Gauge, GaugeVec, Histogram, HistogramVec, IntCounter, IntCounterPair, IntCounterPairVec, IntCounterVec, IntGauge, IntGaugeVec, UIntGauge, UIntGaugeVec, }; use once_cell::sync::Lazy; @@ -398,6 +398,15 @@ pub(crate) static WAIT_LSN_TIME: Lazy = Lazy::new(|| { .expect("failed to define a metric") }); +static FLUSH_WAIT_UPLOAD_TIME: Lazy = Lazy::new(|| { + register_gauge_vec!( + "pageserver_flush_wait_upload_seconds", + "Time spent waiting for preceding uploads during layer flush", + &["tenant_id", "shard_id", "timeline_id"] + ) + .expect("failed to define a metric") +}); + static LAST_RECORD_LSN: Lazy = Lazy::new(|| { register_int_gauge_vec!( "pageserver_last_record_lsn", @@ -2569,6 +2578,7 @@ pub(crate) struct TimelineMetrics { timeline_id: String, pub flush_time_histo: StorageTimeMetrics, pub flush_delay_histo: StorageTimeMetrics, + pub flush_wait_upload_time_gauge: Gauge, pub compact_time_histo: StorageTimeMetrics, pub create_images_time_histo: StorageTimeMetrics, pub logical_size_histo: StorageTimeMetrics, @@ -2620,6 +2630,9 @@ impl TimelineMetrics { &shard_id, &timeline_id, ); + let flush_wait_upload_time_gauge = FLUSH_WAIT_UPLOAD_TIME + .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id]) + .unwrap(); let compact_time_histo = StorageTimeMetrics::new( StorageTimeOperation::Compact, &tenant_id, @@ -2766,6 +2779,7 @@ impl TimelineMetrics { timeline_id, flush_time_histo, flush_delay_histo, + flush_wait_upload_time_gauge, compact_time_histo, create_images_time_histo, logical_size_histo, @@ -2815,6 +2829,14 @@ impl TimelineMetrics { self.resident_physical_size_gauge.get() } + pub(crate) fn flush_wait_upload_time_gauge_add(&self, duration: f64) { + self.flush_wait_upload_time_gauge.add(duration); + crate::metrics::FLUSH_WAIT_UPLOAD_TIME + .get_metric_with_label_values(&[&self.tenant_id, &self.shard_id, &self.timeline_id]) + .unwrap() + .add(duration); + } + pub(crate) fn shutdown(&self) { let was_shutdown = self .shutdown @@ -2832,6 +2854,7 @@ impl TimelineMetrics { let shard_id = &self.shard_id; let _ = LAST_RECORD_LSN.remove_label_values(&[tenant_id, shard_id, timeline_id]); let _ = DISK_CONSISTENT_LSN.remove_label_values(&[tenant_id, shard_id, timeline_id]); + let _ = FLUSH_WAIT_UPLOAD_TIME.remove_label_values(&[tenant_id, shard_id, timeline_id]); let _ = STANDBY_HORIZON.remove_label_values(&[tenant_id, shard_id, timeline_id]); { RESIDENT_PHYSICAL_SIZE_GLOBAL.sub(self.resident_physical_size_get()); diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 07689b5e76..61981db24a 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -144,15 +144,19 
@@ use self::layer_manager::LayerManager; use self::logical_size::LogicalSize; use self::walreceiver::{WalReceiver, WalReceiverConf}; -use super::config::TenantConf; -use super::remote_timeline_client::index::IndexPart; -use super::remote_timeline_client::RemoteTimelineClient; -use super::secondary::heatmap::{HeatMapLayer, HeatMapTimeline}; -use super::storage_layer::{LayerFringe, LayerVisibilityHint, ReadableLayer}; -use super::upload_queue::NotInitialized; -use super::GcError; use super::{ - debug_assert_current_span_has_tenant_and_timeline_id, AttachedTenantConf, MaybeOffloaded, + config::TenantConf, storage_layer::LayerVisibilityHint, upload_queue::NotInitialized, + MaybeOffloaded, +}; +use super::{debug_assert_current_span_has_tenant_and_timeline_id, AttachedTenantConf}; +use super::{remote_timeline_client::index::IndexPart, storage_layer::LayerFringe}; +use super::{ + remote_timeline_client::RemoteTimelineClient, remote_timeline_client::WaitCompletionError, + storage_layer::ReadableLayer, +}; +use super::{ + secondary::heatmap::{HeatMapLayer, HeatMapTimeline}, + GcError, }; #[cfg(test)] @@ -4034,6 +4038,24 @@ impl Timeline { // release lock on 'layers' }; + // Backpressure mechanism: wait with continuation of the flush loop until we have uploaded all layer files. + // This makes us refuse ingest until the new layers have been persisted to the remote + let start = Instant::now(); + self.remote_client + .wait_completion() + .await + .map_err(|e| match e { + WaitCompletionError::UploadQueueShutDownOrStopped + | WaitCompletionError::NotInitialized( + NotInitialized::ShuttingDown | NotInitialized::Stopped, + ) => FlushLayerError::Cancelled, + WaitCompletionError::NotInitialized(NotInitialized::Uninitialized) => { + FlushLayerError::Other(anyhow!(e).into()) + } + })?; + let duration = start.elapsed().as_secs_f64(); + self.metrics.flush_wait_upload_time_gauge_add(duration); + // FIXME: between create_delta_layer and the scheduling of the upload in `update_metadata_file`, // a compaction can delete the file and then it won't be available for uploads any more. // We still schedule the upload, resulting in an error, but ideally we'd somehow avoid this diff --git a/test_runner/fixtures/metrics.py b/test_runner/fixtures/metrics.py index 37859901d4..fd7e193778 100644 --- a/test_runner/fixtures/metrics.py +++ b/test_runner/fixtures/metrics.py @@ -165,6 +165,7 @@ PAGESERVER_PER_TENANT_METRICS: tuple[str, ...] 
= ( "pageserver_evictions_with_low_residence_duration_total", "pageserver_aux_file_estimated_size", "pageserver_valid_lsn_lease_count", + "pageserver_flush_wait_upload_seconds", counter("pageserver_tenant_throttling_count_accounted_start"), counter("pageserver_tenant_throttling_count_accounted_finish"), counter("pageserver_tenant_throttling_wait_usecs_sum"), diff --git a/test_runner/regress/test_branching.py b/test_runner/regress/test_branching.py index a4056404f0..34e4e994cb 100644 --- a/test_runner/regress/test_branching.py +++ b/test_runner/regress/test_branching.py @@ -19,7 +19,6 @@ from fixtures.pageserver.utils import wait_until_tenant_active from fixtures.utils import query_scalar from performance.test_perf_pgbench import get_scales_matrix from requests import RequestException -from requests.exceptions import RetryError # Test branch creation @@ -177,8 +176,11 @@ def test_cannot_create_endpoint_on_non_uploaded_timeline(neon_env_builder: NeonE env.neon_cli.mappings_map_branch(initial_branch, env.initial_tenant, env.initial_timeline) - with pytest.raises(RuntimeError, match="is not active, state: Loading"): - env.endpoints.create_start(initial_branch, tenant_id=env.initial_tenant) + with pytest.raises(RuntimeError, match="ERROR: Not found: Timeline"): + env.endpoints.create_start( + initial_branch, tenant_id=env.initial_tenant, basebackup_request_tries=2 + ) + ps_http.configure_failpoints(("before-upload-index-pausable", "off")) finally: env.pageserver.stop(immediate=True) @@ -219,7 +221,10 @@ def test_cannot_branch_from_non_uploaded_branch(neon_env_builder: NeonEnvBuilder branch_id = TimelineId.generate() - with pytest.raises(RetryError, match="too many 503 error responses"): + with pytest.raises( + PageserverApiException, + match="Cannot branch off the timeline that's not present in pageserver", + ): ps_http.timeline_create( env.pg_version, env.initial_tenant, diff --git a/test_runner/regress/test_remote_storage.py b/test_runner/regress/test_remote_storage.py index f6bc6f6f41..c39c74fa2a 100644 --- a/test_runner/regress/test_remote_storage.py +++ b/test_runner/regress/test_remote_storage.py @@ -786,6 +786,54 @@ def test_empty_branch_remote_storage_upload_on_restart(neon_env_builder: NeonEnv create_thread.join() +def test_paused_upload_stalls_checkpoint( + neon_env_builder: NeonEnvBuilder, +): + """ + This test checks that checkpoints block on uploads to remote storage. + """ + neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS) + + env = neon_env_builder.init_start( + initial_tenant_conf={ + # Set a small compaction threshold + "compaction_threshold": "3", + # Disable GC + "gc_period": "0s", + # disable PITR + "pitr_interval": "0s", + } + ) + + env.pageserver.allowed_errors.append( + f".*PUT.* path=/v1/tenant/{env.initial_tenant}/timeline.* request was dropped before completing" + ) + + tenant_id = env.initial_tenant + timeline_id = env.initial_timeline + + client = env.pageserver.http_client() + layers_at_creation = client.layer_map_info(tenant_id, timeline_id) + deltas_at_creation = len(layers_at_creation.delta_layers()) + assert ( + deltas_at_creation == 1 + ), "are you fixing #5863? make sure we end up with 2 deltas at the end of endpoint lifecycle" + + # Make new layer uploads get stuck. + # Note that timeline creation waits for the initial layers to reach remote storage. + # So at this point, the `layers_at_creation` are in remote storage. 
+ client.configure_failpoints(("before-upload-layer-pausable", "pause")) + + with env.endpoints.create_start("main", tenant_id=tenant_id) as endpoint: + # Build two tables with some data inside + endpoint.safe_psql("CREATE TABLE foo AS SELECT x FROM generate_series(1, 10000) g(x)") + wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id) + + with pytest.raises(ReadTimeout): + client.timeline_checkpoint(tenant_id, timeline_id, timeout=5) + client.configure_failpoints(("before-upload-layer-pausable", "off")) + + def wait_upload_queue_empty( client: PageserverHttpClient, tenant_id: TenantId, timeline_id: TimelineId ): From 83b6bfa229e2b983ab93a1e5401f3a31b59e3c03 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Tue, 28 Jan 2025 14:39:53 +0100 Subject: [PATCH 23/72] Re-download layer if its local and on-disk metadata diverge (#10529) In #10308, we noticed many warnings about the local layer having different sizes on-disk compared to the metadata. However, the layer downloader would never redownload layer files if the sizes or generation numbers change. This is obviously a bug, which we aim to fix with this PR. This change also moves the code deciding what to do about a layer to a dedicated function: before we handled the "routing" via control flow, but now it's become too complicated and it is nicer to have the different verdicts for a layer spelled out in a list/match. --- .../tenant/remote_timeline_client/index.rs | 4 + pageserver/src/tenant/secondary/downloader.rs | 155 +++++++++++------- 2 files changed, 99 insertions(+), 60 deletions(-) diff --git a/pageserver/src/tenant/remote_timeline_client/index.rs b/pageserver/src/tenant/remote_timeline_client/index.rs index 3824bc8f11..b8b18005fd 100644 --- a/pageserver/src/tenant/remote_timeline_client/index.rs +++ b/pageserver/src/tenant/remote_timeline_client/index.rs @@ -222,6 +222,10 @@ impl LayerFileMetadata { shard, } } + /// Helper to get both generation and file size in a tuple + pub fn generation_file_size(&self) -> (Generation, u64) { + (self.generation, self.file_size) + } } /// Limited history of earlier ancestors. diff --git a/pageserver/src/tenant/secondary/downloader.rs b/pageserver/src/tenant/secondary/downloader.rs index 395e34e404..cf524fcb25 100644 --- a/pageserver/src/tenant/secondary/downloader.rs +++ b/pageserver/src/tenant/secondary/downloader.rs @@ -559,6 +559,13 @@ impl JobGenerator { @@ -1008,69 +1015,17 @@ impl<'a> TenantDownloader<'a> { return (Err(UpdateError::Restart), touched); } - // Existing on-disk layers: just update their access time. - if let Some(on_disk) = timeline_state.on_disk_layers.get(&layer.name) { - tracing::debug!("Layer {} is already on disk", layer.name); - - if cfg!(debug_assertions) { - // Debug for https://github.com/neondatabase/neon/issues/6966: check that the files we think - // are already present on disk are really there. - match tokio::fs::metadata(&on_disk.local_path).await { - Ok(meta) => { - tracing::debug!( - "Layer {} present at {}, size {}", - layer.name, - on_disk.local_path, - meta.len(), - ); - } - Err(e) => { - tracing::warn!( - "Layer {} not found at {} ({})", - layer.name, - on_disk.local_path, - e - ); - debug_assert!(false); - } - } - } - - if on_disk.metadata != layer.metadata || on_disk.access_time != layer.access_time { - // We already have this layer on disk. Update its access time. 
- tracing::debug!( - "Access time updated for layer {}: {} -> {}", - layer.name, - strftime(&on_disk.access_time), - strftime(&layer.access_time) - ); - touched.push(layer); - } - continue; - } else { - tracing::debug!("Layer {} not present on disk yet", layer.name); - } - - // Eviction: if we evicted a layer, then do not re-download it unless it was accessed more - // recently than it was evicted. - if let Some(evicted_at) = timeline_state.evicted_at.get(&layer.name) { - if &layer.access_time > evicted_at { - tracing::info!( - "Re-downloading evicted layer {}, accessed at {}, evicted at {}", - layer.name, - strftime(&layer.access_time), - strftime(evicted_at) - ); - } else { - tracing::trace!( - "Not re-downloading evicted layer {}, accessed at {}, evicted at {}", - layer.name, - strftime(&layer.access_time), - strftime(evicted_at) - ); + match self.layer_action(&timeline_state, &layer).await { + LayerAction::Download => (), + LayerAction::NoAction => continue, + LayerAction::Skip => { self.skip_layer(layer); continue; } + LayerAction::Touch => { + touched.push(layer); + continue; + } } match self @@ -1091,6 +1046,86 @@ impl<'a> TenantDownloader<'a> { (Ok(()), touched) } + async fn layer_action( + &self, + timeline_state: &SecondaryDetailTimeline, + layer: &HeatMapLayer, + ) -> LayerAction { + // Existing on-disk layers: just update their access time. + if let Some(on_disk) = timeline_state.on_disk_layers.get(&layer.name) { + tracing::debug!("Layer {} is already on disk", layer.name); + + if cfg!(debug_assertions) { + // Debug for https://github.com/neondatabase/neon/issues/6966: check that the files we think + // are already present on disk are really there. + match tokio::fs::metadata(&on_disk.local_path).await { + Ok(meta) => { + tracing::debug!( + "Layer {} present at {}, size {}", + layer.name, + on_disk.local_path, + meta.len(), + ); + } + Err(e) => { + tracing::warn!( + "Layer {} not found at {} ({})", + layer.name, + on_disk.local_path, + e + ); + debug_assert!(false); + } + } + } + + if on_disk.metadata.generation_file_size() != on_disk.metadata.generation_file_size() { + tracing::info!( + "Re-downloading layer {} with changed size or generation: {:?}->{:?}", + layer.name, + on_disk.metadata.generation_file_size(), + on_disk.metadata.generation_file_size() + ); + return LayerAction::Download; + } + if on_disk.metadata != layer.metadata || on_disk.access_time != layer.access_time { + // We already have this layer on disk. Update its access time. + tracing::debug!( + "Access time updated for layer {}: {} -> {}", + layer.name, + strftime(&on_disk.access_time), + strftime(&layer.access_time) + ); + return LayerAction::Touch; + } + return LayerAction::NoAction; + } else { + tracing::debug!("Layer {} not present on disk yet", layer.name); + } + + // Eviction: if we evicted a layer, then do not re-download it unless it was accessed more + // recently than it was evicted. 
+ if let Some(evicted_at) = timeline_state.evicted_at.get(&layer.name) { + if &layer.access_time > evicted_at { + tracing::info!( + "Re-downloading evicted layer {}, accessed at {}, evicted at {}", + layer.name, + strftime(&layer.access_time), + strftime(evicted_at) + ); + } else { + tracing::trace!( + "Not re-downloading evicted layer {}, accessed at {}, evicted at {}", + layer.name, + strftime(&layer.access_time), + strftime(evicted_at) + ); + return LayerAction::Skip; + } + } + LayerAction::Download + } + async fn download_timeline( &self, timeline: HeatMapTimeline, From 47677ba578619cce5aa961840489515fbbfbbe3a Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Tue, 28 Jan 2025 15:51:30 +0100 Subject: [PATCH 24/72] pageserver: disable L0 backpressure by default (#10535) ## Problem We'll need further improvements to compaction before enabling L0 flush backpressure by default. See: https://neondb.slack.com/archives/C033RQ5SPDH/p1738066068960519?thread_ts=1737818888.474179&cid=C033RQ5SPDH. Touches #5415. ## Summary of changes Disable `l0_flush_delay_threshold` by default. --- libs/pageserver_api/src/config.rs | 7 +++---- pageserver/src/tenant/timeline.rs | 20 +++++++------------- 2 files changed, 10 insertions(+), 17 deletions(-) diff --git a/libs/pageserver_api/src/config.rs b/libs/pageserver_api/src/config.rs index 5866145690..cc6e4a3699 100644 --- a/libs/pageserver_api/src/config.rs +++ b/libs/pageserver_api/src/config.rs @@ -260,11 +260,10 @@ pub struct TenantConfigToml { /// Level0 delta layer threshold at which to delay layer flushes for compaction backpressure, /// such that they take 2x as long, and start waiting for layer flushes during ephemeral layer /// rolls. This helps compaction keep up with WAL ingestion, and avoids read amplification - /// blowing up. Should be >compaction_threshold. If None, defaults to 2 * compaction_threshold. - /// 0 to disable. + /// blowing up. Should be >compaction_threshold. 0 to disable. Disabled by default. pub l0_flush_delay_threshold: Option, - /// Level0 delta layer threshold at which to stall layer flushes. 0 to disable. If None, - /// defaults to 4 * compaction_threshold. Must be >compaction_threshold to avoid deadlock. + /// Level0 delta layer threshold at which to stall layer flushes. Must be >compaction_threshold + /// to avoid deadlock. 0 to disable. Disabled by default. pub l0_flush_stall_threshold: Option, // Determines how much history is retained, to allow // branching and read replicas at an older point in time. diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 61981db24a..280a3baa21 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -2172,8 +2172,8 @@ impl Timeline { } fn get_l0_flush_delay_threshold(&self) -> Option { - // Default to delay L0 flushes at 3x compaction threshold. - const DEFAULT_L0_FLUSH_DELAY_FACTOR: usize = 3; + // Disable L0 flushes by default. This and compaction needs further tuning. + const DEFAULT_L0_FLUSH_DELAY_FACTOR: usize = 0; // TODO: default to e.g. 3 // If compaction is disabled, don't delay. if self.get_compaction_period() == Duration::ZERO { @@ -2201,10 +2201,9 @@ impl Timeline { } fn get_l0_flush_stall_threshold(&self) -> Option { - // Default to stall L0 flushes at 5x compaction threshold. - // TODO: stalls are temporarily disabled by default, see below. - #[allow(unused)] - const DEFAULT_L0_FLUSH_STALL_FACTOR: usize = 5; + // Disable L0 stalls by default. 
In ingest benchmarks, we see image compaction take >10 + // minutes, blocking L0 compaction, and we can't stall L0 flushes for that long. + const DEFAULT_L0_FLUSH_STALL_FACTOR: usize = 0; // TODO: default to e.g. 5 // If compaction is disabled, don't stall. if self.get_compaction_period() == Duration::ZERO { @@ -2236,13 +2235,8 @@ impl Timeline { return None; } - // Disable stalls by default. In ingest benchmarks, we see image compaction take >10 - // minutes, blocking L0 compaction, and we can't stall L0 flushes for that long. - // - // TODO: fix this. - // let l0_flush_stall_threshold = l0_flush_stall_threshold - // .unwrap_or(DEFAULT_L0_FLUSH_STALL_FACTOR * compaction_threshold); - let l0_flush_stall_threshold = l0_flush_stall_threshold?; + let l0_flush_stall_threshold = l0_flush_stall_threshold + .unwrap_or(DEFAULT_L0_FLUSH_STALL_FACTOR * compaction_threshold); // 0 disables backpressure. if l0_flush_stall_threshold == 0 { From 15fecb847476bcdbd0044a85b1a0a7e1d7d50d30 Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Tue, 28 Jan 2025 09:32:59 -0600 Subject: [PATCH 25/72] Update axum to 0.8.1 (#10332) Only a few things that needed updating: - async_trait was removed - Message::Text takes a Utf8Bytes object instead of a String Signed-off-by: Tristan Partin Co-authored-by: Conrad Ludgate --- Cargo.lock | 125 +++++++++++--------- Cargo.toml | 2 +- compute_tools/src/http/extract/json.rs | 6 +- compute_tools/src/http/extract/path.rs | 6 +- compute_tools/src/http/extract/query.rs | 6 +- compute_tools/src/http/server.rs | 2 +- deny.toml | 2 +- libs/vm_monitor/src/dispatcher.rs | 12 +- pageserver/compaction/src/simulator/draw.rs | 13 +- 9 files changed, 87 insertions(+), 87 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index a201a6abae..3c33901247 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -179,7 +179,7 @@ dependencies = [ "nom", "num-traits", "rusticata-macros", - "thiserror", + "thiserror 1.0.69", "time", ] @@ -718,14 +718,14 @@ dependencies = [ [[package]] name = "axum" -version = "0.7.9" +version = "0.8.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "edca88bc138befd0323b20752846e6587272d3b03b0343c8ea28a6f819e6e71f" +checksum = "6d6fd624c75e18b3b4c6b9caf42b1afe24437daaee904069137d8bab077be8b8" dependencies = [ - "async-trait", "axum-core", "base64 0.22.1", "bytes", + "form_urlencoded", "futures-util", "http 1.1.0", "http-body 1.0.0", @@ -733,7 +733,7 @@ dependencies = [ "hyper 1.4.1", "hyper-util", "itoa", - "matchit 0.7.0", + "matchit", "memchr", "mime", "percent-encoding", @@ -746,7 +746,7 @@ dependencies = [ "sha1", "sync_wrapper 1.0.1", "tokio", - "tokio-tungstenite 0.24.0", + "tokio-tungstenite 0.26.1", "tower 0.5.2", "tower-layer", "tower-service", @@ -755,11 +755,10 @@ dependencies = [ [[package]] name = "axum-core" -version = "0.4.5" +version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "09f2bd6146b97ae3359fa0cc6d6b376d9539582c7b4220f041a33ec24c226199" +checksum = "df1362f362fd16024ae199c1970ce98f9661bf5ef94b9808fee734bc3698b733" dependencies = [ - "async-trait", "bytes", "futures-util", "http 1.1.0", @@ -1130,7 +1129,7 @@ dependencies = [ "log", "nix 0.25.1", "regex", - "thiserror", + "thiserror 1.0.69", ] [[package]] @@ -1311,7 +1310,7 @@ dependencies = [ "serde_with", "signal-hook", "tar", - "thiserror", + "thiserror 1.0.69", "tokio", "tokio-postgres 0.7.7", "tokio-stream", @@ -1420,7 +1419,7 @@ dependencies = [ "serde", "serde_json", "storage_broker", - "thiserror", + "thiserror 1.0.69", "tokio", 
"tokio-postgres 0.7.7", "tokio-util", @@ -2264,7 +2263,7 @@ dependencies = [ "pin-project", "rand 0.8.5", "sha1", - "thiserror", + "thiserror 1.0.69", "tokio", "tokio-util", ] @@ -3390,12 +3389,6 @@ dependencies = [ "regex-automata 0.1.10", ] -[[package]] -name = "matchit" -version = "0.7.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b87248edafb776e59e6ee64a79086f65890d3510f2c656c000bf2a7e8a0aea40" - [[package]] name = "matchit" version = "0.8.4" @@ -3786,7 +3779,7 @@ dependencies = [ "serde_json", "serde_path_to_error", "sha2", - "thiserror", + "thiserror 1.0.69", "url", ] @@ -3836,7 +3829,7 @@ dependencies = [ "futures-sink", "js-sys", "pin-project-lite", - "thiserror", + "thiserror 1.0.69", "tracing", ] @@ -3868,7 +3861,7 @@ dependencies = [ "opentelemetry_sdk", "prost", "reqwest", - "thiserror", + "thiserror 1.0.69", ] [[package]] @@ -3904,7 +3897,7 @@ dependencies = [ "percent-encoding", "rand 0.8.5", "serde_json", - "thiserror", + "thiserror 1.0.69", "tokio", "tokio-stream", "tracing", @@ -4018,7 +4011,7 @@ dependencies = [ "remote_storage", "serde_json", "svg_fmt", - "thiserror", + "thiserror 1.0.69", "tokio", "tokio-util", "utils", @@ -4094,7 +4087,7 @@ dependencies = [ "strum_macros", "sysinfo", "tenant_size_model", - "thiserror", + "thiserror 1.0.69", "tikv-jemallocator", "tokio", "tokio-epoll-uring", @@ -4140,7 +4133,7 @@ dependencies = [ "storage_broker", "strum", "strum_macros", - "thiserror", + "thiserror 1.0.69", "utils", ] @@ -4155,7 +4148,7 @@ dependencies = [ "postgres", "reqwest", "serde", - "thiserror", + "thiserror 1.0.69", "tokio", "tokio-postgres 0.7.7", "tokio-stream", @@ -4559,7 +4552,7 @@ dependencies = [ "rustls 0.23.18", "rustls-pemfile 2.1.1", "serde", - "thiserror", + "thiserror 1.0.69", "tokio", "tokio-postgres 0.7.7", "tokio-postgres-rustls", @@ -4597,7 +4590,7 @@ dependencies = [ "pprof", "regex", "serde", - "thiserror", + "thiserror 1.0.69", "tracing", "utils", ] @@ -4608,7 +4601,7 @@ version = "0.1.0" dependencies = [ "anyhow", "camino", - "thiserror", + "thiserror 1.0.69", "tokio", "workspace_hack", ] @@ -4641,7 +4634,7 @@ dependencies = [ "smallvec", "symbolic-demangle", "tempfile", - "thiserror", + "thiserror 1.0.69", ] [[package]] @@ -4673,7 +4666,7 @@ dependencies = [ "postgres-protocol 0.6.4", "rand 0.8.5", "serde", - "thiserror", + "thiserror 1.0.69", "tokio", ] @@ -4744,7 +4737,7 @@ dependencies = [ "memchr", "parking_lot 0.12.1", "procfs", - "thiserror", + "thiserror 1.0.69", ] [[package]] @@ -4914,7 +4907,7 @@ dependencies = [ "strum", "strum_macros", "subtle", - "thiserror", + "thiserror 1.0.69", "tikv-jemalloc-ctl", "tikv-jemallocator", "tokio", @@ -5311,7 +5304,7 @@ dependencies = [ "http 1.1.0", "reqwest", "serde", - "thiserror", + "thiserror 1.0.69", "tower-service", ] @@ -5331,7 +5324,7 @@ dependencies = [ "reqwest", "reqwest-middleware", "retry-policies", - "thiserror", + "thiserror 1.0.69", "tokio", "tracing", "wasm-timer", @@ -5347,7 +5340,7 @@ dependencies = [ "async-trait", "getrandom 0.2.11", "http 1.1.0", - "matchit 0.8.4", + "matchit", "opentelemetry", "reqwest", "reqwest-middleware", @@ -5726,7 +5719,7 @@ dependencies = [ "storage_broker", "strum", "strum_macros", - "thiserror", + "thiserror 1.0.69", "tikv-jemallocator", "tokio", "tokio-io-timeout", @@ -5765,7 +5758,7 @@ dependencies = [ "reqwest", "safekeeper_api", "serde", - "thiserror", + "thiserror 1.0.69", "utils", "workspace_hack", ] @@ -5974,7 +5967,7 @@ dependencies = [ "rand 0.8.5", "serde", "serde_json", - "thiserror", + "thiserror 
1.0.69", "time", "url", "uuid", @@ -6046,7 +6039,7 @@ checksum = "c7715380eec75f029a4ef7de39a9200e0a63823176b759d055b613f5a87df6a6" dependencies = [ "percent-encoding", "serde", - "thiserror", + "thiserror 1.0.69", ] [[package]] @@ -6208,7 +6201,7 @@ checksum = "adc4e5204eb1910f40f9cfa375f6f05b68c3abac4b6fd879c8ff5e7ae8a0a085" dependencies = [ "num-bigint", "num-traits", - "thiserror", + "thiserror 1.0.69", "time", ] @@ -6353,7 +6346,7 @@ dependencies = [ "serde_json", "strum", "strum_macros", - "thiserror", + "thiserror 1.0.69", "tokio", "tokio-util", "tracing", @@ -6645,7 +6638,16 @@ version = "1.0.69" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b6aaf5339b578ea85b50e080feb250a3e8ae8cfcdff9a461c9ec2904bc923f52" dependencies = [ - "thiserror-impl", + "thiserror-impl 1.0.69", +] + +[[package]] +name = "thiserror" +version = "2.0.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d452f284b73e6d76dd36758a0c8684b1d5be31f92b89d07fd5822175732206fc" +dependencies = [ + "thiserror-impl 2.0.11", ] [[package]] @@ -6659,6 +6661,17 @@ dependencies = [ "syn 2.0.90", ] +[[package]] +name = "thiserror-impl" +version = "2.0.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "26afc1baea8a989337eeb52b6e72a039780ce45c3edfcc9c5b9d112feeb173c2" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.90", +] + [[package]] name = "thread_local" version = "1.1.7" @@ -6815,7 +6828,7 @@ dependencies = [ "nix 0.26.4", "once_cell", "scopeguard", - "thiserror", + "thiserror 1.0.69", "tokio", "tokio-util", "tracing", @@ -6998,14 +7011,14 @@ dependencies = [ [[package]] name = "tokio-tungstenite" -version = "0.24.0" +version = "0.26.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "edc5f74e248dc973e0dbb7b74c7e0d6fcc301c694ff50049504004ef4d0cdcd9" +checksum = "be4bf6fecd69fcdede0ec680aaf474cdab988f9de6bc73d3758f0160e3b7025a" dependencies = [ "futures-util", "log", "tokio", - "tungstenite 0.24.0", + "tungstenite 0.26.1", ] [[package]] @@ -7315,16 +7328,16 @@ dependencies = [ "log", "rand 0.8.5", "sha1", - "thiserror", + "thiserror 1.0.69", "url", "utf-8", ] [[package]] name = "tungstenite" -version = "0.24.0" +version = "0.26.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "18e5b8366ee7a95b16d32197d0b2604b43a0be89dc5fac9f8e96ccafbaedda8a" +checksum = "413083a99c579593656008130e29255e54dcaae495be556cc26888f211648c24" dependencies = [ "byteorder", "bytes", @@ -7334,7 +7347,7 @@ dependencies = [ "log", "rand 0.8.5", "sha1", - "thiserror", + "thiserror 2.0.11", "utf-8", ] @@ -7529,7 +7542,7 @@ dependencies = [ "signal-hook", "strum", "strum_macros", - "thiserror", + "thiserror 1.0.69", "tokio", "tokio-stream", "tokio-tar", @@ -7629,7 +7642,7 @@ dependencies = [ "remote_storage", "serde", "serde_json", - "thiserror", + "thiserror 1.0.69", "tikv-jemallocator", "tokio", "tokio-util", @@ -8158,7 +8171,7 @@ dependencies = [ "ring", "signature 2.2.0", "spki 0.7.3", - "thiserror", + "thiserror 1.0.69", "zeroize", ] @@ -8175,7 +8188,7 @@ dependencies = [ "nom", "oid-registry", "rusticata-macros", - "thiserror", + "thiserror 1.0.69", "time", ] diff --git a/Cargo.toml b/Cargo.toml index 6e1e288895..9ccdb45f6d 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -65,7 +65,7 @@ aws-smithy-types = "1.2" aws-credential-types = "1.2.0" aws-sigv4 = { version = "1.2", features = ["sign-http"] } aws-types = "1.3" -axum = { version = "0.7.9", features = ["ws"] } +axum = { version = "0.8.1", 
features = ["ws"] } base64 = "0.13.0" bincode = "1.3" bindgen = "0.70" diff --git a/compute_tools/src/http/extract/json.rs b/compute_tools/src/http/extract/json.rs index 41f13625ad..104cc25d5f 100644 --- a/compute_tools/src/http/extract/json.rs +++ b/compute_tools/src/http/extract/json.rs @@ -1,9 +1,6 @@ use std::ops::{Deref, DerefMut}; -use axum::{ - async_trait, - extract::{rejection::JsonRejection, FromRequest, Request}, -}; +use axum::extract::{rejection::JsonRejection, FromRequest, Request}; use compute_api::responses::GenericAPIError; use http::StatusCode; @@ -12,7 +9,6 @@ use http::StatusCode; #[derive(Debug, Clone, Copy, Default)] pub(crate) struct Json(pub T); -#[async_trait] impl FromRequest for Json where axum::Json: FromRequest, diff --git a/compute_tools/src/http/extract/path.rs b/compute_tools/src/http/extract/path.rs index 95edc657f2..09637a96a4 100644 --- a/compute_tools/src/http/extract/path.rs +++ b/compute_tools/src/http/extract/path.rs @@ -1,9 +1,6 @@ use std::ops::{Deref, DerefMut}; -use axum::{ - async_trait, - extract::{rejection::PathRejection, FromRequestParts}, -}; +use axum::extract::{rejection::PathRejection, FromRequestParts}; use compute_api::responses::GenericAPIError; use http::{request::Parts, StatusCode}; @@ -12,7 +9,6 @@ use http::{request::Parts, StatusCode}; #[derive(Debug, Clone, Copy, Default)] pub(crate) struct Path(pub T); -#[async_trait] impl FromRequestParts for Path where axum::extract::Path: FromRequestParts, diff --git a/compute_tools/src/http/extract/query.rs b/compute_tools/src/http/extract/query.rs index a1f1b0cef0..9dec3642cf 100644 --- a/compute_tools/src/http/extract/query.rs +++ b/compute_tools/src/http/extract/query.rs @@ -1,9 +1,6 @@ use std::ops::{Deref, DerefMut}; -use axum::{ - async_trait, - extract::{rejection::QueryRejection, FromRequestParts}, -}; +use axum::extract::{rejection::QueryRejection, FromRequestParts}; use compute_api::responses::GenericAPIError; use http::{request::Parts, StatusCode}; @@ -12,7 +9,6 @@ use http::{request::Parts, StatusCode}; #[derive(Debug, Clone, Copy, Default)] pub(crate) struct Query(pub T); -#[async_trait] impl FromRequestParts for Query where axum::extract::Query: FromRequestParts, diff --git a/compute_tools/src/http/server.rs b/compute_tools/src/http/server.rs index 40fb1f4b4d..da650585fc 100644 --- a/compute_tools/src/http/server.rs +++ b/compute_tools/src/http/server.rs @@ -55,7 +55,7 @@ async fn serve(port: u16, compute: Arc) { .route("/database_schema", get(database_schema::get_schema_dump)) .route("/dbs_and_roles", get(dbs_and_roles::get_catalog_objects)) .route( - "/extension_server/*filename", + "/extension_server/{*filename}", post(extension_server::download_extension), ) .route("/extensions", post(extensions::install_extension)) diff --git a/deny.toml b/deny.toml index ff8d71cda5..df00a34c60 100644 --- a/deny.toml +++ b/deny.toml @@ -41,8 +41,8 @@ allow = [ "MIT", "MPL-2.0", "OpenSSL", - "Unicode-DFS-2016", "Unicode-3.0", + "Zlib", ] confidence-threshold = 0.8 exceptions = [ diff --git a/libs/vm_monitor/src/dispatcher.rs b/libs/vm_monitor/src/dispatcher.rs index 6a965ace9b..c81848cb70 100644 --- a/libs/vm_monitor/src/dispatcher.rs +++ b/libs/vm_monitor/src/dispatcher.rs @@ -7,7 +7,7 @@ //! (notifying it of upscale). 
use anyhow::{bail, Context}; -use axum::extract::ws::{Message, WebSocket}; +use axum::extract::ws::{Message, Utf8Bytes, WebSocket}; use futures::{ stream::{SplitSink, SplitStream}, SinkExt, StreamExt, @@ -82,21 +82,21 @@ impl Dispatcher { let highest_shared_version = match monitor_range.highest_shared_version(&agent_range) { Ok(version) => { - sink.send(Message::Text( + sink.send(Message::Text(Utf8Bytes::from( serde_json::to_string(&ProtocolResponse::Version(version)).unwrap(), - )) + ))) .await .context("failed to notify agent of negotiated protocol version")?; version } Err(e) => { - sink.send(Message::Text( + sink.send(Message::Text(Utf8Bytes::from( serde_json::to_string(&ProtocolResponse::Error(format!( "Received protocol version range {} which does not overlap with {}", agent_range, monitor_range ))) .unwrap(), - )) + ))) .await .context("failed to notify agent of no overlap between protocol version ranges")?; Err(e).context("error determining suitable protocol version range")? @@ -126,7 +126,7 @@ impl Dispatcher { let json = serde_json::to_string(&message).context("failed to serialize message")?; self.sink - .send(Message::Text(json)) + .send(Message::Text(Utf8Bytes::from(json))) .await .context("stream error sending message") } diff --git a/pageserver/compaction/src/simulator/draw.rs b/pageserver/compaction/src/simulator/draw.rs index 997925067f..4559db09f1 100644 --- a/pageserver/compaction/src/simulator/draw.rs +++ b/pageserver/compaction/src/simulator/draw.rs @@ -160,9 +160,12 @@ pub fn draw_history(history: &[LayerTraceEvent], mut output: // Fill in and thicken rectangle if it's an // image layer so that we can see it. - let mut style = Style::default(); - style.fill = Fill::Color(rgb(0x80, 0x80, 0x80)); - style.stroke = Stroke::Color(rgb(0, 0, 0), 0.5); + let mut style = Style { + fill: Fill::Color(rgb(0x80, 0x80, 0x80)), + stroke: Stroke::Color(rgb(0, 0, 0), 0.5), + opacity: 1.0, + stroke_opacity: 1.0, + }; let y_start = lsn_max - lsn_start; let y_end = lsn_max - lsn_end; @@ -214,10 +217,6 @@ pub fn draw_history(history: &[LayerTraceEvent], mut output: files_seen.insert(f); } - let mut record_style = Style::default(); - record_style.fill = Fill::Color(rgb(0x80, 0x80, 0x80)); - record_style.stroke = Stroke::None; - writeln!(svg, "{}", EndSvg)?; let mut layer_events_str = String::new(); From ae4b2af299d555ca66ecc8cb79602163ae86d48b Mon Sep 17 00:00:00 2001 From: Folke Behrens Date: Tue, 28 Jan 2025 18:08:17 +0100 Subject: [PATCH 26/72] fix(proxy): Use correct identifier for usage metrics upload (#10538) ## Problem The request data and usage metrics S3 requests use the same identifier shown in logs, causing confusion about what type of upload failed. ## Summary of changes Use the correct identifier for usage metrics uploads. 
neondatabase/cloud#23084 --- proxy/src/context/parquet.rs | 4 ++-- proxy/src/usage_metrics.rs | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/proxy/src/context/parquet.rs b/proxy/src/context/parquet.rs index d7ffff0483..4f1dd39d92 100644 --- a/proxy/src/context/parquet.rs +++ b/proxy/src/context/parquet.rs @@ -423,11 +423,11 @@ async fn upload_parquet( .await .ok_or_else(|| anyhow::Error::new(TimeoutOrCancel::Cancel)) .and_then(|x| x) - .context("request_data_upload") + .with_context(|| format!("request_data_upload: path={path}")) .err(); if let Some(err) = maybe_err { - tracing::error!(%id, error = ?err, "failed to upload request data"); + tracing::error!(%id, %path, error = ?err, "failed to upload request data"); } Ok(buffer.writer()) diff --git a/proxy/src/usage_metrics.rs b/proxy/src/usage_metrics.rs index 487504d709..e1cc7e87b4 100644 --- a/proxy/src/usage_metrics.rs +++ b/proxy/src/usage_metrics.rs @@ -396,13 +396,13 @@ async fn upload_backup_events( TimeoutOrCancel::caused_by_cancel, FAILED_UPLOAD_WARN_THRESHOLD, FAILED_UPLOAD_MAX_RETRIES, - "request_data_upload", + "usage_metrics_upload", cancel, ) .await .ok_or_else(|| anyhow::Error::new(TimeoutOrCancel::Cancel)) .and_then(|x| x) - .context("request_data_upload")?; + .with_context(|| format!("usage_metrics_upload: path={remote_path}"))?; Ok(()) } From 1010b8add432da557928d4708f21ed002893cfb7 Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Tue, 28 Jan 2025 18:21:05 +0100 Subject: [PATCH 27/72] pageserver: add `l0_flush_wait_upload` setting (#10534) ## Problem We need a setting to disable the flush upload wait, to test L0 flush backpressure in staging. ## Summary of changes Add `l0_flush_wait_upload` setting. --- control_plane/src/pageserver.rs | 5 +++ libs/pageserver_api/src/config.rs | 7 ++++ libs/pageserver_api/src/models.rs | 6 +++ pageserver/src/tenant.rs | 1 + pageserver/src/tenant/config.rs | 11 +++++ pageserver/src/tenant/timeline.rs | 41 ++++++++++++------- .../regress/test_attach_tenant_config.py | 1 + 7 files changed, 57 insertions(+), 15 deletions(-) diff --git a/control_plane/src/pageserver.rs b/control_plane/src/pageserver.rs index 967810ee06..52527ffa90 100644 --- a/control_plane/src/pageserver.rs +++ b/control_plane/src/pageserver.rs @@ -357,6 +357,11 @@ impl PageServerNode { .map(|x| x.parse::()) .transpose() .context("Failed to parse 'l0_flush_delay_threshold' as an integer")?, + l0_flush_wait_upload: settings + .remove("l0_flush_wait_upload") + .map(|x| x.parse::()) + .transpose() + .context("Failed to parse 'l0_flush_wait_upload' as a boolean")?, l0_flush_stall_threshold: settings .remove("l0_flush_stall_threshold") .map(|x| x.parse::()) diff --git a/libs/pageserver_api/src/config.rs b/libs/pageserver_api/src/config.rs index cc6e4a3699..40c8837af5 100644 --- a/libs/pageserver_api/src/config.rs +++ b/libs/pageserver_api/src/config.rs @@ -265,6 +265,10 @@ pub struct TenantConfigToml { /// Level0 delta layer threshold at which to stall layer flushes. Must be >compaction_threshold /// to avoid deadlock. 0 to disable. Disabled by default. pub l0_flush_stall_threshold: Option, + /// If true, Level0 delta layer flushes will wait for S3 upload before flushing the next + /// layer. This is a temporary backpressure mechanism which should be removed once + /// l0_flush_{delay,stall}_threshold is fully enabled. + pub l0_flush_wait_upload: bool, // Determines how much history is retained, to allow // branching and read replicas at an older point in time. // The unit is #of bytes of WAL. 
@@ -522,6 +526,8 @@ pub mod tenant_conf_defaults { pub const DEFAULT_COMPACTION_ALGORITHM: crate::models::CompactionAlgorithm = crate::models::CompactionAlgorithm::Legacy; + pub const DEFAULT_L0_FLUSH_WAIT_UPLOAD: bool = true; + pub const DEFAULT_GC_HORIZON: u64 = 64 * 1024 * 1024; // Large DEFAULT_GC_PERIOD is fine as long as PITR_INTERVAL is larger. @@ -562,6 +568,7 @@ impl Default for TenantConfigToml { }, l0_flush_delay_threshold: None, l0_flush_stall_threshold: None, + l0_flush_wait_upload: DEFAULT_L0_FLUSH_WAIT_UPLOAD, gc_horizon: DEFAULT_GC_HORIZON, gc_period: humantime::parse_duration(DEFAULT_GC_PERIOD) .expect("cannot parse default gc period"), diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index 16473415b4..dcc233c7c4 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -466,6 +466,8 @@ pub struct TenantConfigPatch { #[serde(skip_serializing_if = "FieldPatch::is_noop")] pub l0_flush_stall_threshold: FieldPatch, #[serde(skip_serializing_if = "FieldPatch::is_noop")] + pub l0_flush_wait_upload: FieldPatch, + #[serde(skip_serializing_if = "FieldPatch::is_noop")] pub gc_horizon: FieldPatch, #[serde(skip_serializing_if = "FieldPatch::is_noop")] pub gc_period: FieldPatch, @@ -524,6 +526,7 @@ pub struct TenantConfig { pub compaction_algorithm: Option, pub l0_flush_delay_threshold: Option, pub l0_flush_stall_threshold: Option, + pub l0_flush_wait_upload: Option, pub gc_horizon: Option, pub gc_period: Option, pub image_creation_threshold: Option, @@ -559,6 +562,7 @@ impl TenantConfig { mut compaction_algorithm, mut l0_flush_delay_threshold, mut l0_flush_stall_threshold, + mut l0_flush_wait_upload, mut gc_horizon, mut gc_period, mut image_creation_threshold, @@ -597,6 +601,7 @@ impl TenantConfig { patch .l0_flush_stall_threshold .apply(&mut l0_flush_stall_threshold); + patch.l0_flush_wait_upload.apply(&mut l0_flush_wait_upload); patch.gc_horizon.apply(&mut gc_horizon); patch.gc_period.apply(&mut gc_period); patch @@ -651,6 +656,7 @@ impl TenantConfig { compaction_algorithm, l0_flush_delay_threshold, l0_flush_stall_threshold, + l0_flush_wait_upload, gc_horizon, gc_period, image_creation_threshold, diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index d359270be4..4385fe9a9b 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -5467,6 +5467,7 @@ pub(crate) mod harness { compaction_algorithm: Some(tenant_conf.compaction_algorithm), l0_flush_delay_threshold: tenant_conf.l0_flush_delay_threshold, l0_flush_stall_threshold: tenant_conf.l0_flush_stall_threshold, + l0_flush_wait_upload: Some(tenant_conf.l0_flush_wait_upload), gc_horizon: Some(tenant_conf.gc_horizon), gc_period: Some(tenant_conf.gc_period), image_creation_threshold: Some(tenant_conf.image_creation_threshold), diff --git a/pageserver/src/tenant/config.rs b/pageserver/src/tenant/config.rs index c870ca97b8..50da998c30 100644 --- a/pageserver/src/tenant/config.rs +++ b/pageserver/src/tenant/config.rs @@ -289,6 +289,10 @@ pub struct TenantConfOpt { #[serde(default)] pub l0_flush_stall_threshold: Option, + #[serde(skip_serializing_if = "Option::is_none")] + #[serde(default)] + pub l0_flush_wait_upload: Option, + #[serde(skip_serializing_if = "Option::is_none")] #[serde(default)] pub gc_horizon: Option, @@ -408,6 +412,9 @@ impl TenantConfOpt { l0_flush_stall_threshold: self .l0_flush_stall_threshold .or(global_conf.l0_flush_stall_threshold), + l0_flush_wait_upload: self + .l0_flush_wait_upload + 
.unwrap_or(global_conf.l0_flush_wait_upload), gc_horizon: self.gc_horizon.unwrap_or(global_conf.gc_horizon), gc_period: self.gc_period.unwrap_or(global_conf.gc_period), image_creation_threshold: self @@ -474,6 +481,7 @@ impl TenantConfOpt { mut compaction_algorithm, mut l0_flush_delay_threshold, mut l0_flush_stall_threshold, + mut l0_flush_wait_upload, mut gc_horizon, mut gc_period, mut image_creation_threshold, @@ -518,6 +526,7 @@ impl TenantConfOpt { patch .l0_flush_stall_threshold .apply(&mut l0_flush_stall_threshold); + patch.l0_flush_wait_upload.apply(&mut l0_flush_wait_upload); patch.gc_horizon.apply(&mut gc_horizon); patch .gc_period @@ -590,6 +599,7 @@ impl TenantConfOpt { compaction_algorithm, l0_flush_delay_threshold, l0_flush_stall_threshold, + l0_flush_wait_upload, gc_horizon, gc_period, image_creation_threshold, @@ -649,6 +659,7 @@ impl From for models::TenantConfig { compaction_threshold: value.compaction_threshold, l0_flush_delay_threshold: value.l0_flush_delay_threshold, l0_flush_stall_threshold: value.l0_flush_stall_threshold, + l0_flush_wait_upload: value.l0_flush_wait_upload, gc_horizon: value.gc_horizon, gc_period: value.gc_period.map(humantime), image_creation_threshold: value.image_creation_threshold, diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 280a3baa21..de990a9fe4 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -2250,6 +2250,14 @@ impl Timeline { Some(max(l0_flush_stall_threshold, compaction_threshold)) } + fn get_l0_flush_wait_upload(&self) -> bool { + let tenant_conf = self.tenant_conf.load(); + tenant_conf + .tenant_conf + .l0_flush_wait_upload + .unwrap_or(self.conf.default_tenant_conf.l0_flush_wait_upload) + } + fn get_image_creation_threshold(&self) -> usize { let tenant_conf = self.tenant_conf.load(); tenant_conf @@ -4034,21 +4042,24 @@ impl Timeline { // Backpressure mechanism: wait with continuation of the flush loop until we have uploaded all layer files. // This makes us refuse ingest until the new layers have been persisted to the remote - let start = Instant::now(); - self.remote_client - .wait_completion() - .await - .map_err(|e| match e { - WaitCompletionError::UploadQueueShutDownOrStopped - | WaitCompletionError::NotInitialized( - NotInitialized::ShuttingDown | NotInitialized::Stopped, - ) => FlushLayerError::Cancelled, - WaitCompletionError::NotInitialized(NotInitialized::Uninitialized) => { - FlushLayerError::Other(anyhow!(e).into()) - } - })?; - let duration = start.elapsed().as_secs_f64(); - self.metrics.flush_wait_upload_time_gauge_add(duration); + // TODO: remove this, and rely on l0_flush_{delay,stall}_threshold instead. + if self.get_l0_flush_wait_upload() { + let start = Instant::now(); + self.remote_client + .wait_completion() + .await + .map_err(|e| match e { + WaitCompletionError::UploadQueueShutDownOrStopped + | WaitCompletionError::NotInitialized( + NotInitialized::ShuttingDown | NotInitialized::Stopped, + ) => FlushLayerError::Cancelled, + WaitCompletionError::NotInitialized(NotInitialized::Uninitialized) => { + FlushLayerError::Other(anyhow!(e).into()) + } + })?; + let duration = start.elapsed().as_secs_f64(); + self.metrics.flush_wait_upload_time_gauge_add(duration); + } // FIXME: between create_delta_layer and the scheduling of the upload in `update_metadata_file`, // a compaction can delete the file and then it won't be available for uploads any more. 
diff --git a/test_runner/regress/test_attach_tenant_config.py b/test_runner/regress/test_attach_tenant_config.py index 1fdba223ad..8b92e4c442 100644 --- a/test_runner/regress/test_attach_tenant_config.py +++ b/test_runner/regress/test_attach_tenant_config.py @@ -141,6 +141,7 @@ def test_fully_custom_config(positive_env: NeonEnv): "compaction_threshold": 13, "l0_flush_delay_threshold": 25, "l0_flush_stall_threshold": 42, + "l0_flush_wait_upload": True, "compaction_target_size": 1048576, "checkpoint_distance": 10000, "checkpoint_timeout": "13m", From c54cd9e76ab3ace8acd9ec945c7d439ac872756d Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Tue, 28 Jan 2025 17:33:07 +0000 Subject: [PATCH 28/72] storcon: signal LSN wait to pageserver during live migration (#10452) ## Problem We've seen the ingest connection manager get stuck shortly after a migration. ## Summary of changes A speculative mitigation is to use the same mechanism as get page requests for kicking LSN ingest. The connection manager monitors LSN waits and queries the broker if no updates are received for the timeline. Closes https://github.com/neondatabase/neon/issues/10351 --- libs/pageserver_api/src/models.rs | 7 ++ pageserver/client/src/mgmt_api.rs | 15 +++++ pageserver/src/http/routes.rs | 64 +++++++++++++++++++ pageserver/src/page_service.rs | 2 + pageserver/src/tenant.rs | 7 +- pageserver/src/tenant/mgr.rs | 1 + pageserver/src/tenant/timeline.rs | 28 ++++++-- .../walreceiver/connection_manager.rs | 2 +- storage_controller/src/pageserver_client.rs | 18 +++++- storage_controller/src/reconciler.rs | 61 +++++++++++++++++- 10 files changed, 193 insertions(+), 12 deletions(-) diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index dcc233c7c4..16f89ae13b 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -1021,6 +1021,13 @@ pub struct TenantConfigPatchRequest { pub config: TenantConfigPatch, // as we have a flattened field, we should reject all unknown fields in it } +#[derive(Serialize, Deserialize, Debug)] +pub struct TenantWaitLsnRequest { + #[serde(flatten)] + pub timelines: HashMap, + pub timeout: Duration, +} + /// See [`TenantState::attachment_status`] and the OpenAPI docs for context. 
#[derive(Serialize, Deserialize, Clone)] #[serde(tag = "slug", content = "data", rename_all = "snake_case")] diff --git a/pageserver/client/src/mgmt_api.rs b/pageserver/client/src/mgmt_api.rs index 4e9b11879d..0359bfcd0b 100644 --- a/pageserver/client/src/mgmt_api.rs +++ b/pageserver/client/src/mgmt_api.rs @@ -763,4 +763,19 @@ impl Client { .await .map_err(Error::ReceiveBody) } + + pub async fn wait_lsn( + &self, + tenant_shard_id: TenantShardId, + request: TenantWaitLsnRequest, + ) -> Result { + let uri = format!( + "{}/v1/tenant/{tenant_shard_id}/wait_lsn", + self.mgmt_api_endpoint, + ); + + self.request_noerror(Method::POST, uri, request) + .await + .map(|resp| resp.status()) + } } diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 5452719bcd..0f3e9fdab6 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -10,6 +10,7 @@ use std::time::Duration; use anyhow::{anyhow, Context, Result}; use enumset::EnumSet; +use futures::future::join_all; use futures::StreamExt; use futures::TryFutureExt; use humantime::format_rfc3339; @@ -40,6 +41,7 @@ use pageserver_api::models::TenantShardSplitRequest; use pageserver_api::models::TenantShardSplitResponse; use pageserver_api::models::TenantSorting; use pageserver_api::models::TenantState; +use pageserver_api::models::TenantWaitLsnRequest; use pageserver_api::models::TimelineArchivalConfigRequest; use pageserver_api::models::TimelineCreateRequestMode; use pageserver_api::models::TimelineCreateRequestModeImportPgdata; @@ -95,6 +97,8 @@ use crate::tenant::timeline::CompactOptions; use crate::tenant::timeline::CompactRequest; use crate::tenant::timeline::CompactionError; use crate::tenant::timeline::Timeline; +use crate::tenant::timeline::WaitLsnTimeout; +use crate::tenant::timeline::WaitLsnWaiter; use crate::tenant::GetTimelineError; use crate::tenant::OffloadedTimeline; use crate::tenant::{LogicalSizeCalculationCause, PageReconstructError}; @@ -2790,6 +2794,63 @@ async fn secondary_download_handler( json_response(status, progress) } +async fn wait_lsn_handler( + mut request: Request, + cancel: CancellationToken, +) -> Result, ApiError> { + let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; + let wait_lsn_request: TenantWaitLsnRequest = json_request(&mut request).await?; + + let state = get_state(&request); + let tenant = state + .tenant_manager + .get_attached_tenant_shard(tenant_shard_id)?; + + let mut wait_futures = Vec::default(); + for timeline in tenant.list_timelines() { + let Some(lsn) = wait_lsn_request.timelines.get(&timeline.timeline_id) else { + continue; + }; + + let fut = { + let timeline = timeline.clone(); + let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Error); + async move { + timeline + .wait_lsn( + *lsn, + WaitLsnWaiter::HttpEndpoint, + WaitLsnTimeout::Custom(wait_lsn_request.timeout), + &ctx, + ) + .await + } + }; + wait_futures.push(fut); + } + + if wait_futures.is_empty() { + return json_response(StatusCode::NOT_FOUND, ()); + } + + let all_done = tokio::select! 
{ + results = join_all(wait_futures) => { + results.iter().all(|res| res.is_ok()) + }, + _ = cancel.cancelled() => { + return Err(ApiError::Cancelled); + } + }; + + let status = if all_done { + StatusCode::OK + } else { + StatusCode::ACCEPTED + }; + + json_response(status, ()) +} + async fn secondary_status_handler( request: Request, _cancel: CancellationToken, @@ -3577,6 +3638,9 @@ pub fn make_router( .post("/v1/tenant/:tenant_shard_id/secondary/download", |r| { api_handler(r, secondary_download_handler) }) + .post("/v1/tenant/:tenant_shard_id/wait_lsn", |r| { + api_handler(r, wait_lsn_handler) + }) .put("/v1/tenant/:tenant_shard_id/break", |r| { testing_api_handler("set tenant state to broken", r, handle_tenant_break) }) diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index e5063b7fc2..e103338c7c 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -1708,6 +1708,7 @@ impl PageServerHandler { .wait_lsn( not_modified_since, crate::tenant::timeline::WaitLsnWaiter::PageService, + timeline::WaitLsnTimeout::Default, ctx, ) .await?; @@ -2044,6 +2045,7 @@ impl PageServerHandler { .wait_lsn( lsn, crate::tenant::timeline::WaitLsnWaiter::PageService, + crate::tenant::timeline::WaitLsnTimeout::Default, ctx, ) .await?; diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 4385fe9a9b..4361fa3d66 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -2560,7 +2560,12 @@ impl Tenant { // sizes etc. and that would get confused if the previous page versions // are not in the repository yet. ancestor_timeline - .wait_lsn(*lsn, timeline::WaitLsnWaiter::Tenant, ctx) + .wait_lsn( + *lsn, + timeline::WaitLsnWaiter::Tenant, + timeline::WaitLsnTimeout::Default, + ctx, + ) .await .map_err(|e| match e { e @ (WaitLsnError::Timeout(_) | WaitLsnError::BadState { .. }) => { diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index e8b0d1d4dd..dfa89a765c 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -1643,6 +1643,7 @@ impl TenantManager { .wait_lsn( *target_lsn, crate::tenant::timeline::WaitLsnWaiter::Tenant, + crate::tenant::timeline::WaitLsnTimeout::Default, ctx, ) .await diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index de990a9fe4..076220df51 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -901,10 +901,17 @@ impl From for PageReconstructError { } } +pub(crate) enum WaitLsnTimeout { + Custom(Duration), + // Use the [`PageServerConf::wait_lsn_timeout`] default + Default, +} + pub(crate) enum WaitLsnWaiter<'a> { Timeline(&'a Timeline), Tenant, PageService, + HttpEndpoint, } /// Argument to [`Timeline::shutdown`]. 
@@ -1301,6 +1308,7 @@ impl Timeline { &self, lsn: Lsn, who_is_waiting: WaitLsnWaiter<'_>, + timeout: WaitLsnTimeout, ctx: &RequestContext, /* Prepare for use by cancellation */ ) -> Result<(), WaitLsnError> { let state = self.current_state(); @@ -1317,7 +1325,7 @@ impl Timeline { | TaskKind::WalReceiverConnectionPoller => { let is_myself = match who_is_waiting { WaitLsnWaiter::Timeline(waiter) => Weak::ptr_eq(&waiter.myself, &self.myself), - WaitLsnWaiter::Tenant | WaitLsnWaiter::PageService => unreachable!("tenant or page_service context are not expected to have task kind {:?}", ctx.task_kind()), + WaitLsnWaiter::Tenant | WaitLsnWaiter::PageService | WaitLsnWaiter::HttpEndpoint => unreachable!("tenant or page_service context are not expected to have task kind {:?}", ctx.task_kind()), }; if is_myself { if let Err(current) = self.last_record_lsn.would_wait_for(lsn) { @@ -1333,13 +1341,14 @@ impl Timeline { } } + let timeout = match timeout { + WaitLsnTimeout::Custom(t) => t, + WaitLsnTimeout::Default => self.conf.wait_lsn_timeout, + }; + let _timer = crate::metrics::WAIT_LSN_TIME.start_timer(); - match self - .last_record_lsn - .wait_for_timeout(lsn, self.conf.wait_lsn_timeout) - .await - { + match self.last_record_lsn.wait_for_timeout(lsn, timeout).await { Ok(()) => Ok(()), Err(e) => { use utils::seqwait::SeqWaitError::*; @@ -3590,7 +3599,12 @@ impl Timeline { } } ancestor - .wait_lsn(self.ancestor_lsn, WaitLsnWaiter::Timeline(self), ctx) + .wait_lsn( + self.ancestor_lsn, + WaitLsnWaiter::Timeline(self), + WaitLsnTimeout::Default, + ctx, + ) .await .map_err(|e| match e { e @ WaitLsnError::Timeout(_) => GetReadyAncestorError::AncestorLsnTimeout(e), diff --git a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs index d8ea32cd45..65f9d39078 100644 --- a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs +++ b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs @@ -274,7 +274,7 @@ pub(super) async fn connection_manager_loop_step( }; last_discovery_ts = Some(std::time::Instant::now()); - debug!("No active connection and no candidates, sending discovery request to the broker"); + info!("No active connection and no candidates, sending discovery request to the broker"); // Cancellation safety: we want to send a message to the broker, but publish_one() // function can get cancelled by the other select! arm. 
This is absolutely fine, because diff --git a/storage_controller/src/pageserver_client.rs b/storage_controller/src/pageserver_client.rs index b19cbc4fa3..141ff6f720 100644 --- a/storage_controller/src/pageserver_client.rs +++ b/storage_controller/src/pageserver_client.rs @@ -2,8 +2,9 @@ use pageserver_api::{ models::{ detach_ancestor::AncestorDetached, LocationConfig, LocationConfigListResponse, PageserverUtilization, SecondaryProgress, TenantScanRemoteStorageResponse, - TenantShardSplitRequest, TenantShardSplitResponse, TimelineArchivalConfigRequest, - TimelineCreateRequest, TimelineInfo, TopTenantShardsRequest, TopTenantShardsResponse, + TenantShardSplitRequest, TenantShardSplitResponse, TenantWaitLsnRequest, + TimelineArchivalConfigRequest, TimelineCreateRequest, TimelineInfo, TopTenantShardsRequest, + TopTenantShardsResponse, }, shard::TenantShardId, }; @@ -299,4 +300,17 @@ impl PageserverClient { self.inner.top_tenant_shards(request).await ) } + + pub(crate) async fn wait_lsn( + &self, + tenant_shard_id: TenantShardId, + request: TenantWaitLsnRequest, + ) -> Result { + measured_request!( + "wait_lsn", + crate::metrics::Method::Post, + &self.node_id_label, + self.inner.wait_lsn(tenant_shard_id, request).await + ) + } } diff --git a/storage_controller/src/reconciler.rs b/storage_controller/src/reconciler.rs index adced3b77d..03db947263 100644 --- a/storage_controller/src/reconciler.rs +++ b/storage_controller/src/reconciler.rs @@ -3,7 +3,7 @@ use crate::persistence::Persistence; use crate::{compute_hook, service}; use pageserver_api::controller_api::{AvailabilityZone, PlacementPolicy}; use pageserver_api::models::{ - LocationConfig, LocationConfigMode, LocationConfigSecondary, TenantConfig, + LocationConfig, LocationConfigMode, LocationConfigSecondary, TenantConfig, TenantWaitLsnRequest, }; use pageserver_api::shard::{ShardIdentity, TenantShardId}; use pageserver_client::mgmt_api; @@ -348,6 +348,32 @@ impl Reconciler { Ok(()) } + async fn wait_lsn( + &self, + node: &Node, + tenant_shard_id: TenantShardId, + timelines: HashMap, + ) -> Result { + const TIMEOUT: Duration = Duration::from_secs(10); + + let client = PageserverClient::new( + node.get_id(), + node.base_url(), + self.service_config.jwt_token.as_deref(), + ); + + client + .wait_lsn( + tenant_shard_id, + TenantWaitLsnRequest { + timelines, + timeout: TIMEOUT, + }, + ) + .await + .map_err(|e| e.into()) + } + async fn get_lsns( &self, tenant_shard_id: TenantShardId, @@ -461,6 +487,39 @@ impl Reconciler { node: &Node, baseline: HashMap, ) -> anyhow::Result<()> { + // Signal to the pageserver that it should ingest up to the baseline LSNs. + loop { + match self.wait_lsn(node, tenant_shard_id, baseline.clone()).await { + Ok(StatusCode::OK) => { + // Everything is caught up + return Ok(()); + } + Ok(StatusCode::ACCEPTED) => { + // Some timelines are not caught up yet. + // They'll be polled below. + break; + } + Ok(StatusCode::NOT_FOUND) => { + // None of the timelines are present on the pageserver. + // This is correct if they've all been deleted, but + // let let the polling loop below cross check. 
+ break; + } + Ok(status_code) => { + tracing::warn!( + "Unexpected status code ({status_code}) returned by wait_lsn endpoint" + ); + break; + } + Err(e) => { + tracing::info!("🕑 Can't trigger LSN wait on {node} yet, waiting ({e})",); + tokio::time::sleep(Duration::from_millis(500)).await; + continue; + } + } + } + + // Poll the LSNs until they catch up loop { let latest = match self.get_lsns(tenant_shard_id, node).await { Ok(l) => l, From f5fdaa6dc6d68df1979c9766a4300a364a20bb4a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?JC=20Gr=C3=BCnhage?= Date: Tue, 28 Jan 2025 20:13:39 +0100 Subject: [PATCH 29/72] feat(ci): generate basic release notes with links (#10511) ## Problem https://github.com/neondatabase/neon/pull/10448 removed release notes, because if their generation failed, the whole release was failing. People liked them though, and wanted some basic release notes as a fall-back instead of completely removing them. ## Summary of changes Include basic release notes that link to the release PR and to a diff to the previous release. --- .github/workflows/build_and_test.yml | 41 ++++++++++++++++++++++++++-- 1 file changed, 39 insertions(+), 2 deletions(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 32b99d9c38..b23e3612d6 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -1050,6 +1050,7 @@ jobs: retries: 5 script: | const tag = "${{ needs.tag.outputs.build-tag }}"; + const branch = "${{ github.ref_name }}"; try { const existingRef = await github.rest.git.getRef({ @@ -1092,12 +1093,48 @@ jobs: } console.log(`Release for tag ${tag} does not exist. Creating it...`); + + // Find the PR number using the commit SHA + const pullRequests = await github.rest.pulls.list({ + owner: context.repo.owner, + repo: context.repo.repo, + state: 'closed', + base: branch, + }); + + const pr = pullRequests.data.find(pr => pr.merge_commit_sha === context.sha); + const prNumber = pr ? pr.number : null; + + // Find the previous release on the branch + const releases = await github.rest.repos.listReleases({ + owner: context.repo.owner, + repo: context.repo.repo, + per_page: 100, + }); + + const branchReleases = releases.data + .filter((release) => { + const regex = new RegExp(`^${branch}-\\d+$`); + return regex.test(release.tag_name) && !release.draft && !release.prerelease; + }) + .sort((a, b) => new Date(b.created_at) - new Date(a.created_at)); + + const previousTag = branchReleases.length > 0 ? branchReleases[0].tag_name : null; + + const releaseNotes = [ + prNumber + ? `Release PR https://github.com/${context.repo.owner}/${context.repo.repo}/pull/${prNumber}.` + : 'Release PR not found.', + previousTag + ? `Diff with the previous release https://github.com/${context.repo.owner}/${context.repo.repo}/compare/${previousTag}...${tag}.` + : `No previous release found on branch ${branch}.`, + ].join('\n\n'); + await github.rest.repos.createRelease({ owner: context.repo.owner, repo: context.repo.repo, tag_name: tag, - // TODO: Automate release notes properly - generate_release_notes: false, + body: releaseNotes, }); console.log(`Release for tag ${tag} created successfully.`); } From d04d924649069a5adaf6baad4ac62af9e50724b5 Mon Sep 17 00:00:00 2001 From: Alexey Kondratov Date: Tue, 28 Jan 2025 20:24:07 +0100 Subject: [PATCH 30/72] feat(compute): Add some basic compute_ctl metrics (#10504) ## Problem There are several parts of `compute_ctl` with a very low visibility of errors: 1. 
DB migrations that run async in the background after compute start. 2. Requests made to control plane (currently only `GetSpec`). 3. Requests made to the remote extensions server. ## Summary of changes Add new counters to quickly evaluate the amount of errors among the fleet. Part of neondatabase/cloud#17590 --- compute_tools/src/extension_server.rs | 65 +++++++++++--- compute_tools/src/http/routes/metrics.rs | 7 +- compute_tools/src/installed_extensions.rs | 18 +--- compute_tools/src/lib.rs | 1 + compute_tools/src/metrics.rs | 90 +++++++++++++++++++ compute_tools/src/migration.rs | 44 +++++---- compute_tools/src/spec.rs | 41 ++++++--- .../regress/test_compute_migrations.py | 12 +++ .../regress/test_download_extensions.py | 14 ++- 9 files changed, 230 insertions(+), 62 deletions(-) create mode 100644 compute_tools/src/metrics.rs diff --git a/compute_tools/src/extension_server.rs b/compute_tools/src/extension_server.rs index f13b2308e7..fa638c74b3 100644 --- a/compute_tools/src/extension_server.rs +++ b/compute_tools/src/extension_server.rs @@ -85,6 +85,8 @@ use tracing::info; use tracing::log::warn; use zstd::stream::read::Decoder; +use crate::metrics::{REMOTE_EXT_REQUESTS_FAILED, REMOTE_EXT_REQUESTS_TOTAL, UNKNOWN_HTTP_STATUS}; + fn get_pg_config(argument: &str, pgbin: &str) -> String { // gives the result of `pg_config [argument]` // where argument is a flag like `--version` or `--sharedir` @@ -258,21 +260,60 @@ async fn download_extension_tar(ext_remote_storage: &str, ext_path: &str) -> Res info!("Download extension {:?} from uri {:?}", ext_path, uri); - let resp = reqwest::get(uri).await?; + REMOTE_EXT_REQUESTS_TOTAL.with_label_values(&[]).inc(); - match resp.status() { + match do_extension_server_request(&uri).await { + Ok(resp) => { + info!( + "Successfully downloaded remote extension data {:?}", + ext_path + ); + Ok(resp) + } + Err((msg, status)) => { + let status_str = status + .map(|s| s.to_string()) + .unwrap_or(UNKNOWN_HTTP_STATUS.to_string()); + REMOTE_EXT_REQUESTS_FAILED + .with_label_values(&[&status_str]) + .inc(); + bail!(msg); + } + } +} + +// Do a single remote extensions server request. +// Return result or (error message + status code) in case of any failures. +async fn do_extension_server_request(uri: &str) -> Result)> { + let resp = reqwest::get(uri).await.map_err(|e| { + ( + format!("could not perform remote extensions server request: {}", e), + None, + ) + })?; + let status = resp.status(); + + match status { StatusCode::OK => match resp.bytes().await { - Ok(resp) => { - info!("Download extension {:?} completed successfully", ext_path); - Ok(resp) - } - Err(e) => bail!("could not deserialize remote extension response: {}", e), + Ok(resp) => Ok(resp), + Err(e) => Err(( + format!("could not read remote extensions server response: {}", e), + // It's fine to return and report error with status as 200 OK, + // because we still failed to read the response. 
+ Some(status), + )), }, - StatusCode::SERVICE_UNAVAILABLE => bail!("remote extension is temporarily unavailable"), - _ => bail!( - "unexpected remote extension response status code: {}", - resp.status() - ), + StatusCode::SERVICE_UNAVAILABLE => Err(( + "remote extensions server is temporarily unavailable".to_string(), + Some(status), + )), + _ => Err(( + format!( + "unexpected remote extensions server response status code: {}", + status + ), + Some(status), + )), } } diff --git a/compute_tools/src/http/routes/metrics.rs b/compute_tools/src/http/routes/metrics.rs index 40d71b5de7..13150a7588 100644 --- a/compute_tools/src/http/routes/metrics.rs +++ b/compute_tools/src/http/routes/metrics.rs @@ -2,17 +2,16 @@ use axum::{body::Body, response::Response}; use http::header::CONTENT_TYPE; use http::StatusCode; use metrics::proto::MetricFamily; -use metrics::Encoder; -use metrics::TextEncoder; +use metrics::{Encoder, TextEncoder}; -use crate::{http::JsonResponse, installed_extensions}; +use crate::{http::JsonResponse, metrics::collect}; /// Expose Prometheus metrics. pub(in crate::http) async fn get_metrics() -> Response { // When we call TextEncoder::encode() below, it will immediately return an // error if a metric family has no metrics, so we need to preemptively // filter out metric families with no metrics. - let metrics = installed_extensions::collect() + let metrics = collect() .into_iter() .filter(|m| !m.get_metric().is_empty()) .collect::>(); diff --git a/compute_tools/src/installed_extensions.rs b/compute_tools/src/installed_extensions.rs index 0ab259ddf1..173dbf40b0 100644 --- a/compute_tools/src/installed_extensions.rs +++ b/compute_tools/src/installed_extensions.rs @@ -1,13 +1,10 @@ use compute_api::responses::{InstalledExtension, InstalledExtensions}; -use metrics::proto::MetricFamily; use std::collections::HashMap; use anyhow::Result; use postgres::{Client, NoTls}; -use metrics::core::Collector; -use metrics::{register_uint_gauge_vec, UIntGaugeVec}; -use once_cell::sync::Lazy; +use crate::metrics::INSTALLED_EXTENSIONS; /// We don't reuse get_existing_dbs() just for code clarity /// and to make database listing query here more explicit. 
@@ -102,16 +99,3 @@ pub fn get_installed_extensions(mut conf: postgres::config::Config) -> Result = Lazy::new(|| { - register_uint_gauge_vec!( - "compute_installed_extensions", - "Number of databases where the version of extension is installed", - &["extension_name", "version", "owned_by_superuser"] - ) - .expect("failed to define a metric") -}); - -pub fn collect() -> Vec { - INSTALLED_EXTENSIONS.collect() -} diff --git a/compute_tools/src/lib.rs b/compute_tools/src/lib.rs index 12fea4e61a..b08df22134 100644 --- a/compute_tools/src/lib.rs +++ b/compute_tools/src/lib.rs @@ -16,6 +16,7 @@ pub mod extension_server; pub mod installed_extensions; pub mod local_proxy; pub mod lsn_lease; +pub mod metrics; mod migration; pub mod monitor; pub mod params; diff --git a/compute_tools/src/metrics.rs b/compute_tools/src/metrics.rs new file mode 100644 index 0000000000..3684338571 --- /dev/null +++ b/compute_tools/src/metrics.rs @@ -0,0 +1,90 @@ +use metrics::core::Collector; +use metrics::proto::MetricFamily; +use metrics::{register_int_counter_vec, register_uint_gauge_vec, IntCounterVec, UIntGaugeVec}; +use once_cell::sync::Lazy; + +pub(crate) static INSTALLED_EXTENSIONS: Lazy = Lazy::new(|| { + register_uint_gauge_vec!( + "compute_installed_extensions", + "Number of databases where the version of extension is installed", + &["extension_name", "version", "owned_by_superuser"] + ) + .expect("failed to define a metric") +}); + +// Normally, any HTTP API request is described by METHOD (e.g. GET, POST, etc.) + PATH, +// but for all our APIs we defined a 'slug'/method/operationId in the OpenAPI spec. +// And it's fair to call it a 'RPC' (Remote Procedure Call). +pub enum CPlaneRequestRPC { + GetSpec, +} + +impl CPlaneRequestRPC { + pub fn as_str(&self) -> &str { + match self { + CPlaneRequestRPC::GetSpec => "GetSpec", + } + } +} + +pub const UNKNOWN_HTTP_STATUS: &str = "unknown"; + +pub(crate) static CPLANE_REQUESTS_TOTAL: Lazy = Lazy::new(|| { + register_int_counter_vec!( + "compute_ctl_cplane_requests_total", + "Total number of control plane requests made by compute_ctl", + &["rpc"] + ) + .expect("failed to define a metric") +}); + +pub(crate) static CPLANE_REQUESTS_FAILED: Lazy = Lazy::new(|| { + register_int_counter_vec!( + "compute_ctl_cplane_requests_failed_total", + "Total number of failed control plane requests made by compute_ctl", + &["rpc", "http_status"] + ) + .expect("failed to define a metric") +}); + +/// Total number of failed database migrations. Per-compute, this is actually a boolean metric, +/// either empty or with a single value (1, migration_id) because we stop at the first failure. +/// Yet, the sum over the fleet will provide the total number of failures. +pub(crate) static DB_MIGRATION_FAILED: Lazy = Lazy::new(|| { + register_int_counter_vec!( + "compute_ctl_db_migration_failed_total", + "Total number of failed database migrations", + &["migration_id"] + ) + .expect("failed to define a metric") +}); + +pub(crate) static REMOTE_EXT_REQUESTS_TOTAL: Lazy = Lazy::new(|| { + register_int_counter_vec!( + "compute_ctl_remote_ext_requests_total", + "Total number of requests made by compute_ctl to download extensions from S3 proxy", + // Do not use any labels like extension name yet. + // We can add them later if needed. 
+ &[] + ) + .expect("failed to define a metric") +}); + +pub(crate) static REMOTE_EXT_REQUESTS_FAILED: Lazy = Lazy::new(|| { + register_int_counter_vec!( + "compute_ctl_remote_ext_requests_failed_total", + "Total number of failed requests to S3 proxy", + &["http_status"] + ) + .expect("failed to define a metric") +}); + +pub fn collect() -> Vec { + let mut metrics = INSTALLED_EXTENSIONS.collect(); + metrics.extend(CPLANE_REQUESTS_TOTAL.collect()); + metrics.extend(CPLANE_REQUESTS_FAILED.collect()); + metrics.extend(DB_MIGRATION_FAILED.collect()); + metrics.extend(REMOTE_EXT_REQUESTS_TOTAL.collect()); + metrics.extend(REMOTE_EXT_REQUESTS_FAILED.collect()); + metrics +} diff --git a/compute_tools/src/migration.rs b/compute_tools/src/migration.rs index 45c33172f7..aa3c6b01f0 100644 --- a/compute_tools/src/migration.rs +++ b/compute_tools/src/migration.rs @@ -1,7 +1,9 @@ use anyhow::{Context, Result}; use fail::fail_point; use postgres::{Client, Transaction}; -use tracing::info; +use tracing::{error, info}; + +use crate::metrics::DB_MIGRATION_FAILED; /// Runs a series of migrations on a target database pub(crate) struct MigrationRunner<'m> { @@ -78,24 +80,31 @@ impl<'m> MigrationRunner<'m> { Ok(()) } - /// Run an individual migration - fn run_migration(txn: &mut Transaction, migration_id: i64, migration: &str) -> Result<()> { + /// Run an individual migration in a separate transaction block. + fn run_migration(client: &mut Client, migration_id: i64, migration: &str) -> Result<()> { + let mut txn = client + .transaction() + .with_context(|| format!("begin transaction for migration {migration_id}"))?; + if migration.starts_with("-- SKIP") { info!("Skipping migration id={}", migration_id); // Even though we are skipping the migration, updating the // migration ID should help keep logic easy to understand when // trying to understand the state of a cluster. 
- Self::update_migration_id(txn, migration_id)?; + Self::update_migration_id(&mut txn, migration_id)?; } else { info!("Running migration id={}:\n{}\n", migration_id, migration); txn.simple_query(migration) .with_context(|| format!("apply migration {migration_id}"))?; - Self::update_migration_id(txn, migration_id)?; + Self::update_migration_id(&mut txn, migration_id)?; } + txn.commit() + .with_context(|| format!("commit transaction for migration {migration_id}"))?; + Ok(()) } @@ -109,19 +118,20 @@ impl<'m> MigrationRunner<'m> { // The index lags the migration ID by 1, so the current migration // ID is also the next index let migration_id = (current_migration + 1) as i64; + let migration = self.migrations[current_migration]; - let mut txn = self - .client - .transaction() - .with_context(|| format!("begin transaction for migration {migration_id}"))?; - - Self::run_migration(&mut txn, migration_id, self.migrations[current_migration]) - .with_context(|| format!("running migration {migration_id}"))?; - - txn.commit() - .with_context(|| format!("commit transaction for migration {migration_id}"))?; - - info!("Finished migration id={}", migration_id); + match Self::run_migration(self.client, migration_id, migration) { + Ok(_) => { + info!("Finished migration id={}", migration_id); + } + Err(e) => { + error!("Failed to run migration id={}: {}", migration_id, e); + DB_MIGRATION_FAILED + .with_label_values(&[migration_id.to_string().as_str()]) + .inc(); + return Err(e); + } + } current_migration += 1; } diff --git a/compute_tools/src/spec.rs b/compute_tools/src/spec.rs index c7d2deb090..01de13811f 100644 --- a/compute_tools/src/spec.rs +++ b/compute_tools/src/spec.rs @@ -6,6 +6,9 @@ use std::path::Path; use tracing::{error, info, instrument, warn}; use crate::config; +use crate::metrics::{ + CPlaneRequestRPC, CPLANE_REQUESTS_FAILED, CPLANE_REQUESTS_TOTAL, UNKNOWN_HTTP_STATUS, +}; use crate::migration::MigrationRunner; use crate::params::PG_HBA_ALL_MD5; use crate::pg_helpers::*; @@ -19,7 +22,7 @@ use compute_api::spec::ComputeSpec; fn do_control_plane_request( uri: &str, jwt: &str, -) -> Result { +) -> Result)> { let resp = reqwest::blocking::Client::new() .get(uri) .header("Authorization", format!("Bearer {}", jwt)) @@ -28,34 +31,41 @@ fn do_control_plane_request( ( true, format!("could not perform spec request to control plane: {}", e), + None, ) })?; - match resp.status() { + let status = resp.status(); + match status { StatusCode::OK => match resp.json::() { Ok(spec_resp) => Ok(spec_resp), Err(e) => Err(( true, format!("could not deserialize control plane response: {}", e), + Some(status), )), }, - StatusCode::SERVICE_UNAVAILABLE => { - Err((true, "control plane is temporarily unavailable".to_string())) - } + StatusCode::SERVICE_UNAVAILABLE => Err(( + true, + "control plane is temporarily unavailable".to_string(), + Some(status), + )), StatusCode::BAD_GATEWAY => { // We have a problem with intermittent 502 errors now // https://github.com/neondatabase/cloud/issues/2353 // It's fine to retry GET request in this case. - Err((true, "control plane request failed with 502".to_string())) + Err(( + true, + "control plane request failed with 502".to_string(), + Some(status), + )) } // Another code, likely 500 or 404, means that compute is unknown to the control plane // or some internal failure happened. Doesn't make much sense to retry in this case. 
_ => Err(( false, - format!( - "unexpected control plane response status code: {}", - resp.status() - ), + format!("unexpected control plane response status code: {}", status), + Some(status), )), } } @@ -82,6 +92,9 @@ pub fn get_spec_from_control_plane( // - no spec for compute yet (Empty state) -> return Ok(None) // - got spec -> return Ok(Some(spec)) while attempt < 4 { + CPLANE_REQUESTS_TOTAL + .with_label_values(&[CPlaneRequestRPC::GetSpec.as_str()]) + .inc(); spec = match do_control_plane_request(&cp_uri, &jwt) { Ok(spec_resp) => match spec_resp.status { ControlPlaneComputeStatus::Empty => Ok(None), @@ -93,7 +106,13 @@ pub fn get_spec_from_control_plane( } } }, - Err((retry, msg)) => { + Err((retry, msg, status)) => { + let status_str = status + .map(|s| s.to_string()) + .unwrap_or(UNKNOWN_HTTP_STATUS.to_string()); + CPLANE_REQUESTS_FAILED + .with_label_values(&[CPlaneRequestRPC::GetSpec.as_str(), &status_str]) + .inc(); if retry { Err(anyhow!(msg)) } else { diff --git a/test_runner/regress/test_compute_migrations.py b/test_runner/regress/test_compute_migrations.py index 803702a6f8..ec2e38f021 100644 --- a/test_runner/regress/test_compute_migrations.py +++ b/test_runner/regress/test_compute_migrations.py @@ -5,6 +5,7 @@ from typing import TYPE_CHECKING, cast import pytest from fixtures.compute_migrations import COMPUTE_MIGRATIONS, NUM_COMPUTE_MIGRATIONS +from fixtures.metrics import parse_metrics if TYPE_CHECKING: from fixtures.neon_fixtures import NeonEnv @@ -33,6 +34,17 @@ def test_compute_migrations_retry(neon_simple_env: NeonEnv, compute_migrations_d migration_id = cast("int", cur.fetchall()[0][0]) assert migration_id == i - 1 + # Check that migration failure is properly recorded in the metrics + client = endpoint.http_client() + raw_metrics = client.metrics() + metrics = parse_metrics(raw_metrics) + failed_migration = metrics.query_all( + "compute_ctl_db_migration_failed_total", + ) + assert len(failed_migration) == 1 + for sample in failed_migration: + assert sample.value == 1 + endpoint.stop() endpoint.start() diff --git a/test_runner/regress/test_download_extensions.py b/test_runner/regress/test_download_extensions.py index f18f4e78bd..d7e6e9de56 100644 --- a/test_runner/regress/test_download_extensions.py +++ b/test_runner/regress/test_download_extensions.py @@ -8,6 +8,7 @@ from typing import TYPE_CHECKING import pytest from fixtures.log_helper import log +from fixtures.metrics import parse_metrics from fixtures.neon_fixtures import ( NeonEnvBuilder, ) @@ -128,6 +129,17 @@ def test_remote_extensions( httpserver.check() + # Check that we properly recorded downloads in the metrics + client = endpoint.http_client() + raw_metrics = client.metrics() + metrics = parse_metrics(raw_metrics) + remote_ext_requests = metrics.query_all( + "compute_ctl_remote_ext_requests_total", + ) + assert len(remote_ext_requests) == 1 + for sample in remote_ext_requests: + assert sample.value == 1 + # TODO # 1. Test downloading remote library. @@ -137,7 +149,7 @@ def test_remote_extensions( # # 3.Test that extension is downloaded after endpoint restart, # when the library is used in the query. -# Run the test with mutliple simultaneous connections to an endpoint. +# Run the test with multiple simultaneous connections to an endpoint. # to ensure that the extension is downloaded only once. # # 4. Test that private extensions are only downloaded when they are present in the spec. 
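For a quick manual check of the counters added above, they are exported through `compute_ctl`'s existing Prometheus metrics route (the same output the regression tests read via `endpoint.http_client().metrics()`). A minimal sketch, assuming the compute's HTTP host/port and a `/metrics` path for your deployment — both are placeholders here, not taken from this patch:

```shell
# Hypothetical spot check; substitute the real compute_ctl HTTP host:port for your setup.
curl -sf "http://${COMPUTE_HOST}:${COMPUTE_HTTP_PORT}/metrics" | grep '^compute_ctl_'
```

Any non-zero `compute_ctl_*_failed_total` series is what the fleet-wide error evaluation described above is meant to surface.
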
From 68cf0ba4399a2b027a331b74142108b41d5fc0d0 Mon Sep 17 00:00:00 2001 From: Fedor Dikarev Date: Tue, 28 Jan 2025 22:26:38 +0100 Subject: [PATCH 31/72] run benchmark tests on small-metal runners (#10549) ## Problem Ref: https://github.com/neondatabase/cloud/issues/23314 We suspect some inconsistency in Benchmark tests runs could be due to different type of runners they are landed in. To have that aligned in both terms: failure rates and benchmark results, lets run them for now on `small-metal` servers and see the progress for the tests stability. ## Summary of changes --- .github/actionlint.yml | 1 + .github/workflows/build_and_test.yml | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/.github/actionlint.yml b/.github/actionlint.yml index aec5b4ee75..ecff0cc70b 100644 --- a/.github/actionlint.yml +++ b/.github/actionlint.yml @@ -4,6 +4,7 @@ self-hosted-runner: - large - large-arm64 - small + - small-metal - small-arm64 - us-east-2 config-variables: diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index b23e3612d6..99658187a8 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -242,7 +242,7 @@ jobs: statuses: write contents: write pull-requests: write - runs-on: [ self-hosted, small ] + runs-on: [ self-hosted, small-metal ] container: image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm credentials: @@ -1101,7 +1101,7 @@ jobs: state: 'closed', base: branch, }); - + const pr = pullRequests.data.find(pr => pr.merge_commit_sha === context.sha); const prNumber = pr ? pr.number : null; From b735df6ff01472f017ffed25ac7da3ca615829ad Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com> Date: Tue, 28 Jan 2025 16:29:51 -0500 Subject: [PATCH 32/72] fix(pageserver): make image layer generation atomic (#10516) ## Problem close https://github.com/neondatabase/neon/issues/8362 ## Summary of changes Use `BatchLayerWriter` to ensure we clean up image layers after failed compaction. 
Signed-off-by: Alex Chi Z --- pageserver/src/tenant/storage_layer.rs | 1 + .../storage_layer/batch_split_writer.rs | 17 +++ pageserver/src/tenant/timeline.rs | 105 ++++++++++-------- pageserver/src/tenant/timeline/compaction.rs | 7 +- 4 files changed, 79 insertions(+), 51 deletions(-) diff --git a/pageserver/src/tenant/storage_layer.rs b/pageserver/src/tenant/storage_layer.rs index c24d037dde..c1fe67c87c 100644 --- a/pageserver/src/tenant/storage_layer.rs +++ b/pageserver/src/tenant/storage_layer.rs @@ -33,6 +33,7 @@ use utils::sync::gate::GateGuard; use utils::lsn::Lsn; +pub use batch_split_writer::{BatchLayerWriter, SplitDeltaLayerWriter, SplitImageLayerWriter}; pub use delta_layer::{DeltaLayer, DeltaLayerWriter, ValueRef}; pub use image_layer::{ImageLayer, ImageLayerWriter}; pub use inmemory_layer::InMemoryLayer; diff --git a/pageserver/src/tenant/storage_layer/batch_split_writer.rs b/pageserver/src/tenant/storage_layer/batch_split_writer.rs index 8a397ceb7a..22d8b81bcc 100644 --- a/pageserver/src/tenant/storage_layer/batch_split_writer.rs +++ b/pageserver/src/tenant/storage_layer/batch_split_writer.rs @@ -87,6 +87,23 @@ impl BatchLayerWriter { )); } + pub(crate) async fn finish( + self, + tline: &Arc, + ctx: &RequestContext, + ) -> anyhow::Result> { + let res = self + .finish_with_discard_fn(tline, ctx, |_| async { false }) + .await?; + let mut output = Vec::new(); + for r in res { + if let BatchWriterResult::Produced(layer) = r { + output.push(layer); + } + } + Ok(output) + } + pub(crate) async fn finish_with_discard_fn( self, tline: &Arc, diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 076220df51..2033ebcdeb 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -70,6 +70,7 @@ use std::sync::{Arc, Mutex, OnceLock, RwLock, Weak}; use std::time::{Duration, Instant, SystemTime}; use crate::l0_flush::{self, L0FlushGlobalState}; +use crate::tenant::storage_layer::ImageLayerName; use crate::{ aux_file::AuxFileSizeEstimator, page_service::TenantManagerTypes, @@ -78,7 +79,7 @@ use crate::{ layer_map::{LayerMap, SearchResult}, metadata::TimelineMetadata, storage_layer::{ - inmemory_layer::IndexEntry, IoConcurrency, PersistentLayerDesc, + inmemory_layer::IndexEntry, BatchLayerWriter, IoConcurrency, PersistentLayerDesc, ValueReconstructSituation, }, }, @@ -933,7 +934,7 @@ pub(crate) enum ShutdownMode { } struct ImageLayerCreationOutcome { - image: Option, + unfinished_image_layer: Option, next_start_key: Key, } @@ -4405,11 +4406,15 @@ impl Timeline { if wrote_keys { // Normal path: we have written some data into the new image layer for this // partition, so flush it to disk. - let (desc, path) = image_layer_writer.finish(ctx).await?; - let image_layer = Layer::finish_creating(self.conf, self, desc, &path)?; - info!("created image layer for rel {}", image_layer.local_path()); + info!( + "produced image layer for rel {}", + ImageLayerName { + key_range: img_range.clone(), + lsn + }, + ); Ok(ImageLayerCreationOutcome { - image: Some(image_layer), + unfinished_image_layer: Some(image_layer_writer), next_start_key: img_range.end, }) } else { @@ -4419,7 +4424,7 @@ impl Timeline { // layer we write will cover the key range that we just scanned. 
tracing::debug!("no data in range {}-{}", img_range.start, img_range.end); Ok(ImageLayerCreationOutcome { - image: None, + unfinished_image_layer: None, next_start_key: start, }) } @@ -4468,7 +4473,7 @@ impl Timeline { if !trigger_generation && mode == ImageLayerCreationMode::Try { return Ok(ImageLayerCreationOutcome { - image: None, + unfinished_image_layer: None, next_start_key: img_range.end, }); } @@ -4494,14 +4499,15 @@ impl Timeline { if wrote_any_image { // Normal path: we have written some data into the new image layer for this // partition, so flush it to disk. - let (desc, path) = image_layer_writer.finish(ctx).await?; - let image_layer = Layer::finish_creating(self.conf, self, desc, &path)?; info!( "created image layer for metadata {}", - image_layer.local_path() + ImageLayerName { + key_range: img_range.clone(), + lsn + } ); Ok(ImageLayerCreationOutcome { - image: Some(image_layer), + unfinished_image_layer: Some(image_layer_writer), next_start_key: img_range.end, }) } else { @@ -4511,7 +4517,7 @@ impl Timeline { // layer we write will cover the key range that we just scanned. tracing::debug!("no data in range {}-{}", img_range.start, img_range.end); Ok(ImageLayerCreationOutcome { - image: None, + unfinished_image_layer: None, next_start_key: start, }) } @@ -4578,7 +4584,6 @@ impl Timeline { ctx: &RequestContext, ) -> Result, CreateImageLayersError> { let timer = self.metrics.create_images_time_histo.start_timer(); - let mut image_layers = Vec::new(); // We need to avoid holes between generated image layers. // Otherwise LayerMap::image_layer_exists will return false if key range of some layer is covered by more than one @@ -4593,6 +4598,8 @@ impl Timeline { let check_for_image_layers = self.should_check_if_image_layers_required(lsn); + let mut batch_image_writer = BatchLayerWriter::new(self.conf).await?; + for partition in partitioning.parts.iter() { if self.cancel.is_cancelled() { return Err(CreateImageLayersError::Cancelled); @@ -4665,45 +4672,45 @@ impl Timeline { .map_err(|_| CreateImageLayersError::Cancelled)?, ); - if !compact_metadata { - let ImageLayerCreationOutcome { - image, - next_start_key, - } = self - .create_image_layer_for_rel_blocks( - partition, - image_layer_writer, - lsn, - ctx, - img_range, - start, - io_concurrency, - ) - .await?; - - start = next_start_key; - image_layers.extend(image); + let ImageLayerCreationOutcome { + unfinished_image_layer, + next_start_key, + } = if !compact_metadata { + self.create_image_layer_for_rel_blocks( + partition, + image_layer_writer, + lsn, + ctx, + img_range.clone(), + start, + io_concurrency, + ) + .await? } else { - let ImageLayerCreationOutcome { - image, - next_start_key, - } = self - .create_image_layer_for_metadata_keys( - partition, - image_layer_writer, - lsn, - ctx, - img_range, - mode, - start, - io_concurrency, - ) - .await?; - start = next_start_key; - image_layers.extend(image); + self.create_image_layer_for_metadata_keys( + partition, + image_layer_writer, + lsn, + ctx, + img_range.clone(), + mode, + start, + io_concurrency, + ) + .await? 
+ }; + start = next_start_key; + if let Some(unfinished_image_layer) = unfinished_image_layer { + batch_image_writer.add_unfinished_image_writer( + unfinished_image_layer, + img_range, + lsn, + ); } } + let image_layers = batch_image_writer.finish(self, ctx).await?; + let mut guard = self.layers.write().await; // FIXME: we could add the images to be uploaded *before* returning from here, but right diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index 28c3381318..ad19738bc2 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -3197,7 +3197,7 @@ impl TimelineAdaptor { // TODO set proper (stateful) start. The create_image_layer_for_rel_blocks function mostly let start = Key::MIN; let ImageLayerCreationOutcome { - image, + unfinished_image_layer, next_start_key: _, } = self .timeline @@ -3212,7 +3212,10 @@ impl TimelineAdaptor { ) .await?; - if let Some(image_layer) = image { + if let Some(image_layer_writer) = unfinished_image_layer { + let (desc, path) = image_layer_writer.finish(ctx).await?; + let image_layer = + Layer::finish_creating(self.timeline.conf, &self.timeline, desc, &path)?; self.new_images.push(image_layer); } From 983e18e63ee60960ebf8f9b5a3833aab861e3792 Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com> Date: Tue, 28 Jan 2025 18:18:32 -0500 Subject: [PATCH 33/72] feat(pageserver): add compaction_upper_limit config (#10550) ## Problem Follow-up of the incident, we should not use the same bound on lower/upper limit of compaction files. This patch adds an upper bound limit, which is set to 50 for now. ## Summary of changes Add `compaction_upper_limit`. --------- Signed-off-by: Alex Chi Z Co-authored-by: Christian Schwarz --- control_plane/src/pageserver.rs | 5 +++++ libs/pageserver_api/src/config.rs | 12 ++++++++++++ libs/pageserver_api/src/models.rs | 8 ++++++++ pageserver/src/http/openapi_spec.yml | 2 ++ pageserver/src/tenant.rs | 8 ++++++++ pageserver/src/tenant/config.rs | 13 +++++++++++++ pageserver/src/tenant/timeline.rs | 8 ++++++++ pageserver/src/tenant/timeline/compaction.rs | 13 ++----------- test_runner/regress/test_attach_tenant_config.py | 1 + 9 files changed, 59 insertions(+), 11 deletions(-) diff --git a/control_plane/src/pageserver.rs b/control_plane/src/pageserver.rs index 52527ffa90..383c174684 100644 --- a/control_plane/src/pageserver.rs +++ b/control_plane/src/pageserver.rs @@ -347,6 +347,11 @@ impl PageServerNode { .map(|x| x.parse::()) .transpose() .context("Failed to parse 'compaction_threshold' as an integer")?, + compaction_upper_limit: settings + .remove("compaction_upper_limit") + .map(|x| x.parse::()) + .transpose() + .context("Failed to parse 'compaction_upper_limit' as an integer")?, compaction_algorithm: settings .remove("compaction_algorithm") .map(serde_json::from_str) diff --git a/libs/pageserver_api/src/config.rs b/libs/pageserver_api/src/config.rs index 40c8837af5..422da0dc95 100644 --- a/libs/pageserver_api/src/config.rs +++ b/libs/pageserver_api/src/config.rs @@ -256,6 +256,11 @@ pub struct TenantConfigToml { pub compaction_period: Duration, /// Level0 delta layer threshold for compaction. pub compaction_threshold: usize, + /// Controls the amount of L0 included in a single compaction iteration. + /// The unit is `checkpoint_distance`, i.e., a size. + /// We add L0s to the set of layers to compact until their cumulative + /// size exceeds `compaction_upper_limit * checkpoint_distance`. 
+ pub compaction_upper_limit: usize, pub compaction_algorithm: crate::models::CompactionAlgorithmSettings, /// Level0 delta layer threshold at which to delay layer flushes for compaction backpressure, /// such that they take 2x as long, and start waiting for layer flushes during ephemeral layer @@ -523,6 +528,12 @@ pub mod tenant_conf_defaults { pub const DEFAULT_COMPACTION_PERIOD: &str = "20 s"; pub const DEFAULT_COMPACTION_THRESHOLD: usize = 10; + + // This value needs to be tuned to avoid OOM. We have 3/4 of the total CPU threads to do background works, that's 16*3/4=9 on + // most of our pageservers. Compaction ~50 layers requires about 2GB memory (could be reduced later by optimizing L0 hole + // calculation to avoid loading all keys into the memory). So with this config, we can get a maximum peak compaction usage of 18GB. + pub const DEFAULT_COMPACTION_UPPER_LIMIT: usize = 50; + pub const DEFAULT_COMPACTION_ALGORITHM: crate::models::CompactionAlgorithm = crate::models::CompactionAlgorithm::Legacy; @@ -563,6 +574,7 @@ impl Default for TenantConfigToml { compaction_period: humantime::parse_duration(DEFAULT_COMPACTION_PERIOD) .expect("cannot parse default compaction period"), compaction_threshold: DEFAULT_COMPACTION_THRESHOLD, + compaction_upper_limit: DEFAULT_COMPACTION_UPPER_LIMIT, compaction_algorithm: crate::models::CompactionAlgorithmSettings { kind: DEFAULT_COMPACTION_ALGORITHM, }, diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index 16f89ae13b..43447c67bd 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -458,6 +458,8 @@ pub struct TenantConfigPatch { pub compaction_period: FieldPatch, #[serde(skip_serializing_if = "FieldPatch::is_noop")] pub compaction_threshold: FieldPatch, + #[serde(skip_serializing_if = "FieldPatch::is_noop")] + pub compaction_upper_limit: FieldPatch, // defer parsing compaction_algorithm, like eviction_policy #[serde(skip_serializing_if = "FieldPatch::is_noop")] pub compaction_algorithm: FieldPatch, @@ -522,6 +524,7 @@ pub struct TenantConfig { pub compaction_target_size: Option, pub compaction_period: Option, pub compaction_threshold: Option, + pub compaction_upper_limit: Option, // defer parsing compaction_algorithm, like eviction_policy pub compaction_algorithm: Option, pub l0_flush_delay_threshold: Option, @@ -559,6 +562,7 @@ impl TenantConfig { mut compaction_target_size, mut compaction_period, mut compaction_threshold, + mut compaction_upper_limit, mut compaction_algorithm, mut l0_flush_delay_threshold, mut l0_flush_stall_threshold, @@ -594,6 +598,9 @@ impl TenantConfig { .apply(&mut compaction_target_size); patch.compaction_period.apply(&mut compaction_period); patch.compaction_threshold.apply(&mut compaction_threshold); + patch + .compaction_upper_limit + .apply(&mut compaction_upper_limit); patch.compaction_algorithm.apply(&mut compaction_algorithm); patch .l0_flush_delay_threshold @@ -653,6 +660,7 @@ impl TenantConfig { compaction_target_size, compaction_period, compaction_threshold, + compaction_upper_limit, compaction_algorithm, l0_flush_delay_threshold, l0_flush_stall_threshold, diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml index ee43440534..4b976e7f6f 100644 --- a/pageserver/src/http/openapi_spec.yml +++ b/pageserver/src/http/openapi_spec.yml @@ -984,6 +984,8 @@ components: type: string compaction_threshold: type: string + compaction_upper_limit: + type: string image_creation_threshold: type: integer 
walreceiver_connect_timeout: diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 4361fa3d66..085f76c05d 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -3816,6 +3816,13 @@ impl Tenant { .unwrap_or(self.conf.default_tenant_conf.compaction_threshold) } + pub fn get_compaction_upper_limit(&self) -> usize { + let tenant_conf = self.tenant_conf.load().tenant_conf.clone(); + tenant_conf + .compaction_upper_limit + .unwrap_or(self.conf.default_tenant_conf.compaction_upper_limit) + } + pub fn get_gc_horizon(&self) -> u64 { let tenant_conf = self.tenant_conf.load().tenant_conf.clone(); tenant_conf @@ -5469,6 +5476,7 @@ pub(crate) mod harness { compaction_target_size: Some(tenant_conf.compaction_target_size), compaction_period: Some(tenant_conf.compaction_period), compaction_threshold: Some(tenant_conf.compaction_threshold), + compaction_upper_limit: Some(tenant_conf.compaction_upper_limit), compaction_algorithm: Some(tenant_conf.compaction_algorithm), l0_flush_delay_threshold: tenant_conf.l0_flush_delay_threshold, l0_flush_stall_threshold: tenant_conf.l0_flush_stall_threshold, diff --git a/pageserver/src/tenant/config.rs b/pageserver/src/tenant/config.rs index 50da998c30..139ed27bd2 100644 --- a/pageserver/src/tenant/config.rs +++ b/pageserver/src/tenant/config.rs @@ -277,6 +277,10 @@ pub struct TenantConfOpt { #[serde(default)] pub compaction_threshold: Option, + #[serde(skip_serializing_if = "Option::is_none")] + #[serde(default)] + pub compaction_upper_limit: Option, + #[serde(skip_serializing_if = "Option::is_none")] #[serde(default)] pub compaction_algorithm: Option, @@ -401,6 +405,9 @@ impl TenantConfOpt { compaction_threshold: self .compaction_threshold .unwrap_or(global_conf.compaction_threshold), + compaction_upper_limit: self + .compaction_upper_limit + .unwrap_or(global_conf.compaction_upper_limit), compaction_algorithm: self .compaction_algorithm .as_ref() @@ -478,6 +485,7 @@ impl TenantConfOpt { mut compaction_target_size, mut compaction_period, mut compaction_threshold, + mut compaction_upper_limit, mut compaction_algorithm, mut l0_flush_delay_threshold, mut l0_flush_stall_threshold, @@ -519,6 +527,9 @@ impl TenantConfOpt { .map(|v| humantime::parse_duration(&v))? 
.apply(&mut compaction_period); patch.compaction_threshold.apply(&mut compaction_threshold); + patch + .compaction_upper_limit + .apply(&mut compaction_upper_limit); patch.compaction_algorithm.apply(&mut compaction_algorithm); patch .l0_flush_delay_threshold @@ -596,6 +607,7 @@ impl TenantConfOpt { compaction_target_size, compaction_period, compaction_threshold, + compaction_upper_limit, compaction_algorithm, l0_flush_delay_threshold, l0_flush_stall_threshold, @@ -657,6 +669,7 @@ impl From for models::TenantConfig { compaction_target_size: value.compaction_target_size, compaction_period: value.compaction_period.map(humantime), compaction_threshold: value.compaction_threshold, + compaction_upper_limit: value.compaction_upper_limit, l0_flush_delay_threshold: value.l0_flush_delay_threshold, l0_flush_stall_threshold: value.l0_flush_stall_threshold, l0_flush_wait_upload: value.l0_flush_wait_upload, diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 2033ebcdeb..f3cdad82d9 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -2181,6 +2181,14 @@ impl Timeline { .unwrap_or(self.conf.default_tenant_conf.compaction_threshold) } + fn get_compaction_upper_limit(&self) -> usize { + let tenant_conf = self.tenant_conf.load(); + tenant_conf + .tenant_conf + .compaction_upper_limit + .unwrap_or(self.conf.default_tenant_conf.compaction_upper_limit) + } + fn get_l0_flush_delay_threshold(&self) -> Option { // Disable L0 flushes by default. This and compaction needs further tuning. const DEFAULT_L0_FLUSH_DELAY_FACTOR: usize = 0; // TODO: default to e.g. 3 diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index ad19738bc2..76dcc159ea 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -47,9 +47,7 @@ use crate::tenant::timeline::{ImageLayerCreationOutcome, IoConcurrency}; use crate::tenant::timeline::{Layer, ResidentLayer}; use crate::tenant::{gc_block, DeltaLayer, MaybeOffloaded}; use crate::virtual_file::{MaybeFatalIo, VirtualFile}; -use pageserver_api::config::tenant_conf_defaults::{ - DEFAULT_CHECKPOINT_DISTANCE, DEFAULT_COMPACTION_THRESHOLD, -}; +use pageserver_api::config::tenant_conf_defaults::DEFAULT_CHECKPOINT_DISTANCE; use pageserver_api::key::Key; use pageserver_api::keyspace::KeySpace; @@ -1117,14 +1115,7 @@ impl Timeline { // Under normal circumstances, we will accumulate up to compaction_interval L0s of size // checkpoint_distance each. To avoid edge cases using extra system resources, bound our // work in this function to only operate on this much delta data at once. - // - // Take the max of the configured value & the default, so that tests that configure tiny values - // can still use a sensible amount of memory, but if a deployed system configures bigger values we - // still let them compact a full stack of L0s in one go. 
- let delta_size_limit = std::cmp::max( - self.get_compaction_threshold(), - DEFAULT_COMPACTION_THRESHOLD, - ) as u64 + let delta_size_limit = self.get_compaction_upper_limit() as u64 * std::cmp::max(self.get_checkpoint_distance(), DEFAULT_CHECKPOINT_DISTANCE); let mut fully_compacted = true; diff --git a/test_runner/regress/test_attach_tenant_config.py b/test_runner/regress/test_attach_tenant_config.py index 8b92e4c442..e88d245c8f 100644 --- a/test_runner/regress/test_attach_tenant_config.py +++ b/test_runner/regress/test_attach_tenant_config.py @@ -139,6 +139,7 @@ def test_fully_custom_config(positive_env: NeonEnv): fully_custom_config = { "compaction_period": "1h", "compaction_threshold": 13, + "compaction_upper_limit": 100, "l0_flush_delay_threshold": 25, "l0_flush_stall_threshold": 42, "l0_flush_wait_upload": True, From 9ab13d6e2c6f2877a89bb6963923f1ef767c1fc3 Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Wed, 29 Jan 2025 12:16:00 +0300 Subject: [PATCH 34/72] Log statements in test_layer_map (#10554) ## Problem test_layer_map doesn't log statements and it is not clear how long they take. ## Summary of changes Do log them. ref https://github.com/neondatabase/neon/issues/10409 --- test_runner/performance/test_layer_map.py | 1 + 1 file changed, 1 insertion(+) diff --git a/test_runner/performance/test_layer_map.py b/test_runner/performance/test_layer_map.py index 9b159c5fcf..efc7fa59db 100644 --- a/test_runner/performance/test_layer_map.py +++ b/test_runner/performance/test_layer_map.py @@ -31,6 +31,7 @@ def test_layer_map(neon_env_builder: NeonEnvBuilder, zenbenchmark): endpoint = env.endpoints.create_start("main", tenant_id=tenant) cur = endpoint.connect().cursor() + cur.execute("set log_statement = 'all'") cur.execute("create table t(x integer)") for _ in range(n_iters): cur.execute(f"insert into t values (generate_series(1,{n_records}))") From 9f81828429ad6475b4fbb1a814240213b74bec63 Mon Sep 17 00:00:00 2001 From: a-masterov <72613290+a-masterov@users.noreply.github.com> Date: Wed, 29 Jan 2025 10:19:11 +0100 Subject: [PATCH 35/72] Test extension upgrade compatibility (#10244) ## Problem We have to test the extensions, shipped with Neon for compatibility before the upgrade. ## Summary of changes Added the test for compatibility with the upgraded extensions. 
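
The workflow change below drives the new `docker-compose/test_extensions_upgrade.sh` script with the freshly built compute tag (`NEWTAG`) and the most recent `release-compute-*` release tag (`OLDTAG`). A rough sketch of running it locally under the same assumptions (both compute images already pullable; the tag values are placeholders, not real releases):

```shell
# Hypothetical local invocation mirroring the CI step.
export NEWTAG=<freshly-built-build-tag>
export OLDTAG=<latest-release-compute-tag>   # newest release-compute-* GitHub release
./docker-compose/test_extensions_upgrade.sh
```
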
--- .github/workflows/build_and_test.yml | 25 +++++ .../compute_wrapper/shell/compute.sh | 67 ++++++++----- docker-compose/docker-compose.yml | 4 +- .../ext-src/hll-src/test-upgrade.sh | 5 + .../ext-src/hypopg-src/test-upgrade.patch | 27 ++++++ .../ext-src/hypopg-src/test-upgrade.sh | 6 ++ .../ext-src/ip4r-src/test-upgrade.patch | 23 +++++ .../ext-src/ip4r-src/test-upgrade.sh | 6 ++ .../ext-src/pg_cron-src/test-upgrade.patch | 75 +++++++++++++++ .../ext-src/pg_cron-src/test-upgrade.sh | 6 ++ .../ext-src/pg_ivm-src/test-upgrade.patch | 18 ++++ .../ext-src/pg_ivm-src/test-upgrade.sh | 6 ++ .../pg_roaringbitmap-src/test-upgrade.patch | 25 +++++ .../pg_roaringbitmap-src/test-upgrade.sh | 6 ++ .../ext-src/pg_semver-src/test-upgrade.patch | 24 +++++ .../ext-src/pg_semver-src/test-upgrade.sh | 6 ++ .../ext-src/pg_uuidv7-src/test-upgrade.sh | 5 + .../ext-src/pgvector-src/test-upgrade.sh | 5 + .../ext-src/plv8-src/test-upgrade.sh | 5 + .../postgresql-unit-src/test-upgrade.sh | 5 + .../ext-src/prefix-src/test-upgrade.sh | 5 + .../ext-src/rum-src/test-upgrade.patch | 19 ++++ .../ext-src/rum-src/test-upgrade.sh | 6 ++ docker-compose/test_extensions_upgrade.sh | 93 +++++++++++++++++++ 24 files changed, 450 insertions(+), 22 deletions(-) create mode 100755 docker-compose/ext-src/hll-src/test-upgrade.sh create mode 100644 docker-compose/ext-src/hypopg-src/test-upgrade.patch create mode 100755 docker-compose/ext-src/hypopg-src/test-upgrade.sh create mode 100644 docker-compose/ext-src/ip4r-src/test-upgrade.patch create mode 100755 docker-compose/ext-src/ip4r-src/test-upgrade.sh create mode 100644 docker-compose/ext-src/pg_cron-src/test-upgrade.patch create mode 100755 docker-compose/ext-src/pg_cron-src/test-upgrade.sh create mode 100644 docker-compose/ext-src/pg_ivm-src/test-upgrade.patch create mode 100755 docker-compose/ext-src/pg_ivm-src/test-upgrade.sh create mode 100644 docker-compose/ext-src/pg_roaringbitmap-src/test-upgrade.patch create mode 100755 docker-compose/ext-src/pg_roaringbitmap-src/test-upgrade.sh create mode 100644 docker-compose/ext-src/pg_semver-src/test-upgrade.patch create mode 100755 docker-compose/ext-src/pg_semver-src/test-upgrade.sh create mode 100755 docker-compose/ext-src/pg_uuidv7-src/test-upgrade.sh create mode 100755 docker-compose/ext-src/pgvector-src/test-upgrade.sh create mode 100755 docker-compose/ext-src/plv8-src/test-upgrade.sh create mode 100755 docker-compose/ext-src/postgresql-unit-src/test-upgrade.sh create mode 100755 docker-compose/ext-src/prefix-src/test-upgrade.sh create mode 100644 docker-compose/ext-src/rum-src/test-upgrade.patch create mode 100755 docker-compose/ext-src/rum-src/test-upgrade.sh create mode 100755 docker-compose/test_extensions_upgrade.sh diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 99658187a8..e588fc5a0e 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -786,6 +786,17 @@ jobs: username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} + - name: Get the last compute release tag + id: get-last-compute-release-tag + env: + GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }} + run: | + tag=$(gh api -q '[.[].tag_name | select(startswith("release-compute"))][0]'\ + -H "Accept: application/vnd.github+json" \ + -H "X-GitHub-Api-Version: 2022-11-28" \ + "/repos/${{ github.repository }}/releases") + echo tag=${tag} >> ${GITHUB_OUTPUT} + # `neondatabase/neon` contains multiple binaries, all of them use the same input for the version 
into the same version formatting library. # Pick pageserver as currently the only binary with extra "version" features printed in the string to verify. # Regular pageserver version string looks like @@ -817,6 +828,20 @@ jobs: TEST_VERSION_ONLY: ${{ matrix.pg_version }} run: ./docker-compose/docker_compose_test.sh + - name: Print logs and clean up docker-compose test + if: always() + run: | + docker compose --profile test-extensions -f ./docker-compose/docker-compose.yml logs || true + docker compose --profile test-extensions -f ./docker-compose/docker-compose.yml down + + - name: Test extension upgrade + timeout-minutes: 20 + if: ${{ needs.tag.outputs.build-tag == github.run_id }} + env: + NEWTAG: ${{ needs.tag.outputs.build-tag }} + OLDTAG: ${{ steps.get-last-compute-release-tag.outputs.tag }} + run: ./docker-compose/test_extensions_upgrade.sh + - name: Print logs and clean up if: always() run: | diff --git a/docker-compose/compute_wrapper/shell/compute.sh b/docker-compose/compute_wrapper/shell/compute.sh index 33455e458a..b4f8d3d66a 100755 --- a/docker-compose/compute_wrapper/shell/compute.sh +++ b/docker-compose/compute_wrapper/shell/compute.sh @@ -20,30 +20,55 @@ while ! nc -z pageserver 6400; do done echo "Page server is ready." -echo "Create a tenant and timeline" -generate_id tenant_id -PARAMS=( - -X PUT - -H "Content-Type: application/json" - -d "{\"mode\": \"AttachedSingle\", \"generation\": 1, \"tenant_conf\": {}}" - "http://pageserver:9898/v1/tenant/${tenant_id}/location_config" -) -result=$(curl "${PARAMS[@]}") -echo $result | jq . +cp ${SPEC_FILE_ORG} ${SPEC_FILE} -generate_id timeline_id -PARAMS=( - -sbf - -X POST - -H "Content-Type: application/json" - -d "{\"new_timeline_id\": \"${timeline_id}\", \"pg_version\": ${PG_VERSION}}" - "http://pageserver:9898/v1/tenant/${tenant_id}/timeline/" -) -result=$(curl "${PARAMS[@]}") -echo $result | jq . + if [ -n "${TENANT_ID:-}" ] && [ -n "${TIMELINE_ID:-}" ]; then + tenant_id=${TENANT_ID} + timeline_id=${TIMELINE_ID} +else + echo "Check if a tenant present" + PARAMS=( + -X GET + -H "Content-Type: application/json" + "http://pageserver:9898/v1/tenant" + ) + tenant_id=$(curl "${PARAMS[@]}" | jq -r .[0].id) + if [ -z "${tenant_id}" ] || [ "${tenant_id}" = null ]; then + echo "Create a tenant" + generate_id tenant_id + PARAMS=( + -X PUT + -H "Content-Type: application/json" + -d "{\"mode\": \"AttachedSingle\", \"generation\": 1, \"tenant_conf\": {}}" + "http://pageserver:9898/v1/tenant/${tenant_id}/location_config" + ) + result=$(curl "${PARAMS[@]}") + echo $result | jq . + fi + + echo "Check if a timeline present" + PARAMS=( + -X GET + -H "Content-Type: application/json" + "http://pageserver:9898/v1/tenant/${tenant_id}/timeline" + ) + timeline_id=$(curl "${PARAMS[@]}" | jq -r .[0].timeline_id) + if [ -z "${timeline_id}" ] || [ "${timeline_id}" = null ]; then + generate_id timeline_id + PARAMS=( + -sbf + -X POST + -H "Content-Type: application/json" + -d "{\"new_timeline_id\": \"${timeline_id}\", \"pg_version\": ${PG_VERSION}}" + "http://pageserver:9898/v1/tenant/${tenant_id}/timeline/" + ) + result=$(curl "${PARAMS[@]}") + echo $result | jq . 
+ fi +fi echo "Overwrite tenant id and timeline id in spec file" -sed "s/TENANT_ID/${tenant_id}/" ${SPEC_FILE_ORG} > ${SPEC_FILE} +sed -i "s/TENANT_ID/${tenant_id}/" ${SPEC_FILE} sed -i "s/TIMELINE_ID/${timeline_id}/" ${SPEC_FILE} cat ${SPEC_FILE} diff --git a/docker-compose/docker-compose.yml b/docker-compose/docker-compose.yml index 95e4b6fde7..489d60f38c 100644 --- a/docker-compose/docker-compose.yml +++ b/docker-compose/docker-compose.yml @@ -149,11 +149,13 @@ services: args: - REPOSITORY=${REPOSITORY:-neondatabase} - COMPUTE_IMAGE=compute-node-v${PG_VERSION:-16} - - TAG=${TAG:-latest} + - TAG=${COMPUTE_TAG:-${TAG:-latest}} - http_proxy=${http_proxy:-} - https_proxy=${https_proxy:-} environment: - PG_VERSION=${PG_VERSION:-16} + - TENANT_ID=${TENANT_ID:-} + - TIMELINE_ID=${TIMELINE_ID:-} #- RUST_BACKTRACE=1 # Mount the test files directly, for faster editing cycle. volumes: diff --git a/docker-compose/ext-src/hll-src/test-upgrade.sh b/docker-compose/ext-src/hll-src/test-upgrade.sh new file mode 100755 index 0000000000..f9e9aedcb2 --- /dev/null +++ b/docker-compose/ext-src/hll-src/test-upgrade.sh @@ -0,0 +1,5 @@ +#!/bin/sh +set -ex +cd "$(dirname ${0})" +PG_REGRESS=$(dirname "$(pg_config --pgxs)")/../test/regress/pg_regress +${PG_REGRESS} --use-existing --inputdir=./ --bindir='/usr/local/pgsql/bin' --dbname=contrib_regression add_agg agg_oob auto_sparse card_op cast_shape copy_binary cumulative_add_cardinality_correction cumulative_add_comprehensive_promotion cumulative_add_sparse_edge cumulative_add_sparse_random cumulative_add_sparse_step cumulative_union_comprehensive cumulative_union_explicit_explicit cumulative_union_explicit_promotion cumulative_union_probabilistic_probabilistic cumulative_union_sparse_full_representation cumulative_union_sparse_promotion cumulative_union_sparse_sparse disable_hashagg equal explicit_thresh hash hash_any meta_func murmur_bigint murmur_bytea nosparse notequal scalar_oob storedproc transaction typmod typmod_insert union_op \ No newline at end of file diff --git a/docker-compose/ext-src/hypopg-src/test-upgrade.patch b/docker-compose/ext-src/hypopg-src/test-upgrade.patch new file mode 100644 index 0000000000..71fe26b164 --- /dev/null +++ b/docker-compose/ext-src/hypopg-src/test-upgrade.patch @@ -0,0 +1,27 @@ +diff --git a/expected/hypopg.out b/expected/hypopg.out +index 90121d0..859260b 100644 +--- a/expected/hypopg.out ++++ b/expected/hypopg.out +@@ -11,7 +11,8 @@ BEGIN + END; + $_$ + LANGUAGE plpgsql; +-CREATE EXTENSION hypopg; ++CREATE EXTENSION IF NOT EXISTS hypopg; ++NOTICE: extension "hypopg" already exists, skipping + CREATE TABLE hypo (id integer, val text, "Id2" bigint); + INSERT INTO hypo SELECT i, 'line ' || i + FROM generate_series(1,100000) f(i); +diff --git a/test/sql/hypopg.sql b/test/sql/hypopg.sql +index 99722b0..8d6bacb 100644 +--- a/test/sql/hypopg.sql ++++ b/test/sql/hypopg.sql +@@ -12,7 +12,7 @@ END; + $_$ + LANGUAGE plpgsql; + +-CREATE EXTENSION hypopg; ++CREATE EXTENSION IF NOT EXISTS hypopg; + + CREATE TABLE hypo (id integer, val text, "Id2" bigint); + diff --git a/docker-compose/ext-src/hypopg-src/test-upgrade.sh b/docker-compose/ext-src/hypopg-src/test-upgrade.sh new file mode 100755 index 0000000000..066ac3329e --- /dev/null +++ b/docker-compose/ext-src/hypopg-src/test-upgrade.sh @@ -0,0 +1,6 @@ +#!/bin/sh +set -ex +cd "$(dirname ${0})" +patch -p1 Date: Wed, 29 Jan 2025 12:05:43 +0200 Subject: [PATCH 36/72] Fix C code to satisfy sanitizers (#10473) --- pgxn/neon/file_cache.c | 12 ++++++------ pgxn/neon/walproposer.c | 3 ++- 
vendor/postgres-v17 | 2 +- vendor/revisions.json | 2 +- 4 files changed, 10 insertions(+), 9 deletions(-) diff --git a/pgxn/neon/file_cache.c b/pgxn/neon/file_cache.c index 64b236061d..08b7652175 100644 --- a/pgxn/neon/file_cache.c +++ b/pgxn/neon/file_cache.c @@ -480,7 +480,7 @@ lfc_cache_contains(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno) if (LFC_ENABLED()) { entry = hash_search_with_hash_value(lfc_hash, &tag, hash, HASH_FIND, NULL); - found = entry != NULL && (entry->bitmap[chunk_offs >> 5] & (1 << (chunk_offs & 31))) != 0; + found = entry != NULL && (entry->bitmap[chunk_offs >> 5] & ((uint32)1 << (chunk_offs & 31))) != 0; } LWLockRelease(lfc_lock); return found; @@ -527,7 +527,7 @@ lfc_cache_containsv(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, for (; chunk_offs < BLOCKS_PER_CHUNK && i < nblocks; chunk_offs++, i++) { if ((entry->bitmap[chunk_offs >> 5] & - (1 << (chunk_offs & 31))) != 0) + ((uint32)1 << (chunk_offs & 31))) != 0) { BITMAP_SET(bitmap, i); found++; @@ -620,7 +620,7 @@ lfc_evict(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno) } /* remove the page from the cache */ - entry->bitmap[chunk_offs >> 5] &= ~(1 << (chunk_offs & (32 - 1))); + entry->bitmap[chunk_offs >> 5] &= ~((uint32)1 << (chunk_offs & (32 - 1))); if (entry->access_count == 0) { @@ -774,7 +774,7 @@ lfc_readv_select(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, * If the page is valid, we consider it "read". * All other pages will be fetched separately by the next cache */ - if (entry->bitmap[(chunk_offs + i) / 32] & (1 << ((chunk_offs + i) % 32))) + if (entry->bitmap[(chunk_offs + i) / 32] & ((uint32)1 << ((chunk_offs + i) % 32))) { BITMAP_SET(mask, buf_offset + i); iteration_hits++; @@ -1034,7 +1034,7 @@ lfc_writev(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, { lfc_ctl->used_pages += 1 - ((entry->bitmap[(chunk_offs + i) >> 5] >> ((chunk_offs + i) & 31)) & 1); entry->bitmap[(chunk_offs + i) >> 5] |= - (1 << ((chunk_offs + i) & 31)); + ((uint32)1 << ((chunk_offs + i) & 31)); } } @@ -1282,7 +1282,7 @@ local_cache_pages(PG_FUNCTION_ARGS) { for (int i = 0; i < BLOCKS_PER_CHUNK; i++) { - if (entry->bitmap[i >> 5] & (1 << (i & 31))) + if (entry->bitmap[i >> 5] & ((uint32)1 << (i & 31))) { fctx->record[n].pageoffs = entry->offset * BLOCKS_PER_CHUNK + i; fctx->record[n].relfilenode = NInfoGetRelNumber(BufTagGetNRelFileInfo(entry->key)); diff --git a/pgxn/neon/walproposer.c b/pgxn/neon/walproposer.c index e89ffdb628..7472fd6afc 100644 --- a/pgxn/neon/walproposer.c +++ b/pgxn/neon/walproposer.c @@ -1024,7 +1024,8 @@ DetermineEpochStartLsn(WalProposer *wp) dth = &wp->safekeeper[wp->donor].voteResponse.termHistory; wp->propTermHistory.n_entries = dth->n_entries + 1; wp->propTermHistory.entries = palloc(sizeof(TermSwitchEntry) * wp->propTermHistory.n_entries); - memcpy(wp->propTermHistory.entries, dth->entries, sizeof(TermSwitchEntry) * dth->n_entries); + if (dth->n_entries > 0) + memcpy(wp->propTermHistory.entries, dth->entries, sizeof(TermSwitchEntry) * dth->n_entries); wp->propTermHistory.entries[wp->propTermHistory.n_entries - 1].term = wp->propTerm; wp->propTermHistory.entries[wp->propTermHistory.n_entries - 1].lsn = wp->propEpochStartLsn; diff --git a/vendor/postgres-v17 b/vendor/postgres-v17 index 46f9b96555..b654fa88b6 160000 --- a/vendor/postgres-v17 +++ b/vendor/postgres-v17 @@ -1 +1 @@ -Subproject commit 46f9b96555e084c35dd975da9485996db9e86181 +Subproject commit b654fa88b6fd2ad24a03a14a7cd417ec66e518f9 diff --git a/vendor/revisions.json 
b/vendor/revisions.json index 3aa42d22c5..982f537692 100644 --- a/vendor/revisions.json +++ b/vendor/revisions.json @@ -1,7 +1,7 @@ { "v17": [ "17.2", - "46f9b96555e084c35dd975da9485996db9e86181" + "b654fa88b6fd2ad24a03a14a7cd417ec66e518f9" ], "v16": [ "16.6", From 222cc181e9314fe6c7596f06d677a92875f4aa8b Mon Sep 17 00:00:00 2001 From: Ivan Efremov Date: Wed, 29 Jan 2025 13:19:10 +0200 Subject: [PATCH 37/72] impr(proxy): Move the CancelMap to Redis hashes (#10364) ## Problem The approach of having CancelMap as an in-memory structure increases code complexity, as well as putting additional load for Redis streams. ## Summary of changes - Implement a set of KV ops for Redis client; - Remove cancel notifications code; - Send KV ops over the bounded channel to the handling background task for removing and adding the cancel keys. Closes #9660 --- Cargo.lock | 1 + libs/pq_proto/src/lib.rs | 7 + libs/proxy/tokio-postgres2/Cargo.toml | 1 + .../proxy/tokio-postgres2/src/cancel_token.rs | 3 +- libs/proxy/tokio-postgres2/src/client.rs | 3 +- libs/proxy/tokio-postgres2/src/config.rs | 5 +- proxy/src/auth/backend/mod.rs | 3 +- proxy/src/bin/local_proxy.rs | 10 +- proxy/src/bin/proxy.rs | 46 +- proxy/src/cancellation.rs | 563 +++++++++--------- proxy/src/compute.rs | 1 - proxy/src/console_redirect_proxy.rs | 29 +- proxy/src/metrics.rs | 30 +- proxy/src/proxy/mod.rs | 39 +- proxy/src/proxy/passthrough.rs | 9 +- proxy/src/rate_limiter/limiter.rs | 6 + proxy/src/redis/cancellation_publisher.rs | 72 +-- .../connection_with_credentials_provider.rs | 1 + proxy/src/redis/keys.rs | 88 +++ proxy/src/redis/kv_ops.rs | 185 ++++++ proxy/src/redis/mod.rs | 2 + proxy/src/redis/notifications.rs | 105 +--- proxy/src/serverless/mod.rs | 8 +- proxy/src/serverless/websocket.rs | 4 +- 24 files changed, 674 insertions(+), 547 deletions(-) create mode 100644 proxy/src/redis/keys.rs create mode 100644 proxy/src/redis/kv_ops.rs diff --git a/Cargo.lock b/Cargo.lock index 3c33901247..c19fdc0941 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -6935,6 +6935,7 @@ dependencies = [ "pin-project-lite", "postgres-protocol2", "postgres-types2", + "serde", "tokio", "tokio-util", ] diff --git a/libs/pq_proto/src/lib.rs b/libs/pq_proto/src/lib.rs index 50b2c69d24..f99128b76a 100644 --- a/libs/pq_proto/src/lib.rs +++ b/libs/pq_proto/src/lib.rs @@ -182,6 +182,13 @@ pub struct CancelKeyData { pub cancel_key: i32, } +pub fn id_to_cancel_key(id: u64) -> CancelKeyData { + CancelKeyData { + backend_pid: (id >> 32) as i32, + cancel_key: (id & 0xffffffff) as i32, + } +} + impl fmt::Display for CancelKeyData { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { let hi = (self.backend_pid as u64) << 32; diff --git a/libs/proxy/tokio-postgres2/Cargo.toml b/libs/proxy/tokio-postgres2/Cargo.toml index 56e7c4da47..ade0ffc9f6 100644 --- a/libs/proxy/tokio-postgres2/Cargo.toml +++ b/libs/proxy/tokio-postgres2/Cargo.toml @@ -19,3 +19,4 @@ postgres-protocol2 = { path = "../postgres-protocol2" } postgres-types2 = { path = "../postgres-types2" } tokio = { workspace = true, features = ["io-util", "time", "net"] } tokio-util = { workspace = true, features = ["codec"] } +serde = { workspace = true, features = ["derive"] } \ No newline at end of file diff --git a/libs/proxy/tokio-postgres2/src/cancel_token.rs b/libs/proxy/tokio-postgres2/src/cancel_token.rs index a10e8bf5c3..718f903a92 100644 --- a/libs/proxy/tokio-postgres2/src/cancel_token.rs +++ b/libs/proxy/tokio-postgres2/src/cancel_token.rs @@ -3,12 +3,13 @@ use crate::tls::TlsConnect; use 
crate::{cancel_query, client::SocketConfig, tls::MakeTlsConnect}; use crate::{cancel_query_raw, Error}; +use serde::{Deserialize, Serialize}; use tokio::io::{AsyncRead, AsyncWrite}; use tokio::net::TcpStream; /// The capability to request cancellation of in-progress queries on a /// connection. -#[derive(Clone)] +#[derive(Clone, Serialize, Deserialize)] pub struct CancelToken { pub socket_config: Option, pub ssl_mode: SslMode, diff --git a/libs/proxy/tokio-postgres2/src/client.rs b/libs/proxy/tokio-postgres2/src/client.rs index a7cd53afc3..9bbbd4c260 100644 --- a/libs/proxy/tokio-postgres2/src/client.rs +++ b/libs/proxy/tokio-postgres2/src/client.rs @@ -18,6 +18,7 @@ use fallible_iterator::FallibleIterator; use futures_util::{future, ready, TryStreamExt}; use parking_lot::Mutex; use postgres_protocol2::message::{backend::Message, frontend}; +use serde::{Deserialize, Serialize}; use std::collections::HashMap; use std::fmt; use std::sync::Arc; @@ -137,7 +138,7 @@ impl InnerClient { } } -#[derive(Clone)] +#[derive(Clone, Serialize, Deserialize)] pub struct SocketConfig { pub host: Host, pub port: u16, diff --git a/libs/proxy/tokio-postgres2/src/config.rs b/libs/proxy/tokio-postgres2/src/config.rs index 11a361a81b..47cc45ac80 100644 --- a/libs/proxy/tokio-postgres2/src/config.rs +++ b/libs/proxy/tokio-postgres2/src/config.rs @@ -7,6 +7,7 @@ use crate::tls::MakeTlsConnect; use crate::tls::TlsConnect; use crate::{Client, Connection, Error}; use postgres_protocol2::message::frontend::StartupMessageParams; +use serde::{Deserialize, Serialize}; use std::fmt; use std::str; use std::time::Duration; @@ -16,7 +17,7 @@ pub use postgres_protocol2::authentication::sasl::ScramKeys; use tokio::net::TcpStream; /// TLS configuration. -#[derive(Debug, Copy, Clone, PartialEq, Eq)] +#[derive(Debug, Copy, Clone, PartialEq, Eq, Serialize, Deserialize)] #[non_exhaustive] pub enum SslMode { /// Do not use TLS. @@ -50,7 +51,7 @@ pub enum ReplicationMode { } /// A host specification. -#[derive(Debug, Clone, PartialEq, Eq)] +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] pub enum Host { /// A TCP hostname. 
Tcp(String), diff --git a/proxy/src/auth/backend/mod.rs b/proxy/src/auth/backend/mod.rs index de48be2952..d17d91a56d 100644 --- a/proxy/src/auth/backend/mod.rs +++ b/proxy/src/auth/backend/mod.rs @@ -12,6 +12,7 @@ pub(crate) use console_redirect::ConsoleRedirectError; use ipnet::{Ipv4Net, Ipv6Net}; use local::LocalBackend; use postgres_client::config::AuthKeys; +use serde::{Deserialize, Serialize}; use tokio::io::{AsyncRead, AsyncWrite}; use tracing::{debug, info, warn}; @@ -133,7 +134,7 @@ pub(crate) struct ComputeUserInfoNoEndpoint { pub(crate) options: NeonOptions, } -#[derive(Debug, Clone, Default)] +#[derive(Debug, Clone, Default, Serialize, Deserialize)] pub(crate) struct ComputeUserInfo { pub(crate) endpoint: EndpointId, pub(crate) user: RoleName, diff --git a/proxy/src/bin/local_proxy.rs b/proxy/src/bin/local_proxy.rs index 644f670f88..ee8b3d4ef5 100644 --- a/proxy/src/bin/local_proxy.rs +++ b/proxy/src/bin/local_proxy.rs @@ -7,12 +7,11 @@ use std::time::Duration; use anyhow::{bail, ensure, Context}; use camino::{Utf8Path, Utf8PathBuf}; use compute_api::spec::LocalProxySpec; -use dashmap::DashMap; use futures::future::Either; use proxy::auth::backend::jwt::JwkCache; use proxy::auth::backend::local::{LocalBackend, JWKS_ROLE_MAP}; use proxy::auth::{self}; -use proxy::cancellation::CancellationHandlerMain; +use proxy::cancellation::CancellationHandler; use proxy::config::{ self, AuthenticationConfig, ComputeConfig, HttpConfig, ProxyConfig, RetryConfig, }; @@ -211,12 +210,7 @@ async fn main() -> anyhow::Result<()> { auth_backend, http_listener, shutdown.clone(), - Arc::new(CancellationHandlerMain::new( - &config.connect_to_compute, - Arc::new(DashMap::new()), - None, - proxy::metrics::CancellationSource::Local, - )), + Arc::new(CancellationHandler::new(&config.connect_to_compute, None)), endpoint_rate_limiter, ); diff --git a/proxy/src/bin/proxy.rs b/proxy/src/bin/proxy.rs index 70b50436bf..e1affe8391 100644 --- a/proxy/src/bin/proxy.rs +++ b/proxy/src/bin/proxy.rs @@ -7,7 +7,7 @@ use anyhow::bail; use futures::future::Either; use proxy::auth::backend::jwt::JwkCache; use proxy::auth::backend::{AuthRateLimiter, ConsoleRedirectBackend, MaybeOwned}; -use proxy::cancellation::{CancelMap, CancellationHandler}; +use proxy::cancellation::{handle_cancel_messages, CancellationHandler}; use proxy::config::{ self, remote_storage_from_toml, AuthenticationConfig, CacheOptions, ComputeConfig, HttpConfig, ProjectInfoCacheOptions, ProxyConfig, ProxyProtocolV2, @@ -18,8 +18,8 @@ use proxy::metrics::Metrics; use proxy::rate_limiter::{ EndpointRateLimiter, LeakyBucketConfig, RateBucketInfo, WakeComputeRateLimiter, }; -use proxy::redis::cancellation_publisher::RedisPublisherClient; use proxy::redis::connection_with_credentials_provider::ConnectionWithCredentialsProvider; +use proxy::redis::kv_ops::RedisKVClient; use proxy::redis::{elasticache, notifications}; use proxy::scram::threadpool::ThreadPool; use proxy::serverless::cancel_set::CancelSet; @@ -28,7 +28,6 @@ use proxy::tls::client_config::compute_client_config_with_root_certs; use proxy::{auth, control_plane, http, serverless, usage_metrics}; use remote_storage::RemoteStorageConfig; use tokio::net::TcpListener; -use tokio::sync::Mutex; use tokio::task::JoinSet; use tokio_util::sync::CancellationToken; use tracing::{info, warn, Instrument}; @@ -158,8 +157,11 @@ struct ProxyCliArgs { #[clap(long, default_value_t = 64)] auth_rate_limit_ip_subnet: u8, /// Redis rate limiter max number of requests per second. 
- #[clap(long, default_values_t = RateBucketInfo::DEFAULT_SET)] + #[clap(long, default_values_t = RateBucketInfo::DEFAULT_REDIS_SET)] redis_rps_limit: Vec, + /// Cancellation channel size (max queue size for redis kv client) + #[clap(long, default_value = "1024")] + cancellation_ch_size: usize, /// cache for `allowed_ips` (use `size=0` to disable) #[clap(long, default_value = config::CacheOptions::CACHE_DEFAULT_OPTIONS)] allowed_ips_cache: String, @@ -382,27 +384,19 @@ async fn main() -> anyhow::Result<()> { let cancellation_token = CancellationToken::new(); - let cancel_map = CancelMap::default(); - let redis_rps_limit = Vec::leak(args.redis_rps_limit.clone()); RateBucketInfo::validate(redis_rps_limit)?; - let redis_publisher = match ®ional_redis_client { - Some(redis_publisher) => Some(Arc::new(Mutex::new(RedisPublisherClient::new( - redis_publisher.clone(), - args.region.clone(), - redis_rps_limit, - )?))), - None => None, - }; + let redis_kv_client = regional_redis_client + .as_ref() + .map(|redis_publisher| RedisKVClient::new(redis_publisher.clone(), redis_rps_limit)); - let cancellation_handler = Arc::new(CancellationHandler::< - Option>>, - >::new( + // channel size should be higher than redis client limit to avoid blocking + let cancel_ch_size = args.cancellation_ch_size; + let (tx_cancel, rx_cancel) = tokio::sync::mpsc::channel(cancel_ch_size); + let cancellation_handler = Arc::new(CancellationHandler::new( &config.connect_to_compute, - cancel_map.clone(), - redis_publisher, - proxy::metrics::CancellationSource::FromClient, + Some(tx_cancel), )); // bit of a hack - find the min rps and max rps supported and turn it into @@ -495,25 +489,29 @@ async fn main() -> anyhow::Result<()> { let cache = api.caches.project_info.clone(); if let Some(client) = client1 { maintenance_tasks.spawn(notifications::task_main( - config, client, cache.clone(), - cancel_map.clone(), args.region.clone(), )); } if let Some(client) = client2 { maintenance_tasks.spawn(notifications::task_main( - config, client, cache.clone(), - cancel_map.clone(), args.region.clone(), )); } maintenance_tasks.spawn(async move { cache.clone().gc_worker().await }); } } + + if let Some(mut redis_kv_client) = redis_kv_client { + maintenance_tasks.spawn(async move { + redis_kv_client.try_connect().await?; + handle_cancel_messages(&mut redis_kv_client, rx_cancel).await + }); + } + if let Some(regional_redis_client) = regional_redis_client { let cache = api.caches.endpoints_cache.clone(); let con = regional_redis_client; diff --git a/proxy/src/cancellation.rs b/proxy/src/cancellation.rs index a96c43f2ce..34f708a36b 100644 --- a/proxy/src/cancellation.rs +++ b/proxy/src/cancellation.rs @@ -1,48 +1,124 @@ use std::net::{IpAddr, SocketAddr}; use std::sync::Arc; -use dashmap::DashMap; use ipnet::{IpNet, Ipv4Net, Ipv6Net}; use postgres_client::tls::MakeTlsConnect; use postgres_client::CancelToken; use pq_proto::CancelKeyData; +use serde::{Deserialize, Serialize}; use thiserror::Error; use tokio::net::TcpStream; -use tokio::sync::Mutex; +use tokio::sync::mpsc; use tracing::{debug, info}; -use uuid::Uuid; use crate::auth::backend::{BackendIpAllowlist, ComputeUserInfo}; -use crate::auth::{check_peer_addr_is_in_list, AuthError, IpPattern}; +use crate::auth::{check_peer_addr_is_in_list, AuthError}; use crate::config::ComputeConfig; use crate::context::RequestContext; use crate::error::ReportableError; use crate::ext::LockExt; -use crate::metrics::{CancellationRequest, CancellationSource, Metrics}; +use crate::metrics::CancelChannelSizeGuard; 
+use crate::metrics::{CancellationRequest, Metrics, RedisMsgKind}; use crate::rate_limiter::LeakyBucketRateLimiter; -use crate::redis::cancellation_publisher::{ - CancellationPublisher, CancellationPublisherMut, RedisPublisherClient, -}; +use crate::redis::keys::KeyPrefix; +use crate::redis::kv_ops::RedisKVClient; use crate::tls::postgres_rustls::MakeRustlsConnect; - -pub type CancelMap = Arc>>; -pub type CancellationHandlerMain = CancellationHandler>>>; -pub(crate) type CancellationHandlerMainInternal = Option>>; +use std::convert::Infallible; +use tokio::sync::oneshot; type IpSubnetKey = IpNet; +const CANCEL_KEY_TTL: i64 = 1_209_600; // 2 weeks cancellation key expire time +const REDIS_SEND_TIMEOUT: std::time::Duration = std::time::Duration::from_millis(10); + +// Message types for sending through mpsc channel +pub enum CancelKeyOp { + StoreCancelKey { + key: String, + field: String, + value: String, + resp_tx: Option>>, + _guard: CancelChannelSizeGuard<'static>, + expire: i64, // TTL for key + }, + GetCancelData { + key: String, + resp_tx: oneshot::Sender>>, + _guard: CancelChannelSizeGuard<'static>, + }, + RemoveCancelKey { + key: String, + field: String, + resp_tx: Option>>, + _guard: CancelChannelSizeGuard<'static>, + }, +} + +// Running as a separate task to accept messages through the rx channel +// In case of problems with RTT: switch to recv_many() + redis pipeline +pub async fn handle_cancel_messages( + client: &mut RedisKVClient, + mut rx: mpsc::Receiver, +) -> anyhow::Result { + loop { + if let Some(msg) = rx.recv().await { + match msg { + CancelKeyOp::StoreCancelKey { + key, + field, + value, + resp_tx, + _guard, + expire: _, + } => { + if let Some(resp_tx) = resp_tx { + resp_tx + .send(client.hset(key, field, value).await) + .inspect_err(|e| { + tracing::debug!("failed to send StoreCancelKey response: {:?}", e); + }) + .ok(); + } else { + drop(client.hset(key, field, value).await); + } + } + CancelKeyOp::GetCancelData { + key, + resp_tx, + _guard, + } => { + drop(resp_tx.send(client.hget_all(key).await)); + } + CancelKeyOp::RemoveCancelKey { + key, + field, + resp_tx, + _guard, + } => { + if let Some(resp_tx) = resp_tx { + resp_tx + .send(client.hdel(key, field).await) + .inspect_err(|e| { + tracing::debug!("failed to send StoreCancelKey response: {:?}", e); + }) + .ok(); + } else { + drop(client.hdel(key, field).await); + } + } + } + } + } +} + /// Enables serving `CancelRequest`s. /// /// If `CancellationPublisher` is available, cancel request will be used to publish the cancellation key to other proxy instances. -pub struct CancellationHandler
<P>
{ +pub struct CancellationHandler { compute_config: &'static ComputeConfig, - map: CancelMap, - client: P, - /// This field used for the monitoring purposes. - /// Represents the source of the cancellation request. - from: CancellationSource, // rate limiter of cancellation requests limiter: Arc>>, + tx: Option>, // send messages to the redis KV client task } #[derive(Debug, Error)] @@ -61,6 +137,12 @@ pub(crate) enum CancelError { #[error("Authentication backend error")] AuthError(#[from] AuthError), + + #[error("key not found")] + NotFound, + + #[error("proxy service error")] + InternalError, } impl ReportableError for CancelError { @@ -73,274 +155,191 @@ impl ReportableError for CancelError { CancelError::Postgres(_) => crate::error::ErrorKind::Compute, CancelError::RateLimit => crate::error::ErrorKind::RateLimit, CancelError::IpNotAllowed => crate::error::ErrorKind::User, + CancelError::NotFound => crate::error::ErrorKind::User, CancelError::AuthError(_) => crate::error::ErrorKind::ControlPlane, + CancelError::InternalError => crate::error::ErrorKind::Service, } } } -impl CancellationHandler
<P>
{ - /// Run async action within an ephemeral session identified by [`CancelKeyData`]. - pub(crate) fn get_session(self: Arc) -> Session
<P>
{ +impl CancellationHandler { + pub fn new( + compute_config: &'static ComputeConfig, + tx: Option>, + ) -> Self { + Self { + compute_config, + tx, + limiter: Arc::new(std::sync::Mutex::new( + LeakyBucketRateLimiter::::new_with_shards( + LeakyBucketRateLimiter::::DEFAULT, + 64, + ), + )), + } + } + + pub(crate) fn get_key(self: &Arc) -> Session { // we intentionally generate a random "backend pid" and "secret key" here. // we use the corresponding u64 as an identifier for the // actual endpoint+pid+secret for postgres/pgbouncer. // // if we forwarded the backend_pid from postgres to the client, there would be a lot // of overlap between our computes as most pids are small (~100). - let key = loop { - let key = rand::random(); - // Random key collisions are unlikely to happen here, but they're still possible, - // which is why we have to take care not to rewrite an existing key. - match self.map.entry(key) { - dashmap::mapref::entry::Entry::Occupied(_) => continue, - dashmap::mapref::entry::Entry::Vacant(e) => { - e.insert(None); - } - } - break key; - }; + let key: CancelKeyData = rand::random(); + + let prefix_key: KeyPrefix = KeyPrefix::Cancel(key); + let redis_key = prefix_key.build_redis_key(); debug!("registered new query cancellation key {key}"); Session { key, - cancellation_handler: self, + redis_key, + cancellation_handler: Arc::clone(self), } } - /// Cancelling only in notification, will be removed - pub(crate) async fn cancel_session( + async fn get_cancel_key( &self, key: CancelKeyData, - session_id: Uuid, - peer_addr: IpAddr, - check_allowed: bool, - ) -> Result<(), CancelError> { - // TODO: check for unspecified address is only for backward compatibility, should be removed - if !peer_addr.is_unspecified() { - let subnet_key = match peer_addr { - IpAddr::V4(ip) => IpNet::V4(Ipv4Net::new_assert(ip, 24).trunc()), // use defaut mask here - IpAddr::V6(ip) => IpNet::V6(Ipv6Net::new_assert(ip, 64).trunc()), - }; - if !self.limiter.lock_propagate_poison().check(subnet_key, 1) { - // log only the subnet part of the IP address to know which subnet is rate limited - tracing::warn!("Rate limit exceeded. Skipping cancellation message, {subnet_key}"); - Metrics::get() - .proxy - .cancellation_requests_total - .inc(CancellationRequest { - source: self.from, - kind: crate::metrics::CancellationOutcome::RateLimitExceeded, - }); - return Err(CancelError::RateLimit); - } - } + ) -> Result, CancelError> { + let prefix_key: KeyPrefix = KeyPrefix::Cancel(key); + let redis_key = prefix_key.build_redis_key(); - // NB: we should immediately release the lock after cloning the token. 
- let cancel_state = self.map.get(&key).and_then(|x| x.clone()); - let Some(cancel_closure) = cancel_state else { - tracing::warn!("query cancellation key not found: {key}"); - Metrics::get() + let (resp_tx, resp_rx) = tokio::sync::oneshot::channel(); + let op = CancelKeyOp::GetCancelData { + key: redis_key, + resp_tx, + _guard: Metrics::get() .proxy - .cancellation_requests_total - .inc(CancellationRequest { - source: self.from, - kind: crate::metrics::CancellationOutcome::NotFound, - }); - - if session_id == Uuid::nil() { - // was already published, do not publish it again - return Ok(()); - } - - match self.client.try_publish(key, session_id, peer_addr).await { - Ok(()) => {} // do nothing - Err(e) => { - // log it here since cancel_session could be spawned in a task - tracing::error!("failed to publish cancellation key: {key}, error: {e}"); - return Err(CancelError::IO(std::io::Error::new( - std::io::ErrorKind::Other, - e.to_string(), - ))); - } - } - return Ok(()); + .cancel_channel_size + .guard(RedisMsgKind::HGetAll), }; - if check_allowed - && !check_peer_addr_is_in_list(&peer_addr, cancel_closure.ip_allowlist.as_slice()) - { - // log it here since cancel_session could be spawned in a task - tracing::warn!("IP is not allowed to cancel the query: {key}"); - return Err(CancelError::IpNotAllowed); - } + let Some(tx) = &self.tx else { + tracing::warn!("cancellation handler is not available"); + return Err(CancelError::InternalError); + }; - Metrics::get() - .proxy - .cancellation_requests_total - .inc(CancellationRequest { - source: self.from, - kind: crate::metrics::CancellationOutcome::Found, - }); - info!( - "cancelling query per user's request using key {key}, hostname {}, address: {}", - cancel_closure.hostname, cancel_closure.socket_addr - ); - cancel_closure.try_cancel_query(self.compute_config).await + tx.send_timeout(op, REDIS_SEND_TIMEOUT) + .await + .map_err(|e| { + tracing::warn!("failed to send GetCancelData for {key}: {e}"); + }) + .map_err(|()| CancelError::InternalError)?; + + let result = resp_rx.await.map_err(|e| { + tracing::warn!("failed to receive GetCancelData response: {e}"); + CancelError::InternalError + })?; + + let cancel_state_str: Option = match result { + Ok(mut state) => { + if state.len() == 1 { + Some(state.remove(0).1) + } else { + tracing::warn!("unexpected number of entries in cancel state: {state:?}"); + return Err(CancelError::InternalError); + } + } + Err(e) => { + tracing::warn!("failed to receive cancel state from redis: {e}"); + return Err(CancelError::InternalError); + } + }; + + let cancel_state: Option = match cancel_state_str { + Some(state) => { + let cancel_closure: CancelClosure = serde_json::from_str(&state).map_err(|e| { + tracing::warn!("failed to deserialize cancel state: {e}"); + CancelError::InternalError + })?; + Some(cancel_closure) + } + None => None, + }; + Ok(cancel_state) } - /// Try to cancel a running query for the corresponding connection. /// If the cancellation key is not found, it will be published to Redis. /// check_allowed - if true, check if the IP is allowed to cancel the query. /// Will fetch IP allowlist internally. 
/// /// return Result primarily for tests - pub(crate) async fn cancel_session_auth( + pub(crate) async fn cancel_session( &self, key: CancelKeyData, ctx: RequestContext, check_allowed: bool, auth_backend: &T, ) -> Result<(), CancelError> { - // TODO: check for unspecified address is only for backward compatibility, should be removed - if !ctx.peer_addr().is_unspecified() { - let subnet_key = match ctx.peer_addr() { - IpAddr::V4(ip) => IpNet::V4(Ipv4Net::new_assert(ip, 24).trunc()), // use defaut mask here - IpAddr::V6(ip) => IpNet::V6(Ipv6Net::new_assert(ip, 64).trunc()), - }; - if !self.limiter.lock_propagate_poison().check(subnet_key, 1) { - // log only the subnet part of the IP address to know which subnet is rate limited - tracing::warn!("Rate limit exceeded. Skipping cancellation message, {subnet_key}"); - Metrics::get() - .proxy - .cancellation_requests_total - .inc(CancellationRequest { - source: self.from, - kind: crate::metrics::CancellationOutcome::RateLimitExceeded, - }); - return Err(CancelError::RateLimit); - } + let subnet_key = match ctx.peer_addr() { + IpAddr::V4(ip) => IpNet::V4(Ipv4Net::new_assert(ip, 24).trunc()), // use defaut mask here + IpAddr::V6(ip) => IpNet::V6(Ipv6Net::new_assert(ip, 64).trunc()), + }; + if !self.limiter.lock_propagate_poison().check(subnet_key, 1) { + // log only the subnet part of the IP address to know which subnet is rate limited + tracing::warn!("Rate limit exceeded. Skipping cancellation message, {subnet_key}"); + Metrics::get() + .proxy + .cancellation_requests_total + .inc(CancellationRequest { + kind: crate::metrics::CancellationOutcome::RateLimitExceeded, + }); + return Err(CancelError::RateLimit); } - // NB: we should immediately release the lock after cloning the token. - let cancel_state = self.map.get(&key).and_then(|x| x.clone()); + let cancel_state = self.get_cancel_key(key).await.map_err(|e| { + tracing::warn!("failed to receive RedisOp response: {e}"); + CancelError::InternalError + })?; + let Some(cancel_closure) = cancel_state else { tracing::warn!("query cancellation key not found: {key}"); Metrics::get() .proxy .cancellation_requests_total .inc(CancellationRequest { - source: self.from, kind: crate::metrics::CancellationOutcome::NotFound, }); - - if ctx.session_id() == Uuid::nil() { - // was already published, do not publish it again - return Ok(()); - } - - match self - .client - .try_publish(key, ctx.session_id(), ctx.peer_addr()) - .await - { - Ok(()) => {} // do nothing - Err(e) => { - // log it here since cancel_session could be spawned in a task - tracing::error!("failed to publish cancellation key: {key}, error: {e}"); - return Err(CancelError::IO(std::io::Error::new( - std::io::ErrorKind::Other, - e.to_string(), - ))); - } - } - return Ok(()); + return Err(CancelError::NotFound); }; - let ip_allowlist = auth_backend - .get_allowed_ips(&ctx, &cancel_closure.user_info) - .await - .map_err(CancelError::AuthError)?; + if check_allowed { + let ip_allowlist = auth_backend + .get_allowed_ips(&ctx, &cancel_closure.user_info) + .await + .map_err(CancelError::AuthError)?; - if check_allowed && !check_peer_addr_is_in_list(&ctx.peer_addr(), &ip_allowlist) { - // log it here since cancel_session could be spawned in a task - tracing::warn!("IP is not allowed to cancel the query: {key}"); - return Err(CancelError::IpNotAllowed); + if !check_peer_addr_is_in_list(&ctx.peer_addr(), &ip_allowlist) { + // log it here since cancel_session could be spawned in a task + tracing::warn!( + "IP is not allowed to cancel the query: {key}, 
address: {}", + ctx.peer_addr() + ); + return Err(CancelError::IpNotAllowed); + } } Metrics::get() .proxy .cancellation_requests_total .inc(CancellationRequest { - source: self.from, kind: crate::metrics::CancellationOutcome::Found, }); info!("cancelling query per user's request using key {key}"); cancel_closure.try_cancel_query(self.compute_config).await } - - #[cfg(test)] - fn contains(&self, session: &Session
<P>
) -> bool { - self.map.contains_key(&session.key) - } - - #[cfg(test)] - fn is_empty(&self) -> bool { - self.map.is_empty() - } -} - -impl CancellationHandler<()> { - pub fn new( - compute_config: &'static ComputeConfig, - map: CancelMap, - from: CancellationSource, - ) -> Self { - Self { - compute_config, - map, - client: (), - from, - limiter: Arc::new(std::sync::Mutex::new( - LeakyBucketRateLimiter::::new_with_shards( - LeakyBucketRateLimiter::::DEFAULT, - 64, - ), - )), - } - } -} - -impl CancellationHandler>>> { - pub fn new( - compute_config: &'static ComputeConfig, - map: CancelMap, - client: Option>>, - from: CancellationSource, - ) -> Self { - Self { - compute_config, - map, - client, - from, - limiter: Arc::new(std::sync::Mutex::new( - LeakyBucketRateLimiter::::new_with_shards( - LeakyBucketRateLimiter::::DEFAULT, - 64, - ), - )), - } - } } /// This should've been a [`std::future::Future`], but /// it's impossible to name a type of an unboxed future /// (we'd need something like `#![feature(type_alias_impl_trait)]`). -#[derive(Clone)] +#[derive(Clone, Serialize, Deserialize)] pub struct CancelClosure { socket_addr: SocketAddr, cancel_token: CancelToken, - ip_allowlist: Vec, hostname: String, // for pg_sni router user_info: ComputeUserInfo, } @@ -349,14 +348,12 @@ impl CancelClosure { pub(crate) fn new( socket_addr: SocketAddr, cancel_token: CancelToken, - ip_allowlist: Vec, hostname: String, user_info: ComputeUserInfo, ) -> Self { Self { socket_addr, cancel_token, - ip_allowlist, hostname, user_info, } @@ -385,99 +382,75 @@ impl CancelClosure { debug!("query was cancelled"); Ok(()) } - - /// Obsolete (will be removed after moving CancelMap to Redis), only for notifications - pub(crate) fn set_ip_allowlist(&mut self, ip_allowlist: Vec) { - self.ip_allowlist = ip_allowlist; - } } /// Helper for registering query cancellation tokens. -pub(crate) struct Session
<P>
{ +pub(crate) struct Session { /// The user-facing key identifying this session. key: CancelKeyData, - /// The [`CancelMap`] this session belongs to. - cancellation_handler: Arc>, + redis_key: String, + cancellation_handler: Arc, } -impl
<P> Session<P>
{ - /// Store the cancel token for the given session. - /// This enables query cancellation in `crate::proxy::prepare_client_connection`. - pub(crate) fn enable_query_cancellation(&self, cancel_closure: CancelClosure) -> CancelKeyData { - debug!("enabling query cancellation for this session"); - self.cancellation_handler - .map - .insert(self.key, Some(cancel_closure)); - - self.key +impl Session { + pub(crate) fn key(&self) -> &CancelKeyData { + &self.key } -} -impl
<P> Drop for Session<P>
{ - fn drop(&mut self) { - self.cancellation_handler.map.remove(&self.key); - debug!("dropped query cancellation key {}", &self.key); - } -} - -#[cfg(test)] -#[expect(clippy::unwrap_used)] -mod tests { - use std::time::Duration; - - use super::*; - use crate::config::RetryConfig; - use crate::tls::client_config::compute_client_config_with_certs; - - fn config() -> ComputeConfig { - let retry = RetryConfig { - base_delay: Duration::from_secs(1), - max_retries: 5, - backoff_factor: 2.0, + // Send the store key op to the cancellation handler + pub(crate) async fn write_cancel_key( + &self, + cancel_closure: CancelClosure, + ) -> Result<(), CancelError> { + let Some(tx) = &self.cancellation_handler.tx else { + tracing::warn!("cancellation handler is not available"); + return Err(CancelError::InternalError); }; - ComputeConfig { - retry, - tls: Arc::new(compute_client_config_with_certs(std::iter::empty())), - timeout: Duration::from_secs(2), - } - } + let closure_json = serde_json::to_string(&cancel_closure).map_err(|e| { + tracing::warn!("failed to serialize cancel closure: {e}"); + CancelError::InternalError + })?; - #[tokio::test] - async fn check_session_drop() -> anyhow::Result<()> { - let cancellation_handler = Arc::new(CancellationHandler::<()>::new( - Box::leak(Box::new(config())), - CancelMap::default(), - CancellationSource::FromRedis, - )); - - let session = cancellation_handler.clone().get_session(); - assert!(cancellation_handler.contains(&session)); - drop(session); - // Check that the session has been dropped. - assert!(cancellation_handler.is_empty()); + let op = CancelKeyOp::StoreCancelKey { + key: self.redis_key.clone(), + field: "data".to_string(), + value: closure_json, + resp_tx: None, + _guard: Metrics::get() + .proxy + .cancel_channel_size + .guard(RedisMsgKind::HSet), + expire: CANCEL_KEY_TTL, + }; + let _ = tx.send_timeout(op, REDIS_SEND_TIMEOUT).await.map_err(|e| { + let key = self.key; + tracing::warn!("failed to send StoreCancelKey for {key}: {e}"); + }); Ok(()) } - #[tokio::test] - async fn cancel_session_noop_regression() { - let handler = CancellationHandler::<()>::new( - Box::leak(Box::new(config())), - CancelMap::default(), - CancellationSource::Local, - ); - handler - .cancel_session( - CancelKeyData { - backend_pid: 0, - cancel_key: 0, - }, - Uuid::new_v4(), - "127.0.0.1".parse().unwrap(), - true, - ) - .await - .unwrap(); + pub(crate) async fn remove_cancel_key(&self) -> Result<(), CancelError> { + let Some(tx) = &self.cancellation_handler.tx else { + tracing::warn!("cancellation handler is not available"); + return Err(CancelError::InternalError); + }; + + let op = CancelKeyOp::RemoveCancelKey { + key: self.redis_key.clone(), + field: "data".to_string(), + resp_tx: None, + _guard: Metrics::get() + .proxy + .cancel_channel_size + .guard(RedisMsgKind::HSet), + }; + + let _ = tx.send_timeout(op, REDIS_SEND_TIMEOUT).await.map_err(|e| { + let key = self.key; + tracing::warn!("failed to send RemoveCancelKey for {key}: {e}"); + }); + Ok(()) } } diff --git a/proxy/src/compute.rs b/proxy/src/compute.rs index aff796bbab..d71465765f 100644 --- a/proxy/src/compute.rs +++ b/proxy/src/compute.rs @@ -296,7 +296,6 @@ impl ConnCfg { process_id, secret_key, }, - vec![], // TODO: deprecated, will be removed host.to_string(), user_info, ); diff --git a/proxy/src/console_redirect_proxy.rs b/proxy/src/console_redirect_proxy.rs index 0c6755063f..78bfb6deac 100644 --- a/proxy/src/console_redirect_proxy.rs +++ b/proxy/src/console_redirect_proxy.rs @@ -6,7 +6,7 @@ use 
tokio_util::sync::CancellationToken; use tracing::{debug, error, info, Instrument}; use crate::auth::backend::ConsoleRedirectBackend; -use crate::cancellation::{CancellationHandlerMain, CancellationHandlerMainInternal}; +use crate::cancellation::CancellationHandler; use crate::config::{ProxyConfig, ProxyProtocolV2}; use crate::context::RequestContext; use crate::error::ReportableError; @@ -24,7 +24,7 @@ pub async fn task_main( backend: &'static ConsoleRedirectBackend, listener: tokio::net::TcpListener, cancellation_token: CancellationToken, - cancellation_handler: Arc, + cancellation_handler: Arc, ) -> anyhow::Result<()> { scopeguard::defer! { info!("proxy has shut down"); @@ -140,15 +140,16 @@ pub async fn task_main( Ok(()) } +#[allow(clippy::too_many_arguments)] pub(crate) async fn handle_client( config: &'static ProxyConfig, backend: &'static ConsoleRedirectBackend, ctx: &RequestContext, - cancellation_handler: Arc, + cancellation_handler: Arc, stream: S, conn_gauge: NumClientConnectionsGuard<'static>, cancellations: tokio_util::task::task_tracker::TaskTracker, -) -> Result>, ClientRequestError> { +) -> Result>, ClientRequestError> { debug!( protocol = %ctx.protocol(), "handling interactive connection from client" @@ -171,13 +172,13 @@ pub(crate) async fn handle_client( HandshakeData::Cancel(cancel_key_data) => { // spawn a task to cancel the session, but don't wait for it cancellations.spawn({ - let cancellation_handler_clone = Arc::clone(&cancellation_handler); + let cancellation_handler_clone = Arc::clone(&cancellation_handler); let ctx = ctx.clone(); let cancel_span = tracing::span!(parent: None, tracing::Level::INFO, "cancel_session", session_id = ?ctx.session_id()); cancel_span.follows_from(tracing::Span::current()); async move { cancellation_handler_clone - .cancel_session_auth( + .cancel_session( cancel_key_data, ctx, config.authentication_config.ip_allowlist_check_enabled, @@ -195,7 +196,7 @@ pub(crate) async fn handle_client( ctx.set_db_options(params.clone()); - let (node_info, user_info, ip_allowlist) = match backend + let (node_info, user_info, _ip_allowlist) = match backend .authenticate(ctx, &config.authentication_config, &mut stream) .await { @@ -220,10 +221,14 @@ pub(crate) async fn handle_client( .or_else(|e| stream.throw_error(e)) .await?; - node.cancel_closure - .set_ip_allowlist(ip_allowlist.unwrap_or_default()); - let session = cancellation_handler.get_session(); - prepare_client_connection(&node, &session, &mut stream).await?; + let cancellation_handler_clone = Arc::clone(&cancellation_handler); + let session = cancellation_handler_clone.get_key(); + + session + .write_cancel_key(node.cancel_closure.clone()) + .await?; + + prepare_client_connection(&node, *session.key(), &mut stream).await?; // Before proxy passing, forward to compute whatever data is left in the // PqStream input buffer. 
Normally there is none, but our serverless npm @@ -237,8 +242,8 @@ pub(crate) async fn handle_client( aux: node.aux.clone(), compute: node, session_id: ctx.session_id(), + cancel: session, _req: request_gauge, _conn: conn_gauge, - _cancel: session, })) } diff --git a/proxy/src/metrics.rs b/proxy/src/metrics.rs index 659c57c865..f3d281a26b 100644 --- a/proxy/src/metrics.rs +++ b/proxy/src/metrics.rs @@ -56,6 +56,8 @@ pub struct ProxyMetrics { pub connection_requests: CounterPairVec, #[metric(flatten)] pub http_endpoint_pools: HttpEndpointPools, + #[metric(flatten)] + pub cancel_channel_size: CounterPairVec, /// Time it took for proxy to establish a connection to the compute endpoint. // largest bucket = 2^16 * 0.5ms = 32s @@ -294,6 +296,16 @@ impl CounterPairAssoc for NumConnectionRequestsGauge { pub type NumConnectionRequestsGuard<'a> = metrics::MeasuredCounterPairGuard<'a, NumConnectionRequestsGauge>; +pub struct CancelChannelSizeGauge; +impl CounterPairAssoc for CancelChannelSizeGauge { + const INC_NAME: &'static MetricName = MetricName::from_str("opened_msgs_cancel_channel_total"); + const DEC_NAME: &'static MetricName = MetricName::from_str("closed_msgs_cancel_channel_total"); + const INC_HELP: &'static str = "Number of processing messages in the cancellation channel."; + const DEC_HELP: &'static str = "Number of closed messages in the cancellation channel."; + type LabelGroupSet = StaticLabelSet; +} +pub type CancelChannelSizeGuard<'a> = metrics::MeasuredCounterPairGuard<'a, CancelChannelSizeGauge>; + #[derive(LabelGroup)] #[label(set = ComputeConnectionLatencySet)] pub struct ComputeConnectionLatencyGroup { @@ -340,13 +352,6 @@ pub struct RedisErrors<'a> { pub channel: &'a str, } -#[derive(FixedCardinalityLabel, Copy, Clone)] -pub enum CancellationSource { - FromClient, - FromRedis, - Local, -} - #[derive(FixedCardinalityLabel, Copy, Clone)] pub enum CancellationOutcome { NotFound, @@ -357,7 +362,6 @@ pub enum CancellationOutcome { #[derive(LabelGroup)] #[label(set = CancellationRequestSet)] pub struct CancellationRequest { - pub source: CancellationSource, pub kind: CancellationOutcome, } @@ -369,6 +373,16 @@ pub enum Waiting { RetryTimeout, } +#[derive(FixedCardinalityLabel, Copy, Clone)] +#[label(singleton = "kind")] +pub enum RedisMsgKind { + HSet, + HSetMultiple, + HGet, + HGetAll, + HDel, +} + #[derive(Default)] struct Accumulated { cplane: time::Duration, diff --git a/proxy/src/proxy/mod.rs b/proxy/src/proxy/mod.rs index 63f93f0a91..ab173bd0d0 100644 --- a/proxy/src/proxy/mod.rs +++ b/proxy/src/proxy/mod.rs @@ -13,8 +13,9 @@ pub use copy_bidirectional::{copy_bidirectional_client_compute, ErrorSource}; use futures::{FutureExt, TryFutureExt}; use itertools::Itertools; use once_cell::sync::OnceCell; -use pq_proto::{BeMessage as Be, StartupMessageParams}; +use pq_proto::{BeMessage as Be, CancelKeyData, StartupMessageParams}; use regex::Regex; +use serde::{Deserialize, Serialize}; use smol_str::{format_smolstr, SmolStr}; use thiserror::Error; use tokio::io::{AsyncRead, AsyncWrite, AsyncWriteExt}; @@ -23,7 +24,7 @@ use tracing::{debug, error, info, warn, Instrument}; use self::connect_compute::{connect_to_compute, TcpMechanism}; use self::passthrough::ProxyPassthrough; -use crate::cancellation::{self, CancellationHandlerMain, CancellationHandlerMainInternal}; +use crate::cancellation::{self, CancellationHandler}; use crate::config::{ProxyConfig, ProxyProtocolV2, TlsConfig}; use crate::context::RequestContext; use crate::error::ReportableError; @@ -57,7 +58,7 @@ pub async fn 
task_main( auth_backend: &'static auth::Backend<'static, ()>, listener: tokio::net::TcpListener, cancellation_token: CancellationToken, - cancellation_handler: Arc, + cancellation_handler: Arc, endpoint_rate_limiter: Arc, ) -> anyhow::Result<()> { scopeguard::defer! { @@ -243,13 +244,13 @@ pub(crate) async fn handle_client( config: &'static ProxyConfig, auth_backend: &'static auth::Backend<'static, ()>, ctx: &RequestContext, - cancellation_handler: Arc, + cancellation_handler: Arc, stream: S, mode: ClientMode, endpoint_rate_limiter: Arc, conn_gauge: NumClientConnectionsGuard<'static>, cancellations: tokio_util::task::task_tracker::TaskTracker, -) -> Result>, ClientRequestError> { +) -> Result>, ClientRequestError> { debug!( protocol = %ctx.protocol(), "handling interactive connection from client" @@ -278,7 +279,7 @@ pub(crate) async fn handle_client( cancel_span.follows_from(tracing::Span::current()); async move { cancellation_handler_clone - .cancel_session_auth( + .cancel_session( cancel_key_data, ctx, config.authentication_config.ip_allowlist_check_enabled, @@ -312,7 +313,7 @@ pub(crate) async fn handle_client( }; let user = user_info.get_user().to_owned(); - let (user_info, ip_allowlist) = match user_info + let (user_info, _ip_allowlist) = match user_info .authenticate( ctx, &mut stream, @@ -356,10 +357,14 @@ pub(crate) async fn handle_client( .or_else(|e| stream.throw_error(e)) .await?; - node.cancel_closure - .set_ip_allowlist(ip_allowlist.unwrap_or_default()); - let session = cancellation_handler.get_session(); - prepare_client_connection(&node, &session, &mut stream).await?; + let cancellation_handler_clone = Arc::clone(&cancellation_handler); + let session = cancellation_handler_clone.get_key(); + + session + .write_cancel_key(node.cancel_closure.clone()) + .await?; + + prepare_client_connection(&node, *session.key(), &mut stream).await?; // Before proxy passing, forward to compute whatever data is left in the // PqStream input buffer. Normally there is none, but our serverless npm @@ -373,23 +378,19 @@ pub(crate) async fn handle_client( aux: node.aux.clone(), compute: node, session_id: ctx.session_id(), + cancel: session, _req: request_gauge, _conn: conn_gauge, - _cancel: session, })) } /// Finish client connection initialization: confirm auth success, send params, etc. #[tracing::instrument(skip_all)] -pub(crate) async fn prepare_client_connection
<P>
( +pub(crate) async fn prepare_client_connection( node: &compute::PostgresConnection, - session: &cancellation::Session
<P>
, + cancel_key_data: CancelKeyData, stream: &mut PqStream, ) -> Result<(), std::io::Error> { - // Register compute's query cancellation token and produce a new, unique one. - // The new token (cancel_key_data) will be sent to the client. - let cancel_key_data = session.enable_query_cancellation(node.cancel_closure.clone()); - // Forward all deferred notices to the client. for notice in &node.delayed_notice { stream.write_message_noflush(&Be::Raw(b'N', notice.as_bytes()))?; @@ -411,7 +412,7 @@ pub(crate) async fn prepare_client_connection
<P>
( Ok(()) } -#[derive(Debug, Clone, PartialEq, Eq, Default)] +#[derive(Debug, Clone, PartialEq, Eq, Default, Serialize, Deserialize)] pub(crate) struct NeonOptions(Vec<(SmolStr, SmolStr)>); impl NeonOptions { diff --git a/proxy/src/proxy/passthrough.rs b/proxy/src/proxy/passthrough.rs index a42f9aad39..08871380d6 100644 --- a/proxy/src/proxy/passthrough.rs +++ b/proxy/src/proxy/passthrough.rs @@ -56,18 +56,18 @@ pub(crate) async fn proxy_pass( Ok(()) } -pub(crate) struct ProxyPassthrough { +pub(crate) struct ProxyPassthrough { pub(crate) client: Stream, pub(crate) compute: PostgresConnection, pub(crate) aux: MetricsAuxInfo, pub(crate) session_id: uuid::Uuid, + pub(crate) cancel: cancellation::Session, pub(crate) _req: NumConnectionRequestsGuard<'static>, pub(crate) _conn: NumClientConnectionsGuard<'static>, - pub(crate) _cancel: cancellation::Session
<P>
, } -impl ProxyPassthrough { +impl ProxyPassthrough { pub(crate) async fn proxy_pass( self, compute_config: &ComputeConfig, @@ -81,6 +81,9 @@ impl ProxyPassthrough { { tracing::warn!(session_id = ?self.session_id, ?err, "could not cancel the query in the database"); } + + drop(self.cancel.remove_cancel_key().await); // we don't need a result. If the queue is full, we just log the error + res } } diff --git a/proxy/src/rate_limiter/limiter.rs b/proxy/src/rate_limiter/limiter.rs index 6f6a8c9d47..ec080f270b 100644 --- a/proxy/src/rate_limiter/limiter.rs +++ b/proxy/src/rate_limiter/limiter.rs @@ -138,6 +138,12 @@ impl RateBucketInfo { Self::new(200, Duration::from_secs(600)), ]; + // For all the sessions will be cancel key. So this limit is essentially global proxy limit. + pub const DEFAULT_REDIS_SET: [Self; 2] = [ + Self::new(100_000, Duration::from_secs(1)), + Self::new(50_000, Duration::from_secs(10)), + ]; + /// All of these are per endpoint-maskedip pair. /// Context: 4096 rounds of pbkdf2 take about 1ms of cpu time to execute (1 milli-cpu-second or 1mcpus). /// diff --git a/proxy/src/redis/cancellation_publisher.rs b/proxy/src/redis/cancellation_publisher.rs index 228dbb7f64..30d8b83e60 100644 --- a/proxy/src/redis/cancellation_publisher.rs +++ b/proxy/src/redis/cancellation_publisher.rs @@ -2,12 +2,10 @@ use core::net::IpAddr; use std::sync::Arc; use pq_proto::CancelKeyData; -use redis::AsyncCommands; use tokio::sync::Mutex; use uuid::Uuid; use super::connection_with_credentials_provider::ConnectionWithCredentialsProvider; -use super::notifications::{CancelSession, Notification, PROXY_CHANNEL_NAME}; use crate::rate_limiter::{GlobalRateLimiter, RateBucketInfo}; pub trait CancellationPublisherMut: Send + Sync + 'static { @@ -83,9 +81,10 @@ impl CancellationPublisher for Arc> { } pub struct RedisPublisherClient { + #[allow(dead_code)] client: ConnectionWithCredentialsProvider, - region_id: String, - limiter: GlobalRateLimiter, + _region_id: String, + _limiter: GlobalRateLimiter, } impl RedisPublisherClient { @@ -96,26 +95,12 @@ impl RedisPublisherClient { ) -> anyhow::Result { Ok(Self { client, - region_id, - limiter: GlobalRateLimiter::new(info.into()), + _region_id: region_id, + _limiter: GlobalRateLimiter::new(info.into()), }) } - async fn publish( - &mut self, - cancel_key_data: CancelKeyData, - session_id: Uuid, - peer_addr: IpAddr, - ) -> anyhow::Result<()> { - let payload = serde_json::to_string(&Notification::Cancel(CancelSession { - region_id: Some(self.region_id.clone()), - cancel_key_data, - session_id, - peer_addr: Some(peer_addr), - }))?; - let _: () = self.client.publish(PROXY_CHANNEL_NAME, payload).await?; - Ok(()) - } + #[allow(dead_code)] pub(crate) async fn try_connect(&mut self) -> anyhow::Result<()> { match self.client.connect().await { Ok(()) => {} @@ -126,49 +111,4 @@ impl RedisPublisherClient { } Ok(()) } - async fn try_publish_internal( - &mut self, - cancel_key_data: CancelKeyData, - session_id: Uuid, - peer_addr: IpAddr, - ) -> anyhow::Result<()> { - // TODO: review redundant error duplication logs. - if !self.limiter.check() { - tracing::info!("Rate limit exceeded. Skipping cancellation message"); - return Err(anyhow::anyhow!("Rate limit exceeded")); - } - match self.publish(cancel_key_data, session_id, peer_addr).await { - Ok(()) => return Ok(()), - Err(e) => { - tracing::error!("failed to publish a message: {e}"); - } - } - tracing::info!("Publisher is disconnected. 
Reconnectiong..."); - self.try_connect().await?; - self.publish(cancel_key_data, session_id, peer_addr).await - } -} - -impl CancellationPublisherMut for RedisPublisherClient { - async fn try_publish( - &mut self, - cancel_key_data: CancelKeyData, - session_id: Uuid, - peer_addr: IpAddr, - ) -> anyhow::Result<()> { - tracing::info!("publishing cancellation key to Redis"); - match self - .try_publish_internal(cancel_key_data, session_id, peer_addr) - .await - { - Ok(()) => { - tracing::debug!("cancellation key successfuly published to Redis"); - Ok(()) - } - Err(e) => { - tracing::error!("failed to publish a message: {e}"); - Err(e) - } - } - } } diff --git a/proxy/src/redis/connection_with_credentials_provider.rs b/proxy/src/redis/connection_with_credentials_provider.rs index 0f6e765b02..b5c3d13216 100644 --- a/proxy/src/redis/connection_with_credentials_provider.rs +++ b/proxy/src/redis/connection_with_credentials_provider.rs @@ -29,6 +29,7 @@ impl Clone for Credentials { /// Provides PubSub connection without credentials refresh. pub struct ConnectionWithCredentialsProvider { credentials: Credentials, + // TODO: with more load on the connection, we should consider using a connection pool con: Option, refresh_token_task: Option>, mutex: tokio::sync::Mutex<()>, diff --git a/proxy/src/redis/keys.rs b/proxy/src/redis/keys.rs new file mode 100644 index 0000000000..dddc7e2054 --- /dev/null +++ b/proxy/src/redis/keys.rs @@ -0,0 +1,88 @@ +use anyhow::Ok; +use pq_proto::{id_to_cancel_key, CancelKeyData}; +use serde::{Deserialize, Serialize}; +use std::io::ErrorKind; + +pub mod keyspace { + pub const CANCEL_PREFIX: &str = "cancel"; +} + +#[derive(Clone, Debug, Serialize, Deserialize, Eq, PartialEq)] +pub(crate) enum KeyPrefix { + #[serde(untagged)] + Cancel(CancelKeyData), +} + +impl KeyPrefix { + pub(crate) fn build_redis_key(&self) -> String { + match self { + KeyPrefix::Cancel(key) => { + let hi = (key.backend_pid as u64) << 32; + let lo = (key.cancel_key as u64) & 0xffff_ffff; + let id = hi | lo; + let keyspace = keyspace::CANCEL_PREFIX; + format!("{keyspace}:{id:x}") + } + } + } + + #[allow(dead_code)] + pub(crate) fn as_str(&self) -> &'static str { + match self { + KeyPrefix::Cancel(_) => keyspace::CANCEL_PREFIX, + } + } +} + +#[allow(dead_code)] +pub(crate) fn parse_redis_key(key: &str) -> anyhow::Result { + let (prefix, key_str) = key.split_once(':').ok_or_else(|| { + anyhow::anyhow!(std::io::Error::new( + ErrorKind::InvalidData, + "missing prefix" + )) + })?; + + match prefix { + keyspace::CANCEL_PREFIX => { + let id = u64::from_str_radix(key_str, 16)?; + + Ok(KeyPrefix::Cancel(id_to_cancel_key(id))) + } + _ => Err(anyhow::anyhow!(std::io::Error::new( + ErrorKind::InvalidData, + "unknown prefix" + ))), + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_build_redis_key() { + let cancel_key: KeyPrefix = KeyPrefix::Cancel(CancelKeyData { + backend_pid: 12345, + cancel_key: 54321, + }); + + let redis_key = cancel_key.build_redis_key(); + assert_eq!(redis_key, "cancel:30390000d431"); + } + + #[test] + fn test_parse_redis_key() { + let redis_key = "cancel:30390000d431"; + let key: KeyPrefix = parse_redis_key(redis_key).expect("Failed to parse key"); + + let ref_key = CancelKeyData { + backend_pid: 12345, + cancel_key: 54321, + }; + + assert_eq!(key.as_str(), KeyPrefix::Cancel(ref_key).as_str()); + let KeyPrefix::Cancel(cancel_key) = key; + assert_eq!(ref_key, cancel_key); + } +} diff --git a/proxy/src/redis/kv_ops.rs b/proxy/src/redis/kv_ops.rs new file mode 100644 index 
0000000000..dcc6aac51b --- /dev/null +++ b/proxy/src/redis/kv_ops.rs @@ -0,0 +1,185 @@ +use redis::{AsyncCommands, ToRedisArgs}; + +use super::connection_with_credentials_provider::ConnectionWithCredentialsProvider; + +use crate::rate_limiter::{GlobalRateLimiter, RateBucketInfo}; + +pub struct RedisKVClient { + client: ConnectionWithCredentialsProvider, + limiter: GlobalRateLimiter, +} + +impl RedisKVClient { + pub fn new(client: ConnectionWithCredentialsProvider, info: &'static [RateBucketInfo]) -> Self { + Self { + client, + limiter: GlobalRateLimiter::new(info.into()), + } + } + + pub async fn try_connect(&mut self) -> anyhow::Result<()> { + match self.client.connect().await { + Ok(()) => {} + Err(e) => { + tracing::error!("failed to connect to redis: {e}"); + return Err(e); + } + } + Ok(()) + } + + pub(crate) async fn hset(&mut self, key: K, field: F, value: V) -> anyhow::Result<()> + where + K: ToRedisArgs + Send + Sync, + F: ToRedisArgs + Send + Sync, + V: ToRedisArgs + Send + Sync, + { + if !self.limiter.check() { + tracing::info!("Rate limit exceeded. Skipping hset"); + return Err(anyhow::anyhow!("Rate limit exceeded")); + } + + match self.client.hset(&key, &field, &value).await { + Ok(()) => return Ok(()), + Err(e) => { + tracing::error!("failed to set a key-value pair: {e}"); + } + } + + tracing::info!("Redis client is disconnected. Reconnectiong..."); + self.try_connect().await?; + self.client + .hset(key, field, value) + .await + .map_err(anyhow::Error::new) + } + + #[allow(dead_code)] + pub(crate) async fn hset_multiple( + &mut self, + key: &str, + items: &[(K, V)], + ) -> anyhow::Result<()> + where + K: ToRedisArgs + Send + Sync, + V: ToRedisArgs + Send + Sync, + { + if !self.limiter.check() { + tracing::info!("Rate limit exceeded. Skipping hset_multiple"); + return Err(anyhow::anyhow!("Rate limit exceeded")); + } + + match self.client.hset_multiple(key, items).await { + Ok(()) => return Ok(()), + Err(e) => { + tracing::error!("failed to set a key-value pair: {e}"); + } + } + + tracing::info!("Redis client is disconnected. Reconnectiong..."); + self.try_connect().await?; + self.client + .hset_multiple(key, items) + .await + .map_err(anyhow::Error::new) + } + + #[allow(dead_code)] + pub(crate) async fn expire(&mut self, key: K, seconds: i64) -> anyhow::Result<()> + where + K: ToRedisArgs + Send + Sync, + { + if !self.limiter.check() { + tracing::info!("Rate limit exceeded. Skipping expire"); + return Err(anyhow::anyhow!("Rate limit exceeded")); + } + + match self.client.expire(&key, seconds).await { + Ok(()) => return Ok(()), + Err(e) => { + tracing::error!("failed to set a key-value pair: {e}"); + } + } + + tracing::info!("Redis client is disconnected. Reconnectiong..."); + self.try_connect().await?; + self.client + .expire(key, seconds) + .await + .map_err(anyhow::Error::new) + } + + #[allow(dead_code)] + pub(crate) async fn hget(&mut self, key: K, field: F) -> anyhow::Result + where + K: ToRedisArgs + Send + Sync, + F: ToRedisArgs + Send + Sync, + V: redis::FromRedisValue, + { + if !self.limiter.check() { + tracing::info!("Rate limit exceeded. Skipping hget"); + return Err(anyhow::anyhow!("Rate limit exceeded")); + } + + match self.client.hget(&key, &field).await { + Ok(value) => return Ok(value), + Err(e) => { + tracing::error!("failed to get a value: {e}"); + } + } + + tracing::info!("Redis client is disconnected. 
Reconnectiong..."); + self.try_connect().await?; + self.client + .hget(key, field) + .await + .map_err(anyhow::Error::new) + } + + pub(crate) async fn hget_all(&mut self, key: K) -> anyhow::Result + where + K: ToRedisArgs + Send + Sync, + V: redis::FromRedisValue, + { + if !self.limiter.check() { + tracing::info!("Rate limit exceeded. Skipping hgetall"); + return Err(anyhow::anyhow!("Rate limit exceeded")); + } + + match self.client.hgetall(&key).await { + Ok(value) => return Ok(value), + Err(e) => { + tracing::error!("failed to get a value: {e}"); + } + } + + tracing::info!("Redis client is disconnected. Reconnectiong..."); + self.try_connect().await?; + self.client.hgetall(key).await.map_err(anyhow::Error::new) + } + + pub(crate) async fn hdel(&mut self, key: K, field: F) -> anyhow::Result<()> + where + K: ToRedisArgs + Send + Sync, + F: ToRedisArgs + Send + Sync, + { + if !self.limiter.check() { + tracing::info!("Rate limit exceeded. Skipping hdel"); + return Err(anyhow::anyhow!("Rate limit exceeded")); + } + + match self.client.hdel(&key, &field).await { + Ok(()) => return Ok(()), + Err(e) => { + tracing::error!("failed to delete a key-value pair: {e}"); + } + } + + tracing::info!("Redis client is disconnected. Reconnectiong..."); + self.try_connect().await?; + self.client + .hdel(key, field) + .await + .map_err(anyhow::Error::new) + } +} diff --git a/proxy/src/redis/mod.rs b/proxy/src/redis/mod.rs index a322f0368c..8b46a8e6ca 100644 --- a/proxy/src/redis/mod.rs +++ b/proxy/src/redis/mod.rs @@ -1,4 +1,6 @@ pub mod cancellation_publisher; pub mod connection_with_credentials_provider; pub mod elasticache; +pub mod keys; +pub mod kv_ops; pub mod notifications; diff --git a/proxy/src/redis/notifications.rs b/proxy/src/redis/notifications.rs index 63cdf6176c..19fdd3280d 100644 --- a/proxy/src/redis/notifications.rs +++ b/proxy/src/redis/notifications.rs @@ -6,18 +6,14 @@ use pq_proto::CancelKeyData; use redis::aio::PubSub; use serde::{Deserialize, Serialize}; use tokio_util::sync::CancellationToken; -use tracing::Instrument; use uuid::Uuid; use super::connection_with_credentials_provider::ConnectionWithCredentialsProvider; use crate::cache::project_info::ProjectInfoCache; -use crate::cancellation::{CancelMap, CancellationHandler}; -use crate::config::ProxyConfig; use crate::intern::{ProjectIdInt, RoleNameInt}; use crate::metrics::{Metrics, RedisErrors, RedisEventsCount}; const CPLANE_CHANNEL_NAME: &str = "neondb-proxy-ws-updates"; -pub(crate) const PROXY_CHANNEL_NAME: &str = "neondb-proxy-to-proxy-updates"; const RECONNECT_TIMEOUT: std::time::Duration = std::time::Duration::from_secs(20); const INVALIDATION_LAG: std::time::Duration = std::time::Duration::from_secs(20); @@ -25,8 +21,6 @@ async fn try_connect(client: &ConnectionWithCredentialsProvider) -> anyhow::Resu let mut conn = client.get_async_pubsub().await?; tracing::info!("subscribing to a channel `{CPLANE_CHANNEL_NAME}`"); conn.subscribe(CPLANE_CHANNEL_NAME).await?; - tracing::info!("subscribing to a channel `{PROXY_CHANNEL_NAME}`"); - conn.subscribe(PROXY_CHANNEL_NAME).await?; Ok(conn) } @@ -71,8 +65,6 @@ pub(crate) enum Notification { deserialize_with = "deserialize_json_string" )] PasswordUpdate { password_update: PasswordUpdate }, - #[serde(rename = "/cancel_session")] - Cancel(CancelSession), #[serde( other, @@ -138,7 +130,6 @@ where struct MessageHandler { cache: Arc, - cancellation_handler: Arc>, region_id: String, } @@ -146,23 +137,14 @@ impl Clone for MessageHandler { fn clone(&self) -> Self { Self { cache: 
self.cache.clone(), - cancellation_handler: self.cancellation_handler.clone(), region_id: self.region_id.clone(), } } } impl MessageHandler { - pub(crate) fn new( - cache: Arc, - cancellation_handler: Arc>, - region_id: String, - ) -> Self { - Self { - cache, - cancellation_handler, - region_id, - } + pub(crate) fn new(cache: Arc, region_id: String) -> Self { + Self { cache, region_id } } pub(crate) async fn increment_active_listeners(&self) { @@ -207,46 +189,6 @@ impl MessageHandler { tracing::debug!(?msg, "received a message"); match msg { - Notification::Cancel(cancel_session) => { - tracing::Span::current().record( - "session_id", - tracing::field::display(cancel_session.session_id), - ); - Metrics::get() - .proxy - .redis_events_count - .inc(RedisEventsCount::CancelSession); - if let Some(cancel_region) = cancel_session.region_id { - // If the message is not for this region, ignore it. - if cancel_region != self.region_id { - return Ok(()); - } - } - - // TODO: Remove unspecified peer_addr after the complete migration to the new format - let peer_addr = cancel_session - .peer_addr - .unwrap_or(std::net::IpAddr::V4(std::net::Ipv4Addr::UNSPECIFIED)); - let cancel_span = tracing::span!(parent: None, tracing::Level::INFO, "cancel_session", session_id = ?cancel_session.session_id); - cancel_span.follows_from(tracing::Span::current()); - // This instance of cancellation_handler doesn't have a RedisPublisherClient so it can't publish the message. - match self - .cancellation_handler - .cancel_session( - cancel_session.cancel_key_data, - uuid::Uuid::nil(), - peer_addr, - cancel_session.peer_addr.is_some(), - ) - .instrument(cancel_span) - .await - { - Ok(()) => {} - Err(e) => { - tracing::warn!("failed to cancel session: {e}"); - } - } - } Notification::AllowedIpsUpdate { .. } | Notification::PasswordUpdate { .. } | Notification::BlockPublicOrVpcAccessUpdated { .. } @@ -293,7 +235,6 @@ fn invalidate_cache(cache: Arc, msg: Notification) { password_update.project_id, password_update.role_name, ), - Notification::Cancel(_) => unreachable!("cancel message should be handled separately"), Notification::BlockPublicOrVpcAccessUpdated { .. } => { // https://github.com/neondatabase/neon/pull/10073 } @@ -323,8 +264,8 @@ async fn handle_messages( } Err(e) => { tracing::error!( - "failed to connect to redis: {e}, will try to reconnect in {RECONNECT_TIMEOUT:#?}" - ); + "failed to connect to redis: {e}, will try to reconnect in {RECONNECT_TIMEOUT:#?}" + ); tokio::time::sleep(RECONNECT_TIMEOUT).await; continue; } @@ -350,21 +291,14 @@ async fn handle_messages( /// Handle console's invalidation messages. #[tracing::instrument(name = "redis_notifications", skip_all)] pub async fn task_main( - config: &'static ProxyConfig, redis: ConnectionWithCredentialsProvider, cache: Arc, - cancel_map: CancelMap, region_id: String, ) -> anyhow::Result where C: ProjectInfoCache + Send + Sync + 'static, { - let cancellation_handler = Arc::new(CancellationHandler::<()>::new( - &config.connect_to_compute, - cancel_map, - crate::metrics::CancellationSource::FromRedis, - )); - let handler = MessageHandler::new(cache, cancellation_handler, region_id); + let handler = MessageHandler::new(cache, region_id); // 6h - 1m. // There will be 1 minute overlap between two tasks. But at least we can be sure that no message is lost. 
let mut interval = tokio::time::interval(std::time::Duration::from_secs(6 * 60 * 60 - 60)); @@ -442,35 +376,6 @@ mod tests { Ok(()) } - #[test] - fn parse_cancel_session() -> anyhow::Result<()> { - let cancel_key_data = CancelKeyData { - backend_pid: 42, - cancel_key: 41, - }; - let uuid = uuid::Uuid::new_v4(); - let msg = Notification::Cancel(CancelSession { - cancel_key_data, - region_id: None, - session_id: uuid, - peer_addr: None, - }); - let text = serde_json::to_string(&msg)?; - let result: Notification = serde_json::from_str(&text)?; - assert_eq!(msg, result); - - let msg = Notification::Cancel(CancelSession { - cancel_key_data, - region_id: Some("region".to_string()), - session_id: uuid, - peer_addr: None, - }); - let text = serde_json::to_string(&msg)?; - let result: Notification = serde_json::from_str(&text)?; - assert_eq!(msg, result,); - - Ok(()) - } #[test] fn parse_unknown_topic() -> anyhow::Result<()> { diff --git a/proxy/src/serverless/mod.rs b/proxy/src/serverless/mod.rs index c2623e0eca..6888772362 100644 --- a/proxy/src/serverless/mod.rs +++ b/proxy/src/serverless/mod.rs @@ -43,7 +43,7 @@ use tokio_util::task::TaskTracker; use tracing::{info, warn, Instrument}; use utils::http::error::ApiError; -use crate::cancellation::CancellationHandlerMain; +use crate::cancellation::CancellationHandler; use crate::config::{ProxyConfig, ProxyProtocolV2}; use crate::context::RequestContext; use crate::ext::TaskExt; @@ -61,7 +61,7 @@ pub async fn task_main( auth_backend: &'static crate::auth::Backend<'static, ()>, ws_listener: TcpListener, cancellation_token: CancellationToken, - cancellation_handler: Arc, + cancellation_handler: Arc, endpoint_rate_limiter: Arc, ) -> anyhow::Result<()> { scopeguard::defer! { @@ -318,7 +318,7 @@ async fn connection_handler( backend: Arc, connections: TaskTracker, cancellations: TaskTracker, - cancellation_handler: Arc, + cancellation_handler: Arc, endpoint_rate_limiter: Arc, cancellation_token: CancellationToken, conn: AsyncRW, @@ -412,7 +412,7 @@ async fn request_handler( config: &'static ProxyConfig, backend: Arc, ws_connections: TaskTracker, - cancellation_handler: Arc, + cancellation_handler: Arc, session_id: uuid::Uuid, conn_info: ConnectionInfo, // used to cancel in-flight HTTP requests. not used to cancel websockets diff --git a/proxy/src/serverless/websocket.rs b/proxy/src/serverless/websocket.rs index 47326c1181..585a7d63b2 100644 --- a/proxy/src/serverless/websocket.rs +++ b/proxy/src/serverless/websocket.rs @@ -12,7 +12,7 @@ use pin_project_lite::pin_project; use tokio::io::{self, AsyncBufRead, AsyncRead, AsyncWrite, ReadBuf}; use tracing::warn; -use crate::cancellation::CancellationHandlerMain; +use crate::cancellation::CancellationHandler; use crate::config::ProxyConfig; use crate::context::RequestContext; use crate::error::{io_error, ReportableError}; @@ -129,7 +129,7 @@ pub(crate) async fn serve_websocket( auth_backend: &'static crate::auth::Backend<'static, ()>, ctx: RequestContext, websocket: OnUpgrade, - cancellation_handler: Arc, + cancellation_handler: Arc, endpoint_rate_limiter: Arc, hostname: Option, cancellations: tokio_util::task::task_tracker::TaskTracker, From 2f82c21c638578332b61593bdf2fc83fe41de2db Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Wed, 29 Jan 2025 12:55:24 +0000 Subject: [PATCH 38/72] chore: update rust-postgres fork (#10557) I updated the fork to fix some lints. 
Cargo keeps getting confused by it so let's just update the lockfile here --- Cargo.lock | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index c19fdc0941..f14f4cdb82 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4449,7 +4449,7 @@ dependencies = [ [[package]] name = "postgres" version = "0.19.4" -source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#511f998c00148ab7c847bd7e6cfd3a906d0e7473" +source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#547812b5e4972425fc3a9108cf9bae39e41ee000" dependencies = [ "bytes", "fallible-iterator", @@ -4462,7 +4462,7 @@ dependencies = [ [[package]] name = "postgres-protocol" version = "0.6.4" -source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#511f998c00148ab7c847bd7e6cfd3a906d0e7473" +source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#547812b5e4972425fc3a9108cf9bae39e41ee000" dependencies = [ "base64 0.20.0", "byteorder", @@ -4514,9 +4514,10 @@ dependencies = [ [[package]] name = "postgres-types" version = "0.2.4" -source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#511f998c00148ab7c847bd7e6cfd3a906d0e7473" +source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#547812b5e4972425fc3a9108cf9bae39e41ee000" dependencies = [ "bytes", + "chrono", "fallible-iterator", "postgres-protocol 0.6.4", ] @@ -6859,7 +6860,7 @@ dependencies = [ [[package]] name = "tokio-postgres" version = "0.7.7" -source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#511f998c00148ab7c847bd7e6cfd3a906d0e7473" +source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#547812b5e4972425fc3a9108cf9bae39e41ee000" dependencies = [ "async-trait", "byteorder", From 34d9e2d8e33ba9afe1a7f68241fe84a9055a4754 Mon Sep 17 00:00:00 2001 From: a-masterov <72613290+a-masterov@users.noreply.github.com> Date: Wed, 29 Jan 2025 16:01:56 +0100 Subject: [PATCH 39/72] Add a test for GrapgQL (#10156) ## Problem We currently don't run the tests shipped with `pg_graphql`. ## Summary of changes The tests for `pg_graphql` are added. 
--- compute/compute-node.Dockerfile | 4 +++- compute/patches/pg_graphql.patch | 19 +++++++++++++++++++ docker-compose/docker_compose_test.sh | 5 +++-- .../ext-src/pg_graphql-src/neon-test.sh | 13 +++++++++++++ docker-compose/run-tests.sh | 17 ++++++++++------- 5 files changed, 48 insertions(+), 10 deletions(-) create mode 100644 compute/patches/pg_graphql.patch create mode 100755 docker-compose/ext-src/pg_graphql-src/neon-test.sh diff --git a/compute/compute-node.Dockerfile b/compute/compute-node.Dockerfile index 539135470e..9d7aeda590 100644 --- a/compute/compute-node.Dockerfile +++ b/compute/compute-node.Dockerfile @@ -1330,7 +1330,8 @@ COPY --from=vector-pg-build /pgvector.patch /ext-src/ COPY --from=pgjwt-pg-build /pgjwt.tar.gz /ext-src #COPY --from=pgrag-pg-build /usr/local/pgsql/ /usr/local/pgsql/ #COPY --from=pg-jsonschema-pg-build /home/nonroot/pg_jsonschema.tar.gz /ext-src -#COPY --from=pg-graphql-pg-build /home/nonroot/pg_graphql.tar.gz /ext-src +COPY --from=pg-graphql-pg-build /home/nonroot/pg_graphql.tar.gz /ext-src +COPY compute/patches/pg_graphql.patch /ext-src #COPY --from=pg-tiktoken-pg-build /home/nonroot/pg_tiktoken.tar.gz /ext-src COPY --from=hypopg-pg-build /hypopg.tar.gz /ext-src COPY --from=pg-hashids-pg-build /pg_hashids.tar.gz /ext-src @@ -1364,6 +1365,7 @@ RUN cd /ext-src/pgvector-src && patch -p1 <../pgvector.patch RUN cd /ext-src/pg_hint_plan-src && patch -p1 < /ext-src/pg_hint_plan_${PG_VERSION}.patch COPY --chmod=755 docker-compose/run-tests.sh /run-tests.sh RUN patch -p1 +Date: Tue Dec 17 10:25:00 2024 +0100 + + Changes required to run tests on Neon + +diff --git a/test/expected/permissions_functions.out b/test/expected/permissions_functions.out +index 1e9fbc2..94cbe25 100644 +--- a/test/expected/permissions_functions.out ++++ b/test/expected/permissions_functions.out +@@ -64,7 +64,7 @@ begin; + select current_user; + current_user + -------------- +- postgres ++ cloud_admin + (1 row) + + -- revoke default access from the public role for new functions diff --git a/docker-compose/docker_compose_test.sh b/docker-compose/docker_compose_test.sh index f42aca673b..a05d6c043d 100755 --- a/docker-compose/docker_compose_test.sh +++ b/docker-compose/docker_compose_test.sh @@ -31,7 +31,7 @@ for pg_version in ${TEST_VERSION_ONLY-14 15 16 17}; do echo "clean up containers if exists" cleanup PG_TEST_VERSION=$((pg_version < 16 ? 16 : pg_version)) - PG_VERSION=$pg_version PG_TEST_VERSION=$PG_TEST_VERSION docker compose --profile test-extensions -f $COMPOSE_FILE up --build -d + PG_VERSION=$pg_version PG_TEST_VERSION=$PG_TEST_VERSION docker compose --profile test-extensions -f $COMPOSE_FILE up --quiet-pull --build -d echo "wait until the compute is ready. timeout after 60s. " cnt=0 @@ -51,6 +51,7 @@ for pg_version in ${TEST_VERSION_ONLY-14 15 16 17}; do done if [ $pg_version -ge 16 ]; then + docker cp ext-src $TEST_CONTAINER_NAME:/ # This is required for the pg_hint_plan test, to prevent flaky log message causing the test to fail # It cannot be moved to Dockerfile now because the database directory is created after the start of the container echo Adding dummy config @@ -61,7 +62,7 @@ for pg_version in ${TEST_VERSION_ONLY-14 15 16 17}; do docker cp $TMPDIR/data $COMPUTE_CONTAINER_NAME:/ext-src/pg_hint_plan-src/ rm -rf $TMPDIR # We are running tests now - if ! docker exec -e SKIP=timescaledb-src,rdkit-src,postgis-src,pgx_ulid-src,pgtap-src,pg_tiktoken-src,pg_jsonschema-src,pg_graphql-src,kq_imcx-src,wal2json_2_5-src \ + if ! 
docker exec -e SKIP=timescaledb-src,rdkit-src,postgis-src,pgx_ulid-src,pgtap-src,pg_tiktoken-src,pg_jsonschema-src,kq_imcx-src,wal2json_2_5-src \ $TEST_CONTAINER_NAME /run-tests.sh | tee testout.txt then FAILED=$(tail -1 testout.txt) diff --git a/docker-compose/ext-src/pg_graphql-src/neon-test.sh b/docker-compose/ext-src/pg_graphql-src/neon-test.sh new file mode 100755 index 0000000000..38bcb4bfb6 --- /dev/null +++ b/docker-compose/ext-src/pg_graphql-src/neon-test.sh @@ -0,0 +1,13 @@ +#!/bin/bash +set -ex +cd "$(dirname "${0}")" +dropdb --if-exists contrib_regression +createdb contrib_regression +PGXS="$(dirname "$(pg_config --pgxs)" )" +REGRESS="${PGXS}/../test/regress/pg_regress" +TESTDIR="test" +TESTS=$(ls "${TESTDIR}/sql" | sort ) +TESTS=${TESTS//\.sql/} +psql -v ON_ERROR_STOP=1 -f test/fixtures.sql -d contrib_regression +${REGRESS} --use-existing --dbname=contrib_regression --inputdir=${TESTDIR} ${TESTS} + diff --git a/docker-compose/run-tests.sh b/docker-compose/run-tests.sh index 9873187b62..1e794a42a1 100644 --- a/docker-compose/run-tests.sh +++ b/docker-compose/run-tests.sh @@ -4,14 +4,17 @@ set -x cd /ext-src || exit 2 FAILED= LIST=$( (echo -e "${SKIP//","/"\n"}"; ls -d -- *-src) | sort | uniq -u) -for d in ${LIST} -do - [ -d "${d}" ] || continue - if ! psql -w -c "select 1" >/dev/null; then - FAILED="${d} ${FAILED}" - break - fi +for d in ${LIST}; do + [ -d "${d}" ] || continue + if ! psql -w -c "select 1" >/dev/null; then + FAILED="${d} ${FAILED}" + break + fi + if [ -f "${d}/neon-test.sh" ]; then + "${d}/neon-test.sh" || FAILED="${d} ${FAILED}" + else USE_PGXS=1 make -C "${d}" installcheck || FAILED="${d} ${FAILED}" + fi done [ -z "${FAILED}" ] && exit 0 echo "${FAILED}" From 7922458b98c3f8f7519a60b3261d69cf00575a06 Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Wed, 29 Jan 2025 09:45:36 -0600 Subject: [PATCH 40/72] Use num_cpus from the workspace in pageserver (#10545) Luckily they were the same version, so we didn't spend time compiling two versions, which could have been the case in the future. 
Signed-off-by: Tristan Partin --- pageserver/Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml index 9c835c956b..6e4eaa0efd 100644 --- a/pageserver/Cargo.toml +++ b/pageserver/Cargo.toml @@ -36,7 +36,7 @@ itertools.workspace = true md5.workspace = true nix.workspace = true # hack to get the number of worker threads tokio uses -num_cpus = { version = "1.15" } +num_cpus.workspace = true num-traits.workspace = true once_cell.workspace = true pin-project-lite.workspace = true From 34e560fe373821114049dd97144d80d31171b13b Mon Sep 17 00:00:00 2001 From: Mikhail Kot Date: Wed, 29 Jan 2025 15:52:00 +0000 Subject: [PATCH 41/72] download exporters from releases rather than using docker images (#10551) Use releases for postgres-exporter, pgbouncer-exporter, and sql-exporter --- compute/compute-node.Dockerfile | 40 +++++++++++++++++++++++---------- 1 file changed, 28 insertions(+), 12 deletions(-) diff --git a/compute/compute-node.Dockerfile b/compute/compute-node.Dockerfile index 9d7aeda590..7ac6e9bc58 100644 --- a/compute/compute-node.Dockerfile +++ b/compute/compute-node.Dockerfile @@ -5,6 +5,7 @@ ARG TAG=pinned ARG BUILD_TAG ARG DEBIAN_VERSION=bookworm ARG DEBIAN_FLAVOR=${DEBIAN_VERSION}-slim +ARG ALPINE_CURL_VERSION=8.11.1 ######################################################################################### # @@ -1266,16 +1267,31 @@ RUN set -e \ ######################################################################################### # -# Layers "postgres-exporter", "pgbouncer-exporter", and "sql-exporter" +# Layer "exporters" # ######################################################################################### - -FROM quay.io/prometheuscommunity/postgres-exporter:v0.16.0 AS postgres-exporter -FROM quay.io/prometheuscommunity/pgbouncer-exporter:v0.10.2 AS pgbouncer-exporter - -# Keep the version the same as in build-tools.Dockerfile and -# test_runner/regress/test_compute_metrics.py. 
-FROM burningalchemist/sql_exporter:0.17.0 AS sql-exporter +FROM alpine/curl:${ALPINE_CURL_VERSION} AS exporters +ARG TARGETARCH +# Keep sql_exporter version same as in build-tools.Dockerfile and +# test_runner/regress/test_compute_metrics.py +RUN if [ "$TARGETARCH" = "amd64" ]; then\ + postgres_exporter_sha256='027e75dda7af621237ff8f5ac66b78a40b0093595f06768612b92b1374bd3105';\ + pgbouncer_exporter_sha256='c9f7cf8dcff44f0472057e9bf52613d93f3ffbc381ad7547a959daa63c5e84ac';\ + sql_exporter_sha256='38e439732bbf6e28ca4a94d7bc3686d3fa1abdb0050773d5617a9efdb9e64d08';\ + else\ + postgres_exporter_sha256='131a376d25778ff9701a4c81f703f179e0b58db5c2c496e66fa43f8179484786';\ + pgbouncer_exporter_sha256='217c4afd7e6492ae904055bc14fe603552cf9bac458c063407e991d68c519da3';\ + sql_exporter_sha256='11918b00be6e2c3a67564adfdb2414fdcbb15a5db76ea17d1d1a944237a893c6';\ + fi\ + && curl -sL https://github.com/prometheus-community/postgres_exporter/releases/download/v0.16.0/postgres_exporter-0.16.0.linux-${TARGETARCH}.tar.gz\ + | tar xzf - --strip-components=1 -C.\ + && curl -sL https://github.com/prometheus-community/pgbouncer_exporter/releases/download/v0.10.2/pgbouncer_exporter-0.10.2.linux-${TARGETARCH}.tar.gz\ + | tar xzf - --strip-components=1 -C.\ + && curl -sL https://github.com/burningalchemist/sql_exporter/releases/download/0.17.0/sql_exporter-0.17.0.linux-${TARGETARCH}.tar.gz\ + | tar xzf - --strip-components=1 -C.\ + && echo "${postgres_exporter_sha256} postgres_exporter" | sha256sum -c -\ + && echo "${pgbouncer_exporter_sha256} pgbouncer_exporter" | sha256sum -c -\ + && echo "${sql_exporter_sha256} sql_exporter" | sha256sum -c - ######################################################################################### # @@ -1403,10 +1419,10 @@ COPY --chmod=0666 --chown=postgres compute/etc/pgbouncer.ini /etc/pgbouncer.ini COPY --from=compute-tools --chown=postgres /home/nonroot/target/release-line-debug-size-lto/local_proxy /usr/local/bin/local_proxy RUN mkdir -p /etc/local_proxy && chown postgres:postgres /etc/local_proxy -# Metrics exporter binaries and configuration files -COPY --from=postgres-exporter /bin/postgres_exporter /bin/postgres_exporter -COPY --from=pgbouncer-exporter /bin/pgbouncer_exporter /bin/pgbouncer_exporter -COPY --from=sql-exporter /bin/sql_exporter /bin/sql_exporter +# Metrics exporter binaries and configuration files +COPY --from=exporters ./postgres_exporter /bin/postgres_exporter +COPY --from=exporters ./pgbouncer_exporter /bin/pgbouncer_exporter +COPY --from=exporters ./sql_exporter /bin/sql_exporter COPY --chown=postgres compute/etc/postgres_exporter.yml /etc/postgres_exporter.yml From 190c19c0344b3720f62e80679634872d390aaa3a Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Wed, 29 Jan 2025 17:02:07 +0000 Subject: [PATCH 42/72] chore: update rust-postgres on rebase (#10561) I tried a full update of our tokio-postgres fork before. We hit some breaking change. This PR only pulls in ~50% of the changes from upstream: https://github.com/neondatabase/rust-postgres/pull/38. 
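For context, a lockfile-only bump like this one can be produced without touching any `Cargo.toml`; the sketch below shows one way to do it and is an illustration rather than the exact command used for this change. The crate names and versions are taken from the lockfile diff; the `@version` qualifier is only needed because several versions of these crates coexist in `Cargo.lock`.

```
# Re-resolve the git-sourced rust-postgres crates; for a dependency pinned
# to a branch, this moves the locked commit to the current branch head.
# (Illustrative only -- not necessarily the invocation used for this patch.)
cargo update -p tokio-postgres@0.7.7 -p postgres@0.19.4 \
             -p postgres-types@0.2.4 -p postgres-protocol@0.6.4

# Confirm that only Cargo.lock changed before committing.
git diff --stat
```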
--- Cargo.lock | 58 ++++++++++++++++++++++++++++-------------------------- 1 file changed, 30 insertions(+), 28 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index f14f4cdb82..9ba90355df 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1312,7 +1312,7 @@ dependencies = [ "tar", "thiserror 1.0.69", "tokio", - "tokio-postgres 0.7.7", + "tokio-postgres 0.7.9", "tokio-stream", "tokio-util", "tower 0.5.2", @@ -1421,7 +1421,7 @@ dependencies = [ "storage_broker", "thiserror 1.0.69", "tokio", - "tokio-postgres 0.7.7", + "tokio-postgres 0.7.9", "tokio-util", "toml", "toml_edit", @@ -4060,8 +4060,8 @@ dependencies = [ "pageserver_compaction", "pin-project-lite", "postgres", - "postgres-protocol 0.6.4", - "postgres-types 0.2.4", + "postgres-protocol 0.6.6", + "postgres-types 0.2.6", "postgres_backend", "postgres_connection", "postgres_ffi", @@ -4092,7 +4092,7 @@ dependencies = [ "tokio", "tokio-epoll-uring", "tokio-io-timeout", - "tokio-postgres 0.7.7", + "tokio-postgres 0.7.9", "tokio-stream", "tokio-tar", "tokio-util", @@ -4150,7 +4150,7 @@ dependencies = [ "serde", "thiserror 1.0.69", "tokio", - "tokio-postgres 0.7.7", + "tokio-postgres 0.7.9", "tokio-stream", "tokio-util", "utils", @@ -4448,23 +4448,23 @@ dependencies = [ [[package]] name = "postgres" -version = "0.19.4" -source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#547812b5e4972425fc3a9108cf9bae39e41ee000" +version = "0.19.6" +source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#8b44892f7851e705810b2cb54504325699966070" dependencies = [ "bytes", "fallible-iterator", "futures-util", "log", "tokio", - "tokio-postgres 0.7.7", + "tokio-postgres 0.7.9", ] [[package]] name = "postgres-protocol" -version = "0.6.4" -source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#547812b5e4972425fc3a9108cf9bae39e41ee000" +version = "0.6.6" +source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#8b44892f7851e705810b2cb54504325699966070" dependencies = [ - "base64 0.20.0", + "base64 0.21.1", "byteorder", "bytes", "fallible-iterator", @@ -4513,13 +4513,13 @@ dependencies = [ [[package]] name = "postgres-types" -version = "0.2.4" -source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#547812b5e4972425fc3a9108cf9bae39e41ee000" +version = "0.2.6" +source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#8b44892f7851e705810b2cb54504325699966070" dependencies = [ "bytes", "chrono", "fallible-iterator", - "postgres-protocol 0.6.4", + "postgres-protocol 0.6.6", ] [[package]] @@ -4555,7 +4555,7 @@ dependencies = [ "serde", "thiserror 1.0.69", "tokio", - "tokio-postgres 0.7.7", + "tokio-postgres 0.7.9", "tokio-postgres-rustls", "tokio-rustls 0.26.0", "tokio-util", @@ -4570,7 +4570,7 @@ dependencies = [ "itertools 0.10.5", "once_cell", "postgres", - "tokio-postgres 0.7.7", + "tokio-postgres 0.7.9", "url", ] @@ -4664,7 +4664,7 @@ dependencies = [ "byteorder", "bytes", "itertools 0.10.5", - "postgres-protocol 0.6.4", + "postgres-protocol 0.6.6", "rand 0.8.5", "serde", "thiserror 1.0.69", @@ -4912,7 +4912,7 @@ dependencies = [ "tikv-jemalloc-ctl", "tikv-jemallocator", "tokio", - "tokio-postgres 0.7.7", + "tokio-postgres 0.7.9", "tokio-postgres2", "tokio-rustls 0.26.0", "tokio-tungstenite 0.21.0", @@ -5700,7 +5700,7 @@ dependencies = [ "pageserver_api", "parking_lot 0.12.1", "postgres", - "postgres-protocol 0.6.4", + "postgres-protocol 0.6.6", "postgres_backend", "postgres_ffi", "pprof", @@ -5724,7 +5724,7 @@ dependencies = [ "tikv-jemallocator", 
"tokio", "tokio-io-timeout", - "tokio-postgres 0.7.7", + "tokio-postgres 0.7.9", "tokio-stream", "tokio-tar", "tokio-util", @@ -6394,7 +6394,7 @@ dependencies = [ "serde_json", "storage_controller_client", "tokio", - "tokio-postgres 0.7.7", + "tokio-postgres 0.7.9", "tokio-postgres-rustls", "tokio-stream", "tokio-util", @@ -6859,8 +6859,8 @@ dependencies = [ [[package]] name = "tokio-postgres" -version = "0.7.7" -source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#547812b5e4972425fc3a9108cf9bae39e41ee000" +version = "0.7.9" +source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#8b44892f7851e705810b2cb54504325699966070" dependencies = [ "async-trait", "byteorder", @@ -6873,11 +6873,13 @@ dependencies = [ "percent-encoding", "phf", "pin-project-lite", - "postgres-protocol 0.6.4", - "postgres-types 0.2.4", + "postgres-protocol 0.6.6", + "postgres-types 0.2.6", + "rand 0.8.5", "socket2", "tokio", "tokio-util", + "whoami", ] [[package]] @@ -6915,7 +6917,7 @@ dependencies = [ "ring", "rustls 0.23.18", "tokio", - "tokio-postgres 0.7.7", + "tokio-postgres 0.7.9", "tokio-rustls 0.26.0", "x509-certificate", ] @@ -7593,7 +7595,7 @@ dependencies = [ "serde_json", "sysinfo", "tokio", - "tokio-postgres 0.7.7", + "tokio-postgres 0.7.9", "tokio-util", "tracing", "tracing-subscriber", From fdfbc7b358585a6b0ee013f0fa66ada3a0979eb0 Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Wed, 29 Jan 2025 17:08:25 +0000 Subject: [PATCH 43/72] pageserver: hold GC while reading from a timeline (#10559) ## Problem If we are GC-ing because a new image layer was added while traversing the timeline, then it will remove layers that are required for fulfilling the current get request (read-path cannot "look back" and notice the new image layer). ## Summary of Changes Prevent GC from progressing on the current timeline while it is being visited for a read. Epic: https://github.com/neondatabase/neon/issues/9376 --- pageserver/src/tenant/timeline.rs | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index f3cdad82d9..24bc7890c6 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -3468,6 +3468,13 @@ impl Timeline { let mut completed_keyspace = KeySpace::default(); let mut image_covered_keyspace = KeySpaceRandomAccum::new(); + // Prevent GC from progressing while visiting the current timeline. + // If we are GC-ing because a new image layer was added while traversing + // the timeline, then it will remove layers that are required for fulfilling + // the current get request (read-path cannot "look back" and notice the new + // image layer). + let _gc_cutoff_holder = timeline.get_latest_gc_cutoff_lsn(); + loop { if cancel.is_cancelled() { return Err(GetVectoredError::Cancelled); From 34322b2424e9f00a4d8f4b07ce23464012dda151 Mon Sep 17 00:00:00 2001 From: Alexey Kondratov Date: Wed, 29 Jan 2025 19:09:25 +0100 Subject: [PATCH 44/72] chore(compute): Simplify new compute_ctl metrics and fix flaky test (#10560) ## Problem 1. d04d924 added separate metrics for total requests and failures separately, but it doesn't make much sense. We could just have a unified counter with `http_status`. 2. `test_compute_migrations_retry` had a race, i.e., it was waiting for the last successful migration, not an actual failure. This was revealed after adding an assert on failure metric in d04d924. ## Summary of changes 1. Switch to unified counters for `compute_ctl` requests. 2. 
Add a waiting loop into `test_compute_migrations_retry` to eliminate the race. Part of neondatabase/cloud#17590 --- compute_tools/src/extension_server.rs | 26 +++++----- compute_tools/src/metrics.rs | 26 ++-------- compute_tools/src/spec.rs | 50 +++++++++---------- .../regress/test_compute_migrations.py | 33 +++++++----- 4 files changed, 61 insertions(+), 74 deletions(-) diff --git a/compute_tools/src/extension_server.rs b/compute_tools/src/extension_server.rs index fa638c74b3..64c338f4d7 100644 --- a/compute_tools/src/extension_server.rs +++ b/compute_tools/src/extension_server.rs @@ -85,7 +85,7 @@ use tracing::info; use tracing::log::warn; use zstd::stream::read::Decoder; -use crate::metrics::{REMOTE_EXT_REQUESTS_FAILED, REMOTE_EXT_REQUESTS_TOTAL, UNKNOWN_HTTP_STATUS}; +use crate::metrics::{REMOTE_EXT_REQUESTS_TOTAL, UNKNOWN_HTTP_STATUS}; fn get_pg_config(argument: &str, pgbin: &str) -> String { // gives the result of `pg_config [argument]` @@ -260,22 +260,20 @@ async fn download_extension_tar(ext_remote_storage: &str, ext_path: &str) -> Res info!("Download extension {:?} from uri {:?}", ext_path, uri); - REMOTE_EXT_REQUESTS_TOTAL.with_label_values(&[]).inc(); - match do_extension_server_request(&uri).await { Ok(resp) => { info!( "Successfully downloaded remote extension data {:?}", ext_path ); + REMOTE_EXT_REQUESTS_TOTAL + .with_label_values(&[&StatusCode::OK.to_string()]) + .inc(); Ok(resp) } Err((msg, status)) => { - let status_str = status - .map(|s| s.to_string()) - .unwrap_or(UNKNOWN_HTTP_STATUS.to_string()); - REMOTE_EXT_REQUESTS_FAILED - .with_label_values(&[&status_str]) + REMOTE_EXT_REQUESTS_TOTAL + .with_label_values(&[&status]) .inc(); bail!(msg); } @@ -283,12 +281,12 @@ async fn download_extension_tar(ext_remote_storage: &str, ext_path: &str) -> Res } // Do a single remote extensions server request. -// Return result or (error message + status code) in case of any failures. -async fn do_extension_server_request(uri: &str) -> Result)> { +// Return result or (error message + stringified status code) in case of any failures. 
+async fn do_extension_server_request(uri: &str) -> Result { let resp = reqwest::get(uri).await.map_err(|e| { ( format!("could not perform remote extensions server request: {}", e), - None, + UNKNOWN_HTTP_STATUS.to_string(), ) })?; let status = resp.status(); @@ -300,19 +298,19 @@ async fn do_extension_server_request(uri: &str) -> Result Err(( "remote extensions server is temporarily unavailable".to_string(), - Some(status), + status.to_string(), )), _ => Err(( format!( "unexpected remote extensions server response status code: {}", status ), - Some(status), + status.to_string(), )), } } diff --git a/compute_tools/src/metrics.rs b/compute_tools/src/metrics.rs index 3684338571..870b294d08 100644 --- a/compute_tools/src/metrics.rs +++ b/compute_tools/src/metrics.rs @@ -32,16 +32,7 @@ pub const UNKNOWN_HTTP_STATUS: &str = "unknown"; pub(crate) static CPLANE_REQUESTS_TOTAL: Lazy = Lazy::new(|| { register_int_counter_vec!( "compute_ctl_cplane_requests_total", - "Total number of control plane requests made by compute_ctl", - &["rpc"] - ) - .expect("failed to define a metric") -}); - -pub(crate) static CPLANE_REQUESTS_FAILED: Lazy = Lazy::new(|| { - register_int_counter_vec!( - "compute_ctl_cplane_requests_failed_total", - "Total number of failed control plane requests made by compute_ctl", + "Total number of control plane requests made by compute_ctl by status", &["rpc", "http_status"] ) .expect("failed to define a metric") @@ -62,18 +53,9 @@ pub(crate) static DB_MIGRATION_FAILED: Lazy = Lazy::new(|| { pub(crate) static REMOTE_EXT_REQUESTS_TOTAL: Lazy = Lazy::new(|| { register_int_counter_vec!( "compute_ctl_remote_ext_requests_total", - "Total number of requests made by compute_ctl to download extensions from S3 proxy", + "Total number of requests made by compute_ctl to download extensions from S3 proxy by status", // Do not use any labels like extension name yet. // We can add them later if needed. 
- &[] - ) - .expect("failed to define a metric") -}); - -pub(crate) static REMOTE_EXT_REQUESTS_FAILED: Lazy = Lazy::new(|| { - register_int_counter_vec!( - "compute_ctl_remote_ext_requests_failed_total", - "Total number of failed requests to S3 proxy", &["http_status"] ) .expect("failed to define a metric") @@ -82,9 +64,7 @@ pub(crate) static REMOTE_EXT_REQUESTS_FAILED: Lazy = Lazy::new(|| pub fn collect() -> Vec { let mut metrics = INSTALLED_EXTENSIONS.collect(); metrics.extend(CPLANE_REQUESTS_TOTAL.collect()); - metrics.extend(CPLANE_REQUESTS_FAILED.collect()); - metrics.extend(DB_MIGRATION_FAILED.collect()); metrics.extend(REMOTE_EXT_REQUESTS_TOTAL.collect()); - metrics.extend(REMOTE_EXT_REQUESTS_FAILED.collect()); + metrics.extend(DB_MIGRATION_FAILED.collect()); metrics } diff --git a/compute_tools/src/spec.rs b/compute_tools/src/spec.rs index 01de13811f..43a820885b 100644 --- a/compute_tools/src/spec.rs +++ b/compute_tools/src/spec.rs @@ -6,9 +6,7 @@ use std::path::Path; use tracing::{error, info, instrument, warn}; use crate::config; -use crate::metrics::{ - CPlaneRequestRPC, CPLANE_REQUESTS_FAILED, CPLANE_REQUESTS_TOTAL, UNKNOWN_HTTP_STATUS, -}; +use crate::metrics::{CPlaneRequestRPC, CPLANE_REQUESTS_TOTAL, UNKNOWN_HTTP_STATUS}; use crate::migration::MigrationRunner; use crate::params::PG_HBA_ALL_MD5; use crate::pg_helpers::*; @@ -22,7 +20,7 @@ use compute_api::spec::ComputeSpec; fn do_control_plane_request( uri: &str, jwt: &str, -) -> Result)> { +) -> Result { let resp = reqwest::blocking::Client::new() .get(uri) .header("Authorization", format!("Bearer {}", jwt)) @@ -31,7 +29,7 @@ fn do_control_plane_request( ( true, format!("could not perform spec request to control plane: {}", e), - None, + UNKNOWN_HTTP_STATUS.to_string(), ) })?; @@ -42,13 +40,13 @@ fn do_control_plane_request( Err(e) => Err(( true, format!("could not deserialize control plane response: {}", e), - Some(status), + status.to_string(), )), }, StatusCode::SERVICE_UNAVAILABLE => Err(( true, "control plane is temporarily unavailable".to_string(), - Some(status), + status.to_string(), )), StatusCode::BAD_GATEWAY => { // We have a problem with intermittent 502 errors now @@ -57,7 +55,7 @@ fn do_control_plane_request( Err(( true, "control plane request failed with 502".to_string(), - Some(status), + status.to_string(), )) } // Another code, likely 500 or 404, means that compute is unknown to the control plane @@ -65,7 +63,7 @@ fn do_control_plane_request( _ => Err(( false, format!("unexpected control plane response status code: {}", status), - Some(status), + status.to_string(), )), } } @@ -92,26 +90,28 @@ pub fn get_spec_from_control_plane( // - no spec for compute yet (Empty state) -> return Ok(None) // - got spec -> return Ok(Some(spec)) while attempt < 4 { - CPLANE_REQUESTS_TOTAL - .with_label_values(&[CPlaneRequestRPC::GetSpec.as_str()]) - .inc(); spec = match do_control_plane_request(&cp_uri, &jwt) { - Ok(spec_resp) => match spec_resp.status { - ControlPlaneComputeStatus::Empty => Ok(None), - ControlPlaneComputeStatus::Attached => { - if let Some(spec) = spec_resp.spec { - Ok(Some(spec)) - } else { - bail!("compute is attached, but spec is empty") + Ok(spec_resp) => { + CPLANE_REQUESTS_TOTAL + .with_label_values(&[ + CPlaneRequestRPC::GetSpec.as_str(), + &StatusCode::OK.to_string(), + ]) + .inc(); + match spec_resp.status { + ControlPlaneComputeStatus::Empty => Ok(None), + ControlPlaneComputeStatus::Attached => { + if let Some(spec) = spec_resp.spec { + Ok(Some(spec)) + } else { + bail!("compute is attached, but 
spec is empty") + } } } - }, + } Err((retry, msg, status)) => { - let status_str = status - .map(|s| s.to_string()) - .unwrap_or(UNKNOWN_HTTP_STATUS.to_string()); - CPLANE_REQUESTS_FAILED - .with_label_values(&[CPlaneRequestRPC::GetSpec.as_str(), &status_str]) + CPLANE_REQUESTS_TOTAL + .with_label_values(&[CPlaneRequestRPC::GetSpec.as_str(), &status]) .inc(); if retry { Err(anyhow!(msg)) diff --git a/test_runner/regress/test_compute_migrations.py b/test_runner/regress/test_compute_migrations.py index ec2e38f021..0dbb187c39 100644 --- a/test_runner/regress/test_compute_migrations.py +++ b/test_runner/regress/test_compute_migrations.py @@ -6,6 +6,7 @@ from typing import TYPE_CHECKING, cast import pytest from fixtures.compute_migrations import COMPUTE_MIGRATIONS, NUM_COMPUTE_MIGRATIONS from fixtures.metrics import parse_metrics +from fixtures.utils import wait_until if TYPE_CHECKING: from fixtures.neon_fixtures import NeonEnv @@ -24,7 +25,26 @@ def test_compute_migrations_retry(neon_simple_env: NeonEnv, compute_migrations_d for i in range(1, NUM_COMPUTE_MIGRATIONS + 1): endpoint.start(env={"FAILPOINTS": f"compute-migration=return({i})"}) - # Make sure that the migrations ran + # Check that migration failure is properly recorded in the metrics + # + # N.B. wait_for_migrations() only waits till the last successful + # migration is applied. It doesn't wait till the migration failure due + # to the failpoint. This opens a race for checking the metrics. To avoid + # this, we first wait until the migration failure metric is seen. + def check_migration_failure_metrics(): + client = endpoint.http_client() + raw_metrics = client.metrics() + metrics = parse_metrics(raw_metrics) + failed_migration = metrics.query_all( + "compute_ctl_db_migration_failed_total", + ) + assert len(failed_migration) == 1 + for sample in failed_migration: + assert sample.value == 1 + + wait_until(check_migration_failure_metrics) + + # Make sure that all migrations before the failed one are applied endpoint.wait_for_migrations(wait_for=i - 1) # Confirm that we correctly recorded that in the @@ -34,17 +54,6 @@ def test_compute_migrations_retry(neon_simple_env: NeonEnv, compute_migrations_d migration_id = cast("int", cur.fetchall()[0][0]) assert migration_id == i - 1 - # Check that migration failure is properly recorded in the metrics - client = endpoint.http_client() - raw_metrics = client.metrics() - metrics = parse_metrics(raw_metrics) - failed_migration = metrics.query_all( - "compute_ctl_db_migration_failed_total", - ) - assert len(failed_migration) == 1 - for sample in failed_migration: - assert sample.value == 1 - endpoint.stop() endpoint.start() From 5bcefb4ee13b0ac5c04747a8b61b56dacdb65984 Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com> Date: Wed, 29 Jan 2025 13:43:39 -0500 Subject: [PATCH 45/72] fix(pageserver): compaction perftest wrt upper limit (#10564) ## Problem The config is added in https://github.com/neondatabase/neon/pull/10550 causing behavior change for l0 compaction. close https://github.com/neondatabase/neon/issues/10562 ## Summary of changes Fix the test case to consider the effect of upper_limit. 
Signed-off-by: Alex Chi Z --- pageserver/src/tenant/timeline/compaction.rs | 2 +- test_runner/performance/test_compaction.py | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index 76dcc159ea..589aea18b4 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -1112,7 +1112,7 @@ impl Timeline { // Accumulate the size of layers in `deltas_to_compact` let mut deltas_to_compact_bytes = 0; - // Under normal circumstances, we will accumulate up to compaction_interval L0s of size + // Under normal circumstances, we will accumulate up to compaction_upper_limit L0s of size // checkpoint_distance each. To avoid edge cases using extra system resources, bound our // work in this function to only operate on this much delta data at once. let delta_size_limit = self.get_compaction_upper_limit() as u64 diff --git a/test_runner/performance/test_compaction.py b/test_runner/performance/test_compaction.py index 0cd1080fa7..eaa89ae754 100644 --- a/test_runner/performance/test_compaction.py +++ b/test_runner/performance/test_compaction.py @@ -75,6 +75,7 @@ def test_compaction_l0_memory(neon_compare: NeonCompare): # Initially disable compaction so that we will build up a stack of L0s "compaction_period": "0s", "gc_period": "0s", + "compaction_upper_limit": 12, } ) neon_compare.tenant = tenant_id @@ -91,6 +92,7 @@ def test_compaction_l0_memory(neon_compare: NeonCompare): tenant_conf = pageserver_http.tenant_config(tenant_id) assert tenant_conf.effective_config["checkpoint_distance"] == 256 * 1024 * 1024 assert tenant_conf.effective_config["compaction_threshold"] == 10 + assert tenant_conf.effective_config["compaction_upper_limit"] == 12 # Aim to write about 20 L0s, so that we will hit the limit on how many # to compact at once From 707a9260573c58b908f789c5e92301f9e5ff77ab Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Wed, 29 Jan 2025 13:22:01 -0600 Subject: [PATCH 46/72] Remove unused compute_ctl HTTP routes (#10544) These are not used anywhere within the platform, so let's remove dead code. Signed-off-by: Tristan Partin --- compute_tools/src/http/openapi_spec.yaml | 29 ---- compute_tools/src/http/routes/info.rs | 11 -- .../src/http/routes/installed_extensions.rs | 33 ---- compute_tools/src/http/routes/mod.rs | 2 - compute_tools/src/http/server.rs | 8 +- libs/compute_api/src/responses.rs | 5 - test_runner/fixtures/endpoint/http.py | 5 - test_runner/regress/test_compute_metrics.py | 91 +++++++++++ .../regress/test_installed_extensions.py | 154 ------------------ 9 files changed, 92 insertions(+), 246 deletions(-) delete mode 100644 compute_tools/src/http/routes/info.rs delete mode 100644 compute_tools/src/http/routes/installed_extensions.rs delete mode 100644 test_runner/regress/test_installed_extensions.py diff --git a/compute_tools/src/http/openapi_spec.yaml b/compute_tools/src/http/openapi_spec.yaml index 50319cdd85..bbdb7d0917 100644 --- a/compute_tools/src/http/openapi_spec.yaml +++ b/compute_tools/src/http/openapi_spec.yaml @@ -68,35 +68,6 @@ paths: schema: $ref: "#/components/schemas/ComputeInsights" - /installed_extensions: - get: - tags: - - Info - summary: Get installed extensions. 
- description: "" - operationId: getInstalledExtensions - responses: - 200: - description: List of installed extensions - content: - application/json: - schema: - $ref: "#/components/schemas/InstalledExtensions" - /info: - get: - tags: - - Info - summary: Get info about the compute pod / VM. - description: "" - operationId: getInfo - responses: - 200: - description: Info - content: - application/json: - schema: - $ref: "#/components/schemas/Info" - /dbs_and_roles: get: tags: diff --git a/compute_tools/src/http/routes/info.rs b/compute_tools/src/http/routes/info.rs deleted file mode 100644 index 32d6fea74c..0000000000 --- a/compute_tools/src/http/routes/info.rs +++ /dev/null @@ -1,11 +0,0 @@ -use axum::response::Response; -use compute_api::responses::InfoResponse; -use http::StatusCode; - -use crate::http::JsonResponse; - -/// Get information about the physical characteristics about the compute. -pub(in crate::http) async fn get_info() -> Response { - let num_cpus = num_cpus::get_physical(); - JsonResponse::success(StatusCode::OK, &InfoResponse { num_cpus }) -} diff --git a/compute_tools/src/http/routes/installed_extensions.rs b/compute_tools/src/http/routes/installed_extensions.rs deleted file mode 100644 index db74a6b195..0000000000 --- a/compute_tools/src/http/routes/installed_extensions.rs +++ /dev/null @@ -1,33 +0,0 @@ -use std::sync::Arc; - -use axum::{extract::State, response::Response}; -use compute_api::responses::ComputeStatus; -use http::StatusCode; -use tokio::task; - -use crate::{compute::ComputeNode, http::JsonResponse, installed_extensions}; - -/// Get a list of installed extensions. -pub(in crate::http) async fn get_installed_extensions( - State(compute): State>, -) -> Response { - let status = compute.get_status(); - if status != ComputeStatus::Running { - return JsonResponse::invalid_status(status); - } - - let conf = compute.get_conn_conf(None); - let res = task::spawn_blocking(move || installed_extensions::get_installed_extensions(conf)) - .await - .unwrap(); - - match res { - Ok(installed_extensions) => { - JsonResponse::success(StatusCode::OK, Some(installed_extensions)) - } - Err(e) => JsonResponse::error( - StatusCode::INTERNAL_SERVER_ERROR, - format!("failed to get list of installed extensions: {e}"), - ), - } -} diff --git a/compute_tools/src/http/routes/mod.rs b/compute_tools/src/http/routes/mod.rs index 3efa1153ad..a67be7fd5a 100644 --- a/compute_tools/src/http/routes/mod.rs +++ b/compute_tools/src/http/routes/mod.rs @@ -10,9 +10,7 @@ pub(in crate::http) mod extension_server; pub(in crate::http) mod extensions; pub(in crate::http) mod failpoints; pub(in crate::http) mod grants; -pub(in crate::http) mod info; pub(in crate::http) mod insights; -pub(in crate::http) mod installed_extensions; pub(in crate::http) mod metrics; pub(in crate::http) mod metrics_json; pub(in crate::http) mod status; diff --git a/compute_tools/src/http/server.rs b/compute_tools/src/http/server.rs index da650585fc..e41ed9df2d 100644 --- a/compute_tools/src/http/server.rs +++ b/compute_tools/src/http/server.rs @@ -22,8 +22,7 @@ use uuid::Uuid; use super::routes::{ check_writability, configure, database_schema, dbs_and_roles, extension_server, extensions, - grants, info as info_route, insights, installed_extensions, metrics, metrics_json, status, - terminate, + grants, insights, metrics, metrics_json, status, terminate, }; use crate::compute::ComputeNode; @@ -60,12 +59,7 @@ async fn serve(port: u16, compute: Arc) { ) .route("/extensions", post(extensions::install_extension)) .route("/grants", 
post(grants::add_grant)) - .route("/info", get(info_route::get_info)) .route("/insights", get(insights::get_insights)) - .route( - "/installed_extensions", - get(installed_extensions::get_installed_extensions), - ) .route("/metrics", get(metrics::get_metrics)) .route("/metrics.json", get(metrics_json::get_metrics)) .route("/status", get(status::get_status)) diff --git a/libs/compute_api/src/responses.rs b/libs/compute_api/src/responses.rs index 9ce605089b..5286e0e61d 100644 --- a/libs/compute_api/src/responses.rs +++ b/libs/compute_api/src/responses.rs @@ -15,11 +15,6 @@ pub struct GenericAPIError { pub error: String, } -#[derive(Debug, Clone, Serialize)] -pub struct InfoResponse { - pub num_cpus: usize, -} - #[derive(Debug, Clone, Serialize)] pub struct ExtensionInstallResponse { pub extension: PgIdent, diff --git a/test_runner/fixtures/endpoint/http.py b/test_runner/fixtures/endpoint/http.py index aa0d95fe80..6e8210e978 100644 --- a/test_runner/fixtures/endpoint/http.py +++ b/test_runner/fixtures/endpoint/http.py @@ -28,11 +28,6 @@ class EndpointHttpClient(requests.Session): res.raise_for_status() return res.text - def installed_extensions(self): - res = self.get(f"http://localhost:{self.port}/installed_extensions") - res.raise_for_status() - return res.json() - def extensions(self, extension: str, version: str, database: str): body = { "extension": extension, diff --git a/test_runner/regress/test_compute_metrics.py b/test_runner/regress/test_compute_metrics.py index 5dcc93acff..99d41e410a 100644 --- a/test_runner/regress/test_compute_metrics.py +++ b/test_runner/regress/test_compute_metrics.py @@ -5,16 +5,22 @@ import os import shutil import sys from enum import StrEnum +from logging import debug from pathlib import Path from typing import TYPE_CHECKING, cast import pytest import requests import yaml +from fixtures.endpoint.http import EndpointHttpClient from fixtures.log_helper import log +from fixtures.metrics import parse_metrics from fixtures.paths import BASE_DIR, COMPUTE_CONFIG_DIR +from fixtures.utils import wait_until +from prometheus_client.samples import Sample if TYPE_CHECKING: + from collections.abc import Callable from types import TracebackType from typing import Self, TypedDict @@ -467,3 +473,88 @@ def test_perf_counters(neon_simple_env: NeonEnv): cur.execute("CREATE EXTENSION neon VERSION '1.5'") cur.execute("SELECT * FROM neon_perf_counters") cur.execute("SELECT * FROM neon_backend_perf_counters") + + +def collect_metric( + client: EndpointHttpClient, + name: str, + filter: dict[str, str], + predicate: Callable[[list[Sample]], bool], +) -> Callable[[], list[Sample]]: + """ + Call this function as the first argument to wait_until(). + """ + + def __collect_metric() -> list[Sample]: + resp = client.metrics() + debug("Metrics: %s", resp) + m = parse_metrics(resp) + samples = m.query_all(name, filter) + debug("Samples: %s", samples) + assert predicate(samples), "predicate failed" + return samples + + return __collect_metric + + +def test_compute_installed_extensions_metric(neon_simple_env: NeonEnv): + """ + Test that the compute_installed_extensions properly reports accurate + results. Important to note that currently this metric is only gathered on + compute start. 
+ """ + env = neon_simple_env + + endpoint = env.endpoints.create_start("main") + + client = endpoint.http_client() + + def __has_plpgsql(samples: list[Sample]) -> bool: + """ + Check that plpgsql is installed in the template1 and postgres databases + """ + return len(samples) == 1 and samples[0].value == 2 + + wait_until( + collect_metric( + client, + "compute_installed_extensions", + {"extension_name": "plpgsql", "version": "1.0", "owned_by_superuser": "1"}, + __has_plpgsql, + ), + name="compute_installed_extensions", + ) + + # Install the neon extension, so we can check for it on the restart + endpoint.safe_psql("CREATE EXTENSION neon VERSION '1.0'") + + # The metric is only gathered on compute start, so restart to check if the + # neon extension will now be there. + endpoint.stop() + endpoint.start() + + client = endpoint.http_client() + + def __has_neon(samples: list[Sample]) -> bool: + return len(samples) == 1 and samples[0].value == 1 + + wait_until( + collect_metric( + client, + "compute_installed_extensions", + {"extension_name": "neon", "version": "1.0", "owned_by_superuser": "1"}, + __has_neon, + ), + name="compute_installed_extensions", + ) + + # Double check that we also still have plpgsql + wait_until( + collect_metric( + client, + "compute_installed_extensions", + {"extension_name": "plpgsql", "version": "1.0", "owned_by_superuser": "1"}, + __has_plpgsql, + ), + name="compute_installed_extensions", + ) diff --git a/test_runner/regress/test_installed_extensions.py b/test_runner/regress/test_installed_extensions.py deleted file mode 100644 index 4e51e7e10c..0000000000 --- a/test_runner/regress/test_installed_extensions.py +++ /dev/null @@ -1,154 +0,0 @@ -from __future__ import annotations - -import time -from logging import info -from typing import TYPE_CHECKING - -from fixtures.log_helper import log -from fixtures.metrics import parse_metrics - -if TYPE_CHECKING: - from fixtures.neon_fixtures import NeonEnv - - -def test_installed_extensions(neon_simple_env: NeonEnv): - """basic test for the endpoint that returns the list of installed extensions""" - - env = neon_simple_env - - env.create_branch("test_installed_extensions") - - endpoint = env.endpoints.create_start("test_installed_extensions") - - endpoint.safe_psql("CREATE DATABASE test_installed_extensions") - endpoint.safe_psql("CREATE DATABASE test_installed_extensions_2") - - client = endpoint.http_client() - res = client.installed_extensions() - - info("Extensions list: %s", res) - info("Extensions: %s", res["extensions"]) - # 'plpgsql' is a default extension that is always installed. 
- assert any( - ext["extname"] == "plpgsql" and ext["version"] == "1.0" for ext in res["extensions"] - ), "The 'plpgsql' extension is missing" - - # check that the neon_test_utils extension is not installed - assert not any( - ext["extname"] == "neon_test_utils" for ext in res["extensions"] - ), "The 'neon_test_utils' extension is installed" - - pg_conn = endpoint.connect(dbname="test_installed_extensions") - with pg_conn.cursor() as cur: - cur.execute("CREATE EXTENSION neon_test_utils") - cur.execute( - "SELECT default_version FROM pg_available_extensions WHERE name = 'neon_test_utils'" - ) - res = cur.fetchone() - neon_test_utils_version = res[0] - - with pg_conn.cursor() as cur: - cur.execute("CREATE EXTENSION neon version '1.1'") - - pg_conn_2 = endpoint.connect(dbname="test_installed_extensions_2") - with pg_conn_2.cursor() as cur: - cur.execute("CREATE EXTENSION neon version '1.2'") - - res = client.installed_extensions() - - info("Extensions list: %s", res) - info("Extensions: %s", res["extensions"]) - - # check that the neon_test_utils extension is installed only in 1 database - # and has the expected version - assert any( - ext["extname"] == "neon_test_utils" - and ext["version"] == neon_test_utils_version - and ext["n_databases"] == 1 - for ext in res["extensions"] - ) - - # check that the plpgsql extension is installed in all databases - # this is a default extension that is always installed - assert any(ext["extname"] == "plpgsql" and ext["n_databases"] == 4 for ext in res["extensions"]) - - # check that the neon extension is installed and has expected versions - for ext in res["extensions"]: - if ext["extname"] == "neon": - assert ext["version"] in ["1.1", "1.2"] - assert ext["n_databases"] == 1 - - with pg_conn.cursor() as cur: - cur.execute("ALTER EXTENSION neon UPDATE TO '1.3'") - - res = client.installed_extensions() - - info("Extensions list: %s", res) - info("Extensions: %s", res["extensions"]) - - # check that the neon_test_utils extension is updated - for ext in res["extensions"]: - if ext["extname"] == "neon": - assert ext["version"] in ["1.2", "1.3"] - assert ext["n_databases"] == 1 - - # check that /metrics endpoint is available - # ensure that we see the metric before and after restart - res = client.metrics() - info("Metrics: %s", res) - m = parse_metrics(res) - neon_m = m.query_all( - "compute_installed_extensions", - {"extension_name": "neon", "version": "1.2", "owned_by_superuser": "1"}, - ) - assert len(neon_m) == 1 - for sample in neon_m: - assert sample.value == 1 - neon_m = m.query_all( - "compute_installed_extensions", - {"extension_name": "neon", "version": "1.3", "owned_by_superuser": "1"}, - ) - assert len(neon_m) == 1 - for sample in neon_m: - assert sample.value == 1 - - endpoint.stop() - endpoint.start() - - timeout = 10 - while timeout > 0: - try: - res = client.metrics() - timeout = -1 - if len(parse_metrics(res).query_all("compute_installed_extensions")) < 4: - # Assume that not all metrics that are collected yet - time.sleep(1) - timeout -= 1 - continue - except Exception: - log.exception("failed to get metrics, assume they are not collected yet") - time.sleep(1) - timeout -= 1 - continue - - assert ( - len(parse_metrics(res).query_all("compute_installed_extensions")) >= 4 - ), "Not all metrics are collected" - - info("After restart metrics: %s", res) - m = parse_metrics(res) - neon_m = m.query_all( - "compute_installed_extensions", - {"extension_name": "neon", "version": "1.2", "owned_by_superuser": "1"}, - ) - assert len(neon_m) == 1 - for 
sample in neon_m: - assert sample.value == 1 - - neon_m = m.query_all( - "compute_installed_extensions", - {"extension_name": "neon", "version": "1.3", "owned_by_superuser": "1"}, - ) - assert len(neon_m) == 1 - for sample in neon_m: - assert sample.value == 1 From 62819aca366b72409b9ae5033f739ef86c8795ff Mon Sep 17 00:00:00 2001 From: Peter Bendel Date: Wed, 29 Jan 2025 21:21:42 +0100 Subject: [PATCH 47/72] Add PostgreSQL version 17 benchmarks (#10536) ## Problem benchmarking.yml so far is only running benchmarks with PostgreSQL version 16. However neon recently changed the default for new customers to PostgreSQL version 17. See related [epic](https://github.com/neondatabase/cloud/issues/23295) ## Summary of changes We do not want to run every job step with both pg 16 and 17 because this would need excessive resources (runners, computes) and extend the benchmarking run wall clock time too much. So we select an opinionated subset of testcases that we also report in weekly reporting and add a postgres v17 job step. For re-use projects associated Neon projects have been created and connection strings have been added to neon database organization secrets. A follow up is to add the reporting for these new runs to some grafana dashboards. --- .../workflows/_benchmarking_preparation.yml | 5 +- .github/workflows/benchmarking.yml | 128 +++++++++++++----- 2 files changed, 95 insertions(+), 38 deletions(-) diff --git a/.github/workflows/_benchmarking_preparation.yml b/.github/workflows/_benchmarking_preparation.yml index fd328586b3..71aef1430e 100644 --- a/.github/workflows/_benchmarking_preparation.yml +++ b/.github/workflows/_benchmarking_preparation.yml @@ -17,7 +17,7 @@ jobs: strategy: fail-fast: false matrix: - platform: [ aws-rds-postgres, aws-aurora-serverless-v2-postgres, neon ] + platform: [ aws-rds-postgres, aws-aurora-serverless-v2-postgres, neon, neon_pg17 ] database: [ clickbench, tpch, userexample ] env: @@ -41,6 +41,9 @@ jobs: neon) CONNSTR=${{ secrets.BENCHMARK_CAPTEST_CONNSTR }} ;; + neon_pg17) + CONNSTR=${{ secrets.BENCHMARK_CAPTEST_CONNSTR_PG17 }} + ;; aws-rds-postgres) CONNSTR=${{ secrets.BENCHMARK_RDS_POSTGRES_CONNSTR }} ;; diff --git a/.github/workflows/benchmarking.yml b/.github/workflows/benchmarking.yml index ab0f2a6155..32747d825c 100644 --- a/.github/workflows/benchmarking.yml +++ b/.github/workflows/benchmarking.yml @@ -63,11 +63,15 @@ jobs: fail-fast: false matrix: include: - - DEFAULT_PG_VERSION: 16 + - PG_VERSION: 16 PLATFORM: "neon-staging" region_id: ${{ github.event.inputs.region_id || 'aws-us-east-2' }} RUNNER: [ self-hosted, us-east-2, x64 ] - - DEFAULT_PG_VERSION: 16 + - PG_VERSION: 17 + PLATFORM: "neon-staging" + region_id: ${{ github.event.inputs.region_id || 'aws-us-east-2' }} + RUNNER: [ self-hosted, us-east-2, x64 ] + - PG_VERSION: 16 PLATFORM: "azure-staging" region_id: 'azure-eastus2' RUNNER: [ self-hosted, eastus2, x64 ] @@ -75,7 +79,7 @@ jobs: TEST_PG_BENCH_DURATIONS_MATRIX: "300" TEST_PG_BENCH_SCALES_MATRIX: "10,100" POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install - DEFAULT_PG_VERSION: ${{ matrix.DEFAULT_PG_VERSION }} + PG_VERSION: ${{ matrix.PG_VERSION }} TEST_OUTPUT: /tmp/test_output BUILD_TYPE: remote SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }} @@ -112,7 +116,7 @@ jobs: uses: ./.github/actions/neon-project-create with: region_id: ${{ matrix.region_id }} - postgres_version: ${{ env.DEFAULT_PG_VERSION }} + postgres_version: ${{ env.PG_VERSION }} api_key: ${{ secrets.NEON_STAGING_API_KEY }} - name: Run 
benchmark @@ -122,7 +126,7 @@ jobs: test_selection: performance run_in_parallel: false save_perf_report: ${{ env.SAVE_PERF_REPORT }} - pg_version: ${{ env.DEFAULT_PG_VERSION }} + pg_version: ${{ env.PG_VERSION }} aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} # Set --sparse-ordering option of pytest-order plugin # to ensure tests are running in order of appears in the file. @@ -313,7 +317,11 @@ jobs: { "pg_version": 16, "region_id": "azure-eastus2", "platform": "neonvm-azure-captest-freetier", "db_size": "3gb" ,"runner": '"$runner_azure"', "image": "neondatabase/build-tools:pinned-bookworm" }, { "pg_version": 16, "region_id": "azure-eastus2", "platform": "neonvm-azure-captest-new", "db_size": "10gb","runner": '"$runner_azure"', "image": "neondatabase/build-tools:pinned-bookworm" }, { "pg_version": 16, "region_id": "azure-eastus2", "platform": "neonvm-azure-captest-new", "db_size": "50gb","runner": '"$runner_azure"', "image": "neondatabase/build-tools:pinned-bookworm" }, - { "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-sharding-reuse", "db_size": "50gb","runner": '"$runner_default"', "image": "'"$image_default"'" }] + { "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-sharding-reuse", "db_size": "50gb","runner": '"$runner_default"', "image": "'"$image_default"'" }, + { "pg_version": 17, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-freetier", "db_size": "3gb" ,"runner": '"$runner_default"', "image": "'"$image_default"'" }, + { "pg_version": 17, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-new", "db_size": "10gb","runner": '"$runner_default"', "image": "'"$image_default"'" }, + { "pg_version": 17, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-new-many-tables","db_size": "10gb","runner": '"$runner_default"', "image": "'"$image_default"'" }, + { "pg_version": 17, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-new", "db_size": "50gb","runner": '"$runner_default"', "image": "'"$image_default"'" }] }' if [ "$(date +%A)" = "Saturday" ] || [ ${RUN_AWS_RDS_AND_AURORA} = "true" ]; then @@ -329,12 +337,15 @@ jobs: matrix='{ "platform": [ "neonvm-captest-reuse" - ] + ], + "pg_version" : [ + 16,17 + ], }' if [ "$(date +%A)" = "Saturday" ] || [ ${RUN_AWS_RDS_AND_AURORA} = "true" ]; then - matrix=$(echo "$matrix" | jq '.include += [{ "platform": "rds-postgres" }, - { "platform": "rds-aurora" }]') + matrix=$(echo "$matrix" | jq '.include += [{ "pg_version": 16, "platform": "rds-postgres" }, + { "pg_version": 16, "platform": "rds-aurora" }]') fi echo "matrix=$(echo "$matrix" | jq --compact-output '.')" >> $GITHUB_OUTPUT @@ -346,14 +357,14 @@ jobs: "platform": [ "neonvm-captest-reuse" ], - "scale": [ - "10" + "pg_version" : [ + 16,17 ] }' if [ "$(date +%A)" = "Saturday" ] || [ ${RUN_AWS_RDS_AND_AURORA} = "true" ]; then - matrix=$(echo "$matrix" | jq '.include += [{ "platform": "rds-postgres", "scale": "10" }, - { "platform": "rds-aurora", "scale": "10" }]') + matrix=$(echo "$matrix" | jq '.include += [{ "pg_version": 16, "platform": "rds-postgres" }, + { "pg_version": 16, "platform": "rds-aurora" }]') fi echo "matrix=$(echo "$matrix" | jq --compact-output '.')" >> $GITHUB_OUTPUT @@ -378,7 +389,7 @@ jobs: TEST_PG_BENCH_DURATIONS_MATRIX: "60m" TEST_PG_BENCH_SCALES_MATRIX: ${{ matrix.db_size }} POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install - DEFAULT_PG_VERSION: ${{ matrix.pg_version }} + PG_VERSION: ${{ matrix.pg_version }} TEST_OUTPUT: 
/tmp/test_output BUILD_TYPE: remote SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }} @@ -416,7 +427,7 @@ jobs: uses: ./.github/actions/neon-project-create with: region_id: ${{ matrix.region_id }} - postgres_version: ${{ env.DEFAULT_PG_VERSION }} + postgres_version: ${{ env.PG_VERSION }} api_key: ${{ secrets.NEON_STAGING_API_KEY }} compute_units: ${{ (contains(matrix.platform, 'captest-freetier') && '[0.25, 0.25]') || '[1, 1]' }} @@ -459,7 +470,7 @@ jobs: run_in_parallel: false save_perf_report: ${{ env.SAVE_PERF_REPORT }} extra_params: -m remote_cluster --timeout 21600 -k test_perf_many_relations - pg_version: ${{ env.DEFAULT_PG_VERSION }} + pg_version: ${{ env.PG_VERSION }} aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} env: BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }} @@ -475,7 +486,7 @@ jobs: run_in_parallel: false save_perf_report: ${{ env.SAVE_PERF_REPORT }} extra_params: -m remote_cluster --timeout 21600 -k test_pgbench_remote_init - pg_version: ${{ env.DEFAULT_PG_VERSION }} + pg_version: ${{ env.PG_VERSION }} aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} env: BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }} @@ -490,7 +501,7 @@ jobs: run_in_parallel: false save_perf_report: ${{ env.SAVE_PERF_REPORT }} extra_params: -m remote_cluster --timeout 21600 -k test_pgbench_remote_simple_update - pg_version: ${{ env.DEFAULT_PG_VERSION }} + pg_version: ${{ env.PG_VERSION }} aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} env: BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }} @@ -505,7 +516,7 @@ jobs: run_in_parallel: false save_perf_report: ${{ env.SAVE_PERF_REPORT }} extra_params: -m remote_cluster --timeout 21600 -k test_pgbench_remote_select_only - pg_version: ${{ env.DEFAULT_PG_VERSION }} + pg_version: ${{ env.PG_VERSION }} aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} env: BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }} @@ -549,14 +560,19 @@ jobs: include: - PLATFORM: "neonvm-captest-pgvector" RUNNER: [ self-hosted, us-east-2, x64 ] + postgres_version: 16 + - PLATFORM: "neonvm-captest-pgvector-pg17" + RUNNER: [ self-hosted, us-east-2, x64 ] + postgres_version: 17 - PLATFORM: "azure-captest-pgvector" RUNNER: [ self-hosted, eastus2, x64 ] + postgres_version: 16 env: TEST_PG_BENCH_DURATIONS_MATRIX: "15m" TEST_PG_BENCH_SCALES_MATRIX: "1" POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install - DEFAULT_PG_VERSION: 16 + PG_VERSION: ${{ matrix.postgres_version }} TEST_OUTPUT: /tmp/test_output BUILD_TYPE: remote @@ -590,9 +606,13 @@ jobs: dpkg -x postgresql-client-16_16.6-1.pgdg120+1_${arch}.deb pg mkdir -p /tmp/neon/pg_install/v16/bin + mkdir -p /tmp/neon/pg_install/v17/bin ln -s /home/nonroot/pg/usr/lib/postgresql/16/bin/pgbench /tmp/neon/pg_install/v16/bin/pgbench ln -s /home/nonroot/pg/usr/lib/postgresql/16/bin/psql /tmp/neon/pg_install/v16/bin/psql ln -s /home/nonroot/pg/usr/lib/$(uname -m)-linux-gnu /tmp/neon/pg_install/v16/lib + ln -s /home/nonroot/pg/usr/lib/postgresql/16/bin/pgbench /tmp/neon/pg_install/v17/bin/pgbench + ln -s /home/nonroot/pg/usr/lib/postgresql/16/bin/psql /tmp/neon/pg_install/v17/bin/psql + ln -s /home/nonroot/pg/usr/lib/$(uname -m)-linux-gnu /tmp/neon/pg_install/v17/lib LD_LIBRARY_PATH="/home/nonroot/pg/usr/lib/$(uname -m)-linux-gnu:${LD_LIBRARY_PATH:-}" export LD_LIBRARY_PATH @@ -608,6 +628,9 @@ jobs: neonvm-captest-pgvector) CONNSTR=${{ secrets.BENCHMARK_PGVECTOR_CONNSTR }} ;; + neonvm-captest-pgvector-pg17) + CONNSTR=${{ 
secrets.BENCHMARK_PGVECTOR_CONNSTR_PG17 }} + ;; azure-captest-pgvector) CONNSTR=${{ secrets.BENCHMARK_PGVECTOR_CONNSTR_AZURE }} ;; @@ -634,7 +657,7 @@ jobs: run_in_parallel: false save_perf_report: ${{ env.SAVE_PERF_REPORT }} extra_params: -m remote_cluster --timeout 21600 -k test_pgvector_indexing - pg_version: ${{ env.DEFAULT_PG_VERSION }} + pg_version: ${{ env.PG_VERSION }} aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} env: VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" @@ -649,7 +672,7 @@ jobs: run_in_parallel: false save_perf_report: ${{ env.SAVE_PERF_REPORT }} extra_params: -m remote_cluster --timeout 21600 - pg_version: ${{ env.DEFAULT_PG_VERSION }} + pg_version: ${{ env.PG_VERSION }} aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} env: BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }} @@ -696,7 +719,7 @@ jobs: env: POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install - DEFAULT_PG_VERSION: 16 + PG_VERSION: ${{ matrix.pg_version }} TEST_OUTPUT: /tmp/test_output TEST_OLAP_COLLECT_EXPLAIN: ${{ github.event.inputs.collect_olap_explain }} TEST_OLAP_COLLECT_PG_STAT_STATEMENTS: ${{ github.event.inputs.collect_pg_stat_statements }} @@ -739,7 +762,18 @@ jobs: run: | case "${PLATFORM}" in neonvm-captest-reuse) - CONNSTR=${{ secrets.BENCHMARK_CAPTEST_CLICKBENCH_10M_CONNSTR }} + case "${PG_VERSION}" in + 16) + CONNSTR=${{ secrets.BENCHMARK_CAPTEST_CLICKBENCH_10M_CONNSTR_V16 }} + ;; + 17) + CONNSTR=${{ secrets.BENCHMARK_CAPTEST_CONNSTR_PG17 }} + ;; + *) + echo >&2 "Unsupported PG_VERSION=${PG_VERSION} for PLATFORM=${PLATFORM}" + exit 1 + ;; + esac ;; rds-aurora) CONNSTR=${{ secrets.BENCHMARK_RDS_AURORA_CLICKBENCH_10M_CONNSTR }} @@ -763,7 +797,7 @@ jobs: run_in_parallel: false save_perf_report: ${{ env.SAVE_PERF_REPORT }} extra_params: -m remote_cluster --timeout 43200 -k test_clickbench - pg_version: ${{ env.DEFAULT_PG_VERSION }} + pg_version: ${{ env.PG_VERSION }} aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} env: VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" @@ -812,12 +846,11 @@ jobs: env: POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install - DEFAULT_PG_VERSION: 16 + PG_VERSION: ${{ matrix.pg_version }} TEST_OUTPUT: /tmp/test_output BUILD_TYPE: remote SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }} PLATFORM: ${{ matrix.platform }} - TEST_OLAP_SCALE: ${{ matrix.scale }} runs-on: [ self-hosted, us-east-2, x64 ] container: @@ -849,21 +882,31 @@ jobs: run: | case "${PLATFORM}" in neonvm-captest-reuse) - ENV_PLATFORM=CAPTEST_TPCH + case "${PG_VERSION}" in + 16) + CONNSTR_SECRET_NAME="BENCHMARK_CAPTEST_TPCH_S10_CONNSTR" + ;; + 17) + CONNSTR_SECRET_NAME="BENCHMARK_CAPTEST_CONNSTR_PG17" + ;; + *) + echo >&2 "Unsupported PG_VERSION=${PG_VERSION} for PLATFORM=${PLATFORM}" + exit 1 + ;; + esac ;; rds-aurora) - ENV_PLATFORM=RDS_AURORA_TPCH + CONNSTR_SECRET_NAME="BENCHMARK_RDS_AURORA_TPCH_S10_CONNSTR" ;; rds-postgres) - ENV_PLATFORM=RDS_POSTGRES_TPCH + CONNSTR_SECRET_NAME="BENCHMARK_RDS_POSTGRES_TPCH_S10_CONNSTR" ;; *) echo >&2 "Unknown PLATFORM=${PLATFORM}. 
Allowed only 'neonvm-captest-reuse', 'rds-aurora', or 'rds-postgres'" exit 1 ;; esac - - CONNSTR_SECRET_NAME="BENCHMARK_${ENV_PLATFORM}_S${TEST_OLAP_SCALE}_CONNSTR" + echo "CONNSTR_SECRET_NAME=${CONNSTR_SECRET_NAME}" >> $GITHUB_ENV - name: Set up Connection String @@ -881,13 +924,13 @@ jobs: run_in_parallel: false save_perf_report: ${{ env.SAVE_PERF_REPORT }} extra_params: -m remote_cluster --timeout 21600 -k test_tpch - pg_version: ${{ env.DEFAULT_PG_VERSION }} + pg_version: ${{ env.PG_VERSION }} aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} env: VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}" BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }} - TEST_OLAP_SCALE: ${{ matrix.scale }} + TEST_OLAP_SCALE: 10 - name: Create Allure report id: create-allure-report @@ -922,7 +965,7 @@ jobs: env: POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install - DEFAULT_PG_VERSION: 16 + PG_VERSION: ${{ matrix.pg_version }} TEST_OUTPUT: /tmp/test_output BUILD_TYPE: remote SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }} @@ -959,7 +1002,18 @@ jobs: run: | case "${PLATFORM}" in neonvm-captest-reuse) - CONNSTR=${{ secrets.BENCHMARK_USER_EXAMPLE_CAPTEST_CONNSTR }} + case "${PG_VERSION}" in + 16) + CONNSTR=${{ secrets.BENCHMARK_USER_EXAMPLE_CAPTEST_CONNSTR }} + ;; + 17) + CONNSTR=${{ secrets.BENCHMARK_CAPTEST_CONNSTR_PG17 }} + ;; + *) + echo >&2 "Unsupported PG_VERSION=${PG_VERSION} for PLATFORM=${PLATFORM}" + exit 1 + ;; + esac ;; rds-aurora) CONNSTR=${{ secrets.BENCHMARK_USER_EXAMPLE_RDS_AURORA_CONNSTR }} @@ -983,7 +1037,7 @@ jobs: run_in_parallel: false save_perf_report: ${{ env.SAVE_PERF_REPORT }} extra_params: -m remote_cluster --timeout 21600 -k test_user_examples - pg_version: ${{ env.DEFAULT_PG_VERSION }} + pg_version: ${{ env.PG_VERSION }} aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} env: VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" From de1c35fab32717a114b89e007c490fcc01faa5d9 Mon Sep 17 00:00:00 2001 From: Fedor Dikarev Date: Wed, 29 Jan 2025 22:02:54 +0100 Subject: [PATCH 48/72] add retries for apt, wget and curl (#10553) Ref: https://github.com/neondatabase/cloud/issues/23461 ## Problem > recent CI failure due to apt-get: ``` 4.266 E: Failed to fetch http://deb.debian.org/debian/pool/main/g/gcc-10/libgfortran5_10.2.1-6_arm64.deb Error reading from server - read (104: Connection reset by peer) [IP: 146.75.122.132 80] ``` https://github.com/neondatabase/neon/actions/runs/11144974698/job/30973537767?pr=9186 thinking about if there should be a mirror-selector at the beginning of the dockerfile so that it uses a debian mirror closer to the build server? ## Summary of changes We could consider adding local mirror or proxy and keep it close to our self-hosted runners. 
For now lets just add retries for `apt`, `wget` and `curl` thanks to @skyzh for reporting that in October 2024, I just finally found time to take a look here :) --- Dockerfile | 2 ++ build-tools.Dockerfile | 10 ++++++++++ compute/compute-node.Dockerfile | 13 ++++++++++++- docker-compose/compute_wrapper/Dockerfile | 7 ++++--- 4 files changed, 28 insertions(+), 4 deletions(-) diff --git a/Dockerfile b/Dockerfile index a8f7ae0a62..7ba54c8ca5 100644 --- a/Dockerfile +++ b/Dockerfile @@ -64,6 +64,7 @@ ARG DEFAULT_PG_VERSION WORKDIR /data RUN set -e \ + && echo 'Acquire::Retries "5";' > /etc/apt/apt.conf.d/80-retries \ && apt update \ && apt install -y \ libreadline-dev \ @@ -72,6 +73,7 @@ RUN set -e \ # System postgres for use with client libraries (e.g. in storage controller) postgresql-15 \ openssl \ + && rm -f /etc/apt/apt.conf.d/80-retries \ && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* \ && useradd -d /data neon \ && chown -R neon:neon /data diff --git a/build-tools.Dockerfile b/build-tools.Dockerfile index 7a2ec9c43e..9c13e480c1 100644 --- a/build-tools.Dockerfile +++ b/build-tools.Dockerfile @@ -3,6 +3,10 @@ ARG DEBIAN_VERSION=bookworm FROM debian:bookworm-slim AS pgcopydb_builder ARG DEBIAN_VERSION +RUN echo 'Acquire::Retries "5";' > /etc/apt/apt.conf.d/80-retries && \ + echo -e "retry_connrefused = on\ntimeout=15\ntries=5\n" > /root/.wgetrc \ + echo -e "--retry-connrefused\n--connect-timeout 15\n--retry 5\n--max-time 300\n" > /root/.curlrc + RUN if [ "${DEBIAN_VERSION}" = "bookworm" ]; then \ set -e && \ apt update && \ @@ -61,6 +65,10 @@ RUN mkdir -p /pgcopydb/bin && \ COPY --from=pgcopydb_builder /usr/lib/postgresql/16/bin/pgcopydb /pgcopydb/bin/pgcopydb COPY --from=pgcopydb_builder /pgcopydb/lib/libpq.so.5 /pgcopydb/lib/libpq.so.5 +RUN echo 'Acquire::Retries "5";' > /etc/apt/apt.conf.d/80-retries && \ + echo -e "retry_connrefused = on\ntimeout=15\ntries=5\n" > /root/.wgetrc \ + echo -e "--retry-connrefused\n--connect-timeout 15\n--retry 5\n--max-time 300\n" > /root/.curlrc + # System deps # # 'gdb' is included so that we get backtraces of core dumps produced in @@ -218,6 +226,8 @@ RUN wget -O /tmp/libicu-${ICU_VERSION}.tgz https://github.com/unicode-org/icu/re USER nonroot:nonroot WORKDIR /home/nonroot +RUN echo -e "--retry-connrefused\n--connect-timeout 15\n--retry 5\n--max-time 300\n" > /home/nonroot/.curlrc + # Python ENV PYTHON_VERSION=3.11.10 \ PYENV_ROOT=/home/nonroot/.pyenv \ diff --git a/compute/compute-node.Dockerfile b/compute/compute-node.Dockerfile index 7ac6e9bc58..a428c61f34 100644 --- a/compute/compute-node.Dockerfile +++ b/compute/compute-node.Dockerfile @@ -18,6 +18,10 @@ ARG DEBIAN_VERSION # Use strict mode for bash to catch errors early SHELL ["/bin/bash", "-euo", "pipefail", "-c"] +RUN echo 'Acquire::Retries "5";' > /etc/apt/apt.conf.d/80-retries && \ + echo -e "retry_connrefused = on\ntimeout=15\ntries=5\n" > /root/.wgetrc \ + echo -e "--retry-connrefused\n--connect-timeout 15\n--retry 5\n--max-time 300\n" > /root/.curlrc + RUN case $DEBIAN_VERSION in \ # Version-specific installs for Bullseye (PG14-PG16): # The h3_pg extension needs a cmake 3.20+, but Debian bullseye has 3.18. 
@@ -838,6 +842,8 @@ ENV PATH="/home/nonroot/.cargo/bin:$PATH" USER nonroot WORKDIR /home/nonroot +RUN echo -e "--retry-connrefused\n--connect-timeout 15\n--retry 5\n--max-time 300\n" > /home/nonroot/.curlrc + RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux-gnu/rustup-init && \ chmod +x rustup-init && \ ./rustup-init -y --no-modify-path --profile minimal --default-toolchain stable && \ @@ -874,6 +880,8 @@ ENV PATH="/home/nonroot/.cargo/bin:$PATH" USER nonroot WORKDIR /home/nonroot +RUN echo -e "--retry-connrefused\n--connect-timeout 15\n--retry 5\n--max-time 300\n" > /home/nonroot/.curlrc + RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux-gnu/rustup-init && \ chmod +x rustup-init && \ ./rustup-init -y --no-modify-path --profile minimal --default-toolchain stable && \ @@ -1243,6 +1251,7 @@ RUN mold -run cargo build --locked --profile release-line-debug-size-lto --bin c FROM debian:$DEBIAN_FLAVOR AS pgbouncer RUN set -e \ + && echo 'Acquire::Retries "5";' > /etc/apt/apt.conf.d/80-retries \ && apt update \ && apt install --no-install-suggests --no-install-recommends -y \ build-essential \ @@ -1444,6 +1453,8 @@ RUN mkdir /usr/local/download_extensions && chown -R postgres:postgres /usr/loca # libboost* for rdkit # ca-certificates for communicating with s3 by compute_ctl +RUN echo 'Acquire::Retries "5";' > /etc/apt/apt.conf.d/80-retries && \ + echo -e "retry_connrefused = on\ntimeout=15\ntries=5\n" > /root/.wgetrc RUN apt update && \ case $DEBIAN_VERSION in \ @@ -1500,7 +1511,7 @@ RUN set -ex; \ else \ echo "Unsupported architecture: ${TARGETARCH}"; exit 1; \ fi; \ - curl -L "https://awscli.amazonaws.com/awscli-exe-linux-${TARGETARCH_ALT}-2.17.5.zip" -o /tmp/awscliv2.zip; \ + curl --retry 5 -L "https://awscli.amazonaws.com/awscli-exe-linux-${TARGETARCH_ALT}-2.17.5.zip" -o /tmp/awscliv2.zip; \ echo "${CHECKSUM} /tmp/awscliv2.zip" | sha256sum -c -; \ unzip /tmp/awscliv2.zip -d /tmp/awscliv2; \ /tmp/awscliv2/aws/install; \ diff --git a/docker-compose/compute_wrapper/Dockerfile b/docker-compose/compute_wrapper/Dockerfile index e2e5bc7248..61f44681da 100644 --- a/docker-compose/compute_wrapper/Dockerfile +++ b/docker-compose/compute_wrapper/Dockerfile @@ -7,11 +7,12 @@ FROM $REPOSITORY/${COMPUTE_IMAGE}:$TAG ARG COMPUTE_IMAGE USER root -RUN apt-get update && \ +RUN echo 'Acquire::Retries "5";' > /etc/apt/apt.conf.d/80-retries && \ + apt-get update && \ apt-get install -y curl \ jq \ netcat-openbsd #This is required for the pg_hintplan test -RUN mkdir -p /ext-src/pg_hint_plan-src && chown postgres /ext-src/pg_hint_plan-src +RUN mkdir -p /ext-src/pg_hint_plan-src && chown postgres /ext-src/pg_hint_plan-src -USER postgres \ No newline at end of file +USER postgres From ff298afb97253654f7daa5ed3c5977645a249a4f Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Wed, 29 Jan 2025 22:10:56 +0100 Subject: [PATCH 49/72] pageserver: add `level` for timeline layer metrics (#10563) ## Problem We don't have good observability for per-timeline compaction debt, specifically the number of delta layers in the frozen, L0, and L1 levels. Touches https://github.com/neondatabase/cloud/issues/23283. ## Summary of changes * Add a `level` label for `pageserver_layer_{count,size}` with values `l0`, `l1`, and `frozen`. * Track metrics for frozen layers. There is already a `kind={delta,image}` label. `kind=image` is only possible for `level=l1`. We don't include the currently open ephemeral layer, only frozen layers. 
There is always exactly 1 ephemeral layer, with a dynamic size which is already tracked in `pageserver_timeline_ephemeral_bytes`. --- pageserver/src/metrics.rs | 236 +++++++++++------- pageserver/src/tenant/storage_layer/layer.rs | 16 +- pageserver/src/tenant/timeline.rs | 2 +- .../src/tenant/timeline/layer_manager.rs | 8 + 4 files changed, 154 insertions(+), 108 deletions(-) diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index d2c778276d..77c0967afc 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -1,4 +1,13 @@ +use std::collections::HashMap; +use std::num::NonZeroUsize; +use std::pin::Pin; +use std::sync::atomic::AtomicU64; +use std::sync::{Arc, Mutex}; +use std::task::{Context, Poll}; +use std::time::{Duration, Instant}; + use enum_map::EnumMap; +use futures::Future; use metrics::{ register_counter_vec, register_gauge_vec, register_histogram, register_histogram_vec, register_int_counter, register_int_counter_pair_vec, register_int_counter_vec, @@ -11,13 +20,26 @@ use pageserver_api::config::{ PageServicePipeliningConfig, PageServicePipeliningConfigPipelined, PageServiceProtocolPipelinedExecutionStrategy, }; +use pageserver_api::models::InMemoryLayerInfo; use pageserver_api::shard::TenantShardId; +use pin_project_lite::pin_project; use postgres_backend::{is_expected_io_error, QueryError}; use pq_proto::framed::ConnectionError; -use strum::{EnumCount, VariantNames}; + +use strum::{EnumCount, IntoEnumIterator as _, VariantNames}; use strum_macros::{IntoStaticStr, VariantNames}; use utils::id::TimelineId; +use crate::config::PageServerConf; +use crate::context::{PageContentKind, RequestContext}; +use crate::task_mgr::TaskKind; +use crate::tenant::layer_map::LayerMap; +use crate::tenant::mgr::TenantSlot; +use crate::tenant::storage_layer::{InMemoryLayer, PersistentLayerDesc}; +use crate::tenant::tasks::BackgroundLoopKind; +use crate::tenant::throttle::ThrottleResult; +use crate::tenant::Timeline; + /// Prometheus histogram buckets (in seconds) for operations in the critical /// path. In other words, operations that directly affect that latency of user /// queries. @@ -443,18 +465,38 @@ static PITR_HISTORY_SIZE: Lazy = Lazy::new(|| { .expect("failed to define a metric") }); -#[derive(strum_macros::EnumString, strum_macros::Display, strum_macros::IntoStaticStr)] +#[derive( + strum_macros::EnumIter, + strum_macros::EnumString, + strum_macros::Display, + strum_macros::IntoStaticStr, +)] #[strum(serialize_all = "kebab_case")] -pub(crate) enum MetricLayerKind { +pub(crate) enum LayerKind { Delta, Image, } +#[derive( + strum_macros::EnumIter, + strum_macros::EnumString, + strum_macros::Display, + strum_macros::IntoStaticStr, +)] +#[strum(serialize_all = "kebab_case")] +pub(crate) enum LayerLevel { + // We don't track the currently open ephemeral layer, since there's always exactly 1 and its + // size changes. See `TIMELINE_EPHEMERAL_BYTES`. 
+ Frozen, + L0, + L1, +} + static TIMELINE_LAYER_SIZE: Lazy = Lazy::new(|| { register_uint_gauge_vec!( "pageserver_layer_bytes", - "Sum of layer physical sizes in bytes", - &["tenant_id", "shard_id", "timeline_id", "kind"] + "Sum of frozen, L0, and L1 layer physical sizes in bytes (excluding the open ephemeral layer)", + &["tenant_id", "shard_id", "timeline_id", "level", "kind"] ) .expect("failed to define a metric") }); @@ -462,8 +504,8 @@ static TIMELINE_LAYER_SIZE: Lazy = Lazy::new(|| { static TIMELINE_LAYER_COUNT: Lazy = Lazy::new(|| { register_uint_gauge_vec!( "pageserver_layer_count", - "Number of layers that exist", - &["tenant_id", "shard_id", "timeline_id", "kind"] + "Number of frozen, L0, and L1 layers (excluding the open ephemeral layer)", + &["tenant_id", "shard_id", "timeline_id", "level", "kind"] ) .expect("failed to define a metric") }); @@ -2590,10 +2632,6 @@ pub(crate) struct TimelineMetrics { pub disk_consistent_lsn_gauge: IntGauge, pub pitr_history_size: UIntGauge, pub archival_size: UIntGauge, - pub(crate) layer_size_image: UIntGauge, - pub(crate) layer_count_image: UIntGauge, - pub(crate) layer_size_delta: UIntGauge, - pub(crate) layer_count_delta: UIntGauge, pub standby_horizon_gauge: IntGauge, pub resident_physical_size_gauge: UIntGauge, pub visible_physical_size_gauge: UIntGauge, @@ -2691,42 +2729,6 @@ impl TimelineMetrics { .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id]) .unwrap(); - let layer_size_image = TIMELINE_LAYER_SIZE - .get_metric_with_label_values(&[ - &tenant_id, - &shard_id, - &timeline_id, - MetricLayerKind::Image.into(), - ]) - .unwrap(); - - let layer_count_image = TIMELINE_LAYER_COUNT - .get_metric_with_label_values(&[ - &tenant_id, - &shard_id, - &timeline_id, - MetricLayerKind::Image.into(), - ]) - .unwrap(); - - let layer_size_delta = TIMELINE_LAYER_SIZE - .get_metric_with_label_values(&[ - &tenant_id, - &shard_id, - &timeline_id, - MetricLayerKind::Delta.into(), - ]) - .unwrap(); - - let layer_count_delta = TIMELINE_LAYER_COUNT - .get_metric_with_label_values(&[ - &tenant_id, - &shard_id, - &timeline_id, - MetricLayerKind::Delta.into(), - ]) - .unwrap(); - let standby_horizon_gauge = STANDBY_HORIZON .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id]) .unwrap(); @@ -2791,10 +2793,6 @@ impl TimelineMetrics { disk_consistent_lsn_gauge, pitr_history_size, archival_size, - layer_size_image, - layer_count_image, - layer_size_delta, - layer_count_delta, standby_horizon_gauge, resident_physical_size_gauge, visible_physical_size_gauge, @@ -2837,6 +2835,92 @@ impl TimelineMetrics { .add(duration); } + /// Generates TIMELINE_LAYER labels for a persistent layer. + fn make_layer_labels(&self, layer_desc: &PersistentLayerDesc) -> [&str; 5] { + let level = match LayerMap::is_l0(&layer_desc.key_range, layer_desc.is_delta()) { + true => LayerLevel::L0, + false => LayerLevel::L1, + }; + let kind = match layer_desc.is_delta() { + true => LayerKind::Delta, + false => LayerKind::Image, + }; + [ + &self.tenant_id, + &self.shard_id, + &self.timeline_id, + level.into(), + kind.into(), + ] + } + + /// Generates TIMELINE_LAYER labels for a frozen ephemeral layer. + fn make_frozen_layer_labels(&self, _layer: &InMemoryLayer) -> [&str; 5] { + [ + &self.tenant_id, + &self.shard_id, + &self.timeline_id, + LayerLevel::Frozen.into(), + LayerKind::Delta.into(), // by definition + ] + } + + /// Removes a frozen ephemeral layer to TIMELINE_LAYER metrics. 
+ pub fn dec_frozen_layer(&self, layer: &InMemoryLayer) { + assert!(matches!(layer.info(), InMemoryLayerInfo::Frozen { .. })); + let labels = self.make_frozen_layer_labels(layer); + let size = layer.try_len().expect("frozen layer should have no writer"); + TIMELINE_LAYER_COUNT + .get_metric_with_label_values(&labels) + .unwrap() + .dec(); + TIMELINE_LAYER_SIZE + .get_metric_with_label_values(&labels) + .unwrap() + .sub(size); + } + + /// Adds a frozen ephemeral layer to TIMELINE_LAYER metrics. + pub fn inc_frozen_layer(&self, layer: &InMemoryLayer) { + assert!(matches!(layer.info(), InMemoryLayerInfo::Frozen { .. })); + let labels = self.make_frozen_layer_labels(layer); + let size = layer.try_len().expect("frozen layer should have no writer"); + TIMELINE_LAYER_COUNT + .get_metric_with_label_values(&labels) + .unwrap() + .inc(); + TIMELINE_LAYER_SIZE + .get_metric_with_label_values(&labels) + .unwrap() + .add(size); + } + + /// Removes a persistent layer from TIMELINE_LAYER metrics. + pub fn dec_layer(&self, layer_desc: &PersistentLayerDesc) { + let labels = self.make_layer_labels(layer_desc); + TIMELINE_LAYER_COUNT + .get_metric_with_label_values(&labels) + .unwrap() + .dec(); + TIMELINE_LAYER_SIZE + .get_metric_with_label_values(&labels) + .unwrap() + .sub(layer_desc.file_size); + } + + /// Adds a persistent layer to TIMELINE_LAYER metrics. + pub fn inc_layer(&self, layer_desc: &PersistentLayerDesc) { + let labels = self.make_layer_labels(layer_desc); + TIMELINE_LAYER_COUNT + .get_metric_with_label_values(&labels) + .unwrap() + .inc(); + TIMELINE_LAYER_SIZE + .get_metric_with_label_values(&labels) + .unwrap() + .add(layer_desc.file_size); + } + pub(crate) fn shutdown(&self) { let was_shutdown = self .shutdown @@ -2869,30 +2953,14 @@ impl TimelineMetrics { let _ = TIMELINE_ARCHIVE_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]); let _ = PITR_HISTORY_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]); - let _ = TIMELINE_LAYER_SIZE.remove_label_values(&[ - tenant_id, - shard_id, - timeline_id, - MetricLayerKind::Image.into(), - ]); - let _ = TIMELINE_LAYER_COUNT.remove_label_values(&[ - tenant_id, - shard_id, - timeline_id, - MetricLayerKind::Image.into(), - ]); - let _ = TIMELINE_LAYER_SIZE.remove_label_values(&[ - tenant_id, - shard_id, - timeline_id, - MetricLayerKind::Delta.into(), - ]); - let _ = TIMELINE_LAYER_COUNT.remove_label_values(&[ - tenant_id, - shard_id, - timeline_id, - MetricLayerKind::Delta.into(), - ]); + for ref level in LayerLevel::iter() { + for ref kind in LayerKind::iter() { + let labels: [&str; 5] = + [tenant_id, shard_id, timeline_id, level.into(), kind.into()]; + let _ = TIMELINE_LAYER_SIZE.remove_label_values(&labels); + let _ = TIMELINE_LAYER_COUNT.remove_label_values(&labels); + } + } let _ = EVICTIONS.remove_label_values(&[tenant_id, shard_id, timeline_id]); let _ = AUX_FILE_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]); @@ -2974,24 +3042,6 @@ pub(crate) fn remove_tenant_metrics(tenant_shard_id: &TenantShardId) { // we leave the BROKEN_TENANTS_SET entry if any } -use futures::Future; -use pin_project_lite::pin_project; -use std::collections::HashMap; -use std::num::NonZeroUsize; -use std::pin::Pin; -use std::sync::atomic::AtomicU64; -use std::sync::{Arc, Mutex}; -use std::task::{Context, Poll}; -use std::time::{Duration, Instant}; - -use crate::config::PageServerConf; -use crate::context::{PageContentKind, RequestContext}; -use crate::task_mgr::TaskKind; -use crate::tenant::mgr::TenantSlot; -use 
crate::tenant::tasks::BackgroundLoopKind; -use crate::tenant::throttle::ThrottleResult; -use crate::tenant::Timeline; - /// Maintain a per timeline gauge in addition to the global gauge. pub(crate) struct PerTimelineRemotePhysicalSizeGauge { last_set: AtomicU64, diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs index 2a86885f6b..99e0ff1aa5 100644 --- a/pageserver/src/tenant/storage_layer/layer.rs +++ b/pageserver/src/tenant/storage_layer/layer.rs @@ -701,13 +701,7 @@ impl Drop for LayerInner { if let Some(timeline) = timeline.as_ref() { // Only need to decrement metrics if the timeline still exists: otherwise // it will have already de-registered these metrics via TimelineMetrics::shutdown - if self.desc.is_delta() { - timeline.metrics.layer_count_delta.dec(); - timeline.metrics.layer_size_delta.sub(self.desc.file_size); - } else { - timeline.metrics.layer_count_image.dec(); - timeline.metrics.layer_size_image.sub(self.desc.file_size); - } + timeline.metrics.dec_layer(&self.desc); if matches!(self.access_stats.visibility(), LayerVisibilityHint::Visible) { debug_assert!( @@ -817,13 +811,7 @@ impl LayerInner { }; // This object acts as a RAII guard on these metrics: increment on construction - if desc.is_delta() { - timeline.metrics.layer_count_delta.inc(); - timeline.metrics.layer_size_delta.add(desc.file_size); - } else { - timeline.metrics.layer_count_image.inc(); - timeline.metrics.layer_size_image.add(desc.file_size); - } + timeline.metrics.inc_layer(&desc); // New layers are visible by default. This metric is later updated on drop or in set_visibility timeline diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 24bc7890c6..b4b30fcd23 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -3703,7 +3703,7 @@ impl Timeline { let mut guard = self.layers.write().await; guard .open_mut()? - .try_freeze_in_memory_layer(at, &self.last_freeze_at, write_lock) + .try_freeze_in_memory_layer(at, &self.last_freeze_at, write_lock, &self.metrics) .await }; diff --git a/pageserver/src/tenant/timeline/layer_manager.rs b/pageserver/src/tenant/timeline/layer_manager.rs index f1cef7778c..cb7783d779 100644 --- a/pageserver/src/tenant/timeline/layer_manager.rs +++ b/pageserver/src/tenant/timeline/layer_manager.rs @@ -91,6 +91,7 @@ impl LayerManager { layer_map, layer_fmgr: LayerFileManager(hashmap), }) => { + // NB: no need to decrement layer metrics; metrics are removed on timeline shutdown. let open = layer_map.open_layer.take(); let frozen = layer_map.frozen_layers.len(); let taken_writer_state = writer_state.take(); @@ -234,6 +235,7 @@ impl OpenLayerManager { lsn: Lsn, last_freeze_at: &AtomicLsn, write_lock: &mut tokio::sync::MutexGuard<'_, Option>, + metrics: &TimelineMetrics, ) -> bool { let Lsn(last_record_lsn) = lsn; let end_lsn = Lsn(last_record_lsn + 1); @@ -242,6 +244,11 @@ impl OpenLayerManager { let open_layer_rc = Arc::clone(open_layer); open_layer.freeze(end_lsn).await; + // Increment the frozen layer metrics. This is decremented in `finish_flush_l0_layer()`. + // TODO: It would be nicer to do this via `InMemoryLayer::drop()`, but it requires a + // reference to the timeline metrics. Other methods use a metrics borrow as well. + metrics.inc_frozen_layer(open_layer); + // The layer is no longer open, update the layer map to reflect this. // We will replace it with on-disk historics below. 
self.layer_map.frozen_layers.push_back(open_layer_rc); @@ -298,6 +305,7 @@ impl OpenLayerManager { .frozen_layers .pop_front() .expect("there must be a inmem layer to flush"); + metrics.dec_frozen_layer(&inmem); // Only one task may call this function at a time (for this // timeline). If two tasks tried to flush the same frozen From 9dff6cc2a4d8c2ddb645df6872b3ac1985944264 Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com> Date: Wed, 29 Jan 2025 16:32:50 -0500 Subject: [PATCH 50/72] fix(pageserver): skip repartition if we need L0 compaction (#10547) ## Problem Repartition is slow, but it's only used in image layer creation. We can skip it if we have a lot of L0 layers to ingest. ## Summary of changes If L0 compaction is not complete, do not repartition and do not create image layers. --------- Signed-off-by: Alex Chi Z --- pageserver/src/tenant/timeline/compaction.rs | 107 ++++++++++--------- 1 file changed, 57 insertions(+), 50 deletions(-) diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index 589aea18b4..5f0e6dc3ec 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -624,7 +624,13 @@ impl Timeline { // High level strategy for compaction / image creation: // - // 1. First, calculate the desired "partitioning" of the + // 1. First, do a L0 compaction to ensure we move the L0 + // layers into the historic layer map get flat levels of + // layers. If we did not compact all L0 layers, we will + // prioritize compacting the timeline again and not do + // any of the compactions below. + // + // 2. Then, calculate the desired "partitioning" of the // currently in-use key space. The goal is to partition the // key space into roughly fixed-size chunks, but also take into // account any existing image layers, and try to align the @@ -638,7 +644,7 @@ impl Timeline { // identify a relation. This is just an optimization, // though. // - // 2. Once we know the partitioning, for each partition, + // 3. Once we know the partitioning, for each partition, // decide if it's time to create a new image layer. The // criteria is: there has been too much "churn" since the last // image layer? The "churn" is fuzzy concept, it's a @@ -646,15 +652,8 @@ impl Timeline { // total in the delta file. Or perhaps: if creating an image // file would allow to delete some older files. // - // 3. After that, we compact all level0 delta files if there - // are too many of them. While compacting, we also garbage - // collect any page versions that are no longer needed because - // of the new image layers we created in step 2. - // - // TODO: This high level strategy hasn't been implemented yet. - // Below are functions compact_level0() and create_image_layers() - // but they are a bit ad hoc and don't quite work like it's explained - // above. Rewrite it. + // 4. In the end, if the tenant gets auto-sharded, we will run + // a shard-ancestor compaction. // Is the timeline being deleted? if self.is_stopping() { @@ -666,10 +665,32 @@ impl Timeline { // Define partitioning schema if needed - // FIXME: the match should only cover repartitioning, not the next steps - let (partition_count, has_pending_tasks) = match self + // 1. 
L0 Compact + let fully_compacted = { + let timer = self.metrics.compact_time_histo.start_timer(); + let fully_compacted = self + .compact_level0( + target_file_size, + options.flags.contains(CompactFlags::ForceL0Compaction), + ctx, + ) + .await?; + timer.stop_and_record(); + fully_compacted + }; + + if !fully_compacted { + // Yield and do not do any other kind of compaction. True means + // that we have pending L0 compaction tasks and the compaction scheduler + // will prioritize compacting this tenant/timeline again. + info!("skipping image layer generation and shard ancestor compaction due to L0 compaction did not include all layers."); + return Ok(true); + } + + // 2. Repartition and create image layers if necessary + let partition_count = match self .repartition( - self.get_last_record_lsn(), + self.get_last_record_lsn(), // TODO: use L0-L1 boundary self.get_compaction_target_size(), options.flags, ctx, @@ -682,46 +703,30 @@ impl Timeline { .access_stats_behavior(AccessStatsBehavior::Skip) .build(); - // 2. Compact - let timer = self.metrics.compact_time_histo.start_timer(); - let fully_compacted = self - .compact_level0( - target_file_size, - options.flags.contains(CompactFlags::ForceL0Compaction), - ctx, - ) - .await?; - timer.stop_and_record(); - let mut partitioning = dense_partitioning; partitioning .parts .extend(sparse_partitioning.into_dense().parts); - // 3. Create new image layers for partitions that have been modified - // "enough". Skip image layer creation if L0 compaction cannot keep up. - if fully_compacted { - let image_layers = self - .create_image_layers( - &partitioning, - lsn, - if options - .flags - .contains(CompactFlags::ForceImageLayerCreation) - { - ImageLayerCreationMode::Force - } else { - ImageLayerCreationMode::Try - }, - &image_ctx, - ) - .await?; + // 3. Create new image layers for partitions that have been modified "enough". + let image_layers = self + .create_image_layers( + &partitioning, + lsn, + if options + .flags + .contains(CompactFlags::ForceImageLayerCreation) + { + ImageLayerCreationMode::Force + } else { + ImageLayerCreationMode::Try + }, + &image_ctx, + ) + .await?; - self.upload_new_image_layers(image_layers)?; - } else { - info!("skipping image layer generation due to L0 compaction did not include all layers."); - } - (partitioning.parts.len(), !fully_compacted) + self.upload_new_image_layers(image_layers)?; + partitioning.parts.len() } Err(err) => { // no partitioning? This is normal, if the timeline was just created @@ -733,10 +738,12 @@ impl Timeline { if !self.cancel.is_cancelled() && !err.is_cancelled() { tracing::error!("could not compact, repartitioning keyspace failed: {err:?}"); } - (1, false) + 1 } }; + // 4. Shard ancestor compaction + if self.shard_identity.count >= ShardCount::new(2) { // Limit the number of layer rewrites to the number of partitions: this means its // runtime should be comparable to a full round of image layer creations, rather than @@ -746,7 +753,7 @@ impl Timeline { self.compact_shard_ancestors(rewrite_max, ctx).await?; } - Ok(has_pending_tasks) + Ok(false) } /// Check for layers that are elegible to be rewritten: From 77ea9b16fe5ba35dea47358a36bdb22bf51b66ec Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com> Date: Wed, 29 Jan 2025 19:05:40 -0500 Subject: [PATCH 51/72] fix(pageserver): use the larger one of upper limit and threshold (#10571) ## Problem Follow up of https://github.com/neondatabase/neon/pull/10550 in case the upper limit is set larger than threshold. 
It does not make sense for someone to enforce the behavior like "if there are >= 50 L0s, only compact 10 of them". ## Summary of changes Use the maximum of compaction threshold and upper limit when selecting L0 files to compact. Signed-off-by: Alex Chi Z --- pageserver/src/tenant/timeline/compaction.rs | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index 5f0e6dc3ec..5f7b5f1af5 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -1122,7 +1122,13 @@ impl Timeline { // Under normal circumstances, we will accumulate up to compaction_upper_limit L0s of size // checkpoint_distance each. To avoid edge cases using extra system resources, bound our // work in this function to only operate on this much delta data at once. - let delta_size_limit = self.get_compaction_upper_limit() as u64 + // + // In general, compaction_threshold should be <= compaction_upper_limit, but in case that + // the constraint is not respected, we use the larger of the two. + let delta_size_limit = std::cmp::max( + self.get_compaction_upper_limit(), + self.get_compaction_threshold(), + ) as u64 * std::cmp::max(self.get_checkpoint_distance(), DEFAULT_CHECKPOINT_DISTANCE); let mut fully_compacted = true; From a7a706cff7c9e11ac93d6ed487988c3dd0b74569 Mon Sep 17 00:00:00 2001 From: Alexander Lakhin Date: Thu, 30 Jan 2025 11:09:43 +0200 Subject: [PATCH 52/72] Fix submodule reference after #10473 (#10577) --- vendor/postgres-v17 | 2 +- vendor/revisions.json | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/vendor/postgres-v17 b/vendor/postgres-v17 index b654fa88b6..f0ffc8279d 160000 --- a/vendor/postgres-v17 +++ b/vendor/postgres-v17 @@ -1 +1 @@ -Subproject commit b654fa88b6fd2ad24a03a14a7cd417ec66e518f9 +Subproject commit f0ffc8279dbcbbc439981a4fd001a9687e5d665d diff --git a/vendor/revisions.json b/vendor/revisions.json index 982f537692..c3eaeac927 100644 --- a/vendor/revisions.json +++ b/vendor/revisions.json @@ -1,7 +1,7 @@ { "v17": [ "17.2", - "b654fa88b6fd2ad24a03a14a7cd417ec66e518f9" + "f0ffc8279dbcbbc439981a4fd001a9687e5d665d" ], "v16": [ "16.6", From b24727134c5eebd9965297c1a65618cfda5c4a52 Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Thu, 30 Jan 2025 10:27:40 +0100 Subject: [PATCH 53/72] pageserver: improve read amp metric (#10573) ## Problem The current global `pageserver_layers_visited_per_vectored_read_global` metric does not appear to accurately measure read amplification. It divides the layer count by the number of reads in a batch, but this means that e.g. 10 reads with 100 L0 layers will only measure a read amp of 10 per read, while the actual read amp was 100. While the cost of layer visits are amortized across the batch, and some layers may not intersect with a given key, each visited layer contributes directly to the observed latency for every read in the batch, which is what we care about. Touches https://github.com/neondatabase/cloud/issues/23283. Extracted from #10566. ## Summary of changes * Count the number of layers visited towards each read in the batch, instead of the average across the batch. * Rename `pageserver_layers_visited_per_vectored_read_global` to `pageserver_layers_per_read_global`. * Reduce the read amp log warning threshold down from 512 to 100. 
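The accounting change is easiest to see with the 10-reads/100-layers example from the problem statement. The following standalone sketch (illustrative only, not the pageserver code itself) contrasts the two schemes:

```rust
fn main() {
    let layers_visited: u32 = 100; // L0 layers visited while serving the batch
    let reads_in_batch: usize = 10;

    // Old accounting: a single observation of the per-batch average.
    // Each read appears to have a read amp of 10, even though all 100
    // layer visits contributed to its observed latency.
    let old_observation = layers_visited as f64 / reads_in_batch as f64;
    assert_eq!(old_observation, 10.0);

    // New accounting: every read in the batch observes the full layer count.
    let new_observations = vec![layers_visited as f64; reads_in_batch];
    assert!(new_observations.iter().all(|&v| v == 100.0));
}
```

The layer-visit cost is amortized across the batch, but the new scheme reflects the latency actually seen by each individual read.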
--- pageserver/src/metrics.rs | 16 ++++++++----- pageserver/src/tenant/timeline.rs | 31 ++++++++++++++------------ test_runner/regress/test_compaction.py | 10 ++++----- 3 files changed, 33 insertions(+), 24 deletions(-) diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index 77c0967afc..30f3a49a9d 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -116,11 +116,17 @@ pub(crate) static STORAGE_TIME_GLOBAL: Lazy = Lazy::new(|| { .expect("failed to define a metric") }); -pub(crate) static VEC_READ_NUM_LAYERS_VISITED: Lazy = Lazy::new(|| { +/// Measures layers visited per read (i.e. read amplification). +/// +/// NB: for a batch, we count all visited layers towards each read. While the cost of layer visits +/// are amortized across the batch, and some layers may not intersect with a given key, each visited +/// layer contributes directly to the observed latency for every read in the batch, which is what we +/// care about. +pub(crate) static LAYERS_PER_READ_GLOBAL: Lazy = Lazy::new(|| { register_histogram!( - "pageserver_layers_visited_per_vectored_read_global", - "Average number of layers visited to reconstruct one key", - vec![1.0, 4.0, 8.0, 16.0, 32.0, 64.0, 128.0, 256.0, 512.0, 1024.0], + "pageserver_layers_per_read_global", + "Layers visited to serve a single read (read amplification). In a batch, all visited layers count towards every read.", + vec![1.0, 2.0, 4.0, 8.0, 16.0, 32.0, 64.0, 128.0, 256.0, 512.0, 1024.0], ) .expect("failed to define a metric") }); @@ -3912,7 +3918,7 @@ pub fn preinitialize_metrics(conf: &'static PageServerConf) { // histograms [ - &VEC_READ_NUM_LAYERS_VISITED, + &LAYERS_PER_READ_GLOBAL, &WAIT_LSN_TIME, &WAL_REDO_TIME, &WAL_REDO_RECORDS_HISTOGRAM, diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index b4b30fcd23..71f0b4c9bf 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -51,6 +51,7 @@ use tokio::{ }; use tokio_util::sync::CancellationToken; use tracing::*; +use utils::rate_limit::RateLimit; use utils::{ fs_ext, guard_arc_swap::GuardArcSwap, @@ -115,7 +116,7 @@ use pageserver_api::config::tenant_conf_defaults::DEFAULT_PITR_INTERVAL; use crate::config::PageServerConf; use crate::keyspace::{KeyPartitioning, KeySpace}; -use crate::metrics::TimelineMetrics; +use crate::metrics::{TimelineMetrics, LAYERS_PER_READ_GLOBAL}; use crate::pgdatadir_mapping::CalculateLogicalSizeError; use crate::tenant::config::TenantConfOpt; use pageserver_api::reltag::RelTag; @@ -1044,7 +1045,7 @@ impl Timeline { } pub(crate) const MAX_GET_VECTORED_KEYS: u64 = 32; - pub(crate) const VEC_GET_LAYERS_VISITED_WARN_THRESH: f64 = 512.0; + pub(crate) const LAYERS_VISITED_WARN_THRESHOLD: u32 = 100; /// Look up multiple page versions at a given LSN /// @@ -1221,25 +1222,27 @@ impl Timeline { // (this is a requirement, not a bug). Skip updating the metric in these cases // to avoid infinite results. if !results.is_empty() { - let avg = layers_visited as f64 / results.len() as f64; - if avg >= Self::VEC_GET_LAYERS_VISITED_WARN_THRESH { - use utils::rate_limit::RateLimit; - static LOGGED: Lazy> = + // Record the total number of layers visited towards each key in the batch. While some + // layers may not intersect with a given read, and the cost of layer visits are + // amortized across the batch, each visited layer contributes directly to the observed + // latency for every read in the batch, which is what we care about. 
+ if layers_visited >= Self::LAYERS_VISITED_WARN_THRESHOLD { + static LOG_PACER: Lazy> = Lazy::new(|| Mutex::new(RateLimit::new(Duration::from_secs(60)))); - let mut rate_limit = LOGGED.lock().unwrap(); - rate_limit.call(|| { + LOG_PACER.lock().unwrap().call(|| { + let num_keys = keyspace.total_raw_size(); + let num_pages = results.len(); tracing::info!( shard_id = %self.tenant_shard_id.shard_slug(), lsn = %lsn, - "Vectored read for {} visited {} layers on average per key and {} in total. {}/{} pages were returned", - keyspace, avg, layers_visited, results.len(), keyspace.total_raw_size()); + "Vectored read for {keyspace} visited {layers_visited} layers. Returned {num_pages}/{num_keys} pages.", + ); }); } - // Note that this is an approximation. Tracking the exact number of layers visited - // per key requires virtually unbounded memory usage and is inefficient - // (i.e. segment tree tracking each range queried from a layer) - crate::metrics::VEC_READ_NUM_LAYERS_VISITED.observe(avg); + for _ in &results { + LAYERS_PER_READ_GLOBAL.observe(layers_visited as f64); + } } Ok(results) diff --git a/test_runner/regress/test_compaction.py b/test_runner/regress/test_compaction.py index 2edfc884ad..763a63c2e5 100644 --- a/test_runner/regress/test_compaction.py +++ b/test_runner/regress/test_compaction.py @@ -86,9 +86,9 @@ page_cache_size=10 log.info("Checking layer access metrics ...") layer_access_metric_names = [ - "pageserver_layers_visited_per_vectored_read_global_sum", - "pageserver_layers_visited_per_vectored_read_global_count", - "pageserver_layers_visited_per_vectored_read_global_bucket", + "pageserver_layers_per_read_global_sum", + "pageserver_layers_per_read_global_count", + "pageserver_layers_per_read_global_bucket", ] metrics = env.pageserver.http_client().get_metrics() @@ -96,8 +96,8 @@ page_cache_size=10 layer_access_metrics = metrics.query_all(name) log.info(f"Got metrics: {layer_access_metrics}") - vectored_sum = metrics.query_one("pageserver_layers_visited_per_vectored_read_global_sum") - vectored_count = metrics.query_one("pageserver_layers_visited_per_vectored_read_global_count") + vectored_sum = metrics.query_one("pageserver_layers_per_read_global_sum") + vectored_count = metrics.query_one("pageserver_layers_per_read_global_count") if vectored_count.value > 0: assert vectored_sum.value > 0 vectored_average = vectored_sum.value / vectored_count.value From d3db96c2112381e7a0601719e2e7d55d11d946ed Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Thu, 30 Jan 2025 11:55:07 +0100 Subject: [PATCH 54/72] pageserver: add `pageserver_deltas_per_read_global` metric (#10570) ## Problem We suspect that Postgres checkpoints will limit the number of page deltas necessary to reconstruct a page, but don't know for certain. Touches https://github.com/neondatabase/cloud/issues/23283. ## Summary of changes Add `pageserver_deltas_per_read_global` metric. This pairs with `pageserver_layers_per_read_global` from #10573. 
--- pageserver/src/metrics.rs | 11 +++++++++++ pageserver/src/tenant/storage_layer.rs | 10 ++++++++++ pageserver/src/tenant/timeline.rs | 3 ++- 3 files changed, 23 insertions(+), 1 deletion(-) diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index 30f3a49a9d..bf75ea3bc6 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -131,6 +131,16 @@ pub(crate) static LAYERS_PER_READ_GLOBAL: Lazy = Lazy::new(|| { .expect("failed to define a metric") }); +pub(crate) static DELTAS_PER_READ_GLOBAL: Lazy = Lazy::new(|| { + // We expect this to be low because of Postgres checkpoints. Let's see if that holds. + register_histogram!( + "pageserver_deltas_per_read_global", + "Number of delta pages applied to image page per read", + vec![0.0, 1.0, 2.0, 4.0, 8.0, 16.0, 32.0, 64.0, 128.0, 256.0], + ) + .expect("failed to define a metric") +}); + pub(crate) static CONCURRENT_INITDBS: Lazy = Lazy::new(|| { register_uint_gauge!( "pageserver_concurrent_initdb", @@ -3919,6 +3929,7 @@ pub fn preinitialize_metrics(conf: &'static PageServerConf) { // histograms [ &LAYERS_PER_READ_GLOBAL, + &DELTAS_PER_READ_GLOBAL, &WAIT_LSN_TIME, &WAL_REDO_TIME, &WAL_REDO_RECORDS_HISTOGRAM, diff --git a/pageserver/src/tenant/storage_layer.rs b/pageserver/src/tenant/storage_layer.rs index c1fe67c87c..3800852ccc 100644 --- a/pageserver/src/tenant/storage_layer.rs +++ b/pageserver/src/tenant/storage_layer.rs @@ -80,6 +80,16 @@ pub(crate) struct ValueReconstructState { pub(crate) img: Option<(Lsn, Bytes)>, } +impl ValueReconstructState { + /// Returns the number of page deltas applied to the page image. + pub fn num_deltas(&self) -> usize { + match self.img { + Some(_) => self.records.len(), + None => self.records.len() - 1, // omit will_init record + } + } +} + #[derive(Clone, Copy, Debug, Default, Eq, PartialEq)] pub(crate) enum ValueReconstructSituation { Complete, diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 71f0b4c9bf..4a69376239 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -116,7 +116,7 @@ use pageserver_api::config::tenant_conf_defaults::DEFAULT_PITR_INTERVAL; use crate::config::PageServerConf; use crate::keyspace::{KeyPartitioning, KeySpace}; -use crate::metrics::{TimelineMetrics, LAYERS_PER_READ_GLOBAL}; +use crate::metrics::{TimelineMetrics, DELTAS_PER_READ_GLOBAL, LAYERS_PER_READ_GLOBAL}; use crate::pgdatadir_mapping::CalculateLogicalSizeError; use crate::tenant::config::TenantConfOpt; use pageserver_api::reltag::RelTag; @@ -1195,6 +1195,7 @@ impl Timeline { return (key, Err(err)); } }; + DELTAS_PER_READ_GLOBAL.observe(converted.num_deltas() as f64); // The walredo module expects the records to be descending in terms of Lsn. // And we submit the IOs in that order, so, there shuold be no need to sort here. 
From 8804d5894336762d077147e2dac9b780653f2a8c Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Thu, 30 Jan 2025 11:18:07 +0000 Subject: [PATCH 55/72] Nightly Benchmarks: use pgbench from artifacts (#10370) We don't use statically linked OpenSSL anymore (#10302), it's ok to switch to Neon's pgbench for pgvector benchmarks --- .github/workflows/benchmarking.yml | 55 +++++++++--------------------- 1 file changed, 16 insertions(+), 39 deletions(-) diff --git a/.github/workflows/benchmarking.yml b/.github/workflows/benchmarking.yml index 32747d825c..9446b4d17b 100644 --- a/.github/workflows/benchmarking.yml +++ b/.github/workflows/benchmarking.yml @@ -319,7 +319,7 @@ jobs: { "pg_version": 16, "region_id": "azure-eastus2", "platform": "neonvm-azure-captest-new", "db_size": "50gb","runner": '"$runner_azure"', "image": "neondatabase/build-tools:pinned-bookworm" }, { "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-sharding-reuse", "db_size": "50gb","runner": '"$runner_default"', "image": "'"$image_default"'" }, { "pg_version": 17, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-freetier", "db_size": "3gb" ,"runner": '"$runner_default"', "image": "'"$image_default"'" }, - { "pg_version": 17, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-new", "db_size": "10gb","runner": '"$runner_default"', "image": "'"$image_default"'" }, + { "pg_version": 17, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-new", "db_size": "10gb","runner": '"$runner_default"', "image": "'"$image_default"'" }, { "pg_version": 17, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-new-many-tables","db_size": "10gb","runner": '"$runner_default"', "image": "'"$image_default"'" }, { "pg_version": 17, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-new", "db_size": "50gb","runner": '"$runner_default"', "image": "'"$image_default"'" }] }' @@ -458,7 +458,7 @@ jobs: echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT - # we want to compare Neon project OLTP throughput and latency at scale factor 10 GB + # we want to compare Neon project OLTP throughput and latency at scale factor 10 GB # without (neonvm-captest-new) # and with (neonvm-captest-new-many-tables) many relations in the database - name: Create many relations before the run @@ -590,36 +590,20 @@ jobs: steps: - uses: actions/checkout@v4 - # until https://github.com/neondatabase/neon/issues/8275 is fixed we temporarily install postgresql-16 - # instead of using Neon artifacts containing pgbench - - name: Install postgresql-16 where pytest expects it - run: | - # Just to make it easier to test things locally on macOS (with arm64) - arch=$(uname -m | sed 's/x86_64/amd64/g' | sed 's/aarch64/arm64/g') + - name: Configure AWS credentials + uses: aws-actions/configure-aws-credentials@v4 + with: + aws-region: eu-central-1 + role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + role-duration-seconds: 18000 # 5 hours - cd /home/nonroot - wget -q "https://apt.postgresql.org/pub/repos/apt/pool/main/p/postgresql-17/libpq5_17.2-1.pgdg120+1_${arch}.deb" - wget -q "https://apt.postgresql.org/pub/repos/apt/pool/main/p/postgresql-16/postgresql-client-16_16.6-1.pgdg120+1_${arch}.deb" - wget -q "https://apt.postgresql.org/pub/repos/apt/pool/main/p/postgresql-16/postgresql-16_16.6-1.pgdg120+1_${arch}.deb" - dpkg -x libpq5_17.2-1.pgdg120+1_${arch}.deb pg - dpkg -x postgresql-16_16.6-1.pgdg120+1_${arch}.deb pg - dpkg -x postgresql-client-16_16.6-1.pgdg120+1_${arch}.deb pg - - 
mkdir -p /tmp/neon/pg_install/v16/bin - mkdir -p /tmp/neon/pg_install/v17/bin - ln -s /home/nonroot/pg/usr/lib/postgresql/16/bin/pgbench /tmp/neon/pg_install/v16/bin/pgbench - ln -s /home/nonroot/pg/usr/lib/postgresql/16/bin/psql /tmp/neon/pg_install/v16/bin/psql - ln -s /home/nonroot/pg/usr/lib/$(uname -m)-linux-gnu /tmp/neon/pg_install/v16/lib - ln -s /home/nonroot/pg/usr/lib/postgresql/16/bin/pgbench /tmp/neon/pg_install/v17/bin/pgbench - ln -s /home/nonroot/pg/usr/lib/postgresql/16/bin/psql /tmp/neon/pg_install/v17/bin/psql - ln -s /home/nonroot/pg/usr/lib/$(uname -m)-linux-gnu /tmp/neon/pg_install/v17/lib - - LD_LIBRARY_PATH="/home/nonroot/pg/usr/lib/$(uname -m)-linux-gnu:${LD_LIBRARY_PATH:-}" - export LD_LIBRARY_PATH - echo "LD_LIBRARY_PATH=${LD_LIBRARY_PATH}" >> ${GITHUB_ENV} - - /tmp/neon/pg_install/v16/bin/pgbench --version - /tmp/neon/pg_install/v16/bin/psql --version + - name: Download Neon artifact + uses: ./.github/actions/download + with: + name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact + path: /tmp/neon/ + prefix: latest + aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} - name: Set up Connection String id: set-up-connstr @@ -642,13 +626,6 @@ jobs: echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT - - name: Configure AWS credentials - uses: aws-actions/configure-aws-credentials@v4 - with: - aws-region: eu-central-1 - role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} - role-duration-seconds: 18000 # 5 hours - - name: Benchmark pgvector hnsw indexing uses: ./.github/actions/run-python-test-set with: @@ -906,7 +883,7 @@ jobs: exit 1 ;; esac - + echo "CONNSTR_SECRET_NAME=${CONNSTR_SECRET_NAME}" >> $GITHUB_ENV - name: Set up Connection String From 6a2afa0c0216c183e51d68d2730ed67862247de2 Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Thu, 30 Jan 2025 12:24:49 +0100 Subject: [PATCH 56/72] pageserver: add per-timeline read amp histogram (#10566) ## Problem We don't have per-timeline observability for read amplification. Touches https://github.com/neondatabase/cloud/issues/23283. ## Summary of changes Add a per-timeline `pageserver_layers_per_read` histogram. NB: per-timeline histograms are expensive, but probably worth it in this case. --- pageserver/src/metrics.rs | 19 +++++++++++++++++++ pageserver/src/tenant/timeline.rs | 1 + test_runner/fixtures/metrics.py | 3 +++ 3 files changed, 23 insertions(+) diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index bf75ea3bc6..f9edf88553 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -122,6 +122,17 @@ pub(crate) static STORAGE_TIME_GLOBAL: Lazy = Lazy::new(|| { /// are amortized across the batch, and some layers may not intersect with a given key, each visited /// layer contributes directly to the observed latency for every read in the batch, which is what we /// care about. +pub(crate) static LAYERS_PER_READ: Lazy = Lazy::new(|| { + register_histogram_vec!( + "pageserver_layers_per_read", + "Layers visited to serve a single read (read amplification). In a batch, all visited layers count towards every read.", + &["tenant_id", "shard_id", "timeline_id"], + // Low resolution to reduce cardinality. 
+ vec![1.0, 5.0, 10.0, 25.0, 50.0, 100.0], + ) + .expect("failed to define a metric") +}); + pub(crate) static LAYERS_PER_READ_GLOBAL: Lazy = Lazy::new(|| { register_histogram!( "pageserver_layers_per_read_global", @@ -2648,6 +2659,7 @@ pub(crate) struct TimelineMetrics { pub disk_consistent_lsn_gauge: IntGauge, pub pitr_history_size: UIntGauge, pub archival_size: UIntGauge, + pub layers_per_read: Histogram, pub standby_horizon_gauge: IntGauge, pub resident_physical_size_gauge: UIntGauge, pub visible_physical_size_gauge: UIntGauge, @@ -2745,6 +2757,10 @@ impl TimelineMetrics { .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id]) .unwrap(); + let layers_per_read = LAYERS_PER_READ + .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id]) + .unwrap(); + let standby_horizon_gauge = STANDBY_HORIZON .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id]) .unwrap(); @@ -2809,6 +2825,7 @@ impl TimelineMetrics { disk_consistent_lsn_gauge, pitr_history_size, archival_size, + layers_per_read, standby_horizon_gauge, resident_physical_size_gauge, visible_physical_size_gauge, @@ -2978,6 +2995,8 @@ impl TimelineMetrics { } } + let _ = LAYERS_PER_READ.remove_label_values(&[tenant_id, shard_id, timeline_id]); + let _ = EVICTIONS.remove_label_values(&[tenant_id, shard_id, timeline_id]); let _ = AUX_FILE_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]); let _ = VALID_LSN_LEASE_COUNT.remove_label_values(&[tenant_id, shard_id, timeline_id]); diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 4a69376239..ab4b3cac63 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -1242,6 +1242,7 @@ impl Timeline { } for _ in &results { + self.metrics.layers_per_read.observe(layers_visited as f64); LAYERS_PER_READ_GLOBAL.observe(layers_visited as f64); } } diff --git a/test_runner/fixtures/metrics.py b/test_runner/fixtures/metrics.py index fd7e193778..83a1a87611 100644 --- a/test_runner/fixtures/metrics.py +++ b/test_runner/fixtures/metrics.py @@ -158,6 +158,9 @@ PAGESERVER_PER_TENANT_METRICS: tuple[str, ...] = ( "pageserver_pitr_history_size", "pageserver_layer_bytes", "pageserver_layer_count", + "pageserver_layers_per_read_bucket", + "pageserver_layers_per_read_count", + "pageserver_layers_per_read_sum", "pageserver_visible_physical_size", "pageserver_storage_operations_seconds_count_total", "pageserver_storage_operations_seconds_sum_total", From ab627ad9fd355e6a84a7403accee307284f8e017 Mon Sep 17 00:00:00 2001 From: John Spray Date: Thu, 30 Jan 2025 11:54:02 +0000 Subject: [PATCH 57/72] storcon_cli: fix spurious error setting preferred AZ (#10568) ## Problem The client code for `tenant-set-preferred-az` declared response type `()`, so printed a spurious error on each use: ``` Error: receive body: error decoding response body: invalid type: map, expected unit at line 1 column 0 ``` The requests were successful anyway. ## Summary of changes - Declare the proper return type, so that the command succeeds quietly. 
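The error in the problem statement comes straight from serde: a JSON object body cannot be deserialized into the unit type `()`. A minimal standalone reproduction is below; the body shape is made up for illustration and is not the real `ShardsPreferredAzsResponse` schema.

```rust
use serde_json::Value;

fn main() {
    // Hypothetical response body; the real schema lives in pageserver_api.
    let body = r#"{"updated": []}"#;

    // Declaring the response type as `()` fails with
    // "invalid type: map, expected unit", which the CLI surfaces as
    // "error decoding response body" even though the request succeeded.
    let as_unit: Result<(), _> = serde_json::from_str(body);
    assert!(as_unit.is_err());

    // Declaring a type that accepts the JSON object (here just Value)
    // decodes cleanly, so the command completes quietly.
    let as_value: Value = serde_json::from_str(body).unwrap();
    assert!(as_value.is_object());
}
```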
--- control_plane/storcon_cli/src/main.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/control_plane/storcon_cli/src/main.rs b/control_plane/storcon_cli/src/main.rs index d9b76b9600..985fe6b3b1 100644 --- a/control_plane/storcon_cli/src/main.rs +++ b/control_plane/storcon_cli/src/main.rs @@ -10,8 +10,8 @@ use pageserver_api::{ controller_api::{ AvailabilityZone, NodeAvailabilityWrapper, NodeDescribeResponse, NodeShardResponse, SafekeeperDescribeResponse, SafekeeperSchedulingPolicyRequest, ShardSchedulingPolicy, - ShardsPreferredAzsRequest, SkSchedulingPolicy, TenantCreateRequest, TenantDescribeResponse, - TenantPolicyRequest, + ShardsPreferredAzsRequest, ShardsPreferredAzsResponse, SkSchedulingPolicy, + TenantCreateRequest, TenantDescribeResponse, TenantPolicyRequest, }, models::{ EvictionPolicy, EvictionPolicyLayerAccessThreshold, LocationConfigSecondary, @@ -800,7 +800,7 @@ async fn main() -> anyhow::Result<()> { .collect(), }; storcon_client - .dispatch::( + .dispatch::( Method::PUT, "control/v1/preferred_azs".to_string(), Some(req), From 93714c4c7bc472a80eb0e13c6e33ea3bf19389d1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Thu, 30 Jan 2025 13:03:36 +0100 Subject: [PATCH 58/72] secondary downloader: load metadata on loading of timeline (#10539) Related to #10308, we might have legitimate changes in file size or generation. Those changes should not cause warn log lines. In order to detect changes of the generation number while the file size stayed the same, load the metadata that we store on disk on loading of the timeline. Still do a comparison with the on-disk layer sizes to find any discrepancies that might occur due to race conditions (new metadata file gets written but layer file has not been updated yet, and PS shuts down). However, as it's possible to hit it in a race conditon, downgrade it to a warning. Also fix a mistake in #10529: we want to compare the old with the new metadata, not the old metadata with itself. --- pageserver/src/tenant/secondary/downloader.rs | 101 ++++++++++++++---- pageserver/src/virtual_file.rs | 53 +++++---- 2 files changed, 114 insertions(+), 40 deletions(-) diff --git a/pageserver/src/tenant/secondary/downloader.rs b/pageserver/src/tenant/secondary/downloader.rs index cf524fcb25..2e8c3946bd 100644 --- a/pageserver/src/tenant/secondary/downloader.rs +++ b/pageserver/src/tenant/secondary/downloader.rs @@ -673,12 +673,30 @@ impl<'a> TenantDownloader<'a> { HeatMapDownload::Modified(m) => m, }; - let heatmap = serde_json::from_slice::(&heatmap_bytes)?; - - // Save the heatmap: this will be useful on restart, allowing us to reconstruct - // layer metadata without having to re-download it. 
+ // Heatmap storage location let heatmap_path = self.conf.tenant_heatmap_path(tenant_shard_id); + let last_heatmap = if last_download.is_none() { + match load_heatmap(&heatmap_path, ctx).await { + Ok(htm) => htm, + Err(e) => { + tracing::warn!("Couldn't load heatmap from {heatmap_path}: {e:?}"); + None + } + } + } else { + None + }; + + let last_heatmap_timelines = last_heatmap.as_ref().map(|htm| { + htm.timelines + .iter() + .map(|tl| (tl.timeline_id, tl)) + .collect::>() + }); + + let heatmap = serde_json::from_slice::(&heatmap_bytes)?; + let temp_path = path_with_suffix_extension(&heatmap_path, TEMP_FILE_SUFFIX); let context_msg = format!("write tenant {tenant_shard_id} heatmap to {heatmap_path}"); let heatmap_path_bg = heatmap_path.clone(); @@ -707,10 +725,17 @@ impl<'a> TenantDownloader<'a> { let timeline_state = match timeline_state { Some(t) => t, None => { + let last_heatmap = + last_heatmap_timelines + .as_ref() + .and_then(|last_heatmap_timelines| { + last_heatmap_timelines.get(&timeline.timeline_id).copied() + }); // We have no existing state: need to scan local disk for layers first. let timeline_state = init_timeline_state( self.conf, tenant_shard_id, + last_heatmap, timeline, &self.secondary_state.resident_size_metric, ) @@ -1079,12 +1104,12 @@ impl<'a> TenantDownloader<'a> { } } - if on_disk.metadata.generation_file_size() != on_disk.metadata.generation_file_size() { + if on_disk.metadata.generation_file_size() != layer.metadata.generation_file_size() { tracing::info!( "Re-downloading layer {} with changed size or generation: {:?}->{:?}", layer.name, on_disk.metadata.generation_file_size(), - on_disk.metadata.generation_file_size() + layer.metadata.generation_file_size() ); return LayerAction::Download; } @@ -1277,6 +1302,7 @@ impl<'a> TenantDownloader<'a> { async fn init_timeline_state( conf: &'static PageServerConf, tenant_shard_id: &TenantShardId, + last_heatmap: Option<&HeatMapTimeline>, heatmap: &HeatMapTimeline, resident_metric: &UIntGauge, ) -> SecondaryDetailTimeline { @@ -1306,6 +1332,13 @@ async fn init_timeline_state( let heatmap_metadata: HashMap<&LayerName, &HeatMapLayer> = heatmap.layers.iter().map(|l| (&l.name, l)).collect(); + let last_heatmap_metadata: HashMap<&LayerName, &HeatMapLayer> = + if let Some(last_heatmap) = last_heatmap { + last_heatmap.layers.iter().map(|l| (&l.name, l)).collect() + } else { + HashMap::new() + }; + while let Some(dentry) = dir .next_entry() .await @@ -1339,18 +1372,32 @@ async fn init_timeline_state( match LayerName::from_str(file_name) { Ok(name) => { let remote_meta = heatmap_metadata.get(&name); + let last_meta = last_heatmap_metadata.get(&name); + let mut remove = false; match remote_meta { Some(remote_meta) => { + let last_meta_generation_file_size = last_meta + .map(|m| m.metadata.generation_file_size()) + .unwrap_or(remote_meta.metadata.generation_file_size()); // TODO: checksums for layers (https://github.com/neondatabase/neon/issues/2784) - if local_meta.len() != remote_meta.metadata.file_size { - // This should not happen, because we do crashsafe write-then-rename when downloading - // layers, and layers in remote storage are immutable. Remove the local file because - // we cannot trust it. 
- tracing::warn!( + if remote_meta.metadata.generation_file_size() + != last_meta_generation_file_size + { + tracing::info!( + "Removing local layer {name} as on-disk json metadata has different generation or file size from remote: {:?} -> {:?}", + last_meta_generation_file_size, + remote_meta.metadata.generation_file_size() + ); + remove = true; + } else if local_meta.len() != remote_meta.metadata.file_size { + // This can happen in the presence of race conditions: the remote and on-disk metadata have changed, but we haven't had + // the chance yet to download the new layer to disk, before the process restarted. + tracing::info!( "Removing local layer {name} with unexpected local size {} != {}", local_meta.len(), remote_meta.metadata.file_size ); + remove = true; } else { // We expect the access time to be initialized immediately afterwards, when // the latest heatmap is applied to the state. @@ -1372,15 +1419,18 @@ async fn init_timeline_state( "Removing secondary local layer {} because it's absent in heatmap", name ); - tokio::fs::remove_file(&dentry.path()) - .await - .or_else(fs_ext::ignore_not_found) - .fatal_err(&format!( - "Removing layer {}", - dentry.path().to_string_lossy() - )); + remove = true; } } + if remove { + tokio::fs::remove_file(&dentry.path()) + .await + .or_else(fs_ext::ignore_not_found) + .fatal_err(&format!( + "Removing layer {}", + dentry.path().to_string_lossy() + )); + } } Err(_) => { // Ignore it. @@ -1391,3 +1441,18 @@ async fn init_timeline_state( detail } + +/// Loads a json-encoded heatmap file from the provided on-disk path +async fn load_heatmap( + path: &Utf8PathBuf, + ctx: &RequestContext, +) -> Result, anyhow::Error> { + let mut file = match VirtualFile::open(path, ctx).await { + Ok(file) => file, + Err(e) if e.kind() == std::io::ErrorKind::NotFound => return Ok(None), + Err(e) => Err(e)?, + }; + let st = file.read_to_string(ctx).await?; + let htm = serde_json::from_str(&st)?; + Ok(Some(htm)) +} diff --git a/pageserver/src/virtual_file.rs b/pageserver/src/virtual_file.rs index 8a7f4a4bf5..9d539198c7 100644 --- a/pageserver/src/virtual_file.rs +++ b/pageserver/src/virtual_file.rs @@ -234,6 +234,19 @@ impl VirtualFile { ) -> (FullSlice, Result) { self.inner.write_all(buf, ctx).await } + + async fn read_to_end(&mut self, buf: &mut Vec, ctx: &RequestContext) -> Result<(), Error> { + self.inner.read_to_end(buf, ctx).await + } + + pub(crate) async fn read_to_string( + &mut self, + ctx: &RequestContext, + ) -> Result { + let mut buf = Vec::new(); + self.read_to_end(&mut buf, ctx).await?; + Ok(String::from_utf8(buf)?) 
+ } } /// Indicates whether to enable fsync, fdatasync, or O_SYNC/O_DSYNC when writing @@ -993,6 +1006,24 @@ impl VirtualFileInner { (buf, result) }) } + + async fn read_to_end(&mut self, buf: &mut Vec, ctx: &RequestContext) -> Result<(), Error> { + let mut tmp = vec![0; 128]; + loop { + let slice = tmp.slice(..128); + let (slice, res) = self.read_at(slice, self.pos, ctx).await; + match res { + Ok(0) => return Ok(()), + Ok(n) => { + self.pos += n as u64; + buf.extend_from_slice(&slice[..n]); + } + Err(ref e) if e.kind() == std::io::ErrorKind::Interrupted => {} + Err(e) => return Err(e), + } + tmp = slice.into_inner(); + } + } } // Adapted from https://doc.rust-lang.org/1.72.0/src/std/os/unix/fs.rs.html#117-135 @@ -1237,10 +1268,6 @@ impl VirtualFile { ) -> Result, std::io::Error> { self.inner.read_blk(blknum, ctx).await } - - async fn read_to_end(&mut self, buf: &mut Vec, ctx: &RequestContext) -> Result<(), Error> { - self.inner.read_to_end(buf, ctx).await - } } #[cfg(test)] @@ -1260,24 +1287,6 @@ impl VirtualFileInner { slice.into_inner(), )) } - - async fn read_to_end(&mut self, buf: &mut Vec, ctx: &RequestContext) -> Result<(), Error> { - let mut tmp = vec![0; 128]; - loop { - let slice = tmp.slice(..128); - let (slice, res) = self.read_at(slice, self.pos, ctx).await; - match res { - Ok(0) => return Ok(()), - Ok(n) => { - self.pos += n as u64; - buf.extend_from_slice(&slice[..n]); - } - Err(ref e) if e.kind() == std::io::ErrorKind::Interrupted => {} - Err(e) => return Err(e), - } - tmp = slice.into_inner(); - } - } } impl Drop for VirtualFileInner { From be51b10da73cf0e7c3109e8b6b2f5f3ad9f219b9 Mon Sep 17 00:00:00 2001 From: Alexey Kondratov Date: Thu, 30 Jan 2025 15:31:49 +0100 Subject: [PATCH 59/72] chore(compute): Print some compute_ctl errors in debug mode (#10586) ## Problem In some cases, we were returning a very shallow error like `error sending request for url (XXX)`, which made it very hard to figure out the actual error. ## Summary of changes Use `{:?}` in a few places, and remove it from places where we were printing a string anyway. 
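As a side note on why `{:?}` helps here: `Display` on a wrapped error typically prints only the outermost message, while `Debug` on `anyhow`-style errors also prints the `Caused by:` chain. A minimal sketch under that assumption, with a hypothetical `fetch_spec` standing in for the real request path:

```rust
use anyhow::Context;
use std::io;

// Hypothetical stand-in for a request that fails with a nested cause.
fn fetch_spec() -> anyhow::Result<String> {
    let cause = io::Error::new(
        io::ErrorKind::ConnectionRefused,
        "connection refused (os error 111)",
    );
    Err(cause).context("error sending request for url (http://example.invalid/spec)")
}

fn main() {
    let err = fetch_spec().unwrap_err();

    // `{}` gives only the shallow outer message, as described in the problem statement.
    println!("{}", err);

    // `{:?}` also prints the "Caused by:" chain, which is what the log lines gain.
    println!("{:?}", err);
}
```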
--- compute_tools/src/extension_server.rs | 14 +++++++------- compute_tools/src/migration.rs | 2 +- compute_tools/src/spec.rs | 4 ++-- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/compute_tools/src/extension_server.rs b/compute_tools/src/extension_server.rs index 64c338f4d7..00f46386e7 100644 --- a/compute_tools/src/extension_server.rs +++ b/compute_tools/src/extension_server.rs @@ -258,14 +258,11 @@ pub fn create_control_files(remote_extensions: &RemoteExtSpec, pgbin: &str) { async fn download_extension_tar(ext_remote_storage: &str, ext_path: &str) -> Result { let uri = format!("{}/{}", ext_remote_storage, ext_path); - info!("Download extension {:?} from uri {:?}", ext_path, uri); + info!("Download extension {} from uri {}", ext_path, uri); match do_extension_server_request(&uri).await { Ok(resp) => { - info!( - "Successfully downloaded remote extension data {:?}", - ext_path - ); + info!("Successfully downloaded remote extension data {}", ext_path); REMOTE_EXT_REQUESTS_TOTAL .with_label_values(&[&StatusCode::OK.to_string()]) .inc(); @@ -285,7 +282,10 @@ async fn download_extension_tar(ext_remote_storage: &str, ext_path: &str) -> Res async fn do_extension_server_request(uri: &str) -> Result { let resp = reqwest::get(uri).await.map_err(|e| { ( - format!("could not perform remote extensions server request: {}", e), + format!( + "could not perform remote extensions server request: {:?}", + e + ), UNKNOWN_HTTP_STATUS.to_string(), ) })?; @@ -295,7 +295,7 @@ async fn do_extension_server_request(uri: &str) -> Result match resp.bytes().await { Ok(resp) => Ok(resp), Err(e) => Err(( - format!("could not read remote extensions server response: {}", e), + format!("could not read remote extensions server response: {:?}", e), // It's fine to return and report error with status as 200 OK, // because we still failed to read the response. status.to_string(), diff --git a/compute_tools/src/migration.rs b/compute_tools/src/migration.rs index aa3c6b01f0..7b7b042d84 100644 --- a/compute_tools/src/migration.rs +++ b/compute_tools/src/migration.rs @@ -125,7 +125,7 @@ impl<'m> MigrationRunner<'m> { info!("Finished migration id={}", migration_id); } Err(e) => { - error!("Failed to run migration id={}: {}", migration_id, e); + error!("Failed to run migration id={}: {:?}", migration_id, e); DB_MIGRATION_FAILED .with_label_values(&[migration_id.to_string().as_str()]) .inc(); diff --git a/compute_tools/src/spec.rs b/compute_tools/src/spec.rs index 43a820885b..37d5d3a1a6 100644 --- a/compute_tools/src/spec.rs +++ b/compute_tools/src/spec.rs @@ -28,7 +28,7 @@ fn do_control_plane_request( .map_err(|e| { ( true, - format!("could not perform spec request to control plane: {}", e), + format!("could not perform spec request to control plane: {:?}", e), UNKNOWN_HTTP_STATUS.to_string(), ) })?; @@ -39,7 +39,7 @@ fn do_control_plane_request( Ok(spec_resp) => Ok(spec_resp), Err(e) => Err(( true, - format!("could not deserialize control plane response: {}", e), + format!("could not deserialize control plane response: {:?}", e), status.to_string(), )), }, From cf6dee946efa212078cbc6781549f4d5e27e8255 Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com> Date: Thu, 30 Jan 2025 10:25:29 -0500 Subject: [PATCH 60/72] fix(pageserver): gc-compaction race with read (#10543) ## Problem close https://github.com/neondatabase/neon/issues/10482 ## Summary of changes Add an extra lock on the read path to protect against races. 
The read path has an implication that only certain kind of compactions can be performed. Garbage keys must first have an image layer covering the range, and then being gc-ed -- they cannot be done in one operation. An alternative to fix this is to move the layers read guard to be acquired at the beginning of `get_vectored_reconstruct_data_timeline`, but that was intentionally optimized out and I don't want to regress. The race is not limited to image layers. Gc-compaction will consolidate deltas automatically and produce a flat delta layer (i.e., when we have retain_lsns below the gc-horizon). The same race would also cause behaviors like getting an un-replayable key history as in https://github.com/neondatabase/neon/issues/10049. Signed-off-by: Alex Chi Z --- pageserver/src/tenant/timeline.rs | 6 ++++ pageserver/src/tenant/timeline/compaction.rs | 37 +++++++++++++++++++- 2 files changed, 42 insertions(+), 1 deletion(-) diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index ab4b3cac63..f387c81c29 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -341,6 +341,8 @@ pub struct Timeline { // Needed to ensure that we can't create a branch at a point that was already garbage collected pub latest_gc_cutoff_lsn: Rcu, + pub(crate) gc_compaction_layer_update_lock: tokio::sync::RwLock<()>, + // List of child timelines and their branch points. This is needed to avoid // garbage collecting data that is still needed by the child timelines. pub(crate) gc_info: std::sync::RwLock, @@ -2437,6 +2439,7 @@ impl Timeline { shard_identity, pg_version, layers: Default::default(), + gc_compaction_layer_update_lock: tokio::sync::RwLock::new(()), walredo_mgr, walreceiver: Mutex::new(None), @@ -3480,6 +3483,9 @@ impl Timeline { // image layer). let _gc_cutoff_holder = timeline.get_latest_gc_cutoff_lsn(); + // See `compaction::compact_with_gc` for why we need this. + let _guard = timeline.gc_compaction_layer_update_lock.read().await; + loop { if cancel.is_cancelled() { return Err(GetVectoredError::Cancelled); diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index 5f7b5f1af5..7242f73a82 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -2919,10 +2919,45 @@ impl Timeline { // Between the sanity check and this compaction update, there could be new layers being flushed, but it should be fine because we only // operate on L1 layers. { + // Gc-compaction will rewrite the history of a key. This could happen in two ways: + // + // 1. We create an image layer to replace all the deltas below the compact LSN. In this case, assume + // we have 2 delta layers A and B, both below the compact LSN. We create an image layer I to replace + // A and B at the compact LSN. If the read path finishes reading A, yields, and now we update the layer + // map, the read path then cannot find any keys below A, reporting a missing key error, while the key + // now gets stored in I at the compact LSN. + // + // --------------- --------------- + // delta1@LSN20 image1@LSN20 + // --------------- (read path collects delta@LSN20, => --------------- (read path cannot find anything + // delta1@LSN10 yields) below LSN 20) + // --------------- + // + // 2. We create a delta layer to replace all the deltas below the compact LSN, and in the delta layers, + // we combines the history of a key into a single image. 
For example, we have deltas at LSN 1, 2, 3, 4, + // Assume one delta layer contains LSN 1, 2, 3 and the other contains LSN 4. + // + // We let gc-compaction combine delta 2, 3, 4 into an image at LSN 4, which produces a delta layer that + // contains the delta at LSN 1, the image at LSN 4. If the read path finishes reading the original delta + // layer containing 4, yields, and we update the layer map to put the delta layer. + // + // --------------- --------------- + // delta1@LSN4 image1@LSN4 + // --------------- (read path collects delta@LSN4, => --------------- (read path collects LSN4 and LSN1, + // delta1@LSN1-3 yields) delta1@LSN1 which is an invalid history) + // --------------- --------------- + // + // Therefore, the gc-compaction layer update operation should wait for all ongoing reads, block all pending reads, + // and only allow reads to continue after the update is finished. + + let update_guard = self.gc_compaction_layer_update_lock.write().await; + // Acquiring the update guard ensures current read operations end and new read operations are blocked. + // TODO: can we use `latest_gc_cutoff` Rcu to achieve the same effect? let mut guard = self.layers.write().await; guard .open_mut()? - .finish_gc_compaction(&layer_selection, &compact_to, &self.metrics) + .finish_gc_compaction(&layer_selection, &compact_to, &self.metrics); + drop(update_guard); // Allow new reads to start ONLY after we finished updating the layer map. }; // Schedule an index-only upload to update the `latest_gc_cutoff` in the index_part.json. From efe42db264fc00e9ea13d16851452a2c7c1e58a5 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Thu, 30 Jan 2025 18:11:26 +0200 Subject: [PATCH 61/72] tests: test_pgdata_import_smoke requires the 'testing' cargo feature (#10569) It took me ages to figure out why it was failing on my laptop. What I saw was that when the test makes the 'import_pgdata' in the pageserver, the pageserver actually performs a regular 'bootstrap' timeline creation by running initdb, with no importing. It boiled down to the json request that the test uses: ``` { "new_timeline_id": str(timeline_id), "import_pgdata": { "idempotency_key": str(idempotency), "location": {"LocalFs": {"path": str(importbucket.absolute())}}, }, }, ``` and how serde deserializes into rust structs. The 'LocalFs' enum variant in `models.rs` is gated on the 'testing' cargo feature. On a non-testing build, that got deserialized into the default Bootstrap enum variant, as a valid TimelineCreateRequestModeImportPgdata variant could not be formed. PS. IMHO we should get rid of the testing feature, compile in all the functionality, and have a runtime flag to disable anything dangeorous. With that, you would've gotten a nice "feature only enabled in testing mode" error in this case, or the test would've simply worked. But that's another story. --- test_runner/regress/test_import_pgdata.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/test_runner/regress/test_import_pgdata.py b/test_runner/regress/test_import_pgdata.py index 182f715b0e..086d4b67c9 100644 --- a/test_runner/regress/test_import_pgdata.py +++ b/test_runner/regress/test_import_pgdata.py @@ -59,6 +59,9 @@ def test_pgdata_import_smoke( neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS) env = neon_env_builder.init_start() + # The test needs LocalFs support, which is only built in testing mode. 
+ env.pageserver.is_testing_enabled_or_skip() + env.pageserver.patch_config_toml_nonrecursive( { "import_pgdata_upcall_api": f"http://{cplane_mgmt_api_server.host}:{cplane_mgmt_api_server.port}/path/to/mgmt/api" From 6c8fc909d68f1e541c42f00162526ff1e59f6237 Mon Sep 17 00:00:00 2001 From: Peter Bendel Date: Thu, 30 Jan 2025 17:41:46 +0100 Subject: [PATCH 62/72] Benchmarking PostgreSQL17: for OLAP need specific connstr secrets (#10587) ## Problem for OLAP benchmarks we need specific connstr secrets with different database names for each job step This is a follow-up for https://github.com/neondatabase/neon/pull/10536 In previous PR we used a common GitHub secret for a shared re-use project that has 4 databases: neondb, tpch, clickbench and userexamples. [Failure example](https://neon-github-public-dev.s3.amazonaws.com/reports/main/13044872855/index.html#suites/54d0af6f403f1d8611e8894c2e07d023/fc029330265e9f6e/): ```log # /tmp/neon/pg_install/v17/bin/psql user=neondb_owner dbname=neondb host=ep-broad-brook-w2luwzzv.us-east-2.aws.neon.build sslmode=require options='-cstatement_timeout=0 ' -c -- $ID$ -- TPC-H/TPC-R Pricing Summary Report Query (Q1) -- Functional Query Definition -- Approved February 1998 ... ERROR: relation "lineitem" does not exist ``` ## Summary of changes We need dedicated GitHub secrets and dedicated connection strings for each of the use cases. ## Test run https://github.com/neondatabase/neon/actions/runs/13053968231 --- .github/workflows/benchmarking.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/benchmarking.yml b/.github/workflows/benchmarking.yml index 9446b4d17b..49f23e895b 100644 --- a/.github/workflows/benchmarking.yml +++ b/.github/workflows/benchmarking.yml @@ -741,10 +741,10 @@ jobs: neonvm-captest-reuse) case "${PG_VERSION}" in 16) - CONNSTR=${{ secrets.BENCHMARK_CAPTEST_CLICKBENCH_10M_CONNSTR_V16 }} + CONNSTR=${{ secrets.BENCHMARK_CAPTEST_CLICKBENCH_10M_CONNSTR }} ;; 17) - CONNSTR=${{ secrets.BENCHMARK_CAPTEST_CONNSTR_PG17 }} + CONNSTR=${{ secrets.BENCHMARK_CAPTEST_CLICKBENCH_CONNSTR_PG17 }} ;; *) echo >&2 "Unsupported PG_VERSION=${PG_VERSION} for PLATFORM=${PLATFORM}" @@ -864,7 +864,7 @@ jobs: CONNSTR_SECRET_NAME="BENCHMARK_CAPTEST_TPCH_S10_CONNSTR" ;; 17) - CONNSTR_SECRET_NAME="BENCHMARK_CAPTEST_CONNSTR_PG17" + CONNSTR_SECRET_NAME="BENCHMARK_CAPTEST_TPCH_CONNSTR_PG17" ;; *) echo >&2 "Unsupported PG_VERSION=${PG_VERSION} for PLATFORM=${PLATFORM}" @@ -984,7 +984,7 @@ jobs: CONNSTR=${{ secrets.BENCHMARK_USER_EXAMPLE_CAPTEST_CONNSTR }} ;; 17) - CONNSTR=${{ secrets.BENCHMARK_CAPTEST_CONNSTR_PG17 }} + CONNSTR=${{ secrets.BENCHMARK_CAPTEST_USER_EXAMPLE_CONNSTR_PG17 }} ;; *) echo >&2 "Unsupported PG_VERSION=${PG_VERSION} for PLATFORM=${PLATFORM}" From 8293b252b2a7aaf82f0e108997483387ff2106be Mon Sep 17 00:00:00 2001 From: Cheng Chen Date: Thu, 30 Jan 2025 10:33:25 -0800 Subject: [PATCH 63/72] chore(compute): pg_mooncake v0.1.1 (#10578) ## Problem Upgrade pg_mooncake to v0.1.1 ## Summary of changes https://github.com/Mooncake-Labs/pg_mooncake/blob/main/CHANGELOG.md#011-2025-01-29 --- compute/compute-node.Dockerfile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/compute/compute-node.Dockerfile b/compute/compute-node.Dockerfile index a428c61f34..e9f6c03768 100644 --- a/compute/compute-node.Dockerfile +++ b/compute/compute-node.Dockerfile @@ -1140,8 +1140,8 @@ RUN wget https://github.com/pgpartman/pg_partman/archive/refs/tags/v5.1.0.tar.gz FROM rust-extensions-build AS pg-mooncake-build ARG PG_VERSION 
-RUN wget https://github.com/Mooncake-Labs/pg_mooncake/releases/download/v0.1.0/pg_mooncake-0.1.0.tar.gz -O pg_mooncake.tar.gz && \ - echo "eafd059b77f541f11525eb8affcd66a176968cbd8fe7c0d436e733f2aa4da59f pg_mooncake.tar.gz" | sha256sum --check && \ +RUN wget https://github.com/Mooncake-Labs/pg_mooncake/releases/download/v0.1.1/pg_mooncake-0.1.1.tar.gz -O pg_mooncake.tar.gz && \ + echo "a2d16eff7948dde64f072609ca5d2962d6b4d07cb89d45952add473529c55f55 pg_mooncake.tar.gz" | sha256sum --check && \ mkdir pg_mooncake-src && cd pg_mooncake-src && tar xzf ../pg_mooncake.tar.gz --strip-components=1 -C . && \ make release -j $(getconf _NPROCESSORS_ONLN) && \ make install -j $(getconf _NPROCESSORS_ONLN) && \ From bae0de643e75bc4e9154bc6e6e62368ba5c41d9d Mon Sep 17 00:00:00 2001 From: John Spray Date: Thu, 30 Jan 2025 19:22:59 +0000 Subject: [PATCH 64/72] tests: relax constraints on test_timeline_archival_chaos (#10595) ## Problem The test asserts that it completes at least 10 full timeline lifecycles, but the noisy CI environment sometimes doesn't meet that goal. Related: https://github.com/neondatabase/neon/issues/10389 ## Summary of changes - Sleep for longer between pageserver restarts, so that the timeline workers have more chance to make progress - Sleep for shorter between retries from timeline worker, so that they have better chance to get in while a pageserver is up between restarts - Relax the success condition to complete at least 5 iterations instead of 10 --- test_runner/regress/test_timeline_archive.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/test_runner/regress/test_timeline_archive.py b/test_runner/regress/test_timeline_archive.py index bec8270582..306e971657 100644 --- a/test_runner/regress/test_timeline_archive.py +++ b/test_runner/regress/test_timeline_archive.py @@ -582,12 +582,12 @@ def test_timeline_archival_chaos(neon_env_builder: NeonEnvBuilder): # This is expected: we are injecting chaos, API calls will sometimes fail. # TODO: can we narrow this to assert we are getting friendly 503s? 
log.info(f"Iteration error, will retry: {e}") - shutdown.wait(random.random()) + shutdown.wait(random.random() * 0.5) except requests.exceptions.RetryError as e: # Retryable error repeated more times than `requests` is configured to tolerate, this # is expected when a pageserver remains unavailable for a couple seconds log.info(f"Iteration error, will retry: {e}") - shutdown.wait(random.random()) + shutdown.wait(random.random() * 0.5) except Exception as e: log.warning( f"Unexpected worker exception (current timeline {state.timeline_id}): {e}" @@ -632,7 +632,7 @@ def test_timeline_archival_chaos(neon_env_builder: NeonEnvBuilder): # Make sure we're up for as long as we spent restarting, to ensure operations can make progress log.info(f"Staying alive for {restart_duration}s") - time.sleep(restart_duration) + time.sleep(restart_duration * 2) else: # Migrate our tenant between pageservers origin_ps = env.get_tenant_pageserver(tenant_shard_id) @@ -651,7 +651,7 @@ def test_timeline_archival_chaos(neon_env_builder: NeonEnvBuilder): # Sanity check that during our run we did exercise some full timeline lifecycles, in case # one of our workers got stuck - assert len(timelines_deleted) > 10 + assert len(timelines_deleted) > 5 # That no invariant-violations were reported by workers assert violations == [] From 4d2c2e946007444b8b4bbeb5348d20986174ee16 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Thu, 30 Jan 2025 20:23:25 +0100 Subject: [PATCH 65/72] Revert "storcon: switch to diesel-async and tokio-postgres (#10280)" (#10592) There was a regression of #10280, tracked in [#23583](https://github.com/neondatabase/cloud/issues/23583). I have ideas how to fix the issue, but we are too close to the release cutoff, so revert #10280 for now. We can revert the revert later :). 
--- .github/workflows/_build-and-test-locally.yml | 4 + .github/workflows/build-macos.yml | 2 +- .github/workflows/neon_extra_builds.yml | 2 +- Cargo.lock | 159 ++-- Dockerfile | 2 +- Makefile | 2 + storage_controller/Cargo.toml | 5 +- storage_controller/src/main.rs | 2 +- storage_controller/src/persistence.rs | 783 ++++++++---------- 9 files changed, 397 insertions(+), 564 deletions(-) diff --git a/.github/workflows/_build-and-test-locally.yml b/.github/workflows/_build-and-test-locally.yml index f97402a90b..2daed90386 100644 --- a/.github/workflows/_build-and-test-locally.yml +++ b/.github/workflows/_build-and-test-locally.yml @@ -158,6 +158,8 @@ jobs: - name: Run cargo build run: | + PQ_LIB_DIR=$(pwd)/pg_install/v16/lib + export PQ_LIB_DIR ${cov_prefix} mold -run cargo build $CARGO_FLAGS $CARGO_FEATURES --bins --tests # Do install *before* running rust tests because they might recompile the @@ -215,6 +217,8 @@ jobs: env: NEXTEST_RETRIES: 3 run: | + PQ_LIB_DIR=$(pwd)/pg_install/v16/lib + export PQ_LIB_DIR LD_LIBRARY_PATH=$(pwd)/pg_install/v17/lib export LD_LIBRARY_PATH diff --git a/.github/workflows/build-macos.yml b/.github/workflows/build-macos.yml index 347a511e98..01d82a1ed2 100644 --- a/.github/workflows/build-macos.yml +++ b/.github/workflows/build-macos.yml @@ -235,7 +235,7 @@ jobs: echo 'CPPFLAGS=-I/usr/local/opt/openssl@3/include' >> $GITHUB_ENV - name: Run cargo build (only for v17) - run: cargo build --all --release -j$(sysctl -n hw.ncpu) + run: PQ_LIB_DIR=$(pwd)/pg_install/v17/lib cargo build --all --release -j$(sysctl -n hw.ncpu) - name: Check that no warnings are produced (only for v17) run: ./run_clippy.sh diff --git a/.github/workflows/neon_extra_builds.yml b/.github/workflows/neon_extra_builds.yml index f077e04d1c..5b5910badf 100644 --- a/.github/workflows/neon_extra_builds.yml +++ b/.github/workflows/neon_extra_builds.yml @@ -114,7 +114,7 @@ jobs: run: make walproposer-lib -j$(nproc) - name: Produce the build stats - run: cargo build --all --release --timings -j$(nproc) + run: PQ_LIB_DIR=$(pwd)/pg_install/v17/lib cargo build --all --release --timings -j$(nproc) - name: Configure AWS credentials uses: aws-actions/configure-aws-credentials@v4 diff --git a/Cargo.lock b/Cargo.lock index 9ba90355df..359f989a76 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -941,18 +941,6 @@ version = "1.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8c3c1a368f70d6cf7302d78f8f7093da241fb8e8807c05cc9e51a125895a6d5b" -[[package]] -name = "bb8" -version = "0.8.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d89aabfae550a5c44b43ab941844ffcd2e993cb6900b342debf59e9ea74acdb8" -dependencies = [ - "async-trait", - "futures-util", - "parking_lot 0.12.1", - "tokio", -] - [[package]] name = "bcder" version = "0.7.4" @@ -1312,7 +1300,7 @@ dependencies = [ "tar", "thiserror 1.0.69", "tokio", - "tokio-postgres 0.7.9", + "tokio-postgres", "tokio-stream", "tokio-util", "tower 0.5.2", @@ -1421,7 +1409,7 @@ dependencies = [ "storage_broker", "thiserror 1.0.69", "tokio", - "tokio-postgres 0.7.9", + "tokio-postgres", "tokio-util", "toml", "toml_edit", @@ -1797,24 +1785,11 @@ dependencies = [ "chrono", "diesel_derives", "itoa", + "pq-sys", + "r2d2", "serde_json", ] -[[package]] -name = "diesel-async" -version = "0.5.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "51a307ac00f7c23f526a04a77761a0519b9f0eb2838ebf5b905a58580095bdcb" -dependencies = [ - "async-trait", - "bb8", - "diesel", - "futures-util", - 
"scoped-futures", - "tokio", - "tokio-postgres 0.7.12", -] - [[package]] name = "diesel_derives" version = "2.2.1" @@ -4060,8 +4035,8 @@ dependencies = [ "pageserver_compaction", "pin-project-lite", "postgres", - "postgres-protocol 0.6.6", - "postgres-types 0.2.6", + "postgres-protocol", + "postgres-types", "postgres_backend", "postgres_connection", "postgres_ffi", @@ -4092,7 +4067,7 @@ dependencies = [ "tokio", "tokio-epoll-uring", "tokio-io-timeout", - "tokio-postgres 0.7.9", + "tokio-postgres", "tokio-stream", "tokio-tar", "tokio-util", @@ -4150,7 +4125,7 @@ dependencies = [ "serde", "thiserror 1.0.69", "tokio", - "tokio-postgres 0.7.9", + "tokio-postgres", "tokio-stream", "tokio-util", "utils", @@ -4456,7 +4431,7 @@ dependencies = [ "futures-util", "log", "tokio", - "tokio-postgres 0.7.9", + "tokio-postgres", ] [[package]] @@ -4477,24 +4452,6 @@ dependencies = [ "stringprep", ] -[[package]] -name = "postgres-protocol" -version = "0.6.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "acda0ebdebc28befa84bee35e651e4c5f09073d668c7aed4cf7e23c3cda84b23" -dependencies = [ - "base64 0.22.1", - "byteorder", - "bytes", - "fallible-iterator", - "hmac", - "md-5", - "memchr", - "rand 0.8.5", - "sha2", - "stringprep", -] - [[package]] name = "postgres-protocol2" version = "0.1.0" @@ -4519,18 +4476,7 @@ dependencies = [ "bytes", "chrono", "fallible-iterator", - "postgres-protocol 0.6.6", -] - -[[package]] -name = "postgres-types" -version = "0.2.8" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f66ea23a2d0e5734297357705193335e0a957696f34bed2f2faefacb2fec336f" -dependencies = [ - "bytes", - "fallible-iterator", - "postgres-protocol 0.6.7", + "postgres-protocol", ] [[package]] @@ -4555,7 +4501,7 @@ dependencies = [ "serde", "thiserror 1.0.69", "tokio", - "tokio-postgres 0.7.9", + "tokio-postgres", "tokio-postgres-rustls", "tokio-rustls 0.26.0", "tokio-util", @@ -4570,7 +4516,7 @@ dependencies = [ "itertools 0.10.5", "once_cell", "postgres", - "tokio-postgres 0.7.9", + "tokio-postgres", "url", ] @@ -4657,6 +4603,15 @@ version = "0.2.17" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5b40af805b3121feab8a3c29f04d8ad262fa8e0561883e7653e024ae4479e6de" +[[package]] +name = "pq-sys" +version = "0.6.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f6cc05d7ea95200187117196eee9edd0644424911821aeb28a18ce60ea0b8793" +dependencies = [ + "vcpkg", +] + [[package]] name = "pq_proto" version = "0.1.0" @@ -4664,7 +4619,7 @@ dependencies = [ "byteorder", "bytes", "itertools 0.10.5", - "postgres-protocol 0.6.6", + "postgres-protocol", "rand 0.8.5", "serde", "thiserror 1.0.69", @@ -4912,7 +4867,7 @@ dependencies = [ "tikv-jemalloc-ctl", "tikv-jemallocator", "tokio", - "tokio-postgres 0.7.9", + "tokio-postgres", "tokio-postgres2", "tokio-rustls 0.26.0", "tokio-tungstenite 0.21.0", @@ -4969,6 +4924,17 @@ dependencies = [ "proc-macro2", ] +[[package]] +name = "r2d2" +version = "0.8.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "51de85fb3fb6524929c8a2eb85e6b6d363de4e8c48f9e2c2eac4944abc181c93" +dependencies = [ + "log", + "parking_lot 0.12.1", + "scheduled-thread-pool", +] + [[package]] name = "rand" version = "0.7.3" @@ -5700,7 +5666,7 @@ dependencies = [ "pageserver_api", "parking_lot 0.12.1", "postgres", - "postgres-protocol 0.6.6", + "postgres-protocol", "postgres_backend", "postgres_ffi", "pprof", @@ -5724,7 +5690,7 @@ dependencies = [ "tikv-jemallocator", 
"tokio", "tokio-io-timeout", - "tokio-postgres 0.7.9", + "tokio-postgres", "tokio-stream", "tokio-tar", "tokio-util", @@ -5783,12 +5749,12 @@ dependencies = [ ] [[package]] -name = "scoped-futures" -version = "0.1.4" +name = "scheduled-thread-pool" +version = "0.2.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1b24aae2d0636530f359e9d5ef0c04669d11c5e756699b27a6a6d845d8329091" +checksum = "3cbc66816425a074528352f5789333ecff06ca41b36b0b0efdfbb29edc391a19" dependencies = [ - "pin-project-lite", + "parking_lot 0.12.1", ] [[package]] @@ -6323,7 +6289,6 @@ dependencies = [ "clap", "control_plane", "diesel", - "diesel-async", "diesel_migrations", "fail", "futures", @@ -6338,10 +6303,10 @@ dependencies = [ "pageserver_api", "pageserver_client", "postgres_connection", + "r2d2", "rand 0.8.5", "reqwest", "routerify", - "scoped-futures", "scopeguard", "serde", "serde_json", @@ -6394,7 +6359,7 @@ dependencies = [ "serde_json", "storage_controller_client", "tokio", - "tokio-postgres 0.7.9", + "tokio-postgres", "tokio-postgres-rustls", "tokio-stream", "tokio-util", @@ -6873,34 +6838,8 @@ dependencies = [ "percent-encoding", "phf", "pin-project-lite", - "postgres-protocol 0.6.6", - "postgres-types 0.2.6", - "rand 0.8.5", - "socket2", - "tokio", - "tokio-util", - "whoami", -] - -[[package]] -name = "tokio-postgres" -version = "0.7.12" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3b5d3742945bc7d7f210693b0c58ae542c6fd47b17adbbda0885f3dcb34a6bdb" -dependencies = [ - "async-trait", - "byteorder", - "bytes", - "fallible-iterator", - "futures-channel", - "futures-util", - "log", - "parking_lot 0.12.1", - "percent-encoding", - "phf", - "pin-project-lite", - "postgres-protocol 0.6.7", - "postgres-types 0.2.8", + "postgres-protocol", + "postgres-types", "rand 0.8.5", "socket2", "tokio", @@ -6917,7 +6856,7 @@ dependencies = [ "ring", "rustls 0.23.18", "tokio", - "tokio-postgres 0.7.9", + "tokio-postgres", "tokio-rustls 0.26.0", "x509-certificate", ] @@ -7576,6 +7515,12 @@ version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "830b7e5d4d90034032940e4ace0d9a9a057e7a45cd94e6c007832e39edb82f6d" +[[package]] +name = "vcpkg" +version = "0.2.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "accd4ea62f7bb7a82fe23066fb0957d48ef677f6eeb8215f372f52e48bb32426" + [[package]] name = "version_check" version = "0.9.4" @@ -7595,7 +7540,7 @@ dependencies = [ "serde_json", "sysinfo", "tokio", - "tokio-postgres 0.7.9", + "tokio-postgres", "tokio-util", "tracing", "tracing-subscriber", diff --git a/Dockerfile b/Dockerfile index 7ba54c8ca5..f80666529b 100644 --- a/Dockerfile +++ b/Dockerfile @@ -45,7 +45,7 @@ COPY --chown=nonroot . . 
ARG ADDITIONAL_RUSTFLAGS RUN set -e \ - && RUSTFLAGS="-Clinker=clang -Clink-arg=-fuse-ld=mold -Clink-arg=-Wl,--no-rosegment -Cforce-frame-pointers=yes ${ADDITIONAL_RUSTFLAGS}" cargo build \ + && PQ_LIB_DIR=$(pwd)/pg_install/v${STABLE_PG_VERSION}/lib RUSTFLAGS="-Clinker=clang -Clink-arg=-fuse-ld=mold -Clink-arg=-Wl,--no-rosegment -Cforce-frame-pointers=yes ${ADDITIONAL_RUSTFLAGS}" cargo build \ --bin pg_sni_router \ --bin pageserver \ --bin pagectl \ diff --git a/Makefile b/Makefile index d1238caebf..22ebfea7d5 100644 --- a/Makefile +++ b/Makefile @@ -64,6 +64,8 @@ CARGO_BUILD_FLAGS += $(filter -j1,$(MAKEFLAGS)) CARGO_CMD_PREFIX += $(if $(filter n,$(MAKEFLAGS)),,+) # Force cargo not to print progress bar CARGO_CMD_PREFIX += CARGO_TERM_PROGRESS_WHEN=never CI=1 +# Set PQ_LIB_DIR to make sure `storage_controller` get linked with bundled libpq (through diesel) +CARGO_CMD_PREFIX += PQ_LIB_DIR=$(POSTGRES_INSTALL_DIR)/v16/lib CACHEDIR_TAG_CONTENTS := "Signature: 8a477f597d28d172789f06886806bc55" diff --git a/storage_controller/Cargo.toml b/storage_controller/Cargo.toml index 9860bd5d0e..caaa22d0a5 100644 --- a/storage_controller/Cargo.toml +++ b/storage_controller/Cargo.toml @@ -45,11 +45,12 @@ strum_macros.workspace = true diesel = { version = "2.2.6", features = [ "serde_json", + "postgres", + "r2d2", "chrono", ] } -diesel-async = { version = "0.5.2", features = ["postgres", "bb8", "async-connection-wrapper"] } diesel_migrations = { version = "2.2.0" } -scoped-futures = "0.1.4" +r2d2 = { version = "0.8.10" } utils = { path = "../libs/utils/" } metrics = { path = "../libs/metrics/" } diff --git a/storage_controller/src/main.rs b/storage_controller/src/main.rs index 659c088d51..801409d612 100644 --- a/storage_controller/src/main.rs +++ b/storage_controller/src/main.rs @@ -308,7 +308,7 @@ async fn async_main() -> anyhow::Result<()> { // Validate that we can connect to the database Persistence::await_connection(&secrets.database_url, args.db_connect_timeout.into()).await?; - let persistence = Arc::new(Persistence::new(secrets.database_url).await); + let persistence = Arc::new(Persistence::new(secrets.database_url)); let service = Service::spawn(config, persistence.clone()).await?; diff --git a/storage_controller/src/persistence.rs b/storage_controller/src/persistence.rs index 35eb15b297..37bfaf1139 100644 --- a/storage_controller/src/persistence.rs +++ b/storage_controller/src/persistence.rs @@ -5,12 +5,9 @@ use std::time::Duration; use std::time::Instant; use self::split_state::SplitState; +use diesel::pg::PgConnection; use diesel::prelude::*; -use diesel_async::async_connection_wrapper::AsyncConnectionWrapper; -use diesel_async::pooled_connection::bb8::Pool; -use diesel_async::pooled_connection::AsyncDieselConnectionManager; -use diesel_async::RunQueryDsl; -use diesel_async::{AsyncConnection, AsyncPgConnection}; +use diesel::Connection; use itertools::Itertools; use pageserver_api::controller_api::AvailabilityZone; use pageserver_api::controller_api::MetadataHealthRecord; @@ -23,7 +20,6 @@ use pageserver_api::shard::ShardConfigError; use pageserver_api::shard::ShardIdentity; use pageserver_api::shard::ShardStripeSize; use pageserver_api::shard::{ShardCount, ShardNumber, TenantShardId}; -use scoped_futures::ScopedBoxFuture; use serde::{Deserialize, Serialize}; use utils::generation::Generation; use utils::id::{NodeId, TenantId}; @@ -64,7 +60,7 @@ const MIGRATIONS: EmbeddedMigrations = embed_migrations!("./migrations"); /// updated, and reads of nodes are always from memory, not the database. 
We only require that /// we can UPDATE a node's scheduling mode reasonably quickly to mark a bad node offline. pub struct Persistence { - connection_pool: Pool, + connection_pool: diesel::r2d2::Pool>, } /// Legacy format, for use in JSON compat objects in test environment @@ -80,7 +76,7 @@ pub(crate) enum DatabaseError { #[error(transparent)] Connection(#[from] diesel::result::ConnectionError), #[error(transparent)] - ConnectionPool(#[from] diesel_async::pooled_connection::bb8::RunError), + ConnectionPool(#[from] r2d2::Error), #[error("Logical error: {0}")] Logical(String), #[error("Migration error: {0}")] @@ -128,7 +124,6 @@ pub(crate) enum AbortShardSplitStatus { pub(crate) type DatabaseResult = Result; /// Some methods can operate on either a whole tenant or a single shard -#[derive(Clone)] pub(crate) enum TenantFilter { Tenant(TenantId), Shard(TenantShardId), @@ -141,11 +136,6 @@ pub(crate) struct ShardGenerationState { pub(crate) generation_pageserver: Option, } -// A generous allowance for how many times we may retry serializable transactions -// before giving up. This is not expected to be hit: it is a defensive measure in case we -// somehow engineer a situation where duelling transactions might otherwise live-lock. -const MAX_RETRIES: usize = 128; - impl Persistence { // The default postgres connection limit is 100. We use up to 99, to leave one free for a human admin under // normal circumstances. This assumes we have exclusive use of the database cluster to which we connect. @@ -155,12 +145,12 @@ impl Persistence { const IDLE_CONNECTION_TIMEOUT: Duration = Duration::from_secs(10); const MAX_CONNECTION_LIFETIME: Duration = Duration::from_secs(60); - pub async fn new(database_url: String) -> Self { - let manager = AsyncDieselConnectionManager::::new(database_url); + pub fn new(database_url: String) -> Self { + let manager = diesel::r2d2::ConnectionManager::::new(database_url); // We will use a connection pool: this is primarily to _limit_ our connection count, rather than to optimize time // to execute queries (database queries are not generally on latency-sensitive paths). - let connection_pool = Pool::builder() + let connection_pool = diesel::r2d2::Pool::builder() .max_size(Self::MAX_CONNECTIONS) .max_lifetime(Some(Self::MAX_CONNECTION_LIFETIME)) .idle_timeout(Some(Self::IDLE_CONNECTION_TIMEOUT)) @@ -168,7 +158,6 @@ impl Persistence { .min_idle(Some(1)) .test_on_check_out(true) .build(manager) - .await .expect("Could not build connection pool"); Self { connection_pool } @@ -182,7 +171,7 @@ impl Persistence { ) -> Result<(), diesel::ConnectionError> { let started_at = Instant::now(); loop { - match AsyncPgConnection::establish(database_url).await { + match PgConnection::establish(database_url) { Ok(_) => { tracing::info!("Connected to database."); return Ok(()); @@ -203,22 +192,57 @@ impl Persistence { pub(crate) async fn migration_run(&self) -> DatabaseResult<()> { use diesel_migrations::{HarnessWithOutput, MigrationHarness}; - // Can't use self.with_conn here as we do spawn_blocking which requires static. 
- let conn = self - .connection_pool - .dedicated_connection() - .await - .map_err(|e| DatabaseError::Migration(e.to_string()))?; - let mut async_wrapper: AsyncConnectionWrapper = - AsyncConnectionWrapper::from(conn); - tokio::task::spawn_blocking(move || { + self.with_conn(move |conn| -> DatabaseResult<()> { + HarnessWithOutput::write_to_stdout(conn) + .run_pending_migrations(MIGRATIONS) + .map(|_| ()) + .map_err(|e| DatabaseError::Migration(e.to_string())) + }) + .await + } + + /// Wraps `with_conn` in order to collect latency and error metrics + async fn with_measured_conn(&self, op: DatabaseOperation, func: F) -> DatabaseResult + where + F: Fn(&mut PgConnection) -> DatabaseResult + Send + 'static, + R: Send + 'static, + { + let latency = &METRICS_REGISTRY + .metrics_group + .storage_controller_database_query_latency; + let _timer = latency.start_timer(DatabaseQueryLatencyLabelGroup { operation: op }); + + let res = self.with_conn(func).await; + + if let Err(err) = &res { + let error_counter = &METRICS_REGISTRY + .metrics_group + .storage_controller_database_query_error; + error_counter.inc(DatabaseQueryErrorLabelGroup { + error_type: err.error_label(), + operation: op, + }) + } + + res + } + + /// Call the provided function in a tokio blocking thread, with a Diesel database connection. + async fn with_conn(&self, func: F) -> DatabaseResult + where + F: Fn(&mut PgConnection) -> DatabaseResult + Send + 'static, + R: Send + 'static, + { + // A generous allowance for how many times we may retry serializable transactions + // before giving up. This is not expected to be hit: it is a defensive measure in case we + // somehow engineer a situation where duelling transactions might otherwise live-lock. + const MAX_RETRIES: usize = 128; + + let mut conn = self.connection_pool.get()?; + tokio::task::spawn_blocking(move || -> DatabaseResult { let mut retry_count = 0; loop { - let result = HarnessWithOutput::write_to_stdout(&mut async_wrapper) - .run_pending_migrations(MIGRATIONS) - .map(|_| ()) - .map_err(|e| DatabaseError::Migration(e.to_string())); - match result { + match conn.build_transaction().serializable().run(|c| func(c)) { Ok(r) => break Ok(r), Err( err @ DatabaseError::Query(diesel::result::Error::DatabaseError( @@ -247,112 +271,33 @@ impl Persistence { } }) .await - .map_err(|e| DatabaseError::Migration(e.to_string()))??; - Ok(()) - } - - /// Wraps `with_conn` in order to collect latency and error metrics - async fn with_measured_conn<'a, 'b, F, R>( - &self, - op: DatabaseOperation, - func: F, - ) -> DatabaseResult - where - F: for<'r> Fn(&'r mut AsyncPgConnection) -> ScopedBoxFuture<'b, 'r, DatabaseResult> - + Send - + std::marker::Sync - + 'a, - R: Send + 'b, - { - let latency = &METRICS_REGISTRY - .metrics_group - .storage_controller_database_query_latency; - let _timer = latency.start_timer(DatabaseQueryLatencyLabelGroup { operation: op }); - - let res = self.with_conn(func).await; - - if let Err(err) = &res { - let error_counter = &METRICS_REGISTRY - .metrics_group - .storage_controller_database_query_error; - error_counter.inc(DatabaseQueryErrorLabelGroup { - error_type: err.error_label(), - operation: op, - }) - } - - res - } - - /// Call the provided function with a Diesel database connection in a retry loop - async fn with_conn<'a, 'b, F, R>(&self, func: F) -> DatabaseResult - where - F: for<'r> Fn(&'r mut AsyncPgConnection) -> ScopedBoxFuture<'b, 'r, DatabaseResult> - + Send - + std::marker::Sync - + 'a, - R: Send + 'b, - { - let mut retry_count = 0; - loop { - let mut conn = 
self.connection_pool.get().await?; - match conn - .build_transaction() - .serializable() - .run(|c| func(c)) - .await - { - Ok(r) => break Ok(r), - Err( - err @ DatabaseError::Query(diesel::result::Error::DatabaseError( - diesel::result::DatabaseErrorKind::SerializationFailure, - _, - )), - ) => { - retry_count += 1; - if retry_count > MAX_RETRIES { - tracing::error!( - "Exceeded max retries on SerializationFailure errors: {err:?}" - ); - break Err(err); - } else { - // Retry on serialization errors: these are expected, because even though our - // transactions don't fight for the same rows, they will occasionally collide - // on index pages (e.g. increment_generation for unrelated shards can collide) - tracing::debug!("Retrying transaction on serialization failure {err:?}"); - continue; - } - } - Err(e) => break Err(e), - } - } + .expect("Task panic") } /// When a node is first registered, persist it before using it for anything pub(crate) async fn insert_node(&self, node: &Node) -> DatabaseResult<()> { - let np = &node.to_persistent(); - self.with_measured_conn(DatabaseOperation::InsertNode, move |conn| { - Box::pin(async move { + let np = node.to_persistent(); + self.with_measured_conn( + DatabaseOperation::InsertNode, + move |conn| -> DatabaseResult<()> { diesel::insert_into(crate::schema::nodes::table) - .values(np) - .execute(conn) - .await?; + .values(&np) + .execute(conn)?; Ok(()) - }) - }) + }, + ) .await } /// At startup, populate the list of nodes which our shards may be placed on pub(crate) async fn list_nodes(&self) -> DatabaseResult> { let nodes: Vec = self - .with_measured_conn(DatabaseOperation::ListNodes, move |conn| { - Box::pin(async move { - Ok(crate::schema::nodes::table - .load::(conn) - .await?) - }) - }) + .with_measured_conn( + DatabaseOperation::ListNodes, + move |conn| -> DatabaseResult<_> { + Ok(crate::schema::nodes::table.load::(conn)?) 
+ }, + ) .await?; tracing::info!("list_nodes: loaded {} nodes", nodes.len()); @@ -368,14 +313,11 @@ impl Persistence { use crate::schema::nodes::dsl::*; let updated = self .with_measured_conn(DatabaseOperation::UpdateNode, move |conn| { - Box::pin(async move { - let updated = diesel::update(nodes) - .filter(node_id.eq(input_node_id.0 as i64)) - .set((scheduling_policy.eq(String::from(input_scheduling)),)) - .execute(conn) - .await?; - Ok(updated) - }) + let updated = diesel::update(nodes) + .filter(node_id.eq(input_node_id.0 as i64)) + .set((scheduling_policy.eq(String::from(input_scheduling)),)) + .execute(conn)?; + Ok(updated) }) .await?; @@ -397,16 +339,17 @@ impl Persistence { &self, ) -> DatabaseResult> { use crate::schema::tenant_shards::dsl::*; - self.with_measured_conn(DatabaseOperation::ListTenantShards, move |conn| { - Box::pin(async move { + self.with_measured_conn( + DatabaseOperation::ListTenantShards, + move |conn| -> DatabaseResult<_> { let query = tenant_shards.filter( placement_policy.ne(serde_json::to_string(&PlacementPolicy::Detached).unwrap()), ); - let result = query.load::(conn).await?; + let result = query.load::(conn)?; Ok(result) - }) - }) + }, + ) .await } @@ -416,14 +359,15 @@ impl Persistence { filter_tenant_id: TenantId, ) -> DatabaseResult> { use crate::schema::tenant_shards::dsl::*; - self.with_measured_conn(DatabaseOperation::LoadTenant, move |conn| { - Box::pin(async move { + self.with_measured_conn( + DatabaseOperation::LoadTenant, + move |conn| -> DatabaseResult<_> { let query = tenant_shards.filter(tenant_id.eq(filter_tenant_id.to_string())); - let result = query.load::(conn).await?; + let result = query.load::(conn)?; Ok(result) - }) - }) + }, + ) .await } @@ -449,22 +393,19 @@ impl Persistence { }) .collect::>(); - let shards = &shards; - let metadata_health_records = &metadata_health_records; - self.with_measured_conn(DatabaseOperation::InsertTenantShards, move |conn| { - Box::pin(async move { + self.with_measured_conn( + DatabaseOperation::InsertTenantShards, + move |conn| -> DatabaseResult<()> { diesel::insert_into(tenant_shards::table) - .values(shards) - .execute(conn) - .await?; + .values(&shards) + .execute(conn)?; diesel::insert_into(metadata_health::table) - .values(metadata_health_records) - .execute(conn) - .await?; + .values(&metadata_health_records) + .execute(conn)?; Ok(()) - }) - }) + }, + ) .await } @@ -472,31 +413,31 @@ impl Persistence { /// the tenant from memory on this server. pub(crate) async fn delete_tenant(&self, del_tenant_id: TenantId) -> DatabaseResult<()> { use crate::schema::tenant_shards::dsl::*; - self.with_measured_conn(DatabaseOperation::DeleteTenant, move |conn| { - Box::pin(async move { + self.with_measured_conn( + DatabaseOperation::DeleteTenant, + move |conn| -> DatabaseResult<()> { // `metadata_health` status (if exists) is also deleted based on the cascade behavior. 
diesel::delete(tenant_shards) .filter(tenant_id.eq(del_tenant_id.to_string())) - .execute(conn) - .await?; + .execute(conn)?; Ok(()) - }) - }) + }, + ) .await } pub(crate) async fn delete_node(&self, del_node_id: NodeId) -> DatabaseResult<()> { use crate::schema::nodes::dsl::*; - self.with_measured_conn(DatabaseOperation::DeleteNode, move |conn| { - Box::pin(async move { + self.with_measured_conn( + DatabaseOperation::DeleteNode, + move |conn| -> DatabaseResult<()> { diesel::delete(nodes) .filter(node_id.eq(del_node_id.0 as i64)) - .execute(conn) - .await?; + .execute(conn)?; Ok(()) - }) - }) + }, + ) .await } @@ -513,41 +454,34 @@ impl Persistence { use crate::schema::tenant_shards::dsl::*; let updated = self .with_measured_conn(DatabaseOperation::ReAttach, move |conn| { - Box::pin(async move { - let rows_updated = diesel::update(tenant_shards) - .filter(generation_pageserver.eq(input_node_id.0 as i64)) - .set(generation.eq(generation + 1)) - .execute(conn) - .await?; + let rows_updated = diesel::update(tenant_shards) + .filter(generation_pageserver.eq(input_node_id.0 as i64)) + .set(generation.eq(generation + 1)) + .execute(conn)?; - tracing::info!("Incremented {} tenants' generations", rows_updated); + tracing::info!("Incremented {} tenants' generations", rows_updated); - // TODO: UPDATE+SELECT in one query + // TODO: UPDATE+SELECT in one query - let updated = tenant_shards - .filter(generation_pageserver.eq(input_node_id.0 as i64)) - .select(TenantShardPersistence::as_select()) - .load(conn) - .await?; + let updated = tenant_shards + .filter(generation_pageserver.eq(input_node_id.0 as i64)) + .select(TenantShardPersistence::as_select()) + .load(conn)?; - // If the node went through a drain and restart phase before re-attaching, - // then reset it's node scheduling policy to active. - diesel::update(nodes) - .filter(node_id.eq(input_node_id.0 as i64)) - .filter( - scheduling_policy - .eq(String::from(NodeSchedulingPolicy::PauseForRestart)) - .or(scheduling_policy - .eq(String::from(NodeSchedulingPolicy::Draining))) - .or(scheduling_policy - .eq(String::from(NodeSchedulingPolicy::Filling))), - ) - .set(scheduling_policy.eq(String::from(NodeSchedulingPolicy::Active))) - .execute(conn) - .await?; + // If the node went through a drain and restart phase before re-attaching, + // then reset it's node scheduling policy to active. 
+ diesel::update(nodes) + .filter(node_id.eq(input_node_id.0 as i64)) + .filter( + scheduling_policy + .eq(String::from(NodeSchedulingPolicy::PauseForRestart)) + .or(scheduling_policy.eq(String::from(NodeSchedulingPolicy::Draining))) + .or(scheduling_policy.eq(String::from(NodeSchedulingPolicy::Filling))), + ) + .set(scheduling_policy.eq(String::from(NodeSchedulingPolicy::Active))) + .execute(conn)?; - Ok(updated) - }) + Ok(updated) }) .await?; @@ -584,22 +518,19 @@ impl Persistence { use crate::schema::tenant_shards::dsl::*; let updated = self .with_measured_conn(DatabaseOperation::IncrementGeneration, move |conn| { - Box::pin(async move { - let updated = diesel::update(tenant_shards) - .filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string())) - .filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32)) - .filter(shard_count.eq(tenant_shard_id.shard_count.literal() as i32)) - .set(( - generation.eq(generation + 1), - generation_pageserver.eq(node_id.0 as i64), - )) - // TODO: only returning() the generation column - .returning(TenantShardPersistence::as_returning()) - .get_result(conn) - .await?; + let updated = diesel::update(tenant_shards) + .filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string())) + .filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32)) + .filter(shard_count.eq(tenant_shard_id.shard_count.literal() as i32)) + .set(( + generation.eq(generation + 1), + generation_pageserver.eq(node_id.0 as i64), + )) + // TODO: only returning() the generation column + .returning(TenantShardPersistence::as_returning()) + .get_result(conn)?; - Ok(updated) - }) + Ok(updated) }) .await?; @@ -631,15 +562,12 @@ impl Persistence { use crate::schema::tenant_shards::dsl::*; let rows = self .with_measured_conn(DatabaseOperation::TenantGenerations, move |conn| { - Box::pin(async move { - let result = tenant_shards - .filter(tenant_id.eq(filter_tenant_id.to_string())) - .select(TenantShardPersistence::as_select()) - .order(shard_number) - .load(conn) - .await?; - Ok(result) - }) + let result = tenant_shards + .filter(tenant_id.eq(filter_tenant_id.to_string())) + .select(TenantShardPersistence::as_select()) + .order(shard_number) + .load(conn)?; + Ok(result) }) .await?; @@ -687,18 +615,15 @@ impl Persistence { break; } - let in_clause = &in_clause; let chunk_rows = self .with_measured_conn(DatabaseOperation::ShardGenerations, move |conn| { - Box::pin(async move { - // diesel doesn't support multi-column IN queries, so we compose raw SQL. No escaping is required because - // the inputs are strongly typed and cannot carry any user-supplied raw string content. - let result : Vec = diesel::sql_query( - format!("SELECT * from tenant_shards where (tenant_id, shard_number, shard_count) in ({in_clause});").as_str() - ).load(conn).await?; + // diesel doesn't support multi-column IN queries, so we compose raw SQL. No escaping is required because + // the inputs are strongly typed and cannot carry any user-supplied raw string content. 
+ let result : Vec = diesel::sql_query( + format!("SELECT * from tenant_shards where (tenant_id, shard_number, shard_count) in ({in_clause});").as_str() + ).load(conn)?; - Ok(result) - }) + Ok(result) }) .await?; rows.extend(chunk_rows.into_iter()) @@ -732,58 +657,51 @@ impl Persistence { ) -> DatabaseResult<()> { use crate::schema::tenant_shards::dsl::*; - let tenant = &tenant; - let input_placement_policy = &input_placement_policy; - let input_config = &input_config; - let input_generation = &input_generation; - let input_scheduling_policy = &input_scheduling_policy; self.with_measured_conn(DatabaseOperation::UpdateTenantShard, move |conn| { - Box::pin(async move { - let query = match tenant { - TenantFilter::Shard(tenant_shard_id) => diesel::update(tenant_shards) - .filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string())) - .filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32)) - .filter(shard_count.eq(tenant_shard_id.shard_count.literal() as i32)) - .into_boxed(), - TenantFilter::Tenant(input_tenant_id) => diesel::update(tenant_shards) - .filter(tenant_id.eq(input_tenant_id.to_string())) - .into_boxed(), - }; + let query = match tenant { + TenantFilter::Shard(tenant_shard_id) => diesel::update(tenant_shards) + .filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string())) + .filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32)) + .filter(shard_count.eq(tenant_shard_id.shard_count.literal() as i32)) + .into_boxed(), + TenantFilter::Tenant(input_tenant_id) => diesel::update(tenant_shards) + .filter(tenant_id.eq(input_tenant_id.to_string())) + .into_boxed(), + }; - // Clear generation_pageserver if we are moving into a state where we won't have - // any attached pageservers. - let input_generation_pageserver = match input_placement_policy { - None | Some(PlacementPolicy::Attached(_)) => None, - Some(PlacementPolicy::Detached | PlacementPolicy::Secondary) => Some(None), - }; + // Clear generation_pageserver if we are moving into a state where we won't have + // any attached pageservers. 
+ let input_generation_pageserver = match input_placement_policy { + None | Some(PlacementPolicy::Attached(_)) => None, + Some(PlacementPolicy::Detached | PlacementPolicy::Secondary) => Some(None), + }; - #[derive(AsChangeset)] - #[diesel(table_name = crate::schema::tenant_shards)] - struct ShardUpdate { - generation: Option, - placement_policy: Option, - config: Option, - scheduling_policy: Option, - generation_pageserver: Option>, - } + #[derive(AsChangeset)] + #[diesel(table_name = crate::schema::tenant_shards)] + struct ShardUpdate { + generation: Option, + placement_policy: Option, + config: Option, + scheduling_policy: Option, + generation_pageserver: Option>, + } - let update = ShardUpdate { - generation: input_generation.map(|g| g.into().unwrap() as i32), - placement_policy: input_placement_policy - .as_ref() - .map(|p| serde_json::to_string(&p).unwrap()), - config: input_config - .as_ref() - .map(|c| serde_json::to_string(&c).unwrap()), - scheduling_policy: input_scheduling_policy - .map(|p| serde_json::to_string(&p).unwrap()), - generation_pageserver: input_generation_pageserver, - }; + let update = ShardUpdate { + generation: input_generation.map(|g| g.into().unwrap() as i32), + placement_policy: input_placement_policy + .as_ref() + .map(|p| serde_json::to_string(&p).unwrap()), + config: input_config + .as_ref() + .map(|c| serde_json::to_string(&c).unwrap()), + scheduling_policy: input_scheduling_policy + .map(|p| serde_json::to_string(&p).unwrap()), + generation_pageserver: input_generation_pageserver, + }; - query.set(update).execute(conn).await?; + query.set(update).execute(conn)?; - Ok(()) - }) + Ok(()) }) .await?; @@ -797,27 +715,23 @@ impl Persistence { ) -> DatabaseResult)>> { use crate::schema::tenant_shards::dsl::*; - let preferred_azs = preferred_azs.as_slice(); self.with_measured_conn(DatabaseOperation::SetPreferredAzs, move |conn| { - Box::pin(async move { - let mut shards_updated = Vec::default(); + let mut shards_updated = Vec::default(); - for (tenant_shard_id, preferred_az) in preferred_azs.iter() { - let updated = diesel::update(tenant_shards) - .filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string())) - .filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32)) - .filter(shard_count.eq(tenant_shard_id.shard_count.literal() as i32)) - .set(preferred_az_id.eq(preferred_az.as_ref().map(|az| az.0.clone()))) - .execute(conn) - .await?; + for (tenant_shard_id, preferred_az) in preferred_azs.iter() { + let updated = diesel::update(tenant_shards) + .filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string())) + .filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32)) + .filter(shard_count.eq(tenant_shard_id.shard_count.literal() as i32)) + .set(preferred_az_id.eq(preferred_az.as_ref().map(|az| az.0.clone()))) + .execute(conn)?; - if updated == 1 { - shards_updated.push((*tenant_shard_id, preferred_az.clone())); - } + if updated == 1 { + shards_updated.push((*tenant_shard_id, preferred_az.clone())); } + } - Ok(shards_updated) - }) + Ok(shards_updated) }) .await } @@ -825,21 +739,17 @@ impl Persistence { pub(crate) async fn detach(&self, tenant_shard_id: TenantShardId) -> anyhow::Result<()> { use crate::schema::tenant_shards::dsl::*; self.with_measured_conn(DatabaseOperation::Detach, move |conn| { - Box::pin(async move { - let updated = diesel::update(tenant_shards) - .filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string())) - .filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32)) - .filter(shard_count.eq(tenant_shard_id.shard_count.literal() as 
i32)) - .set(( - generation_pageserver.eq(Option::::None), - placement_policy - .eq(serde_json::to_string(&PlacementPolicy::Detached).unwrap()), - )) - .execute(conn) - .await?; + let updated = diesel::update(tenant_shards) + .filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string())) + .filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32)) + .filter(shard_count.eq(tenant_shard_id.shard_count.literal() as i32)) + .set(( + generation_pageserver.eq(Option::::None), + placement_policy.eq(serde_json::to_string(&PlacementPolicy::Detached).unwrap()), + )) + .execute(conn)?; - Ok(updated) - }) + Ok(updated) }) .await?; @@ -858,16 +768,14 @@ impl Persistence { parent_to_children: Vec<(TenantShardId, Vec)>, ) -> DatabaseResult<()> { use crate::schema::tenant_shards::dsl::*; - let parent_to_children = parent_to_children.as_slice(); - self.with_measured_conn(DatabaseOperation::BeginShardSplit, move |conn| { - Box::pin(async move { + self.with_measured_conn(DatabaseOperation::BeginShardSplit, move |conn| -> DatabaseResult<()> { // Mark parent shards as splitting let updated = diesel::update(tenant_shards) .filter(tenant_id.eq(split_tenant_id.to_string())) .filter(shard_count.eq(old_shard_count.literal() as i32)) .set((splitting.eq(1),)) - .execute(conn).await?; + .execute(conn)?; if u8::try_from(updated) .map_err(|_| DatabaseError::Logical( format!("Overflow existing shard count {} while splitting", updated)) @@ -880,7 +788,7 @@ impl Persistence { } // FIXME: spurious clone to sidestep closure move rules - let parent_to_children = parent_to_children.to_vec(); + let parent_to_children = parent_to_children.clone(); // Insert child shards for (parent_shard_id, children) in parent_to_children { @@ -888,7 +796,7 @@ impl Persistence { .filter(tenant_id.eq(parent_shard_id.tenant_id.to_string())) .filter(shard_number.eq(parent_shard_id.shard_number.0 as i32)) .filter(shard_count.eq(parent_shard_id.shard_count.literal() as i32)) - .load::(conn).await?; + .load::(conn)?; let parent = if parent.len() != 1 { return Err(DatabaseError::Logical(format!( "Parent shard {parent_shard_id} not found" @@ -903,13 +811,12 @@ impl Persistence { debug_assert!(shard.splitting == SplitState::Splitting); diesel::insert_into(tenant_shards) .values(shard) - .execute(conn).await?; + .execute(conn)?; } } Ok(()) }) - }) .await } @@ -921,26 +828,25 @@ impl Persistence { old_shard_count: ShardCount, ) -> DatabaseResult<()> { use crate::schema::tenant_shards::dsl::*; - self.with_measured_conn(DatabaseOperation::CompleteShardSplit, move |conn| { - Box::pin(async move { + self.with_measured_conn( + DatabaseOperation::CompleteShardSplit, + move |conn| -> DatabaseResult<()> { // Drop parent shards diesel::delete(tenant_shards) .filter(tenant_id.eq(split_tenant_id.to_string())) .filter(shard_count.eq(old_shard_count.literal() as i32)) - .execute(conn) - .await?; + .execute(conn)?; // Clear sharding flag let updated = diesel::update(tenant_shards) .filter(tenant_id.eq(split_tenant_id.to_string())) .set((splitting.eq(0),)) - .execute(conn) - .await?; + .execute(conn)?; debug_assert!(updated > 0); Ok(()) - }) - }) + }, + ) .await } @@ -952,15 +858,15 @@ impl Persistence { new_shard_count: ShardCount, ) -> DatabaseResult { use crate::schema::tenant_shards::dsl::*; - self.with_measured_conn(DatabaseOperation::AbortShardSplit, move |conn| { - Box::pin(async move { + self.with_measured_conn( + DatabaseOperation::AbortShardSplit, + move |conn| -> DatabaseResult { // Clear the splitting state on parent shards let updated = 
diesel::update(tenant_shards) .filter(tenant_id.eq(split_tenant_id.to_string())) .filter(shard_count.ne(new_shard_count.literal() as i32)) .set((splitting.eq(0),)) - .execute(conn) - .await?; + .execute(conn)?; // Parent shards are already gone: we cannot abort. if updated == 0 { @@ -980,12 +886,11 @@ impl Persistence { diesel::delete(tenant_shards) .filter(tenant_id.eq(split_tenant_id.to_string())) .filter(shard_count.eq(new_shard_count.literal() as i32)) - .execute(conn) - .await?; + .execute(conn)?; Ok(AbortShardSplitStatus::Aborted) - }) - }) + }, + ) .await } @@ -1001,28 +906,25 @@ impl Persistence { ) -> DatabaseResult<()> { use crate::schema::metadata_health::dsl::*; - let healthy_records = healthy_records.as_slice(); - let unhealthy_records = unhealthy_records.as_slice(); - self.with_measured_conn(DatabaseOperation::UpdateMetadataHealth, move |conn| { - Box::pin(async move { + self.with_measured_conn( + DatabaseOperation::UpdateMetadataHealth, + move |conn| -> DatabaseResult<_> { diesel::insert_into(metadata_health) - .values(healthy_records) + .values(&healthy_records) .on_conflict((tenant_id, shard_number, shard_count)) .do_update() .set((healthy.eq(true), last_scrubbed_at.eq(now))) - .execute(conn) - .await?; + .execute(conn)?; diesel::insert_into(metadata_health) - .values(unhealthy_records) + .values(&unhealthy_records) .on_conflict((tenant_id, shard_number, shard_count)) .do_update() .set((healthy.eq(false), last_scrubbed_at.eq(now))) - .execute(conn) - .await?; + .execute(conn)?; Ok(()) - }) - }) + }, + ) .await } @@ -1031,13 +933,15 @@ impl Persistence { pub(crate) async fn list_metadata_health_records( &self, ) -> DatabaseResult> { - self.with_measured_conn(DatabaseOperation::ListMetadataHealth, move |conn| { - Box::pin(async { - Ok(crate::schema::metadata_health::table - .load::(conn) - .await?) - }) - }) + self.with_measured_conn( + DatabaseOperation::ListMetadataHealth, + move |conn| -> DatabaseResult<_> { + Ok( + crate::schema::metadata_health::table + .load::(conn)?, + ) + }, + ) .await } @@ -1049,15 +953,10 @@ impl Persistence { use crate::schema::metadata_health::dsl::*; self.with_measured_conn( DatabaseOperation::ListMetadataHealthUnhealthy, - move |conn| { - Box::pin(async { - DatabaseResult::Ok( - crate::schema::metadata_health::table - .filter(healthy.eq(false)) - .load::(conn) - .await?, - ) - }) + move |conn| -> DatabaseResult<_> { + Ok(crate::schema::metadata_health::table + .filter(healthy.eq(false)) + .load::(conn)?) }, ) .await @@ -1071,14 +970,15 @@ impl Persistence { ) -> DatabaseResult> { use crate::schema::metadata_health::dsl::*; - self.with_measured_conn(DatabaseOperation::ListMetadataHealthOutdated, move |conn| { - Box::pin(async move { + self.with_measured_conn( + DatabaseOperation::ListMetadataHealthOutdated, + move |conn| -> DatabaseResult<_> { let query = metadata_health.filter(last_scrubbed_at.lt(earlier)); - let res = query.load::(conn).await?; + let res = query.load::(conn)?; Ok(res) - }) - }) + }, + ) .await } @@ -1086,13 +986,12 @@ impl Persistence { /// It is an error for the table to contain more than one entry. pub(crate) async fn get_leader(&self) -> DatabaseResult> { let mut leader: Vec = self - .with_measured_conn(DatabaseOperation::GetLeader, move |conn| { - Box::pin(async move { - Ok(crate::schema::controllers::table - .load::(conn) - .await?) - }) - }) + .with_measured_conn( + DatabaseOperation::GetLeader, + move |conn| -> DatabaseResult<_> { + Ok(crate::schema::controllers::table.load::(conn)?) 
+ }, + ) .await?; if leader.len() > 1 { @@ -1115,33 +1014,26 @@ impl Persistence { use crate::schema::controllers::dsl::*; let updated = self - .with_measured_conn(DatabaseOperation::UpdateLeader, move |conn| { - let prev = prev.clone(); - let new = new.clone(); - Box::pin(async move { + .with_measured_conn( + DatabaseOperation::UpdateLeader, + move |conn| -> DatabaseResult { let updated = match &prev { - Some(prev) => { - diesel::update(controllers) - .filter(address.eq(prev.address.clone())) - .filter(started_at.eq(prev.started_at)) - .set(( - address.eq(new.address.clone()), - started_at.eq(new.started_at), - )) - .execute(conn) - .await? - } - None => { - diesel::insert_into(controllers) - .values(new.clone()) - .execute(conn) - .await? - } + Some(prev) => diesel::update(controllers) + .filter(address.eq(prev.address.clone())) + .filter(started_at.eq(prev.started_at)) + .set(( + address.eq(new.address.clone()), + started_at.eq(new.started_at), + )) + .execute(conn)?, + None => diesel::insert_into(controllers) + .values(new.clone()) + .execute(conn)?, }; Ok(updated) - }) - }) + }, + ) .await?; if updated == 0 { @@ -1156,13 +1048,12 @@ impl Persistence { /// At startup, populate the list of nodes which our shards may be placed on pub(crate) async fn list_safekeepers(&self) -> DatabaseResult> { let safekeepers: Vec = self - .with_measured_conn(DatabaseOperation::ListNodes, move |conn| { - Box::pin(async move { - Ok(crate::schema::safekeepers::table - .load::(conn) - .await?) - }) - }) + .with_measured_conn( + DatabaseOperation::ListNodes, + move |conn| -> DatabaseResult<_> { + Ok(crate::schema::safekeepers::table.load::(conn)?) + }, + ) .await?; tracing::info!("list_safekeepers: loaded {} nodes", safekeepers.len()); @@ -1175,14 +1066,11 @@ impl Persistence { id: i64, ) -> Result { use crate::schema::safekeepers::dsl::{id as id_column, safekeepers}; - self.with_conn(move |conn| { - Box::pin(async move { - Ok(safekeepers - .filter(id_column.eq(&id)) - .select(SafekeeperPersistence::as_select()) - .get_result(conn) - .await?) - }) + self.with_conn(move |conn| -> DatabaseResult { + Ok(safekeepers + .filter(id_column.eq(&id)) + .select(SafekeeperPersistence::as_select()) + .get_result(conn)?) 
}) .await } @@ -1193,30 +1081,26 @@ impl Persistence { ) -> Result<(), DatabaseError> { use crate::schema::safekeepers::dsl::*; - self.with_conn(move |conn| { - let record = record.clone(); - Box::pin(async move { - let bind = record - .as_insert_or_update() - .map_err(|e| DatabaseError::Logical(format!("{e}")))?; + self.with_conn(move |conn| -> DatabaseResult<()> { + let bind = record + .as_insert_or_update() + .map_err(|e| DatabaseError::Logical(format!("{e}")))?; - let inserted_updated = diesel::insert_into(safekeepers) - .values(&bind) - .on_conflict(id) - .do_update() - .set(&bind) - .execute(conn) - .await?; + let inserted_updated = diesel::insert_into(safekeepers) + .values(&bind) + .on_conflict(id) + .do_update() + .set(&bind) + .execute(conn)?; - if inserted_updated != 1 { - return Err(DatabaseError::Logical(format!( - "unexpected number of rows ({})", - inserted_updated - ))); - } + if inserted_updated != 1 { + return Err(DatabaseError::Logical(format!( + "unexpected number of rows ({})", + inserted_updated + ))); + } - Ok(()) - }) + Ok(()) }) .await } @@ -1228,29 +1112,26 @@ impl Persistence { ) -> Result<(), DatabaseError> { use crate::schema::safekeepers::dsl::*; - self.with_conn(move |conn| { - Box::pin(async move { - #[derive(Insertable, AsChangeset)] - #[diesel(table_name = crate::schema::safekeepers)] - struct UpdateSkSchedulingPolicy<'a> { - id: i64, - scheduling_policy: &'a str, - } - let scheduling_policy_ = String::from(scheduling_policy_); + self.with_conn(move |conn| -> DatabaseResult<()> { + #[derive(Insertable, AsChangeset)] + #[diesel(table_name = crate::schema::safekeepers)] + struct UpdateSkSchedulingPolicy<'a> { + id: i64, + scheduling_policy: &'a str, + } + let scheduling_policy_ = String::from(scheduling_policy_); - let rows_affected = diesel::update(safekeepers.filter(id.eq(id_))) - .set(scheduling_policy.eq(scheduling_policy_)) - .execute(conn) - .await?; + let rows_affected = diesel::update(safekeepers.filter(id.eq(id_))) + .set(scheduling_policy.eq(scheduling_policy_)) + .execute(conn)?; - if rows_affected != 1 { - return Err(DatabaseError::Logical(format!( - "unexpected number of rows ({rows_affected})", - ))); - } + if rows_affected != 1 { + return Err(DatabaseError::Logical(format!( + "unexpected number of rows ({rows_affected})", + ))); + } - Ok(()) - }) + Ok(()) }) .await } From bf6d5e93baa176f1f0015833beb108f9995a5142 Mon Sep 17 00:00:00 2001 From: a-masterov <72613290+a-masterov@users.noreply.github.com> Date: Thu, 30 Jan 2025 20:32:35 +0100 Subject: [PATCH 66/72] Run tests of the contrib extensions (#10392) ## Problem We don't test the extensions, shipped with contrib ## Summary of changes The tests are now running --- compute/compute-node.Dockerfile | 1 + compute/patches/contrib_pg16.patch | 242 ++++++++++++++++++++++ compute/patches/contrib_pg17.patch | 196 ++++++++++++++++++ docker-compose/compute_wrapper/Dockerfile | 2 +- docker-compose/docker_compose_test.sh | 35 +++- docker-compose/run-tests.sh | 6 +- 6 files changed, 469 insertions(+), 13 deletions(-) create mode 100644 compute/patches/contrib_pg16.patch create mode 100644 compute/patches/contrib_pg17.patch diff --git a/compute/compute-node.Dockerfile b/compute/compute-node.Dockerfile index e9f6c03768..1ef449f0b0 100644 --- a/compute/compute-node.Dockerfile +++ b/compute/compute-node.Dockerfile @@ -1345,6 +1345,7 @@ FROM neon-pg-ext-build AS neon-pg-ext-test ARG PG_VERSION RUN mkdir /ext-src +COPY --from=pg-build /postgres /postgres #COPY --from=postgis-build /postgis.tar.gz /ext-src/ #COPY 
--from=postgis-build /sfcgal/* /usr COPY --from=plv8-build /plv8.tar.gz /ext-src/ diff --git a/compute/patches/contrib_pg16.patch b/compute/patches/contrib_pg16.patch new file mode 100644 index 0000000000..71adaabe7d --- /dev/null +++ b/compute/patches/contrib_pg16.patch @@ -0,0 +1,242 @@ +diff --git a/contrib/amcheck/expected/check_heap.out b/contrib/amcheck/expected/check_heap.out +index 979e5e8..2375b45 100644 +--- a/contrib/amcheck/expected/check_heap.out ++++ b/contrib/amcheck/expected/check_heap.out +@@ -80,12 +80,9 @@ INSERT INTO heaptest (a, b) + -- same transaction. The heaptest table is smaller than the default + -- wal_skip_threshold, so a wal_level=minimal commit reads the table into + -- shared_buffers. A transaction delays that and excludes any autovacuum. +-SET allow_in_place_tablespaces = true; +-CREATE TABLESPACE regress_test_stats_tblspc LOCATION ''; + SELECT sum(reads) AS stats_bulkreads_before + FROM pg_stat_io WHERE context = 'bulkread' \gset + BEGIN; +-ALTER TABLE heaptest SET TABLESPACE regress_test_stats_tblspc; + -- Check that valid options are not rejected nor corruption reported + -- for a non-empty table + SELECT * FROM verify_heapam(relation := 'heaptest', skip := 'none'); +@@ -118,14 +115,6 @@ SELECT pg_stat_force_next_flush(); + + (1 row) + +-SELECT sum(reads) AS stats_bulkreads_after +- FROM pg_stat_io WHERE context = 'bulkread' \gset +-SELECT :stats_bulkreads_after > :stats_bulkreads_before; +- ?column? +----------- +- t +-(1 row) +- + CREATE ROLE regress_heaptest_role; + -- verify permissions are checked (error due to function not callable) + SET ROLE regress_heaptest_role; +@@ -233,7 +222,6 @@ ERROR: cannot check relation "test_foreign_table" + DETAIL: This operation is not supported for foreign tables. + -- cleanup + DROP TABLE heaptest; +-DROP TABLESPACE regress_test_stats_tblspc; + DROP TABLE test_partition; + DROP TABLE test_partitioned; + DROP OWNED BY regress_heaptest_role; -- permissions +diff --git a/contrib/amcheck/sql/check_heap.sql b/contrib/amcheck/sql/check_heap.sql +index 1745bae..3b429c3 100644 +--- a/contrib/amcheck/sql/check_heap.sql ++++ b/contrib/amcheck/sql/check_heap.sql +@@ -40,12 +40,9 @@ INSERT INTO heaptest (a, b) + -- same transaction. The heaptest table is smaller than the default + -- wal_skip_threshold, so a wal_level=minimal commit reads the table into + -- shared_buffers. A transaction delays that and excludes any autovacuum. +-SET allow_in_place_tablespaces = true; +-CREATE TABLESPACE regress_test_stats_tblspc LOCATION ''; + SELECT sum(reads) AS stats_bulkreads_before + FROM pg_stat_io WHERE context = 'bulkread' \gset + BEGIN; +-ALTER TABLE heaptest SET TABLESPACE regress_test_stats_tblspc; + -- Check that valid options are not rejected nor corruption reported + -- for a non-empty table + SELECT * FROM verify_heapam(relation := 'heaptest', skip := 'none'); +@@ -58,9 +55,6 @@ COMMIT; + -- ALTER TABLE ... SET TABLESPACE ... + -- causing an additional bulkread, which should be reflected in pg_stat_io. 
+ SELECT pg_stat_force_next_flush(); +-SELECT sum(reads) AS stats_bulkreads_after +- FROM pg_stat_io WHERE context = 'bulkread' \gset +-SELECT :stats_bulkreads_after > :stats_bulkreads_before; + + CREATE ROLE regress_heaptest_role; + +@@ -140,7 +134,6 @@ SELECT * FROM verify_heapam('test_foreign_table', + + -- cleanup + DROP TABLE heaptest; +-DROP TABLESPACE regress_test_stats_tblspc; + DROP TABLE test_partition; + DROP TABLE test_partitioned; + DROP OWNED BY regress_heaptest_role; -- permissions +diff --git a/contrib/citext/expected/create_index_acl.out b/contrib/citext/expected/create_index_acl.out +index 33be13a..70a406c 100644 +--- a/contrib/citext/expected/create_index_acl.out ++++ b/contrib/citext/expected/create_index_acl.out +@@ -5,9 +5,6 @@ + -- owner having as few applicable privileges as possible. (The privileges.sql + -- regress_sro_user tests look for the opposite defect; they confirm that + -- DefineIndex() uses the table owner userid where necessary.) +-SET allow_in_place_tablespaces = true; +-CREATE TABLESPACE regress_create_idx_tblspace LOCATION ''; +-RESET allow_in_place_tablespaces; + BEGIN; + CREATE ROLE regress_minimal; + CREATE SCHEMA s; +@@ -49,11 +46,9 @@ ALTER TABLE s.x OWNER TO regress_minimal; + -- Empty-table DefineIndex() + CREATE UNIQUE INDEX u0rows ON s.x USING btree + ((s.index_this_expr(y, s.const())) COLLATE s.coll s.citext_pattern_ops) +- TABLESPACE regress_create_idx_tblspace + WHERE s.index_row_if(y); + ALTER TABLE s.x ADD CONSTRAINT e0rows EXCLUDE USING btree + ((s.index_this_expr(y, s.const())) COLLATE s.coll WITH s.=) +- USING INDEX TABLESPACE regress_create_idx_tblspace + WHERE (s.index_row_if(y)); + -- Make the table nonempty. + INSERT INTO s.x VALUES ('foo'), ('bar'); +@@ -66,11 +61,9 @@ RESET search_path; + GRANT EXECUTE ON FUNCTION s.index_this_expr TO regress_minimal; + CREATE UNIQUE INDEX u2rows ON s.x USING btree + ((s.index_this_expr(y, s.const())) COLLATE s.coll s.citext_pattern_ops) +- TABLESPACE regress_create_idx_tblspace + WHERE s.index_row_if(y); + ALTER TABLE s.x ADD CONSTRAINT e2rows EXCLUDE USING btree + ((s.index_this_expr(y, s.const())) COLLATE s.coll WITH s.=) +- USING INDEX TABLESPACE regress_create_idx_tblspace + WHERE (s.index_row_if(y)); + -- Shall not find s.coll via search_path, despite the s.const->public.setter + -- call having set search_path=s during expression planning. Suppress the +@@ -78,9 +71,7 @@ ALTER TABLE s.x ADD CONSTRAINT e2rows EXCLUDE USING btree + \set VERBOSITY sqlstate + ALTER TABLE s.x ADD CONSTRAINT underqualified EXCLUDE USING btree + ((s.index_this_expr(y, s.const())) COLLATE coll WITH s.=) +- USING INDEX TABLESPACE regress_create_idx_tblspace + WHERE (s.index_row_if(y)); + ERROR: 42704 + \set VERBOSITY default + ROLLBACK; +-DROP TABLESPACE regress_create_idx_tblspace; +diff --git a/contrib/citext/sql/create_index_acl.sql b/contrib/citext/sql/create_index_acl.sql +index 10b5225..ae442e1 100644 +--- a/contrib/citext/sql/create_index_acl.sql ++++ b/contrib/citext/sql/create_index_acl.sql +@@ -6,10 +6,6 @@ + -- regress_sro_user tests look for the opposite defect; they confirm that + -- DefineIndex() uses the table owner userid where necessary.) 
+ +-SET allow_in_place_tablespaces = true; +-CREATE TABLESPACE regress_create_idx_tblspace LOCATION ''; +-RESET allow_in_place_tablespaces; +- + BEGIN; + CREATE ROLE regress_minimal; + CREATE SCHEMA s; +@@ -51,11 +47,9 @@ ALTER TABLE s.x OWNER TO regress_minimal; + -- Empty-table DefineIndex() + CREATE UNIQUE INDEX u0rows ON s.x USING btree + ((s.index_this_expr(y, s.const())) COLLATE s.coll s.citext_pattern_ops) +- TABLESPACE regress_create_idx_tblspace + WHERE s.index_row_if(y); + ALTER TABLE s.x ADD CONSTRAINT e0rows EXCLUDE USING btree + ((s.index_this_expr(y, s.const())) COLLATE s.coll WITH s.=) +- USING INDEX TABLESPACE regress_create_idx_tblspace + WHERE (s.index_row_if(y)); + -- Make the table nonempty. + INSERT INTO s.x VALUES ('foo'), ('bar'); +@@ -68,11 +62,9 @@ RESET search_path; + GRANT EXECUTE ON FUNCTION s.index_this_expr TO regress_minimal; + CREATE UNIQUE INDEX u2rows ON s.x USING btree + ((s.index_this_expr(y, s.const())) COLLATE s.coll s.citext_pattern_ops) +- TABLESPACE regress_create_idx_tblspace + WHERE s.index_row_if(y); + ALTER TABLE s.x ADD CONSTRAINT e2rows EXCLUDE USING btree + ((s.index_this_expr(y, s.const())) COLLATE s.coll WITH s.=) +- USING INDEX TABLESPACE regress_create_idx_tblspace + WHERE (s.index_row_if(y)); + -- Shall not find s.coll via search_path, despite the s.const->public.setter + -- call having set search_path=s during expression planning. Suppress the +@@ -80,9 +72,7 @@ ALTER TABLE s.x ADD CONSTRAINT e2rows EXCLUDE USING btree + \set VERBOSITY sqlstate + ALTER TABLE s.x ADD CONSTRAINT underqualified EXCLUDE USING btree + ((s.index_this_expr(y, s.const())) COLLATE coll WITH s.=) +- USING INDEX TABLESPACE regress_create_idx_tblspace + WHERE (s.index_row_if(y)); + \set VERBOSITY default + ROLLBACK; + +-DROP TABLESPACE regress_create_idx_tblspace; +diff --git a/contrib/file_fdw/expected/file_fdw.out b/contrib/file_fdw/expected/file_fdw.out +index 72304e0..ebe131b 100644 +--- a/contrib/file_fdw/expected/file_fdw.out ++++ b/contrib/file_fdw/expected/file_fdw.out +@@ -4,6 +4,7 @@ + -- directory paths are passed to us in environment variables + \getenv abs_srcdir PG_ABS_SRCDIR + -- Clean up in case a prior regression run failed ++SET compute_query_id TO 'off'; + SET client_min_messages TO 'warning'; + DROP ROLE IF EXISTS regress_file_fdw_superuser, regress_file_fdw_user, regress_no_priv_user; + RESET client_min_messages; +diff --git a/contrib/file_fdw/sql/file_fdw.sql b/contrib/file_fdw/sql/file_fdw.sql +index f0548e1..848a08c 100644 +--- a/contrib/file_fdw/sql/file_fdw.sql ++++ b/contrib/file_fdw/sql/file_fdw.sql +@@ -6,6 +6,7 @@ + \getenv abs_srcdir PG_ABS_SRCDIR + + -- Clean up in case a prior regression run failed ++SET compute_query_id TO 'off'; + SET client_min_messages TO 'warning'; + DROP ROLE IF EXISTS regress_file_fdw_superuser, regress_file_fdw_user, regress_no_priv_user; + RESET client_min_messages; +diff --git a/contrib/pageinspect/expected/gist.out b/contrib/pageinspect/expected/gist.out +index d1adbab..38b52ac 100644 +--- a/contrib/pageinspect/expected/gist.out ++++ b/contrib/pageinspect/expected/gist.out +@@ -10,25 +10,6 @@ BEGIN; + CREATE TABLE test_gist AS SELECT point(i,i) p, i::text t FROM + generate_series(1,1000) i; + CREATE INDEX test_gist_idx ON test_gist USING gist (p); +--- Page 0 is the root, the rest are leaf pages +-SELECT * FROM gist_page_opaque_info(get_raw_page('test_gist_idx', 0)); +- lsn | nsn | rightlink | flags +------+-----+------------+------- +- 0/1 | 0/0 | 4294967295 | {} +-(1 row) +- +-SELECT * FROM 
gist_page_opaque_info(get_raw_page('test_gist_idx', 1)); +- lsn | nsn | rightlink | flags +------+-----+------------+-------- +- 0/1 | 0/0 | 4294967295 | {leaf} +-(1 row) +- +-SELECT * FROM gist_page_opaque_info(get_raw_page('test_gist_idx', 2)); +- lsn | nsn | rightlink | flags +------+-----+-----------+-------- +- 0/1 | 0/0 | 1 | {leaf} +-(1 row) +- + COMMIT; + SELECT * FROM gist_page_items(get_raw_page('test_gist_idx', 0), 'test_gist_idx'); + itemoffset | ctid | itemlen | dead | keys +diff --git a/contrib/pageinspect/sql/gist.sql b/contrib/pageinspect/sql/gist.sql +index d263542..607992f 100644 +--- a/contrib/pageinspect/sql/gist.sql ++++ b/contrib/pageinspect/sql/gist.sql +@@ -12,11 +12,6 @@ CREATE TABLE test_gist AS SELECT point(i,i) p, i::text t FROM + generate_series(1,1000) i; + CREATE INDEX test_gist_idx ON test_gist USING gist (p); + +--- Page 0 is the root, the rest are leaf pages +-SELECT * FROM gist_page_opaque_info(get_raw_page('test_gist_idx', 0)); +-SELECT * FROM gist_page_opaque_info(get_raw_page('test_gist_idx', 1)); +-SELECT * FROM gist_page_opaque_info(get_raw_page('test_gist_idx', 2)); +- + COMMIT; + + SELECT * FROM gist_page_items(get_raw_page('test_gist_idx', 0), 'test_gist_idx'); diff --git a/compute/patches/contrib_pg17.patch b/compute/patches/contrib_pg17.patch new file mode 100644 index 0000000000..0d6c1203b0 --- /dev/null +++ b/compute/patches/contrib_pg17.patch @@ -0,0 +1,196 @@ +diff --git a/contrib/amcheck/expected/check_heap.out b/contrib/amcheck/expected/check_heap.out +index 979e5e8..2375b45 100644 +--- a/contrib/amcheck/expected/check_heap.out ++++ b/contrib/amcheck/expected/check_heap.out +@@ -80,12 +80,9 @@ INSERT INTO heaptest (a, b) + -- same transaction. The heaptest table is smaller than the default + -- wal_skip_threshold, so a wal_level=minimal commit reads the table into + -- shared_buffers. A transaction delays that and excludes any autovacuum. +-SET allow_in_place_tablespaces = true; +-CREATE TABLESPACE regress_test_stats_tblspc LOCATION ''; + SELECT sum(reads) AS stats_bulkreads_before + FROM pg_stat_io WHERE context = 'bulkread' \gset + BEGIN; +-ALTER TABLE heaptest SET TABLESPACE regress_test_stats_tblspc; + -- Check that valid options are not rejected nor corruption reported + -- for a non-empty table + SELECT * FROM verify_heapam(relation := 'heaptest', skip := 'none'); +@@ -118,14 +115,6 @@ SELECT pg_stat_force_next_flush(); + + (1 row) + +-SELECT sum(reads) AS stats_bulkreads_after +- FROM pg_stat_io WHERE context = 'bulkread' \gset +-SELECT :stats_bulkreads_after > :stats_bulkreads_before; +- ?column? +----------- +- t +-(1 row) +- + CREATE ROLE regress_heaptest_role; + -- verify permissions are checked (error due to function not callable) + SET ROLE regress_heaptest_role; +@@ -233,7 +222,6 @@ ERROR: cannot check relation "test_foreign_table" + DETAIL: This operation is not supported for foreign tables. + -- cleanup + DROP TABLE heaptest; +-DROP TABLESPACE regress_test_stats_tblspc; + DROP TABLE test_partition; + DROP TABLE test_partitioned; + DROP OWNED BY regress_heaptest_role; -- permissions +diff --git a/contrib/amcheck/sql/check_heap.sql b/contrib/amcheck/sql/check_heap.sql +index 1745bae..3b429c3 100644 +--- a/contrib/amcheck/sql/check_heap.sql ++++ b/contrib/amcheck/sql/check_heap.sql +@@ -40,12 +40,9 @@ INSERT INTO heaptest (a, b) + -- same transaction. The heaptest table is smaller than the default + -- wal_skip_threshold, so a wal_level=minimal commit reads the table into + -- shared_buffers. 
A transaction delays that and excludes any autovacuum. +-SET allow_in_place_tablespaces = true; +-CREATE TABLESPACE regress_test_stats_tblspc LOCATION ''; + SELECT sum(reads) AS stats_bulkreads_before + FROM pg_stat_io WHERE context = 'bulkread' \gset + BEGIN; +-ALTER TABLE heaptest SET TABLESPACE regress_test_stats_tblspc; + -- Check that valid options are not rejected nor corruption reported + -- for a non-empty table + SELECT * FROM verify_heapam(relation := 'heaptest', skip := 'none'); +@@ -58,9 +55,6 @@ COMMIT; + -- ALTER TABLE ... SET TABLESPACE ... + -- causing an additional bulkread, which should be reflected in pg_stat_io. + SELECT pg_stat_force_next_flush(); +-SELECT sum(reads) AS stats_bulkreads_after +- FROM pg_stat_io WHERE context = 'bulkread' \gset +-SELECT :stats_bulkreads_after > :stats_bulkreads_before; + + CREATE ROLE regress_heaptest_role; + +@@ -140,7 +134,6 @@ SELECT * FROM verify_heapam('test_foreign_table', + + -- cleanup + DROP TABLE heaptest; +-DROP TABLESPACE regress_test_stats_tblspc; + DROP TABLE test_partition; + DROP TABLE test_partitioned; + DROP OWNED BY regress_heaptest_role; -- permissions +diff --git a/contrib/citext/expected/create_index_acl.out b/contrib/citext/expected/create_index_acl.out +index 33be13a..70a406c 100644 +--- a/contrib/citext/expected/create_index_acl.out ++++ b/contrib/citext/expected/create_index_acl.out +@@ -5,9 +5,6 @@ + -- owner having as few applicable privileges as possible. (The privileges.sql + -- regress_sro_user tests look for the opposite defect; they confirm that + -- DefineIndex() uses the table owner userid where necessary.) +-SET allow_in_place_tablespaces = true; +-CREATE TABLESPACE regress_create_idx_tblspace LOCATION ''; +-RESET allow_in_place_tablespaces; + BEGIN; + CREATE ROLE regress_minimal; + CREATE SCHEMA s; +@@ -49,11 +46,9 @@ ALTER TABLE s.x OWNER TO regress_minimal; + -- Empty-table DefineIndex() + CREATE UNIQUE INDEX u0rows ON s.x USING btree + ((s.index_this_expr(y, s.const())) COLLATE s.coll s.citext_pattern_ops) +- TABLESPACE regress_create_idx_tblspace + WHERE s.index_row_if(y); + ALTER TABLE s.x ADD CONSTRAINT e0rows EXCLUDE USING btree + ((s.index_this_expr(y, s.const())) COLLATE s.coll WITH s.=) +- USING INDEX TABLESPACE regress_create_idx_tblspace + WHERE (s.index_row_if(y)); + -- Make the table nonempty. + INSERT INTO s.x VALUES ('foo'), ('bar'); +@@ -66,11 +61,9 @@ RESET search_path; + GRANT EXECUTE ON FUNCTION s.index_this_expr TO regress_minimal; + CREATE UNIQUE INDEX u2rows ON s.x USING btree + ((s.index_this_expr(y, s.const())) COLLATE s.coll s.citext_pattern_ops) +- TABLESPACE regress_create_idx_tblspace + WHERE s.index_row_if(y); + ALTER TABLE s.x ADD CONSTRAINT e2rows EXCLUDE USING btree + ((s.index_this_expr(y, s.const())) COLLATE s.coll WITH s.=) +- USING INDEX TABLESPACE regress_create_idx_tblspace + WHERE (s.index_row_if(y)); + -- Shall not find s.coll via search_path, despite the s.const->public.setter + -- call having set search_path=s during expression planning. 
Suppress the +@@ -78,9 +71,7 @@ ALTER TABLE s.x ADD CONSTRAINT e2rows EXCLUDE USING btree + \set VERBOSITY sqlstate + ALTER TABLE s.x ADD CONSTRAINT underqualified EXCLUDE USING btree + ((s.index_this_expr(y, s.const())) COLLATE coll WITH s.=) +- USING INDEX TABLESPACE regress_create_idx_tblspace + WHERE (s.index_row_if(y)); + ERROR: 42704 + \set VERBOSITY default + ROLLBACK; +-DROP TABLESPACE regress_create_idx_tblspace; +diff --git a/contrib/citext/sql/create_index_acl.sql b/contrib/citext/sql/create_index_acl.sql +index 10b5225..ae442e1 100644 +--- a/contrib/citext/sql/create_index_acl.sql ++++ b/contrib/citext/sql/create_index_acl.sql +@@ -6,10 +6,6 @@ + -- regress_sro_user tests look for the opposite defect; they confirm that + -- DefineIndex() uses the table owner userid where necessary.) + +-SET allow_in_place_tablespaces = true; +-CREATE TABLESPACE regress_create_idx_tblspace LOCATION ''; +-RESET allow_in_place_tablespaces; +- + BEGIN; + CREATE ROLE regress_minimal; + CREATE SCHEMA s; +@@ -51,11 +47,9 @@ ALTER TABLE s.x OWNER TO regress_minimal; + -- Empty-table DefineIndex() + CREATE UNIQUE INDEX u0rows ON s.x USING btree + ((s.index_this_expr(y, s.const())) COLLATE s.coll s.citext_pattern_ops) +- TABLESPACE regress_create_idx_tblspace + WHERE s.index_row_if(y); + ALTER TABLE s.x ADD CONSTRAINT e0rows EXCLUDE USING btree + ((s.index_this_expr(y, s.const())) COLLATE s.coll WITH s.=) +- USING INDEX TABLESPACE regress_create_idx_tblspace + WHERE (s.index_row_if(y)); + -- Make the table nonempty. + INSERT INTO s.x VALUES ('foo'), ('bar'); +@@ -68,11 +62,9 @@ RESET search_path; + GRANT EXECUTE ON FUNCTION s.index_this_expr TO regress_minimal; + CREATE UNIQUE INDEX u2rows ON s.x USING btree + ((s.index_this_expr(y, s.const())) COLLATE s.coll s.citext_pattern_ops) +- TABLESPACE regress_create_idx_tblspace + WHERE s.index_row_if(y); + ALTER TABLE s.x ADD CONSTRAINT e2rows EXCLUDE USING btree + ((s.index_this_expr(y, s.const())) COLLATE s.coll WITH s.=) +- USING INDEX TABLESPACE regress_create_idx_tblspace + WHERE (s.index_row_if(y)); + -- Shall not find s.coll via search_path, despite the s.const->public.setter + -- call having set search_path=s during expression planning. 
Suppress the +@@ -80,9 +72,7 @@ ALTER TABLE s.x ADD CONSTRAINT e2rows EXCLUDE USING btree + \set VERBOSITY sqlstate + ALTER TABLE s.x ADD CONSTRAINT underqualified EXCLUDE USING btree + ((s.index_this_expr(y, s.const())) COLLATE coll WITH s.=) +- USING INDEX TABLESPACE regress_create_idx_tblspace + WHERE (s.index_row_if(y)); + \set VERBOSITY default + ROLLBACK; + +-DROP TABLESPACE regress_create_idx_tblspace; +diff --git a/contrib/file_fdw/expected/file_fdw.out b/contrib/file_fdw/expected/file_fdw.out +index 86c148a..81bdb2c 100644 +--- a/contrib/file_fdw/expected/file_fdw.out ++++ b/contrib/file_fdw/expected/file_fdw.out +@@ -4,6 +4,7 @@ + -- directory paths are passed to us in environment variables + \getenv abs_srcdir PG_ABS_SRCDIR + -- Clean up in case a prior regression run failed ++SET compute_query_id TO 'off'; + SET client_min_messages TO 'warning'; + DROP ROLE IF EXISTS regress_file_fdw_superuser, regress_file_fdw_user, regress_no_priv_user; + RESET client_min_messages; +diff --git a/contrib/file_fdw/sql/file_fdw.sql b/contrib/file_fdw/sql/file_fdw.sql +index f0548e1..848a08c 100644 +--- a/contrib/file_fdw/sql/file_fdw.sql ++++ b/contrib/file_fdw/sql/file_fdw.sql +@@ -6,6 +6,7 @@ + \getenv abs_srcdir PG_ABS_SRCDIR + + -- Clean up in case a prior regression run failed ++SET compute_query_id TO 'off'; + SET client_min_messages TO 'warning'; + DROP ROLE IF EXISTS regress_file_fdw_superuser, regress_file_fdw_user, regress_no_priv_user; + RESET client_min_messages; diff --git a/docker-compose/compute_wrapper/Dockerfile b/docker-compose/compute_wrapper/Dockerfile index 61f44681da..b5f0f47ceb 100644 --- a/docker-compose/compute_wrapper/Dockerfile +++ b/docker-compose/compute_wrapper/Dockerfile @@ -13,6 +13,6 @@ RUN echo 'Acquire::Retries "5";' > /etc/apt/apt.conf.d/80-retries && \ jq \ netcat-openbsd #This is required for the pg_hintplan test -RUN mkdir -p /ext-src/pg_hint_plan-src && chown postgres /ext-src/pg_hint_plan-src +RUN mkdir -p /ext-src/pg_hint_plan-src /postgres/contrib/file_fdw && chown postgres /ext-src/pg_hint_plan-src /postgres/contrib/file_fdw USER postgres diff --git a/docker-compose/docker_compose_test.sh b/docker-compose/docker_compose_test.sh index a05d6c043d..e0c537edf3 100755 --- a/docker-compose/docker_compose_test.sh +++ b/docker-compose/docker_compose_test.sh @@ -61,17 +61,32 @@ for pg_version in ${TEST_VERSION_ONLY-14 15 16 17}; do docker cp $TEST_CONTAINER_NAME:/ext-src/pg_hint_plan-src/data $TMPDIR/data docker cp $TMPDIR/data $COMPUTE_CONTAINER_NAME:/ext-src/pg_hint_plan-src/ rm -rf $TMPDIR + # The following block does the same for the contrib/file_fdw test + TMPDIR=$(mktemp -d) + docker cp $TEST_CONTAINER_NAME:/postgres/contrib/file_fdw/data $TMPDIR/data + docker cp $TMPDIR/data $COMPUTE_CONTAINER_NAME:/postgres/contrib/file_fdw/data + rm -rf $TMPDIR + # Apply patches + cat ../compute/patches/contrib_pg${pg_version}.patch | docker exec -i $TEST_CONTAINER_NAME bash -c "(cd /postgres && patch -p1)" # We are running tests now - if ! 
docker exec -e SKIP=timescaledb-src,rdkit-src,postgis-src,pgx_ulid-src,pgtap-src,pg_tiktoken-src,pg_jsonschema-src,kq_imcx-src,wal2json_2_5-src \ - $TEST_CONTAINER_NAME /run-tests.sh | tee testout.txt - then - FAILED=$(tail -1 testout.txt) - for d in $FAILED - do - mkdir $d - docker cp $TEST_CONTAINER_NAME:/ext-src/$d/regression.diffs $d || true - docker cp $TEST_CONTAINER_NAME:/ext-src/$d/regression.out $d || true - cat $d/regression.out $d/regression.diffs || true + rm -f testout.txt testout_contrib.txt + docker exec -e USE_PGXS=1 -e SKIP=timescaledb-src,rdkit-src,postgis-src,pgx_ulid-src,pgtap-src,pg_tiktoken-src,pg_jsonschema-src,kq_imcx-src,wal2json_2_5-src \ + $TEST_CONTAINER_NAME /run-tests.sh /ext-src | tee testout.txt && EXT_SUCCESS=1 || EXT_SUCCESS=0 + docker exec -e SKIP=start-scripts,postgres_fdw,ltree_plpython,jsonb_plpython,jsonb_plperl,hstore_plpython,hstore_plperl,dblink,bool_plperl \ + $TEST_CONTAINER_NAME /run-tests.sh /postgres/contrib | tee testout_contrib.txt && CONTRIB_SUCCESS=1 || CONTRIB_SUCCESS=0 + if [ $EXT_SUCCESS -eq 0 ] || [ $CONTRIB_SUCCESS -eq 0 ]; then + CONTRIB_FAILED= + FAILED= + [ $EXT_SUCCESS -eq 0 ] && FAILED=$(tail -1 testout.txt | awk '{for(i=1;i<=NF;i++){print "/ext-src/"$i;}}') + [ $CONTRIB_SUCCESS -eq 0 ] && CONTRIB_FAILED=$(tail -1 testout_contrib.txt | awk '{for(i=0;i<=NF;i++){print "/postgres/contrib/"$i;}}') + for d in $FAILED $CONTRIB_FAILED; do + dn="$(basename $d)" + rm -rf $dn + mkdir $dn + docker cp $TEST_CONTAINER_NAME:$d/regression.diffs $dn || [ $? -eq 1 ] + docker cp $TEST_CONTAINER_NAME:$d/regression.out $dn || [ $? -eq 1 ] + cat $dn/regression.out $dn/regression.diffs || true + rm -rf $dn done rm -rf $FAILED exit 1 diff --git a/docker-compose/run-tests.sh b/docker-compose/run-tests.sh index 1e794a42a1..72ae61b032 100644 --- a/docker-compose/run-tests.sh +++ b/docker-compose/run-tests.sh @@ -1,9 +1,11 @@ #!/bin/bash set -x -cd /ext-src || exit 2 +extdir=${1} + +cd "${extdir}" || exit 2 FAILED= -LIST=$( (echo -e "${SKIP//","/"\n"}"; ls -d -- *-src) | sort | uniq -u) +LIST=$( (echo -e "${SKIP//","/"\n"}"; ls) | sort | uniq -u) for d in ${LIST}; do [ -d "${d}" ] || continue if ! psql -w -c "select 1" >/dev/null; then From 6da7c556c2f3725d51ea2c626491a8011be97f04 Mon Sep 17 00:00:00 2001 From: John Spray Date: Thu, 30 Jan 2025 20:33:22 +0000 Subject: [PATCH 67/72] pageserver: fix race cleaning up timeline files when shut down during bootstrap (#10532) ## Problem Timeline bootstrap starts a flush loop, but doesn't reliably shut down the timeline (incl. waiting for flush loop to exit) before destroying UninitializedTimeline, and that destructor tries to clean up local storage. If local storage is still being written to, then this is unsound. Currently the symptom is that we see a "Directory not empty" error log, e.g. 
https://neon-github-public-dev.s3.amazonaws.com/reports/main/12966756686/index.html#testresult/5523f7d15f46f7f7/retries ## Summary of changes - Move fallible IO part of bootstrap into a function (notably, this is fallible in the case of the tenant being shut down while creation is happening) - When that function returns an error, call shutdown() on the timeline --- pageserver/src/http/routes.rs | 8 +- pageserver/src/tenant.rs | 68 ++++++------ pageserver/src/tenant/timeline/uninit.rs | 129 +++++++++++++++++------ 3 files changed, 131 insertions(+), 74 deletions(-) diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 0f3e9fdab6..2548a11b2e 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -3169,12 +3169,16 @@ async fn put_tenant_timeline_import_basebackup( let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn); - let span = info_span!("import_basebackup", tenant_id=%tenant_id, timeline_id=%timeline_id, base_lsn=%base_lsn, end_lsn=%end_lsn, pg_version=%pg_version); + let tenant_shard_id = TenantShardId::unsharded(tenant_id); + + let span = info_span!("import_basebackup", + tenant_id=%tenant_id, timeline_id=%timeline_id, shard_id=%tenant_shard_id.shard_slug(), + base_lsn=%base_lsn, end_lsn=%end_lsn, pg_version=%pg_version); async move { let state = get_state(&request); let tenant = state .tenant_manager - .get_attached_tenant_shard(TenantShardId::unsharded(tenant_id))?; + .get_attached_tenant_shard(tenant_shard_id)?; let broker_client = state.broker_client.clone(); diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 085f76c05d..657cc78e2c 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -2426,7 +2426,7 @@ impl Tenant { // Make sure the freeze_and_flush reaches remote storage. tline.remote_client.wait_completion().await.unwrap(); - let tl = uninit_tl.finish_creation()?; + let tl = uninit_tl.finish_creation().await?; // The non-test code would call tl.activate() here. tl.set_state(TimelineState::Active); Ok(tl) @@ -4702,7 +4702,7 @@ impl Tenant { ) .await?; - let new_timeline = uninitialized_timeline.finish_creation()?; + let new_timeline = uninitialized_timeline.finish_creation().await?; // Root timeline gets its layers during creation and uploads them along with the metadata. // A branch timeline though, when created, can get no writes for some time, hence won't get any layers created. @@ -4892,10 +4892,11 @@ impl Tenant { } // this new directory is very temporary, set to remove it immediately after bootstrap, we don't need it + let pgdata_path_deferred = pgdata_path.clone(); scopeguard::defer! 
{ - if let Err(e) = fs::remove_dir_all(&pgdata_path) { + if let Err(e) = fs::remove_dir_all(&pgdata_path_deferred) { // this is unlikely, but we will remove the directory on pageserver restart or another bootstrap call - error!("Failed to remove temporary initdb directory '{pgdata_path}': {e}"); + error!("Failed to remove temporary initdb directory '{pgdata_path_deferred}': {e}"); } } if let Some(existing_initdb_timeline_id) = load_existing_initdb { @@ -4962,7 +4963,7 @@ impl Tenant { pgdata_lsn, pg_version, ); - let raw_timeline = self + let mut raw_timeline = self .prepare_new_timeline( timeline_id, &new_metadata, @@ -4973,42 +4974,33 @@ impl Tenant { .await?; let tenant_shard_id = raw_timeline.owning_tenant.tenant_shard_id; - let unfinished_timeline = raw_timeline.raw_timeline()?; - - // Flush the new layer files to disk, before we make the timeline as available to - // the outside world. - // - // Flush loop needs to be spawned in order to be able to flush. - unfinished_timeline.maybe_spawn_flush_loop(); - - import_datadir::import_timeline_from_postgres_datadir( - unfinished_timeline, - &pgdata_path, - pgdata_lsn, - ctx, - ) - .await - .with_context(|| { - format!("Failed to import pgdatadir for timeline {tenant_shard_id}/{timeline_id}") - })?; - - fail::fail_point!("before-checkpoint-new-timeline", |_| { - Err(CreateTimelineError::Other(anyhow::anyhow!( - "failpoint before-checkpoint-new-timeline" - ))) - }); - - unfinished_timeline - .freeze_and_flush() - .await - .with_context(|| { - format!( - "Failed to flush after pgdatadir import for timeline {tenant_shard_id}/{timeline_id}" + raw_timeline + .write(|unfinished_timeline| async move { + import_datadir::import_timeline_from_postgres_datadir( + &unfinished_timeline, + &pgdata_path, + pgdata_lsn, + ctx, ) - })?; + .await + .with_context(|| { + format!( + "Failed to import pgdatadir for timeline {tenant_shard_id}/{timeline_id}" + ) + })?; + + fail::fail_point!("before-checkpoint-new-timeline", |_| { + Err(CreateTimelineError::Other(anyhow::anyhow!( + "failpoint before-checkpoint-new-timeline" + ))) + }); + + Ok(()) + }) + .await?; // All done! - let timeline = raw_timeline.finish_creation()?; + let timeline = raw_timeline.finish_creation().await?; // Callers are responsible to wait for uploads to complete and for activating the timeline. 
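
The race being fixed here has a simple shape: an RAII guard whose `Drop` impl deletes the timeline directory can run while the timeline's background flush loop is still writing into that directory, so every failure path now has to shut the timeline down explicitly before the guard is dropped. The sketch below illustrates only that shape in a minimal, self-contained form; all names in it (`CreationGuard`, `start_writer`, `abort`) are invented for illustration and are not part of this patch, which implements the pattern via `UninitializedTimeline::write()`/`abort()` and `Timeline::shutdown()` in the `uninit.rs` hunks that follow.

```rust
use std::fs;
use std::path::PathBuf;
use std::sync::mpsc;
use std::thread::JoinHandle;
use std::time::Duration;

/// Illustrative stand-in for the creation guard: owns a directory that Drop
/// cleans up, plus an optional background writer that must be stopped first.
struct CreationGuard {
    dir: PathBuf,
    writer: Option<(mpsc::Sender<()>, JoinHandle<()>)>,
}

impl CreationGuard {
    fn new(dir: PathBuf) -> std::io::Result<Self> {
        fs::create_dir_all(&dir)?;
        Ok(Self { dir, writer: None })
    }

    /// Analogue of spawning the flush loop: after this, `abort()` must run on
    /// every failure path before the guard is dropped.
    fn start_writer(&mut self) {
        let (stop_tx, stop_rx) = mpsc::channel::<()>();
        let dir = self.dir.clone();
        let handle = std::thread::spawn(move || {
            let mut n = 0u32;
            // Keep writing files until the stop signal arrives.
            while matches!(
                stop_rx.recv_timeout(Duration::from_millis(10)),
                Err(mpsc::RecvTimeoutError::Timeout)
            ) {
                let _ = fs::write(dir.join(format!("layer-{n}")), b"data");
                n += 1;
            }
        });
        self.writer = Some((stop_tx, handle));
    }

    /// Analogue of the patch's `abort()`: stop and join the writer so that
    /// Drop's cleanup cannot race with in-flight writes.
    fn abort(&mut self) {
        if let Some((stop_tx, handle)) = self.writer.take() {
            let _ = stop_tx.send(());
            let _ = handle.join();
        }
    }
}

impl Drop for CreationGuard {
    fn drop(&mut self) {
        if self.writer.is_some() {
            // Mirrors the new warning path: the caller forgot to abort, so
            // deleting the directory now would be unsound; skip cleanup.
            eprintln!("background writer still running at drop; skipping cleanup");
        } else {
            let _ = fs::remove_dir_all(&self.dir);
        }
    }
}

fn main() -> std::io::Result<()> {
    let mut guard = CreationGuard::new(std::env::temp_dir().join("creation-guard-demo"))?;
    guard.start_writer();
    // Simulated failure during creation: shut the writer down before the guard
    // is dropped, so remove_dir_all cannot race with an in-flight write.
    guard.abort();
    Ok(())
}
```

In the patch itself the same idea is expressed asynchronously: `write()` records `needs_shutdown`, runs the fallible import closure, and calls `abort()` (a hard `Timeline::shutdown`) before propagating any error, while `Drop` only warns and skips cleanup if that contract was violated.
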
diff --git a/pageserver/src/tenant/timeline/uninit.rs b/pageserver/src/tenant/timeline/uninit.rs index 80a09b4840..3074463384 100644 --- a/pageserver/src/tenant/timeline/uninit.rs +++ b/pageserver/src/tenant/timeline/uninit.rs @@ -1,4 +1,4 @@ -use std::{collections::hash_map::Entry, fs, sync::Arc}; +use std::{collections::hash_map::Entry, fs, future::Future, sync::Arc}; use anyhow::Context; use camino::Utf8PathBuf; @@ -8,7 +8,8 @@ use utils::{fs_ext, id::TimelineId, lsn::Lsn, sync::gate::GateGuard}; use crate::{ context::RequestContext, import_datadir, - tenant::{CreateTimelineIdempotency, Tenant, TimelineOrOffloaded}, + span::debug_assert_current_span_has_tenant_and_timeline_id, + tenant::{CreateTimelineError, CreateTimelineIdempotency, Tenant, TimelineOrOffloaded}, }; use super::Timeline; @@ -24,6 +25,9 @@ pub struct UninitializedTimeline<'t> { pub(crate) owning_tenant: &'t Tenant, timeline_id: TimelineId, raw_timeline: Option<(Arc, TimelineCreateGuard)>, + /// Whether we spawned the inner Timeline's tasks such that we must later shut it down + /// if aborting the timeline creation + needs_shutdown: bool, } impl<'t> UninitializedTimeline<'t> { @@ -36,6 +40,50 @@ impl<'t> UninitializedTimeline<'t> { owning_tenant, timeline_id, raw_timeline, + needs_shutdown: false, + } + } + + /// When writing data to this timeline during creation, use this wrapper: it will take care of + /// setup of Timeline tasks required for I/O (flush loop) and making sure they are torn down + /// later. + pub(crate) async fn write(&mut self, f: F) -> anyhow::Result<()> + where + F: FnOnce(Arc) -> Fut, + Fut: Future>, + { + debug_assert_current_span_has_tenant_and_timeline_id(); + + // Remember that we did I/O (spawned the flush loop), so that we can check we shut it down on drop + self.needs_shutdown = true; + + let timeline = self.raw_timeline()?; + + // Spawn flush loop so that the Timeline is ready to accept writes + timeline.maybe_spawn_flush_loop(); + + // Invoke the provided function, which will write some data into the new timeline + if let Err(e) = f(timeline.clone()).await { + self.abort().await; + return Err(e.into()); + } + + // Flush the underlying timeline's ephemeral layers to disk + if let Err(e) = timeline + .freeze_and_flush() + .await + .context("Failed to flush after timeline creation writes") + { + self.abort().await; + return Err(e); + } + + Ok(()) + } + + pub(crate) async fn abort(&self) { + if let Some((raw_timeline, _)) = self.raw_timeline.as_ref() { + raw_timeline.shutdown(super::ShutdownMode::Hard).await; } } @@ -44,11 +92,13 @@ impl<'t> UninitializedTimeline<'t> { /// This function launches the flush loop if not already done. /// /// The caller is responsible for activating the timeline (function `.activate()`). 
- pub(crate) fn finish_creation(mut self) -> anyhow::Result> { + pub(crate) async fn finish_creation(mut self) -> anyhow::Result> { let timeline_id = self.timeline_id; let tenant_shard_id = self.owning_tenant.tenant_shard_id; if self.raw_timeline.is_none() { + self.abort().await; + return Err(anyhow::anyhow!( "No timeline for initialization found for {tenant_shard_id}/{timeline_id}" )); @@ -62,16 +112,25 @@ impl<'t> UninitializedTimeline<'t> { .0 .get_disk_consistent_lsn(); - anyhow::ensure!( - new_disk_consistent_lsn.is_valid(), - "new timeline {tenant_shard_id}/{timeline_id} has invalid disk_consistent_lsn" - ); + if !new_disk_consistent_lsn.is_valid() { + self.abort().await; + + return Err(anyhow::anyhow!( + "new timeline {tenant_shard_id}/{timeline_id} has invalid disk_consistent_lsn" + )); + } let mut timelines = self.owning_tenant.timelines.lock().unwrap(); match timelines.entry(timeline_id) { - Entry::Occupied(_) => anyhow::bail!( + Entry::Occupied(_) => { + // Unexpected, bug in the caller. Tenant is responsible for preventing concurrent creation of the same timeline. + // + // We do not call Self::abort here. Because we don't cleanly shut down our Timeline, [`Self::drop`] should + // skip trying to delete the timeline directory too. + anyhow::bail!( "Found freshly initialized timeline {tenant_shard_id}/{timeline_id} in the tenant map" - ), + ) + } Entry::Vacant(v) => { // after taking here should be no fallible operations, because the drop guard will not // cleanup after and would block for example the tenant deletion @@ -93,36 +152,31 @@ impl<'t> UninitializedTimeline<'t> { /// Prepares timeline data by loading it from the basebackup archive. pub(crate) async fn import_basebackup_from_tar( - self, + mut self, tenant: Arc, copyin_read: &mut (impl tokio::io::AsyncRead + Send + Sync + Unpin), base_lsn: Lsn, broker_client: storage_broker::BrokerClientChannel, ctx: &RequestContext, ) -> anyhow::Result> { - let raw_timeline = self.raw_timeline()?; + self.write(|raw_timeline| async move { + import_datadir::import_basebackup_from_tar(&raw_timeline, copyin_read, base_lsn, ctx) + .await + .context("Failed to import basebackup") + .map_err(CreateTimelineError::Other)?; - import_datadir::import_basebackup_from_tar(raw_timeline, copyin_read, base_lsn, ctx) - .await - .context("Failed to import basebackup")?; + fail::fail_point!("before-checkpoint-new-timeline", |_| { + Err(CreateTimelineError::Other(anyhow::anyhow!( + "failpoint before-checkpoint-new-timeline" + ))) + }); - // Flush the new layer files to disk, before we make the timeline as available to - // the outside world. - // - // Flush loop needs to be spawned in order to be able to flush. - raw_timeline.maybe_spawn_flush_loop(); - - fail::fail_point!("before-checkpoint-new-timeline", |_| { - anyhow::bail!("failpoint before-checkpoint-new-timeline"); - }); - - raw_timeline - .freeze_and_flush() - .await - .context("Failed to flush after basebackup import")?; + Ok(()) + }) + .await?; // All the data has been imported. 
Insert the Timeline into the tenant's timelines map - let tl = self.finish_creation()?; + let tl = self.finish_creation().await?; tl.activate(tenant, broker_client, None, ctx); Ok(tl) } @@ -143,12 +197,19 @@ impl<'t> UninitializedTimeline<'t> { impl Drop for UninitializedTimeline<'_> { fn drop(&mut self) { - if let Some((_, create_guard)) = self.raw_timeline.take() { + if let Some((timeline, create_guard)) = self.raw_timeline.take() { let _entered = info_span!("drop_uninitialized_timeline", tenant_id = %self.owning_tenant.tenant_shard_id.tenant_id, shard_id = %self.owning_tenant.tenant_shard_id.shard_slug(), timeline_id = %self.timeline_id).entered(); - // This is unusual, but can happen harmlessly if the pageserver is stopped while - // creating a timeline. - info!("Timeline got dropped without initializing, cleaning its files"); - cleanup_timeline_directory(create_guard); + if self.needs_shutdown && !timeline.gate.close_complete() { + // This should not happen: caller should call [`Self::abort`] on failures + tracing::warn!( + "Timeline not shut down after initialization failure, cannot clean up files" + ); + } else { + // This is unusual, but can happen harmlessly if the pageserver is stopped while + // creating a timeline. + info!("Timeline got dropped without initializing, cleaning its files"); + cleanup_timeline_directory(create_guard); + } } } } From d18f6198e15d5201db3c3fa5e3d475ad9de334fb Mon Sep 17 00:00:00 2001 From: John Spray Date: Thu, 30 Jan 2025 22:17:07 +0000 Subject: [PATCH 68/72] storcon: fix AZ-driven tenant selection in chaos (#10443) ## Problem In https://github.com/neondatabase/neon/pull/10438 I had got the function for picking tenants backwards, and it was preferring to move things _away_ from their preferred AZ. ## Summary of changes - Fix condition in `is_attached_outside_preferred_az` --- storage_controller/src/tenant_shard.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/storage_controller/src/tenant_shard.rs b/storage_controller/src/tenant_shard.rs index cbc2696b26..d344e27e31 100644 --- a/storage_controller/src/tenant_shard.rs +++ b/storage_controller/src/tenant_shard.rs @@ -1806,7 +1806,7 @@ impl TenantShard { .get(&node_id) .expect("referenced node exists") .get_availability_zone_id(), - ) == self.intent.preferred_az_id.as_ref() + ) != self.intent.preferred_az_id.as_ref() }) .unwrap_or(false) } From e1273acdb1e018352d97ea11bf65adf90db69c78 Mon Sep 17 00:00:00 2001 From: John Spray Date: Thu, 30 Jan 2025 22:43:36 +0000 Subject: [PATCH 69/72] pageserver: handle shutdown cleanly in layer download API (#10598) ## Problem This API is used in tests and occasionally for support. It cast all errors to 500. 
That can cause a failure on the log checks: https://neon-github-public-dev.s3.amazonaws.com/reports/main/13056992876/index.html#suites/ad9c266207b45eafe19909d1020dd987/683a7031d877f3db/ ## Summary of changes - Avoid using generic anyhow::Error for layer downloads - Map shutdown cases to 503 in http route --- pageserver/src/http/routes.rs | 8 +++++++- pageserver/src/tenant/storage_layer/layer.rs | 2 +- pageserver/src/tenant/timeline.rs | 12 ++++++++++-- test_runner/regress/test_ondemand_download.py | 19 +++++++------------ 4 files changed, 25 insertions(+), 16 deletions(-) diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 2548a11b2e..eb9cb4da0c 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -1472,7 +1472,13 @@ async fn layer_download_handler( let downloaded = timeline .download_layer(&layer_name) .await - .map_err(ApiError::InternalServerError)?; + .map_err(|e| match e { + tenant::storage_layer::layer::DownloadError::TimelineShutdown + | tenant::storage_layer::layer::DownloadError::DownloadCancelled => { + ApiError::ShuttingDown + } + other => ApiError::InternalServerError(other.into()), + })?; match downloaded { Some(true) => json_response(StatusCode::OK, ()), diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs index 99e0ff1aa5..92313afba7 100644 --- a/pageserver/src/tenant/storage_layer/layer.rs +++ b/pageserver/src/tenant/storage_layer/layer.rs @@ -340,7 +340,7 @@ impl Layer { /// Download the layer if evicted. /// /// Will not error when the layer is already downloaded. - pub(crate) async fn download(&self) -> anyhow::Result<()> { + pub(crate) async fn download(&self) -> Result<(), DownloadError> { self.0.get_or_maybe_download(true, None).await?; Ok(()) } diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index f387c81c29..827601fa8b 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -2028,8 +2028,16 @@ impl Timeline { pub(crate) async fn download_layer( &self, layer_file_name: &LayerName, - ) -> anyhow::Result> { - let Some(layer) = self.find_layer(layer_file_name).await? else { + ) -> Result, super::storage_layer::layer::DownloadError> { + let Some(layer) = self + .find_layer(layer_file_name) + .await + .map_err(|e| match e { + layer_manager::Shutdown => { + super::storage_layer::layer::DownloadError::TimelineShutdown + } + })? 
+ else { return Ok(None); }; diff --git a/test_runner/regress/test_ondemand_download.py b/test_runner/regress/test_ondemand_download.py index 028d1c2e49..c344f30f4d 100644 --- a/test_runner/regress/test_ondemand_download.py +++ b/test_runner/regress/test_ondemand_download.py @@ -27,6 +27,7 @@ from fixtures.pageserver.utils import ( ) from fixtures.remote_storage import RemoteStorageKind, S3Storage, s3_storage from fixtures.utils import query_scalar, wait_until +from urllib3 import Retry if TYPE_CHECKING: from typing import Any @@ -676,16 +677,14 @@ def test_layer_download_cancelled_by_config_location(neon_env_builder: NeonEnvBu "compaction_period": "0s", } ) - client = env.pageserver.http_client() + + # Disable retries, because we'll hit code paths that can give us + # 503 and want to see that directly + client = env.pageserver.http_client(retries=Retry(status=0)) + failpoint = "before-downloading-layer-stream-pausable" client.configure_failpoints((failpoint, "pause")) - env.pageserver.allowed_errors.extend( - [ - ".*downloading failed, possibly for shutdown.*", - ] - ) - info = client.layer_map_info(env.initial_tenant, env.initial_timeline) assert len(info.delta_layers()) == 1 @@ -720,13 +719,9 @@ def test_layer_download_cancelled_by_config_location(neon_env_builder: NeonEnvBu client.configure_failpoints((failpoint, "off")) - with pytest.raises( - PageserverApiException, match="downloading failed, possibly for shutdown" - ): + with pytest.raises(PageserverApiException, match="Shutting down"): download.result() - env.pageserver.assert_log_contains(".*downloading failed, possibly for shutdown.*") - detach.result() client.configure_failpoints((failpoint, "pause")) From 5e0c40709f8a18477454beb0fe7cb6d9d522dbf7 Mon Sep 17 00:00:00 2001 From: John Spray Date: Thu, 30 Jan 2025 22:45:43 +0000 Subject: [PATCH 70/72] storcon: refine chaos selection logic (#10600) ## Problem In https://github.com/neondatabase/neon/pull/10438 it was pointed out that it would be good to avoid picking tenants in ID order, and also to avoid situations where we might double-select the same tenant. There was an initial swing at this in https://github.com/neondatabase/neon/pull/10443, where Chi suggested a simpler approach which is done in this PR ## Summary of changes - Split total set of tenants into in and out of home AZ - Consume out of home AZ first, and if necessary shuffle + consume from out of home AZ --- .../src/service/chaos_injector.rs | 37 ++++++++++++------- 1 file changed, 23 insertions(+), 14 deletions(-) diff --git a/storage_controller/src/service/chaos_injector.rs b/storage_controller/src/service/chaos_injector.rs index 98034421d6..91d7183fde 100644 --- a/storage_controller/src/service/chaos_injector.rs +++ b/storage_controller/src/service/chaos_injector.rs @@ -96,29 +96,38 @@ impl ChaosInjector { let batch_size = 128; let mut inner = self.service.inner.write().unwrap(); let (nodes, tenants, scheduler) = inner.parts_mut(); - let tenant_ids = tenants.keys().cloned().collect::>(); // Prefer to migrate tenants that are currently outside their home AZ. This avoids the chaos injector // continuously pushing tenants outside their home AZ: instead, we'll tend to cycle between picking some // random tenants to move, and then on next chaos iteration moving them back, then picking some new // random tenants on the next iteration. 
+        let (out_of_home_az, in_home_az): (Vec<_>, Vec<_>) = tenants
+            .values()
+            .map(|shard| {
+                (
+                    shard.tenant_shard_id,
+                    shard.is_attached_outside_preferred_az(nodes),
+                )
+            })
+            .partition(|(_id, is_outside)| *is_outside);
+
+        let mut out_of_home_az: Vec<_> = out_of_home_az.into_iter().map(|(id, _)| id).collect();
+        let mut in_home_az: Vec<_> = in_home_az.into_iter().map(|(id, _)| id).collect();
+
         let mut victims = Vec::with_capacity(batch_size);
-        for shard in tenants.values() {
-            if shard.is_attached_outside_preferred_az(nodes) {
-                victims.push(shard.tenant_shard_id);
-            }
+        if out_of_home_az.len() >= batch_size {
+            tracing::info!("Injecting chaos: found {batch_size} shards to migrate back to home AZ (total {} out of home AZ)", out_of_home_az.len());

-            if victims.len() >= batch_size {
-                break;
-            }
+            out_of_home_az.shuffle(&mut thread_rng());
+            victims.extend(out_of_home_az.into_iter().take(batch_size));
+        } else {
+            tracing::info!("Injecting chaos: found {} shards to migrate back to home AZ, picking {} random shards to migrate", out_of_home_az.len(), std::cmp::min(batch_size - out_of_home_az.len(), in_home_az.len()));
+
+            victims.extend(out_of_home_az);
+            in_home_az.shuffle(&mut thread_rng());
+            victims.extend(in_home_az.into_iter().take(batch_size - victims.len()));
         }

-        let choose_random = batch_size.saturating_sub(victims.len());
-        tracing::info!("Injecting chaos: found {} shards to migrate back to home AZ, picking {choose_random} random shards to migrate", victims.len());
-
-        let random_victims = tenant_ids.choose_multiple(&mut thread_rng(), choose_random);
-        victims.extend(random_victims);
-
         for victim in victims {
             self.maybe_migrate_to_secondary(victim, nodes, tenants, scheduler);
         }

From df87a55609d2c9824c7445ef59a1391157ff6b55 Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas
Date: Thu, 30 Jan 2025 23:55:17 +0100
Subject: [PATCH 71/72] tests: Speed up test_pgdata_import_smoke on Postgres v17 (#10567)

The test runs this query:

    select count(*), sum(data::bigint)::bigint from t

to validate the test results between each part of the test. It performs
a simple sequential scan and aggregation, but was taking an order of
magnitude longer on v17 than on previous Postgres versions, which
sometimes caused the test to time out. There were two reasons for that:

1. On v17, the planner estimates the table to have only one row. In
reality it has 305790 rows, and older versions estimated it at 611580,
which is not too bad given that the table has not been analyzed, so the
planner bases that estimate just on the number of pages and the widths
of the datatypes. The new estimate of 1 row is much worse, and it leads
the planner to disregard parallel plans, whereas on older versions you
got a Parallel Seq Scan.

I tracked this down to upstream commit 29cf61ade3, "Consider fillfactor
when estimating relation size". With that commit, the
table_block_relation_estimate_size() function calculates that each page
accommodates less than 1 row when the fillfactor is taken into account,
which rounds down to 0. In reality, the executor will always place at
least one row on a page regardless of fillfactor, but the new estimation
formula doesn't take that into account.

I reported this to pgsql-hackers
(https://www.postgresql.org/message-id/2bf9d973-7789-4937-a7ca-0af9fb49c71e%40iki.fi);
we don't need to do anything more about it in neon. It's OK to not use
parallel scans here; once issue 2 below is addressed, the queries are
fast enough without parallelism.

2.
On v17, prefetching was not happening for the sequential scan. That's because starting with v17, buffers are reserved in the shared buffer cache before prefetching is initiated, and we use a tiny shared_buffers=1MB setting in the tests. The prefetching is effectively disabled with such a small shared_buffers setting, to protect the system from completely starving out of buffers. To address that, simply bump up shared_buffers in the test. This patch addresses the second issue, which is enough to fix the problem. --- test_runner/regress/test_import_pgdata.py | 24 +++++++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) diff --git a/test_runner/regress/test_import_pgdata.py b/test_runner/regress/test_import_pgdata.py index 086d4b67c9..6b35f3c6bb 100644 --- a/test_runner/regress/test_import_pgdata.py +++ b/test_runner/regress/test_import_pgdata.py @@ -70,6 +70,12 @@ def test_pgdata_import_smoke( env.pageserver.stop() env.pageserver.start() + # By default our tests run with a tiny shared_buffers=1MB setting. That + # doesn't allow any prefetching on v17 and above, where the new streaming + # read machinery keeps buffers pinned while prefetching them. Use a higher + # setting to enable prefetching and speed up the tests + ep_config = ["shared_buffers=64MB"] + # # Put data in vanilla pg # @@ -246,7 +252,11 @@ def test_pgdata_import_smoke( # ro_endpoint = env.endpoints.create_start( - branch_name=import_branch_name, endpoint_id="ro", tenant_id=tenant_id, lsn=last_record_lsn + branch_name=import_branch_name, + endpoint_id="ro", + tenant_id=tenant_id, + lsn=last_record_lsn, + config_lines=ep_config, ) validate_vanilla_equivalence(ro_endpoint) @@ -276,7 +286,10 @@ def test_pgdata_import_smoke( # validate that we can write # rw_endpoint = env.endpoints.create_start( - branch_name=import_branch_name, endpoint_id="rw", tenant_id=tenant_id + branch_name=import_branch_name, + endpoint_id="rw", + tenant_id=tenant_id, + config_lines=ep_config, ) rw_endpoint.safe_psql("create table othertable(values text)") rw_lsn = Lsn(rw_endpoint.safe_psql_scalar("select pg_current_wal_flush_lsn()")) @@ -296,7 +309,7 @@ def test_pgdata_import_smoke( ancestor_start_lsn=rw_lsn, ) br_tip_endpoint = env.endpoints.create_start( - branch_name="br-tip", endpoint_id="br-tip-ro", tenant_id=tenant_id + branch_name="br-tip", endpoint_id="br-tip-ro", tenant_id=tenant_id, config_lines=ep_config ) validate_vanilla_equivalence(br_tip_endpoint) br_tip_endpoint.safe_psql("select * from othertable") @@ -309,7 +322,10 @@ def test_pgdata_import_smoke( ancestor_start_lsn=initdb_lsn, ) br_initdb_endpoint = env.endpoints.create_start( - branch_name="br-initdb", endpoint_id="br-initdb-ro", tenant_id=tenant_id + branch_name="br-initdb", + endpoint_id="br-initdb-ro", + tenant_id=tenant_id, + config_lines=ep_config, ) validate_vanilla_equivalence(br_initdb_endpoint) with pytest.raises(psycopg2.errors.UndefinedTable): From a018878e2761d1766d2d91f1e5f5bf7ddb84b4ac Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Fri, 31 Jan 2025 06:02:08 +0000 Subject: [PATCH 72/72] Storage release 2025-01-31
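
Editor's illustration (not part of the patches above): the victim-selection change in PATCH 70/72 boils down to a partition/shuffle/take pattern. The sketch below is a minimal, self-contained approximation of that logic using hypothetical `Shard`/`TenantShardId` stand-ins rather than the storage controller's real types; it assumes the `rand` crate for `shuffle`/`thread_rng`, matching the calls visible in the diff.

```rust
use rand::seq::SliceRandom;
use rand::thread_rng;

/// Hypothetical stand-in for the storage controller's tenant shard ID type.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
struct TenantShardId(u32);

/// Hypothetical stand-in for a scheduled tenant shard.
struct Shard {
    id: TenantShardId,
    attached_outside_home_az: bool,
}

/// Pick up to `batch_size` chaos victims: prefer shards that are already
/// outside their home AZ (so they get moved back), and only then top up
/// with randomly chosen in-home-AZ shards. Shuffling within each group
/// avoids ID-order bias and double selection.
fn pick_victims(shards: &[Shard], batch_size: usize) -> Vec<TenantShardId> {
    let (mut out_of_home, mut in_home): (Vec<_>, Vec<_>) = shards
        .iter()
        .map(|s| (s.id, s.attached_outside_home_az))
        .partition(|(_, outside)| *outside);

    let mut victims = Vec::with_capacity(batch_size);
    if out_of_home.len() >= batch_size {
        // Plenty of out-of-home shards: pick a random subset of them.
        out_of_home.shuffle(&mut thread_rng());
        victims.extend(out_of_home.into_iter().map(|(id, _)| id).take(batch_size));
    } else {
        // Take every out-of-home shard, then fill the rest of the batch
        // with random in-home shards.
        victims.extend(out_of_home.into_iter().map(|(id, _)| id));
        let remaining = batch_size - victims.len();
        in_home.shuffle(&mut thread_rng());
        victims.extend(in_home.into_iter().map(|(id, _)| id).take(remaining));
    }
    victims
}

fn main() {
    // 1000 shards, every tenth one currently attached outside its home AZ.
    let shards: Vec<Shard> = (0..1000u32)
        .map(|i| Shard { id: TenantShardId(i), attached_outside_home_az: i % 10 == 0 })
        .collect();

    let victims = pick_victims(&shards, 128);
    // All 100 out-of-home shards are included; 28 random in-home shards top it up.
    assert_eq!(victims.len(), 128);
    println!("selected {} chaos victims", victims.len());
}
```

Compared with the removed code, a selection of this shape never double-selects a shard and does not favour low tenant IDs, which is what #10600 set out to fix.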