From 7c74112b2a6e23c07bfd9cc62c240cd6bbdd3bd9 Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Thu, 22 Aug 2024 11:04:42 +0100
Subject: [PATCH 001/159] pageserver: batch InMemoryLayer `put`s, remove need
 to sort items by LSN during ingest (#8591)

## Problem/Solution

TimelineWriter::put_batch is simply a loop over individual puts. Each
put acquires and releases locks, and checks for potentially starting a
new layer. Batching these is more efficient, but more importantly
unlocks future changes where we can pre-build serialized buffers much
earlier in the ingest process, potentially even on the safekeeper
(imagine a future model where some variant of DatadirModification lives
on the safekeeper).

Ensuring that the values in put_batch are written to one layer also
enables a simplification upstream, where we no longer need to write
values in LSN-order. This saves us a sort, but also simplifies follow-on
refactors to DatadirModification: we can store metadata keys and data
keys separately at that level without needing to zip them together in
LSN order later.

## Why?

In this PR, these changes are simplify optimizations, but they are
motivated by evolving the ingest path in the direction of disentangling
extracting DatadirModification from Timeline. It may not obvious how
right now, but the general idea is that we'll end up with three phases
of ingest:
- A) Decode walrecords and build a datadirmodification with all the
simple data contents already in a big serialized buffer ready to write
to an ephemeral layer **<-- this part can be pipelined and parallelized,
and done on a safekeeper!**
- B) Let that datadirmodification see a Timeline, so that it can also
generate all the metadata updates that require a read-modify-write of
existing pages
- C) Dump the results of B into an ephemeral layer.

Related: https://github.com/neondatabase/neon/issues/8452

## Caveats

Doing a big monolithic buffer of values to write to disk is ordinarily
an anti-pattern: we prefer nice streaming I/O. However:
- In future, when we do this first decode stage on the safekeeper, it
would be inefficient to serialize a Vec of Value, and then later
deserialize it just to add blob size headers while writing into the
ephemeral layer format. The idea is that for bulk write data, we will
serialize exactly once.
- The monolithic buffer is a stepping stone to pipelining more of this:
by seriailizing earlier (rather than at the final put_value), we will be
able to parallelize the wal decoding and bulk serialization of data page
writes.
- The ephemeral layer's buffered writer already stalls writes while it
waits to flush: so while yes we'll stall for a couple milliseconds to
write a couple megabytes, we already have stalls like this, just
distributed across smaller writes.

## Benchmarks

This PR is primarily a stepping stone to safekeeper ingest filtering,
but also provides a modest efficiency improvement to the `wal_recovery`
part of `test_bulk_ingest`.

test_bulk_ingest:

```
test_bulk_insert[neon-release-pg16].insert: 23.659 s
test_bulk_insert[neon-release-pg16].pageserver_writes: 5,428 MB
test_bulk_insert[neon-release-pg16].peak_mem: 626 MB
test_bulk_insert[neon-release-pg16].size: 0 MB
test_bulk_insert[neon-release-pg16].data_uploaded: 1,922 MB
test_bulk_insert[neon-release-pg16].num_files_uploaded: 8
test_bulk_insert[neon-release-pg16].wal_written: 1,382 MB
test_bulk_insert[neon-release-pg16].wal_recovery: 18.981 s
test_bulk_insert[neon-release-pg16].compaction: 0.055 s

vs. tip of main:
test_bulk_insert[neon-release-pg16].insert: 24.001 s
test_bulk_insert[neon-release-pg16].pageserver_writes: 5,428 MB
test_bulk_insert[neon-release-pg16].peak_mem: 604 MB
test_bulk_insert[neon-release-pg16].size: 0 MB
test_bulk_insert[neon-release-pg16].data_uploaded: 1,922 MB
test_bulk_insert[neon-release-pg16].num_files_uploaded: 8
test_bulk_insert[neon-release-pg16].wal_written: 1,382 MB
test_bulk_insert[neon-release-pg16].wal_recovery: 23.586 s
test_bulk_insert[neon-release-pg16].compaction: 0.054 s
```
---
 pageserver/benches/bench_ingest.rs            |  19 ++-
 pageserver/src/pgdatadir_mapping.rs           |  70 +++++++---
 pageserver/src/tenant/ephemeral_file.rs       |  35 +++--
 pageserver/src/tenant/storage_layer.rs        |   2 +-
 .../tenant/storage_layer/inmemory_layer.rs    | 131 +++++++++++++-----
 pageserver/src/tenant/timeline.rs             | 101 +++++++-------
 .../walreceiver/walreceiver_connection.rs     |   9 +-
 7 files changed, 247 insertions(+), 120 deletions(-)

diff --git a/pageserver/benches/bench_ingest.rs b/pageserver/benches/bench_ingest.rs
index 0336302de0..bd99f5289d 100644
--- a/pageserver/benches/bench_ingest.rs
+++ b/pageserver/benches/bench_ingest.rs
@@ -10,6 +10,7 @@ use pageserver::{
     page_cache,
     repository::Value,
     task_mgr::TaskKind,
+    tenant::storage_layer::inmemory_layer::SerializedBatch,
     tenant::storage_layer::InMemoryLayer,
     virtual_file,
 };
@@ -67,12 +68,16 @@ async fn ingest(
     let layer =
         InMemoryLayer::create(conf, timeline_id, tenant_shard_id, lsn, entered, &ctx).await?;
 
-    let data = Value::Image(Bytes::from(vec![0u8; put_size])).ser()?;
+    let data = Value::Image(Bytes::from(vec![0u8; put_size]));
+    let data_ser_size = data.serialized_size().unwrap() as usize;
     let ctx = RequestContext::new(
         pageserver::task_mgr::TaskKind::WalReceiverConnectionHandler,
         pageserver::context::DownloadBehavior::Download,
     );
 
+    const BATCH_SIZE: usize = 16;
+    let mut batch = Vec::new();
+
     for i in 0..put_count {
         lsn += put_size as u64;
 
@@ -95,7 +100,17 @@ async fn ingest(
             }
         }
 
-        layer.put_value(key.to_compact(), lsn, &data, &ctx).await?;
+        batch.push((key.to_compact(), lsn, data_ser_size, data.clone()));
+        if batch.len() >= BATCH_SIZE {
+            let this_batch = std::mem::take(&mut batch);
+            let serialized = SerializedBatch::from_values(this_batch);
+            layer.put_batch(serialized, &ctx).await?;
+        }
+    }
+    if !batch.is_empty() {
+        let this_batch = std::mem::take(&mut batch);
+        let serialized = SerializedBatch::from_values(this_batch);
+        layer.put_batch(serialized, &ctx).await?;
     }
     layer.freeze(lsn + 1).await;
 
diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs
index 4f7eb1a00c..d6e0b82e1d 100644
--- a/pageserver/src/pgdatadir_mapping.rs
+++ b/pageserver/src/pgdatadir_mapping.rs
@@ -15,12 +15,11 @@ use crate::{aux_file, repository::*};
 use anyhow::{ensure, Context};
 use bytes::{Buf, Bytes, BytesMut};
 use enum_map::Enum;
-use itertools::Itertools;
 use pageserver_api::key::{
     dbdir_key_range, rel_block_to_key, rel_dir_to_key, rel_key_range, rel_size_to_key,
     relmap_file_key, repl_origin_key, repl_origin_key_range, slru_block_to_key, slru_dir_to_key,
     slru_segment_key_range, slru_segment_size_to_key, twophase_file_key, twophase_key_range,
-    AUX_FILES_KEY, CHECKPOINT_KEY, CONTROLFILE_KEY, DBDIR_KEY, TWOPHASEDIR_KEY,
+    CompactKey, AUX_FILES_KEY, CHECKPOINT_KEY, CONTROLFILE_KEY, DBDIR_KEY, TWOPHASEDIR_KEY,
 };
 use pageserver_api::keyspace::SparseKeySpace;
 use pageserver_api::models::AuxFilePolicy;
@@ -37,7 +36,6 @@ use tokio_util::sync::CancellationToken;
 use tracing::{debug, info, trace, warn};
 use utils::bin_ser::DeserializeError;
 use utils::pausable_failpoint;
-use utils::vec_map::{VecMap, VecMapOrdering};
 use utils::{bin_ser::BeSer, lsn::Lsn};
 
 /// Max delta records appended to the AUX_FILES_KEY (for aux v1). The write path will write a full image once this threshold is reached.
@@ -174,6 +172,7 @@ impl Timeline {
             pending_deletions: Vec::new(),
             pending_nblocks: 0,
             pending_directory_entries: Vec::new(),
+            pending_bytes: 0,
             lsn,
         }
     }
@@ -1022,21 +1021,33 @@ pub struct DatadirModification<'a> {
     // The put-functions add the modifications here, and they are flushed to the
     // underlying key-value store by the 'finish' function.
     pending_lsns: Vec<Lsn>,
-    pending_updates: HashMap<Key, Vec<(Lsn, Value)>>,
+    pending_updates: HashMap<Key, Vec<(Lsn, usize, Value)>>,
     pending_deletions: Vec<(Range<Key>, Lsn)>,
     pending_nblocks: i64,
 
     /// For special "directory" keys that store key-value maps, track the size of the map
     /// if it was updated in this modification.
     pending_directory_entries: Vec<(DirectoryKind, usize)>,
+
+    /// An **approximation** of how large our EphemeralFile write will be when committed.
+    pending_bytes: usize,
 }
 
 impl<'a> DatadirModification<'a> {
+    // When a DatadirModification is committed, we do a monolithic serialization of all its contents.  WAL records can
+    // contain multiple pages, so the pageserver's record-based batch size isn't sufficient to bound this allocation: we
+    // additionally specify a limit on how much payload a DatadirModification may contain before it should be committed.
+    pub(crate) const MAX_PENDING_BYTES: usize = 8 * 1024 * 1024;
+
     /// Get the current lsn
     pub(crate) fn get_lsn(&self) -> Lsn {
         self.lsn
     }
 
+    pub(crate) fn approx_pending_bytes(&self) -> usize {
+        self.pending_bytes
+    }
+
     /// Set the current lsn
     pub(crate) fn set_lsn(&mut self, lsn: Lsn) -> anyhow::Result<()> {
         ensure!(
@@ -1769,21 +1780,25 @@ impl<'a> DatadirModification<'a> {
         // Flush relation and  SLRU data blocks, keep metadata.
         let mut retained_pending_updates = HashMap::<_, Vec<_>>::new();
         for (key, values) in self.pending_updates.drain() {
-            for (lsn, value) in values {
+            let mut write_batch = Vec::new();
+            for (lsn, value_ser_size, value) in values {
                 if key.is_rel_block_key() || key.is_slru_block_key() {
                     // This bails out on first error without modifying pending_updates.
                     // That's Ok, cf this function's doc comment.
-                    writer.put(key, lsn, &value, ctx).await?;
+                    write_batch.push((key.to_compact(), lsn, value_ser_size, value));
                 } else {
-                    retained_pending_updates
-                        .entry(key)
-                        .or_default()
-                        .push((lsn, value));
+                    retained_pending_updates.entry(key).or_default().push((
+                        lsn,
+                        value_ser_size,
+                        value,
+                    ));
                 }
             }
+            writer.put_batch(write_batch, ctx).await?;
         }
 
         self.pending_updates = retained_pending_updates;
+        self.pending_bytes = 0;
 
         if pending_nblocks != 0 {
             writer.update_current_logical_size(pending_nblocks * i64::from(BLCKSZ));
@@ -1809,17 +1824,20 @@ impl<'a> DatadirModification<'a> {
         self.pending_nblocks = 0;
 
         if !self.pending_updates.is_empty() {
-            // The put_batch call below expects expects the inputs to be sorted by Lsn,
-            // so we do that first.
-            let lsn_ordered_batch: VecMap<Lsn, (Key, Value)> = VecMap::from_iter(
-                self.pending_updates
-                    .drain()
-                    .map(|(key, vals)| vals.into_iter().map(move |(lsn, val)| (lsn, (key, val))))
-                    .kmerge_by(|lhs, rhs| lhs.0 < rhs.0),
-                VecMapOrdering::GreaterOrEqual,
-            );
+            // Ordering: the items in this batch do not need to be in any global order, but values for
+            // a particular Key must be in Lsn order relative to one another.  InMemoryLayer relies on
+            // this to do efficient updates to its index.
+            let batch: Vec<(CompactKey, Lsn, usize, Value)> = self
+                .pending_updates
+                .drain()
+                .flat_map(|(key, values)| {
+                    values.into_iter().map(move |(lsn, val_ser_size, value)| {
+                        (key.to_compact(), lsn, val_ser_size, value)
+                    })
+                })
+                .collect::<Vec<_>>();
 
-            writer.put_batch(lsn_ordered_batch, ctx).await?;
+            writer.put_batch(batch, ctx).await?;
         }
 
         if !self.pending_deletions.is_empty() {
@@ -1844,6 +1862,8 @@ impl<'a> DatadirModification<'a> {
             writer.update_directory_entries_count(kind, count as u64);
         }
 
+        self.pending_bytes = 0;
+
         Ok(())
     }
 
@@ -1860,7 +1880,7 @@ impl<'a> DatadirModification<'a> {
         // Note: we don't check pending_deletions. It is an error to request a
         // value that has been removed, deletion only avoids leaking storage.
         if let Some(values) = self.pending_updates.get(&key) {
-            if let Some((_, value)) = values.last() {
+            if let Some((_, _, value)) = values.last() {
                 return if let Value::Image(img) = value {
                     Ok(img.clone())
                 } else {
@@ -1888,13 +1908,17 @@ impl<'a> DatadirModification<'a> {
     fn put(&mut self, key: Key, val: Value) {
         let values = self.pending_updates.entry(key).or_default();
         // Replace the previous value if it exists at the same lsn
-        if let Some((last_lsn, last_value)) = values.last_mut() {
+        if let Some((last_lsn, last_value_ser_size, last_value)) = values.last_mut() {
             if *last_lsn == self.lsn {
+                *last_value_ser_size = val.serialized_size().unwrap() as usize;
                 *last_value = val;
                 return;
             }
         }
-        values.push((self.lsn, val));
+
+        let val_serialized_size = val.serialized_size().unwrap() as usize;
+        self.pending_bytes += val_serialized_size;
+        values.push((self.lsn, val_serialized_size, val));
     }
 
     fn delete(&mut self, key_range: Range<Key>) {
diff --git a/pageserver/src/tenant/ephemeral_file.rs b/pageserver/src/tenant/ephemeral_file.rs
index 3eb8384d05..44f0fc7ab1 100644
--- a/pageserver/src/tenant/ephemeral_file.rs
+++ b/pageserver/src/tenant/ephemeral_file.rs
@@ -79,6 +79,8 @@ impl EphemeralFile {
         self.rw.read_blk(blknum, ctx).await
     }
 
+    #[cfg(test)]
+    // This is a test helper: outside of tests, we are always written to via a pre-serialized batch.
     pub(crate) async fn write_blob(
         &mut self,
         srcbuf: &[u8],
@@ -86,17 +88,30 @@ impl EphemeralFile {
     ) -> Result<u64, io::Error> {
         let pos = self.rw.bytes_written();
 
-        // Write the length field
-        if srcbuf.len() < 0x80 {
-            // short one-byte length header
-            let len_buf = [srcbuf.len() as u8];
+        let mut len_bytes = std::io::Cursor::new(Vec::new());
+        crate::tenant::storage_layer::inmemory_layer::SerializedBatch::write_blob_length(
+            srcbuf.len(),
+            &mut len_bytes,
+        );
+        let len_bytes = len_bytes.into_inner();
 
-            self.rw.write_all_borrowed(&len_buf, ctx).await?;
-        } else {
-            let mut len_buf = u32::to_be_bytes(srcbuf.len() as u32);
-            len_buf[0] |= 0x80;
-            self.rw.write_all_borrowed(&len_buf, ctx).await?;
-        }
+        // Write the length field
+        self.rw.write_all_borrowed(&len_bytes, ctx).await?;
+
+        // Write the payload
+        self.rw.write_all_borrowed(srcbuf, ctx).await?;
+
+        Ok(pos)
+    }
+
+    /// Returns the offset at which the first byte of the input was written, for use
+    /// in constructing indices over the written value.
+    pub(crate) async fn write_raw(
+        &mut self,
+        srcbuf: &[u8],
+        ctx: &RequestContext,
+    ) -> Result<u64, io::Error> {
+        let pos = self.rw.bytes_written();
 
         // Write the payload
         self.rw.write_all_borrowed(srcbuf, ctx).await?;
diff --git a/pageserver/src/tenant/storage_layer.rs b/pageserver/src/tenant/storage_layer.rs
index 04f89db401..133b34b8b5 100644
--- a/pageserver/src/tenant/storage_layer.rs
+++ b/pageserver/src/tenant/storage_layer.rs
@@ -2,7 +2,7 @@
 
 pub mod delta_layer;
 pub mod image_layer;
-pub(crate) mod inmemory_layer;
+pub mod inmemory_layer;
 pub(crate) mod layer;
 mod layer_desc;
 mod layer_name;
diff --git a/pageserver/src/tenant/storage_layer/inmemory_layer.rs b/pageserver/src/tenant/storage_layer/inmemory_layer.rs
index 130d1002a0..a71b4dd83b 100644
--- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs
+++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs
@@ -33,7 +33,7 @@ use std::fmt::Write;
 use std::ops::Range;
 use std::sync::atomic::Ordering as AtomicOrdering;
 use std::sync::atomic::{AtomicU64, AtomicUsize};
-use tokio::sync::{RwLock, RwLockWriteGuard};
+use tokio::sync::RwLock;
 
 use super::{
     DeltaLayerWriter, PersistentLayerDesc, ValueReconstructSituation, ValuesReconstructState,
@@ -320,6 +320,82 @@ impl InMemoryLayer {
     }
 }
 
+/// Offset of a particular Value within a serialized batch.
+struct SerializedBatchOffset {
+    key: CompactKey,
+    lsn: Lsn,
+    /// offset in bytes from the start of the batch's buffer to the Value's serialized size header.
+    offset: u64,
+}
+
+pub struct SerializedBatch {
+    /// Blobs serialized in EphemeralFile's native format, ready for passing to [`EphemeralFile::write_raw`].
+    pub(crate) raw: Vec<u8>,
+
+    /// Index of values in [`Self::raw`], using offsets relative to the start of the buffer.
+    offsets: Vec<SerializedBatchOffset>,
+
+    /// The highest LSN of any value in the batch
+    pub(crate) max_lsn: Lsn,
+}
+
+impl SerializedBatch {
+    /// Write a blob length in the internal format of the EphemeralFile
+    pub(crate) fn write_blob_length(len: usize, cursor: &mut std::io::Cursor<Vec<u8>>) {
+        use std::io::Write;
+
+        if len < 0x80 {
+            // short one-byte length header
+            let len_buf = [len as u8];
+
+            cursor
+                .write_all(&len_buf)
+                .expect("Writing to Vec is infallible");
+        } else {
+            let mut len_buf = u32::to_be_bytes(len as u32);
+            len_buf[0] |= 0x80;
+            cursor
+                .write_all(&len_buf)
+                .expect("Writing to Vec is infallible");
+        }
+    }
+
+    pub fn from_values(batch: Vec<(CompactKey, Lsn, usize, Value)>) -> Self {
+        // Pre-allocate a big flat buffer to write into. This should be large but not huge: it is soft-limited in practice by
+        // [`crate::pgdatadir_mapping::DatadirModification::MAX_PENDING_BYTES`]
+        let buffer_size = batch.iter().map(|i| i.2).sum::<usize>() + 4 * batch.len();
+        let mut cursor = std::io::Cursor::new(Vec::<u8>::with_capacity(buffer_size));
+
+        let mut offsets: Vec<SerializedBatchOffset> = Vec::with_capacity(batch.len());
+        let mut max_lsn: Lsn = Lsn(0);
+        for (key, lsn, val_ser_size, val) in batch {
+            let relative_off = cursor.position();
+
+            Self::write_blob_length(val_ser_size, &mut cursor);
+            val.ser_into(&mut cursor)
+                .expect("Writing into in-memory buffer is infallible");
+
+            offsets.push(SerializedBatchOffset {
+                key,
+                lsn,
+                offset: relative_off,
+            });
+            max_lsn = std::cmp::max(max_lsn, lsn);
+        }
+
+        let buffer = cursor.into_inner();
+
+        // Assert that we didn't do any extra allocations while building buffer.
+        debug_assert!(buffer.len() <= buffer_size);
+
+        Self {
+            raw: buffer,
+            offsets,
+            max_lsn,
+        }
+    }
+}
+
 fn inmem_layer_display(mut f: impl Write, start_lsn: Lsn, end_lsn: Lsn) -> std::fmt::Result {
     write!(f, "inmem-{:016X}-{:016X}", start_lsn.0, end_lsn.0)
 }
@@ -380,37 +456,20 @@ impl InMemoryLayer {
         })
     }
 
-    // Write operations
-
-    /// Common subroutine of the public put_wal_record() and put_page_image() functions.
-    /// Adds the page version to the in-memory tree
-    pub async fn put_value(
+    // Write path.
+    pub async fn put_batch(
         &self,
-        key: CompactKey,
-        lsn: Lsn,
-        buf: &[u8],
+        serialized_batch: SerializedBatch,
         ctx: &RequestContext,
     ) -> Result<()> {
         let mut inner = self.inner.write().await;
         self.assert_writable();
-        self.put_value_locked(&mut inner, key, lsn, buf, ctx).await
-    }
 
-    async fn put_value_locked(
-        &self,
-        locked_inner: &mut RwLockWriteGuard<'_, InMemoryLayerInner>,
-        key: CompactKey,
-        lsn: Lsn,
-        buf: &[u8],
-        ctx: &RequestContext,
-    ) -> Result<()> {
-        trace!("put_value key {} at {}/{}", key, self.timeline_id, lsn);
-
-        let off = {
-            locked_inner
+        let base_off = {
+            inner
                 .file
-                .write_blob(
-                    buf,
+                .write_raw(
+                    &serialized_batch.raw,
                     &RequestContextBuilder::extend(ctx)
                         .page_content_kind(PageContentKind::InMemoryLayer)
                         .build(),
@@ -418,15 +477,23 @@ impl InMemoryLayer {
                 .await?
         };
 
-        let vec_map = locked_inner.index.entry(key).or_default();
-        let old = vec_map.append_or_update_last(lsn, off).unwrap().0;
-        if old.is_some() {
-            // We already had an entry for this LSN. That's odd..
-            warn!("Key {} at {} already exists", key, lsn);
+        for SerializedBatchOffset {
+            key,
+            lsn,
+            offset: relative_off,
+        } in serialized_batch.offsets
+        {
+            let off = base_off + relative_off;
+            let vec_map = inner.index.entry(key).or_default();
+            let old = vec_map.append_or_update_last(lsn, off).unwrap().0;
+            if old.is_some() {
+                // We already had an entry for this LSN. That's odd..
+                warn!("Key {} at {} already exists", key, lsn);
+            }
         }
 
-        let size = locked_inner.file.len();
-        locked_inner.resource_units.maybe_publish_size(size);
+        let size = inner.file.len();
+        inner.resource_units.maybe_publish_size(size);
 
         Ok(())
     }
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index 80e3843021..e90f65942f 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -22,8 +22,8 @@ use handle::ShardTimelineId;
 use once_cell::sync::Lazy;
 use pageserver_api::{
     key::{
-        KEY_SIZE, METADATA_KEY_BEGIN_PREFIX, METADATA_KEY_END_PREFIX, NON_INHERITED_RANGE,
-        NON_INHERITED_SPARSE_RANGE,
+        CompactKey, KEY_SIZE, METADATA_KEY_BEGIN_PREFIX, METADATA_KEY_END_PREFIX,
+        NON_INHERITED_RANGE, NON_INHERITED_SPARSE_RANGE,
     },
     keyspace::{KeySpaceAccum, KeySpaceRandomAccum, SparseKeyPartitioning},
     models::{
@@ -44,10 +44,8 @@ use tokio::{
 use tokio_util::sync::CancellationToken;
 use tracing::*;
 use utils::{
-    bin_ser::BeSer,
     fs_ext, pausable_failpoint,
     sync::gate::{Gate, GateGuard},
-    vec_map::VecMap,
 };
 
 use std::pin::pin;
@@ -137,7 +135,10 @@ use self::layer_manager::LayerManager;
 use self::logical_size::LogicalSize;
 use self::walreceiver::{WalReceiver, WalReceiverConf};
 
-use super::{config::TenantConf, storage_layer::LayerVisibilityHint, upload_queue::NotInitialized};
+use super::{
+    config::TenantConf, storage_layer::inmemory_layer, storage_layer::LayerVisibilityHint,
+    upload_queue::NotInitialized,
+};
 use super::{debug_assert_current_span_has_tenant_and_timeline_id, AttachedTenantConf};
 use super::{remote_timeline_client::index::IndexPart, storage_layer::LayerFringe};
 use super::{
@@ -5574,44 +5575,6 @@ enum OpenLayerAction {
 }
 
 impl<'a> TimelineWriter<'a> {
-    /// Put a new page version that can be constructed from a WAL record
-    ///
-    /// This will implicitly extend the relation, if the page is beyond the
-    /// current end-of-file.
-    pub(crate) async fn put(
-        &mut self,
-        key: Key,
-        lsn: Lsn,
-        value: &Value,
-        ctx: &RequestContext,
-    ) -> anyhow::Result<()> {
-        // Avoid doing allocations for "small" values.
-        // In the regression test suite, the limit of 256 avoided allocations in 95% of cases:
-        // https://github.com/neondatabase/neon/pull/5056#discussion_r1301975061
-        let mut buf = smallvec::SmallVec::<[u8; 256]>::new();
-        value.ser_into(&mut buf)?;
-        let buf_size: u64 = buf.len().try_into().expect("oversized value buf");
-
-        let action = self.get_open_layer_action(lsn, buf_size);
-        let layer = self.handle_open_layer_action(lsn, action, ctx).await?;
-        let res = layer.put_value(key.to_compact(), lsn, &buf, ctx).await;
-
-        if res.is_ok() {
-            // Update the current size only when the entire write was ok.
-            // In case of failures, we may have had partial writes which
-            // render the size tracking out of sync. That's ok because
-            // the checkpoint distance should be significantly smaller
-            // than the S3 single shot upload limit of 5GiB.
-            let state = self.write_guard.as_mut().unwrap();
-
-            state.current_size += buf_size;
-            state.prev_lsn = Some(lsn);
-            state.max_lsn = std::cmp::max(state.max_lsn, Some(lsn));
-        }
-
-        res
-    }
-
     async fn handle_open_layer_action(
         &mut self,
         at: Lsn,
@@ -5717,18 +5680,58 @@ impl<'a> TimelineWriter<'a> {
     }
 
     /// Put a batch of keys at the specified Lsns.
-    ///
-    /// The batch is sorted by Lsn (enforced by usage of [`utils::vec_map::VecMap`].
     pub(crate) async fn put_batch(
         &mut self,
-        batch: VecMap<Lsn, (Key, Value)>,
+        batch: Vec<(CompactKey, Lsn, usize, Value)>,
         ctx: &RequestContext,
     ) -> anyhow::Result<()> {
-        for (lsn, (key, val)) in batch {
-            self.put(key, lsn, &val, ctx).await?
+        if batch.is_empty() {
+            return Ok(());
         }
 
-        Ok(())
+        let serialized_batch = inmemory_layer::SerializedBatch::from_values(batch);
+        let batch_max_lsn = serialized_batch.max_lsn;
+        let buf_size: u64 = serialized_batch.raw.len() as u64;
+
+        let action = self.get_open_layer_action(batch_max_lsn, buf_size);
+        let layer = self
+            .handle_open_layer_action(batch_max_lsn, action, ctx)
+            .await?;
+
+        let res = layer.put_batch(serialized_batch, ctx).await;
+
+        if res.is_ok() {
+            // Update the current size only when the entire write was ok.
+            // In case of failures, we may have had partial writes which
+            // render the size tracking out of sync. That's ok because
+            // the checkpoint distance should be significantly smaller
+            // than the S3 single shot upload limit of 5GiB.
+            let state = self.write_guard.as_mut().unwrap();
+
+            state.current_size += buf_size;
+            state.prev_lsn = Some(batch_max_lsn);
+            state.max_lsn = std::cmp::max(state.max_lsn, Some(batch_max_lsn));
+        }
+
+        res
+    }
+
+    #[cfg(test)]
+    /// Test helper, for tests that would like to poke individual values without composing a batch
+    pub(crate) async fn put(
+        &mut self,
+        key: Key,
+        lsn: Lsn,
+        value: &Value,
+        ctx: &RequestContext,
+    ) -> anyhow::Result<()> {
+        use utils::bin_ser::BeSer;
+        let val_ser_size = value.serialized_size().unwrap() as usize;
+        self.put_batch(
+            vec![(key.to_compact(), lsn, val_ser_size, value.clone())],
+            ctx,
+        )
+        .await
     }
 
     pub(crate) async fn delete_batch(
diff --git a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs
index b5c577af72..0114473eda 100644
--- a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs
+++ b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs
@@ -27,8 +27,8 @@ use super::TaskStateUpdate;
 use crate::{
     context::RequestContext,
     metrics::{LIVE_CONNECTIONS, WALRECEIVER_STARTED_CONNECTIONS, WAL_INGEST},
-    task_mgr::TaskKind,
-    task_mgr::WALRECEIVER_RUNTIME,
+    pgdatadir_mapping::DatadirModification,
+    task_mgr::{TaskKind, WALRECEIVER_RUNTIME},
     tenant::{debug_assert_current_span_has_tenant_and_timeline_id, Timeline, WalReceiverInfo},
     walingest::WalIngest,
     walrecord::DecodedWALRecord,
@@ -345,7 +345,10 @@ pub(super) async fn handle_walreceiver_connection(
                         // Commit every ingest_batch_size records. Even if we filtered out
                         // all records, we still need to call commit to advance the LSN.
                         uncommitted_records += 1;
-                        if uncommitted_records >= ingest_batch_size {
+                        if uncommitted_records >= ingest_batch_size
+                            || modification.approx_pending_bytes()
+                                > DatadirModification::MAX_PENDING_BYTES
+                        {
                             WAL_INGEST
                                 .records_committed
                                 .inc_by(uncommitted_records - filtered_records);

From d645645fab662df28ffb41dde18ca1963c237532 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Arpad=20M=C3=BCller?= <arpad-m@users.noreply.github.com>
Date: Thu, 22 Aug 2024 12:45:29 +0200
Subject: [PATCH 002/159] Sleep in test_scrubber_physical_gc (#8798)

This copies a piece of code from `test_scrubber_physical_gc_ancestors`
to fix a source of flakiness: later on we rely on stuff being older than
a second, but the test can run faster under optimal conditions (as
happened to me locally, but also obvservable in
[this](https://neon-github-public-dev.s3.amazonaws.com/reports/main/10470762360/index.html#testresult/f713b02657db4b4c/retries)
allure report):

```
test_runner/regress/test_storage_scrubber.py:169: in test_scrubber_physical_gc
    assert gc_summary["remote_storage_errors"] == 0
E   assert 1 == 0
```
---
 test_runner/regress/test_storage_scrubber.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/test_runner/regress/test_storage_scrubber.py b/test_runner/regress/test_storage_scrubber.py
index 2844d1b1d2..292a9a1010 100644
--- a/test_runner/regress/test_storage_scrubber.py
+++ b/test_runner/regress/test_storage_scrubber.py
@@ -152,6 +152,9 @@ def test_scrubber_physical_gc(neon_env_builder: NeonEnvBuilder, shard_count: Opt
         # This write includes remote upload, will generate an index in this generation
         workload.write_rows(1)
 
+    # We will use a min_age_secs=1 threshold for deletion, let it pass
+    time.sleep(2)
+
     # With a high min_age, the scrubber should decline to delete anything
     gc_summary = env.storage_scrubber.pageserver_physical_gc(min_age_secs=3600)
     assert gc_summary["remote_storage_errors"] == 0

From 0e6c0d47a5d29e151d1a8013e627998df8772f6f Mon Sep 17 00:00:00 2001
From: Alexey Kondratov <kondratov.aleksey@gmail.com>
Date: Thu, 22 Aug 2024 12:52:36 +0200
Subject: [PATCH 003/159] Revert "Use sycnhronous commit for logical
 replicaiton worker (#8645)" (#8792)

This reverts commit cbe8c77997aea576a96a7f8d31147cb7a11d6a6b.

This change was originally made to test a hypothesis, but after that,
the proper fix #8669 was merged, so now it's not needed. Moreover, the
test is still flaky, so probably this bug was not a reason of the
flakiness.

Related to #8097
---
 test_runner/regress/test_subscriber_restart.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/test_runner/regress/test_subscriber_restart.py b/test_runner/regress/test_subscriber_restart.py
index 4581008022..91caad7220 100644
--- a/test_runner/regress/test_subscriber_restart.py
+++ b/test_runner/regress/test_subscriber_restart.py
@@ -37,9 +37,7 @@ def test_subscriber_restart(neon_simple_env: NeonEnv):
             scur.execute("CREATE TABLE t (pk integer primary key, sk integer)")
             # scur.execute("CREATE INDEX on t(sk)") # slowdown applying WAL at replica
             pub_conn = f"host=localhost port={pub.pg_port} dbname=postgres user=cloud_admin"
-            # synchronous_commit=on to test a hypothesis for why this test has been flaky.
-            # XXX: Add link to the issue
-            query = f"CREATE SUBSCRIPTION sub CONNECTION '{pub_conn}' PUBLICATION pub with (synchronous_commit=on)"
+            query = f"CREATE SUBSCRIPTION sub CONNECTION '{pub_conn}' PUBLICATION pub"
             scur.execute(query)
             time.sleep(2)  # let initial table sync complete
 

From 1a9d559be8a77e7d8375c10238e4e4c0e76a40f7 Mon Sep 17 00:00:00 2001
From: Folke Behrens <folke@neon.tech>
Date: Thu, 22 Aug 2024 13:29:05 +0200
Subject: [PATCH 004/159] proxy: Enable stricter/pedantic clippy checks (#8775)

Create a list of currently allowed exceptions that should be reduced
over time.
---
 proxy/src/lib.rs | 90 +++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 89 insertions(+), 1 deletion(-)

diff --git a/proxy/src/lib.rs b/proxy/src/lib.rs
index ea92eaaa55..b7d497ebcc 100644
--- a/proxy/src/lib.rs
+++ b/proxy/src/lib.rs
@@ -1,4 +1,92 @@
-#![deny(clippy::undocumented_unsafe_blocks)]
+// rustc lints/lint groups
+// https://doc.rust-lang.org/rustc/lints/groups.html
+#![deny(
+    deprecated,
+    future_incompatible,
+    // TODO: consider let_underscore
+    nonstandard_style,
+    rust_2024_compatibility
+)]
+#![warn(clippy::all, clippy::pedantic, clippy::cargo)]
+// List of denied lints from the clippy::restriction group.
+// https://rust-lang.github.io/rust-clippy/master/index.html#?groups=restriction
+#![warn(
+    clippy::undocumented_unsafe_blocks,
+    clippy::dbg_macro,
+    clippy::empty_enum_variants_with_brackets,
+    clippy::exit,
+    clippy::float_cmp_const,
+    clippy::lossy_float_literal,
+    clippy::macro_use_imports,
+    clippy::manual_ok_or,
+    // TODO: consider clippy::map_err_ignore
+    // TODO: consider clippy::mem_forget
+    clippy::rc_mutex,
+    clippy::rest_pat_in_fully_bound_structs,
+    clippy::string_add,
+    clippy::string_to_string,
+    clippy::todo,
+    // TODO: consider clippy::unimplemented
+    // TODO: consider clippy::unwrap_used
+)]
+// List of permanently allowed lints.
+#![allow(
+    // It's ok to cast u8 to bool, etc.
+    clippy::cast_lossless,
+)]
+// List of temporarily allowed lints.
+// TODO: Switch to except() once stable with 1.81.
+// TODO: fix code and reduce list or move to permanent list above.
+#![allow(
+    clippy::cargo_common_metadata,
+    clippy::cast_possible_truncation,
+    clippy::cast_possible_wrap,
+    clippy::cast_precision_loss,
+    clippy::cast_sign_loss,
+    clippy::default_trait_access,
+    clippy::doc_markdown,
+    clippy::explicit_iter_loop,
+    clippy::float_cmp,
+    clippy::if_not_else,
+    clippy::ignored_unit_patterns,
+    clippy::implicit_hasher,
+    clippy::inconsistent_struct_constructor,
+    clippy::inline_always,
+    clippy::items_after_statements,
+    clippy::manual_assert,
+    clippy::manual_let_else,
+    clippy::manual_string_new,
+    clippy::match_bool,
+    clippy::match_same_arms,
+    clippy::match_wild_err_arm,
+    clippy::missing_errors_doc,
+    clippy::missing_panics_doc,
+    clippy::module_name_repetitions,
+    clippy::multiple_crate_versions,
+    clippy::must_use_candidate,
+    clippy::needless_for_each,
+    clippy::needless_pass_by_value,
+    clippy::needless_raw_string_hashes,
+    clippy::option_as_ref_cloned,
+    clippy::redundant_closure_for_method_calls,
+    clippy::redundant_else,
+    clippy::return_self_not_must_use,
+    clippy::similar_names,
+    clippy::single_char_pattern,
+    clippy::single_match_else,
+    clippy::struct_excessive_bools,
+    clippy::struct_field_names,
+    clippy::too_many_lines,
+    clippy::uninlined_format_args,
+    clippy::unnested_or_patterns,
+    clippy::unreadable_literal,
+    clippy::unused_async,
+    clippy::unused_self,
+    clippy::used_underscore_binding,
+    clippy::wildcard_imports
+)]
+// List of temporarily allowed lints to unblock beta/nightly.
+#![allow(unknown_lints, clippy::manual_inspect)]
 
 use std::convert::Infallible;
 

From b1c457898b7af111cd59d3a8c2d3bde5bae5085e Mon Sep 17 00:00:00 2001
From: Joonas Koivunen <joonas@neon.tech>
Date: Thu, 22 Aug 2024 18:38:03 +0300
Subject: [PATCH 005/159] test_compatibility: flush in the end (#8804)

`test_forward_compatibility` is still often failing at graceful
shutdown. Fix this by explicit flush before shutdown.

Example:
https://neon-github-public-dev.s3.amazonaws.com/reports/main/10506613738/index.html#testresult/5e7111907f7ecfb2/

Cc: #8655 and #8708
Previous attempt: #8787
---
 test_runner/regress/test_compatibility.py | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/test_runner/regress/test_compatibility.py b/test_runner/regress/test_compatibility.py
index de27191945..c361efe90a 100644
--- a/test_runner/regress/test_compatibility.py
+++ b/test_runner/regress/test_compatibility.py
@@ -11,7 +11,12 @@ import pytest
 import toml
 from fixtures.common_types import TenantId, TimelineId
 from fixtures.log_helper import log
-from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder, PgBin, flush_ep_to_pageserver
+from fixtures.neon_fixtures import (
+    NeonEnv,
+    NeonEnvBuilder,
+    PgBin,
+    flush_ep_to_pageserver,
+)
 from fixtures.pageserver.http import PageserverApiException
 from fixtures.pageserver.utils import (
     timeline_delete_wait_completed,
@@ -296,7 +301,7 @@ def check_neon_works(env: NeonEnv, test_output_dir: Path, sql_dump_path: Path, r
     pg_version = env.pg_version
 
     # Stop endpoint while we recreate timeline
-    ep.stop()
+    flush_ep_to_pageserver(env, ep, tenant_id, timeline_id)
 
     try:
         pageserver_http.timeline_preserve_initdb_archive(tenant_id, timeline_id)
@@ -344,6 +349,11 @@ def check_neon_works(env: NeonEnv, test_output_dir: Path, sql_dump_path: Path, r
     assert not dump_from_wal_differs, "dump from WAL differs"
     assert not initial_dump_differs, "initial dump differs"
 
+    flush_ep_to_pageserver(env, ep, tenant_id, timeline_id)
+    pageserver_http.timeline_checkpoint(
+        tenant_id, timeline_id, compact=False, wait_until_uploaded=True
+    )
+
 
 def dump_differs(
     first: Path, second: Path, output: Path, allowed_diffs: Optional[List[str]] = None

From 7a485b599bd27ba135e3327bfb5710c495c99df6 Mon Sep 17 00:00:00 2001
From: Konstantin Knizhnik <knizhnik@garret.ru>
Date: Thu, 22 Aug 2024 23:53:37 +0300
Subject: [PATCH 006/159] Fix race condition in LRU list update in
 get_cached_relsize (#8807)

## Problem

See https://neondb.slack.com/archives/C07J14D8GTX/p1724347552023709
Manipulations with LRU list in relation size cache are performed under
shared lock

## Summary of changes

Take exclusive lock

## Checklist before requesting a review

- [ ] I have performed a self-review of my code.
- [ ] If it is a core feature, I have added thorough tests.
- [ ] Do we need to implement analytics? if so did you add the relevant
metrics to the dashboard?
- [ ] If this PR requires public announcement, mark it with
/release-notes label and add several sentences in this section.

## Checklist before merging

- [ ] Do not forget to reformat commit message to not include the above
checklist

Co-authored-by: Konstantin Knizhnik <knizhnik@neon.tech>
---
 pgxn/neon/relsize_cache.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/pgxn/neon/relsize_cache.c b/pgxn/neon/relsize_cache.c
index cc7ac2c394..2a4c2dc799 100644
--- a/pgxn/neon/relsize_cache.c
+++ b/pgxn/neon/relsize_cache.c
@@ -110,7 +110,8 @@ get_cached_relsize(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber *size)
 
 		tag.rinfo = rinfo;
 		tag.forknum = forknum;
-		LWLockAcquire(relsize_lock, LW_SHARED);
+		/* We need exclusive lock here because of LRU list manipulation */
+		LWLockAcquire(relsize_lock, LW_EXCLUSIVE);
 		entry = hash_search(relsize_hash, &tag, HASH_FIND, NULL);
 		if (entry != NULL)
 		{

From 6eb638f4b390270fa004cdea45e00ca63c21f773 Mon Sep 17 00:00:00 2001
From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com>
Date: Thu, 22 Aug 2024 17:31:38 -0400
Subject: [PATCH 007/159] feat(pageserver): warn on aux v1 tenants + default to
 v2 (#8625)

part of https://github.com/neondatabase/neon/issues/8623

We want to discover potential aux v1 customers that we might have missed
from the migrations.

## Summary of changes

Log warnings on basebackup, load timeline, and the first put_file.

---------

Signed-off-by: Alex Chi Z <chi@neon.tech>
---
 libs/pageserver_api/src/models.rs             |  2 +-
 pageserver/src/pgdatadir_mapping.rs           | 15 +++++++++++--
 pageserver/src/tenant.rs                      | 14 ++++++------
 pageserver/src/tenant/timeline.rs             |  5 +++++
 .../regress/test_logical_replication.py       | 22 +++++--------------
 5 files changed, 32 insertions(+), 26 deletions(-)

diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs
index ab4adfbebe..d55c06b685 100644
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -348,7 +348,7 @@ impl AuxFilePolicy {
 
     /// If a tenant writes aux files without setting `switch_aux_policy`, this value will be used.
     pub fn default_tenant_config() -> Self {
-        Self::V1
+        Self::V2
     }
 }
 
diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs
index d6e0b82e1d..b7110d69b6 100644
--- a/pageserver/src/pgdatadir_mapping.rs
+++ b/pageserver/src/pgdatadir_mapping.rs
@@ -726,7 +726,17 @@ impl Timeline {
     ) -> Result<HashMap<String, Bytes>, PageReconstructError> {
         let current_policy = self.last_aux_file_policy.load();
         match current_policy {
-            Some(AuxFilePolicy::V1) | None => self.list_aux_files_v1(lsn, ctx).await,
+            Some(AuxFilePolicy::V1) => {
+                warn!("this timeline is using deprecated aux file policy V1 (policy=V1)");
+                self.list_aux_files_v1(lsn, ctx).await
+            }
+            None => {
+                let res = self.list_aux_files_v1(lsn, ctx).await?;
+                if !res.is_empty() {
+                    warn!("this timeline is using deprecated aux file policy V1 (policy=None)");
+                }
+                Ok(res)
+            }
             Some(AuxFilePolicy::V2) => self.list_aux_files_v2(lsn, ctx).await,
             Some(AuxFilePolicy::CrossValidation) => {
                 let v1_result = self.list_aux_files_v1(lsn, ctx).await;
@@ -1587,6 +1597,7 @@ impl<'a> DatadirModification<'a> {
                 if aux_files_key_v1.is_empty() {
                     None
                 } else {
+                    warn!("this timeline is using deprecated aux file policy V1");
                     self.tline.do_switch_aux_policy(AuxFilePolicy::V1)?;
                     Some(AuxFilePolicy::V1)
                 }
@@ -2048,7 +2059,7 @@ mod tests {
 
         let (tenant, ctx) = harness.load().await;
         let tline = tenant
-            .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)
+            .create_empty_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
             .await?;
         let tline = tline.raw_timeline().unwrap();
 
diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index 65a7504b74..2e19a46ac8 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -5932,10 +5932,10 @@ mod tests {
             .await
             .unwrap();
 
-        // the default aux file policy to switch is v1 if not set by the admins
+        // the default aux file policy to switch is v2 if not set by the admins
         assert_eq!(
             harness.tenant_conf.switch_aux_file_policy,
-            AuxFilePolicy::V1
+            AuxFilePolicy::default_tenant_config()
         );
         let (tenant, ctx) = harness.load().await;
 
@@ -5979,8 +5979,8 @@ mod tests {
         );
         assert_eq!(
             tline.last_aux_file_policy.load(),
-            Some(AuxFilePolicy::V1),
-            "aux file is written with switch_aux_file_policy unset (which is v1), so we should keep v1"
+            Some(AuxFilePolicy::V2),
+            "aux file is written with switch_aux_file_policy unset (which is v2), so we should use v2 there"
         );
 
         // we can read everything from the storage
@@ -6002,8 +6002,8 @@ mod tests {
 
         assert_eq!(
             tline.last_aux_file_policy.load(),
-            Some(AuxFilePolicy::V1),
-            "keep v1 storage format when new files are written"
+            Some(AuxFilePolicy::V2),
+            "keep v2 storage format when new files are written"
         );
 
         let files = tline.list_aux_files(lsn, &ctx).await.unwrap();
@@ -6019,7 +6019,7 @@ mod tests {
 
         // child copies the last flag even if that is not on remote storage yet
         assert_eq!(child.get_switch_aux_file_policy(), AuxFilePolicy::V2);
-        assert_eq!(child.last_aux_file_policy.load(), Some(AuxFilePolicy::V1));
+        assert_eq!(child.last_aux_file_policy.load(), Some(AuxFilePolicy::V2));
 
         let files = child.list_aux_files(lsn, &ctx).await.unwrap();
         assert_eq!(files.get("pg_logical/mappings/test1"), None);
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index e90f65942f..dc9cddea43 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -2234,6 +2234,11 @@ impl Timeline {
 
                 handles: Default::default(),
             };
+
+            if aux_file_policy == Some(AuxFilePolicy::V1) {
+                warn!("this timeline is using deprecated aux file policy V1");
+            }
+
             result.repartition_threshold =
                 result.get_checkpoint_distance() / REPARTITION_FREQ_IN_CHECKPOINT_DISTANCE;
 
diff --git a/test_runner/regress/test_logical_replication.py b/test_runner/regress/test_logical_replication.py
index 0d18aa43b7..f83a833dda 100644
--- a/test_runner/regress/test_logical_replication.py
+++ b/test_runner/regress/test_logical_replication.py
@@ -22,7 +22,7 @@ def random_string(n: int):
 
 
 @pytest.mark.parametrize(
-    "pageserver_aux_file_policy", [AuxFileStore.V1, AuxFileStore.V2, AuxFileStore.CrossValidation]
+    "pageserver_aux_file_policy", [AuxFileStore.V2, AuxFileStore.CrossValidation]
 )
 def test_aux_file_v2_flag(neon_simple_env: NeonEnv, pageserver_aux_file_policy: AuxFileStore):
     env = neon_simple_env
@@ -31,9 +31,7 @@ def test_aux_file_v2_flag(neon_simple_env: NeonEnv, pageserver_aux_file_policy:
         assert pageserver_aux_file_policy == tenant_config["switch_aux_file_policy"]
 
 
-@pytest.mark.parametrize(
-    "pageserver_aux_file_policy", [AuxFileStore.V1, AuxFileStore.CrossValidation]
-)
+@pytest.mark.parametrize("pageserver_aux_file_policy", [AuxFileStore.CrossValidation])
 def test_logical_replication(neon_simple_env: NeonEnv, vanilla_pg):
     env = neon_simple_env
 
@@ -175,9 +173,7 @@ COMMIT;
 
 
 # Test that neon.logical_replication_max_snap_files works
-@pytest.mark.parametrize(
-    "pageserver_aux_file_policy", [AuxFileStore.V1, AuxFileStore.CrossValidation]
-)
+@pytest.mark.parametrize("pageserver_aux_file_policy", [AuxFileStore.CrossValidation])
 def test_obsolete_slot_drop(neon_simple_env: NeonEnv, vanilla_pg):
     def slot_removed(ep):
         assert (
@@ -355,9 +351,7 @@ FROM generate_series(1, 16384) AS seq; -- Inserts enough rows to exceed 16MB of
 #
 # Most pages start with a contrecord, so we don't do anything special
 # to ensure that.
-@pytest.mark.parametrize(
-    "pageserver_aux_file_policy", [AuxFileStore.V1, AuxFileStore.CrossValidation]
-)
+@pytest.mark.parametrize("pageserver_aux_file_policy", [AuxFileStore.CrossValidation])
 def test_restart_endpoint(neon_simple_env: NeonEnv, vanilla_pg):
     env = neon_simple_env
 
@@ -402,9 +396,7 @@ def test_restart_endpoint(neon_simple_env: NeonEnv, vanilla_pg):
 # logical replication bug as such, but without logical replication,
 # records passed ot the WAL redo process are never large enough to hit
 # the bug.
-@pytest.mark.parametrize(
-    "pageserver_aux_file_policy", [AuxFileStore.V1, AuxFileStore.CrossValidation]
-)
+@pytest.mark.parametrize("pageserver_aux_file_policy", [AuxFileStore.CrossValidation])
 def test_large_records(neon_simple_env: NeonEnv, vanilla_pg):
     env = neon_simple_env
 
@@ -476,9 +468,7 @@ def test_slots_and_branching(neon_simple_env: NeonEnv):
     ws_cur.execute("select pg_create_logical_replication_slot('my_slot', 'pgoutput')")
 
 
-@pytest.mark.parametrize(
-    "pageserver_aux_file_policy", [AuxFileStore.V1, AuxFileStore.CrossValidation]
-)
+@pytest.mark.parametrize("pageserver_aux_file_policy", [AuxFileStore.CrossValidation])
 def test_replication_shutdown(neon_simple_env: NeonEnv):
     # Ensure Postgres can exit without stuck when a replication job is active + neon extension installed
     env = neon_simple_env

From ae63ac74887b9658c7a80f369b43247c1db51165 Mon Sep 17 00:00:00 2001
From: Tristan Partin <tristan@neon.tech>
Date: Mon, 12 Aug 2024 14:57:50 -0500
Subject: [PATCH 008/159] Write messages field by field instead of bytes sheet
 in test_simple_sync_safekeepers

Co-authored-by: Arseny Sher <ars@neon.tech>
---
 libs/walproposer/build.rs           |   1 +
 libs/walproposer/src/walproposer.rs | 102 ++++++++++++++++++++--------
 2 files changed, 73 insertions(+), 30 deletions(-)

diff --git a/libs/walproposer/build.rs b/libs/walproposer/build.rs
index 3126b170a4..7bb077062b 100644
--- a/libs/walproposer/build.rs
+++ b/libs/walproposer/build.rs
@@ -95,6 +95,7 @@ fn main() -> anyhow::Result<()> {
         .allowlist_var("ERROR")
         .allowlist_var("FATAL")
         .allowlist_var("PANIC")
+        .allowlist_var("PG_VERSION_NUM")
         .allowlist_var("WPEVENT")
         .allowlist_var("WL_LATCH_SET")
         .allowlist_var("WL_SOCKET_READABLE")
diff --git a/libs/walproposer/src/walproposer.rs b/libs/walproposer/src/walproposer.rs
index 37b1e0fa87..ba75171db2 100644
--- a/libs/walproposer/src/walproposer.rs
+++ b/libs/walproposer/src/walproposer.rs
@@ -282,7 +282,11 @@ mod tests {
     use std::cell::UnsafeCell;
     use utils::id::TenantTimelineId;
 
-    use crate::{api_bindings::Level, bindings::NeonWALReadResult, walproposer::Wrapper};
+    use crate::{
+        api_bindings::Level,
+        bindings::{NeonWALReadResult, PG_VERSION_NUM},
+        walproposer::Wrapper,
+    };
 
     use super::ApiImpl;
 
@@ -489,41 +493,79 @@ mod tests {
 
         let (sender, receiver) = sync_channel(1);
 
+        // Messages definitions are at walproposer.h
+        // xxx: it would be better to extract them from safekeeper crate and
+        // use serialization/deserialization here.
+        let greeting_tag = (b'g' as u64).to_ne_bytes();
+        let proto_version = 2_u32.to_ne_bytes();
+        let pg_version: [u8; 4] = PG_VERSION_NUM.to_ne_bytes();
+        let proposer_id = [0; 16];
+        let system_id = 0_u64.to_ne_bytes();
+        let tenant_id = ttid.tenant_id.as_arr();
+        let timeline_id = ttid.timeline_id.as_arr();
+        let pg_tli = 1_u32.to_ne_bytes();
+        let wal_seg_size = 16777216_u32.to_ne_bytes();
+        let proposer_greeting = [
+            greeting_tag.as_slice(),
+            proto_version.as_slice(),
+            pg_version.as_slice(),
+            proposer_id.as_slice(),
+            system_id.as_slice(),
+            tenant_id.as_slice(),
+            timeline_id.as_slice(),
+            pg_tli.as_slice(),
+            wal_seg_size.as_slice(),
+        ]
+        .concat();
+
+        let voting_tag = (b'v' as u64).to_ne_bytes();
+        let vote_request_term = 3_u64.to_ne_bytes();
+        let proposer_id = [0; 16];
+        let vote_request = [
+            voting_tag.as_slice(),
+            vote_request_term.as_slice(),
+            proposer_id.as_slice(),
+        ]
+        .concat();
+
+        let acceptor_greeting_term = 2_u64.to_ne_bytes();
+        let acceptor_greeting_node_id = 1_u64.to_ne_bytes();
+        let acceptor_greeting = [
+            greeting_tag.as_slice(),
+            acceptor_greeting_term.as_slice(),
+            acceptor_greeting_node_id.as_slice(),
+        ]
+        .concat();
+
+        let vote_response_term = 3_u64.to_ne_bytes();
+        let vote_given = 1_u64.to_ne_bytes();
+        let flush_lsn = 0x539_u64.to_ne_bytes();
+        let truncate_lsn = 0x539_u64.to_ne_bytes();
+        let th_len = 1_u32.to_ne_bytes();
+        let th_term = 2_u64.to_ne_bytes();
+        let th_lsn = 0x539_u64.to_ne_bytes();
+        let timeline_start_lsn = 0x539_u64.to_ne_bytes();
+        let vote_response = [
+            voting_tag.as_slice(),
+            vote_response_term.as_slice(),
+            vote_given.as_slice(),
+            flush_lsn.as_slice(),
+            truncate_lsn.as_slice(),
+            th_len.as_slice(),
+            th_term.as_slice(),
+            th_lsn.as_slice(),
+            timeline_start_lsn.as_slice(),
+        ]
+        .concat();
+
         let my_impl: Box<dyn ApiImpl> = Box::new(MockImpl {
             wait_events: Cell::new(WaitEventsData {
                 sk: std::ptr::null_mut(),
                 event_mask: 0,
             }),
-            expected_messages: vec![
-                // TODO: When updating Postgres versions, this test will cause
-                // problems. Postgres version in message needs updating.
-                //
-                // Greeting(ProposerGreeting { protocol_version: 2, pg_version: 160003, proposer_id: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], system_id: 0, timeline_id: 9e4c8f36063c6c6e93bc20d65a820f3d, tenant_id: 9e4c8f36063c6c6e93bc20d65a820f3d, tli: 1, wal_seg_size: 16777216 })
-                vec![
-                    103, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 3, 113, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-                    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 158, 76, 143, 54, 6, 60, 108, 110,
-                    147, 188, 32, 214, 90, 130, 15, 61, 158, 76, 143, 54, 6, 60, 108, 110, 147,
-                    188, 32, 214, 90, 130, 15, 61, 1, 0, 0, 0, 0, 0, 0, 1,
-                ],
-                // VoteRequest(VoteRequest { term: 3 })
-                vec![
-                    118, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-                    0, 0, 0, 0, 0, 0,
-                ],
-            ],
+            expected_messages: vec![proposer_greeting, vote_request],
             expected_ptr: AtomicUsize::new(0),
-            safekeeper_replies: vec![
-                // Greeting(AcceptorGreeting { term: 2, node_id: NodeId(1) })
-                vec![
-                    103, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
-                ],
-                // VoteResponse(VoteResponse { term: 3, vote_given: 1, flush_lsn: 0/539, truncate_lsn: 0/539, term_history: [(2, 0/539)], timeline_start_lsn: 0/539 })
-                vec![
-                    118, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 57,
-                    5, 0, 0, 0, 0, 0, 0, 57, 5, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0,
-                    0, 57, 5, 0, 0, 0, 0, 0, 0, 57, 5, 0, 0, 0, 0, 0, 0,
-                ],
-            ],
+            safekeeper_replies: vec![acceptor_greeting, vote_response],
             replies_ptr: AtomicUsize::new(0),
             sync_channel: sender,
             shmem: UnsafeCell::new(crate::api_bindings::empty_shmem()),

From 6744ed19d8cc8cd09c6ccbbf66953e6ebb7a480d Mon Sep 17 00:00:00 2001
From: Tristan Partin <tristan@neon.tech>
Date: Tue, 13 Aug 2024 12:20:11 -0500
Subject: [PATCH 009/159] Update Postgres 14 to 14.13

Signed-off-by: Tristan Partin <tristan@neon.tech>
---
 vendor/postgres-v14   | 2 +-
 vendor/revisions.json | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/vendor/postgres-v14 b/vendor/postgres-v14
index 3fd7a45f8a..b6910406e2 160000
--- a/vendor/postgres-v14
+++ b/vendor/postgres-v14
@@ -1 +1 @@
-Subproject commit 3fd7a45f8aae85c080df6329e3c85887b7f3a737
+Subproject commit b6910406e2d05a2c94baa2e530ec882733047759
diff --git a/vendor/revisions.json b/vendor/revisions.json
index 6e3e489b5d..c2b5fb8915 100644
--- a/vendor/revisions.json
+++ b/vendor/revisions.json
@@ -8,7 +8,7 @@
     "46b4b235f38413ab5974bb22c022f9b829257674"
   ],
   "v14": [
-    "14.12",
-    "3fd7a45f8aae85c080df6329e3c85887b7f3a737"
+    "14.13",
+    "b6910406e2d05a2c94baa2e530ec882733047759"
   ]
 }

From 66db381dc9b9238618165c7ef36fa29a0577806c Mon Sep 17 00:00:00 2001
From: Tristan Partin <tristan@neon.tech>
Date: Tue, 13 Aug 2024 12:27:05 -0500
Subject: [PATCH 010/159] Update Postgres 15 to 15.8

Signed-off-by: Tristan Partin <tristan@neon.tech>
---
 vendor/postgres-v15   | 2 +-
 vendor/revisions.json | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/vendor/postgres-v15 b/vendor/postgres-v15
index 46b4b235f3..76063bff63 160000
--- a/vendor/postgres-v15
+++ b/vendor/postgres-v15
@@ -1 +1 @@
-Subproject commit 46b4b235f38413ab5974bb22c022f9b829257674
+Subproject commit 76063bff638ccce7afa99fc9037ac51338b9823d
diff --git a/vendor/revisions.json b/vendor/revisions.json
index c2b5fb8915..2921372c24 100644
--- a/vendor/revisions.json
+++ b/vendor/revisions.json
@@ -4,8 +4,8 @@
     "47a9122a5a150a3217fafd3f3d4fe8e020ea718a"
   ],
   "v15": [
-    "15.7",
-    "46b4b235f38413ab5974bb22c022f9b829257674"
+    "15.8",
+    "76063bff638ccce7afa99fc9037ac51338b9823d"
   ],
   "v14": [
     "14.13",

From 2f8d548a125c490b29eb4a6ab4d79ce358300e74 Mon Sep 17 00:00:00 2001
From: Tristan Partin <tristan@neon.tech>
Date: Tue, 13 Aug 2024 12:44:01 -0500
Subject: [PATCH 011/159] Update Postgres 16 to 16.4

Signed-off-by: Tristan Partin <tristan@neon.tech>
---
 vendor/postgres-v16   | 2 +-
 vendor/revisions.json | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/vendor/postgres-v16 b/vendor/postgres-v16
index 47a9122a5a..8efa089aa7 160000
--- a/vendor/postgres-v16
+++ b/vendor/postgres-v16
@@ -1 +1 @@
-Subproject commit 47a9122a5a150a3217fafd3f3d4fe8e020ea718a
+Subproject commit 8efa089aa7786381543a4f9efc69b92d43eab8c0
diff --git a/vendor/revisions.json b/vendor/revisions.json
index 2921372c24..50cc99c2f1 100644
--- a/vendor/revisions.json
+++ b/vendor/revisions.json
@@ -1,7 +1,7 @@
 {
   "v16": [
-    "16.3",
-    "47a9122a5a150a3217fafd3f3d4fe8e020ea718a"
+    "16.4",
+    "8efa089aa7786381543a4f9efc69b92d43eab8c0"
   ],
   "v15": [
     "15.8",

From f7ab3ffcb781c14bf35da8260518456d00cea04d Mon Sep 17 00:00:00 2001
From: Tristan Partin <tristan@neon.tech>
Date: Tue, 13 Aug 2024 14:05:06 -0500
Subject: [PATCH 012/159] Check that TERM != dumb before using colors in
 pre-commit.py

Signed-off-by: Tristan Partin <tristan@neon.tech>
---
 pre-commit.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/pre-commit.py b/pre-commit.py
index c5ed63ac44..ae432e8225 100755
--- a/pre-commit.py
+++ b/pre-commit.py
@@ -2,6 +2,7 @@
 
 import argparse
 import enum
+import os
 import subprocess
 import sys
 from typing import List
@@ -93,7 +94,7 @@ if __name__ == "__main__":
         "--no-color",
         action="store_true",
         help="disable colored output",
-        default=not sys.stdout.isatty(),
+        default=not sys.stdout.isatty() or os.getenv("TERM") == "dumb",
     )
     args = parser.parse_args()
 

From dbdb8a1187d28cf98c93c9cc39c348db6d7e98f1 Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki@neon.tech>
Date: Fri, 23 Aug 2024 09:15:55 +0300
Subject: [PATCH 013/159] Document how to use "git merge" for PostgreSQL minor
 version upgrades. (#8692)

Our new policy is to use the "rebase" method and slice all the Neon
commits into a nice patch set when doing a new major version, and use
"merge" method on minor version upgrades on the release branches.

"git merge" preserves the git history of Neon commits on the Postgres
branches. While it's nice to rebase all the Neon changes to a logical
patch set against upstream, having to do it between every minor release
is a fair amount work, and it loses the history, and is more
error-prone.
---
 docs/updating-postgres.md | 33 +++++++++++----------------------
 1 file changed, 11 insertions(+), 22 deletions(-)

diff --git a/docs/updating-postgres.md b/docs/updating-postgres.md
index 1868bbf5f7..7913b0a9e2 100644
--- a/docs/updating-postgres.md
+++ b/docs/updating-postgres.md
@@ -21,30 +21,21 @@ _Example: 15.4 is the new minor version to upgrade to from 15.3._
 1. Create a new branch based on the stable branch you are updating.
 
     ```shell
-    git checkout -b my-branch REL_15_STABLE_neon
+    git checkout -b my-branch-15 REL_15_STABLE_neon
     ```
 
-1. Tag the last commit on the stable branch you are updating.
+1. Find the upstream release tags you're looking for. They are of the form `REL_X_Y`.
 
-    ```shell
-    git tag REL_15_3_neon
-    ```
-
-1. Push the new tag to the Neon Postgres repository.
-
-    ```shell
-    git push origin REL_15_3_neon
-    ```
-
-1. Find the release tags you're looking for. They are of the form `REL_X_Y`.
-
-1. Rebase the branch you created on the tag and resolve any conflicts.
+1. Merge the upstream tag into the branch you created on the tag and resolve any conflicts.
 
     ```shell
     git fetch upstream REL_15_4
-    git rebase REL_15_4
+    git merge REL_15_4
     ```
 
+    In the commit message of the merge commit, mention if there were
+    any non-trivial conflicts or other issues.
+
 1. Run the Postgres test suite to make sure our commits have not affected
 Postgres in a negative way.
 
@@ -57,7 +48,7 @@ Postgres in a negative way.
 1. Push your branch to the Neon Postgres repository.
 
     ```shell
-    git push origin my-branch
+    git push origin my-branch-15
     ```
 
 1. Clone the Neon repository if you have not done so already.
@@ -74,7 +65,7 @@ branch.
 1. Update the Git submodule.
 
     ```shell
-    git submodule set-branch --branch my-branch vendor/postgres-v15
+    git submodule set-branch --branch my-branch-15 vendor/postgres-v15
     git submodule update --remote vendor/postgres-v15
     ```
 
@@ -89,14 +80,12 @@ minor Postgres release.
 
 1. Create a pull request, and wait for CI to go green.
 
-1. Force push the rebased Postgres branches into the Neon Postgres repository.
+1. Push the Postgres branches with the merge commits into the Neon Postgres repository.
 
     ```shell
-    git push --force origin my-branch:REL_15_STABLE_neon
+    git push origin my-branch-15:REL_15_STABLE_neon
     ```
 
-    It may require disabling various branch protections.
-
 1. Update your Neon PR to point at the branches.
 
     ```shell

From d8ca495eae816ddfd5a06fed4e1e668fe1edad91 Mon Sep 17 00:00:00 2001
From: MMeent <matthias@neon.tech>
Date: Fri, 23 Aug 2024 12:48:26 +0200
Subject: [PATCH 014/159] Require poetry >=1.8 (#8812)

This was already a requirement for installing the python packages after
https://github.com/neondatabase/neon/pull/8609 got merged, so this
updates the documentation to reflect that.
---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index f01442da5d..735edef0fc 100644
--- a/README.md
+++ b/README.md
@@ -126,7 +126,7 @@ make -j`sysctl -n hw.logicalcpu` -s
 To run the `psql` client, install the `postgresql-client` package or modify `PATH` and `LD_LIBRARY_PATH` to include `pg_install/bin` and `pg_install/lib`, respectively.
 
 To run the integration tests or Python scripts (not required to use the code), install
-Python (3.9 or higher), and install the python3 packages using `./scripts/pysync` (requires [poetry>=1.3](https://python-poetry.org/)) in the project directory.
+Python (3.9 or higher), and install the python3 packages using `./scripts/pysync` (requires [poetry>=1.8](https://python-poetry.org/)) in the project directory.
 
 
 #### Running neon database

From e80ab8fd6a99bf46463695986b9f19e2cb06c8d0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Arpad=20M=C3=BCller?= <arpad-m@users.noreply.github.com>
Date: Fri, 23 Aug 2024 13:14:14 +0200
Subject: [PATCH 015/159] Update serde_json to 1.0.125 (#8813)

Updates `serde_json` to `1.0.125`, rolling out speedups added by a
serde_json contributor.

Release [link](https://github.com/serde-rs/json/releases/tag/1.0.125).
Blog post
[link](https://purplesyringa.moe/blog/i-sped-up-serde-json-strings-by-20-percent/).
---
 Cargo.lock | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index a506da8c02..250427da2b 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -5590,11 +5590,12 @@ dependencies = [
 
 [[package]]
 name = "serde_json"
-version = "1.0.96"
+version = "1.0.125"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "057d394a50403bcac12672b2b18fb387ab6d289d957dab67dd201875391e52f1"
+checksum = "83c8e735a073ccf5be70aa8066aa984eaf2fa000db6c8d0100ae605b366d31ed"
 dependencies = [
  "itoa",
+ "memchr",
  "ryu",
  "serde",
 ]

From e62cd9e121928eca4f1f6b3ded4f5deb7e0a6110 Mon Sep 17 00:00:00 2001
From: Alexander Bayandin <alexander@neon.tech>
Date: Fri, 23 Aug 2024 14:29:11 +0100
Subject: [PATCH 016/159] CI(autocomment): add arch to build type (#8809)

## Problem

Failed / flaky tests for different arches don't have any difference in
GitHub Autocomment

## Summary of changes
- Add arch to build type for GitHub autocomment
---
 scripts/comment-test-report.js | 19 ++++++++++++++++---
 1 file changed, 16 insertions(+), 3 deletions(-)

diff --git a/scripts/comment-test-report.js b/scripts/comment-test-report.js
index f42262cf48..e8e0b3c23a 100755
--- a/scripts/comment-test-report.js
+++ b/scripts/comment-test-report.js
@@ -68,16 +68,29 @@ const parseReportJson = async ({ reportJsonUrl, fetch }) => {
                     console.info(`Cannot get BUILD_TYPE and Postgres Version from test name: "${test.name}", defaulting to "release" and "14"`)
 
                     buildType = "release"
-                    pgVersion = "14"
+                    pgVersion = "16"
                 }
 
                 pgVersions.add(pgVersion)
 
+                // We use `arch` as it is returned by GitHub Actions
+                //  (RUNNER_ARCH env var): X86, X64, ARM, or ARM64
+                // Ref https://docs.github.com/en/actions/writing-workflows/choosing-what-your-workflow-does/store-information-in-variables#default-environment-variables
+                let arch = ""
+                if (test.parameters.includes("'X64'")) {
+                    arch = "x86-64"
+                } else if (test.parameters.includes("'ARM64'")) {
+                    arch = "arm64"
+                } else {
+                    arch = "unknown"
+                }
+
                 // Removing build type and PostgreSQL version from the test name to make it shorter
                 const testName = test.name.replace(new RegExp(`${buildType}-pg${pgVersion}-?`), "").replace("[]", "")
                 test.pytestName = `${parentSuite.name.replace(".", "/")}/${suite.name}.py::${testName}`
                 test.pgVersion = pgVersion
                 test.buildType = buildType
+                test.arch = arch
 
                 if (test.status === "passed") {
                     passedTests[pgVersion][testName].push(test)
@@ -144,7 +157,7 @@ const reportSummary = async (params) => {
                 const links = []
                 for (const test of tests) {
                     const allureLink = `${reportUrl}#suites/${test.parentUid}/${test.uid}`
-                    links.push(`[${test.buildType}](${allureLink})`)
+                    links.push(`[${test.buildType}-${test.arch}](${allureLink})`)
                 }
                 summary += `- \`${testName}\`: ${links.join(", ")}\n`
             }
@@ -175,7 +188,7 @@ const reportSummary = async (params) => {
                     const links = []
                     for (const test of tests) {
                         const allureLink = `${reportUrl}#suites/${test.parentUid}/${test.uid}/retries`
-                        links.push(`[${test.buildType}](${allureLink})`)
+                        links.push(`[${test.buildType}-${test.arch}](${allureLink})`)
                     }
                     summary += `- \`${testName}\`: ${links.join(", ")}\n`
                 }

From 6a74bcadecd0ce4f088b5a22c6183ff980559d87 Mon Sep 17 00:00:00 2001
From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com>
Date: Fri, 23 Aug 2024 09:32:00 -0400
Subject: [PATCH 017/159] feat(pageserver): remove features=testing restriction
 for compact (#8815)

A small PR to make it possible to run force compaction in staging for
btm-gc compaction testing.

Part of https://github.com/neondatabase/neon/issues/8002

Signed-off-by: Alex Chi Z <chi@neon.tech>
---
 pageserver/src/http/routes.rs | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs
index a4da8506d6..4635e76ea9 100644
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -1706,11 +1706,6 @@ async fn timeline_compact_handler(
         flags |= CompactFlags::ForceImageLayerCreation;
     }
     if Some(true) == parse_query_param::<_, bool>(&request, "enhanced_gc_bottom_most_compaction")? {
-        if !cfg!(feature = "testing") {
-            return Err(ApiError::InternalServerError(anyhow!(
-                "enhanced_gc_bottom_most_compaction is only available in testing mode"
-            )));
-        }
         flags |= CompactFlags::EnhancedGcBottomMostCompaction;
     }
     let wait_until_uploaded =
@@ -2942,7 +2937,7 @@ pub fn make_router(
         )
         .put(
             "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/compact",
-            |r| testing_api_handler("run timeline compaction", r, timeline_compact_handler),
+            |r| api_handler(r, timeline_compact_handler),
         )
         .put(
             "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/checkpoint",

From bc8cfe1b5567715995b884231bc2785a32307ce8 Mon Sep 17 00:00:00 2001
From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com>
Date: Fri, 23 Aug 2024 09:42:45 -0400
Subject: [PATCH 018/159] fix(pageserver): l0 check criteria (#8797)

close https://github.com/neondatabase/neon/issues/8579

## Summary of changes

The `is_l0` check now takes both layer key range and the layer type.
This allows us to have image layers covering the full key range in
btm-most compaction (upcoming PR). However, we still don't allow delta
layers to cover the full key range, and I will make btm-most compaction
to generate delta layers with the key range of the keys existing in the
layer instead of `Key::MIN..Key::HACK_MAX` (upcoming PR).


Signed-off-by: Alex Chi Z <chi@neon.tech>
---
 pageserver/src/tenant/layer_map.rs                | 10 +++++-----
 pageserver/src/tenant/storage_layer/layer.rs      |  5 ++++-
 pageserver/src/tenant/storage_layer/layer_name.rs |  4 ++++
 pageserver/src/tenant/timeline.rs                 |  9 ++++++---
 storage_scrubber/src/checks.rs                    |  2 +-
 5 files changed, 20 insertions(+), 10 deletions(-)

diff --git a/pageserver/src/tenant/layer_map.rs b/pageserver/src/tenant/layer_map.rs
index 844f117ea2..707233b003 100644
--- a/pageserver/src/tenant/layer_map.rs
+++ b/pageserver/src/tenant/layer_map.rs
@@ -464,7 +464,7 @@ impl LayerMap {
     pub(self) fn insert_historic_noflush(&mut self, layer_desc: PersistentLayerDesc) {
         // TODO: See #3869, resulting #4088, attempted fix and repro #4094
 
-        if Self::is_l0(&layer_desc.key_range) {
+        if Self::is_l0(&layer_desc.key_range, layer_desc.is_delta) {
             self.l0_delta_layers.push(layer_desc.clone().into());
         }
 
@@ -483,7 +483,7 @@ impl LayerMap {
         self.historic
             .remove(historic_layer_coverage::LayerKey::from(layer_desc));
         let layer_key = layer_desc.key();
-        if Self::is_l0(&layer_desc.key_range) {
+        if Self::is_l0(&layer_desc.key_range, layer_desc.is_delta) {
             let len_before = self.l0_delta_layers.len();
             let mut l0_delta_layers = std::mem::take(&mut self.l0_delta_layers);
             l0_delta_layers.retain(|other| other.key() != layer_key);
@@ -600,8 +600,8 @@ impl LayerMap {
     }
 
     /// Check if the key range resembles that of an L0 layer.
-    pub fn is_l0(key_range: &Range<Key>) -> bool {
-        key_range == &(Key::MIN..Key::MAX)
+    pub fn is_l0(key_range: &Range<Key>, is_delta_layer: bool) -> bool {
+        is_delta_layer && key_range == &(Key::MIN..Key::MAX)
     }
 
     /// This function determines which layers are counted in `count_deltas`:
@@ -628,7 +628,7 @@ impl LayerMap {
     ///      than just the current partition_range.
     pub fn is_reimage_worthy(layer: &PersistentLayerDesc, partition_range: &Range<Key>) -> bool {
         // Case 1
-        if !Self::is_l0(&layer.key_range) {
+        if !Self::is_l0(&layer.key_range, layer.is_delta) {
             return true;
         }
 
diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs
index 774f97e1d9..2607b574e7 100644
--- a/pageserver/src/tenant/storage_layer/layer.rs
+++ b/pageserver/src/tenant/storage_layer/layer.rs
@@ -1296,7 +1296,10 @@ impl LayerInner {
                 lsn_end: lsn_range.end,
                 remote: !resident,
                 access_stats,
-                l0: crate::tenant::layer_map::LayerMap::is_l0(&self.layer_desc().key_range),
+                l0: crate::tenant::layer_map::LayerMap::is_l0(
+                    &self.layer_desc().key_range,
+                    self.layer_desc().is_delta,
+                ),
             }
         } else {
             let lsn = self.desc.image_layer_lsn();
diff --git a/pageserver/src/tenant/storage_layer/layer_name.rs b/pageserver/src/tenant/storage_layer/layer_name.rs
index f33ca076ab..47ae556279 100644
--- a/pageserver/src/tenant/storage_layer/layer_name.rs
+++ b/pageserver/src/tenant/storage_layer/layer_name.rs
@@ -256,6 +256,10 @@ impl LayerName {
             LayerName::Delta(layer) => &layer.key_range,
         }
     }
+
+    pub fn is_delta(&self) -> bool {
+        matches!(self, LayerName::Delta(_))
+    }
 }
 
 impl fmt::Display for LayerName {
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index dc9cddea43..b33e436fce 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -3002,7 +3002,10 @@ impl Timeline {
         // - For L1 & image layers, download most recent LSNs first: the older the LSN, the sooner
         //   the layer is likely to be covered by an image layer during compaction.
         layers.sort_by_key(|(desc, _meta, _atime)| {
-            std::cmp::Reverse((!LayerMap::is_l0(&desc.key_range), desc.lsn_range.end))
+            std::cmp::Reverse((
+                !LayerMap::is_l0(&desc.key_range, desc.is_delta),
+                desc.lsn_range.end,
+            ))
         });
 
         let layers = layers
@@ -4585,7 +4588,7 @@ impl Timeline {
                 // for compact_level0_phase1 creating an L0, which does not happen in practice
                 // because we have not implemented L0 => L0 compaction.
                 duplicated_layers.insert(l.layer_desc().key());
-            } else if LayerMap::is_l0(&l.layer_desc().key_range) {
+            } else if LayerMap::is_l0(&l.layer_desc().key_range, l.layer_desc().is_delta) {
                 return Err(CompactionError::Other(anyhow::anyhow!("compaction generates a L0 layer file as output, which will cause infinite compaction.")));
             } else {
                 insert_layers.push(l.clone());
@@ -5877,7 +5880,7 @@ mod tests {
             };
 
             // Apart from L0s, newest Layers should come first
-            if !LayerMap::is_l0(layer.name.key_range()) {
+            if !LayerMap::is_l0(layer.name.key_range(), layer.name.is_delta()) {
                 assert!(layer_lsn <= last_lsn);
                 last_lsn = layer_lsn;
             }
diff --git a/storage_scrubber/src/checks.rs b/storage_scrubber/src/checks.rs
index b35838bcf7..08b0f06ebf 100644
--- a/storage_scrubber/src/checks.rs
+++ b/storage_scrubber/src/checks.rs
@@ -150,7 +150,7 @@ pub(crate) async fn branch_cleanup_and_check_errors(
 
                             if response.is_err() {
                                 // Object is not present.
-                                let is_l0 = LayerMap::is_l0(layer.key_range());
+                                let is_l0 = LayerMap::is_l0(layer.key_range(), layer.is_delta());
 
                                 let msg = format!(
                                     "index_part.json contains a layer {}{} (shard {}) that is not present in remote storage (layer_is_l0: {})",

From 73286e6b9f8a0ba4fb00dd4b44e613963b62cb21 Mon Sep 17 00:00:00 2001
From: Joonas Koivunen <joonas@neon.tech>
Date: Fri, 23 Aug 2024 16:43:08 +0300
Subject: [PATCH 019/159] test: copy dict to avoid error on retry (#8811)

there is no "const" in python, so when we modify the global dict, it
will remain that way on the retry. fix to not have it influence other
tests which might be run on the same runner.

evidence:
<https://neon-github-public-dev.s3.amazonaws.com/reports/pr-8625/10513146742/index.html#/testresult/453c4ce05ada7496>
---
 test_runner/fixtures/pageserver/utils.py       | 17 +++++++++++------
 test_runner/regress/test_s3_restore.py         |  6 +++---
 test_runner/regress/test_storage_controller.py |  4 ++--
 test_runner/regress/test_tenant_delete.py      |  8 ++++----
 test_runner/regress/test_timeline_delete.py    |  4 ++--
 5 files changed, 22 insertions(+), 17 deletions(-)

diff --git a/test_runner/fixtures/pageserver/utils.py b/test_runner/fixtures/pageserver/utils.py
index b75a480a63..a74fef6a60 100644
--- a/test_runner/fixtures/pageserver/utils.py
+++ b/test_runner/fixtures/pageserver/utils.py
@@ -430,12 +430,17 @@ def enable_remote_storage_versioning(
     return response
 
 
-MANY_SMALL_LAYERS_TENANT_CONFIG = {
-    "gc_period": "0s",
-    "compaction_period": "0s",
-    "checkpoint_distance": 1024**2,
-    "image_creation_threshold": 100,
-}
+def many_small_layers_tenant_config() -> Dict[str, Any]:
+    """
+    Create a new dict to avoid issues with deleting from the global value.
+    In python, the global is mutable.
+    """
+    return {
+        "gc_period": "0s",
+        "compaction_period": "0s",
+        "checkpoint_distance": 1024**2,
+        "image_creation_threshold": 100,
+    }
 
 
 def poll_for_remote_storage_iterations(remote_storage_kind: RemoteStorageKind) -> int:
diff --git a/test_runner/regress/test_s3_restore.py b/test_runner/regress/test_s3_restore.py
index 9992647e56..c1a80a54bc 100644
--- a/test_runner/regress/test_s3_restore.py
+++ b/test_runner/regress/test_s3_restore.py
@@ -8,9 +8,9 @@ from fixtures.neon_fixtures import (
     PgBin,
 )
 from fixtures.pageserver.utils import (
-    MANY_SMALL_LAYERS_TENANT_CONFIG,
     assert_prefix_empty,
     enable_remote_storage_versioning,
+    many_small_layers_tenant_config,
     wait_for_upload,
 )
 from fixtures.remote_storage import RemoteStorageKind, s3_storage
@@ -33,7 +33,7 @@ def test_tenant_s3_restore(
 
     # change it back after initdb, recovery doesn't work if the two
     # index_part.json uploads happen at same second or too close to each other.
-    initial_tenant_conf = MANY_SMALL_LAYERS_TENANT_CONFIG
+    initial_tenant_conf = many_small_layers_tenant_config()
     del initial_tenant_conf["checkpoint_distance"]
 
     env = neon_env_builder.init_start(initial_tenant_conf)
@@ -50,7 +50,7 @@ def test_tenant_s3_restore(
     tenant_id = env.initial_tenant
 
     # now lets create the small layers
-    ps_http.set_tenant_config(tenant_id, MANY_SMALL_LAYERS_TENANT_CONFIG)
+    ps_http.set_tenant_config(tenant_id, many_small_layers_tenant_config())
 
     # Default tenant and the one we created
     assert ps_http.get_metric_value("pageserver_tenant_manager_slots", {"mode": "attached"}) == 1
diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py
index 94d71a7677..b3464b0c91 100644
--- a/test_runner/regress/test_storage_controller.py
+++ b/test_runner/regress/test_storage_controller.py
@@ -23,11 +23,11 @@ from fixtures.neon_fixtures import (
 )
 from fixtures.pageserver.http import PageserverHttpClient
 from fixtures.pageserver.utils import (
-    MANY_SMALL_LAYERS_TENANT_CONFIG,
     assert_prefix_empty,
     assert_prefix_not_empty,
     enable_remote_storage_versioning,
     list_prefix,
+    many_small_layers_tenant_config,
     remote_storage_delete_key,
     timeline_delete_wait_completed,
 )
@@ -654,7 +654,7 @@ def test_storage_controller_s3_time_travel_recovery(
         tenant_id,
         shard_count=2,
         shard_stripe_size=8192,
-        tenant_config=MANY_SMALL_LAYERS_TENANT_CONFIG,
+        tenant_config=many_small_layers_tenant_config(),
     )
 
     # Check that the consistency check passes
diff --git a/test_runner/regress/test_tenant_delete.py b/test_runner/regress/test_tenant_delete.py
index dadf5ca672..448a28dc31 100644
--- a/test_runner/regress/test_tenant_delete.py
+++ b/test_runner/regress/test_tenant_delete.py
@@ -9,9 +9,9 @@ from fixtures.neon_fixtures import (
 )
 from fixtures.pageserver.http import PageserverApiException
 from fixtures.pageserver.utils import (
-    MANY_SMALL_LAYERS_TENANT_CONFIG,
     assert_prefix_empty,
     assert_prefix_not_empty,
+    many_small_layers_tenant_config,
     wait_for_upload,
 )
 from fixtures.remote_storage import RemoteStorageKind, s3_storage
@@ -76,7 +76,7 @@ def test_tenant_delete_smoke(
 
     env.neon_cli.create_tenant(
         tenant_id=tenant_id,
-        conf=MANY_SMALL_LAYERS_TENANT_CONFIG,
+        conf=many_small_layers_tenant_config(),
     )
 
     # Default tenant and the one we created
@@ -215,7 +215,7 @@ def test_tenant_delete_races_timeline_creation(neon_env_builder: NeonEnvBuilder)
     # (and there is no way to reconstruct the used remote storage kind)
     remote_storage_kind = RemoteStorageKind.MOCK_S3
     neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind)
-    env = neon_env_builder.init_start(initial_tenant_conf=MANY_SMALL_LAYERS_TENANT_CONFIG)
+    env = neon_env_builder.init_start(initial_tenant_conf=many_small_layers_tenant_config())
     ps_http = env.pageserver.http_client()
     tenant_id = env.initial_tenant
 
@@ -330,7 +330,7 @@ def test_tenant_delete_scrubber(pg_bin: PgBin, neon_env_builder: NeonEnvBuilder)
 
     remote_storage_kind = RemoteStorageKind.MOCK_S3
     neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind)
-    env = neon_env_builder.init_start(initial_tenant_conf=MANY_SMALL_LAYERS_TENANT_CONFIG)
+    env = neon_env_builder.init_start(initial_tenant_conf=many_small_layers_tenant_config())
 
     ps_http = env.pageserver.http_client()
     # create a tenant separate from the main tenant so that we have one remaining
diff --git a/test_runner/regress/test_timeline_delete.py b/test_runner/regress/test_timeline_delete.py
index 6d96dda391..328131cd08 100644
--- a/test_runner/regress/test_timeline_delete.py
+++ b/test_runner/regress/test_timeline_delete.py
@@ -16,9 +16,9 @@ from fixtures.neon_fixtures import (
 )
 from fixtures.pageserver.http import PageserverApiException
 from fixtures.pageserver.utils import (
-    MANY_SMALL_LAYERS_TENANT_CONFIG,
     assert_prefix_empty,
     assert_prefix_not_empty,
+    many_small_layers_tenant_config,
     poll_for_remote_storage_iterations,
     timeline_delete_wait_completed,
     wait_for_last_record_lsn,
@@ -782,7 +782,7 @@ def test_timeline_delete_resumed_on_attach(
     remote_storage_kind = s3_storage()
     neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind)
 
-    env = neon_env_builder.init_start(initial_tenant_conf=MANY_SMALL_LAYERS_TENANT_CONFIG)
+    env = neon_env_builder.init_start(initial_tenant_conf=many_small_layers_tenant_config())
 
     tenant_id = env.initial_tenant
 

From bcc68a7866c633d74a482266bfe34053a093b9d8 Mon Sep 17 00:00:00 2001
From: Vlad Lazar <vlad@neon.tech>
Date: Fri, 23 Aug 2024 14:48:06 +0100
Subject: [PATCH 020/159] storcon_cli: add support for drain and fill
 operations (#8791)

## Problem
We have been naughty and curl-ed storcon to fix-up drains and fills.

## Summary of changes
Add support for starting/cancelling drain/fill operations via
`storcon_cli`.
---
 control_plane/storcon_cli/src/main.rs | 135 ++++++++++++++++++++++++--
 storage_controller/src/http.rs        |   1 -
 2 files changed, 126 insertions(+), 10 deletions(-)

diff --git a/control_plane/storcon_cli/src/main.rs b/control_plane/storcon_cli/src/main.rs
index e27491c1c8..35510ccbca 100644
--- a/control_plane/storcon_cli/src/main.rs
+++ b/control_plane/storcon_cli/src/main.rs
@@ -147,9 +147,9 @@ enum Command {
         #[arg(long)]
         threshold: humantime::Duration,
     },
-    // Drain a set of specified pageservers by moving the primary attachments to pageservers
+    // Migrate away from a set of specified pageservers by moving the primary attachments to pageservers
     // outside of the specified set.
-    Drain {
+    BulkMigrate {
         // Set of pageserver node ids to drain.
         #[arg(long)]
         nodes: Vec<NodeId>,
@@ -163,6 +163,34 @@ enum Command {
         #[arg(long)]
         dry_run: Option<bool>,
     },
+    /// Start draining the specified pageserver.
+    /// The drain is complete when the schedulling policy returns to active.
+    StartDrain {
+        #[arg(long)]
+        node_id: NodeId,
+    },
+    /// Cancel draining the specified pageserver and wait for `timeout`
+    /// for the operation to be canceled. May be retried.
+    CancelDrain {
+        #[arg(long)]
+        node_id: NodeId,
+        #[arg(long)]
+        timeout: humantime::Duration,
+    },
+    /// Start filling the specified pageserver.
+    /// The drain is complete when the schedulling policy returns to active.
+    StartFill {
+        #[arg(long)]
+        node_id: NodeId,
+    },
+    /// Cancel filling the specified pageserver and wait for `timeout`
+    /// for the operation to be canceled. May be retried.
+    CancelFill {
+        #[arg(long)]
+        node_id: NodeId,
+        #[arg(long)]
+        timeout: humantime::Duration,
+    },
 }
 
 #[derive(Parser)]
@@ -249,6 +277,34 @@ impl FromStr for NodeAvailabilityArg {
     }
 }
 
+async fn wait_for_scheduling_policy<F>(
+    client: Client,
+    node_id: NodeId,
+    timeout: Duration,
+    f: F,
+) -> anyhow::Result<NodeSchedulingPolicy>
+where
+    F: Fn(NodeSchedulingPolicy) -> bool,
+{
+    let waiter = tokio::time::timeout(timeout, async move {
+        loop {
+            let node = client
+                .dispatch::<(), NodeDescribeResponse>(
+                    Method::GET,
+                    format!("control/v1/node/{node_id}"),
+                    None,
+                )
+                .await?;
+
+            if f(node.scheduling) {
+                return Ok::<NodeSchedulingPolicy, mgmt_api::Error>(node.scheduling);
+            }
+        }
+    });
+
+    Ok(waiter.await??)
+}
+
 #[tokio::main]
 async fn main() -> anyhow::Result<()> {
     let cli = Cli::parse();
@@ -628,7 +684,7 @@ async fn main() -> anyhow::Result<()> {
                 })
                 .await?;
         }
-        Command::Drain {
+        Command::BulkMigrate {
             nodes,
             concurrency,
             max_shards,
@@ -657,7 +713,7 @@ async fn main() -> anyhow::Result<()> {
             }
 
             if nodes.len() != node_to_drain_descs.len() {
-                anyhow::bail!("Drain requested for node which doesn't exist.")
+                anyhow::bail!("Bulk migration requested away from node which doesn't exist.")
             }
 
             node_to_fill_descs.retain(|desc| {
@@ -669,7 +725,7 @@ async fn main() -> anyhow::Result<()> {
             });
 
             if node_to_fill_descs.is_empty() {
-                anyhow::bail!("There are no nodes to drain to")
+                anyhow::bail!("There are no nodes to migrate to")
             }
 
             // Set the node scheduling policy to draining for the nodes which
@@ -690,7 +746,7 @@ async fn main() -> anyhow::Result<()> {
                     .await?;
             }
 
-            // Perform the drain: move each tenant shard scheduled on a node to
+            // Perform the migration: move each tenant shard scheduled on a node to
             // be drained to a node which is being filled. A simple round robin
             // strategy is used to pick the new node.
             let tenants = storcon_client
@@ -703,13 +759,13 @@ async fn main() -> anyhow::Result<()> {
 
             let mut selected_node_idx = 0;
 
-            struct DrainMove {
+            struct MigrationMove {
                 tenant_shard_id: TenantShardId,
                 from: NodeId,
                 to: NodeId,
             }
 
-            let mut moves: Vec<DrainMove> = Vec::new();
+            let mut moves: Vec<MigrationMove> = Vec::new();
 
             let shards = tenants
                 .into_iter()
@@ -739,7 +795,7 @@ async fn main() -> anyhow::Result<()> {
                     continue;
                 }
 
-                moves.push(DrainMove {
+                moves.push(MigrationMove {
                     tenant_shard_id: shard.tenant_shard_id,
                     from: shard
                         .node_attached
@@ -816,6 +872,67 @@ async fn main() -> anyhow::Result<()> {
                 failure
             );
         }
+        Command::StartDrain { node_id } => {
+            storcon_client
+                .dispatch::<(), ()>(
+                    Method::PUT,
+                    format!("control/v1/node/{node_id}/drain"),
+                    None,
+                )
+                .await?;
+            println!("Drain started for {node_id}");
+        }
+        Command::CancelDrain { node_id, timeout } => {
+            storcon_client
+                .dispatch::<(), ()>(
+                    Method::DELETE,
+                    format!("control/v1/node/{node_id}/drain"),
+                    None,
+                )
+                .await?;
+
+            println!("Waiting for node {node_id} to quiesce on scheduling policy ...");
+
+            let final_policy =
+                wait_for_scheduling_policy(storcon_client, node_id, *timeout, |sched| {
+                    use NodeSchedulingPolicy::*;
+                    matches!(sched, Active | PauseForRestart)
+                })
+                .await?;
+
+            println!(
+                "Drain was cancelled for node {node_id}. Schedulling policy is now {final_policy:?}"
+            );
+        }
+        Command::StartFill { node_id } => {
+            storcon_client
+                .dispatch::<(), ()>(Method::PUT, format!("control/v1/node/{node_id}/fill"), None)
+                .await?;
+
+            println!("Fill started for {node_id}");
+        }
+        Command::CancelFill { node_id, timeout } => {
+            storcon_client
+                .dispatch::<(), ()>(
+                    Method::DELETE,
+                    format!("control/v1/node/{node_id}/fill"),
+                    None,
+                )
+                .await?;
+
+            println!("Waiting for node {node_id} to quiesce on scheduling policy ...");
+
+            let final_policy =
+                wait_for_scheduling_policy(storcon_client, node_id, *timeout, |sched| {
+                    use NodeSchedulingPolicy::*;
+                    matches!(sched, Active)
+                })
+                .await?;
+
+            println!(
+                "Fill was cancelled for node {node_id}. Schedulling policy is now {final_policy:?}"
+            );
+        }
     }
 
     Ok(())
diff --git a/storage_controller/src/http.rs b/storage_controller/src/http.rs
index 7bbd1541cf..207bd5a1e6 100644
--- a/storage_controller/src/http.rs
+++ b/storage_controller/src/http.rs
@@ -1074,7 +1074,6 @@ pub fn make_router(
                 RequestName("control_v1_metadata_health_list_outdated"),
             )
         })
-        // TODO(vlad): endpoint for cancelling drain and fill
         // Tenant Shard operations
         .put("/control/v1/tenant/:tenant_shard_id/migrate", |r| {
             tenant_service_handler(

From 612b643315fbda4b489ae512b14d9bd66a4fbacb Mon Sep 17 00:00:00 2001
From: Conrad Ludgate <conrad@neon.tech>
Date: Fri, 23 Aug 2024 16:28:22 +0100
Subject: [PATCH 021/159] update diesel (#8816)

https://rustsec.org/advisories/RUSTSEC-2024-0365
---
 Cargo.lock | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index 250427da2b..441ca1ff86 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1672,9 +1672,9 @@ dependencies = [
 
 [[package]]
 name = "diesel"
-version = "2.2.1"
+version = "2.2.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "62d6dcd069e7b5fe49a302411f759d4cf1cf2c27fe798ef46fb8baefc053dd2b"
+checksum = "65e13bab2796f412722112327f3e575601a3e9cdcbe426f0d30dbf43f3f5dc71"
 dependencies = [
  "bitflags 2.4.1",
  "byteorder",

From f4cac1f30f096ceb8c1fa4a3281319883d10be6e Mon Sep 17 00:00:00 2001
From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com>
Date: Sat, 24 Aug 2024 00:38:42 +0800
Subject: [PATCH 022/159] impr(pageserver): error if keys are unordered in
 merge iter (#8818)

In case of corrupted delta layers, we can detect the corruption and bail
out the compaction.

## Summary of changes

* Detect wrong delta desc of key range
* Detect unordered deltas

Signed-off-by: Alex Chi Z <chi@neon.tech>
---
 .../src/tenant/storage_layer/delta_layer.rs   | 16 +++++++++++
 .../src/tenant/storage_layer/image_layer.rs   | 15 +++++++++++
 .../tenant/storage_layer/merge_iterator.rs    | 27 ++++++++++++++++++-
 3 files changed, 57 insertions(+), 1 deletion(-)

diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs
index 6c2391d72d..b1b5217f7f 100644
--- a/pageserver/src/tenant/storage_layer/delta_layer.rs
+++ b/pageserver/src/tenant/storage_layer/delta_layer.rs
@@ -232,6 +232,18 @@ pub struct DeltaLayerInner {
     max_vectored_read_bytes: Option<MaxVectoredReadBytes>,
 }
 
+impl DeltaLayerInner {
+    pub(crate) fn layer_dbg_info(&self) -> String {
+        format!(
+            "delta {}..{} {}..{}",
+            self.key_range().start,
+            self.key_range().end,
+            self.lsn_range().start,
+            self.lsn_range().end
+        )
+    }
+}
+
 impl std::fmt::Debug for DeltaLayerInner {
     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
         f.debug_struct("DeltaLayerInner")
@@ -1527,6 +1539,10 @@ pub struct DeltaLayerIterator<'a> {
 }
 
 impl<'a> DeltaLayerIterator<'a> {
+    pub(crate) fn layer_dbg_info(&self) -> String {
+        self.delta_layer.layer_dbg_info()
+    }
+
     /// Retrieve a batch of key-value pairs into the iterator buffer.
     async fn next_batch(&mut self) -> anyhow::Result<()> {
         assert!(self.key_values_batch.is_empty());
diff --git a/pageserver/src/tenant/storage_layer/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs
index 9a19e4e2c7..94120a4e3e 100644
--- a/pageserver/src/tenant/storage_layer/image_layer.rs
+++ b/pageserver/src/tenant/storage_layer/image_layer.rs
@@ -167,6 +167,17 @@ pub struct ImageLayerInner {
     max_vectored_read_bytes: Option<MaxVectoredReadBytes>,
 }
 
+impl ImageLayerInner {
+    pub(crate) fn layer_dbg_info(&self) -> String {
+        format!(
+            "image {}..{} {}",
+            self.key_range().start,
+            self.key_range().end,
+            self.lsn()
+        )
+    }
+}
+
 impl std::fmt::Debug for ImageLayerInner {
     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
         f.debug_struct("ImageLayerInner")
@@ -1024,6 +1035,10 @@ pub struct ImageLayerIterator<'a> {
 }
 
 impl<'a> ImageLayerIterator<'a> {
+    pub(crate) fn layer_dbg_info(&self) -> String {
+        self.image_layer.layer_dbg_info()
+    }
+
     /// Retrieve a batch of key-value pairs into the iterator buffer.
     async fn next_batch(&mut self) -> anyhow::Result<()> {
         assert!(self.key_values_batch.is_empty());
diff --git a/pageserver/src/tenant/storage_layer/merge_iterator.rs b/pageserver/src/tenant/storage_layer/merge_iterator.rs
index b4bd976033..d2c341e5ce 100644
--- a/pageserver/src/tenant/storage_layer/merge_iterator.rs
+++ b/pageserver/src/tenant/storage_layer/merge_iterator.rs
@@ -3,6 +3,7 @@ use std::{
     collections::{binary_heap, BinaryHeap},
 };
 
+use anyhow::bail;
 use pageserver_api::key::Key;
 use utils::lsn::Lsn;
 
@@ -26,6 +27,13 @@ impl<'a> LayerRef<'a> {
             Self::Delta(x) => LayerIterRef::Delta(x.iter(ctx)),
         }
     }
+
+    fn layer_dbg_info(&self) -> String {
+        match self {
+            Self::Image(x) => x.layer_dbg_info(),
+            Self::Delta(x) => x.layer_dbg_info(),
+        }
+    }
 }
 
 enum LayerIterRef<'a> {
@@ -40,6 +48,13 @@ impl LayerIterRef<'_> {
             Self::Image(x) => x.next().await,
         }
     }
+
+    fn layer_dbg_info(&self) -> String {
+        match self {
+            Self::Image(x) => x.layer_dbg_info(),
+            Self::Delta(x) => x.layer_dbg_info(),
+        }
+    }
 }
 
 /// This type plays several roles at once
@@ -75,6 +90,11 @@ impl<'a> PeekableLayerIterRef<'a> {
     async fn next(&mut self) -> anyhow::Result<Option<(Key, Lsn, Value)>> {
         let result = self.peeked.take();
         self.peeked = self.iter.next().await?;
+        if let (Some((k1, l1, _)), Some((k2, l2, _))) = (&self.peeked, &result) {
+            if (k1, l1) < (k2, l2) {
+                bail!("iterator is not ordered: {}", self.iter.layer_dbg_info());
+            }
+        }
         Ok(result)
     }
 }
@@ -178,7 +198,12 @@ impl<'a> IteratorWrapper<'a> {
         let iter = PeekableLayerIterRef::create(iter).await?;
         if let Some((k1, l1, _)) = iter.peek() {
             let (k2, l2) = first_key_lower_bound;
-            debug_assert!((k1, l1) >= (k2, l2));
+            if (k1, l1) < (k2, l2) {
+                bail!(
+                    "layer key range did not include the first key in the layer: {}",
+                    layer.layer_dbg_info()
+                );
+            }
         }
         *self = Self::Loaded { iter };
         Ok(())

From c1cb7a0fa0d0bb6b58aa0f3e0979905476a19225 Mon Sep 17 00:00:00 2001
From: Conrad Ludgate <conrad@neon.tech>
Date: Fri, 23 Aug 2024 18:01:02 +0100
Subject: [PATCH 023/159] proxy: flesh out JWT verification code (#8805)

This change adds in the necessary verification steps for the JWT
payload, and adds per-role querying of JWKs as needed for #8736
---
 proxy/src/auth/backend/jwt.rs | 295 +++++++++++++++++++++++-----------
 1 file changed, 203 insertions(+), 92 deletions(-)

diff --git a/proxy/src/auth/backend/jwt.rs b/proxy/src/auth/backend/jwt.rs
index e021a7e23f..49d5de16c3 100644
--- a/proxy/src/auth/backend/jwt.rs
+++ b/proxy/src/auth/backend/jwt.rs
@@ -1,15 +1,21 @@
-use std::{future::Future, sync::Arc, time::Duration};
+use std::{
+    future::Future,
+    sync::Arc,
+    time::{Duration, SystemTime},
+};
 
 use anyhow::{bail, ensure, Context};
 use arc_swap::ArcSwapOption;
 use dashmap::DashMap;
 use jose_jwk::crypto::KeyInfo;
+use serde::{Deserialize, Deserializer};
 use signature::Verifier;
 use tokio::time::Instant;
 
-use crate::{http::parse_json_body_with_limit, intern::EndpointIdInt};
+use crate::{context::RequestMonitoring, http::parse_json_body_with_limit, EndpointId, RoleName};
 
 // TODO(conrad): make these configurable.
+const CLOCK_SKEW_LEEWAY: Duration = Duration::from_secs(30);
 const MIN_RENEW: Duration = Duration::from_secs(30);
 const AUTO_RENEW: Duration = Duration::from_secs(300);
 const MAX_RENEW: Duration = Duration::from_secs(3600);
@@ -17,30 +23,56 @@ const MAX_JWK_BODY_SIZE: usize = 64 * 1024;
 
 /// How to get the JWT auth rules
 pub trait FetchAuthRules: Clone + Send + Sync + 'static {
-    fn fetch_auth_rules(&self) -> impl Future<Output = anyhow::Result<AuthRules>> + Send;
+    fn fetch_auth_rules(
+        &self,
+        role_name: RoleName,
+    ) -> impl Future<Output = anyhow::Result<Vec<AuthRule>>> + Send;
 }
 
-#[derive(Clone)]
-struct FetchAuthRulesFromCplane {
-    #[allow(dead_code)]
-    endpoint: EndpointIdInt,
-}
-
-impl FetchAuthRules for FetchAuthRulesFromCplane {
-    async fn fetch_auth_rules(&self) -> anyhow::Result<AuthRules> {
-        Err(anyhow::anyhow!("not yet implemented"))
-    }
-}
-
-pub struct AuthRules {
-    jwks_urls: Vec<url::Url>,
+pub struct AuthRule {
+    pub id: String,
+    pub jwks_url: url::Url,
+    pub audience: Option<String>,
 }
 
 #[derive(Default)]
 pub struct JwkCache {
     client: reqwest::Client,
 
-    map: DashMap<EndpointIdInt, Arc<JwkCacheEntryLock>>,
+    map: DashMap<(EndpointId, RoleName), Arc<JwkCacheEntryLock>>,
+}
+
+pub struct JwkCacheEntry {
+    /// Should refetch at least every hour to verify when old keys have been removed.
+    /// Should refetch when new key IDs are seen only every 5 minutes or so
+    last_retrieved: Instant,
+
+    /// cplane will return multiple JWKs urls that we need to scrape.
+    key_sets: ahash::HashMap<String, KeySet>,
+}
+
+impl JwkCacheEntry {
+    fn find_jwk_and_audience(&self, key_id: &str) -> Option<(&jose_jwk::Jwk, Option<&str>)> {
+        self.key_sets.values().find_map(|key_set| {
+            key_set
+                .find_key(key_id)
+                .map(|jwk| (jwk, key_set.audience.as_deref()))
+        })
+    }
+}
+
+struct KeySet {
+    jwks: jose_jwk::JwkSet,
+    audience: Option<String>,
+}
+
+impl KeySet {
+    fn find_key(&self, key_id: &str) -> Option<&jose_jwk::Jwk> {
+        self.jwks
+            .keys
+            .iter()
+            .find(|jwk| jwk.prm.kid.as_deref() == Some(key_id))
+    }
 }
 
 pub struct JwkCacheEntryLock {
@@ -57,15 +89,6 @@ impl Default for JwkCacheEntryLock {
     }
 }
 
-pub struct JwkCacheEntry {
-    /// Should refetch at least every hour to verify when old keys have been removed.
-    /// Should refetch when new key IDs are seen only every 5 minutes or so
-    last_retrieved: Instant,
-
-    /// cplane will return multiple JWKs urls that we need to scrape.
-    key_sets: ahash::HashMap<url::Url, jose_jwk::JwkSet>,
-}
-
 impl JwkCacheEntryLock {
     async fn acquire_permit<'a>(self: &'a Arc<Self>) -> JwkRenewalPermit<'a> {
         JwkRenewalPermit::acquire_permit(self).await
@@ -79,6 +102,7 @@ impl JwkCacheEntryLock {
         &self,
         _permit: JwkRenewalPermit<'_>,
         client: &reqwest::Client,
+        role_name: RoleName,
         auth_rules: &F,
     ) -> anyhow::Result<Arc<JwkCacheEntry>> {
         // double check that no one beat us to updating the cache.
@@ -91,20 +115,19 @@ impl JwkCacheEntryLock {
             }
         }
 
-        let rules = auth_rules.fetch_auth_rules().await?;
-        let mut key_sets = ahash::HashMap::with_capacity_and_hasher(
-            rules.jwks_urls.len(),
-            ahash::RandomState::new(),
-        );
+        let rules = auth_rules.fetch_auth_rules(role_name).await?;
+        let mut key_sets =
+            ahash::HashMap::with_capacity_and_hasher(rules.len(), ahash::RandomState::new());
         // TODO(conrad): run concurrently
         // TODO(conrad): strip the JWKs urls (should be checked by cplane as well - cloud#16284)
-        for url in rules.jwks_urls {
-            let req = client.get(url.clone());
+        for rule in rules {
+            let req = client.get(rule.jwks_url.clone());
             // TODO(conrad): eventually switch to using reqwest_middleware/`new_client_with_timeout`.
+            // TODO(conrad): We need to filter out URLs that point to local resources. Public internet only.
             match req.send().await.and_then(|r| r.error_for_status()) {
                 // todo: should we re-insert JWKs if we want to keep this JWKs URL?
                 // I expect these failures would be quite sparse.
-                Err(e) => tracing::warn!(?url, error=?e, "could not fetch JWKs"),
+                Err(e) => tracing::warn!(url=?rule.jwks_url, error=?e, "could not fetch JWKs"),
                 Ok(r) => {
                     let resp: http::Response<reqwest::Body> = r.into();
                     match parse_json_body_with_limit::<jose_jwk::JwkSet>(
@@ -113,9 +136,17 @@ impl JwkCacheEntryLock {
                     )
                     .await
                     {
-                        Err(e) => tracing::warn!(?url, error=?e, "could not decode JWKs"),
+                        Err(e) => {
+                            tracing::warn!(url=?rule.jwks_url, error=?e, "could not decode JWKs");
+                        }
                         Ok(jwks) => {
-                            key_sets.insert(url, jwks);
+                            key_sets.insert(
+                                rule.id,
+                                KeySet {
+                                    jwks,
+                                    audience: rule.audience,
+                                },
+                            );
                         }
                     }
                 }
@@ -133,7 +164,9 @@ impl JwkCacheEntryLock {
 
     async fn get_or_update_jwk_cache<F: FetchAuthRules>(
         self: &Arc<Self>,
+        ctx: &RequestMonitoring,
         client: &reqwest::Client,
+        role_name: RoleName,
         fetch: &F,
     ) -> Result<Arc<JwkCacheEntry>, anyhow::Error> {
         let now = Instant::now();
@@ -141,18 +174,20 @@ impl JwkCacheEntryLock {
 
         // if we have no cached JWKs, try and get some
         let Some(cached) = guard else {
+            let _paused = ctx.latency_timer_pause(crate::metrics::Waiting::Compute);
             let permit = self.acquire_permit().await;
-            return self.renew_jwks(permit, client, fetch).await;
+            return self.renew_jwks(permit, client, role_name, fetch).await;
         };
 
         let last_update = now.duration_since(cached.last_retrieved);
 
         // check if the cached JWKs need updating.
         if last_update > MAX_RENEW {
+            let _paused = ctx.latency_timer_pause(crate::metrics::Waiting::Compute);
             let permit = self.acquire_permit().await;
 
             // it's been too long since we checked the keys. wait for them to update.
-            return self.renew_jwks(permit, client, fetch).await;
+            return self.renew_jwks(permit, client, role_name, fetch).await;
         }
 
         // every 5 minutes we should spawn a job to eagerly update the token.
@@ -164,7 +199,7 @@ impl JwkCacheEntryLock {
                 let client = client.clone();
                 let fetch = fetch.clone();
                 tokio::spawn(async move {
-                    if let Err(e) = entry.renew_jwks(permit, &client, &fetch).await {
+                    if let Err(e) = entry.renew_jwks(permit, &client, role_name, &fetch).await {
                         tracing::warn!(error=?e, "could not fetch JWKs in background job");
                     }
                 });
@@ -178,8 +213,10 @@ impl JwkCacheEntryLock {
 
     async fn check_jwt<F: FetchAuthRules>(
         self: &Arc<Self>,
-        jwt: String,
+        ctx: &RequestMonitoring,
+        jwt: &str,
         client: &reqwest::Client,
+        role_name: RoleName,
         fetch: &F,
     ) -> Result<(), anyhow::Error> {
         // JWT compact form is defined to be
@@ -189,36 +226,36 @@ impl JwkCacheEntryLock {
         let (header_payload, signature) = jwt
             .rsplit_once(".")
             .context("Provided authentication token is not a valid JWT encoding")?;
-        let (header, _payload) = header_payload
+        let (header, payload) = header_payload
             .split_once(".")
             .context("Provided authentication token is not a valid JWT encoding")?;
 
         let header = base64::decode_config(header, base64::URL_SAFE_NO_PAD)
             .context("Provided authentication token is not a valid JWT encoding")?;
-        let header = serde_json::from_slice::<JWTHeader<'_>>(&header)
+        let header = serde_json::from_slice::<JwtHeader<'_>>(&header)
             .context("Provided authentication token is not a valid JWT encoding")?;
 
         let sig = base64::decode_config(signature, base64::URL_SAFE_NO_PAD)
             .context("Provided authentication token is not a valid JWT encoding")?;
 
         ensure!(header.typ == "JWT");
-        let kid = header.kid.context("missing key id")?;
+        let kid = header.key_id.context("missing key id")?;
 
-        let mut guard = self.get_or_update_jwk_cache(client, fetch).await?;
+        let mut guard = self
+            .get_or_update_jwk_cache(ctx, client, role_name.clone(), fetch)
+            .await?;
 
         // get the key from the JWKs if possible. If not, wait for the keys to update.
-        let jwk = loop {
-            let jwk = guard
-                .key_sets
-                .values()
-                .flat_map(|jwks| &jwks.keys)
-                .find(|jwk| jwk.prm.kid.as_deref() == Some(kid));
-
-            match jwk {
+        let (jwk, expected_audience) = loop {
+            match guard.find_jwk_and_audience(kid) {
                 Some(jwk) => break jwk,
                 None if guard.last_retrieved.elapsed() > MIN_RENEW => {
+                    let _paused = ctx.latency_timer_pause(crate::metrics::Waiting::Compute);
+
                     let permit = self.acquire_permit().await;
-                    guard = self.renew_jwks(permit, client, fetch).await?;
+                    guard = self
+                        .renew_jwks(permit, client, role_name.clone(), fetch)
+                        .await?;
                 }
                 _ => {
                     bail!("jwk not found");
@@ -227,7 +264,7 @@ impl JwkCacheEntryLock {
         };
 
         ensure!(
-            jwk.is_supported(&header.alg),
+            jwk.is_supported(&header.algorithm),
             "signature algorithm not supported"
         );
 
@@ -241,31 +278,60 @@ impl JwkCacheEntryLock {
             key => bail!("unsupported key type {key:?}"),
         };
 
-        // TODO(conrad): verify iss, exp, nbf, etc...
+        let payload = base64::decode_config(payload, base64::URL_SAFE_NO_PAD)
+            .context("Provided authentication token is not a valid JWT encoding")?;
+        let payload = serde_json::from_slice::<JwtPayload<'_>>(&payload)
+            .context("Provided authentication token is not a valid JWT encoding")?;
+
+        tracing::debug!(?payload, "JWT signature valid with claims");
+
+        match (expected_audience, payload.audience) {
+            // check the audience matches
+            (Some(aud1), Some(aud2)) => ensure!(aud1 == aud2, "invalid JWT token audience"),
+            // the audience is expected but is missing
+            (Some(_), None) => bail!("invalid JWT token audience"),
+            // we don't care for the audience field
+            (None, _) => {}
+        }
+
+        let now = SystemTime::now();
+
+        if let Some(exp) = payload.expiration {
+            ensure!(now < exp + CLOCK_SKEW_LEEWAY);
+        }
+
+        if let Some(nbf) = payload.not_before {
+            ensure!(nbf < now + CLOCK_SKEW_LEEWAY);
+        }
 
         Ok(())
     }
 }
 
 impl JwkCache {
-    pub async fn check_jwt(
+    pub async fn check_jwt<F: FetchAuthRules>(
         &self,
-        endpoint: EndpointIdInt,
-        jwt: String,
+        ctx: &RequestMonitoring,
+        endpoint: EndpointId,
+        role_name: RoleName,
+        fetch: &F,
+        jwt: &str,
     ) -> Result<(), anyhow::Error> {
         // try with just a read lock first
-        let entry = self.map.get(&endpoint).as_deref().map(Arc::clone);
+        let key = (endpoint, role_name.clone());
+        let entry = self.map.get(&key).as_deref().map(Arc::clone);
         let entry = match entry {
             Some(entry) => entry,
             None => {
                 // acquire a write lock after to insert.
-                let entry = self.map.entry(endpoint).or_default();
+                let entry = self.map.entry(key).or_default();
                 Arc::clone(&*entry)
             }
         };
 
-        let fetch = FetchAuthRulesFromCplane { endpoint };
-        entry.check_jwt(jwt, &self.client, &fetch).await
+        entry
+            .check_jwt(ctx, jwt, &self.client, role_name, fetch)
+            .await
     }
 }
 
@@ -315,13 +381,49 @@ fn verify_rsa_signature(
 
 /// <https://datatracker.ietf.org/doc/html/rfc7515#section-4.1>
 #[derive(serde::Deserialize, serde::Serialize)]
-struct JWTHeader<'a> {
+struct JwtHeader<'a> {
     /// must be "JWT"
+    #[serde(rename = "typ")]
     typ: &'a str,
     /// must be a supported alg
-    alg: jose_jwa::Algorithm,
+    #[serde(rename = "alg")]
+    algorithm: jose_jwa::Algorithm,
     /// key id, must be provided for our usecase
-    kid: Option<&'a str>,
+    #[serde(rename = "kid")]
+    key_id: Option<&'a str>,
+}
+
+/// <https://datatracker.ietf.org/doc/html/rfc7519#section-4.1>
+#[derive(serde::Deserialize, serde::Serialize, Debug)]
+struct JwtPayload<'a> {
+    /// Audience - Recipient for which the JWT is intended
+    #[serde(rename = "aud")]
+    audience: Option<&'a str>,
+    /// Expiration - Time after which the JWT expires
+    #[serde(deserialize_with = "numeric_date_opt", rename = "exp", default)]
+    expiration: Option<SystemTime>,
+    /// Not before - Time after which the JWT expires
+    #[serde(deserialize_with = "numeric_date_opt", rename = "nbf", default)]
+    not_before: Option<SystemTime>,
+
+    // the following entries are only extracted for the sake of debug logging.
+    /// Issuer of the JWT
+    #[serde(rename = "iss")]
+    issuer: Option<&'a str>,
+    /// Subject of the JWT (the user)
+    #[serde(rename = "sub")]
+    subject: Option<&'a str>,
+    /// Unique token identifier
+    #[serde(rename = "jti")]
+    jwt_id: Option<&'a str>,
+    /// Unique session identifier
+    #[serde(rename = "sid")]
+    session_id: Option<&'a str>,
+}
+
+fn numeric_date_opt<'de, D: Deserializer<'de>>(d: D) -> Result<Option<SystemTime>, D::Error> {
+    let d = <Option<u64>>::deserialize(d)?;
+    Ok(d.map(|n| SystemTime::UNIX_EPOCH + Duration::from_secs(n)))
 }
 
 struct JwkRenewalPermit<'a> {
@@ -388,6 +490,8 @@ impl Drop for JwkRenewalPermit<'_> {
 
 #[cfg(test)]
 mod tests {
+    use crate::RoleName;
+
     use super::*;
 
     use std::{future::IntoFuture, net::SocketAddr, time::SystemTime};
@@ -431,10 +535,10 @@ mod tests {
     }
 
     fn build_jwt_payload(kid: String, sig: jose_jwa::Signing) -> String {
-        let header = JWTHeader {
+        let header = JwtHeader {
             typ: "JWT",
-            alg: jose_jwa::Algorithm::Signing(sig),
-            kid: Some(&kid),
+            algorithm: jose_jwa::Algorithm::Signing(sig),
+            key_id: Some(&kid),
         };
         let body = typed_json::json! {{
             "exp": SystemTime::now().duration_since(SystemTime::UNIX_EPOCH).unwrap().as_secs() + 3600,
@@ -524,33 +628,40 @@ mod tests {
         struct Fetch(SocketAddr);
 
         impl FetchAuthRules for Fetch {
-            async fn fetch_auth_rules(&self) -> anyhow::Result<AuthRules> {
-                Ok(AuthRules {
-                    jwks_urls: vec![
-                        format!("http://{}/foo", self.0).parse().unwrap(),
-                        format!("http://{}/bar", self.0).parse().unwrap(),
-                    ],
-                })
+            async fn fetch_auth_rules(
+                &self,
+                _role_name: RoleName,
+            ) -> anyhow::Result<Vec<AuthRule>> {
+                Ok(vec![
+                    AuthRule {
+                        id: "foo".to_owned(),
+                        jwks_url: format!("http://{}/foo", self.0).parse().unwrap(),
+                        audience: None,
+                    },
+                    AuthRule {
+                        id: "bar".to_owned(),
+                        jwks_url: format!("http://{}/bar", self.0).parse().unwrap(),
+                        audience: None,
+                    },
+                ])
             }
         }
 
+        let role_name = RoleName::from("user");
+
         let jwk_cache = Arc::new(JwkCacheEntryLock::default());
 
-        jwk_cache
-            .check_jwt(jwt1, &client, &Fetch(addr))
-            .await
-            .unwrap();
-        jwk_cache
-            .check_jwt(jwt2, &client, &Fetch(addr))
-            .await
-            .unwrap();
-        jwk_cache
-            .check_jwt(jwt3, &client, &Fetch(addr))
-            .await
-            .unwrap();
-        jwk_cache
-            .check_jwt(jwt4, &client, &Fetch(addr))
-            .await
-            .unwrap();
+        for token in [jwt1, jwt2, jwt3, jwt4] {
+            jwk_cache
+                .check_jwt(
+                    &RequestMonitoring::test(),
+                    &token,
+                    &client,
+                    role_name.clone(),
+                    &Fetch(addr),
+                )
+                .await
+                .unwrap();
+        }
     }
 }

From b65a95f12ef958a509e30f0d650a820b4e2e8c58 Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Fri, 23 Aug 2024 18:32:56 +0100
Subject: [PATCH 024/159] controller: use PageserverUtilization for scheduling
 (#8711)

## Problem

Previously, the controller only used the shard counts for scheduling.
This works well when hosting only many-sharded tenants, but works much
less well when hosting single-sharded tenants that have a greater
deviation in size-per-shard.

Closes: https://github.com/neondatabase/neon/issues/7798

## Summary of changes

- Instead of UtilizationScore, carry the full PageserverUtilization
through into the Scheduler.
- Use the PageserverUtilization::score() instead of shard count when
ordering nodes in scheduling.

Q: Why did test_sharding_split_smoke need updating in this PR?
A: There's an interesting side effect during shard splits: because we do
not decrement the shard count in the utilization when we de-schedule the
shards from before the split, the controller will now prefer to pick
_different_ nodes for shards compared with which ones held secondaries
before the split. We could use our knowledge of splitting to fix up the
utilizations more actively in this situation, but I'm leaning toward
leaving the code simpler, as in practical systems the impact of one
shard on the utilization of a node should be fairly low (single digit
%).
---
 libs/pageserver_api/src/controller_api.rs     |  21 +-
 libs/pageserver_api/src/models/utilization.rs |  67 +++++-
 pageserver/src/metrics.rs                     |   8 +
 pageserver/src/tenant.rs                      |  18 +-
 pageserver/src/utilization.rs                 |  10 +-
 storage_controller/src/heartbeater.rs         |  10 +-
 storage_controller/src/node.rs                |  24 +-
 storage_controller/src/scheduler.rs           | 225 +++++++++++++++---
 storage_controller/src/service.rs             |  39 +--
 storage_controller/src/tenant_shard.rs        |   4 +-
 test_runner/regress/test_sharding.py          |  15 +-
 11 files changed, 340 insertions(+), 101 deletions(-)

diff --git a/libs/pageserver_api/src/controller_api.rs b/libs/pageserver_api/src/controller_api.rs
index a50707a1b8..a9a57d77ce 100644
--- a/libs/pageserver_api/src/controller_api.rs
+++ b/libs/pageserver_api/src/controller_api.rs
@@ -8,6 +8,7 @@ use std::time::{Duration, Instant};
 use serde::{Deserialize, Serialize};
 use utils::id::{NodeId, TenantId};
 
+use crate::models::PageserverUtilization;
 use crate::{
     models::{ShardParameters, TenantConfig},
     shard::{ShardStripeSize, TenantShardId},
@@ -140,23 +141,11 @@ pub struct TenantShardMigrateRequest {
     pub node_id: NodeId,
 }
 
-/// Utilisation score indicating how good a candidate a pageserver
-/// is for scheduling the next tenant. See [`crate::models::PageserverUtilization`].
-/// Lower values are better.
-#[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq, PartialOrd, Ord, Debug)]
-pub struct UtilizationScore(pub u64);
-
-impl UtilizationScore {
-    pub fn worst() -> Self {
-        UtilizationScore(u64::MAX)
-    }
-}
-
-#[derive(Serialize, Clone, Copy, Debug)]
+#[derive(Serialize, Clone, Debug)]
 #[serde(into = "NodeAvailabilityWrapper")]
 pub enum NodeAvailability {
     // Normal, happy state
-    Active(UtilizationScore),
+    Active(PageserverUtilization),
     // Node is warming up, but we expect it to become available soon. Covers
     // the time span between the re-attach response being composed on the storage controller
     // and the first successful heartbeat after the processing of the re-attach response
@@ -195,7 +184,9 @@ impl From<NodeAvailabilityWrapper> for NodeAvailability {
         match val {
             // Assume the worst utilisation score to begin with. It will later be updated by
             // the heartbeats.
-            NodeAvailabilityWrapper::Active => NodeAvailability::Active(UtilizationScore::worst()),
+            NodeAvailabilityWrapper::Active => {
+                NodeAvailability::Active(PageserverUtilization::full())
+            }
             NodeAvailabilityWrapper::WarmingUp => NodeAvailability::WarmingUp(Instant::now()),
             NodeAvailabilityWrapper::Offline => NodeAvailability::Offline,
         }
diff --git a/libs/pageserver_api/src/models/utilization.rs b/libs/pageserver_api/src/models/utilization.rs
index 0fec221276..844a0cda5d 100644
--- a/libs/pageserver_api/src/models/utilization.rs
+++ b/libs/pageserver_api/src/models/utilization.rs
@@ -38,7 +38,7 @@ pub struct PageserverUtilization {
     pub max_shard_count: u32,
 
     /// Cached result of [`Self::score`]
-    pub utilization_score: u64,
+    pub utilization_score: Option<u64>,
 
     /// When was this snapshot captured, pageserver local time.
     ///
@@ -50,6 +50,8 @@ fn unity_percent() -> Percent {
     Percent::new(0).unwrap()
 }
 
+pub type RawScore = u64;
+
 impl PageserverUtilization {
     const UTILIZATION_FULL: u64 = 1000000;
 
@@ -62,7 +64,7 @@ impl PageserverUtilization {
     /// - Negative values are forbidden
     /// - Values over UTILIZATION_FULL indicate an overloaded node, which may show degraded performance due to
     ///   layer eviction.
-    pub fn score(&self) -> u64 {
+    pub fn score(&self) -> RawScore {
         let disk_usable_capacity = ((self.disk_usage_bytes + self.free_space_bytes)
             * self.disk_usable_pct.get() as u64)
             / 100;
@@ -74,8 +76,30 @@ impl PageserverUtilization {
         std::cmp::max(disk_utilization_score, shard_utilization_score)
     }
 
-    pub fn refresh_score(&mut self) {
-        self.utilization_score = self.score();
+    pub fn cached_score(&mut self) -> RawScore {
+        match self.utilization_score {
+            None => {
+                let s = self.score();
+                self.utilization_score = Some(s);
+                s
+            }
+            Some(s) => s,
+        }
+    }
+
+    /// If a node is currently hosting more work than it can comfortably handle.  This does not indicate that
+    /// it will fail, but it is a strong signal that more work should not be added unless there is no alternative.
+    pub fn is_overloaded(score: RawScore) -> bool {
+        score >= Self::UTILIZATION_FULL
+    }
+
+    pub fn adjust_shard_count_max(&mut self, shard_count: u32) {
+        if self.shard_count < shard_count {
+            self.shard_count = shard_count;
+
+            // Dirty cache: this will be calculated next time someone retrives the score
+            self.utilization_score = None;
+        }
     }
 
     /// A utilization structure that has a full utilization score: use this as a placeholder when
@@ -88,7 +112,38 @@ impl PageserverUtilization {
             disk_usable_pct: Percent::new(100).unwrap(),
             shard_count: 1,
             max_shard_count: 1,
-            utilization_score: Self::UTILIZATION_FULL,
+            utilization_score: Some(Self::UTILIZATION_FULL),
+            captured_at: serde_system_time::SystemTime(SystemTime::now()),
+        }
+    }
+}
+
+/// Test helper
+pub mod test_utilization {
+    use super::PageserverUtilization;
+    use std::time::SystemTime;
+    use utils::{
+        serde_percent::Percent,
+        serde_system_time::{self},
+    };
+
+    // Parameters of the imaginary node used for test utilization instances
+    const TEST_DISK_SIZE: u64 = 1024 * 1024 * 1024 * 1024;
+    const TEST_SHARDS_MAX: u32 = 1000;
+
+    /// Unit test helper.  Unconditionally compiled because cfg(test) doesn't carry across crates.  Do
+    /// not abuse this function from non-test code.
+    ///
+    /// Emulates a node with a 1000 shard limit and a 1TB disk.
+    pub fn simple(shard_count: u32, disk_wanted_bytes: u64) -> PageserverUtilization {
+        PageserverUtilization {
+            disk_usage_bytes: disk_wanted_bytes,
+            free_space_bytes: TEST_DISK_SIZE - std::cmp::min(disk_wanted_bytes, TEST_DISK_SIZE),
+            disk_wanted_bytes,
+            disk_usable_pct: Percent::new(100).unwrap(),
+            shard_count,
+            max_shard_count: TEST_SHARDS_MAX,
+            utilization_score: None,
             captured_at: serde_system_time::SystemTime(SystemTime::now()),
         }
     }
@@ -120,7 +175,7 @@ mod tests {
             disk_usage_bytes: u64::MAX,
             free_space_bytes: 0,
             disk_wanted_bytes: u64::MAX,
-            utilization_score: 13,
+            utilization_score: Some(13),
             disk_usable_pct: Percent::new(90).unwrap(),
             shard_count: 100,
             max_shard_count: 200,
diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs
index 0a1a22b6e8..1f8634df93 100644
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -1803,6 +1803,14 @@ pub(crate) static SECONDARY_RESIDENT_PHYSICAL_SIZE: Lazy<UIntGaugeVec> = Lazy::n
     .expect("failed to define a metric")
 });
 
+pub(crate) static NODE_UTILIZATION_SCORE: Lazy<UIntGauge> = Lazy::new(|| {
+    register_uint_gauge!(
+        "pageserver_utilization_score",
+        "The utilization score we report to the storage controller for scheduling, where 0 is empty, 1000000 is full, and anything above is considered overloaded",
+    )
+    .expect("failed to define a metric")
+});
+
 pub(crate) static SECONDARY_HEATMAP_TOTAL_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
     register_uint_gauge_vec!(
         "pageserver_secondary_heatmap_total_size",
diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index 2e19a46ac8..3a7afff211 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -3741,13 +3741,21 @@ impl Tenant {
     /// less than this (via eviction and on-demand downloads), but this function enables
     /// the Tenant to advertise how much storage it would prefer to have to provide fast I/O
     /// by keeping important things on local disk.
+    ///
+    /// This is a heuristic, not a guarantee: tenants that are long-idle will actually use less
+    /// than they report here, due to layer eviction.  Tenants with many active branches may
+    /// actually use more than they report here.
     pub(crate) fn local_storage_wanted(&self) -> u64 {
-        let mut wanted = 0;
         let timelines = self.timelines.lock().unwrap();
-        for timeline in timelines.values() {
-            wanted += timeline.metrics.visible_physical_size_gauge.get();
-        }
-        wanted
+
+        // Heuristic: we use the max() of the timelines' visible sizes, rather than the sum.  This
+        // reflects the observation that on tenants with multiple large branches, typically only one
+        // of them is used actively enough to occupy space on disk.
+        timelines
+            .values()
+            .map(|t| t.metrics.visible_physical_size_gauge.get())
+            .max()
+            .unwrap_or(0)
     }
 }
 
diff --git a/pageserver/src/utilization.rs b/pageserver/src/utilization.rs
index 3c48c84598..a0223f3bce 100644
--- a/pageserver/src/utilization.rs
+++ b/pageserver/src/utilization.rs
@@ -9,7 +9,7 @@ use utils::serde_percent::Percent;
 
 use pageserver_api::models::PageserverUtilization;
 
-use crate::{config::PageServerConf, tenant::mgr::TenantManager};
+use crate::{config::PageServerConf, metrics::NODE_UTILIZATION_SCORE, tenant::mgr::TenantManager};
 
 pub(crate) fn regenerate(
     conf: &PageServerConf,
@@ -58,13 +58,13 @@ pub(crate) fn regenerate(
         disk_usable_pct,
         shard_count,
         max_shard_count: MAX_SHARDS,
-        utilization_score: 0,
+        utilization_score: None,
         captured_at: utils::serde_system_time::SystemTime(captured_at),
     };
 
-    doc.refresh_score();
-
-    // TODO: make utilization_score into a metric
+    // Initialize `PageserverUtilization::utilization_score`
+    let score = doc.cached_score();
+    NODE_UTILIZATION_SCORE.set(score);
 
     Ok(doc)
 }
diff --git a/storage_controller/src/heartbeater.rs b/storage_controller/src/heartbeater.rs
index c0e27bafdb..b7e66d33eb 100644
--- a/storage_controller/src/heartbeater.rs
+++ b/storage_controller/src/heartbeater.rs
@@ -6,10 +6,7 @@ use std::{
 };
 use tokio_util::sync::CancellationToken;
 
-use pageserver_api::{
-    controller_api::{NodeAvailability, UtilizationScore},
-    models::PageserverUtilization,
-};
+use pageserver_api::{controller_api::NodeAvailability, models::PageserverUtilization};
 
 use thiserror::Error;
 use utils::id::NodeId;
@@ -147,7 +144,8 @@ impl HeartbeaterTask {
                 // goes through to the pageserver even when the node is marked offline.
                 // This doesn't impact the availability observed by [`crate::service::Service`].
                 let mut node_clone = node.clone();
-                node_clone.set_availability(NodeAvailability::Active(UtilizationScore::worst()));
+                node_clone
+                    .set_availability(NodeAvailability::Active(PageserverUtilization::full()));
 
                 async move {
                     let response = node_clone
@@ -179,7 +177,7 @@ impl HeartbeaterTask {
                         node.get_availability()
                     {
                         PageserverState::WarmingUp {
-                            started_at: last_seen_at,
+                            started_at: *last_seen_at,
                         }
                     } else {
                         PageserverState::Offline
diff --git a/storage_controller/src/node.rs b/storage_controller/src/node.rs
index ea765ca123..61a44daca9 100644
--- a/storage_controller/src/node.rs
+++ b/storage_controller/src/node.rs
@@ -92,15 +92,15 @@ impl Node {
         }
     }
 
-    pub(crate) fn get_availability(&self) -> NodeAvailability {
-        self.availability
+    pub(crate) fn get_availability(&self) -> &NodeAvailability {
+        &self.availability
     }
 
     pub(crate) fn set_availability(&mut self, availability: NodeAvailability) {
         use AvailabilityTransition::*;
         use NodeAvailability::WarmingUp;
 
-        match self.get_availability_transition(availability) {
+        match self.get_availability_transition(&availability) {
             ToActive => {
                 // Give the node a new cancellation token, effectively resetting it to un-cancelled.  Any
                 // users of previously-cloned copies of the node will still see the old cancellation
@@ -115,8 +115,8 @@ impl Node {
             Unchanged | ToWarmingUpFromOffline => {}
         }
 
-        if let (WarmingUp(crnt), WarmingUp(proposed)) = (self.availability, availability) {
-            self.availability = WarmingUp(std::cmp::max(crnt, proposed));
+        if let (WarmingUp(crnt), WarmingUp(proposed)) = (&self.availability, &availability) {
+            self.availability = WarmingUp(std::cmp::max(*crnt, *proposed));
         } else {
             self.availability = availability;
         }
@@ -126,12 +126,12 @@ impl Node {
     /// into a description of the transition.
     pub(crate) fn get_availability_transition(
         &self,
-        availability: NodeAvailability,
+        availability: &NodeAvailability,
     ) -> AvailabilityTransition {
         use AvailabilityTransition::*;
         use NodeAvailability::*;
 
-        match (self.availability, availability) {
+        match (&self.availability, availability) {
             (Offline, Active(_)) => ToActive,
             (Active(_), Offline) => ToOffline,
             (Active(_), WarmingUp(_)) => ToWarmingUpFromActive,
@@ -153,15 +153,15 @@ impl Node {
 
     /// Is this node elegible to have work scheduled onto it?
     pub(crate) fn may_schedule(&self) -> MaySchedule {
-        let score = match self.availability {
-            NodeAvailability::Active(score) => score,
+        let utilization = match &self.availability {
+            NodeAvailability::Active(u) => u.clone(),
             NodeAvailability::Offline | NodeAvailability::WarmingUp(_) => return MaySchedule::No,
         };
 
         match self.scheduling {
-            NodeSchedulingPolicy::Active => MaySchedule::Yes(score),
+            NodeSchedulingPolicy::Active => MaySchedule::Yes(utilization),
             NodeSchedulingPolicy::Draining => MaySchedule::No,
-            NodeSchedulingPolicy::Filling => MaySchedule::Yes(score),
+            NodeSchedulingPolicy::Filling => MaySchedule::Yes(utilization),
             NodeSchedulingPolicy::Pause => MaySchedule::No,
             NodeSchedulingPolicy::PauseForRestart => MaySchedule::No,
         }
@@ -285,7 +285,7 @@ impl Node {
     pub(crate) fn describe(&self) -> NodeDescribeResponse {
         NodeDescribeResponse {
             id: self.id,
-            availability: self.availability.into(),
+            availability: self.availability.clone().into(),
             scheduling: self.scheduling,
             listen_http_addr: self.listen_http_addr.clone(),
             listen_http_port: self.listen_http_port,
diff --git a/storage_controller/src/scheduler.rs b/storage_controller/src/scheduler.rs
index 843159010d..060e3cc6ca 100644
--- a/storage_controller/src/scheduler.rs
+++ b/storage_controller/src/scheduler.rs
@@ -1,6 +1,6 @@
 use crate::{node::Node, tenant_shard::TenantShard};
 use itertools::Itertools;
-use pageserver_api::controller_api::UtilizationScore;
+use pageserver_api::models::PageserverUtilization;
 use serde::Serialize;
 use std::collections::HashMap;
 use utils::{http::error::ApiError, id::NodeId};
@@ -20,9 +20,9 @@ impl From<ScheduleError> for ApiError {
     }
 }
 
-#[derive(Serialize, Eq, PartialEq)]
+#[derive(Serialize)]
 pub enum MaySchedule {
-    Yes(UtilizationScore),
+    Yes(PageserverUtilization),
     No,
 }
 
@@ -282,6 +282,28 @@ impl Scheduler {
                 node.shard_count -= 1;
             }
         }
+
+        // Maybe update PageserverUtilization
+        match update {
+            RefCountUpdate::AddSecondary | RefCountUpdate::Attach => {
+                // Referencing the node: if this takes our shard_count above the utilzation structure's
+                // shard count, then artifically bump it: this ensures that the scheduler immediately
+                // recognizes that this node has more work on it, without waiting for the next heartbeat
+                // to update the utilization.
+                if let MaySchedule::Yes(utilization) = &mut node.may_schedule {
+                    utilization.adjust_shard_count_max(node.shard_count as u32);
+                }
+            }
+            RefCountUpdate::PromoteSecondary
+            | RefCountUpdate::Detach
+            | RefCountUpdate::RemoveSecondary
+            | RefCountUpdate::DemoteAttached => {
+                // De-referencing the node: leave the utilization's shard_count at a stale higher
+                // value until some future heartbeat after we have physically removed this shard
+                // from the node: this prevents the scheduler over-optimistically trying to schedule
+                // more work onto the node before earlier detaches are done.
+            }
+        }
     }
 
     // Check if the number of shards attached to a given node is lagging below
@@ -326,7 +348,18 @@ impl Scheduler {
         use std::collections::hash_map::Entry::*;
         match self.nodes.entry(node.get_id()) {
             Occupied(mut entry) => {
-                entry.get_mut().may_schedule = node.may_schedule();
+                // Updates to MaySchedule are how we receive updated PageserverUtilization: adjust these values
+                // to account for any shards scheduled on the controller but not yet visible to the pageserver.
+                let mut may_schedule = node.may_schedule();
+                match &mut may_schedule {
+                    MaySchedule::Yes(utilization) => {
+                        utilization.adjust_shard_count_max(entry.get().shard_count as u32);
+                    }
+                    MaySchedule::No => { // Nothing to tweak
+                    }
+                }
+
+                entry.get_mut().may_schedule = may_schedule;
             }
             Vacant(entry) => {
                 entry.insert(SchedulerNode {
@@ -363,7 +396,7 @@ impl Scheduler {
                 let may_schedule = self
                     .nodes
                     .get(node_id)
-                    .map(|n| n.may_schedule != MaySchedule::No)
+                    .map(|n| !matches!(n.may_schedule, MaySchedule::No))
                     .unwrap_or(false);
                 (*node_id, may_schedule)
             })
@@ -383,7 +416,7 @@ impl Scheduler {
     /// the same tenant on the same node.  This is a soft constraint: the context will never
     /// cause us to fail to schedule a shard.
     pub(crate) fn schedule_shard(
-        &self,
+        &mut self,
         hard_exclude: &[NodeId],
         context: &ScheduleContext,
     ) -> Result<NodeId, ScheduleError> {
@@ -391,31 +424,41 @@ impl Scheduler {
             return Err(ScheduleError::NoPageservers);
         }
 
-        let mut scores: Vec<(NodeId, AffinityScore, usize, usize)> = self
+        let mut scores: Vec<(NodeId, AffinityScore, u64, usize)> = self
             .nodes
-            .iter()
-            .filter_map(|(k, v)| {
-                if hard_exclude.contains(k) || v.may_schedule == MaySchedule::No {
-                    None
-                } else {
-                    Some((
-                        *k,
-                        context.nodes.get(k).copied().unwrap_or(AffinityScore::FREE),
-                        v.shard_count,
-                        v.attached_shard_count,
-                    ))
-                }
+            .iter_mut()
+            .filter_map(|(k, v)| match &mut v.may_schedule {
+                MaySchedule::No => None,
+                MaySchedule::Yes(_) if hard_exclude.contains(k) => None,
+                MaySchedule::Yes(utilization) => Some((
+                    *k,
+                    context.nodes.get(k).copied().unwrap_or(AffinityScore::FREE),
+                    utilization.cached_score(),
+                    v.attached_shard_count,
+                )),
             })
             .collect();
 
+        // Exclude nodes whose utilization is critically high, if there are alternatives available.  This will
+        // cause us to violate affinity rules if it is necessary to avoid critically overloading nodes: for example
+        // we may place shards in the same tenant together on the same pageserver if all other pageservers are
+        // overloaded.
+        let non_overloaded_scores = scores
+            .iter()
+            .filter(|i| !PageserverUtilization::is_overloaded(i.2))
+            .copied()
+            .collect::<Vec<_>>();
+        if !non_overloaded_scores.is_empty() {
+            scores = non_overloaded_scores;
+        }
+
         // Sort by, in order of precedence:
         //  1st: Affinity score.  We should never pick a higher-score node if a lower-score node is available
-        //  2nd: Attached shard count.  Within nodes with the same affinity, we always pick the node with
-        //  the least number of attached shards.
-        //  3rd: Total shard count.  Within nodes with the same affinity and attached shard count, use nodes
-        //  with the lower total shard count.
+        //  2nd: Utilization score (this combines shard count and disk utilization)
+        //  3rd: Attached shard count.  When nodes have identical utilization (e.g. when populating some
+        //       empty nodes), this acts as an anti-affinity between attached shards.
         //  4th: Node ID.  This is a convenience to make selection deterministic in tests and empty systems.
-        scores.sort_by_key(|i| (i.1, i.3, i.2, i.0));
+        scores.sort_by_key(|i| (i.1, i.2, i.3, i.0));
 
         if scores.is_empty() {
             // After applying constraints, no pageservers were left.
@@ -429,7 +472,7 @@ impl Scheduler {
                 for (node_id, node) in &self.nodes {
                     tracing::info!(
                         "Node {node_id}: may_schedule={} shards={}",
-                        node.may_schedule != MaySchedule::No,
+                        !matches!(node.may_schedule, MaySchedule::No),
                         node.shard_count
                     );
                 }
@@ -469,7 +512,7 @@ impl Scheduler {
 pub(crate) mod test_utils {
 
     use crate::node::Node;
-    use pageserver_api::controller_api::{NodeAvailability, UtilizationScore};
+    use pageserver_api::{controller_api::NodeAvailability, models::utilization::test_utilization};
     use std::collections::HashMap;
     use utils::id::NodeId;
     /// Test helper: synthesize the requested number of nodes, all in active state.
@@ -486,7 +529,7 @@ pub(crate) mod test_utils {
                         format!("pghost-{i}"),
                         5432 + i as u16,
                     );
-                    node.set_availability(NodeAvailability::Active(UtilizationScore::worst()));
+                    node.set_availability(NodeAvailability::Active(test_utilization::simple(0, 0)));
                     assert!(node.is_available());
                     node
                 })
@@ -497,6 +540,8 @@ pub(crate) mod test_utils {
 
 #[cfg(test)]
 mod tests {
+    use pageserver_api::{controller_api::NodeAvailability, models::utilization::test_utilization};
+
     use super::*;
 
     use crate::tenant_shard::IntentState;
@@ -557,4 +602,130 @@ mod tests {
 
         Ok(())
     }
+
+    #[test]
+    /// Test the PageserverUtilization's contribution to scheduling algorithm
+    fn scheduler_utilization() {
+        let mut nodes = test_utils::make_test_nodes(3);
+        let mut scheduler = Scheduler::new(nodes.values());
+
+        // Need to keep these alive because they contribute to shard counts via RAII
+        let mut scheduled_intents = Vec::new();
+
+        let empty_context = ScheduleContext::default();
+
+        fn assert_scheduler_chooses(
+            expect_node: NodeId,
+            scheduled_intents: &mut Vec<IntentState>,
+            scheduler: &mut Scheduler,
+            context: &ScheduleContext,
+        ) {
+            let scheduled = scheduler.schedule_shard(&[], context).unwrap();
+            let mut intent = IntentState::new();
+            intent.set_attached(scheduler, Some(scheduled));
+            scheduled_intents.push(intent);
+            assert_eq!(scheduled, expect_node);
+        }
+
+        // Independent schedule calls onto empty nodes should round-robin, because each node's
+        // utilization's shard count is updated inline.  The order is determinsitic because when all other factors are
+        // equal, we order by node ID.
+        assert_scheduler_chooses(
+            NodeId(1),
+            &mut scheduled_intents,
+            &mut scheduler,
+            &empty_context,
+        );
+        assert_scheduler_chooses(
+            NodeId(2),
+            &mut scheduled_intents,
+            &mut scheduler,
+            &empty_context,
+        );
+        assert_scheduler_chooses(
+            NodeId(3),
+            &mut scheduled_intents,
+            &mut scheduler,
+            &empty_context,
+        );
+
+        // Manually setting utilization higher should cause schedule calls to round-robin the other nodes
+        // which have equal utilization.
+        nodes
+            .get_mut(&NodeId(1))
+            .unwrap()
+            .set_availability(NodeAvailability::Active(test_utilization::simple(
+                10,
+                1024 * 1024 * 1024,
+            )));
+        scheduler.node_upsert(nodes.get(&NodeId(1)).unwrap());
+
+        assert_scheduler_chooses(
+            NodeId(2),
+            &mut scheduled_intents,
+            &mut scheduler,
+            &empty_context,
+        );
+        assert_scheduler_chooses(
+            NodeId(3),
+            &mut scheduled_intents,
+            &mut scheduler,
+            &empty_context,
+        );
+        assert_scheduler_chooses(
+            NodeId(2),
+            &mut scheduled_intents,
+            &mut scheduler,
+            &empty_context,
+        );
+        assert_scheduler_chooses(
+            NodeId(3),
+            &mut scheduled_intents,
+            &mut scheduler,
+            &empty_context,
+        );
+
+        // The scheduler should prefer nodes with lower affinity score,
+        // even if they have higher utilization (as long as they aren't utilized at >100%)
+        let mut context_prefer_node1 = ScheduleContext::default();
+        context_prefer_node1.avoid(&[NodeId(2), NodeId(3)]);
+        assert_scheduler_chooses(
+            NodeId(1),
+            &mut scheduled_intents,
+            &mut scheduler,
+            &context_prefer_node1,
+        );
+        assert_scheduler_chooses(
+            NodeId(1),
+            &mut scheduled_intents,
+            &mut scheduler,
+            &context_prefer_node1,
+        );
+
+        // If a node is over-utilized, it will not be used even if affinity scores prefer it
+        nodes
+            .get_mut(&NodeId(1))
+            .unwrap()
+            .set_availability(NodeAvailability::Active(test_utilization::simple(
+                20000,
+                1024 * 1024 * 1024,
+            )));
+        scheduler.node_upsert(nodes.get(&NodeId(1)).unwrap());
+        assert_scheduler_chooses(
+            NodeId(2),
+            &mut scheduled_intents,
+            &mut scheduler,
+            &context_prefer_node1,
+        );
+        assert_scheduler_chooses(
+            NodeId(3),
+            &mut scheduled_intents,
+            &mut scheduler,
+            &context_prefer_node1,
+        );
+
+        for mut intent in scheduled_intents {
+            intent.clear(&mut scheduler);
+        }
+    }
 }
diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs
index 453e96bad3..4b0c556824 100644
--- a/storage_controller/src/service.rs
+++ b/storage_controller/src/service.rs
@@ -44,7 +44,7 @@ use pageserver_api::{
         NodeSchedulingPolicy, PlacementPolicy, ShardSchedulingPolicy, TenantCreateRequest,
         TenantCreateResponse, TenantCreateResponseShard, TenantDescribeResponse,
         TenantDescribeResponseShard, TenantLocateResponse, TenantPolicyRequest,
-        TenantShardMigrateRequest, TenantShardMigrateResponse, UtilizationScore,
+        TenantShardMigrateRequest, TenantShardMigrateResponse,
     },
     models::{SecondaryProgress, TenantConfigRequest, TopTenantShardsRequest},
 };
@@ -542,7 +542,7 @@ impl Service {
             let locked = self.inner.read().unwrap();
             locked.nodes.clone()
         };
-        let nodes_online = self.initial_heartbeat_round(all_nodes.keys()).await;
+        let mut nodes_online = self.initial_heartbeat_round(all_nodes.keys()).await;
 
         // List of tenants for which we will attempt to notify compute of their location at startup
         let mut compute_notifications = Vec::new();
@@ -556,10 +556,8 @@ impl Service {
             // Mark nodes online if they responded to us: nodes are offline by default after a restart.
             let mut new_nodes = (**nodes).clone();
             for (node_id, node) in new_nodes.iter_mut() {
-                if let Some(utilization) = nodes_online.get(node_id) {
-                    node.set_availability(NodeAvailability::Active(UtilizationScore(
-                        utilization.utilization_score,
-                    )));
+                if let Some(utilization) = nodes_online.remove(node_id) {
+                    node.set_availability(NodeAvailability::Active(utilization));
                     scheduler.node_upsert(node);
                 }
             }
@@ -925,9 +923,9 @@ impl Service {
             if let Ok(deltas) = res {
                 for (node_id, state) in deltas.0 {
                     let new_availability = match state {
-                        PageserverState::Available { utilization, .. } => NodeAvailability::Active(
-                            UtilizationScore(utilization.utilization_score),
-                        ),
+                        PageserverState::Available { utilization, .. } => {
+                            NodeAvailability::Active(utilization)
+                        }
                         PageserverState::WarmingUp { started_at } => {
                             NodeAvailability::WarmingUp(started_at)
                         }
@@ -936,14 +934,17 @@ impl Service {
                             // while the heartbeat round was on-going. Hence, filter out
                             // offline transitions for WarmingUp nodes that are still within
                             // their grace period.
-                            if let Ok(NodeAvailability::WarmingUp(started_at)) =
-                                self.get_node(node_id).await.map(|n| n.get_availability())
+                            if let Ok(NodeAvailability::WarmingUp(started_at)) = self
+                                .get_node(node_id)
+                                .await
+                                .as_ref()
+                                .map(|n| n.get_availability())
                             {
                                 let now = Instant::now();
-                                if now - started_at >= self.config.max_warming_up_interval {
+                                if now - *started_at >= self.config.max_warming_up_interval {
                                     NodeAvailability::Offline
                                 } else {
-                                    NodeAvailability::WarmingUp(started_at)
+                                    NodeAvailability::WarmingUp(*started_at)
                                 }
                             } else {
                                 NodeAvailability::Offline
@@ -1625,7 +1626,7 @@ impl Service {
         // This Node is a mutable local copy: we will set it active so that we can use its
         // API client to reconcile with the node.  The Node in [`Self::nodes`] will get updated
         // later.
-        node.set_availability(NodeAvailability::Active(UtilizationScore::worst()));
+        node.set_availability(NodeAvailability::Active(PageserverUtilization::full()));
 
         let configs = match node
             .with_client_retries(
@@ -2473,7 +2474,7 @@ impl Service {
         .await;
 
         let node = {
-            let locked = self.inner.read().unwrap();
+            let mut locked = self.inner.write().unwrap();
             // Just a sanity check to prevent misuse: the API expects that the tenant is fully
             // detached everywhere, and nothing writes to S3 storage. Here, we verify that,
             // but only at the start of the process, so it's really just to prevent operator
@@ -2500,7 +2501,7 @@ impl Service {
                     return Err(ApiError::InternalServerError(anyhow::anyhow!("We observed attached={mode:?} tenant in node_id={node_id} shard with tenant_shard_id={shard_id}")));
                 }
             }
-            let scheduler = &locked.scheduler;
+            let scheduler = &mut locked.scheduler;
             // Right now we only perform the operation on a single node without parallelization
             // TODO fan out the operation to multiple nodes for better performance
             let node_id = scheduler.schedule_shard(&[], &ScheduleContext::default())?;
@@ -4761,7 +4762,7 @@ impl Service {
         //
         // The transition we calculate here remains valid later in the function because we hold the op lock on the node:
         // nothing else can mutate its availability while we run.
-        let availability_transition = if let Some(input_availability) = availability {
+        let availability_transition = if let Some(input_availability) = availability.as_ref() {
             let (activate_node, availability_transition) = {
                 let locked = self.inner.read().unwrap();
                 let Some(node) = locked.nodes.get(&node_id) else {
@@ -4797,8 +4798,8 @@ impl Service {
             ));
         };
 
-        if let Some(availability) = &availability {
-            node.set_availability(*availability);
+        if let Some(availability) = availability.as_ref() {
+            node.set_availability(availability.clone());
         }
 
         if let Some(scheduling) = scheduling {
diff --git a/storage_controller/src/tenant_shard.rs b/storage_controller/src/tenant_shard.rs
index 1fcc3c8547..30723a3b36 100644
--- a/storage_controller/src/tenant_shard.rs
+++ b/storage_controller/src/tenant_shard.rs
@@ -779,7 +779,7 @@ impl TenantShard {
     #[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug()))]
     pub(crate) fn optimize_secondary(
         &self,
-        scheduler: &Scheduler,
+        scheduler: &mut Scheduler,
         schedule_context: &ScheduleContext,
     ) -> Option<ScheduleOptimization> {
         if self.intent.secondary.is_empty() {
@@ -1595,7 +1595,7 @@ pub(crate) mod tests {
         schedule_context.avoid(&shard_b.intent.all_pageservers());
         schedule_context.push_attached(shard_b.intent.get_attached().unwrap());
 
-        let optimization_a = shard_a.optimize_secondary(&scheduler, &schedule_context);
+        let optimization_a = shard_a.optimize_secondary(&mut scheduler, &schedule_context);
 
         // Since there is a node with no locations available, the node with two locations for the
         // same tenant should generate an optimization to move one away
diff --git a/test_runner/regress/test_sharding.py b/test_runner/regress/test_sharding.py
index 1011a6fd22..bfd82242e9 100644
--- a/test_runner/regress/test_sharding.py
+++ b/test_runner/regress/test_sharding.py
@@ -394,6 +394,7 @@ def test_sharding_split_smoke(
 
     # Note which pageservers initially hold a shard after tenant creation
     pre_split_pageserver_ids = [loc["node_id"] for loc in env.storage_controller.locate(tenant_id)]
+    log.info("Pre-split pageservers: {pre_split_pageserver_ids}")
 
     # For pageservers holding a shard, validate their ingest statistics
     # reflect a proper splitting of the WAL.
@@ -555,9 +556,9 @@ def test_sharding_split_smoke(
     assert sum(total.values()) == split_shard_count * 2
     check_effective_tenant_config()
 
-    # More specific check: that we are fully balanced.  This is deterministic because
-    # the order in which we consider shards for optimization is deterministic, and the
-    # order of preference of nodes is also deterministic (lower node IDs win).
+    # More specific check: that we are fully balanced.  It is deterministic that we will get exactly
+    # one shard on each pageserver, because for these small shards the utilization metric is
+    # dominated by shard count.
     log.info(f"total: {total}")
     assert total == {
         1: 1,
@@ -577,8 +578,14 @@ def test_sharding_split_smoke(
         15: 1,
         16: 1,
     }
+
+    # The controller is not required to lay out the attached locations in any particular way, but
+    # all the pageservers that originally held an attached shard should still hold one, otherwise
+    # it would indicate that we had done some unnecessary migration.
     log.info(f"attached: {attached}")
-    assert attached == {1: 1, 2: 1, 3: 1, 5: 1, 6: 1, 7: 1, 9: 1, 11: 1}
+    for ps_id in pre_split_pageserver_ids:
+        log.info("Pre-split pageserver {ps_id} should still hold an attached location")
+        assert ps_id in attached
 
     # Ensure post-split pageserver locations survive a restart (i.e. the child shards
     # correctly wrote config to disk, and the storage controller responds correctly

From 0aa14509368d81acf253f406ffafd740bf13b01a Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Fri, 23 Aug 2024 18:56:05 +0100
Subject: [PATCH 025/159] storage controller: enable timeline CRUD operations
 to run concurrently with reconciliation & make them safer (#8783)

## Problem

- If a reconciler was waiting to be able to notify computes about a
change, but the control plane was waiting for the controller to finish a
timeline creation/deletion, the overall system can deadlock.
- If a tenant shard was migrated concurrently with a timeline
creation/deletion, there was a risk that the timeline operation could be
applied to a non-latest-generation location, and thereby not really be
persistent. This has never happened in practice, but would eventually
happen at scale.

Closes: #8743

## Summary of changes

- Introduce `Service::tenant_remote_mutation` helper, which looks up
shards & generations and passes them into an inner function that may do
remote I/O to pageservers. Before returning success, this helper checks
that generations haven't incremented, to guarantee that changes are
persistent.
- Convert tenant_timeline_create, tenant_timeline_delete, and
tenant_timeline_detach_ancestor to use this helper.
- These functions no longer block on ensure_attached unless the tenant
was never attached at all, so they should make progress even if we can't
complete compute notifications.

This increases the database load from timeline/create operations, but
only with cheap read transactions.
---
 .../down.sql                                  |   2 +
 .../2024-08-23-170149_tenant_id_index/up.sql  |   2 +
 storage_controller/src/persistence.rs         |  38 ++
 storage_controller/src/service.rs             | 480 +++++++++---------
 test_runner/fixtures/neon_fixtures.py         |   4 +-
 .../regress/test_storage_controller.py        |  66 ++-
 6 files changed, 360 insertions(+), 232 deletions(-)
 create mode 100644 storage_controller/migrations/2024-08-23-170149_tenant_id_index/down.sql
 create mode 100644 storage_controller/migrations/2024-08-23-170149_tenant_id_index/up.sql

diff --git a/storage_controller/migrations/2024-08-23-170149_tenant_id_index/down.sql b/storage_controller/migrations/2024-08-23-170149_tenant_id_index/down.sql
new file mode 100644
index 0000000000..518c747100
--- /dev/null
+++ b/storage_controller/migrations/2024-08-23-170149_tenant_id_index/down.sql
@@ -0,0 +1,2 @@
+-- This file should undo anything in `up.sql`
+DROP INDEX tenant_shards_tenant_id;
\ No newline at end of file
diff --git a/storage_controller/migrations/2024-08-23-170149_tenant_id_index/up.sql b/storage_controller/migrations/2024-08-23-170149_tenant_id_index/up.sql
new file mode 100644
index 0000000000..dd6b37781a
--- /dev/null
+++ b/storage_controller/migrations/2024-08-23-170149_tenant_id_index/up.sql
@@ -0,0 +1,2 @@
+-- Your SQL goes here
+CREATE INDEX tenant_shards_tenant_id ON tenant_shards (tenant_id);
\ No newline at end of file
diff --git a/storage_controller/src/persistence.rs b/storage_controller/src/persistence.rs
index 16df19026c..1a905753a1 100644
--- a/storage_controller/src/persistence.rs
+++ b/storage_controller/src/persistence.rs
@@ -91,6 +91,7 @@ pub(crate) enum DatabaseOperation {
     Detach,
     ReAttach,
     IncrementGeneration,
+    PeekGenerations,
     ListTenantShards,
     InsertTenantShards,
     UpdateTenantShard,
@@ -502,6 +503,43 @@ impl Persistence {
         Ok(Generation::new(g as u32))
     }
 
+    /// When we want to call out to the running shards for a tenant, e.g. during timeline CRUD operations,
+    /// we need to know where the shard is attached, _and_ the generation, so that we can re-check the generation
+    /// afterwards to confirm that our timeline CRUD operation is truly persistent (it must have happened in the
+    /// latest generation)
+    ///
+    /// If the tenant doesn't exist, an empty vector is returned.
+    ///
+    /// Output is sorted by shard number
+    pub(crate) async fn peek_generations(
+        &self,
+        filter_tenant_id: TenantId,
+    ) -> Result<Vec<(TenantShardId, Option<Generation>, Option<NodeId>)>, DatabaseError> {
+        use crate::schema::tenant_shards::dsl::*;
+        let rows = self
+            .with_measured_conn(DatabaseOperation::PeekGenerations, move |conn| {
+                let result = tenant_shards
+                    .filter(tenant_id.eq(filter_tenant_id.to_string()))
+                    .select(TenantShardPersistence::as_select())
+                    .order(shard_number)
+                    .load(conn)?;
+                Ok(result)
+            })
+            .await?;
+
+        Ok(rows
+            .into_iter()
+            .map(|p| {
+                (
+                    p.get_tenant_shard_id()
+                        .expect("Corrupt tenant shard id in database"),
+                    p.generation.map(|g| Generation::new(g as u32)),
+                    p.generation_pageserver.map(|n| NodeId(n as u64)),
+                )
+            })
+            .collect())
+    }
+
     #[allow(non_local_definitions)]
     /// For use when updating a persistent property of a tenant, such as its config or placement_policy.
     ///
diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs
index 4b0c556824..7daa1e4f5f 100644
--- a/storage_controller/src/service.rs
+++ b/storage_controller/src/service.rs
@@ -2854,82 +2854,67 @@ impl Service {
         .await;
         failpoint_support::sleep_millis_async!("tenant-create-timeline-shared-lock");
 
-        self.ensure_attached_wait(tenant_id).await?;
+        self.tenant_remote_mutation(tenant_id, move |mut targets| async move {
+            if targets.is_empty() {
+                return Err(ApiError::NotFound(
+                    anyhow::anyhow!("Tenant not found").into(),
+                ));
+            };
+            let shard_zero = targets.remove(0);
 
-        let mut targets = {
-            let locked = self.inner.read().unwrap();
-            let mut targets = Vec::new();
+            async fn create_one(
+                tenant_shard_id: TenantShardId,
+                node: Node,
+                jwt: Option<String>,
+                create_req: TimelineCreateRequest,
+            ) -> Result<TimelineInfo, ApiError> {
+                tracing::info!(
+                    "Creating timeline on shard {}/{}, attached to node {node}",
+                    tenant_shard_id,
+                    create_req.new_timeline_id,
+                );
+                let client = PageserverClient::new(node.get_id(), node.base_url(), jwt.as_deref());
 
-            for (tenant_shard_id, shard) in
-                locked.tenants.range(TenantShardId::tenant_range(tenant_id))
-            {
-                let node_id = shard.intent.get_attached().ok_or_else(|| {
-                    ApiError::InternalServerError(anyhow::anyhow!("Shard not scheduled"))
-                })?;
-                let node = locked
-                    .nodes
-                    .get(&node_id)
-                    .expect("Pageservers may not be deleted while referenced");
-
-                targets.push((*tenant_shard_id, node.clone()));
+                client
+                    .timeline_create(tenant_shard_id, &create_req)
+                    .await
+                    .map_err(|e| passthrough_api_error(&node, e))
             }
-            targets
-        };
 
-        if targets.is_empty() {
-            return Err(ApiError::NotFound(
-                anyhow::anyhow!("Tenant not found").into(),
-            ));
-        };
-        let shard_zero = targets.remove(0);
-
-        async fn create_one(
-            tenant_shard_id: TenantShardId,
-            node: Node,
-            jwt: Option<String>,
-            create_req: TimelineCreateRequest,
-        ) -> Result<TimelineInfo, ApiError> {
-            tracing::info!(
-                "Creating timeline on shard {}/{}, attached to node {node}",
-                tenant_shard_id,
-                create_req.new_timeline_id,
-            );
-            let client = PageserverClient::new(node.get_id(), node.base_url(), jwt.as_deref());
-
-            client
-                .timeline_create(tenant_shard_id, &create_req)
-                .await
-                .map_err(|e| passthrough_api_error(&node, e))
-        }
-
-        // Because the caller might not provide an explicit LSN, we must do the creation first on a single shard, and then
-        // use whatever LSN that shard picked when creating on subsequent shards.  We arbitrarily use shard zero as the shard
-        // that will get the first creation request, and propagate the LSN to all the >0 shards.
-        let timeline_info = create_one(
-            shard_zero.0,
-            shard_zero.1,
-            self.config.jwt_token.clone(),
-            create_req.clone(),
-        )
-        .await?;
-
-        // Propagate the LSN that shard zero picked, if caller didn't provide one
-        if create_req.ancestor_timeline_id.is_some() && create_req.ancestor_start_lsn.is_none() {
-            create_req.ancestor_start_lsn = timeline_info.ancestor_lsn;
-        }
-
-        // Create timeline on remaining shards with number >0
-        if !targets.is_empty() {
-            // If we had multiple shards, issue requests for the remainder now.
-            let jwt = &self.config.jwt_token;
-            self.tenant_for_shards(targets, |tenant_shard_id: TenantShardId, node: Node| {
-                let create_req = create_req.clone();
-                Box::pin(create_one(tenant_shard_id, node, jwt.clone(), create_req))
-            })
+            // Because the caller might not provide an explicit LSN, we must do the creation first on a single shard, and then
+            // use whatever LSN that shard picked when creating on subsequent shards.  We arbitrarily use shard zero as the shard
+            // that will get the first creation request, and propagate the LSN to all the >0 shards.
+            let timeline_info = create_one(
+                shard_zero.0,
+                shard_zero.1,
+                self.config.jwt_token.clone(),
+                create_req.clone(),
+            )
             .await?;
-        }
 
-        Ok(timeline_info)
+            // Propagate the LSN that shard zero picked, if caller didn't provide one
+            if create_req.ancestor_timeline_id.is_some() && create_req.ancestor_start_lsn.is_none()
+            {
+                create_req.ancestor_start_lsn = timeline_info.ancestor_lsn;
+            }
+
+            // Create timeline on remaining shards with number >0
+            if !targets.is_empty() {
+                // If we had multiple shards, issue requests for the remainder now.
+                let jwt = &self.config.jwt_token;
+                self.tenant_for_shards(
+                    targets.iter().map(|t| (t.0, t.1.clone())).collect(),
+                    |tenant_shard_id: TenantShardId, node: Node| {
+                        let create_req = create_req.clone();
+                        Box::pin(create_one(tenant_shard_id, node, jwt.clone(), create_req))
+                    },
+                )
+                .await?;
+            }
+
+            Ok(timeline_info)
+        })
+        .await?
     }
 
     pub(crate) async fn tenant_timeline_detach_ancestor(
@@ -2946,107 +2931,87 @@ impl Service {
         )
         .await;
 
-        self.ensure_attached_wait(tenant_id).await?;
-
-        let targets = {
-            let locked = self.inner.read().unwrap();
-            let mut targets = Vec::new();
-
-            for (tenant_shard_id, shard) in
-                locked.tenants.range(TenantShardId::tenant_range(tenant_id))
-            {
-                let node_id = shard.intent.get_attached().ok_or_else(|| {
-                    ApiError::InternalServerError(anyhow::anyhow!("Shard not scheduled"))
-                })?;
-                let node = locked
-                    .nodes
-                    .get(&node_id)
-                    .expect("Pageservers may not be deleted while referenced");
-
-                targets.push((*tenant_shard_id, node.clone()));
+        self.tenant_remote_mutation(tenant_id, move |targets| async move {
+            if targets.is_empty() {
+                return Err(ApiError::NotFound(
+                    anyhow::anyhow!("Tenant not found").into(),
+                ));
             }
-            targets
-        };
 
-        if targets.is_empty() {
-            return Err(ApiError::NotFound(
-                anyhow::anyhow!("Tenant not found").into(),
-            ));
-        }
+            async fn detach_one(
+                tenant_shard_id: TenantShardId,
+                timeline_id: TimelineId,
+                node: Node,
+                jwt: Option<String>,
+            ) -> Result<(ShardNumber, models::detach_ancestor::AncestorDetached), ApiError> {
+                tracing::info!(
+                    "Detaching timeline on shard {tenant_shard_id}/{timeline_id}, attached to node {node}",
+                );
 
-        async fn detach_one(
-            tenant_shard_id: TenantShardId,
-            timeline_id: TimelineId,
-            node: Node,
-            jwt: Option<String>,
-        ) -> Result<(ShardNumber, models::detach_ancestor::AncestorDetached), ApiError> {
-            tracing::info!(
-                "Detaching timeline on shard {tenant_shard_id}/{timeline_id}, attached to node {node}",
-            );
+                let client = PageserverClient::new(node.get_id(), node.base_url(), jwt.as_deref());
 
-            let client = PageserverClient::new(node.get_id(), node.base_url(), jwt.as_deref());
+                client
+                    .timeline_detach_ancestor(tenant_shard_id, timeline_id)
+                    .await
+                    .map_err(|e| {
+                        use mgmt_api::Error;
 
-            client
-                .timeline_detach_ancestor(tenant_shard_id, timeline_id)
-                .await
-                .map_err(|e| {
-                    use mgmt_api::Error;
-
-                    match e {
-                        // no ancestor (ever)
-                        Error::ApiError(StatusCode::CONFLICT, msg) => ApiError::Conflict(format!(
-                            "{node}: {}",
-                            msg.strip_prefix("Conflict: ").unwrap_or(&msg)
-                        )),
-                        // too many ancestors
-                        Error::ApiError(StatusCode::BAD_REQUEST, msg) => {
-                            ApiError::BadRequest(anyhow::anyhow!("{node}: {msg}"))
+                        match e {
+                            // no ancestor (ever)
+                            Error::ApiError(StatusCode::CONFLICT, msg) => ApiError::Conflict(format!(
+                                "{node}: {}",
+                                msg.strip_prefix("Conflict: ").unwrap_or(&msg)
+                            )),
+                            // too many ancestors
+                            Error::ApiError(StatusCode::BAD_REQUEST, msg) => {
+                                ApiError::BadRequest(anyhow::anyhow!("{node}: {msg}"))
+                            }
+                            Error::ApiError(StatusCode::INTERNAL_SERVER_ERROR, msg) => {
+                                // avoid turning these into conflicts to remain compatible with
+                                // pageservers, 500 errors are sadly retryable with timeline ancestor
+                                // detach
+                                ApiError::InternalServerError(anyhow::anyhow!("{node}: {msg}"))
+                            }
+                            // rest can be mapped as usual
+                            other => passthrough_api_error(&node, other),
                         }
-                        Error::ApiError(StatusCode::INTERNAL_SERVER_ERROR, msg) => {
-                            // avoid turning these into conflicts to remain compatible with
-                            // pageservers, 500 errors are sadly retryable with timeline ancestor
-                            // detach
-                            ApiError::InternalServerError(anyhow::anyhow!("{node}: {msg}"))
-                        }
-                        // rest can be mapped as usual
-                        other => passthrough_api_error(&node, other),
-                    }
+                    })
+                    .map(|res| (tenant_shard_id.shard_number, res))
+            }
+
+            // no shard needs to go first/last; the operation should be idempotent
+            let mut results = self
+                .tenant_for_shards(targets, |tenant_shard_id, node| {
+                    futures::FutureExt::boxed(detach_one(
+                        tenant_shard_id,
+                        timeline_id,
+                        node,
+                        self.config.jwt_token.clone(),
+                    ))
                 })
-                .map(|res| (tenant_shard_id.shard_number, res))
-        }
+                .await?;
 
-        // no shard needs to go first/last; the operation should be idempotent
-        let mut results = self
-            .tenant_for_shards(targets, |tenant_shard_id, node| {
-                futures::FutureExt::boxed(detach_one(
-                    tenant_shard_id,
-                    timeline_id,
-                    node,
-                    self.config.jwt_token.clone(),
-                ))
-            })
-            .await?;
+            let any = results.pop().expect("we must have at least one response");
 
-        let any = results.pop().expect("we must have at least one response");
+            let mismatching = results
+                .iter()
+                .filter(|(_, res)| res != &any.1)
+                .collect::<Vec<_>>();
+            if !mismatching.is_empty() {
+                // this can be hit by races which should not happen because operation lock on cplane
+                let matching = results.len() - mismatching.len();
+                tracing::error!(
+                    matching,
+                    compared_against=?any,
+                    ?mismatching,
+                    "shards returned different results"
+                );
 
-        let mismatching = results
-            .iter()
-            .filter(|(_, res)| res != &any.1)
-            .collect::<Vec<_>>();
-        if !mismatching.is_empty() {
-            // this can be hit by races which should not happen because operation lock on cplane
-            let matching = results.len() - mismatching.len();
-            tracing::error!(
-                matching,
-                compared_against=?any,
-                ?mismatching,
-                "shards returned different results"
-            );
+                return Err(ApiError::InternalServerError(anyhow::anyhow!("pageservers returned mixed results for ancestor detach; manual intervention is required.")));
+            }
 
-            return Err(ApiError::InternalServerError(anyhow::anyhow!("pageservers returned mixed results for ancestor detach; manual intervention is required.")));
-        }
-
-        Ok(any.1)
+            Ok(any.1)
+        }).await?
     }
 
     /// Helper for concurrently calling a pageserver API on a number of shards, such as timeline creation.
@@ -3117,6 +3082,84 @@ impl Service {
         results
     }
 
+    /// Helper for safely working with the shards in a tenant remotely on pageservers, for example
+    /// when creating and deleting timelines:
+    /// - Makes sure shards are attached somewhere if they weren't already
+    /// - Looks up the shards and the nodes where they were most recently attached
+    /// - Guarantees that after the inner function returns, the shards' generations haven't moved on: this
+    ///   ensures that the remote operation acted on the most recent generation, and is therefore durable.
+    async fn tenant_remote_mutation<R, O, F>(
+        &self,
+        tenant_id: TenantId,
+        op: O,
+    ) -> Result<R, ApiError>
+    where
+        O: FnOnce(Vec<(TenantShardId, Node)>) -> F,
+        F: std::future::Future<Output = R>,
+    {
+        let target_gens = {
+            let mut targets = Vec::new();
+
+            // Load the currently attached pageservers for the latest generation of each shard.  This can
+            // run concurrently with reconciliations, and it is not guaranteed that the node we find here
+            // will still be the latest when we're done: we will check generations again at the end of
+            // this function to handle that.
+            let generations = self.persistence.peek_generations(tenant_id).await?;
+            let generations = if generations.iter().any(|i| i.1.is_none()) {
+                // One or more shards is not attached to anything: maybe this is a new tenant?  Wait for
+                // it to reconcile.
+                self.ensure_attached_wait(tenant_id).await?;
+                self.persistence.peek_generations(tenant_id).await?
+            } else {
+                generations
+            };
+
+            let locked = self.inner.read().unwrap();
+            for (tenant_shard_id, generation, generation_pageserver) in generations {
+                let node_id = generation_pageserver.ok_or(ApiError::Conflict(
+                    "Tenant not currently attached".to_string(),
+                ))?;
+                let node = locked
+                    .nodes
+                    .get(&node_id)
+                    .ok_or(ApiError::Conflict(format!(
+                        "Raced with removal of node {node_id}"
+                    )))?;
+                targets.push((tenant_shard_id, node.clone(), generation));
+            }
+
+            targets
+        };
+
+        let targets = target_gens.iter().map(|t| (t.0, t.1.clone())).collect();
+        let result = op(targets).await;
+
+        // Post-check: are all the generations of all the shards the same as they were initially?  This proves that
+        // our remote operation executed on the latest generation and is therefore persistent.
+        {
+            let latest_generations = self.persistence.peek_generations(tenant_id).await?;
+            if latest_generations
+                .into_iter()
+                .map(|g| (g.0, g.1))
+                .collect::<Vec<_>>()
+                != target_gens
+                    .into_iter()
+                    .map(|i| (i.0, i.2))
+                    .collect::<Vec<_>>()
+            {
+                // We raced with something that incremented the generation, and therefore cannot be
+                // confident that our actions are persistent (they might have hit an old generation).
+                //
+                // This is safe but requires a retry: ask the client to do that by giving them a 503 response.
+                return Err(ApiError::ResourceUnavailable(
+                    "Tenant attachment changed, please retry".into(),
+                ));
+            }
+        }
+
+        Ok(result)
+    }
+
     pub(crate) async fn tenant_timeline_delete(
         &self,
         tenant_id: TenantId,
@@ -3130,83 +3173,62 @@ impl Service {
         )
         .await;
 
-        self.ensure_attached_wait(tenant_id).await?;
-
-        let mut targets = {
-            let locked = self.inner.read().unwrap();
-            let mut targets = Vec::new();
-
-            for (tenant_shard_id, shard) in
-                locked.tenants.range(TenantShardId::tenant_range(tenant_id))
-            {
-                let node_id = shard.intent.get_attached().ok_or_else(|| {
-                    ApiError::InternalServerError(anyhow::anyhow!("Shard not scheduled"))
-                })?;
-                let node = locked
-                    .nodes
-                    .get(&node_id)
-                    .expect("Pageservers may not be deleted while referenced");
-
-                targets.push((*tenant_shard_id, node.clone()));
+        self.tenant_remote_mutation(tenant_id, move |mut targets| async move {
+            if targets.is_empty() {
+                return Err(ApiError::NotFound(
+                    anyhow::anyhow!("Tenant not found").into(),
+                ));
             }
-            targets
-        };
+            let shard_zero = targets.remove(0);
 
-        if targets.is_empty() {
-            return Err(ApiError::NotFound(
-                anyhow::anyhow!("Tenant not found").into(),
-            ));
-        }
-        let shard_zero = targets.remove(0);
+            async fn delete_one(
+                tenant_shard_id: TenantShardId,
+                timeline_id: TimelineId,
+                node: Node,
+                jwt: Option<String>,
+            ) -> Result<StatusCode, ApiError> {
+                tracing::info!(
+                    "Deleting timeline on shard {tenant_shard_id}/{timeline_id}, attached to node {node}",
+                );
 
-        async fn delete_one(
-            tenant_shard_id: TenantShardId,
-            timeline_id: TimelineId,
-            node: Node,
-            jwt: Option<String>,
-        ) -> Result<StatusCode, ApiError> {
-            tracing::info!(
-                "Deleting timeline on shard {tenant_shard_id}/{timeline_id}, attached to node {node}",
-            );
+                let client = PageserverClient::new(node.get_id(), node.base_url(), jwt.as_deref());
+                client
+                    .timeline_delete(tenant_shard_id, timeline_id)
+                    .await
+                    .map_err(|e| {
+                        ApiError::InternalServerError(anyhow::anyhow!(
+                            "Error deleting timeline {timeline_id} on {tenant_shard_id} on node {node}: {e}",
+                        ))
+                    })
+            }
 
-            let client = PageserverClient::new(node.get_id(), node.base_url(), jwt.as_deref());
-            client
-                .timeline_delete(tenant_shard_id, timeline_id)
-                .await
-                .map_err(|e| {
-                    ApiError::InternalServerError(anyhow::anyhow!(
-                        "Error deleting timeline {timeline_id} on {tenant_shard_id} on node {node}: {e}",
+            let statuses = self
+                .tenant_for_shards(targets, |tenant_shard_id: TenantShardId, node: Node| {
+                    Box::pin(delete_one(
+                        tenant_shard_id,
+                        timeline_id,
+                        node,
+                        self.config.jwt_token.clone(),
                     ))
                 })
-        }
+                .await?;
 
-        let statuses = self
-            .tenant_for_shards(targets, |tenant_shard_id: TenantShardId, node: Node| {
-                Box::pin(delete_one(
-                    tenant_shard_id,
-                    timeline_id,
-                    node,
-                    self.config.jwt_token.clone(),
-                ))
-            })
+            // If any shards >0 haven't finished deletion yet, don't start deletion on shard zero
+            if statuses.iter().any(|s| s != &StatusCode::NOT_FOUND) {
+                return Ok(StatusCode::ACCEPTED);
+            }
+
+            // Delete shard zero last: this is not strictly necessary, but since a caller's GET on a timeline will be routed
+            // to shard zero, it gives a more obvious behavior that a GET returns 404 once the deletion is done.
+            let shard_zero_status = delete_one(
+                shard_zero.0,
+                timeline_id,
+                shard_zero.1,
+                self.config.jwt_token.clone(),
+            )
             .await?;
-
-        // If any shards >0 haven't finished deletion yet, don't start deletion on shard zero
-        if statuses.iter().any(|s| s != &StatusCode::NOT_FOUND) {
-            return Ok(StatusCode::ACCEPTED);
-        }
-
-        // Delete shard zero last: this is not strictly necessary, but since a caller's GET on a timeline will be routed
-        // to shard zero, it gives a more obvious behavior that a GET returns 404 once the deletion is done.
-        let shard_zero_status = delete_one(
-            shard_zero.0,
-            timeline_id,
-            shard_zero.1,
-            self.config.jwt_token.clone(),
-        )
-        .await?;
-
-        Ok(shard_zero_status)
+            Ok(shard_zero_status)
+        }).await?
     }
 
     /// When you need to send an HTTP request to the pageserver that holds shard0 of a tenant, this
diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index 2bb698f175..92febfec9b 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -2284,7 +2284,7 @@ class NeonStorageController(MetricsGetter, LogUtils):
             self.allowed_errors,
         )
 
-    def pageserver_api(self) -> PageserverHttpClient:
+    def pageserver_api(self, *args, **kwargs) -> PageserverHttpClient:
         """
         The storage controller implements a subset of the pageserver REST API, for mapping
         per-tenant actions into per-shard actions (e.g. timeline creation).  Tests should invoke those
@@ -2293,7 +2293,7 @@ class NeonStorageController(MetricsGetter, LogUtils):
         auth_token = None
         if self.auth_enabled:
             auth_token = self.env.auth_keys.generate_token(scope=TokenScope.PAGE_SERVER_API)
-        return PageserverHttpClient(self.port, lambda: True, auth_token)
+        return PageserverHttpClient(self.port, lambda: True, auth_token, *args, **kwargs)
 
     def request(self, method, *args, **kwargs) -> requests.Response:
         resp = requests.request(method, *args, **kwargs)
diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py
index b3464b0c91..03eb7628be 100644
--- a/test_runner/regress/test_storage_controller.py
+++ b/test_runner/regress/test_storage_controller.py
@@ -21,7 +21,7 @@ from fixtures.neon_fixtures import (
     TokenScope,
     last_flush_lsn_upload,
 )
-from fixtures.pageserver.http import PageserverHttpClient
+from fixtures.pageserver.http import PageserverApiException, PageserverHttpClient
 from fixtures.pageserver.utils import (
     assert_prefix_empty,
     assert_prefix_not_empty,
@@ -41,6 +41,7 @@ from mypy_boto3_s3.type_defs import (
     ObjectTypeDef,
 )
 from pytest_httpserver import HTTPServer
+from urllib3 import Retry
 from werkzeug.wrappers.request import Request
 from werkzeug.wrappers.response import Response
 
@@ -2266,3 +2267,66 @@ def test_storage_controller_ps_restarted_during_drain(neon_env_builder: NeonEnvB
 
     # allow for small delay between actually having cancelled and being able reconfigure again
     wait_until(4, 0.5, reconfigure_node_again)
+
+
+def test_storage_controller_timeline_crud_race(neon_env_builder: NeonEnvBuilder):
+    """
+    The storage controller is meant to handle the case where a timeline CRUD operation races
+    with a generation-incrementing change to the tenant: this should trigger a retry so that
+    the operation lands on the highest-generation'd tenant location.
+    """
+    neon_env_builder.num_pageservers = 2
+    env = neon_env_builder.init_configs()
+    env.start()
+    tenant_id = TenantId.generate()
+    env.storage_controller.tenant_create(tenant_id)
+
+    # Set up a failpoint so that a timeline creation will be very slow
+    failpoint = "timeline-creation-after-uninit"
+    for ps in env.pageservers:
+        ps.http_client().configure_failpoints((failpoint, "sleep(10000)"))
+
+    # Start a timeline creation in the background
+    create_timeline_id = TimelineId.generate()
+    futs = []
+    with concurrent.futures.ThreadPoolExecutor(
+        max_workers=2 + len(env.pageservers) + len(env.safekeepers)
+    ) as executor:
+        futs.append(
+            executor.submit(
+                env.storage_controller.pageserver_api(
+                    retries=Retry(
+                        status=0,
+                        connect=0,  # Disable retries: we want to see the 503
+                    )
+                ).timeline_create,
+                PgVersion.NOT_SET,
+                tenant_id,
+                create_timeline_id,
+            )
+        )
+
+        def has_hit_failpoint():
+            assert any(
+                ps.log_contains(f"at failpoint {failpoint}") is not None for ps in env.pageservers
+            )
+
+        wait_until(10, 1, has_hit_failpoint)
+
+        # Migrate the tenant while the timeline creation is in progress: this migration will complete once it
+        # can detach from the old pageserver, which will happen once the failpoint completes.
+        env.storage_controller.tenant_shard_migrate(
+            TenantShardId(tenant_id, 0, 0), env.pageservers[1].id
+        )
+
+        with pytest.raises(PageserverApiException, match="Tenant attachment changed, please retry"):
+            futs[0].result(timeout=20)
+
+    # Timeline creation should work when there isn't a concurrent migration, even though it's
+    # slow (our failpoint is still enabled)
+    env.storage_controller.pageserver_api(
+        retries=Retry(
+            status=0,
+            connect=0,  # Disable retries: we want to see the 503
+        )
+    ).timeline_create(PgVersion.NOT_SET, tenant_id, create_timeline_id)

From 701cb61b572eb0ef3cc29697f86aab36aafbba70 Mon Sep 17 00:00:00 2001
From: Conrad Ludgate <conrad@neon.tech>
Date: Fri, 23 Aug 2024 19:48:06 +0100
Subject: [PATCH 026/159] proxy: local auth backend (#8806)

Adds a Local authentication backend. Updates http to extract JWT bearer
tokens and passes them to the local backend to validate.
---
 proxy/src/auth/backend.rs             | 34 +++++++++---
 proxy/src/auth/backend/local.rs       | 79 +++++++++++++++++++++++++++
 proxy/src/console/messages.rs         | 22 ++++++++
 proxy/src/console/provider.rs         |  1 +
 proxy/src/proxy/connect_compute.rs    |  7 +--
 proxy/src/serverless.rs               |  2 +-
 proxy/src/serverless/backend.rs       | 71 ++++++++++++++++++------
 proxy/src/serverless/conn_pool.rs     | 12 +++-
 proxy/src/serverless/sql_over_http.rs | 58 +++++++++++++++-----
 9 files changed, 240 insertions(+), 46 deletions(-)
 create mode 100644 proxy/src/auth/backend/local.rs

diff --git a/proxy/src/auth/backend.rs b/proxy/src/auth/backend.rs
index 7592d076ec..ae72bc6de3 100644
--- a/proxy/src/auth/backend.rs
+++ b/proxy/src/auth/backend.rs
@@ -2,6 +2,7 @@ mod classic;
 mod hacks;
 pub mod jwt;
 mod link;
+pub mod local;
 
 use std::net::IpAddr;
 use std::sync::Arc;
@@ -9,6 +10,7 @@ use std::time::Duration;
 
 use ipnet::{Ipv4Net, Ipv6Net};
 pub use link::LinkAuthError;
+use local::LocalBackend;
 use tokio::io::{AsyncRead, AsyncWrite};
 use tokio_postgres::config::AuthKeys;
 use tracing::{info, warn};
@@ -68,6 +70,8 @@ pub enum BackendType<'a, T, D> {
     Console(MaybeOwned<'a, ConsoleBackend>, T),
     /// Authentication via a web browser.
     Link(MaybeOwned<'a, url::ApiUrl>, D),
+    /// Local proxy uses configured auth credentials and does not wake compute
+    Local(MaybeOwned<'a, LocalBackend>),
 }
 
 pub trait TestBackend: Send + Sync + 'static {
@@ -93,6 +97,7 @@ impl std::fmt::Display for BackendType<'_, (), ()> {
                 ConsoleBackend::Test(_) => fmt.debug_tuple("Test").finish(),
             },
             Self::Link(url, _) => fmt.debug_tuple("Link").field(&url.as_str()).finish(),
+            Self::Local(_) => fmt.debug_tuple("Local").finish(),
         }
     }
 }
@@ -104,6 +109,7 @@ impl<T, D> BackendType<'_, T, D> {
         match self {
             Self::Console(c, x) => BackendType::Console(MaybeOwned::Borrowed(c), x),
             Self::Link(c, x) => BackendType::Link(MaybeOwned::Borrowed(c), x),
+            Self::Local(l) => BackendType::Local(MaybeOwned::Borrowed(l)),
         }
     }
 }
@@ -116,6 +122,7 @@ impl<'a, T, D> BackendType<'a, T, D> {
         match self {
             Self::Console(c, x) => BackendType::Console(c, f(x)),
             Self::Link(c, x) => BackendType::Link(c, x),
+            Self::Local(l) => BackendType::Local(l),
         }
     }
 }
@@ -126,6 +133,7 @@ impl<'a, T, D, E> BackendType<'a, Result<T, E>, D> {
         match self {
             Self::Console(c, x) => x.map(|x| BackendType::Console(c, x)),
             Self::Link(c, x) => Ok(BackendType::Link(c, x)),
+            Self::Local(l) => Ok(BackendType::Local(l)),
         }
     }
 }
@@ -157,6 +165,7 @@ impl ComputeUserInfo {
 pub enum ComputeCredentialKeys {
     Password(Vec<u8>),
     AuthKeys(AuthKeys),
+    None,
 }
 
 impl TryFrom<ComputeUserInfoMaybeEndpoint> for ComputeUserInfo {
@@ -289,7 +298,7 @@ async fn auth_quirks(
             ctx.set_endpoint_id(res.info.endpoint.clone());
             let password = match res.keys {
                 ComputeCredentialKeys::Password(p) => p,
-                ComputeCredentialKeys::AuthKeys(_) => {
+                ComputeCredentialKeys::AuthKeys(_) | ComputeCredentialKeys::None => {
                     unreachable!("password hack should return a password")
                 }
             };
@@ -401,6 +410,7 @@ impl<'a> BackendType<'a, ComputeUserInfoMaybeEndpoint, &()> {
         match self {
             Self::Console(_, user_info) => user_info.endpoint_id.clone(),
             Self::Link(_, _) => Some("link".into()),
+            Self::Local(_) => Some("local".into()),
         }
     }
 
@@ -409,6 +419,7 @@ impl<'a> BackendType<'a, ComputeUserInfoMaybeEndpoint, &()> {
         match self {
             Self::Console(_, user_info) => &user_info.user,
             Self::Link(_, _) => "link",
+            Self::Local(_) => "local",
         }
     }
 
@@ -450,6 +461,9 @@ impl<'a> BackendType<'a, ComputeUserInfoMaybeEndpoint, &()> {
 
                 BackendType::Link(url, info)
             }
+            Self::Local(_) => {
+                return Err(auth::AuthError::bad_auth_method("invalid for local proxy"))
+            }
         };
 
         info!("user successfully authenticated");
@@ -465,6 +479,7 @@ impl BackendType<'_, ComputeUserInfo, &()> {
         match self {
             Self::Console(api, user_info) => api.get_role_secret(ctx, user_info).await,
             Self::Link(_, _) => Ok(Cached::new_uncached(None)),
+            Self::Local(_) => Ok(Cached::new_uncached(None)),
         }
     }
 
@@ -475,6 +490,7 @@ impl BackendType<'_, ComputeUserInfo, &()> {
         match self {
             Self::Console(api, user_info) => api.get_allowed_ips_and_secret(ctx, user_info).await,
             Self::Link(_, _) => Ok((Cached::new_uncached(Arc::new(vec![])), None)),
+            Self::Local(_) => Ok((Cached::new_uncached(Arc::new(vec![])), None)),
         }
     }
 }
@@ -488,13 +504,15 @@ impl ComputeConnectBackend for BackendType<'_, ComputeCredentials, NodeInfo> {
         match self {
             Self::Console(api, creds) => api.wake_compute(ctx, &creds.info).await,
             Self::Link(_, info) => Ok(Cached::new_uncached(info.clone())),
+            Self::Local(local) => Ok(Cached::new_uncached(local.node_info.clone())),
         }
     }
 
-    fn get_keys(&self) -> Option<&ComputeCredentialKeys> {
+    fn get_keys(&self) -> &ComputeCredentialKeys {
         match self {
-            Self::Console(_, creds) => Some(&creds.keys),
-            Self::Link(_, _) => None,
+            Self::Console(_, creds) => &creds.keys,
+            Self::Link(_, _) => &ComputeCredentialKeys::None,
+            Self::Local(_) => &ComputeCredentialKeys::None,
         }
     }
 }
@@ -508,13 +526,15 @@ impl ComputeConnectBackend for BackendType<'_, ComputeCredentials, &()> {
         match self {
             Self::Console(api, creds) => api.wake_compute(ctx, &creds.info).await,
             Self::Link(_, _) => unreachable!("link auth flow doesn't support waking the compute"),
+            Self::Local(local) => Ok(Cached::new_uncached(local.node_info.clone())),
         }
     }
 
-    fn get_keys(&self) -> Option<&ComputeCredentialKeys> {
+    fn get_keys(&self) -> &ComputeCredentialKeys {
         match self {
-            Self::Console(_, creds) => Some(&creds.keys),
-            Self::Link(_, _) => None,
+            Self::Console(_, creds) => &creds.keys,
+            Self::Link(_, _) => &ComputeCredentialKeys::None,
+            Self::Local(_) => &ComputeCredentialKeys::None,
         }
     }
 }
diff --git a/proxy/src/auth/backend/local.rs b/proxy/src/auth/backend/local.rs
new file mode 100644
index 0000000000..6d18564dd6
--- /dev/null
+++ b/proxy/src/auth/backend/local.rs
@@ -0,0 +1,79 @@
+use std::{collections::HashMap, net::SocketAddr};
+
+use anyhow::Context;
+use arc_swap::ArcSwapOption;
+
+use crate::{
+    compute::ConnCfg,
+    console::{
+        messages::{ColdStartInfo, EndpointJwksResponse, MetricsAuxInfo},
+        NodeInfo,
+    },
+    intern::{BranchIdInt, BranchIdTag, EndpointIdTag, InternId, ProjectIdInt, ProjectIdTag},
+    RoleName,
+};
+
+use super::jwt::{AuthRule, FetchAuthRules, JwkCache};
+
+pub struct LocalBackend {
+    pub jwks_cache: JwkCache,
+    pub postgres_addr: SocketAddr,
+    pub node_info: NodeInfo,
+}
+
+impl LocalBackend {
+    pub fn new(postgres_addr: SocketAddr) -> Self {
+        LocalBackend {
+            jwks_cache: JwkCache::default(),
+            postgres_addr,
+            node_info: NodeInfo {
+                config: {
+                    let mut cfg = ConnCfg::new();
+                    cfg.host(&postgres_addr.ip().to_string());
+                    cfg.port(postgres_addr.port());
+                    cfg
+                },
+                // TODO(conrad): make this better reflect compute info rather than endpoint info.
+                aux: MetricsAuxInfo {
+                    endpoint_id: EndpointIdTag::get_interner().get_or_intern("local"),
+                    project_id: ProjectIdTag::get_interner().get_or_intern("local"),
+                    branch_id: BranchIdTag::get_interner().get_or_intern("local"),
+                    cold_start_info: ColdStartInfo::WarmCached,
+                },
+                allow_self_signed_compute: false,
+            },
+        }
+    }
+}
+
+#[derive(Clone, Copy)]
+pub struct StaticAuthRules;
+
+pub static JWKS_ROLE_MAP: ArcSwapOption<JwksRoleSettings> = ArcSwapOption::const_empty();
+
+#[derive(Debug, Clone)]
+pub struct JwksRoleSettings {
+    pub roles: HashMap<RoleName, EndpointJwksResponse>,
+    pub project_id: ProjectIdInt,
+    pub branch_id: BranchIdInt,
+}
+
+impl FetchAuthRules for StaticAuthRules {
+    async fn fetch_auth_rules(&self, role_name: RoleName) -> anyhow::Result<Vec<AuthRule>> {
+        let mappings = JWKS_ROLE_MAP.load();
+        let role_mappings = mappings
+            .as_deref()
+            .and_then(|m| m.roles.get(&role_name))
+            .context("JWKs settings for this role were not configured")?;
+        let mut rules = vec![];
+        for setting in &role_mappings.jwks {
+            rules.push(AuthRule {
+                id: setting.id.clone(),
+                jwks_url: setting.jwks_url.clone(),
+                audience: setting.jwt_audience.clone(),
+            });
+        }
+
+        Ok(rules)
+    }
+}
diff --git a/proxy/src/console/messages.rs b/proxy/src/console/messages.rs
index ac66e116d0..a7ccf076b0 100644
--- a/proxy/src/console/messages.rs
+++ b/proxy/src/console/messages.rs
@@ -1,11 +1,13 @@
 use measured::FixedCardinalityLabel;
 use serde::{Deserialize, Serialize};
+use std::collections::HashMap;
 use std::fmt::{self, Display};
 
 use crate::auth::IpPattern;
 
 use crate::intern::{BranchIdInt, EndpointIdInt, ProjectIdInt};
 use crate::proxy::retry::CouldRetry;
+use crate::RoleName;
 
 /// Generic error response with human-readable description.
 /// Note that we can't always present it to user as is.
@@ -341,6 +343,26 @@ impl ColdStartInfo {
     }
 }
 
+#[derive(Debug, Deserialize, Clone)]
+pub struct JwksRoleMapping {
+    pub roles: HashMap<RoleName, EndpointJwksResponse>,
+}
+
+#[derive(Debug, Deserialize, Clone)]
+pub struct EndpointJwksResponse {
+    pub jwks: Vec<JwksSettings>,
+}
+
+#[derive(Debug, Deserialize, Clone)]
+pub struct JwksSettings {
+    pub id: String,
+    pub project_id: ProjectIdInt,
+    pub branch_id: BranchIdInt,
+    pub jwks_url: url::Url,
+    pub provider_name: String,
+    pub jwt_audience: Option<String>,
+}
+
 #[cfg(test)]
 mod tests {
     use super::*;
diff --git a/proxy/src/console/provider.rs b/proxy/src/console/provider.rs
index cc2ee10062..4794527410 100644
--- a/proxy/src/console/provider.rs
+++ b/proxy/src/console/provider.rs
@@ -305,6 +305,7 @@ impl NodeInfo {
         match keys {
             ComputeCredentialKeys::Password(password) => self.config.password(password),
             ComputeCredentialKeys::AuthKeys(auth_keys) => self.config.auth_keys(*auth_keys),
+            ComputeCredentialKeys::None => &mut self.config,
         };
     }
 }
diff --git a/proxy/src/proxy/connect_compute.rs b/proxy/src/proxy/connect_compute.rs
index f38e43ba5a..e1a54a9c98 100644
--- a/proxy/src/proxy/connect_compute.rs
+++ b/proxy/src/proxy/connect_compute.rs
@@ -61,7 +61,7 @@ pub trait ComputeConnectBackend {
         ctx: &RequestMonitoring,
     ) -> Result<CachedNodeInfo, console::errors::WakeComputeError>;
 
-    fn get_keys(&self) -> Option<&ComputeCredentialKeys>;
+    fn get_keys(&self) -> &ComputeCredentialKeys;
 }
 
 pub struct TcpMechanism<'a> {
@@ -112,9 +112,8 @@ where
     let mut num_retries = 0;
     let mut node_info =
         wake_compute(&mut num_retries, ctx, user_info, wake_compute_retry_config).await?;
-    if let Some(keys) = user_info.get_keys() {
-        node_info.set_keys(keys);
-    }
+
+    node_info.set_keys(user_info.get_keys());
     node_info.allow_self_signed_compute = allow_self_signed_compute;
     // let mut node_info = credentials.get_node_info(ctx, user_info).await?;
     mechanism.update_connect_config(&mut node_info.config);
diff --git a/proxy/src/serverless.rs b/proxy/src/serverless.rs
index b2bf93dc6d..ea65867293 100644
--- a/proxy/src/serverless.rs
+++ b/proxy/src/serverless.rs
@@ -407,7 +407,7 @@ async fn request_handler(
             .header("Access-Control-Allow-Origin", "*")
             .header(
                 "Access-Control-Allow-Headers",
-                "Neon-Connection-String, Neon-Raw-Text-Output, Neon-Array-Mode, Neon-Pool-Opt-In, Neon-Batch-Read-Only, Neon-Batch-Isolation-Level",
+                "Authorization, Neon-Connection-String, Neon-Raw-Text-Output, Neon-Array-Mode, Neon-Pool-Opt-In, Neon-Batch-Read-Only, Neon-Batch-Isolation-Level",
             )
             .header("Access-Control-Max-Age", "86400" /* 24 hours */)
             .status(StatusCode::OK) // 204 is also valid, but see: https://developer.mozilla.org/en-US/docs/Web/HTTP/Methods/OPTIONS#status_code
diff --git a/proxy/src/serverless/backend.rs b/proxy/src/serverless/backend.rs
index 295ea1a1c7..b44ecb76e3 100644
--- a/proxy/src/serverless/backend.rs
+++ b/proxy/src/serverless/backend.rs
@@ -4,7 +4,10 @@ use async_trait::async_trait;
 use tracing::{field::display, info};
 
 use crate::{
-    auth::{backend::ComputeCredentials, check_peer_addr_is_in_list, AuthError},
+    auth::{
+        backend::{local::StaticAuthRules, ComputeCredentials, ComputeUserInfo},
+        check_peer_addr_is_in_list, AuthError,
+    },
     compute,
     config::{AuthenticationConfig, ProxyConfig},
     console::{
@@ -24,7 +27,7 @@ use crate::{
     Host,
 };
 
-use super::conn_pool::{poll_client, Client, ConnInfo, GlobalConnPool};
+use super::conn_pool::{poll_client, AuthData, Client, ConnInfo, GlobalConnPool};
 
 pub struct PoolingBackend {
     pub pool: Arc<GlobalConnPool<tokio_postgres::Client>>,
@@ -33,13 +36,14 @@ pub struct PoolingBackend {
 }
 
 impl PoolingBackend {
-    pub async fn authenticate(
+    pub async fn authenticate_with_password(
         &self,
         ctx: &RequestMonitoring,
         config: &AuthenticationConfig,
-        conn_info: &ConnInfo,
+        user_info: &ComputeUserInfo,
+        password: &[u8],
     ) -> Result<ComputeCredentials, AuthError> {
-        let user_info = conn_info.user_info.clone();
+        let user_info = user_info.clone();
         let backend = self.config.auth_backend.as_ref().map(|_| user_info.clone());
         let (allowed_ips, maybe_secret) = backend.get_allowed_ips_and_secret(ctx).await?;
         if !check_peer_addr_is_in_list(&ctx.peer_addr(), &allowed_ips) {
@@ -47,7 +51,7 @@ impl PoolingBackend {
         }
         if !self
             .endpoint_rate_limiter
-            .check(conn_info.user_info.endpoint.clone().into(), 1)
+            .check(user_info.endpoint.clone().into(), 1)
         {
             return Err(AuthError::too_many_connections());
         }
@@ -70,14 +74,10 @@ impl PoolingBackend {
                 return Err(AuthError::auth_failed(&*user_info.user));
             }
         };
-        let ep = EndpointIdInt::from(&conn_info.user_info.endpoint);
-        let auth_outcome = crate::auth::validate_password_and_exchange(
-            &config.thread_pool,
-            ep,
-            &conn_info.password,
-            secret,
-        )
-        .await?;
+        let ep = EndpointIdInt::from(&user_info.endpoint);
+        let auth_outcome =
+            crate::auth::validate_password_and_exchange(&config.thread_pool, ep, password, secret)
+                .await?;
         let res = match auth_outcome {
             crate::sasl::Outcome::Success(key) => {
                 info!("user successfully authenticated");
@@ -85,7 +85,7 @@ impl PoolingBackend {
             }
             crate::sasl::Outcome::Failure(reason) => {
                 info!("auth backend failed with an error: {reason}");
-                Err(AuthError::auth_failed(&*conn_info.user_info.user))
+                Err(AuthError::auth_failed(&*user_info.user))
             }
         };
         res.map(|key| ComputeCredentials {
@@ -94,6 +94,39 @@ impl PoolingBackend {
         })
     }
 
+    pub async fn authenticate_with_jwt(
+        &self,
+        ctx: &RequestMonitoring,
+        user_info: &ComputeUserInfo,
+        jwt: &str,
+    ) -> Result<ComputeCredentials, AuthError> {
+        match &self.config.auth_backend {
+            crate::auth::BackendType::Console(_, _) => {
+                Err(AuthError::auth_failed("JWT login is not yet supported"))
+            }
+            crate::auth::BackendType::Link(_, _) => Err(AuthError::auth_failed(
+                "JWT login over link proxy is not supported",
+            )),
+            crate::auth::BackendType::Local(cache) => {
+                cache
+                    .jwks_cache
+                    .check_jwt(
+                        ctx,
+                        user_info.endpoint.clone(),
+                        user_info.user.clone(),
+                        &StaticAuthRules,
+                        jwt,
+                    )
+                    .await
+                    .map_err(|e| AuthError::auth_failed(e.to_string()))?;
+                Ok(ComputeCredentials {
+                    info: user_info.clone(),
+                    keys: crate::auth::backend::ComputeCredentialKeys::None,
+                })
+            }
+        }
+    }
+
     // Wake up the destination if needed. Code here is a bit involved because
     // we reuse the code from the usual proxy and we need to prepare few structures
     // that this code expects.
@@ -232,10 +265,16 @@ impl ConnectMechanism for TokioMechanism {
         let mut config = (*node_info.config).clone();
         let config = config
             .user(&self.conn_info.user_info.user)
-            .password(&*self.conn_info.password)
             .dbname(&self.conn_info.dbname)
             .connect_timeout(timeout);
 
+        match &self.conn_info.auth {
+            AuthData::Jwt(_) => {}
+            AuthData::Password(pw) => {
+                config.password(pw);
+            }
+        }
+
         let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Compute);
         let res = config.connect(tokio_postgres::NoTls).await;
         drop(pause);
diff --git a/proxy/src/serverless/conn_pool.rs b/proxy/src/serverless/conn_pool.rs
index 3478787995..6ed694af58 100644
--- a/proxy/src/serverless/conn_pool.rs
+++ b/proxy/src/serverless/conn_pool.rs
@@ -33,7 +33,13 @@ use super::backend::HttpConnError;
 pub struct ConnInfo {
     pub user_info: ComputeUserInfo,
     pub dbname: DbName,
-    pub password: SmallVec<[u8; 16]>,
+    pub auth: AuthData,
+}
+
+#[derive(Debug, Clone)]
+pub enum AuthData {
+    Password(SmallVec<[u8; 16]>),
+    Jwt(String),
 }
 
 impl ConnInfo {
@@ -778,7 +784,7 @@ mod tests {
                 options: Default::default(),
             },
             dbname: "dbname".into(),
-            password: "password".as_bytes().into(),
+            auth: AuthData::Password("password".as_bytes().into()),
         };
         let ep_pool = Arc::downgrade(
             &pool.get_or_create_endpoint_pool(&conn_info.endpoint_cache_key().unwrap()),
@@ -836,7 +842,7 @@ mod tests {
                 options: Default::default(),
             },
             dbname: "dbname".into(),
-            password: "password".as_bytes().into(),
+            auth: AuthData::Password("password".as_bytes().into()),
         };
         let ep_pool = Arc::downgrade(
             &pool.get_or_create_endpoint_pool(&conn_info.endpoint_cache_key().unwrap()),
diff --git a/proxy/src/serverless/sql_over_http.rs b/proxy/src/serverless/sql_over_http.rs
index bbfed90f39..79baef45f6 100644
--- a/proxy/src/serverless/sql_over_http.rs
+++ b/proxy/src/serverless/sql_over_http.rs
@@ -7,6 +7,7 @@ use futures::future::try_join;
 use futures::future::Either;
 use futures::StreamExt;
 use futures::TryFutureExt;
+use http::header::AUTHORIZATION;
 use http_body_util::BodyExt;
 use http_body_util::Full;
 use hyper1::body::Body;
@@ -56,6 +57,7 @@ use crate::DbName;
 use crate::RoleName;
 
 use super::backend::PoolingBackend;
+use super::conn_pool::AuthData;
 use super::conn_pool::Client;
 use super::conn_pool::ConnInfo;
 use super::http_util::json_response;
@@ -88,6 +90,7 @@ enum Payload {
 const MAX_RESPONSE_SIZE: usize = 10 * 1024 * 1024; // 10 MiB
 const MAX_REQUEST_SIZE: u64 = 10 * 1024 * 1024; // 10 MiB
 
+static CONN_STRING: HeaderName = HeaderName::from_static("neon-connection-string");
 static RAW_TEXT_OUTPUT: HeaderName = HeaderName::from_static("neon-raw-text-output");
 static ARRAY_MODE: HeaderName = HeaderName::from_static("neon-array-mode");
 static ALLOW_POOL: HeaderName = HeaderName::from_static("neon-pool-opt-in");
@@ -109,7 +112,7 @@ where
 #[derive(Debug, thiserror::Error)]
 pub enum ConnInfoError {
     #[error("invalid header: {0}")]
-    InvalidHeader(&'static str),
+    InvalidHeader(&'static HeaderName),
     #[error("invalid connection string: {0}")]
     UrlParseError(#[from] url::ParseError),
     #[error("incorrect scheme")]
@@ -153,10 +156,10 @@ fn get_conn_info(
     ctx.set_auth_method(crate::context::AuthMethod::Cleartext);
 
     let connection_string = headers
-        .get("Neon-Connection-String")
-        .ok_or(ConnInfoError::InvalidHeader("Neon-Connection-String"))?
+        .get(&CONN_STRING)
+        .ok_or(ConnInfoError::InvalidHeader(&CONN_STRING))?
         .to_str()
-        .map_err(|_| ConnInfoError::InvalidHeader("Neon-Connection-String"))?;
+        .map_err(|_| ConnInfoError::InvalidHeader(&CONN_STRING))?;
 
     let connection_url = Url::parse(connection_string)?;
 
@@ -179,10 +182,23 @@ fn get_conn_info(
     }
     ctx.set_user(username.clone());
 
-    let password = connection_url
-        .password()
-        .ok_or(ConnInfoError::MissingPassword)?;
-    let password = urlencoding::decode_binary(password.as_bytes());
+    let auth = if let Some(auth) = headers.get(&AUTHORIZATION) {
+        let auth = auth
+            .to_str()
+            .map_err(|_| ConnInfoError::InvalidHeader(&AUTHORIZATION))?;
+        AuthData::Jwt(
+            auth.strip_prefix("Bearer ")
+                .ok_or(ConnInfoError::MissingPassword)?
+                .into(),
+        )
+    } else if let Some(pass) = connection_url.password() {
+        AuthData::Password(match urlencoding::decode_binary(pass.as_bytes()) {
+            std::borrow::Cow::Borrowed(b) => b.into(),
+            std::borrow::Cow::Owned(b) => b.into(),
+        })
+    } else {
+        return Err(ConnInfoError::MissingPassword);
+    };
 
     let endpoint = match connection_url.host() {
         Some(url::Host::Domain(hostname)) => {
@@ -225,10 +241,7 @@ fn get_conn_info(
     Ok(ConnInfo {
         user_info,
         dbname,
-        password: match password {
-            std::borrow::Cow::Borrowed(b) => b.into(),
-            std::borrow::Cow::Owned(b) => b.into(),
-        },
+        auth,
     })
 }
 
@@ -550,9 +563,24 @@ async fn handle_inner(
 
     let authenticate_and_connect = Box::pin(
         async {
-            let keys = backend
-                .authenticate(ctx, &config.authentication_config, &conn_info)
-                .await?;
+            let keys = match &conn_info.auth {
+                AuthData::Password(pw) => {
+                    backend
+                        .authenticate_with_password(
+                            ctx,
+                            &config.authentication_config,
+                            &conn_info.user_info,
+                            pw,
+                        )
+                        .await?
+                }
+                AuthData::Jwt(jwt) => {
+                    backend
+                        .authenticate_with_jwt(ctx, &conn_info.user_info, jwt)
+                        .await?
+                }
+            };
+
             let client = backend
                 .connect_to_compute(ctx, conn_info, keys, !allow_pool)
                 .await?;

From 06795c6b9a6b4664dadd4c75ccf9f75087b05614 Mon Sep 17 00:00:00 2001
From: Conrad Ludgate <conrad@neon.tech>
Date: Fri, 23 Aug 2024 22:32:10 +0100
Subject: [PATCH 027/159] proxy: new local-proxy application (#8736)

Add binary for local-proxy that uses the local auth backend. Runs only
the http serverless driver support and offers config reload based on a
config file and SIGHUP
---
 proxy/src/bin/local_proxy.rs   | 316 +++++++++++++++++++++++++++++++++
 proxy/src/bin/pg_sni_router.rs |   4 +-
 proxy/src/bin/proxy.rs         |   7 +-
 proxy/src/lib.rs               |  14 +-
 4 files changed, 335 insertions(+), 6 deletions(-)
 create mode 100644 proxy/src/bin/local_proxy.rs

diff --git a/proxy/src/bin/local_proxy.rs b/proxy/src/bin/local_proxy.rs
new file mode 100644
index 0000000000..8acba33bac
--- /dev/null
+++ b/proxy/src/bin/local_proxy.rs
@@ -0,0 +1,316 @@
+use std::{
+    net::SocketAddr,
+    path::{Path, PathBuf},
+    pin::pin,
+    sync::Arc,
+    time::Duration,
+};
+
+use anyhow::{bail, ensure};
+use dashmap::DashMap;
+use futures::{future::Either, FutureExt};
+use proxy::{
+    auth::backend::local::{JwksRoleSettings, LocalBackend, JWKS_ROLE_MAP},
+    cancellation::CancellationHandlerMain,
+    config::{self, AuthenticationConfig, HttpConfig, ProxyConfig, RetryConfig},
+    console::{locks::ApiLocks, messages::JwksRoleMapping},
+    http::health_server::AppMetrics,
+    metrics::{Metrics, ThreadPoolMetrics},
+    rate_limiter::{BucketRateLimiter, EndpointRateLimiter, LeakyBucketConfig, RateBucketInfo},
+    scram::threadpool::ThreadPool,
+    serverless::{self, cancel_set::CancelSet, GlobalConnPoolOptions},
+};
+
+project_git_version!(GIT_VERSION);
+project_build_tag!(BUILD_TAG);
+
+use clap::Parser;
+use tokio::{net::TcpListener, task::JoinSet};
+use tokio_util::sync::CancellationToken;
+use tracing::{error, info, warn};
+use utils::{project_build_tag, project_git_version, sentry_init::init_sentry};
+
+#[global_allocator]
+static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc;
+
+/// Neon proxy/router
+#[derive(Parser)]
+#[command(version = GIT_VERSION, about)]
+struct LocalProxyCliArgs {
+    /// listen for incoming metrics connections on ip:port
+    #[clap(long, default_value = "127.0.0.1:7001")]
+    metrics: String,
+    /// listen for incoming http connections on ip:port
+    #[clap(long)]
+    http: String,
+    /// timeout for the TLS handshake
+    #[clap(long, default_value = "15s", value_parser = humantime::parse_duration)]
+    handshake_timeout: tokio::time::Duration,
+    /// lock for `connect_compute` api method. example: "shards=32,permits=4,epoch=10m,timeout=1s". (use `permits=0` to disable).
+    #[clap(long, default_value = config::ConcurrencyLockOptions::DEFAULT_OPTIONS_CONNECT_COMPUTE_LOCK)]
+    connect_compute_lock: String,
+    #[clap(flatten)]
+    sql_over_http: SqlOverHttpArgs,
+    /// User rate limiter max number of requests per second.
+    ///
+    /// Provided in the form `<Requests Per Second>@<Bucket Duration Size>`.
+    /// Can be given multiple times for different bucket sizes.
+    #[clap(long, default_values_t = RateBucketInfo::DEFAULT_ENDPOINT_SET)]
+    user_rps_limit: Vec<RateBucketInfo>,
+    /// Whether the auth rate limiter actually takes effect (for testing)
+    #[clap(long, default_value_t = false, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)]
+    auth_rate_limit_enabled: bool,
+    /// Authentication rate limiter max number of hashes per second.
+    #[clap(long, default_values_t = RateBucketInfo::DEFAULT_AUTH_SET)]
+    auth_rate_limit: Vec<RateBucketInfo>,
+    /// The IP subnet to use when considering whether two IP addresses are considered the same.
+    #[clap(long, default_value_t = 64)]
+    auth_rate_limit_ip_subnet: u8,
+    /// Whether to retry the connection to the compute node
+    #[clap(long, default_value = config::RetryConfig::CONNECT_TO_COMPUTE_DEFAULT_VALUES)]
+    connect_to_compute_retry: String,
+    /// Address of the postgres server
+    #[clap(long, default_value = "127.0.0.1:5432")]
+    compute: SocketAddr,
+    /// File address of the local proxy config file
+    #[clap(long, default_value = "./localproxy.json")]
+    config_path: PathBuf,
+}
+
+#[derive(clap::Args, Clone, Copy, Debug)]
+struct SqlOverHttpArgs {
+    /// How many connections to pool for each endpoint. Excess connections are discarded
+    #[clap(long, default_value_t = 200)]
+    sql_over_http_pool_max_total_conns: usize,
+
+    /// How long pooled connections should remain idle for before closing
+    #[clap(long, default_value = "5m", value_parser = humantime::parse_duration)]
+    sql_over_http_idle_timeout: tokio::time::Duration,
+
+    #[clap(long, default_value_t = 100)]
+    sql_over_http_client_conn_threshold: u64,
+
+    #[clap(long, default_value_t = 16)]
+    sql_over_http_cancel_set_shards: usize,
+}
+
+#[tokio::main]
+async fn main() -> anyhow::Result<()> {
+    let _logging_guard = proxy::logging::init().await?;
+    let _panic_hook_guard = utils::logging::replace_panic_hook_with_tracing_panic_hook();
+    let _sentry_guard = init_sentry(Some(GIT_VERSION.into()), &[]);
+
+    Metrics::install(Arc::new(ThreadPoolMetrics::new(0)));
+
+    info!("Version: {GIT_VERSION}");
+    info!("Build_tag: {BUILD_TAG}");
+    let neon_metrics = ::metrics::NeonMetrics::new(::metrics::BuildInfo {
+        revision: GIT_VERSION,
+        build_tag: BUILD_TAG,
+    });
+
+    let jemalloc = match proxy::jemalloc::MetricRecorder::new() {
+        Ok(t) => Some(t),
+        Err(e) => {
+            tracing::error!(error = ?e, "could not start jemalloc metrics loop");
+            None
+        }
+    };
+
+    let args = LocalProxyCliArgs::parse();
+    let config = build_config(&args)?;
+
+    let metrics_listener = TcpListener::bind(args.metrics).await?.into_std()?;
+    let http_listener = TcpListener::bind(args.http).await?;
+    let shutdown = CancellationToken::new();
+
+    // todo: should scale with CU
+    let endpoint_rate_limiter = Arc::new(EndpointRateLimiter::new_with_shards(
+        LeakyBucketConfig {
+            rps: 10.0,
+            max: 100.0,
+        },
+        16,
+    ));
+
+    refresh_config(args.config_path.clone()).await;
+
+    let mut maintenance_tasks = JoinSet::new();
+    maintenance_tasks.spawn(proxy::handle_signals(shutdown.clone(), move || {
+        refresh_config(args.config_path.clone()).map(Ok)
+    }));
+    maintenance_tasks.spawn(proxy::http::health_server::task_main(
+        metrics_listener,
+        AppMetrics {
+            jemalloc,
+            neon_metrics,
+            proxy: proxy::metrics::Metrics::get(),
+        },
+    ));
+
+    let task = serverless::task_main(
+        config,
+        http_listener,
+        shutdown.clone(),
+        Arc::new(CancellationHandlerMain::new(
+            Arc::new(DashMap::new()),
+            None,
+            proxy::metrics::CancellationSource::Local,
+        )),
+        endpoint_rate_limiter,
+    );
+
+    match futures::future::select(pin!(maintenance_tasks.join_next()), pin!(task)).await {
+        // exit immediately on maintenance task completion
+        Either::Left((Some(res), _)) => match proxy::flatten_err(res)? {},
+        // exit with error immediately if all maintenance tasks have ceased (should be caught by branch above)
+        Either::Left((None, _)) => bail!("no maintenance tasks running. invalid state"),
+        // exit immediately on client task error
+        Either::Right((res, _)) => res?,
+    }
+
+    Ok(())
+}
+
+/// ProxyConfig is created at proxy startup, and lives forever.
+fn build_config(args: &LocalProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> {
+    let config::ConcurrencyLockOptions {
+        shards,
+        limiter,
+        epoch,
+        timeout,
+    } = args.connect_compute_lock.parse()?;
+    info!(
+        ?limiter,
+        shards,
+        ?epoch,
+        "Using NodeLocks (connect_compute)"
+    );
+    let connect_compute_locks = ApiLocks::new(
+        "connect_compute_lock",
+        limiter,
+        shards,
+        timeout,
+        epoch,
+        &Metrics::get().proxy.connect_compute_lock,
+    )?;
+
+    let http_config = HttpConfig {
+        accept_websockets: false,
+        pool_options: GlobalConnPoolOptions {
+            gc_epoch: Duration::from_secs(60),
+            pool_shards: 2,
+            idle_timeout: args.sql_over_http.sql_over_http_idle_timeout,
+            opt_in: false,
+
+            max_conns_per_endpoint: args.sql_over_http.sql_over_http_pool_max_total_conns,
+            max_total_conns: args.sql_over_http.sql_over_http_pool_max_total_conns,
+        },
+        cancel_set: CancelSet::new(args.sql_over_http.sql_over_http_cancel_set_shards),
+        client_conn_threshold: args.sql_over_http.sql_over_http_client_conn_threshold,
+    };
+
+    Ok(Box::leak(Box::new(ProxyConfig {
+        tls_config: None,
+        auth_backend: proxy::auth::BackendType::Local(proxy::auth::backend::MaybeOwned::Owned(
+            LocalBackend::new(args.compute),
+        )),
+        metric_collection: None,
+        allow_self_signed_compute: false,
+        http_config,
+        authentication_config: AuthenticationConfig {
+            thread_pool: ThreadPool::new(0),
+            scram_protocol_timeout: Duration::from_secs(10),
+            rate_limiter_enabled: false,
+            rate_limiter: BucketRateLimiter::new(vec![]),
+            rate_limit_ip_subnet: 64,
+        },
+        require_client_ip: false,
+        handshake_timeout: Duration::from_secs(10),
+        region: "local".into(),
+        wake_compute_retry_config: RetryConfig::parse(RetryConfig::WAKE_COMPUTE_DEFAULT_VALUES)?,
+        connect_compute_locks,
+        connect_to_compute_retry_config: RetryConfig::parse(
+            RetryConfig::CONNECT_TO_COMPUTE_DEFAULT_VALUES,
+        )?,
+    })))
+}
+
+async fn refresh_config(path: PathBuf) {
+    match refresh_config_inner(&path).await {
+        Ok(()) => {}
+        Err(e) => {
+            error!(error=?e, ?path, "could not read config file");
+        }
+    }
+}
+
+async fn refresh_config_inner(path: &Path) -> anyhow::Result<()> {
+    let bytes = tokio::fs::read(&path).await?;
+    let mut data: JwksRoleMapping = serde_json::from_slice(&bytes)?;
+
+    let mut settings = None;
+
+    for mapping in data.roles.values_mut() {
+        for jwks in &mut mapping.jwks {
+            ensure!(
+                jwks.jwks_url.has_authority()
+                    && (jwks.jwks_url.scheme() == "http" || jwks.jwks_url.scheme() == "https"),
+                "Invalid JWKS url. Must be HTTP",
+            );
+
+            ensure!(
+                jwks.jwks_url
+                    .host()
+                    .is_some_and(|h| h != url::Host::Domain("")),
+                "Invalid JWKS url. No domain listed",
+            );
+
+            // clear username, password and ports
+            jwks.jwks_url.set_username("").expect(
+                "url can be a base and has a valid host and is not a file. should not error",
+            );
+            jwks.jwks_url.set_password(None).expect(
+                "url can be a base and has a valid host and is not a file. should not error",
+            );
+            // local testing is hard if we need to have a specific restricted port
+            if cfg!(not(feature = "testing")) {
+                jwks.jwks_url.set_port(None).expect(
+                    "url can be a base and has a valid host and is not a file. should not error",
+                );
+            }
+
+            // clear query params
+            jwks.jwks_url.set_fragment(None);
+            jwks.jwks_url.query_pairs_mut().clear().finish();
+
+            if jwks.jwks_url.scheme() != "https" {
+                // local testing is hard if we need to set up https support.
+                if cfg!(not(feature = "testing")) {
+                    jwks.jwks_url
+                        .set_scheme("https")
+                        .expect("should not error to set the scheme to https if it was http");
+                } else {
+                    warn!(scheme = jwks.jwks_url.scheme(), "JWKS url is not HTTPS");
+                }
+            }
+
+            let (pr, br) = settings.get_or_insert((jwks.project_id, jwks.branch_id));
+            ensure!(
+                *pr == jwks.project_id,
+                "inconsistent project IDs configured"
+            );
+            ensure!(*br == jwks.branch_id, "inconsistent branch IDs configured");
+        }
+    }
+
+    if let Some((project_id, branch_id)) = settings {
+        JWKS_ROLE_MAP.store(Some(Arc::new(JwksRoleSettings {
+            roles: data.roles,
+            project_id,
+            branch_id,
+        })));
+    }
+
+    Ok(())
+}
diff --git a/proxy/src/bin/pg_sni_router.rs b/proxy/src/bin/pg_sni_router.rs
index 1038fa5116..20d2d3df9a 100644
--- a/proxy/src/bin/pg_sni_router.rs
+++ b/proxy/src/bin/pg_sni_router.rs
@@ -133,7 +133,9 @@ async fn main() -> anyhow::Result<()> {
         proxy_listener,
         cancellation_token.clone(),
     ));
-    let signals_task = tokio::spawn(proxy::handle_signals(cancellation_token));
+    let signals_task = tokio::spawn(proxy::handle_signals(cancellation_token, || async {
+        Ok(())
+    }));
 
     // the signal task cant ever succeed.
     // the main task can error, or can succeed on cancellation.
diff --git a/proxy/src/bin/proxy.rs b/proxy/src/bin/proxy.rs
index d83a1f3bcf..1f45a33ed5 100644
--- a/proxy/src/bin/proxy.rs
+++ b/proxy/src/bin/proxy.rs
@@ -148,7 +148,7 @@ struct ProxyCliArgs {
     disable_dynamic_rate_limiter: bool,
     /// Endpoint rate limiter max number of requests per second.
     ///
-    /// Provided in the form '<Requests Per Second>@<Bucket Duration Size>'.
+    /// Provided in the form `<Requests Per Second>@<Bucket Duration Size>`.
     /// Can be given multiple times for different bucket sizes.
     #[clap(long, default_values_t = RateBucketInfo::DEFAULT_ENDPOINT_SET)]
     endpoint_rps_limit: Vec<RateBucketInfo>,
@@ -447,7 +447,10 @@ async fn main() -> anyhow::Result<()> {
 
     // maintenance tasks. these never return unless there's an error
     let mut maintenance_tasks = JoinSet::new();
-    maintenance_tasks.spawn(proxy::handle_signals(cancellation_token.clone()));
+    maintenance_tasks.spawn(proxy::handle_signals(
+        cancellation_token.clone(),
+        || async { Ok(()) },
+    ));
     maintenance_tasks.spawn(http::health_server::task_main(
         http_listener,
         AppMetrics {
diff --git a/proxy/src/lib.rs b/proxy/src/lib.rs
index b7d497ebcc..8e1a4e4fa2 100644
--- a/proxy/src/lib.rs
+++ b/proxy/src/lib.rs
@@ -88,7 +88,7 @@
 // List of temporarily allowed lints to unblock beta/nightly.
 #![allow(unknown_lints, clippy::manual_inspect)]
 
-use std::convert::Infallible;
+use std::{convert::Infallible, future::Future};
 
 use anyhow::{bail, Context};
 use intern::{EndpointIdInt, EndpointIdTag, InternId};
@@ -123,7 +123,14 @@ pub mod usage_metrics;
 pub mod waiters;
 
 /// Handle unix signals appropriately.
-pub async fn handle_signals(token: CancellationToken) -> anyhow::Result<Infallible> {
+pub async fn handle_signals<F, Fut>(
+    token: CancellationToken,
+    mut refresh_config: F,
+) -> anyhow::Result<Infallible>
+where
+    F: FnMut() -> Fut,
+    Fut: Future<Output = anyhow::Result<()>>,
+{
     use tokio::signal::unix::{signal, SignalKind};
 
     let mut hangup = signal(SignalKind::hangup())?;
@@ -134,7 +141,8 @@ pub async fn handle_signals(token: CancellationToken) -> anyhow::Result<Infallib
         tokio::select! {
             // Hangup is commonly used for config reload.
             _ = hangup.recv() => {
-                warn!("received SIGHUP; config reload is not supported");
+                warn!("received SIGHUP");
+                refresh_config().await?;
             }
             // Shut down the whole application.
             _ = interrupt.recv() => {

From cdfdcd3e5d665c6f8093623e7323cef3d58aa308 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20Ko=C5=82odziejczak?=
 <31549762+mrl5@users.noreply.github.com>
Date: Sun, 25 Aug 2024 17:33:45 +0200
Subject: [PATCH 028/159] chore: improve markdown formatting (#8825)

fixes:

![Screenshot_2024-08-25_16-25-30](https://github.com/user-attachments/assets/c993309b-6c2d-4938-9fd0-ce0953fc63ff)

fixes:

![Screenshot_2024-08-25_16-26-29](https://github.com/user-attachments/assets/cf497f4a-d9e3-45a6-a1a5-7e215d96d022)
---
 proxy/README.md            | 5 +++--
 storage_scrubber/README.md | 2 +-
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/proxy/README.md b/proxy/README.md
index d1f2e3f27b..afc8b77db8 100644
--- a/proxy/README.md
+++ b/proxy/README.md
@@ -36,7 +36,7 @@ To play with it locally one may start proxy over a local postgres installation
 ```
 
 If both postgres and proxy are running you may send a SQL query:
-```json
+```console
 curl -k -X POST 'https://proxy.localtest.me:4444/sql' \
   -H 'Neon-Connection-String: postgres://stas:pass@proxy.localtest.me:4444/postgres' \
   -H 'Content-Type: application/json' \
@@ -44,7 +44,8 @@ curl -k -X POST 'https://proxy.localtest.me:4444/sql' \
     "query":"SELECT $1::int[] as arr, $2::jsonb as obj, 42 as num",
     "params":[ "{{1,2},{\"3\",4}}", {"key":"val", "ikey":4242}]
   }' | jq
-
+```
+```json
 {
   "command": "SELECT",
   "fields": [
diff --git a/storage_scrubber/README.md b/storage_scrubber/README.md
index 9fbd92feef..5be8541419 100644
--- a/storage_scrubber/README.md
+++ b/storage_scrubber/README.md
@@ -98,7 +98,7 @@ to list timelines and find their backup and start LSNs.
 
 If S3 state is altered first manually, pageserver in-memory state will contain wrong data about S3 state, and tenants/timelines may get recreated on S3 (due to any layer upload due to compaction, pageserver restart, etc.). So before proceeding, for tenants/timelines which are already deleted in the console, we must remove these from pageservers.
 
-First, we need to group pageservers by buckets, `https://<admin host>/admin/pageservers`` can be used for all env nodes, then `cat /storage/pageserver/data/pageserver.toml` on every node will show the bucket names and regions needed.
+First, we need to group pageservers by buckets, `https://<admin host>/admin/pageservers` can be used for all env nodes, then `cat /storage/pageserver/data/pageserver.toml` on every node will show the bucket names and regions needed.
 
 Per bucket, for every pageserver id related, find deleted tenants:
 

From d48229f50f9253026267083f9b31513754d03365 Mon Sep 17 00:00:00 2001
From: Alexey Kondratov <kondratov.aleksey@gmail.com>
Date: Mon, 26 Aug 2024 14:34:18 +0200
Subject: [PATCH 029/159] feat(compute): Introduce new
 compute_subscriptions_count metric (#8796)

## Problem

We need some metric to sneak peek into how many people use inbound
logical replication (Neon is a subscriber).

## Summary of changes

This commit adds a new metric `compute_subscriptions_count`, which is
number of subscriptions grouped by enabled/disabled state.

Resolves: neondatabase/cloud#16146
---
 vm-image-spec.yaml | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/vm-image-spec.yaml b/vm-image-spec.yaml
index 622004b931..0bacb63509 100644
--- a/vm-image-spec.yaml
+++ b/vm-image-spec.yaml
@@ -356,6 +356,17 @@ files:
           from pg_replication_slots
           where slot_type = 'logical';
 
+      - metric_name: compute_subscriptions_count
+        type: gauge
+        help: 'Number of logical replication subscriptions grouped by enabled/disabled'
+        key_labels:
+          - enabled
+        values: [subscriptions_count]
+        query: |
+          select subenabled::text as enabled, count(*) as subscriptions_count
+          from pg_subscription
+          group by subenabled;
+
       - metric_name: retained_wal
         type: gauge
         help: 'Retained WAL in inactive replication slots'

From d6eede515a18283b3110ad98ab5dd50f029a38e2 Mon Sep 17 00:00:00 2001
From: Folke Behrens <folke@neon.tech>
Date: Mon, 26 Aug 2024 15:16:54 +0200
Subject: [PATCH 030/159] proxy: clippy lints: handle some low hanging fruit
 (#8829)

Should be mostly uncontroversial ones.
---
 proxy/src/auth/backend.rs                 | 37 ++++++++++----------
 proxy/src/auth/backend/jwt.rs             | 17 ++++------
 proxy/src/auth/credentials.rs             |  9 +++--
 proxy/src/cache/project_info.rs           |  8 ++---
 proxy/src/cancellation.rs                 |  3 +-
 proxy/src/compute.rs                      |  2 +-
 proxy/src/config.rs                       |  2 +-
 proxy/src/console/provider/mock.rs        | 22 +++++-------
 proxy/src/console/provider/neon.rs        | 10 +++---
 proxy/src/lib.rs                          | 31 ++++++-----------
 proxy/src/proxy.rs                        |  6 ++--
 proxy/src/proxy/connect_compute.rs        |  7 ++--
 proxy/src/proxy/copy_bidirectional.rs     |  7 ++--
 proxy/src/proxy/tests.rs                  |  6 ++--
 proxy/src/proxy/tests/mitm.rs             |  4 +--
 proxy/src/rate_limiter/leaky_bucket.rs    |  1 +
 proxy/src/rate_limiter/limit_algorithm.rs |  3 +-
 proxy/src/redis/notifications.rs          |  2 +-
 proxy/src/scram/messages.rs               |  2 +-
 proxy/src/scram/secret.rs                 |  8 +----
 proxy/src/scram/threadpool.rs             |  7 ++--
 proxy/src/serverless.rs                   |  4 +--
 proxy/src/serverless/backend.rs           | 20 ++++++-----
 proxy/src/serverless/conn_pool.rs         | 41 ++++++++++++-----------
 proxy/src/serverless/json.rs              |  2 +-
 proxy/src/serverless/sql_over_http.rs     |  4 +--
 proxy/src/stream.rs                       |  2 +-
 proxy/src/usage_metrics.rs                |  9 ++---
 proxy/src/waiters.rs                      |  2 +-
 29 files changed, 128 insertions(+), 150 deletions(-)

diff --git a/proxy/src/auth/backend.rs b/proxy/src/auth/backend.rs
index ae72bc6de3..bb9a0ddffc 100644
--- a/proxy/src/auth/backend.rs
+++ b/proxy/src/auth/backend.rs
@@ -85,7 +85,7 @@ pub trait TestBackend: Send + Sync + 'static {
 impl std::fmt::Display for BackendType<'_, (), ()> {
     fn fmt(&self, fmt: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
         match self {
-            Self::Console(api, _) => match &**api {
+            Self::Console(api, ()) => match &**api {
                 ConsoleBackend::Console(endpoint) => {
                     fmt.debug_tuple("Console").field(&endpoint.url()).finish()
                 }
@@ -96,7 +96,7 @@ impl std::fmt::Display for BackendType<'_, (), ()> {
                 #[cfg(test)]
                 ConsoleBackend::Test(_) => fmt.debug_tuple("Test").finish(),
             },
-            Self::Link(url, _) => fmt.debug_tuple("Link").field(&url.as_str()).finish(),
+            Self::Link(url, ()) => fmt.debug_tuple("Link").field(&url.as_str()).finish(),
             Self::Local(_) => fmt.debug_tuple("Local").finish(),
         }
     }
@@ -324,21 +324,20 @@ async fn auth_quirks(
     };
     let (cached_entry, secret) = cached_secret.take_value();
 
-    let secret = match secret {
-        Some(secret) => config.check_rate_limit(
+    let secret = if let Some(secret) = secret {
+        config.check_rate_limit(
             ctx,
             config,
             secret,
             &info.endpoint,
             unauthenticated_password.is_some() || allow_cleartext,
-        )?,
-        None => {
-            // If we don't have an authentication secret, we mock one to
-            // prevent malicious probing (possible due to missing protocol steps).
-            // This mocked secret will never lead to successful authentication.
-            info!("authentication info not found, mocking it");
-            AuthSecret::Scram(scram::ServerSecret::mock(rand::random()))
-        }
+        )?
+    } else {
+        // If we don't have an authentication secret, we mock one to
+        // prevent malicious probing (possible due to missing protocol steps).
+        // This mocked secret will never lead to successful authentication.
+        info!("authentication info not found, mocking it");
+        AuthSecret::Scram(scram::ServerSecret::mock(rand::random()))
     };
 
     match authenticate_with_secret(
@@ -409,7 +408,7 @@ impl<'a> BackendType<'a, ComputeUserInfoMaybeEndpoint, &()> {
     pub fn get_endpoint(&self) -> Option<EndpointId> {
         match self {
             Self::Console(_, user_info) => user_info.endpoint_id.clone(),
-            Self::Link(_, _) => Some("link".into()),
+            Self::Link(_, ()) => Some("link".into()),
             Self::Local(_) => Some("local".into()),
         }
     }
@@ -418,7 +417,7 @@ impl<'a> BackendType<'a, ComputeUserInfoMaybeEndpoint, &()> {
     pub fn get_user(&self) -> &str {
         match self {
             Self::Console(_, user_info) => &user_info.user,
-            Self::Link(_, _) => "link",
+            Self::Link(_, ()) => "link",
             Self::Local(_) => "local",
         }
     }
@@ -454,7 +453,7 @@ impl<'a> BackendType<'a, ComputeUserInfoMaybeEndpoint, &()> {
                 BackendType::Console(api, credentials)
             }
             // NOTE: this auth backend doesn't use client credentials.
-            Self::Link(url, _) => {
+            Self::Link(url, ()) => {
                 info!("performing link authentication");
 
                 let info = link::authenticate(ctx, &url, client).await?;
@@ -478,7 +477,7 @@ impl BackendType<'_, ComputeUserInfo, &()> {
     ) -> Result<CachedRoleSecret, GetAuthInfoError> {
         match self {
             Self::Console(api, user_info) => api.get_role_secret(ctx, user_info).await,
-            Self::Link(_, _) => Ok(Cached::new_uncached(None)),
+            Self::Link(_, ()) => Ok(Cached::new_uncached(None)),
             Self::Local(_) => Ok(Cached::new_uncached(None)),
         }
     }
@@ -489,7 +488,7 @@ impl BackendType<'_, ComputeUserInfo, &()> {
     ) -> Result<(CachedAllowedIps, Option<CachedRoleSecret>), GetAuthInfoError> {
         match self {
             Self::Console(api, user_info) => api.get_allowed_ips_and_secret(ctx, user_info).await,
-            Self::Link(_, _) => Ok((Cached::new_uncached(Arc::new(vec![])), None)),
+            Self::Link(_, ()) => Ok((Cached::new_uncached(Arc::new(vec![])), None)),
             Self::Local(_) => Ok((Cached::new_uncached(Arc::new(vec![])), None)),
         }
     }
@@ -525,7 +524,7 @@ impl ComputeConnectBackend for BackendType<'_, ComputeCredentials, &()> {
     ) -> Result<CachedNodeInfo, console::errors::WakeComputeError> {
         match self {
             Self::Console(api, creds) => api.wake_compute(ctx, &creds.info).await,
-            Self::Link(_, _) => unreachable!("link auth flow doesn't support waking the compute"),
+            Self::Link(_, ()) => unreachable!("link auth flow doesn't support waking the compute"),
             Self::Local(local) => Ok(Cached::new_uncached(local.node_info.clone())),
         }
     }
@@ -533,7 +532,7 @@ impl ComputeConnectBackend for BackendType<'_, ComputeCredentials, &()> {
     fn get_keys(&self) -> &ComputeCredentialKeys {
         match self {
             Self::Console(_, creds) => &creds.keys,
-            Self::Link(_, _) => &ComputeCredentialKeys::None,
+            Self::Link(_, ()) => &ComputeCredentialKeys::None,
             Self::Local(_) => &ComputeCredentialKeys::None,
         }
     }
diff --git a/proxy/src/auth/backend/jwt.rs b/proxy/src/auth/backend/jwt.rs
index 49d5de16c3..61833e19ed 100644
--- a/proxy/src/auth/backend/jwt.rs
+++ b/proxy/src/auth/backend/jwt.rs
@@ -224,10 +224,10 @@ impl JwkCacheEntryLock {
         // where Signature = alg(<B64(Header)> || . || <B64(Payload)>);
 
         let (header_payload, signature) = jwt
-            .rsplit_once(".")
+            .rsplit_once('.')
             .context("Provided authentication token is not a valid JWT encoding")?;
         let (header, payload) = header_payload
-            .split_once(".")
+            .split_once('.')
             .context("Provided authentication token is not a valid JWT encoding")?;
 
         let header = base64::decode_config(header, base64::URL_SAFE_NO_PAD)
@@ -320,14 +320,11 @@ impl JwkCache {
         // try with just a read lock first
         let key = (endpoint, role_name.clone());
         let entry = self.map.get(&key).as_deref().map(Arc::clone);
-        let entry = match entry {
-            Some(entry) => entry,
-            None => {
-                // acquire a write lock after to insert.
-                let entry = self.map.entry(key).or_default();
-                Arc::clone(&*entry)
-            }
-        };
+        let entry = entry.unwrap_or_else(|| {
+            // acquire a write lock after to insert.
+            let entry = self.map.entry(key).or_default();
+            Arc::clone(&*entry)
+        });
 
         entry
             .check_jwt(ctx, jwt, &self.client, role_name, fetch)
diff --git a/proxy/src/auth/credentials.rs b/proxy/src/auth/credentials.rs
index 849e7d65e8..cb06fcaf55 100644
--- a/proxy/src/auth/credentials.rs
+++ b/proxy/src/auth/credentials.rs
@@ -130,9 +130,12 @@ impl ComputeUserInfoMaybeEndpoint {
                 }))
             }
             // Invariant: project name may not contain certain characters.
-            (a, b) => a.or(b).map(|name| match project_name_valid(name.as_ref()) {
-                false => Err(ComputeUserInfoParseError::MalformedProjectName(name)),
-                true => Ok(name),
+            (a, b) => a.or(b).map(|name| {
+                if project_name_valid(name.as_ref()) {
+                    Ok(name)
+                } else {
+                    Err(ComputeUserInfoParseError::MalformedProjectName(name))
+                }
             }),
         }
         .transpose()?;
diff --git a/proxy/src/cache/project_info.rs b/proxy/src/cache/project_info.rs
index 10cc4ceee1..eda886a7af 100644
--- a/proxy/src/cache/project_info.rs
+++ b/proxy/src/cache/project_info.rs
@@ -274,13 +274,13 @@ impl ProjectInfoCacheImpl {
         let ttl_disabled_since_us = self
             .ttl_disabled_since_us
             .load(std::sync::atomic::Ordering::Relaxed);
-        let ignore_cache_since = if ttl_disabled_since_us != u64::MAX {
+        let ignore_cache_since = if ttl_disabled_since_us == u64::MAX {
+            None
+        } else {
             let ignore_cache_since = self.start_time + Duration::from_micros(ttl_disabled_since_us);
             // We are fine if entry is not older than ttl or was added before we are getting notifications.
             valid_since = valid_since.min(ignore_cache_since);
             Some(ignore_cache_since)
-        } else {
-            None
         };
         (valid_since, ignore_cache_since)
     }
@@ -306,7 +306,7 @@ impl ProjectInfoCacheImpl {
         let mut removed = 0;
         let shard = self.project2ep.shards()[shard].write();
         for (_, endpoints) in shard.iter() {
-            for endpoint in endpoints.get().iter() {
+            for endpoint in endpoints.get() {
                 self.cache.remove(endpoint);
                 removed += 1;
             }
diff --git a/proxy/src/cancellation.rs b/proxy/src/cancellation.rs
index 34512e9f5b..ea8f7b4070 100644
--- a/proxy/src/cancellation.rs
+++ b/proxy/src/cancellation.rs
@@ -220,7 +220,8 @@ mod tests {
 
     #[tokio::test]
     async fn cancel_session_noop_regression() {
-        let handler = CancellationHandler::<()>::new(Default::default(), CancellationSource::Local);
+        let handler =
+            CancellationHandler::<()>::new(CancelMap::default(), CancellationSource::Local);
         handler
             .cancel_session(
                 CancelKeyData {
diff --git a/proxy/src/compute.rs b/proxy/src/compute.rs
index c071a59d58..b6659f5dd0 100644
--- a/proxy/src/compute.rs
+++ b/proxy/src/compute.rs
@@ -286,7 +286,7 @@ impl ConnCfg {
 
         let client_config = if allow_self_signed_compute {
             // Allow all certificates for creating the connection
-            let verifier = Arc::new(AcceptEverythingVerifier) as Arc<dyn ServerCertVerifier>;
+            let verifier = Arc::new(AcceptEverythingVerifier);
             rustls::ClientConfig::builder()
                 .dangerous()
                 .with_custom_certificate_verifier(verifier)
diff --git a/proxy/src/config.rs b/proxy/src/config.rs
index a280aa88ce..6c42fb8d19 100644
--- a/proxy/src/config.rs
+++ b/proxy/src/config.rs
@@ -318,7 +318,7 @@ impl CertResolver {
             // a) Instead of multi-cert approach use single cert with extra
             //    domains listed in Subject Alternative Name (SAN).
             // b) Deploy separate proxy instances for extra domains.
-            self.default.as_ref().cloned()
+            self.default.clone()
         }
     }
 }
diff --git a/proxy/src/console/provider/mock.rs b/proxy/src/console/provider/mock.rs
index 2093da7562..4e8b7a9365 100644
--- a/proxy/src/console/provider/mock.rs
+++ b/proxy/src/console/provider/mock.rs
@@ -64,7 +64,7 @@ impl Api {
                 tokio_postgres::connect(self.endpoint.as_str(), tokio_postgres::NoTls).await?;
 
             tokio::spawn(connection);
-            let secret = match get_execute_postgres_query(
+            let secret = if let Some(entry) = get_execute_postgres_query(
                 &client,
                 "select rolpassword from pg_catalog.pg_authid where rolname = $1",
                 &[&&*user_info.user],
@@ -72,15 +72,12 @@ impl Api {
             )
             .await?
             {
-                Some(entry) => {
-                    info!("got a secret: {entry}"); // safe since it's not a prod scenario
-                    let secret = scram::ServerSecret::parse(&entry).map(AuthSecret::Scram);
-                    secret.or_else(|| parse_md5(&entry).map(AuthSecret::Md5))
-                }
-                None => {
-                    warn!("user '{}' does not exist", user_info.user);
-                    None
-                }
+                info!("got a secret: {entry}"); // safe since it's not a prod scenario
+                let secret = scram::ServerSecret::parse(&entry).map(AuthSecret::Scram);
+                secret.or_else(|| parse_md5(&entry).map(AuthSecret::Md5))
+            } else {
+                warn!("user '{}' does not exist", user_info.user);
+                None
             };
             let allowed_ips = match get_execute_postgres_query(
                 &client,
@@ -142,12 +139,11 @@ async fn get_execute_postgres_query(
     let rows = client.query(query, params).await?;
 
     // We can get at most one row, because `rolname` is unique.
-    let row = match rows.first() {
-        Some(row) => row,
+    let Some(row) = rows.first() else {
         // This means that the user doesn't exist, so there can be no secret.
         // However, this is still a *valid* outcome which is very similar
         // to getting `404 Not found` from the Neon console.
-        None => return Ok(None),
+        return Ok(None);
     };
 
     let entry = row.try_get(idx).map_err(MockApiError::PasswordNotSet)?;
diff --git a/proxy/src/console/provider/neon.rs b/proxy/src/console/provider/neon.rs
index 7eda238b66..a6c0e233fc 100644
--- a/proxy/src/console/provider/neon.rs
+++ b/proxy/src/console/provider/neon.rs
@@ -38,9 +38,9 @@ impl Api {
         locks: &'static ApiLocks<EndpointCacheKey>,
         wake_compute_endpoint_rate_limiter: Arc<WakeComputeRateLimiter>,
     ) -> Self {
-        let jwt: String = match std::env::var("NEON_PROXY_TO_CONTROLPLANE_TOKEN") {
+        let jwt = match std::env::var("NEON_PROXY_TO_CONTROLPLANE_TOKEN") {
             Ok(v) => v,
-            Err(_) => "".to_string(),
+            Err(_) => String::new(),
         };
         Self {
             endpoint,
@@ -96,10 +96,10 @@ impl Api {
                 // Error 404 is special: it's ok not to have a secret.
                 // TODO(anna): retry
                 Err(e) => {
-                    if e.get_reason().is_not_found() {
-                        return Ok(AuthInfo::default());
+                    return if e.get_reason().is_not_found() {
+                        Ok(AuthInfo::default())
                     } else {
-                        return Err(e.into());
+                        Err(e.into())
                     }
                 }
             };
diff --git a/proxy/src/lib.rs b/proxy/src/lib.rs
index 8e1a4e4fa2..1e14ca59ec 100644
--- a/proxy/src/lib.rs
+++ b/proxy/src/lib.rs
@@ -12,6 +12,8 @@
 // https://rust-lang.github.io/rust-clippy/master/index.html#?groups=restriction
 #![warn(
     clippy::undocumented_unsafe_blocks,
+    // TODO: Enable once all individual checks are enabled.
+    //clippy::as_conversions,
     clippy::dbg_macro,
     clippy::empty_enum_variants_with_brackets,
     clippy::exit,
@@ -31,8 +33,15 @@
 )]
 // List of permanently allowed lints.
 #![allow(
-    // It's ok to cast u8 to bool, etc.
+    // It's ok to cast bool to u8, etc.
     clippy::cast_lossless,
+    // Seems unavoidable.
+    clippy::multiple_crate_versions,
+    // While #[must_use] is a great feature this check is too noisy.
+    clippy::must_use_candidate,
+    // Inline consts, structs, fns, imports, etc. are ok if they're used by
+    // the following statement(s).
+    clippy::items_after_statements,
 )]
 // List of temporarily allowed lints.
 // TODO: Switch to except() once stable with 1.81.
@@ -43,46 +52,26 @@
     clippy::cast_possible_wrap,
     clippy::cast_precision_loss,
     clippy::cast_sign_loss,
-    clippy::default_trait_access,
     clippy::doc_markdown,
-    clippy::explicit_iter_loop,
-    clippy::float_cmp,
-    clippy::if_not_else,
-    clippy::ignored_unit_patterns,
     clippy::implicit_hasher,
-    clippy::inconsistent_struct_constructor,
     clippy::inline_always,
-    clippy::items_after_statements,
-    clippy::manual_assert,
-    clippy::manual_let_else,
-    clippy::manual_string_new,
-    clippy::match_bool,
     clippy::match_same_arms,
     clippy::match_wild_err_arm,
     clippy::missing_errors_doc,
     clippy::missing_panics_doc,
     clippy::module_name_repetitions,
-    clippy::multiple_crate_versions,
-    clippy::must_use_candidate,
-    clippy::needless_for_each,
     clippy::needless_pass_by_value,
     clippy::needless_raw_string_hashes,
-    clippy::option_as_ref_cloned,
     clippy::redundant_closure_for_method_calls,
-    clippy::redundant_else,
     clippy::return_self_not_must_use,
     clippy::similar_names,
-    clippy::single_char_pattern,
     clippy::single_match_else,
     clippy::struct_excessive_bools,
     clippy::struct_field_names,
     clippy::too_many_lines,
-    clippy::uninlined_format_args,
-    clippy::unnested_or_patterns,
     clippy::unreadable_literal,
     clippy::unused_async,
     clippy::unused_self,
-    clippy::used_underscore_binding,
     clippy::wildcard_imports
 )]
 // List of temporarily allowed lints to unblock beta/nightly.
diff --git a/proxy/src/proxy.rs b/proxy/src/proxy.rs
index 2182f38fe7..aa1025a29f 100644
--- a/proxy/src/proxy.rs
+++ b/proxy/src/proxy.rs
@@ -254,7 +254,7 @@ pub async fn handle_client<S: AsyncRead + AsyncWrite + Unpin>(
 
     let metrics = &Metrics::get().proxy;
     let proto = ctx.protocol();
-    let _request_gauge = metrics.connection_requests.guard(proto);
+    let request_gauge = metrics.connection_requests.guard(proto);
 
     let tls = config.tls_config.as_ref();
 
@@ -283,7 +283,7 @@ pub async fn handle_client<S: AsyncRead + AsyncWrite + Unpin>(
     let result = config
         .auth_backend
         .as_ref()
-        .map(|_| auth::ComputeUserInfoMaybeEndpoint::parse(ctx, &params, hostname, common_names))
+        .map(|()| auth::ComputeUserInfoMaybeEndpoint::parse(ctx, &params, hostname, common_names))
         .transpose();
 
     let user_info = match result {
@@ -340,7 +340,7 @@ pub async fn handle_client<S: AsyncRead + AsyncWrite + Unpin>(
         client: stream,
         aux: node.aux.clone(),
         compute: node,
-        req: _request_gauge,
+        req: request_gauge,
         conn: conn_gauge,
         cancel: session,
     }))
diff --git a/proxy/src/proxy/connect_compute.rs b/proxy/src/proxy/connect_compute.rs
index e1a54a9c98..6305dc204e 100644
--- a/proxy/src/proxy/connect_compute.rs
+++ b/proxy/src/proxy/connect_compute.rs
@@ -30,9 +30,10 @@ pub fn invalidate_cache(node_info: console::CachedNodeInfo) -> NodeInfo {
     if is_cached {
         warn!("invalidating stalled compute node info cache entry");
     }
-    let label = match is_cached {
-        true => ConnectionFailureKind::ComputeCached,
-        false => ConnectionFailureKind::ComputeUncached,
+    let label = if is_cached {
+        ConnectionFailureKind::ComputeCached
+    } else {
+        ConnectionFailureKind::ComputeUncached
     };
     Metrics::get().proxy.connection_failures_total.inc(label);
 
diff --git a/proxy/src/proxy/copy_bidirectional.rs b/proxy/src/proxy/copy_bidirectional.rs
index 048523f69c..f8c8e8bc4b 100644
--- a/proxy/src/proxy/copy_bidirectional.rs
+++ b/proxy/src/proxy/copy_bidirectional.rs
@@ -230,11 +230,10 @@ impl CopyBuffer {
                         io::ErrorKind::WriteZero,
                         "write zero byte into writer",
                     ))));
-                } else {
-                    self.pos += i;
-                    self.amt += i as u64;
-                    self.need_flush = true;
                 }
+                self.pos += i;
+                self.amt += i as u64;
+                self.need_flush = true;
             }
 
             // If pos larger than cap, this loop will never stop.
diff --git a/proxy/src/proxy/tests.rs b/proxy/src/proxy/tests.rs
index d8308c4f2a..21c0641a7f 100644
--- a/proxy/src/proxy/tests.rs
+++ b/proxy/src/proxy/tests.rs
@@ -433,7 +433,7 @@ impl ReportableError for TestConnectError {
 
 impl std::fmt::Display for TestConnectError {
     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        write!(f, "{:?}", self)
+        write!(f, "{self:?}")
     }
 }
 
@@ -475,7 +475,7 @@ impl ConnectMechanism for TestConnectMechanism {
                 retryable: false,
                 kind: ErrorKind::Compute,
             }),
-            x => panic!("expecting action {:?}, connect is called instead", x),
+            x => panic!("expecting action {x:?}, connect is called instead"),
         }
     }
 
@@ -515,7 +515,7 @@ impl TestBackend for TestConnectMechanism {
                 assert!(err.could_retry());
                 Err(console::errors::WakeComputeError::ApiError(err))
             }
-            x => panic!("expecting action {:?}, wake_compute is called instead", x),
+            x => panic!("expecting action {x:?}, wake_compute is called instead"),
         }
     }
 
diff --git a/proxy/src/proxy/tests/mitm.rs b/proxy/src/proxy/tests/mitm.rs
index 2d752b9183..71f07f4682 100644
--- a/proxy/src/proxy/tests/mitm.rs
+++ b/proxy/src/proxy/tests/mitm.rs
@@ -115,9 +115,7 @@ where
     let mut buf = [0];
     stream.read_exact(&mut buf).await.unwrap();
 
-    if buf[0] != b'S' {
-        panic!("ssl not supported by server");
-    }
+    assert!(buf[0] == b'S', "ssl not supported by server");
 
     tls.connect(stream).await.unwrap()
 }
diff --git a/proxy/src/rate_limiter/leaky_bucket.rs b/proxy/src/rate_limiter/leaky_bucket.rs
index 2d5e056540..f184e18f4c 100644
--- a/proxy/src/rate_limiter/leaky_bucket.rs
+++ b/proxy/src/rate_limiter/leaky_bucket.rs
@@ -119,6 +119,7 @@ impl Default for LeakyBucketState {
 }
 
 #[cfg(test)]
+#[allow(clippy::float_cmp)]
 mod tests {
     use std::time::Duration;
 
diff --git a/proxy/src/rate_limiter/limit_algorithm.rs b/proxy/src/rate_limiter/limit_algorithm.rs
index 80a62b2a76..bc16837f65 100644
--- a/proxy/src/rate_limiter/limit_algorithm.rs
+++ b/proxy/src/rate_limiter/limit_algorithm.rs
@@ -174,9 +174,8 @@ impl DynamicLimiter {
                     let mut inner = self.inner.lock();
                     if inner.take(&self.ready).is_some() {
                         break Ok(Token::new(self.clone()));
-                    } else {
-                        notified.set(self.ready.notified());
                     }
+                    notified.set(self.ready.notified());
                 }
                 notified.as_mut().await;
                 ready = true;
diff --git a/proxy/src/redis/notifications.rs b/proxy/src/redis/notifications.rs
index ad69246443..31c0e62c2c 100644
--- a/proxy/src/redis/notifications.rs
+++ b/proxy/src/redis/notifications.rs
@@ -150,7 +150,7 @@ impl<C: ProjectInfoCache + Send + Sync + 'static> MessageHandler<C> {
                     }
                 }
             }
-            _ => {
+            Notification::AllowedIpsUpdate { .. } | Notification::PasswordUpdate { .. } => {
                 invalidate_cache(self.cache.clone(), msg.clone());
                 if matches!(msg, Notification::AllowedIpsUpdate { .. }) {
                     Metrics::get()
diff --git a/proxy/src/scram/messages.rs b/proxy/src/scram/messages.rs
index 5ecbbf7004..54157e450d 100644
--- a/proxy/src/scram/messages.rs
+++ b/proxy/src/scram/messages.rs
@@ -89,7 +89,7 @@ impl<'a> ClientFirstMessage<'a> {
         write!(&mut message, "r={}", self.nonce).unwrap();
         base64::encode_config_buf(nonce, base64::STANDARD, &mut message);
         let combined_nonce = 2..message.len();
-        write!(&mut message, ",s={},i={}", salt_base64, iterations).unwrap();
+        write!(&mut message, ",s={salt_base64},i={iterations}").unwrap();
 
         // This design guarantees that it's impossible to create a
         // server-first-message without receiving a client-first-message
diff --git a/proxy/src/scram/secret.rs b/proxy/src/scram/secret.rs
index 44c4f9e44a..a08cb943c3 100644
--- a/proxy/src/scram/secret.rs
+++ b/proxy/src/scram/secret.rs
@@ -82,13 +82,7 @@ mod tests {
         let stored_key = "D5h6KTMBlUvDJk2Y8ELfC1Sjtc6k9YHjRyuRZyBNJns=";
         let server_key = "Pi3QHbcluX//NDfVkKlFl88GGzlJ5LkyPwcdlN/QBvI=";
 
-        let secret = format!(
-            "SCRAM-SHA-256${iterations}:{salt}${stored_key}:{server_key}",
-            iterations = iterations,
-            salt = salt,
-            stored_key = stored_key,
-            server_key = server_key,
-        );
+        let secret = format!("SCRAM-SHA-256${iterations}:{salt}${stored_key}:{server_key}");
 
         let parsed = ServerSecret::parse(&secret).unwrap();
         assert_eq!(parsed.iterations, iterations);
diff --git a/proxy/src/scram/threadpool.rs b/proxy/src/scram/threadpool.rs
index fa3d3ccca2..8fbaecf93d 100644
--- a/proxy/src/scram/threadpool.rs
+++ b/proxy/src/scram/threadpool.rs
@@ -222,12 +222,11 @@ fn thread_rt(pool: Arc<ThreadPool>, worker: Worker<JobSpec>, index: usize) {
         }
 
         for i in 0.. {
-            let mut job = match worker
+            let Some(mut job) = worker
                 .pop()
                 .or_else(|| pool.steal(&mut rng, index, &worker))
-            {
-                Some(job) => job,
-                None => continue 'wait,
+            else {
+                continue 'wait;
             };
 
             pool.metrics
diff --git a/proxy/src/serverless.rs b/proxy/src/serverless.rs
index ea65867293..d9a9019746 100644
--- a/proxy/src/serverless.rs
+++ b/proxy/src/serverless.rs
@@ -93,11 +93,11 @@ pub async fn task_main(
             let mut tls_server_config = rustls::ServerConfig::clone(&config.to_server_config());
             // prefer http2, but support http/1.1
             tls_server_config.alpn_protocols = vec![b"h2".to_vec(), b"http/1.1".to_vec()];
-            Arc::new(tls_server_config) as Arc<_>
+            Arc::new(tls_server_config)
         }
         None => {
             warn!("TLS config is missing");
-            Arc::new(NoTls) as Arc<_>
+            Arc::new(NoTls)
         }
     };
 
diff --git a/proxy/src/serverless/backend.rs b/proxy/src/serverless/backend.rs
index b44ecb76e3..9cc271c588 100644
--- a/proxy/src/serverless/backend.rs
+++ b/proxy/src/serverless/backend.rs
@@ -44,7 +44,11 @@ impl PoolingBackend {
         password: &[u8],
     ) -> Result<ComputeCredentials, AuthError> {
         let user_info = user_info.clone();
-        let backend = self.config.auth_backend.as_ref().map(|_| user_info.clone());
+        let backend = self
+            .config
+            .auth_backend
+            .as_ref()
+            .map(|()| user_info.clone());
         let (allowed_ips, maybe_secret) = backend.get_allowed_ips_and_secret(ctx).await?;
         if !check_peer_addr_is_in_list(&ctx.peer_addr(), &allowed_ips) {
             return Err(AuthError::ip_address_not_allowed(ctx.peer_addr()));
@@ -101,10 +105,10 @@ impl PoolingBackend {
         jwt: &str,
     ) -> Result<ComputeCredentials, AuthError> {
         match &self.config.auth_backend {
-            crate::auth::BackendType::Console(_, _) => {
+            crate::auth::BackendType::Console(_, ()) => {
                 Err(AuthError::auth_failed("JWT login is not yet supported"))
             }
-            crate::auth::BackendType::Link(_, _) => Err(AuthError::auth_failed(
+            crate::auth::BackendType::Link(_, ()) => Err(AuthError::auth_failed(
                 "JWT login over link proxy is not supported",
             )),
             crate::auth::BackendType::Local(cache) => {
@@ -138,12 +142,12 @@ impl PoolingBackend {
         keys: ComputeCredentials,
         force_new: bool,
     ) -> Result<Client<tokio_postgres::Client>, HttpConnError> {
-        let maybe_client = if !force_new {
-            info!("pool: looking for an existing connection");
-            self.pool.get(ctx, &conn_info)?
-        } else {
+        let maybe_client = if force_new {
             info!("pool: pool is disabled");
             None
+        } else {
+            info!("pool: looking for an existing connection");
+            self.pool.get(ctx, &conn_info)?
         };
 
         if let Some(client) = maybe_client {
@@ -152,7 +156,7 @@ impl PoolingBackend {
         let conn_id = uuid::Uuid::new_v4();
         tracing::Span::current().record("conn_id", display(conn_id));
         info!(%conn_id, "pool: opening a new connection '{conn_info}'");
-        let backend = self.config.auth_backend.as_ref().map(|_| keys);
+        let backend = self.config.auth_backend.as_ref().map(|()| keys);
         crate::proxy::connect_compute::connect_to_compute(
             ctx,
             &TokioMechanism {
diff --git a/proxy/src/serverless/conn_pool.rs b/proxy/src/serverless/conn_pool.rs
index 6ed694af58..476083d71e 100644
--- a/proxy/src/serverless/conn_pool.rs
+++ b/proxy/src/serverless/conn_pool.rs
@@ -339,9 +339,9 @@ impl<C: ClientInnerExt> GlobalConnPool<C> {
                 } = pool.get_mut();
 
                 // ensure that closed clients are removed
-                pools.iter_mut().for_each(|(_, db_pool)| {
+                for db_pool in pools.values_mut() {
                     clients_removed += db_pool.clear_closed_clients(total_conns);
-                });
+                }
 
                 // we only remove this pool if it has no active connections
                 if *total_conns == 0 {
@@ -405,21 +405,20 @@ impl<C: ClientInnerExt> GlobalConnPool<C> {
             if client.is_closed() {
                 info!("pool: cached connection '{conn_info}' is closed, opening a new one");
                 return Ok(None);
-            } else {
-                tracing::Span::current().record("conn_id", tracing::field::display(client.conn_id));
-                tracing::Span::current().record(
-                    "pid",
-                    tracing::field::display(client.inner.get_process_id()),
-                );
-                info!(
-                    cold_start_info = ColdStartInfo::HttpPoolHit.as_str(),
-                    "pool: reusing connection '{conn_info}'"
-                );
-                client.session.send(ctx.session_id())?;
-                ctx.set_cold_start_info(ColdStartInfo::HttpPoolHit);
-                ctx.success();
-                return Ok(Some(Client::new(client, conn_info.clone(), endpoint_pool)));
             }
+            tracing::Span::current().record("conn_id", tracing::field::display(client.conn_id));
+            tracing::Span::current().record(
+                "pid",
+                tracing::field::display(client.inner.get_process_id()),
+            );
+            info!(
+                cold_start_info = ColdStartInfo::HttpPoolHit.as_str(),
+                "pool: reusing connection '{conn_info}'"
+            );
+            client.session.send(ctx.session_id())?;
+            ctx.set_cold_start_info(ColdStartInfo::HttpPoolHit);
+            ctx.success();
+            return Ok(Some(Client::new(client, conn_info.clone(), endpoint_pool)));
         }
         Ok(None)
     }
@@ -660,7 +659,7 @@ impl<C: ClientInnerExt> Client<C> {
             span: _,
         } = self;
         let inner = inner.as_mut().expect("client inner should not be removed");
-        (&mut inner.inner, Discard { pool, conn_info })
+        (&mut inner.inner, Discard { conn_info, pool })
     }
 }
 
@@ -722,7 +721,9 @@ impl<C: ClientInnerExt> Drop for Client<C> {
 mod tests {
     use std::{mem, sync::atomic::AtomicBool};
 
-    use crate::{serverless::cancel_set::CancelSet, BranchId, EndpointId, ProjectId};
+    use crate::{
+        proxy::NeonOptions, serverless::cancel_set::CancelSet, BranchId, EndpointId, ProjectId,
+    };
 
     use super::*;
 
@@ -781,7 +782,7 @@ mod tests {
             user_info: ComputeUserInfo {
                 user: "user".into(),
                 endpoint: "endpoint".into(),
-                options: Default::default(),
+                options: NeonOptions::default(),
             },
             dbname: "dbname".into(),
             auth: AuthData::Password("password".as_bytes().into()),
@@ -839,7 +840,7 @@ mod tests {
             user_info: ComputeUserInfo {
                 user: "user".into(),
                 endpoint: "endpoint-2".into(),
-                options: Default::default(),
+                options: NeonOptions::default(),
             },
             dbname: "dbname".into(),
             auth: AuthData::Password("password".as_bytes().into()),
diff --git a/proxy/src/serverless/json.rs b/proxy/src/serverless/json.rs
index c22c63e85b..3776971fa1 100644
--- a/proxy/src/serverless/json.rs
+++ b/proxy/src/serverless/json.rs
@@ -55,7 +55,7 @@ fn json_array_to_pg_array(value: &Value) -> Option<String> {
                 .collect::<Vec<_>>()
                 .join(",");
 
-            Some(format!("{{{}}}", vals))
+            Some(format!("{{{vals}}}"))
         }
     }
 }
diff --git a/proxy/src/serverless/sql_over_http.rs b/proxy/src/serverless/sql_over_http.rs
index 79baef45f6..9143469eea 100644
--- a/proxy/src/serverless/sql_over_http.rs
+++ b/proxy/src/serverless/sql_over_http.rs
@@ -207,12 +207,12 @@ fn get_conn_info(
                     .ok_or(ConnInfoError::MalformedEndpoint)?
             } else {
                 hostname
-                    .split_once(".")
+                    .split_once('.')
                     .map_or(hostname, |(prefix, _)| prefix)
                     .into()
             }
         }
-        Some(url::Host::Ipv4(_)) | Some(url::Host::Ipv6(_)) | None => {
+        Some(url::Host::Ipv4(_) | url::Host::Ipv6(_)) | None => {
             return Err(ConnInfoError::MissingHostname)
         }
     };
diff --git a/proxy/src/stream.rs b/proxy/src/stream.rs
index 7809d2e574..ef13f5fc1a 100644
--- a/proxy/src/stream.rs
+++ b/proxy/src/stream.rs
@@ -67,7 +67,7 @@ impl<S: AsyncRead + Unpin> PqStream<S> {
             FeMessage::PasswordMessage(msg) => Ok(msg),
             bad => Err(io::Error::new(
                 io::ErrorKind::InvalidData,
-                format!("unexpected message type: {:?}", bad),
+                format!("unexpected message type: {bad:?}"),
             )),
         }
     }
diff --git a/proxy/src/usage_metrics.rs b/proxy/src/usage_metrics.rs
index a8735fe0bb..4cf6da7e2d 100644
--- a/proxy/src/usage_metrics.rs
+++ b/proxy/src/usage_metrics.rs
@@ -450,12 +450,9 @@ async fn upload_events_chunk(
     remote_path: &RemotePath,
     cancel: &CancellationToken,
 ) -> anyhow::Result<()> {
-    let storage = match storage {
-        Some(storage) => storage,
-        None => {
-            error!("no remote storage configured");
-            return Ok(());
-        }
+    let Some(storage) = storage else {
+        error!("no remote storage configured");
+        return Ok(());
     };
     let data = serde_json::to_vec(&chunk).context("serialize metrics")?;
     let mut encoder = GzipEncoder::new(Vec::new());
diff --git a/proxy/src/waiters.rs b/proxy/src/waiters.rs
index 3bd8f4c8ef..9f78242ed3 100644
--- a/proxy/src/waiters.rs
+++ b/proxy/src/waiters.rs
@@ -31,7 +31,7 @@ pub struct Waiters<T>(pub(self) Mutex<HashMap<String, oneshot::Sender<T>>>);
 
 impl<T> Default for Waiters<T> {
     fn default() -> Self {
-        Waiters(Default::default())
+        Waiters(Mutex::default())
     }
 }
 

From 2dd53e7ae0adf7c8a5856bb86a287eddd591718d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Arpad=20M=C3=BCller?= <arpad-m@users.noreply.github.com>
Date: Mon, 26 Aug 2024 17:30:19 +0200
Subject: [PATCH 031/159] Timeline archival test (#8824)

This PR:

* Implements the rule that archived timelines require all of their
children to be archived as well, as specified in the RFC. There is no
fancy locking mechanism though, so the precondition can still be broken.
As a TODO for later, we still allow unarchiving timelines with archived
parents.
* Adds an `is_archived` flag to `TimelineInfo`
* Adds timeline_archival_config to `PageserverHttpClient`
* Adds a new `test_timeline_archive` test, loosely based on
`test_timeline_delete`

Part of #8088
---
 libs/pageserver_api/src/models.rs            |  1 +
 pageserver/src/http/routes.rs                | 25 ++++-
 pageserver/src/tenant.rs                     | 70 ++++++++++++--
 test_runner/fixtures/common_types.py         |  7 ++
 test_runner/fixtures/pageserver/http.py      | 18 +++-
 test_runner/regress/test_timeline_archive.py | 96 ++++++++++++++++++++
 6 files changed, 207 insertions(+), 10 deletions(-)
 create mode 100644 test_runner/regress/test_timeline_archive.py

diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs
index d55c06b685..4cab56771b 100644
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -718,6 +718,7 @@ pub struct TimelineInfo {
     pub pg_version: u32,
 
     pub state: TimelineState,
+    pub is_archived: bool,
 
     pub walreceiver_status: String,
 
diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs
index 4635e76ea9..cbcc162b32 100644
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -318,6 +318,24 @@ impl From<crate::tenant::DeleteTimelineError> for ApiError {
     }
 }
 
+impl From<crate::tenant::TimelineArchivalError> for ApiError {
+    fn from(value: crate::tenant::TimelineArchivalError) -> Self {
+        use crate::tenant::TimelineArchivalError::*;
+        match value {
+            NotFound => ApiError::NotFound(anyhow::anyhow!("timeline not found").into()),
+            Timeout => ApiError::Timeout("hit pageserver internal timeout".into()),
+            HasUnarchivedChildren(children) => ApiError::PreconditionFailed(
+                format!(
+                    "Cannot archive timeline which has non-archived child timelines: {children:?}"
+                )
+                .into_boxed_str(),
+            ),
+            a @ AlreadyInProgress => ApiError::Conflict(a.to_string()),
+            Other(e) => ApiError::InternalServerError(e),
+        }
+    }
+}
+
 impl From<crate::tenant::mgr::DeleteTimelineError> for ApiError {
     fn from(value: crate::tenant::mgr::DeleteTimelineError) -> Self {
         use crate::tenant::mgr::DeleteTimelineError::*;
@@ -405,6 +423,8 @@ async fn build_timeline_info_common(
     let current_logical_size = timeline.get_current_logical_size(logical_size_task_priority, ctx);
     let current_physical_size = Some(timeline.layer_size_sum().await);
     let state = timeline.current_state();
+    // Report is_archived = false if the timeline is still loading
+    let is_archived = timeline.is_archived().unwrap_or(false);
     let remote_consistent_lsn_projected = timeline
         .get_remote_consistent_lsn_projected()
         .unwrap_or(Lsn(0));
@@ -445,6 +465,7 @@ async fn build_timeline_info_common(
         pg_version: timeline.pg_version,
 
         state,
+        is_archived,
 
         walreceiver_status,
 
@@ -686,9 +707,7 @@ async fn timeline_archival_config_handler(
 
         tenant
             .apply_timeline_archival_config(timeline_id, request_data.state)
-            .await
-            .context("applying archival config")
-            .map_err(ApiError::InternalServerError)?;
+            .await?;
         Ok::<_, ApiError>(())
     }
     .instrument(info_span!("timeline_archival_config",
diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index 3a7afff211..d3589a12c8 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -501,6 +501,38 @@ impl Debug for DeleteTimelineError {
     }
 }
 
+#[derive(thiserror::Error)]
+pub enum TimelineArchivalError {
+    #[error("NotFound")]
+    NotFound,
+
+    #[error("Timeout")]
+    Timeout,
+
+    #[error("HasUnarchivedChildren")]
+    HasUnarchivedChildren(Vec<TimelineId>),
+
+    #[error("Timeline archival is already in progress")]
+    AlreadyInProgress,
+
+    #[error(transparent)]
+    Other(#[from] anyhow::Error),
+}
+
+impl Debug for TimelineArchivalError {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        match self {
+            Self::NotFound => write!(f, "NotFound"),
+            Self::Timeout => write!(f, "Timeout"),
+            Self::HasUnarchivedChildren(c) => {
+                f.debug_tuple("HasUnarchivedChildren").field(c).finish()
+            }
+            Self::AlreadyInProgress => f.debug_tuple("AlreadyInProgress").finish(),
+            Self::Other(e) => f.debug_tuple("Other").field(e).finish(),
+        }
+    }
+}
+
 pub enum SetStoppingError {
     AlreadyStopping(completion::Barrier),
     Broken,
@@ -1326,24 +1358,50 @@ impl Tenant {
         &self,
         timeline_id: TimelineId,
         state: TimelineArchivalState,
-    ) -> anyhow::Result<()> {
-        let timeline = self
-            .get_timeline(timeline_id, false)
-            .context("Cannot apply timeline archival config to inexistent timeline")?;
+    ) -> Result<(), TimelineArchivalError> {
+        info!("setting timeline archival config");
+        let timeline = {
+            let timelines = self.timelines.lock().unwrap();
+
+            let timeline = match timelines.get(&timeline_id) {
+                Some(t) => t,
+                None => return Err(TimelineArchivalError::NotFound),
+            };
+
+            // Ensure that there are no non-archived child timelines
+            let children: Vec<TimelineId> = timelines
+                .iter()
+                .filter_map(|(id, entry)| {
+                    if entry.get_ancestor_timeline_id() != Some(timeline_id) {
+                        return None;
+                    }
+                    if entry.is_archived() == Some(true) {
+                        return None;
+                    }
+                    Some(*id)
+                })
+                .collect();
+
+            if !children.is_empty() && state == TimelineArchivalState::Archived {
+                return Err(TimelineArchivalError::HasUnarchivedChildren(children));
+            }
+            Arc::clone(timeline)
+        };
 
         let upload_needed = timeline
             .remote_client
             .schedule_index_upload_for_timeline_archival_state(state)?;
 
         if upload_needed {
+            info!("Uploading new state");
             const MAX_WAIT: Duration = Duration::from_secs(10);
             let Ok(v) =
                 tokio::time::timeout(MAX_WAIT, timeline.remote_client.wait_completion()).await
             else {
                 tracing::warn!("reached timeout for waiting on upload queue");
-                bail!("reached timeout for upload queue flush");
+                return Err(TimelineArchivalError::Timeout);
             };
-            v?;
+            v.map_err(|e| TimelineArchivalError::Other(anyhow::anyhow!(e)))?;
         }
         Ok(())
     }
diff --git a/test_runner/fixtures/common_types.py b/test_runner/fixtures/common_types.py
index 7cadcbb4c2..8eda19d1e2 100644
--- a/test_runner/fixtures/common_types.py
+++ b/test_runner/fixtures/common_types.py
@@ -1,5 +1,6 @@
 import random
 from dataclasses import dataclass
+from enum import Enum
 from functools import total_ordering
 from typing import Any, Dict, Type, TypeVar, Union
 
@@ -213,3 +214,9 @@ class TenantShardId:
 
     def __hash__(self) -> int:
         return hash(self._tuple())
+
+
+# TODO: Replace with `StrEnum` when we upgrade to python 3.11
+class TimelineArchivalState(str, Enum):
+    ARCHIVED = "Archived"
+    UNARCHIVED = "Unarchived"
diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py
index cd4261f1b8..582f9c0264 100644
--- a/test_runner/fixtures/pageserver/http.py
+++ b/test_runner/fixtures/pageserver/http.py
@@ -10,7 +10,7 @@ import requests
 from requests.adapters import HTTPAdapter
 from urllib3.util.retry import Retry
 
-from fixtures.common_types import Lsn, TenantId, TenantShardId, TimelineId
+from fixtures.common_types import Lsn, TenantId, TenantShardId, TimelineArchivalState, TimelineId
 from fixtures.log_helper import log
 from fixtures.metrics import Metrics, MetricsGetter, parse_metrics
 from fixtures.pg_version import PgVersion
@@ -621,6 +621,22 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
         )
         self.verbose_error(res)
 
+    def timeline_archival_config(
+        self,
+        tenant_id: Union[TenantId, TenantShardId],
+        timeline_id: TimelineId,
+        state: TimelineArchivalState,
+    ):
+        config = {"state": state.value}
+        log.info(
+            f"requesting timeline archival config {config} for tenant {tenant_id} and timeline {timeline_id}"
+        )
+        res = self.post(
+            f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/archival_config",
+            json=config,
+        )
+        self.verbose_error(res)
+
     def timeline_get_lsn_by_timestamp(
         self,
         tenant_id: Union[TenantId, TenantShardId],
diff --git a/test_runner/regress/test_timeline_archive.py b/test_runner/regress/test_timeline_archive.py
new file mode 100644
index 0000000000..b774c7c9fe
--- /dev/null
+++ b/test_runner/regress/test_timeline_archive.py
@@ -0,0 +1,96 @@
+import pytest
+from fixtures.common_types import TenantId, TimelineArchivalState, TimelineId
+from fixtures.neon_fixtures import (
+    NeonEnv,
+)
+from fixtures.pageserver.http import PageserverApiException
+
+
+def test_timeline_archive(neon_simple_env: NeonEnv):
+    env = neon_simple_env
+
+    env.pageserver.allowed_errors.extend(
+        [
+            ".*Timeline .* was not found.*",
+            ".*timeline not found.*",
+            ".*Cannot archive timeline which has unarchived child timelines.*",
+            ".*Precondition failed: Requested tenant is missing.*",
+        ]
+    )
+
+    ps_http = env.pageserver.http_client()
+
+    # first try to archive non existing timeline
+    # for existing tenant:
+    invalid_timeline_id = TimelineId.generate()
+    with pytest.raises(PageserverApiException, match="timeline not found") as exc:
+        ps_http.timeline_archival_config(
+            tenant_id=env.initial_tenant,
+            timeline_id=invalid_timeline_id,
+            state=TimelineArchivalState.ARCHIVED,
+        )
+
+    assert exc.value.status_code == 404
+
+    # for non existing tenant:
+    invalid_tenant_id = TenantId.generate()
+    with pytest.raises(
+        PageserverApiException,
+        match=f"NotFound: tenant {invalid_tenant_id}",
+    ) as exc:
+        ps_http.timeline_archival_config(
+            tenant_id=invalid_tenant_id,
+            timeline_id=invalid_timeline_id,
+            state=TimelineArchivalState.ARCHIVED,
+        )
+
+    assert exc.value.status_code == 404
+
+    # construct pair of branches to validate that pageserver prohibits
+    # archival of ancestor timelines when they have non-archived child branches
+    parent_timeline_id = env.neon_cli.create_branch("test_ancestor_branch_archive_parent", "empty")
+
+    leaf_timeline_id = env.neon_cli.create_branch(
+        "test_ancestor_branch_archive_branch1", "test_ancestor_branch_archive_parent"
+    )
+
+    timeline_path = env.pageserver.timeline_dir(env.initial_tenant, parent_timeline_id)
+
+    with pytest.raises(
+        PageserverApiException,
+        match="Cannot archive timeline which has non-archived child timelines",
+    ) as exc:
+        assert timeline_path.exists()
+
+        ps_http.timeline_archival_config(
+            tenant_id=env.initial_tenant,
+            timeline_id=parent_timeline_id,
+            state=TimelineArchivalState.ARCHIVED,
+        )
+
+    assert exc.value.status_code == 412
+
+    # Test timeline_detail
+    leaf_detail = ps_http.timeline_detail(
+        tenant_id=env.initial_tenant,
+        timeline_id=leaf_timeline_id,
+    )
+    assert leaf_detail["is_archived"] is False
+
+    # Test that archiving the leaf timeline and then the parent works
+    ps_http.timeline_archival_config(
+        tenant_id=env.initial_tenant,
+        timeline_id=leaf_timeline_id,
+        state=TimelineArchivalState.ARCHIVED,
+    )
+    leaf_detail = ps_http.timeline_detail(
+        tenant_id=env.initial_tenant,
+        timeline_id=leaf_timeline_id,
+    )
+    assert leaf_detail["is_archived"] is True
+
+    ps_http.timeline_archival_config(
+        tenant_id=env.initial_tenant,
+        timeline_id=parent_timeline_id,
+        state=TimelineArchivalState.ARCHIVED,
+    )

From 97241776aa6ca9612d580b2ef87caab8bd292c4e Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Mon, 26 Aug 2024 18:07:55 +0200
Subject: [PATCH 032/159] pageserver: startup: ensure local disk state is
 durable (#8835)

refs https://github.com/neondatabase/neon/issues/6989

Problem
-------

After unclean shutdown, we get restarted, start reading the local
filesystem,
and make decisions based on those reads. However, some of the data might
have
not yet been fsynced when the unclean shutdown completed.

Durability matters even though Pageservers are conceptually just a cache
of state in S3. For example:
- the cloud control plane is no control loop => pageserver responses
  to tenant attachmentm, etc, needs to be durable.
  - the storage controller does not rely on this (as much?)
- we don't have layer file checksumming, so, downloaded+renamed but not
  fsynced layer files are technically not to be trusted
  - https://github.com/neondatabase/neon/issues/2683

Solution
--------

`syncfs` the tenants directory during startup, before we start reading
from it.

This is a bit overkill because we do remove some temp files
(InMemoryLayer!)
later during startup. Further, these temp files are particularly likely
to
be dirty in the kernel page cache. However, we don't want to refactor
that
cleanup code right now, and the dirty data on pageservers is generally
not that high. Last, with [direct
IO](https://github.com/neondatabase/neon/issues/8130) we're going to
have near-zero kernel page cache anyway quite soon.
---
 Cargo.toml                       |  2 +-
 pageserver/src/bin/pageserver.rs | 52 ++++++++++++++++++++++++++++++--
 2 files changed, 50 insertions(+), 4 deletions(-)

diff --git a/Cargo.toml b/Cargo.toml
index 963841e340..e038c0b4ff 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -113,7 +113,7 @@ md5 = "0.7.0"
 measured = { version = "0.0.22", features=["lasso"] }
 measured-process = { version = "0.0.22" }
 memoffset = "0.8"
-nix = { version = "0.27", features = ["fs", "process", "socket", "signal", "poll"] }
+nix = { version = "0.27", features = ["dir", "fs", "process", "socket", "signal", "poll"] }
 notify = "6.0.0"
 num_cpus = "1.15"
 num-traits = "0.2.15"
diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs
index da0c11d9bf..7d404e50a5 100644
--- a/pageserver/src/bin/pageserver.rs
+++ b/pageserver/src/bin/pageserver.rs
@@ -126,10 +126,56 @@ fn main() -> anyhow::Result<()> {
     info!(?conf.virtual_file_direct_io, "starting with virtual_file Direct IO settings");
     info!(?conf.compact_level0_phase1_value_access, "starting with setting for compact_level0_phase1_value_access");
 
+    // The tenants directory contains all the pageserver local disk state.
+    // Create if not exists and make sure all the contents are durable before proceeding.
+    // Ensuring durability eliminates a whole bug class where we come up after an unclean shutdown.
+    // After unclea shutdown, we don't know if all the filesystem content we can read via syscalls is actually durable or not.
+    // Examples for that: OOM kill, systemd killing us during shutdown, self abort due to unrecoverable IO error.
     let tenants_path = conf.tenants_path();
-    if !tenants_path.exists() {
-        utils::crashsafe::create_dir_all(conf.tenants_path())
-            .with_context(|| format!("Failed to create tenants root dir at '{tenants_path}'"))?;
+    {
+        let open = || {
+            nix::dir::Dir::open(
+                tenants_path.as_std_path(),
+                nix::fcntl::OFlag::O_DIRECTORY | nix::fcntl::OFlag::O_RDONLY,
+                nix::sys::stat::Mode::empty(),
+            )
+        };
+        let dirfd = match open() {
+            Ok(dirfd) => dirfd,
+            Err(e) => match e {
+                nix::errno::Errno::ENOENT => {
+                    utils::crashsafe::create_dir_all(&tenants_path).with_context(|| {
+                        format!("Failed to create tenants root dir at '{tenants_path}'")
+                    })?;
+                    open().context("open tenants dir after creating it")?
+                }
+                e => anyhow::bail!(e),
+            },
+        };
+
+        let started = Instant::now();
+        // Linux guarantees durability for syncfs.
+        // POSIX doesn't have syncfs, and further does not actually guarantee durability of sync().
+        #[cfg(target_os = "linux")]
+        {
+            use std::os::fd::AsRawFd;
+            nix::unistd::syncfs(dirfd.as_raw_fd()).context("syncfs")?;
+        }
+        #[cfg(target_os = "macos")]
+        {
+            // macOS is not a production platform for Neon, don't even bother.
+            drop(dirfd);
+        }
+        #[cfg(not(any(target_os = "linux", target_os = "macos")))]
+        {
+            compile_error!("Unsupported OS");
+        }
+
+        let elapsed = started.elapsed();
+        info!(
+            elapsed_ms = elapsed.as_millis(),
+            "made tenant directory contents durable"
+        );
     }
 
     // Initialize up failpoints support

From 0f6568426342f80c6faba81af9827cc4ad9fa2d6 Mon Sep 17 00:00:00 2001
From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com>
Date: Tue, 27 Aug 2024 02:19:47 +0800
Subject: [PATCH 033/159] feat(pageserver): use split layer writer in
 gc-compaction (#8608)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Part of #8002, the final big PR in the batch.

## Summary of changes

This pull request uses the new split layer writer in the gc-compaction.

* It changes how layers are split. Previously, we split layers based on
the original split point, but this creates too many layers
(test_gc_feedback has one key per layer).
* Therefore, we first verify if the layer map can be processed by the
current algorithm (See https://github.com/neondatabase/neon/pull/8191,
it's basically the same check)
* On that, we proceed with the compaction. This way, it creates a large
enough layer close to the target layer size.
* Added a new set of functions `with_discard` in the split layer writer.
This helps us skip layers if we are going to produce the same persistent
key.
* The delta writer will keep the updates of the same key in a single
file. This might create a super large layer, but we can optimize it
later.
* The split layer writer is used in the gc-compaction algorithm, and it
will split layers based on size.
* Fix the image layer summary block encoded the wrong key range.

---------

Signed-off-by: Alex Chi Z <chi@neon.tech>
Co-authored-by: Arpad Müller <arpad-m@users.noreply.github.com>
Co-authored-by: Christian Schwarz <christian@neon.tech>
---
 libs/pageserver_api/src/key.rs                |   9 +
 pageserver/src/tenant.rs                      | 220 ++++++++-
 pageserver/src/tenant/storage_layer.rs        |   1 -
 .../src/tenant/storage_layer/delta_layer.rs   |   4 +-
 .../src/tenant/storage_layer/image_layer.rs   |  25 +-
 pageserver/src/tenant/storage_layer/layer.rs  |   2 +
 .../src/tenant/storage_layer/split_writer.rs  | 369 ++++++++++++---
 pageserver/src/tenant/timeline.rs             |  17 +-
 pageserver/src/tenant/timeline/compaction.rs  | 447 ++++++++----------
 9 files changed, 751 insertions(+), 343 deletions(-)

diff --git a/libs/pageserver_api/src/key.rs b/libs/pageserver_api/src/key.rs
index 2fdd7de38f..77da58d63e 100644
--- a/libs/pageserver_api/src/key.rs
+++ b/libs/pageserver_api/src/key.rs
@@ -236,6 +236,15 @@ impl Key {
         field5: u8::MAX,
         field6: u32::MAX,
     };
+    /// A key slightly smaller than [`Key::MAX`] for use in layer key ranges to avoid them to be confused with L0 layers
+    pub const NON_L0_MAX: Key = Key {
+        field1: u8::MAX,
+        field2: u32::MAX,
+        field3: u32::MAX,
+        field4: u32::MAX,
+        field5: u8::MAX,
+        field6: u32::MAX - 1,
+    };
 
     pub fn from_hex(s: &str) -> Result<Self> {
         if s.len() != 36 {
diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index d3589a12c8..0364d521b6 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -7071,18 +7071,14 @@ mod tests {
             vec![
                 // Image layer at GC horizon
                 PersistentLayerKey {
-                    key_range: {
-                        let mut key = Key::MAX;
-                        key.field6 -= 1;
-                        Key::MIN..key
-                    },
+                    key_range: Key::MIN..Key::NON_L0_MAX,
                     lsn_range: Lsn(0x30)..Lsn(0x31),
                     is_delta: false
                 },
-                // The delta layer that is cut in the middle
+                // The delta layer covers the full range (with the layer key hack to avoid being recognized as L0)
                 PersistentLayerKey {
-                    key_range: get_key(3)..get_key(4),
-                    lsn_range: Lsn(0x30)..Lsn(0x41),
+                    key_range: Key::MIN..Key::NON_L0_MAX,
+                    lsn_range: Lsn(0x30)..Lsn(0x48),
                     is_delta: true
                 },
                 // The delta3 layer that should not be picked for the compaction
@@ -8062,6 +8058,214 @@ mod tests {
         Ok(())
     }
 
+    #[tokio::test]
+    async fn test_simple_bottom_most_compaction_with_retain_lsns_single_key() -> anyhow::Result<()>
+    {
+        let harness =
+            TenantHarness::create("test_simple_bottom_most_compaction_with_retain_lsns_single_key")
+                .await?;
+        let (tenant, ctx) = harness.load().await;
+
+        fn get_key(id: u32) -> Key {
+            // using aux key here b/c they are guaranteed to be inside `collect_keyspace`.
+            let mut key = Key::from_hex("620000000033333333444444445500000000").unwrap();
+            key.field6 = id;
+            key
+        }
+
+        let img_layer = (0..10)
+            .map(|id| (get_key(id), Bytes::from(format!("value {id}@0x10"))))
+            .collect_vec();
+
+        let delta1 = vec![
+            (
+                get_key(1),
+                Lsn(0x20),
+                Value::WalRecord(NeonWalRecord::wal_append("@0x20")),
+            ),
+            (
+                get_key(1),
+                Lsn(0x28),
+                Value::WalRecord(NeonWalRecord::wal_append("@0x28")),
+            ),
+        ];
+        let delta2 = vec![
+            (
+                get_key(1),
+                Lsn(0x30),
+                Value::WalRecord(NeonWalRecord::wal_append("@0x30")),
+            ),
+            (
+                get_key(1),
+                Lsn(0x38),
+                Value::WalRecord(NeonWalRecord::wal_append("@0x38")),
+            ),
+        ];
+        let delta3 = vec![
+            (
+                get_key(8),
+                Lsn(0x48),
+                Value::WalRecord(NeonWalRecord::wal_append("@0x48")),
+            ),
+            (
+                get_key(9),
+                Lsn(0x48),
+                Value::WalRecord(NeonWalRecord::wal_append("@0x48")),
+            ),
+        ];
+
+        let tline = tenant
+            .create_test_timeline_with_layers(
+                TIMELINE_ID,
+                Lsn(0x10),
+                DEFAULT_PG_VERSION,
+                &ctx,
+                vec![
+                    // delta1 and delta 2 only contain a single key but multiple updates
+                    DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x10)..Lsn(0x30), delta1),
+                    DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x30)..Lsn(0x50), delta2),
+                    DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x10)..Lsn(0x50), delta3),
+                ], // delta layers
+                vec![(Lsn(0x10), img_layer)], // image layers
+                Lsn(0x50),
+            )
+            .await?;
+        {
+            // Update GC info
+            let mut guard = tline.gc_info.write().unwrap();
+            *guard = GcInfo {
+                retain_lsns: vec![
+                    (Lsn(0x10), tline.timeline_id),
+                    (Lsn(0x20), tline.timeline_id),
+                ],
+                cutoffs: GcCutoffs {
+                    time: Lsn(0x30),
+                    space: Lsn(0x30),
+                },
+                leases: Default::default(),
+                within_ancestor_pitr: false,
+            };
+        }
+
+        let expected_result = [
+            Bytes::from_static(b"value 0@0x10"),
+            Bytes::from_static(b"value 1@0x10@0x20@0x28@0x30@0x38"),
+            Bytes::from_static(b"value 2@0x10"),
+            Bytes::from_static(b"value 3@0x10"),
+            Bytes::from_static(b"value 4@0x10"),
+            Bytes::from_static(b"value 5@0x10"),
+            Bytes::from_static(b"value 6@0x10"),
+            Bytes::from_static(b"value 7@0x10"),
+            Bytes::from_static(b"value 8@0x10@0x48"),
+            Bytes::from_static(b"value 9@0x10@0x48"),
+        ];
+
+        let expected_result_at_gc_horizon = [
+            Bytes::from_static(b"value 0@0x10"),
+            Bytes::from_static(b"value 1@0x10@0x20@0x28@0x30"),
+            Bytes::from_static(b"value 2@0x10"),
+            Bytes::from_static(b"value 3@0x10"),
+            Bytes::from_static(b"value 4@0x10"),
+            Bytes::from_static(b"value 5@0x10"),
+            Bytes::from_static(b"value 6@0x10"),
+            Bytes::from_static(b"value 7@0x10"),
+            Bytes::from_static(b"value 8@0x10"),
+            Bytes::from_static(b"value 9@0x10"),
+        ];
+
+        let expected_result_at_lsn_20 = [
+            Bytes::from_static(b"value 0@0x10"),
+            Bytes::from_static(b"value 1@0x10@0x20"),
+            Bytes::from_static(b"value 2@0x10"),
+            Bytes::from_static(b"value 3@0x10"),
+            Bytes::from_static(b"value 4@0x10"),
+            Bytes::from_static(b"value 5@0x10"),
+            Bytes::from_static(b"value 6@0x10"),
+            Bytes::from_static(b"value 7@0x10"),
+            Bytes::from_static(b"value 8@0x10"),
+            Bytes::from_static(b"value 9@0x10"),
+        ];
+
+        let expected_result_at_lsn_10 = [
+            Bytes::from_static(b"value 0@0x10"),
+            Bytes::from_static(b"value 1@0x10"),
+            Bytes::from_static(b"value 2@0x10"),
+            Bytes::from_static(b"value 3@0x10"),
+            Bytes::from_static(b"value 4@0x10"),
+            Bytes::from_static(b"value 5@0x10"),
+            Bytes::from_static(b"value 6@0x10"),
+            Bytes::from_static(b"value 7@0x10"),
+            Bytes::from_static(b"value 8@0x10"),
+            Bytes::from_static(b"value 9@0x10"),
+        ];
+
+        let verify_result = || async {
+            let gc_horizon = {
+                let gc_info = tline.gc_info.read().unwrap();
+                gc_info.cutoffs.time
+            };
+            for idx in 0..10 {
+                assert_eq!(
+                    tline
+                        .get(get_key(idx as u32), Lsn(0x50), &ctx)
+                        .await
+                        .unwrap(),
+                    &expected_result[idx]
+                );
+                assert_eq!(
+                    tline
+                        .get(get_key(idx as u32), gc_horizon, &ctx)
+                        .await
+                        .unwrap(),
+                    &expected_result_at_gc_horizon[idx]
+                );
+                assert_eq!(
+                    tline
+                        .get(get_key(idx as u32), Lsn(0x20), &ctx)
+                        .await
+                        .unwrap(),
+                    &expected_result_at_lsn_20[idx]
+                );
+                assert_eq!(
+                    tline
+                        .get(get_key(idx as u32), Lsn(0x10), &ctx)
+                        .await
+                        .unwrap(),
+                    &expected_result_at_lsn_10[idx]
+                );
+            }
+        };
+
+        verify_result().await;
+
+        let cancel = CancellationToken::new();
+        let mut dryrun_flags = EnumSet::new();
+        dryrun_flags.insert(CompactFlags::DryRun);
+
+        tline
+            .compact_with_gc(&cancel, dryrun_flags, &ctx)
+            .await
+            .unwrap();
+        // We expect layer map to be the same b/c the dry run flag, but we don't know whether there will be other background jobs
+        // cleaning things up, and therefore, we don't do sanity checks on the layer map during unit tests.
+        verify_result().await;
+
+        tline
+            .compact_with_gc(&cancel, EnumSet::new(), &ctx)
+            .await
+            .unwrap();
+        verify_result().await;
+
+        // compact again
+        tline
+            .compact_with_gc(&cancel, EnumSet::new(), &ctx)
+            .await
+            .unwrap();
+        verify_result().await;
+
+        Ok(())
+    }
+
     #[tokio::test]
     async fn test_simple_bottom_most_compaction_on_branch() -> anyhow::Result<()> {
         let harness = TenantHarness::create("test_simple_bottom_most_compaction_on_branch").await?;
diff --git a/pageserver/src/tenant/storage_layer.rs b/pageserver/src/tenant/storage_layer.rs
index 133b34b8b5..a1202ad507 100644
--- a/pageserver/src/tenant/storage_layer.rs
+++ b/pageserver/src/tenant/storage_layer.rs
@@ -8,7 +8,6 @@ mod layer_desc;
 mod layer_name;
 pub mod merge_iterator;
 
-#[cfg(test)]
 pub mod split_writer;
 
 use crate::context::{AccessStatsBehavior, RequestContext};
diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs
index b1b5217f7f..f4a2957972 100644
--- a/pageserver/src/tenant/storage_layer/delta_layer.rs
+++ b/pageserver/src/tenant/storage_layer/delta_layer.rs
@@ -36,6 +36,7 @@ use crate::tenant::block_io::{BlockBuf, BlockCursor, BlockLease, BlockReader, Fi
 use crate::tenant::disk_btree::{
     DiskBtreeBuilder, DiskBtreeIterator, DiskBtreeReader, VisitDirection,
 };
+use crate::tenant::storage_layer::layer::S3_UPLOAD_LIMIT;
 use crate::tenant::timeline::GetVectoredError;
 use crate::tenant::vectored_blob_io::{
     BlobFlag, MaxVectoredReadBytes, StreamingVectoredReadPlanner, VectoredBlobReader, VectoredRead,
@@ -568,7 +569,6 @@ impl DeltaLayerWriterInner {
         // 5GB limit for objects without multipart upload (which we don't want to use)
         // Make it a little bit below to account for differing GB units
         // https://docs.aws.amazon.com/AmazonS3/latest/userguide/upload-objects.html
-        const S3_UPLOAD_LIMIT: u64 = 4_500_000_000;
         ensure!(
             metadata.len() <= S3_UPLOAD_LIMIT,
             "Created delta layer file at {} of size {} above limit {S3_UPLOAD_LIMIT}!",
@@ -702,12 +702,10 @@ impl DeltaLayerWriter {
         self.inner.take().unwrap().finish(key_end, ctx).await
     }
 
-    #[cfg(test)]
     pub(crate) fn num_keys(&self) -> usize {
         self.inner.as_ref().unwrap().num_keys
     }
 
-    #[cfg(test)]
     pub(crate) fn estimated_size(&self) -> u64 {
         let inner = self.inner.as_ref().unwrap();
         inner.blob_writer.size() + inner.tree.borrow_writer().size() + PAGE_SZ as u64
diff --git a/pageserver/src/tenant/storage_layer/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs
index 94120a4e3e..3cb2b1c83a 100644
--- a/pageserver/src/tenant/storage_layer/image_layer.rs
+++ b/pageserver/src/tenant/storage_layer/image_layer.rs
@@ -716,10 +716,6 @@ struct ImageLayerWriterInner {
 }
 
 impl ImageLayerWriterInner {
-    fn size(&self) -> u64 {
-        self.tree.borrow_writer().size() + self.blob_writer.size()
-    }
-
     ///
     /// Start building a new image layer.
     ///
@@ -854,13 +850,19 @@ impl ImageLayerWriterInner {
             res?;
         }
 
+        let final_key_range = if let Some(end_key) = end_key {
+            self.key_range.start..end_key
+        } else {
+            self.key_range.clone()
+        };
+
         // Fill in the summary on blk 0
         let summary = Summary {
             magic: IMAGE_FILE_MAGIC,
             format_version: STORAGE_FORMAT_VERSION,
             tenant_id: self.tenant_shard_id.tenant_id,
             timeline_id: self.timeline_id,
-            key_range: self.key_range.clone(),
+            key_range: final_key_range.clone(),
             lsn: self.lsn,
             index_start_blk,
             index_root_blk,
@@ -881,11 +883,7 @@ impl ImageLayerWriterInner {
         let desc = PersistentLayerDesc::new_img(
             self.tenant_shard_id,
             self.timeline_id,
-            if let Some(end_key) = end_key {
-                self.key_range.start..end_key
-            } else {
-                self.key_range.clone()
-            },
+            final_key_range,
             self.lsn,
             metadata.len(),
         );
@@ -974,14 +972,12 @@ impl ImageLayerWriter {
         self.inner.as_mut().unwrap().put_image(key, img, ctx).await
     }
 
-    #[cfg(test)]
     /// Estimated size of the image layer.
     pub(crate) fn estimated_size(&self) -> u64 {
         let inner = self.inner.as_ref().unwrap();
         inner.blob_writer.size() + inner.tree.borrow_writer().size() + PAGE_SZ as u64
     }
 
-    #[cfg(test)]
     pub(crate) fn num_keys(&self) -> usize {
         self.inner.as_ref().unwrap().num_keys
     }
@@ -997,7 +993,6 @@ impl ImageLayerWriter {
         self.inner.take().unwrap().finish(timeline, ctx, None).await
     }
 
-    #[cfg(test)]
     /// Finish writing the image layer with an end key, used in [`super::split_writer::SplitImageLayerWriter`]. The end key determines the end of the image layer's covered range and is exclusive.
     pub(super) async fn finish_with_end_key(
         mut self,
@@ -1011,10 +1006,6 @@ impl ImageLayerWriter {
             .finish(timeline, ctx, Some(end_key))
             .await
     }
-
-    pub(crate) fn size(&self) -> u64 {
-        self.inner.as_ref().unwrap().size()
-    }
 }
 
 impl Drop for ImageLayerWriter {
diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs
index 2607b574e7..53bb66b95e 100644
--- a/pageserver/src/tenant/storage_layer/layer.rs
+++ b/pageserver/src/tenant/storage_layer/layer.rs
@@ -35,6 +35,8 @@ mod tests;
 #[cfg(test)]
 mod failpoints;
 
+pub const S3_UPLOAD_LIMIT: u64 = 4_500_000_000;
+
 /// A Layer contains all data in a "rectangle" consisting of a range of keys and
 /// range of LSNs.
 ///
diff --git a/pageserver/src/tenant/storage_layer/split_writer.rs b/pageserver/src/tenant/storage_layer/split_writer.rs
index e12e29cd45..df910b5ad9 100644
--- a/pageserver/src/tenant/storage_layer/split_writer.rs
+++ b/pageserver/src/tenant/storage_layer/split_writer.rs
@@ -1,4 +1,4 @@
-use std::{ops::Range, sync::Arc};
+use std::{future::Future, ops::Range, sync::Arc};
 
 use bytes::Bytes;
 use pageserver_api::key::{Key, KEY_SIZE};
@@ -7,7 +7,32 @@ use utils::{id::TimelineId, lsn::Lsn, shard::TenantShardId};
 use crate::tenant::storage_layer::Layer;
 use crate::{config::PageServerConf, context::RequestContext, repository::Value, tenant::Timeline};
 
-use super::{DeltaLayerWriter, ImageLayerWriter, ResidentLayer};
+use super::layer::S3_UPLOAD_LIMIT;
+use super::{
+    DeltaLayerWriter, ImageLayerWriter, PersistentLayerDesc, PersistentLayerKey, ResidentLayer,
+};
+
+pub(crate) enum SplitWriterResult {
+    Produced(ResidentLayer),
+    Discarded(PersistentLayerKey),
+}
+
+#[cfg(test)]
+impl SplitWriterResult {
+    fn into_resident_layer(self) -> ResidentLayer {
+        match self {
+            SplitWriterResult::Produced(layer) => layer,
+            SplitWriterResult::Discarded(_) => panic!("unexpected discarded layer"),
+        }
+    }
+
+    fn into_discarded_layer(self) -> PersistentLayerKey {
+        match self {
+            SplitWriterResult::Produced(_) => panic!("unexpected produced layer"),
+            SplitWriterResult::Discarded(layer) => layer,
+        }
+    }
+}
 
 /// An image writer that takes images and produces multiple image layers. The interface does not
 /// guarantee atomicity (i.e., if the image layer generation fails, there might be leftover files
@@ -16,11 +41,12 @@ use super::{DeltaLayerWriter, ImageLayerWriter, ResidentLayer};
 pub struct SplitImageLayerWriter {
     inner: ImageLayerWriter,
     target_layer_size: u64,
-    generated_layers: Vec<ResidentLayer>,
+    generated_layers: Vec<SplitWriterResult>,
     conf: &'static PageServerConf,
     timeline_id: TimelineId,
     tenant_shard_id: TenantShardId,
     lsn: Lsn,
+    start_key: Key,
 }
 
 impl SplitImageLayerWriter {
@@ -49,16 +75,22 @@ impl SplitImageLayerWriter {
             timeline_id,
             tenant_shard_id,
             lsn,
+            start_key,
         })
     }
 
-    pub async fn put_image(
+    pub async fn put_image_with_discard_fn<D, F>(
         &mut self,
         key: Key,
         img: Bytes,
         tline: &Arc<Timeline>,
         ctx: &RequestContext,
-    ) -> anyhow::Result<()> {
+        discard: D,
+    ) -> anyhow::Result<()>
+    where
+        D: FnOnce(&PersistentLayerKey) -> F,
+        F: Future<Output = bool>,
+    {
         // The current estimation is an upper bound of the space that the key/image could take
         // because we did not consider compression in this estimation. The resulting image layer
         // could be smaller than the target size.
@@ -76,33 +108,87 @@ impl SplitImageLayerWriter {
             )
             .await?;
             let prev_image_writer = std::mem::replace(&mut self.inner, next_image_writer);
-            self.generated_layers.push(
-                prev_image_writer
-                    .finish_with_end_key(tline, key, ctx)
-                    .await?,
-            );
+            let layer_key = PersistentLayerKey {
+                key_range: self.start_key..key,
+                lsn_range: PersistentLayerDesc::image_layer_lsn_range(self.lsn),
+                is_delta: false,
+            };
+            self.start_key = key;
+
+            if discard(&layer_key).await {
+                drop(prev_image_writer);
+                self.generated_layers
+                    .push(SplitWriterResult::Discarded(layer_key));
+            } else {
+                self.generated_layers.push(SplitWriterResult::Produced(
+                    prev_image_writer
+                        .finish_with_end_key(tline, key, ctx)
+                        .await?,
+                ));
+            }
         }
         self.inner.put_image(key, img, ctx).await
     }
 
-    pub(crate) async fn finish(
+    #[cfg(test)]
+    pub async fn put_image(
+        &mut self,
+        key: Key,
+        img: Bytes,
+        tline: &Arc<Timeline>,
+        ctx: &RequestContext,
+    ) -> anyhow::Result<()> {
+        self.put_image_with_discard_fn(key, img, tline, ctx, |_| async { false })
+            .await
+    }
+
+    pub(crate) async fn finish_with_discard_fn<D, F>(
         self,
         tline: &Arc<Timeline>,
         ctx: &RequestContext,
         end_key: Key,
-    ) -> anyhow::Result<Vec<ResidentLayer>> {
+        discard: D,
+    ) -> anyhow::Result<Vec<SplitWriterResult>>
+    where
+        D: FnOnce(&PersistentLayerKey) -> F,
+        F: Future<Output = bool>,
+    {
         let Self {
             mut generated_layers,
             inner,
             ..
         } = self;
-        generated_layers.push(inner.finish_with_end_key(tline, end_key, ctx).await?);
+        if inner.num_keys() == 0 {
+            return Ok(generated_layers);
+        }
+        let layer_key = PersistentLayerKey {
+            key_range: self.start_key..end_key,
+            lsn_range: PersistentLayerDesc::image_layer_lsn_range(self.lsn),
+            is_delta: false,
+        };
+        if discard(&layer_key).await {
+            generated_layers.push(SplitWriterResult::Discarded(layer_key));
+        } else {
+            generated_layers.push(SplitWriterResult::Produced(
+                inner.finish_with_end_key(tline, end_key, ctx).await?,
+            ));
+        }
         Ok(generated_layers)
     }
 
+    #[cfg(test)]
+    pub(crate) async fn finish(
+        self,
+        tline: &Arc<Timeline>,
+        ctx: &RequestContext,
+        end_key: Key,
+    ) -> anyhow::Result<Vec<SplitWriterResult>> {
+        self.finish_with_discard_fn(tline, ctx, end_key, |_| async { false })
+            .await
+    }
+
     /// When split writer fails, the caller should call this function and handle partially generated layers.
-    #[allow(dead_code)]
-    pub(crate) async fn take(self) -> anyhow::Result<(Vec<ResidentLayer>, ImageLayerWriter)> {
+    pub(crate) fn take(self) -> anyhow::Result<(Vec<SplitWriterResult>, ImageLayerWriter)> {
         Ok((self.generated_layers, self.inner))
     }
 }
@@ -110,15 +196,21 @@ impl SplitImageLayerWriter {
 /// A delta writer that takes key-lsn-values and produces multiple delta layers. The interface does not
 /// guarantee atomicity (i.e., if the delta layer generation fails, there might be leftover files
 /// to be cleaned up).
+///
+/// Note that if updates of a single key exceed the target size limit, all of the updates will be batched
+/// into a single file. This behavior might change in the future. For reference, the legacy compaction algorithm
+/// will split them into multiple files based on size.
 #[must_use]
 pub struct SplitDeltaLayerWriter {
     inner: DeltaLayerWriter,
     target_layer_size: u64,
-    generated_layers: Vec<ResidentLayer>,
+    generated_layers: Vec<SplitWriterResult>,
     conf: &'static PageServerConf,
     timeline_id: TimelineId,
     tenant_shard_id: TenantShardId,
     lsn_range: Range<Lsn>,
+    last_key_written: Key,
+    start_key: Key,
 }
 
 impl SplitDeltaLayerWriter {
@@ -147,9 +239,74 @@ impl SplitDeltaLayerWriter {
             timeline_id,
             tenant_shard_id,
             lsn_range,
+            last_key_written: Key::MIN,
+            start_key,
         })
     }
 
+    /// Put value into the layer writer. In the case the writer decides to produce a layer, and the discard fn returns true, no layer will be written in the end.
+    pub async fn put_value_with_discard_fn<D, F>(
+        &mut self,
+        key: Key,
+        lsn: Lsn,
+        val: Value,
+        tline: &Arc<Timeline>,
+        ctx: &RequestContext,
+        discard: D,
+    ) -> anyhow::Result<()>
+    where
+        D: FnOnce(&PersistentLayerKey) -> F,
+        F: Future<Output = bool>,
+    {
+        // The current estimation is key size plus LSN size plus value size estimation. This is not an accurate
+        // number, and therefore the final layer size could be a little bit larger or smaller than the target.
+        //
+        // Also, keep all updates of a single key in a single file. TODO: split them using the legacy compaction
+        // strategy. https://github.com/neondatabase/neon/issues/8837
+        let addition_size_estimation = KEY_SIZE as u64 + 8 /* LSN u64 size */ + 80 /* value size estimation */;
+        if self.inner.num_keys() >= 1
+            && self.inner.estimated_size() + addition_size_estimation >= self.target_layer_size
+        {
+            if key != self.last_key_written {
+                let next_delta_writer = DeltaLayerWriter::new(
+                    self.conf,
+                    self.timeline_id,
+                    self.tenant_shard_id,
+                    key,
+                    self.lsn_range.clone(),
+                    ctx,
+                )
+                .await?;
+                let prev_delta_writer = std::mem::replace(&mut self.inner, next_delta_writer);
+                let layer_key = PersistentLayerKey {
+                    key_range: self.start_key..key,
+                    lsn_range: self.lsn_range.clone(),
+                    is_delta: true,
+                };
+                self.start_key = key;
+                if discard(&layer_key).await {
+                    drop(prev_delta_writer);
+                    self.generated_layers
+                        .push(SplitWriterResult::Discarded(layer_key));
+                } else {
+                    let (desc, path) = prev_delta_writer.finish(key, ctx).await?;
+                    let delta_layer = Layer::finish_creating(self.conf, tline, desc, &path)?;
+                    self.generated_layers
+                        .push(SplitWriterResult::Produced(delta_layer));
+                }
+            } else if self.inner.estimated_size() >= S3_UPLOAD_LIMIT {
+                // We have to produce a very large file b/c a key is updated too often.
+                anyhow::bail!(
+                    "a single key is updated too often: key={}, estimated_size={}, and the layer file cannot be produced",
+                    key,
+                    self.inner.estimated_size()
+                );
+            }
+        }
+        self.last_key_written = key;
+        self.inner.put_value(key, lsn, val, ctx).await
+    }
+
     pub async fn put_value(
         &mut self,
         key: Key,
@@ -158,56 +315,64 @@ impl SplitDeltaLayerWriter {
         tline: &Arc<Timeline>,
         ctx: &RequestContext,
     ) -> anyhow::Result<()> {
-        // The current estimation is key size plus LSN size plus value size estimation. This is not an accurate
-        // number, and therefore the final layer size could be a little bit larger or smaller than the target.
-        let addition_size_estimation = KEY_SIZE as u64 + 8 /* LSN u64 size */ + 80 /* value size estimation */;
-        if self.inner.num_keys() >= 1
-            && self.inner.estimated_size() + addition_size_estimation >= self.target_layer_size
-        {
-            let next_delta_writer = DeltaLayerWriter::new(
-                self.conf,
-                self.timeline_id,
-                self.tenant_shard_id,
-                key,
-                self.lsn_range.clone(),
-                ctx,
-            )
-            .await?;
-            let prev_delta_writer = std::mem::replace(&mut self.inner, next_delta_writer);
-            let (desc, path) = prev_delta_writer.finish(key, ctx).await?;
-            let delta_layer = Layer::finish_creating(self.conf, tline, desc, &path)?;
-            self.generated_layers.push(delta_layer);
-        }
-        self.inner.put_value(key, lsn, val, ctx).await
+        self.put_value_with_discard_fn(key, lsn, val, tline, ctx, |_| async { false })
+            .await
     }
 
-    pub(crate) async fn finish(
+    pub(crate) async fn finish_with_discard_fn<D, F>(
         self,
         tline: &Arc<Timeline>,
         ctx: &RequestContext,
         end_key: Key,
-    ) -> anyhow::Result<Vec<ResidentLayer>> {
+        discard: D,
+    ) -> anyhow::Result<Vec<SplitWriterResult>>
+    where
+        D: FnOnce(&PersistentLayerKey) -> F,
+        F: Future<Output = bool>,
+    {
         let Self {
             mut generated_layers,
             inner,
             ..
         } = self;
-
-        let (desc, path) = inner.finish(end_key, ctx).await?;
-        let delta_layer = Layer::finish_creating(self.conf, tline, desc, &path)?;
-        generated_layers.push(delta_layer);
+        if inner.num_keys() == 0 {
+            return Ok(generated_layers);
+        }
+        let layer_key = PersistentLayerKey {
+            key_range: self.start_key..end_key,
+            lsn_range: self.lsn_range.clone(),
+            is_delta: true,
+        };
+        if discard(&layer_key).await {
+            generated_layers.push(SplitWriterResult::Discarded(layer_key));
+        } else {
+            let (desc, path) = inner.finish(end_key, ctx).await?;
+            let delta_layer = Layer::finish_creating(self.conf, tline, desc, &path)?;
+            generated_layers.push(SplitWriterResult::Produced(delta_layer));
+        }
         Ok(generated_layers)
     }
 
-    /// When split writer fails, the caller should call this function and handle partially generated layers.
     #[allow(dead_code)]
-    pub(crate) async fn take(self) -> anyhow::Result<(Vec<ResidentLayer>, DeltaLayerWriter)> {
+    pub(crate) async fn finish(
+        self,
+        tline: &Arc<Timeline>,
+        ctx: &RequestContext,
+        end_key: Key,
+    ) -> anyhow::Result<Vec<SplitWriterResult>> {
+        self.finish_with_discard_fn(tline, ctx, end_key, |_| async { false })
+            .await
+    }
+
+    /// When split writer fails, the caller should call this function and handle partially generated layers.
+    pub(crate) fn take(self) -> anyhow::Result<(Vec<SplitWriterResult>, DeltaLayerWriter)> {
         Ok((self.generated_layers, self.inner))
     }
 }
 
 #[cfg(test)]
 mod tests {
+    use itertools::Itertools;
     use rand::{RngCore, SeedableRng};
 
     use crate::{
@@ -302,9 +467,16 @@ mod tests {
 
     #[tokio::test]
     async fn write_split() {
-        let harness = TenantHarness::create("split_writer_write_split")
-            .await
-            .unwrap();
+        write_split_helper("split_writer_write_split", false).await;
+    }
+
+    #[tokio::test]
+    async fn write_split_discard() {
+        write_split_helper("split_writer_write_split_discard", false).await;
+    }
+
+    async fn write_split_helper(harness_name: &'static str, discard: bool) {
+        let harness = TenantHarness::create(harness_name).await.unwrap();
         let (tenant, ctx) = harness.load().await;
 
         let tline = tenant
@@ -338,16 +510,19 @@ mod tests {
         for i in 0..N {
             let i = i as u32;
             image_writer
-                .put_image(get_key(i), get_large_img(), &tline, &ctx)
+                .put_image_with_discard_fn(get_key(i), get_large_img(), &tline, &ctx, |_| async {
+                    discard
+                })
                 .await
                 .unwrap();
             delta_writer
-                .put_value(
+                .put_value_with_discard_fn(
                     get_key(i),
                     Lsn(0x20),
                     Value::Image(get_large_img()),
                     &tline,
                     &ctx,
+                    |_| async { discard },
                 )
                 .await
                 .unwrap();
@@ -360,22 +535,39 @@ mod tests {
             .finish(&tline, &ctx, get_key(N as u32))
             .await
             .unwrap();
-        assert_eq!(image_layers.len(), N / 512 + 1);
-        assert_eq!(delta_layers.len(), N / 512 + 1);
-        for idx in 0..image_layers.len() {
-            assert_ne!(image_layers[idx].layer_desc().key_range.start, Key::MIN);
-            assert_ne!(image_layers[idx].layer_desc().key_range.end, Key::MAX);
-            assert_ne!(delta_layers[idx].layer_desc().key_range.start, Key::MIN);
-            assert_ne!(delta_layers[idx].layer_desc().key_range.end, Key::MAX);
-            if idx > 0 {
-                assert_eq!(
-                    image_layers[idx - 1].layer_desc().key_range.end,
-                    image_layers[idx].layer_desc().key_range.start
-                );
-                assert_eq!(
-                    delta_layers[idx - 1].layer_desc().key_range.end,
-                    delta_layers[idx].layer_desc().key_range.start
-                );
+        if discard {
+            for layer in image_layers {
+                layer.into_discarded_layer();
+            }
+            for layer in delta_layers {
+                layer.into_discarded_layer();
+            }
+        } else {
+            let image_layers = image_layers
+                .into_iter()
+                .map(|x| x.into_resident_layer())
+                .collect_vec();
+            let delta_layers = delta_layers
+                .into_iter()
+                .map(|x| x.into_resident_layer())
+                .collect_vec();
+            assert_eq!(image_layers.len(), N / 512 + 1);
+            assert_eq!(delta_layers.len(), N / 512 + 1);
+            for idx in 0..image_layers.len() {
+                assert_ne!(image_layers[idx].layer_desc().key_range.start, Key::MIN);
+                assert_ne!(image_layers[idx].layer_desc().key_range.end, Key::MAX);
+                assert_ne!(delta_layers[idx].layer_desc().key_range.start, Key::MIN);
+                assert_ne!(delta_layers[idx].layer_desc().key_range.end, Key::MAX);
+                if idx > 0 {
+                    assert_eq!(
+                        image_layers[idx - 1].layer_desc().key_range.end,
+                        image_layers[idx].layer_desc().key_range.start
+                    );
+                    assert_eq!(
+                        delta_layers[idx - 1].layer_desc().key_range.end,
+                        delta_layers[idx].layer_desc().key_range.start
+                    );
+                }
             }
         }
     }
@@ -456,4 +648,49 @@ mod tests {
             .unwrap();
         assert_eq!(layers.len(), 2);
     }
+
+    #[tokio::test]
+    async fn write_split_single_key() {
+        let harness = TenantHarness::create("split_writer_write_split_single_key")
+            .await
+            .unwrap();
+        let (tenant, ctx) = harness.load().await;
+
+        let tline = tenant
+            .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
+            .await
+            .unwrap();
+
+        const N: usize = 2000;
+        let mut delta_writer = SplitDeltaLayerWriter::new(
+            tenant.conf,
+            tline.timeline_id,
+            tenant.tenant_shard_id,
+            get_key(0),
+            Lsn(0x10)..Lsn(N as u64 * 16 + 0x10),
+            4 * 1024 * 1024,
+            &ctx,
+        )
+        .await
+        .unwrap();
+
+        for i in 0..N {
+            let i = i as u32;
+            delta_writer
+                .put_value(
+                    get_key(0),
+                    Lsn(i as u64 * 16 + 0x10),
+                    Value::Image(get_large_img()),
+                    &tline,
+                    &ctx,
+                )
+                .await
+                .unwrap();
+        }
+        let delta_layers = delta_writer
+            .finish(&tline, &ctx, get_key(N as u32))
+            .await
+            .unwrap();
+        assert_eq!(delta_layers.len(), 1);
+    }
 }
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index b33e436fce..098c196ee8 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -5444,12 +5444,17 @@ impl Timeline {
                 !(a.end <= b.start || b.end <= a.start)
             }
 
-            let guard = self.layers.read().await;
-            for layer in guard.layer_map()?.iter_historic_layers() {
-                if layer.is_delta()
-                    && overlaps_with(&layer.lsn_range, &deltas.lsn_range)
-                    && layer.lsn_range != deltas.lsn_range
-                {
+            if deltas.key_range.start.next() != deltas.key_range.end {
+                let guard = self.layers.read().await;
+                let mut invalid_layers =
+                    guard.layer_map()?.iter_historic_layers().filter(|layer| {
+                        layer.is_delta()
+                        && overlaps_with(&layer.lsn_range, &deltas.lsn_range)
+                        && layer.lsn_range != deltas.lsn_range
+                        // skip single-key layer files
+                        && layer.key_range.start.next() != layer.key_range.end
+                    });
+                if let Some(layer) = invalid_layers.next() {
                     // If a delta layer overlaps with another delta layer AND their LSN range is not the same, panic
                     panic!(
                         "inserted layer violates delta layer LSN invariant: current_lsn_range={}..{}, conflict_lsn_range={}..{}",
diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs
index 7370ec1386..aad75ac59c 100644
--- a/pageserver/src/tenant/timeline/compaction.rs
+++ b/pageserver/src/tenant/timeline/compaction.rs
@@ -14,7 +14,7 @@ use super::{
     RecordedDuration, Timeline,
 };
 
-use anyhow::{anyhow, Context};
+use anyhow::{anyhow, bail, Context};
 use bytes::Bytes;
 use enumset::EnumSet;
 use fail::fail_point;
@@ -32,6 +32,9 @@ use crate::page_cache;
 use crate::tenant::config::defaults::{DEFAULT_CHECKPOINT_DISTANCE, DEFAULT_COMPACTION_THRESHOLD};
 use crate::tenant::remote_timeline_client::WaitCompletionError;
 use crate::tenant::storage_layer::merge_iterator::MergeIterator;
+use crate::tenant::storage_layer::split_writer::{
+    SplitDeltaLayerWriter, SplitImageLayerWriter, SplitWriterResult,
+};
 use crate::tenant::storage_layer::{
     AsLayerDesc, PersistentLayerDesc, PersistentLayerKey, ValueReconstructState,
 };
@@ -71,15 +74,60 @@ pub(crate) struct KeyHistoryRetention {
 }
 
 impl KeyHistoryRetention {
+    /// Hack: skip delta layer if we need to produce a layer of a same key-lsn.
+    ///
+    /// This can happen if we have removed some deltas in "the middle" of some existing layer's key-lsn-range.
+    /// For example, consider the case where a single delta with range [0x10,0x50) exists.
+    /// And we have branches at LSN 0x10, 0x20, 0x30.
+    /// Then we delete branch @ 0x20.
+    /// Bottom-most compaction may now delete the delta [0x20,0x30).
+    /// And that wouldnt' change the shape of the layer.
+    ///
+    /// Note that bottom-most-gc-compaction never _adds_ new data in that case, only removes.
+    ///
+    /// `discard_key` will only be called when the writer reaches its target (instead of for every key), so it's fine to grab a lock inside.
+    async fn discard_key(key: &PersistentLayerKey, tline: &Arc<Timeline>, dry_run: bool) -> bool {
+        if dry_run {
+            return true;
+        }
+        let guard = tline.layers.read().await;
+        if !guard.contains_key(key) {
+            return false;
+        }
+        let layer_generation = guard.get_from_key(key).metadata().generation;
+        drop(guard);
+        if layer_generation == tline.generation {
+            info!(
+                key=%key,
+                ?layer_generation,
+                "discard layer due to duplicated layer key in the same generation",
+            );
+            true
+        } else {
+            false
+        }
+    }
+
+    /// Pipe a history of a single key to the writers.
+    ///
+    /// If `image_writer` is none, the images will be placed into the delta layers.
+    /// The delta writer will contain all images and deltas (below and above the horizon) except the bottom-most images.
+    #[allow(clippy::too_many_arguments)]
     async fn pipe_to(
         self,
         key: Key,
-        delta_writer: &mut Vec<(Key, Lsn, Value)>,
-        mut image_writer: Option<&mut ImageLayerWriter>,
+        tline: &Arc<Timeline>,
+        delta_writer: &mut SplitDeltaLayerWriter,
+        mut image_writer: Option<&mut SplitImageLayerWriter>,
         stat: &mut CompactionStatistics,
+        dry_run: bool,
         ctx: &RequestContext,
     ) -> anyhow::Result<()> {
         let mut first_batch = true;
+        let discard = |key: &PersistentLayerKey| {
+            let key = key.clone();
+            async move { Self::discard_key(&key, tline, dry_run).await }
+        };
         for (cutoff_lsn, KeyLogAtLsn(logs)) in self.below_horizon {
             if first_batch {
                 if logs.len() == 1 && logs[0].1.is_image() {
@@ -88,28 +136,45 @@ impl KeyHistoryRetention {
                     };
                     stat.produce_image_key(img);
                     if let Some(image_writer) = image_writer.as_mut() {
-                        image_writer.put_image(key, img.clone(), ctx).await?;
+                        image_writer
+                            .put_image_with_discard_fn(key, img.clone(), tline, ctx, discard)
+                            .await?;
                     } else {
-                        delta_writer.push((key, cutoff_lsn, Value::Image(img.clone())));
+                        delta_writer
+                            .put_value_with_discard_fn(
+                                key,
+                                cutoff_lsn,
+                                Value::Image(img.clone()),
+                                tline,
+                                ctx,
+                                discard,
+                            )
+                            .await?;
                     }
                 } else {
                     for (lsn, val) in logs {
                         stat.produce_key(&val);
-                        delta_writer.push((key, lsn, val));
+                        delta_writer
+                            .put_value_with_discard_fn(key, lsn, val, tline, ctx, discard)
+                            .await?;
                     }
                 }
                 first_batch = false;
             } else {
                 for (lsn, val) in logs {
                     stat.produce_key(&val);
-                    delta_writer.push((key, lsn, val));
+                    delta_writer
+                        .put_value_with_discard_fn(key, lsn, val, tline, ctx, discard)
+                        .await?;
                 }
             }
         }
         let KeyLogAtLsn(above_horizon_logs) = self.above_horizon;
         for (lsn, val) in above_horizon_logs {
             stat.produce_key(&val);
-            delta_writer.push((key, lsn, val));
+            delta_writer
+                .put_value_with_discard_fn(key, lsn, val, tline, ctx, discard)
+                .await?;
         }
         Ok(())
     }
@@ -1814,11 +1879,27 @@ impl Timeline {
             }
             let mut selected_layers = Vec::new();
             drop(gc_info);
+            // Pick all the layers intersect or below the gc_cutoff, get the largest LSN in the selected layers.
+            let Some(max_layer_lsn) = layers
+                .iter_historic_layers()
+                .filter(|desc| desc.get_lsn_range().start <= gc_cutoff)
+                .map(|desc| desc.get_lsn_range().end)
+                .max()
+            else {
+                info!("no layers to compact with gc");
+                return Ok(());
+            };
+            // Then, pick all the layers that are below the max_layer_lsn. This is to ensure we can pick all single-key
+            // layers to compact.
             for desc in layers.iter_historic_layers() {
-                if desc.get_lsn_range().start <= gc_cutoff {
+                if desc.get_lsn_range().end <= max_layer_lsn {
                     selected_layers.push(guard.get_from_desc(&desc));
                 }
             }
+            if selected_layers.is_empty() {
+                info!("no layers to compact with gc");
+                return Ok(());
+            }
             retain_lsns_below_horizon.sort();
             (selected_layers, gc_cutoff, retain_lsns_below_horizon)
         };
@@ -1848,27 +1929,53 @@ impl Timeline {
             lowest_retain_lsn
         );
         // Step 1: (In the future) construct a k-merge iterator over all layers. For now, simply collect all keys + LSNs.
-        // Also, collect the layer information to decide when to split the new delta layers.
-        let mut downloaded_layers = Vec::new();
-        let mut delta_split_points = BTreeSet::new();
+        // Also, verify if the layer map can be split by drawing a horizontal line at every LSN start/end split point.
+        let mut lsn_split_point = BTreeSet::new(); // TODO: use a better data structure (range tree / range set?)
         for layer in &layer_selection {
-            let resident_layer = layer.download_and_keep_resident().await?;
-            downloaded_layers.push(resident_layer);
-
             let desc = layer.layer_desc();
             if desc.is_delta() {
-                // TODO: is it correct to only record split points for deltas intersecting with the GC horizon? (exclude those below/above the horizon)
-                // so that we can avoid having too many small delta layers.
-                let key_range = desc.get_key_range();
-                delta_split_points.insert(key_range.start);
-                delta_split_points.insert(key_range.end);
+                // ignore single-key layer files
+                if desc.key_range.start.next() != desc.key_range.end {
+                    let lsn_range = &desc.lsn_range;
+                    lsn_split_point.insert(lsn_range.start);
+                    lsn_split_point.insert(lsn_range.end);
+                }
                 stat.visit_delta_layer(desc.file_size());
             } else {
                 stat.visit_image_layer(desc.file_size());
             }
         }
+        for layer in &layer_selection {
+            let desc = layer.layer_desc();
+            let key_range = &desc.key_range;
+            if desc.is_delta() && key_range.start.next() != key_range.end {
+                let lsn_range = desc.lsn_range.clone();
+                let intersects = lsn_split_point.range(lsn_range).collect_vec();
+                if intersects.len() > 1 {
+                    bail!(
+                        "cannot run gc-compaction because it violates the layer map LSN split assumption: layer {} intersects with LSN [{}]",
+                        desc.key(),
+                        intersects.into_iter().map(|lsn| lsn.to_string()).join(", ")
+                    );
+                }
+            }
+        }
+        // The maximum LSN we are processing in this compaction loop
+        let end_lsn = layer_selection
+            .iter()
+            .map(|l| l.layer_desc().lsn_range.end)
+            .max()
+            .unwrap();
+        // We don't want any of the produced layers to cover the full key range (i.e., MIN..MAX) b/c it will then be recognized
+        // as an L0 layer.
+        let hack_end_key = Key::NON_L0_MAX;
         let mut delta_layers = Vec::new();
         let mut image_layers = Vec::new();
+        let mut downloaded_layers = Vec::new();
+        for layer in &layer_selection {
+            let resident_layer = layer.download_and_keep_resident().await?;
+            downloaded_layers.push(resident_layer);
+        }
         for resident_layer in &downloaded_layers {
             if resident_layer.layer_desc().is_delta() {
                 let layer = resident_layer.get_as_delta(ctx).await?;
@@ -1884,138 +1991,17 @@ impl Timeline {
         let mut accumulated_values = Vec::new();
         let mut last_key: Option<Key> = None;
 
-        enum FlushDeltaResult {
-            /// Create a new resident layer
-            CreateResidentLayer(ResidentLayer),
-            /// Keep an original delta layer
-            KeepLayer(PersistentLayerKey),
-        }
-
-        #[allow(clippy::too_many_arguments)]
-        async fn flush_deltas(
-            deltas: &mut Vec<(Key, Lsn, crate::repository::Value)>,
-            last_key: Key,
-            delta_split_points: &[Key],
-            current_delta_split_point: &mut usize,
-            tline: &Arc<Timeline>,
-            lowest_retain_lsn: Lsn,
-            ctx: &RequestContext,
-            stats: &mut CompactionStatistics,
-            dry_run: bool,
-            last_batch: bool,
-        ) -> anyhow::Result<Option<FlushDeltaResult>> {
-            // Check if we need to split the delta layer. We split at the original delta layer boundary to avoid
-            // overlapping layers.
-            //
-            // If we have a structure like this:
-            //
-            // | Delta 1 |         | Delta 4 |
-            // |---------| Delta 2 |---------|
-            // | Delta 3 |         | Delta 5 |
-            //
-            // And we choose to compact delta 2+3+5. We will get an overlapping delta layer with delta 1+4.
-            // A simple solution here is to split the delta layers using the original boundary, while this
-            // might produce a lot of small layers. This should be improved and fixed in the future.
-            let mut need_split = false;
-            while *current_delta_split_point < delta_split_points.len()
-                && last_key >= delta_split_points[*current_delta_split_point]
-            {
-                *current_delta_split_point += 1;
-                need_split = true;
-            }
-            if !need_split && !last_batch {
-                return Ok(None);
-            }
-            let deltas: Vec<(Key, Lsn, Value)> = std::mem::take(deltas);
-            if deltas.is_empty() {
-                return Ok(None);
-            }
-            let end_lsn = deltas.iter().map(|(_, lsn, _)| lsn).max().copied().unwrap() + 1;
-            let delta_key = PersistentLayerKey {
-                key_range: {
-                    let key_start = deltas.first().unwrap().0;
-                    let key_end = deltas.last().unwrap().0.next();
-                    key_start..key_end
-                },
-                lsn_range: lowest_retain_lsn..end_lsn,
-                is_delta: true,
-            };
-            {
-                // Hack: skip delta layer if we need to produce a layer of a same key-lsn.
-                //
-                // This can happen if we have removed some deltas in "the middle" of some existing layer's key-lsn-range.
-                // For example, consider the case where a single delta with range [0x10,0x50) exists.
-                // And we have branches at LSN 0x10, 0x20, 0x30.
-                // Then we delete branch @ 0x20.
-                // Bottom-most compaction may now delete the delta [0x20,0x30).
-                // And that wouldnt' change the shape of the layer.
-                //
-                // Note that bottom-most-gc-compaction never _adds_ new data in that case, only removes.
-                // That's why it's safe to skip.
-                let guard = tline.layers.read().await;
-
-                if guard.contains_key(&delta_key) {
-                    let layer_generation = guard.get_from_key(&delta_key).metadata().generation;
-                    drop(guard);
-                    if layer_generation == tline.generation {
-                        stats.discard_delta_layer();
-                        // TODO: depending on whether we design this compaction process to run along with
-                        // other compactions, there could be layer map modifications after we drop the
-                        // layer guard, and in case it creates duplicated layer key, we will still error
-                        // in the end.
-                        info!(
-                            key=%delta_key,
-                            ?layer_generation,
-                            "discard delta layer due to duplicated layer in the same generation"
-                        );
-                        return Ok(Some(FlushDeltaResult::KeepLayer(delta_key)));
-                    }
-                }
-            }
-
-            let mut delta_layer_writer = DeltaLayerWriter::new(
-                tline.conf,
-                tline.timeline_id,
-                tline.tenant_shard_id,
-                delta_key.key_range.start,
-                lowest_retain_lsn..end_lsn,
-                ctx,
-            )
-            .await?;
-            for (key, lsn, val) in deltas {
-                delta_layer_writer.put_value(key, lsn, val, ctx).await?;
-            }
-
-            stats.produce_delta_layer(delta_layer_writer.size());
-            if dry_run {
-                return Ok(None);
-            }
-
-            let (desc, path) = delta_layer_writer
-                .finish(delta_key.key_range.end, ctx)
-                .await?;
-            let delta_layer = Layer::finish_creating(tline.conf, tline, desc, &path)?;
-            Ok(Some(FlushDeltaResult::CreateResidentLayer(delta_layer)))
-        }
-
-        // Hack the key range to be min..(max-1). Otherwise, the image layer will be
-        // interpreted as an L0 delta layer.
-        let hack_image_layer_range = {
-            let mut end_key = Key::MAX;
-            end_key.field6 -= 1;
-            Key::MIN..end_key
-        };
-
         // Only create image layers when there is no ancestor branches. TODO: create covering image layer
         // when some condition meet.
         let mut image_layer_writer = if self.ancestor_timeline.is_none() {
             Some(
-                ImageLayerWriter::new(
+                SplitImageLayerWriter::new(
                     self.conf,
                     self.timeline_id,
                     self.tenant_shard_id,
-                    &hack_image_layer_range, // covers the full key range
+                    Key::MIN,
                     lowest_retain_lsn,
+                    self.get_compaction_target_size(),
                     ctx,
                 )
                 .await?,
@@ -2024,6 +2010,17 @@ impl Timeline {
             None
         };
 
+        let mut delta_layer_writer = SplitDeltaLayerWriter::new(
+            self.conf,
+            self.timeline_id,
+            self.tenant_shard_id,
+            Key::MIN,
+            lowest_retain_lsn..end_lsn,
+            self.get_compaction_target_size(),
+            ctx,
+        )
+        .await?;
+
         /// Returns None if there is no ancestor branch. Throw an error when the key is not found.
         ///
         /// Currently, we always get the ancestor image for each key in the child branch no matter whether the image
@@ -2044,47 +2041,11 @@ impl Timeline {
             let img = tline.get(key, tline.ancestor_lsn, ctx).await?;
             Ok(Some((key, tline.ancestor_lsn, img)))
         }
-        let image_layer_key = PersistentLayerKey {
-            key_range: hack_image_layer_range,
-            lsn_range: PersistentLayerDesc::image_layer_lsn_range(lowest_retain_lsn),
-            is_delta: false,
-        };
-
-        // Like with delta layers, it can happen that we re-produce an already existing image layer.
-        // This could happen when a user triggers force compaction and image generation. In this case,
-        // it's always safe to rewrite the layer.
-        let discard_image_layer = {
-            let guard = self.layers.read().await;
-            if guard.contains_key(&image_layer_key) {
-                let layer_generation = guard.get_from_key(&image_layer_key).metadata().generation;
-                drop(guard);
-                if layer_generation == self.generation {
-                    // TODO: depending on whether we design this compaction process to run along with
-                    // other compactions, there could be layer map modifications after we drop the
-                    // layer guard, and in case it creates duplicated layer key, we will still error
-                    // in the end.
-                    info!(
-                        key=%image_layer_key,
-                        ?layer_generation,
-                        "discard image layer due to duplicated layer key in the same generation",
-                    );
-                    true
-                } else {
-                    false
-                }
-            } else {
-                false
-            }
-        };
 
         // Actually, we can decide not to write to the image layer at all at this point because
         // the key and LSN range are determined. However, to keep things simple here, we still
         // create this writer, and discard the writer in the end.
 
-        let mut delta_values = Vec::new();
-        let delta_split_points = delta_split_points.into_iter().collect_vec();
-        let mut current_delta_split_point = 0;
-        let mut delta_layers = Vec::new();
         while let Some((key, lsn, val)) = merge_iter.next().await? {
             if cancel.is_cancelled() {
                 return Err(anyhow!("cancelled")); // TODO: refactor to CompactionError and pass cancel error
@@ -2115,27 +2076,14 @@ impl Timeline {
                 retention
                     .pipe_to(
                         *last_key,
-                        &mut delta_values,
+                        self,
+                        &mut delta_layer_writer,
                         image_layer_writer.as_mut(),
                         &mut stat,
+                        dry_run,
                         ctx,
                     )
                     .await?;
-                delta_layers.extend(
-                    flush_deltas(
-                        &mut delta_values,
-                        *last_key,
-                        &delta_split_points,
-                        &mut current_delta_split_point,
-                        self,
-                        lowest_retain_lsn,
-                        ctx,
-                        &mut stat,
-                        dry_run,
-                        false,
-                    )
-                    .await?,
-                );
                 accumulated_values.clear();
                 *last_key = key;
                 accumulated_values.push((key, lsn, val));
@@ -2159,43 +2107,75 @@ impl Timeline {
         retention
             .pipe_to(
                 last_key,
-                &mut delta_values,
+                self,
+                &mut delta_layer_writer,
                 image_layer_writer.as_mut(),
                 &mut stat,
+                dry_run,
                 ctx,
             )
             .await?;
-        delta_layers.extend(
-            flush_deltas(
-                &mut delta_values,
-                last_key,
-                &delta_split_points,
-                &mut current_delta_split_point,
-                self,
-                lowest_retain_lsn,
-                ctx,
-                &mut stat,
-                dry_run,
-                true,
-            )
-            .await?,
-        );
-        assert!(delta_values.is_empty(), "unprocessed keys");
 
-        let image_layer = if discard_image_layer {
-            stat.discard_image_layer();
-            None
-        } else if let Some(writer) = image_layer_writer {
-            stat.produce_image_layer(writer.size());
+        let discard = |key: &PersistentLayerKey| {
+            let key = key.clone();
+            async move { KeyHistoryRetention::discard_key(&key, self, dry_run).await }
+        };
+
+        let produced_image_layers = if let Some(writer) = image_layer_writer {
             if !dry_run {
-                Some(writer.finish(self, ctx).await?)
+                writer
+                    .finish_with_discard_fn(self, ctx, hack_end_key, discard)
+                    .await?
             } else {
-                None
+                let (layers, _) = writer.take()?;
+                assert!(layers.is_empty(), "image layers produced in dry run mode?");
+                Vec::new()
             }
         } else {
-            None
+            Vec::new()
         };
 
+        let produced_delta_layers = if !dry_run {
+            delta_layer_writer
+                .finish_with_discard_fn(self, ctx, hack_end_key, discard)
+                .await?
+        } else {
+            let (layers, _) = delta_layer_writer.take()?;
+            assert!(layers.is_empty(), "delta layers produced in dry run mode?");
+            Vec::new()
+        };
+
+        let mut compact_to = Vec::new();
+        let mut keep_layers = HashSet::new();
+        let produced_delta_layers_len = produced_delta_layers.len();
+        let produced_image_layers_len = produced_image_layers.len();
+        for action in produced_delta_layers {
+            match action {
+                SplitWriterResult::Produced(layer) => {
+                    stat.produce_delta_layer(layer.layer_desc().file_size());
+                    compact_to.push(layer);
+                }
+                SplitWriterResult::Discarded(l) => {
+                    keep_layers.insert(l);
+                    stat.discard_delta_layer();
+                }
+            }
+        }
+        for action in produced_image_layers {
+            match action {
+                SplitWriterResult::Produced(layer) => {
+                    stat.produce_image_layer(layer.layer_desc().file_size());
+                    compact_to.push(layer);
+                }
+                SplitWriterResult::Discarded(l) => {
+                    keep_layers.insert(l);
+                    stat.discard_image_layer();
+                }
+            }
+        }
+        let mut layer_selection = layer_selection;
+        layer_selection.retain(|x| !keep_layers.contains(&x.layer_desc().key()));
+
         info!(
             "gc-compaction statistics: {}",
             serde_json::to_string(&stat)?
@@ -2206,28 +2186,11 @@ impl Timeline {
         }
 
         info!(
-            "produced {} delta layers and {} image layers",
-            delta_layers.len(),
-            if image_layer.is_some() { 1 } else { 0 }
+            "produced {} delta layers and {} image layers, {} layers are kept",
+            produced_delta_layers_len,
+            produced_image_layers_len,
+            layer_selection.len()
         );
-        let mut compact_to = Vec::new();
-        let mut keep_layers = HashSet::new();
-        for action in delta_layers {
-            match action {
-                FlushDeltaResult::CreateResidentLayer(layer) => {
-                    compact_to.push(layer);
-                }
-                FlushDeltaResult::KeepLayer(l) => {
-                    keep_layers.insert(l);
-                }
-            }
-        }
-        if discard_image_layer {
-            keep_layers.insert(image_layer_key);
-        }
-        let mut layer_selection = layer_selection;
-        layer_selection.retain(|x| !keep_layers.contains(&x.layer_desc().key()));
-        compact_to.extend(image_layer);
 
         // Step 3: Place back to the layer map.
         {

From bf03713fa1d0d02a931d00d4625c6f0a2bb85645 Mon Sep 17 00:00:00 2001
From: Alexey Kondratov <kondratov.aleksey@gmail.com>
Date: Mon, 26 Aug 2024 23:17:07 +0200
Subject: [PATCH 034/159] fix(sql-exporter): Fix typo in  gauge

In f4b3c317f there was a typo and I missed that on review
---
 vm-image-spec.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vm-image-spec.yaml b/vm-image-spec.yaml
index 0bacb63509..55a7cc5a9f 100644
--- a/vm-image-spec.yaml
+++ b/vm-image-spec.yaml
@@ -326,7 +326,7 @@ files:
           SELECT checkpoints_timed FROM pg_stat_bgwriter;
 
       - metric_name: compute_logical_snapshot_files
-        type: guage
+        type: gauge
         help: 'Number of snapshot files in pg_logical/snapshot'
         key_labels:
           - tenant_id

From 7820c572e73160b0dfa4628edb1723c8527d6c7e Mon Sep 17 00:00:00 2001
From: Alexey Kondratov <kondratov.aleksey@gmail.com>
Date: Mon, 26 Aug 2024 23:25:57 +0200
Subject: [PATCH 035/159] fix(sql-exporter): Remove tenant_id from
 compute_logical_snapshot_files

It appeared to be that it's already auto-added to all metrics [1]

[1]: https://github.com/neondatabase/flux-fleet/blob/3a907c317c1e4515190b04a8726e00a9f976214a/apps/base/ext-vmagent/vmagent.yaml#L43
---
 vm-image-spec.yaml | 2 --
 1 file changed, 2 deletions(-)

diff --git a/vm-image-spec.yaml b/vm-image-spec.yaml
index 55a7cc5a9f..c94f95f447 100644
--- a/vm-image-spec.yaml
+++ b/vm-image-spec.yaml
@@ -329,12 +329,10 @@ files:
         type: gauge
         help: 'Number of snapshot files in pg_logical/snapshot'
         key_labels:
-          - tenant_id
           - timeline_id
         values: [num_logical_snapshot_files]
         query: |
           SELECT
-            (SELECT setting FROM pg_settings WHERE name = 'neon.tenant_id') AS tenant_id,
             (SELECT setting FROM pg_settings WHERE name = 'neon.timeline_id') AS timeline_id,
             -- Postgres creates temporary snapshot files of the form %X-%X.snap.%d.tmp. These
             -- temporary snapshot files are renamed to the actual snapshot files after they are

From 09362b63635d46cecef77cb04bd1e406dabd2026 Mon Sep 17 00:00:00 2001
From: Arseny Sher <sher-ars@yandex.ru>
Date: Mon, 26 Aug 2024 16:12:51 +0300
Subject: [PATCH 036/159] safekeeper: reorder routes and their handlers.

Routes and their handlers were in a bit different order in 1) routes
list 2) their implementation 3) python client 4) openapi spec, making
addition of new ones intimidating. Make it the same everywhere, roughly
lexicographically but preserving some of existing logic.

No functional changes.
---
 safekeeper/src/http/openapi_spec.yaml   |  70 ++++----
 safekeeper/src/http/routes.rs           | 216 ++++++++++++------------
 test_runner/fixtures/safekeeper/http.py | 148 ++++++++--------
 3 files changed, 216 insertions(+), 218 deletions(-)

diff --git a/safekeeper/src/http/openapi_spec.yaml b/safekeeper/src/http/openapi_spec.yaml
index a617e0310c..70999853c2 100644
--- a/safekeeper/src/http/openapi_spec.yaml
+++ b/safekeeper/src/http/openapi_spec.yaml
@@ -86,42 +86,6 @@ paths:
         default:
           $ref: "#/components/responses/GenericError"
 
-  /v1/tenant/{tenant_id}/timeline/{source_timeline_id}/copy:
-    parameters:
-      - name: tenant_id
-        in: path
-        required: true
-        schema:
-          type: string
-          format: hex
-      - name: source_timeline_id
-        in: path
-        required: true
-        schema:
-          type: string
-          format: hex
-
-    post:
-      tags:
-      - "Timeline"
-      summary: Register new timeline as copy of existing timeline
-      description: ""
-      operationId: v1CopyTenantTimeline
-      requestBody:
-        content:
-          application/json:
-            schema:
-              $ref: "#/components/schemas/TimelineCopyRequest"
-      responses:
-        "201":
-          description: Timeline created
-          # TODO: return timeline info?
-        "403":
-          $ref: "#/components/responses/ForbiddenError"
-        default:
-          $ref: "#/components/responses/GenericError"
-
-
   /v1/tenant/{tenant_id}/timeline/{timeline_id}:
     parameters:
       - name: tenant_id
@@ -179,6 +143,40 @@ paths:
         default:
           $ref: "#/components/responses/GenericError"
 
+  /v1/tenant/{tenant_id}/timeline/{source_timeline_id}/copy:
+    parameters:
+      - name: tenant_id
+        in: path
+        required: true
+        schema:
+          type: string
+          format: hex
+      - name: source_timeline_id
+        in: path
+        required: true
+        schema:
+          type: string
+          format: hex
+
+    post:
+      tags:
+      - "Timeline"
+      summary: Register new timeline as copy of existing timeline
+      description: ""
+      operationId: v1CopyTenantTimeline
+      requestBody:
+        content:
+          application/json:
+            schema:
+              $ref: "#/components/schemas/TimelineCopyRequest"
+      responses:
+        "201":
+          description: Timeline created
+          # TODO: return timeline info?
+        "403":
+          $ref: "#/components/responses/ForbiddenError"
+        default:
+          $ref: "#/components/responses/GenericError"
 
   /v1/record_safekeeper_info/{tenant_id}/{timeline_id}:
     parameters:
diff --git a/safekeeper/src/http/routes.rs b/safekeeper/src/http/routes.rs
index d11815f6ef..91ffa95c21 100644
--- a/safekeeper/src/http/routes.rs
+++ b/safekeeper/src/http/routes.rs
@@ -114,7 +114,55 @@ fn check_permission(request: &Request<Body>, tenant_id: Option<TenantId>) -> Res
     })
 }
 
+/// Deactivates all timelines for the tenant and removes its data directory.
+/// See `timeline_delete_handler`.
+async fn tenant_delete_handler(mut request: Request<Body>) -> Result<Response<Body>, ApiError> {
+    let tenant_id = parse_request_param(&request, "tenant_id")?;
+    let only_local = parse_query_param(&request, "only_local")?.unwrap_or(false);
+    check_permission(&request, Some(tenant_id))?;
+    ensure_no_body(&mut request).await?;
+    // FIXME: `delete_force_all_for_tenant` can return an error for multiple different reasons;
+    // Using an `InternalServerError` should be fixed when the types support it
+    let delete_info = GlobalTimelines::delete_force_all_for_tenant(&tenant_id, only_local)
+        .await
+        .map_err(ApiError::InternalServerError)?;
+    json_response(
+        StatusCode::OK,
+        delete_info
+            .iter()
+            .map(|(ttid, resp)| (format!("{}", ttid.timeline_id), *resp))
+            .collect::<HashMap<String, TimelineDeleteForceResult>>(),
+    )
+}
+
+async fn timeline_create_handler(mut request: Request<Body>) -> Result<Response<Body>, ApiError> {
+    let request_data: TimelineCreateRequest = json_request(&mut request).await?;
+
+    let ttid = TenantTimelineId {
+        tenant_id: request_data.tenant_id,
+        timeline_id: request_data.timeline_id,
+    };
+    check_permission(&request, Some(ttid.tenant_id))?;
+
+    let server_info = ServerInfo {
+        pg_version: request_data.pg_version,
+        system_id: request_data.system_id.unwrap_or(0),
+        wal_seg_size: request_data.wal_seg_size.unwrap_or(WAL_SEGMENT_SIZE as u32),
+    };
+    let local_start_lsn = request_data.local_start_lsn.unwrap_or_else(|| {
+        request_data
+            .commit_lsn
+            .segment_lsn(server_info.wal_seg_size as usize)
+    });
+    GlobalTimelines::create(ttid, server_info, request_data.commit_lsn, local_start_lsn)
+        .await
+        .map_err(ApiError::InternalServerError)?;
+
+    json_response(StatusCode::OK, ())
+}
+
 /// List all (not deleted) timelines.
+/// Note: it is possible to do the same with debug_dump.
 async fn timeline_list_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
     check_permission(&request, None)?;
     let res: Vec<TenantTimelineId> = GlobalTimelines::get_all()
@@ -174,30 +222,21 @@ async fn timeline_status_handler(request: Request<Body>) -> Result<Response<Body
     json_response(StatusCode::OK, status)
 }
 
-async fn timeline_create_handler(mut request: Request<Body>) -> Result<Response<Body>, ApiError> {
-    let request_data: TimelineCreateRequest = json_request(&mut request).await?;
-
-    let ttid = TenantTimelineId {
-        tenant_id: request_data.tenant_id,
-        timeline_id: request_data.timeline_id,
-    };
+/// Deactivates the timeline and removes its data directory.
+async fn timeline_delete_handler(mut request: Request<Body>) -> Result<Response<Body>, ApiError> {
+    let ttid = TenantTimelineId::new(
+        parse_request_param(&request, "tenant_id")?,
+        parse_request_param(&request, "timeline_id")?,
+    );
+    let only_local = parse_query_param(&request, "only_local")?.unwrap_or(false);
     check_permission(&request, Some(ttid.tenant_id))?;
-
-    let server_info = ServerInfo {
-        pg_version: request_data.pg_version,
-        system_id: request_data.system_id.unwrap_or(0),
-        wal_seg_size: request_data.wal_seg_size.unwrap_or(WAL_SEGMENT_SIZE as u32),
-    };
-    let local_start_lsn = request_data.local_start_lsn.unwrap_or_else(|| {
-        request_data
-            .commit_lsn
-            .segment_lsn(server_info.wal_seg_size as usize)
-    });
-    GlobalTimelines::create(ttid, server_info, request_data.commit_lsn, local_start_lsn)
+    ensure_no_body(&mut request).await?;
+    // FIXME: `delete_force` can fail from both internal errors and bad requests. Add better
+    // error handling here when we're able to.
+    let resp = GlobalTimelines::delete(&ttid, only_local)
         .await
         .map_err(ApiError::InternalServerError)?;
-
-    json_response(StatusCode::OK, ())
+    json_response(StatusCode::OK, resp)
 }
 
 /// Pull timeline from peer safekeeper instances.
@@ -279,6 +318,46 @@ async fn timeline_copy_handler(mut request: Request<Body>) -> Result<Response<Bo
     json_response(StatusCode::OK, ())
 }
 
+async fn patch_control_file_handler(
+    mut request: Request<Body>,
+) -> Result<Response<Body>, ApiError> {
+    check_permission(&request, None)?;
+
+    let ttid = TenantTimelineId::new(
+        parse_request_param(&request, "tenant_id")?,
+        parse_request_param(&request, "timeline_id")?,
+    );
+
+    let tli = GlobalTimelines::get(ttid).map_err(ApiError::from)?;
+
+    let patch_request: patch_control_file::Request = json_request(&mut request).await?;
+    let response = patch_control_file::handle_request(tli, patch_request)
+        .await
+        .map_err(ApiError::InternalServerError)?;
+
+    json_response(StatusCode::OK, response)
+}
+
+/// Force persist control file.
+async fn timeline_checkpoint_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
+    check_permission(&request, None)?;
+
+    let ttid = TenantTimelineId::new(
+        parse_request_param(&request, "tenant_id")?,
+        parse_request_param(&request, "timeline_id")?,
+    );
+
+    let tli = GlobalTimelines::get(ttid)?;
+    tli.write_shared_state()
+        .await
+        .sk
+        .state_mut()
+        .flush()
+        .await
+        .map_err(ApiError::InternalServerError)?;
+    json_response(StatusCode::OK, ())
+}
+
 async fn timeline_digest_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
     let ttid = TenantTimelineId::new(
         parse_request_param(&request, "tenant_id")?,
@@ -310,64 +389,6 @@ async fn timeline_digest_handler(request: Request<Body>) -> Result<Response<Body
     json_response(StatusCode::OK, response)
 }
 
-/// Force persist control file.
-async fn timeline_checkpoint_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
-    check_permission(&request, None)?;
-
-    let ttid = TenantTimelineId::new(
-        parse_request_param(&request, "tenant_id")?,
-        parse_request_param(&request, "timeline_id")?,
-    );
-
-    let tli = GlobalTimelines::get(ttid)?;
-    tli.write_shared_state()
-        .await
-        .sk
-        .state_mut()
-        .flush()
-        .await
-        .map_err(ApiError::InternalServerError)?;
-    json_response(StatusCode::OK, ())
-}
-
-/// Deactivates the timeline and removes its data directory.
-async fn timeline_delete_handler(mut request: Request<Body>) -> Result<Response<Body>, ApiError> {
-    let ttid = TenantTimelineId::new(
-        parse_request_param(&request, "tenant_id")?,
-        parse_request_param(&request, "timeline_id")?,
-    );
-    let only_local = parse_query_param(&request, "only_local")?.unwrap_or(false);
-    check_permission(&request, Some(ttid.tenant_id))?;
-    ensure_no_body(&mut request).await?;
-    // FIXME: `delete_force` can fail from both internal errors and bad requests. Add better
-    // error handling here when we're able to.
-    let resp = GlobalTimelines::delete(&ttid, only_local)
-        .await
-        .map_err(ApiError::InternalServerError)?;
-    json_response(StatusCode::OK, resp)
-}
-
-/// Deactivates all timelines for the tenant and removes its data directory.
-/// See `timeline_delete_handler`.
-async fn tenant_delete_handler(mut request: Request<Body>) -> Result<Response<Body>, ApiError> {
-    let tenant_id = parse_request_param(&request, "tenant_id")?;
-    let only_local = parse_query_param(&request, "only_local")?.unwrap_or(false);
-    check_permission(&request, Some(tenant_id))?;
-    ensure_no_body(&mut request).await?;
-    // FIXME: `delete_force_all_for_tenant` can return an error for multiple different reasons;
-    // Using an `InternalServerError` should be fixed when the types support it
-    let delete_info = GlobalTimelines::delete_force_all_for_tenant(&tenant_id, only_local)
-        .await
-        .map_err(ApiError::InternalServerError)?;
-    json_response(
-        StatusCode::OK,
-        delete_info
-            .iter()
-            .map(|(ttid, resp)| (format!("{}", ttid.timeline_id), *resp))
-            .collect::<HashMap<String, TimelineDeleteForceResult>>(),
-    )
-}
-
 /// Used only in tests to hand craft required data.
 async fn record_safekeeper_info(mut request: Request<Body>) -> Result<Response<Body>, ApiError> {
     let ttid = TenantTimelineId::new(
@@ -509,26 +530,6 @@ async fn dump_debug_handler(mut request: Request<Body>) -> Result<Response<Body>
     Ok(response)
 }
 
-async fn patch_control_file_handler(
-    mut request: Request<Body>,
-) -> Result<Response<Body>, ApiError> {
-    check_permission(&request, None)?;
-
-    let ttid = TenantTimelineId::new(
-        parse_request_param(&request, "tenant_id")?,
-        parse_request_param(&request, "timeline_id")?,
-    );
-
-    let tli = GlobalTimelines::get(ttid).map_err(ApiError::from)?;
-
-    let patch_request: patch_control_file::Request = json_request(&mut request).await?;
-    let response = patch_control_file::handle_request(tli, patch_request)
-        .await
-        .map_err(ApiError::InternalServerError)?;
-
-    json_response(StatusCode::OK, response)
-}
-
 /// Safekeeper http router.
 pub fn make_router(conf: SafeKeeperConf) -> RouterBuilder<hyper::Body, ApiError> {
     let mut router = endpoint::make_router();
@@ -568,6 +569,9 @@ pub fn make_router(conf: SafeKeeperConf) -> RouterBuilder<hyper::Body, ApiError>
                 failpoints_handler(r, cancel).await
             })
         })
+        .delete("/v1/tenant/:tenant_id", |r| {
+            request_span(r, tenant_delete_handler)
+        })
         // Will be used in the future instead of implicit timeline creation
         .post("/v1/tenant/timeline", |r| {
             request_span(r, timeline_create_handler)
@@ -581,16 +585,13 @@ pub fn make_router(conf: SafeKeeperConf) -> RouterBuilder<hyper::Body, ApiError>
         .delete("/v1/tenant/:tenant_id/timeline/:timeline_id", |r| {
             request_span(r, timeline_delete_handler)
         })
-        .delete("/v1/tenant/:tenant_id", |r| {
-            request_span(r, tenant_delete_handler)
+        .post("/v1/pull_timeline", |r| {
+            request_span(r, timeline_pull_handler)
         })
         .get(
             "/v1/tenant/:tenant_id/timeline/:timeline_id/snapshot/:destination_id",
             |r| request_span(r, timeline_snapshot_handler),
         )
-        .post("/v1/pull_timeline", |r| {
-            request_span(r, timeline_pull_handler)
-        })
         .post(
             "/v1/tenant/:tenant_id/timeline/:source_timeline_id/copy",
             |r| request_span(r, timeline_copy_handler),
@@ -603,14 +604,13 @@ pub fn make_router(conf: SafeKeeperConf) -> RouterBuilder<hyper::Body, ApiError>
             "/v1/tenant/:tenant_id/timeline/:timeline_id/checkpoint",
             |r| request_span(r, timeline_checkpoint_handler),
         )
-        // for tests
+        .get("/v1/tenant/:tenant_id/timeline/:timeline_id/digest", |r| {
+            request_span(r, timeline_digest_handler)
+        })
         .post("/v1/record_safekeeper_info/:tenant_id/:timeline_id", |r| {
             request_span(r, record_safekeeper_info)
         })
         .get("/v1/debug_dump", |r| request_span(r, dump_debug_handler))
-        .get("/v1/tenant/:tenant_id/timeline/:timeline_id/digest", |r| {
-            request_span(r, timeline_digest_handler)
-        })
 }
 
 #[cfg(test)]
diff --git a/test_runner/fixtures/safekeeper/http.py b/test_runner/fixtures/safekeeper/http.py
index dd3a0a3d54..05b43cfb72 100644
--- a/test_runner/fixtures/safekeeper/http.py
+++ b/test_runner/fixtures/safekeeper/http.py
@@ -65,6 +65,16 @@ class SafekeeperHttpClient(requests.Session, MetricsGetter):
     def check_status(self):
         self.get(f"http://localhost:{self.port}/v1/status").raise_for_status()
 
+    def get_metrics_str(self) -> str:
+        """You probably want to use get_metrics() instead."""
+        request_result = self.get(f"http://localhost:{self.port}/metrics")
+        request_result.raise_for_status()
+        return request_result.text
+
+    def get_metrics(self) -> SafekeeperMetrics:
+        res = self.get_metrics_str()
+        return SafekeeperMetrics(parse_metrics(res))
+
     def is_testing_enabled_or_skip(self):
         if not self.is_testing_enabled:
             pytest.skip("safekeeper was built without 'testing' feature")
@@ -89,56 +99,8 @@ class SafekeeperHttpClient(requests.Session, MetricsGetter):
         assert res_json is None
         return res_json
 
-    def debug_dump(self, params: Optional[Dict[str, str]] = None) -> Dict[str, Any]:
-        params = params or {}
-        res = self.get(f"http://localhost:{self.port}/v1/debug_dump", params=params)
-        res.raise_for_status()
-        res_json = json.loads(res.text)
-        assert isinstance(res_json, dict)
-        return res_json
-
-    def patch_control_file(
-        self,
-        tenant_id: TenantId,
-        timeline_id: TimelineId,
-        patch: Dict[str, Any],
-    ) -> Dict[str, Any]:
-        res = self.patch(
-            f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/control_file",
-            json={
-                "updates": patch,
-                "apply_fields": list(patch.keys()),
-            },
-        )
-        res.raise_for_status()
-        res_json = res.json()
-        assert isinstance(res_json, dict)
-        return res_json
-
-    def pull_timeline(self, body: Dict[str, Any]) -> Dict[str, Any]:
-        res = self.post(f"http://localhost:{self.port}/v1/pull_timeline", json=body)
-        res.raise_for_status()
-        res_json = res.json()
-        assert isinstance(res_json, dict)
-        return res_json
-
-    def copy_timeline(self, tenant_id: TenantId, timeline_id: TimelineId, body: Dict[str, Any]):
-        res = self.post(
-            f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/copy",
-            json=body,
-        )
-        res.raise_for_status()
-
-    def timeline_digest(
-        self, tenant_id: TenantId, timeline_id: TimelineId, from_lsn: Lsn, until_lsn: Lsn
-    ) -> Dict[str, Any]:
-        res = self.get(
-            f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/digest",
-            params={
-                "from_lsn": str(from_lsn),
-                "until_lsn": str(until_lsn),
-            },
-        )
+    def tenant_delete_force(self, tenant_id: TenantId) -> Dict[Any, Any]:
+        res = self.delete(f"http://localhost:{self.port}/v1/tenant/{tenant_id}")
         res.raise_for_status()
         res_json = res.json()
         assert isinstance(res_json, dict)
@@ -189,20 +151,6 @@ class SafekeeperHttpClient(requests.Session, MetricsGetter):
     def get_commit_lsn(self, tenant_id: TenantId, timeline_id: TimelineId) -> Lsn:
         return self.timeline_status(tenant_id, timeline_id).commit_lsn
 
-    def record_safekeeper_info(self, tenant_id: TenantId, timeline_id: TimelineId, body):
-        res = self.post(
-            f"http://localhost:{self.port}/v1/record_safekeeper_info/{tenant_id}/{timeline_id}",
-            json=body,
-        )
-        res.raise_for_status()
-
-    def checkpoint(self, tenant_id: TenantId, timeline_id: TimelineId):
-        res = self.post(
-            f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/checkpoint",
-            json={},
-        )
-        res.raise_for_status()
-
     # only_local doesn't remove segments in the remote storage.
     def timeline_delete(
         self, tenant_id: TenantId, timeline_id: TimelineId, only_local: bool = False
@@ -218,19 +166,71 @@ class SafekeeperHttpClient(requests.Session, MetricsGetter):
         assert isinstance(res_json, dict)
         return res_json
 
-    def tenant_delete_force(self, tenant_id: TenantId) -> Dict[Any, Any]:
-        res = self.delete(f"http://localhost:{self.port}/v1/tenant/{tenant_id}")
+    def debug_dump(self, params: Optional[Dict[str, str]] = None) -> Dict[str, Any]:
+        params = params or {}
+        res = self.get(f"http://localhost:{self.port}/v1/debug_dump", params=params)
+        res.raise_for_status()
+        res_json = json.loads(res.text)
+        assert isinstance(res_json, dict)
+        return res_json
+
+    def pull_timeline(self, body: Dict[str, Any]) -> Dict[str, Any]:
+        res = self.post(f"http://localhost:{self.port}/v1/pull_timeline", json=body)
         res.raise_for_status()
         res_json = res.json()
         assert isinstance(res_json, dict)
         return res_json
 
-    def get_metrics_str(self) -> str:
-        """You probably want to use get_metrics() instead."""
-        request_result = self.get(f"http://localhost:{self.port}/metrics")
-        request_result.raise_for_status()
-        return request_result.text
+    def copy_timeline(self, tenant_id: TenantId, timeline_id: TimelineId, body: Dict[str, Any]):
+        res = self.post(
+            f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/copy",
+            json=body,
+        )
+        res.raise_for_status()
 
-    def get_metrics(self) -> SafekeeperMetrics:
-        res = self.get_metrics_str()
-        return SafekeeperMetrics(parse_metrics(res))
+    def patch_control_file(
+        self,
+        tenant_id: TenantId,
+        timeline_id: TimelineId,
+        patch: Dict[str, Any],
+    ) -> Dict[str, Any]:
+        res = self.patch(
+            f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/control_file",
+            json={
+                "updates": patch,
+                "apply_fields": list(patch.keys()),
+            },
+        )
+        res.raise_for_status()
+        res_json = res.json()
+        assert isinstance(res_json, dict)
+        return res_json
+
+    def checkpoint(self, tenant_id: TenantId, timeline_id: TimelineId):
+        res = self.post(
+            f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/checkpoint",
+            json={},
+        )
+        res.raise_for_status()
+
+    def timeline_digest(
+        self, tenant_id: TenantId, timeline_id: TimelineId, from_lsn: Lsn, until_lsn: Lsn
+    ) -> Dict[str, Any]:
+        res = self.get(
+            f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/digest",
+            params={
+                "from_lsn": str(from_lsn),
+                "until_lsn": str(until_lsn),
+            },
+        )
+        res.raise_for_status()
+        res_json = res.json()
+        assert isinstance(res_json, dict)
+        return res_json
+
+    def record_safekeeper_info(self, tenant_id: TenantId, timeline_id: TimelineId, body):
+        res = self.post(
+            f"http://localhost:{self.port}/v1/record_safekeeper_info/{tenant_id}/{timeline_id}",
+            json=body,
+        )
+        res.raise_for_status()

From 5d527133a322a940d18f8613eb12078d2254fa07 Mon Sep 17 00:00:00 2001
From: a-masterov <72613290+a-masterov@users.noreply.github.com>
Date: Tue, 27 Aug 2024 12:39:42 +0200
Subject: [PATCH 037/159] Fix the pg_hintplan flakyness (#8834)

## Problem
pg_hintplan test seems to be flaky, sometimes it fails, while usually it
passes

## Summary of changes

The regression test is changed to filter out the Neon service queries. The
expected file is changed as well.

## Checklist before requesting a review

- [x] I have performed a self-review of my code.
- [ ] If it is a core feature, I have added thorough tests.
- [ ] Do we need to implement analytics? if so did you add the relevant
metrics to the dashboard?
- [ ] If this PR requires public announcement, mark it with
/release-notes label and add several sentences in this section.

## Checklist before merging

- [ ] Do not forget to reformat commit message to not include the above
checklist
---
 Dockerfile.compute-node                       |  4 +-
 docker-compose/run-tests.sh                   |  2 +-
 .../{pg_hintplan.patch => pg_hint_plan.patch} | 40 +++++++++++++------
 3 files changed, 30 insertions(+), 16 deletions(-)
 rename patches/{pg_hintplan.patch => pg_hint_plan.patch} (55%)

diff --git a/Dockerfile.compute-node b/Dockerfile.compute-node
index 7acaf2f2fd..b6c89cd71f 100644
--- a/Dockerfile.compute-node
+++ b/Dockerfile.compute-node
@@ -942,7 +942,7 @@ COPY --from=hll-pg-build /hll.tar.gz /ext-src
 COPY --from=plpgsql-check-pg-build /plpgsql_check.tar.gz /ext-src
 #COPY --from=timescaledb-pg-build /timescaledb.tar.gz /ext-src
 COPY --from=pg-hint-plan-pg-build /pg_hint_plan.tar.gz /ext-src
-COPY patches/pg_hintplan.patch /ext-src
+COPY patches/pg_hint_plan.patch /ext-src
 COPY --from=pg-cron-pg-build /pg_cron.tar.gz /ext-src
 COPY patches/pg_cron.patch /ext-src
 #COPY --from=pg-pgx-ulid-build /home/nonroot/pgx_ulid.tar.gz /ext-src
@@ -964,7 +964,7 @@ RUN cd /ext-src/pgvector-src && patch -p1 <../pgvector.patch
 RUN cd /ext-src/rum-src && patch -p1 <../rum.patch
 # cmake is required for the h3 test
 RUN apt-get update && apt-get install -y cmake
-RUN patch -p1 < /ext-src/pg_hintplan.patch
+RUN cd /ext-src/pg_hint_plan-src && patch -p1 < /ext-src/pg_hint_plan.patch
 COPY --chmod=755 docker-compose/run-tests.sh /run-tests.sh
 RUN patch -p1 </ext-src/pg_anon.patch
 RUN patch -p1 </ext-src/pg_cron.patch
diff --git a/docker-compose/run-tests.sh b/docker-compose/run-tests.sh
index 58b2581197..3fc0b90071 100644
--- a/docker-compose/run-tests.sh
+++ b/docker-compose/run-tests.sh
@@ -3,7 +3,7 @@ set -x
 
 cd /ext-src || exit 2
 FAILED=
-LIST=$( (echo "${SKIP//","/"\n"}"; ls -d -- *-src) | sort | uniq -u)
+LIST=$( (echo -e "${SKIP//","/"\n"}"; ls -d -- *-src) | sort | uniq -u)
 for d in ${LIST}
 do
        [ -d "${d}" ] || continue
diff --git a/patches/pg_hintplan.patch b/patches/pg_hint_plan.patch
similarity index 55%
rename from patches/pg_hintplan.patch
rename to patches/pg_hint_plan.patch
index 61a5ecbb90..4039a036df 100644
--- a/patches/pg_hintplan.patch
+++ b/patches/pg_hint_plan.patch
@@ -1,13 +1,7 @@
-commit f7925d4d1406c0f0229e3c691c94b69e381899b1 (HEAD -> master)
-Author: Alexey Masterov <alexeymasterov@neon.tech>
-Date:   Thu Jun 6 08:02:42 2024 +0000
-
-    Patch expected files to consider Neon's log messages
-
-diff --git a/ext-src/pg_hint_plan-src/expected/ut-A.out b/ext-src/pg_hint_plan-src/expected/ut-A.out
-index da723b8..f8d0102 100644
---- a/ext-src/pg_hint_plan-src/expected/ut-A.out
-+++ b/ext-src/pg_hint_plan-src/expected/ut-A.out
+diff --git a/expected/ut-A.out b/expected/ut-A.out
+index da723b8..5328114 100644
+--- a/expected/ut-A.out
++++ b/expected/ut-A.out
 @@ -9,13 +9,16 @@ SET search_path TO public;
  ----
  -- No.A-1-1-3
@@ -25,10 +19,18 @@ index da723b8..f8d0102 100644
  DROP SCHEMA other_schema;
  ----
  ---- No. A-5-1 comment pattern
-diff --git a/ext-src/pg_hint_plan-src/expected/ut-fdw.out b/ext-src/pg_hint_plan-src/expected/ut-fdw.out
+@@ -3175,6 +3178,7 @@ SELECT s.query, s.calls
+   FROM public.pg_stat_statements s
+   JOIN pg_catalog.pg_database d
+     ON (s.dbid = d.oid)
++  WHERE s.query LIKE 'SELECT * FROM s1.t1%' OR s.query LIKE '%pg_stat_statements_reset%'
+  ORDER BY 1;
+                 query                 | calls 
+ --------------------------------------+-------
+diff --git a/expected/ut-fdw.out b/expected/ut-fdw.out
 index d372459..6282afe 100644
---- a/ext-src/pg_hint_plan-src/expected/ut-fdw.out
-+++ b/ext-src/pg_hint_plan-src/expected/ut-fdw.out
+--- a/expected/ut-fdw.out
++++ b/expected/ut-fdw.out
 @@ -7,6 +7,7 @@ SET pg_hint_plan.debug_print TO on;
  SET client_min_messages TO LOG;
  SET pg_hint_plan.enable_hint TO on;
@@ -37,3 +39,15 @@ index d372459..6282afe 100644
  CREATE SERVER file_server FOREIGN DATA WRAPPER file_fdw;
  CREATE USER MAPPING FOR PUBLIC SERVER file_server;
  CREATE FOREIGN TABLE ft1 (id int, val int) SERVER file_server OPTIONS (format 'csv', filename :'filename');
+diff --git a/sql/ut-A.sql b/sql/ut-A.sql
+index 7c7d58a..4fd1a07 100644
+--- a/sql/ut-A.sql
++++ b/sql/ut-A.sql
+@@ -963,6 +963,7 @@ SELECT s.query, s.calls
+   FROM public.pg_stat_statements s
+   JOIN pg_catalog.pg_database d
+     ON (s.dbid = d.oid)
++  WHERE s.query LIKE 'SELECT * FROM s1.t1%' OR s.query LIKE '%pg_stat_statements_reset%'
+  ORDER BY 1;
+ 
+ ----

From 12850dd5e972ed2be27ce05cf36eff3f846d9aab Mon Sep 17 00:00:00 2001
From: Conrad Ludgate <conrad@neon.tech>
Date: Tue, 27 Aug 2024 12:00:35 +0100
Subject: [PATCH 038/159] proxy: remove dead code (#8847)

By marking everything possible as pub(crate), we find a few dead code
candidates.
---
 proxy/src/auth.rs                             | 24 ++---
 proxy/src/auth/backend.rs                     | 55 +++++------
 proxy/src/auth/backend/hacks.rs               |  4 +-
 proxy/src/auth/backend/jwt.rs                 | 18 ++--
 proxy/src/auth/backend/link.rs                |  4 +-
 proxy/src/auth/backend/local.rs               |  8 +-
 proxy/src/auth/credentials.rs                 | 20 ++--
 proxy/src/auth/flow.rs                        | 31 +++---
 proxy/src/auth/password_hack.rs               | 10 +-
 proxy/src/cache.rs                            | 10 +-
 proxy/src/cache/common.rs                     | 18 ++--
 proxy/src/cache/endpoints.rs                  |  6 +-
 proxy/src/cache/project_info.rs               | 24 ++---
 proxy/src/cache/timed_lru.rs                  | 45 ++-------
 proxy/src/cancellation.rs                     | 16 ++--
 proxy/src/compute.rs                          | 26 ++---
 proxy/src/console.rs                          |  2 +-
 proxy/src/console/messages.rs                 | 94 ++++++++++---------
 proxy/src/console/mgmt.rs                     |  6 +-
 proxy/src/console/provider.rs                 | 59 ++++++------
 proxy/src/console/provider/mock.rs            |  2 +-
 proxy/src/console/provider/neon.rs            |  6 +-
 proxy/src/context.rs                          | 57 +++++------
 proxy/src/context/parquet.rs                  |  6 +-
 proxy/src/error.rs                            | 10 +-
 proxy/src/http.rs                             | 16 ++--
 proxy/src/intern.rs                           | 34 ++++---
 proxy/src/lib.rs                              | 17 ++--
 proxy/src/parse.rs                            |  4 +-
 proxy/src/protocol2.rs                        |  6 +-
 proxy/src/proxy.rs                            | 40 ++++----
 proxy/src/proxy/connect_compute.rs            | 14 +--
 proxy/src/proxy/copy_bidirectional.rs         |  2 +-
 proxy/src/proxy/handshake.rs                  |  6 +-
 proxy/src/proxy/passthrough.rs                | 18 ++--
 proxy/src/proxy/retry.rs                      |  8 +-
 proxy/src/proxy/tests.rs                      | 13 +--
 proxy/src/proxy/tests/mitm.rs                 |  2 +-
 proxy/src/proxy/wake_compute.rs               |  2 +-
 proxy/src/rate_limiter.rs                     | 14 ++-
 proxy/src/rate_limiter/leaky_bucket.rs        |  9 +-
 proxy/src/rate_limiter/limit_algorithm.rs     | 58 ++++++------
 .../src/rate_limiter/limit_algorithm/aimd.rs  | 12 +--
 proxy/src/rate_limiter/limiter.rs             | 14 +--
 proxy/src/redis/cancellation_publisher.rs     |  2 +-
 .../connection_with_credentials_provider.rs   | 11 ++-
 proxy/src/redis/elasticache.rs                |  2 +-
 proxy/src/redis/notifications.rs              | 12 +--
 proxy/src/sasl.rs                             | 14 +--
 proxy/src/sasl/channel_binding.rs             | 11 ++-
 proxy/src/sasl/messages.rs                    |  8 +-
 proxy/src/sasl/stream.rs                      |  8 +-
 proxy/src/scram.rs                            | 10 +-
 proxy/src/scram/countmin.rs                   |  8 +-
 proxy/src/scram/exchange.rs                   |  6 +-
 proxy/src/scram/key.rs                        |  8 +-
 proxy/src/scram/messages.rs                   | 34 +++----
 proxy/src/scram/pbkdf2.rs                     |  8 +-
 proxy/src/scram/secret.rs                     | 20 ++--
 proxy/src/scram/signature.rs                  | 14 +--
 proxy/src/scram/threadpool.rs                 |  2 +-
 proxy/src/serverless.rs                       |  8 +-
 proxy/src/serverless/backend.rs               | 16 ++--
 proxy/src/serverless/cancel_set.rs            | 10 +-
 proxy/src/serverless/conn_pool.rs             | 50 +++++-----
 proxy/src/serverless/http_util.rs             |  6 +-
 proxy/src/serverless/json.rs                  |  6 +-
 proxy/src/serverless/sql_over_http.rs         | 10 +-
 proxy/src/serverless/websocket.rs             |  6 +-
 proxy/src/stream.rs                           | 15 +--
 proxy/src/url.rs                              |  4 +-
 proxy/src/usage_metrics.rs                    | 16 ++--
 proxy/src/waiters.rs                          | 14 +--
 73 files changed, 580 insertions(+), 609 deletions(-)

diff --git a/proxy/src/auth.rs b/proxy/src/auth.rs
index 3b3c571129..f3ecedb839 100644
--- a/proxy/src/auth.rs
+++ b/proxy/src/auth.rs
@@ -4,17 +4,17 @@ pub mod backend;
 pub use backend::BackendType;
 
 mod credentials;
-pub use credentials::{
+pub(crate) use credentials::{
     check_peer_addr_is_in_list, endpoint_sni, ComputeUserInfoMaybeEndpoint,
     ComputeUserInfoParseError, IpPattern,
 };
 
 mod password_hack;
-pub use password_hack::parse_endpoint_param;
+pub(crate) use password_hack::parse_endpoint_param;
 use password_hack::PasswordHackPayload;
 
 mod flow;
-pub use flow::*;
+pub(crate) use flow::*;
 use tokio::time::error::Elapsed;
 
 use crate::{
@@ -25,11 +25,11 @@ use std::{io, net::IpAddr};
 use thiserror::Error;
 
 /// Convenience wrapper for the authentication error.
-pub type Result<T> = std::result::Result<T, AuthError>;
+pub(crate) type Result<T> = std::result::Result<T, AuthError>;
 
 /// Common authentication error.
 #[derive(Debug, Error)]
-pub enum AuthErrorImpl {
+pub(crate) enum AuthErrorImpl {
     #[error(transparent)]
     Link(#[from] backend::LinkAuthError),
 
@@ -77,30 +77,30 @@ pub enum AuthErrorImpl {
 
 #[derive(Debug, Error)]
 #[error(transparent)]
-pub struct AuthError(Box<AuthErrorImpl>);
+pub(crate) struct AuthError(Box<AuthErrorImpl>);
 
 impl AuthError {
-    pub fn bad_auth_method(name: impl Into<Box<str>>) -> Self {
+    pub(crate) fn bad_auth_method(name: impl Into<Box<str>>) -> Self {
         AuthErrorImpl::BadAuthMethod(name.into()).into()
     }
 
-    pub fn auth_failed(user: impl Into<Box<str>>) -> Self {
+    pub(crate) fn auth_failed(user: impl Into<Box<str>>) -> Self {
         AuthErrorImpl::AuthFailed(user.into()).into()
     }
 
-    pub fn ip_address_not_allowed(ip: IpAddr) -> Self {
+    pub(crate) fn ip_address_not_allowed(ip: IpAddr) -> Self {
         AuthErrorImpl::IpAddressNotAllowed(ip).into()
     }
 
-    pub fn too_many_connections() -> Self {
+    pub(crate) fn too_many_connections() -> Self {
         AuthErrorImpl::TooManyConnections.into()
     }
 
-    pub fn is_auth_failed(&self) -> bool {
+    pub(crate) fn is_auth_failed(&self) -> bool {
         matches!(self.0.as_ref(), AuthErrorImpl::AuthFailed(_))
     }
 
-    pub fn user_timeout(elapsed: Elapsed) -> Self {
+    pub(crate) fn user_timeout(elapsed: Elapsed) -> Self {
         AuthErrorImpl::UserTimeout(elapsed).into()
     }
 }
diff --git a/proxy/src/auth/backend.rs b/proxy/src/auth/backend.rs
index bb9a0ddffc..77dea39fdc 100644
--- a/proxy/src/auth/backend.rs
+++ b/proxy/src/auth/backend.rs
@@ -9,7 +9,7 @@ use std::sync::Arc;
 use std::time::Duration;
 
 use ipnet::{Ipv4Net, Ipv6Net};
-pub use link::LinkAuthError;
+pub(crate) use link::LinkAuthError;
 use local::LocalBackend;
 use tokio::io::{AsyncRead, AsyncWrite};
 use tokio_postgres::config::AuthKeys;
@@ -74,12 +74,12 @@ pub enum BackendType<'a, T, D> {
     Local(MaybeOwned<'a, LocalBackend>),
 }
 
-pub trait TestBackend: Send + Sync + 'static {
+#[cfg(test)]
+pub(crate) trait TestBackend: Send + Sync + 'static {
     fn wake_compute(&self) -> Result<CachedNodeInfo, console::errors::WakeComputeError>;
     fn get_allowed_ips_and_secret(
         &self,
     ) -> Result<(CachedAllowedIps, Option<CachedRoleSecret>), console::errors::GetAuthInfoError>;
-    fn get_role_secret(&self) -> Result<CachedRoleSecret, console::errors::GetAuthInfoError>;
 }
 
 impl std::fmt::Display for BackendType<'_, (), ()> {
@@ -105,7 +105,7 @@ impl std::fmt::Display for BackendType<'_, (), ()> {
 impl<T, D> BackendType<'_, T, D> {
     /// Very similar to [`std::option::Option::as_ref`].
     /// This helps us pass structured config to async tasks.
-    pub fn as_ref(&self) -> BackendType<'_, &T, &D> {
+    pub(crate) fn as_ref(&self) -> BackendType<'_, &T, &D> {
         match self {
             Self::Console(c, x) => BackendType::Console(MaybeOwned::Borrowed(c), x),
             Self::Link(c, x) => BackendType::Link(MaybeOwned::Borrowed(c), x),
@@ -118,7 +118,7 @@ impl<'a, T, D> BackendType<'a, T, D> {
     /// Very similar to [`std::option::Option::map`].
     /// Maps [`BackendType<T>`] to [`BackendType<R>`] by applying
     /// a function to a contained value.
-    pub fn map<R>(self, f: impl FnOnce(T) -> R) -> BackendType<'a, R, D> {
+    pub(crate) fn map<R>(self, f: impl FnOnce(T) -> R) -> BackendType<'a, R, D> {
         match self {
             Self::Console(c, x) => BackendType::Console(c, f(x)),
             Self::Link(c, x) => BackendType::Link(c, x),
@@ -129,7 +129,7 @@ impl<'a, T, D> BackendType<'a, T, D> {
 impl<'a, T, D, E> BackendType<'a, Result<T, E>, D> {
     /// Very similar to [`std::option::Option::transpose`].
     /// This is most useful for error handling.
-    pub fn transpose(self) -> Result<BackendType<'a, T, D>, E> {
+    pub(crate) fn transpose(self) -> Result<BackendType<'a, T, D>, E> {
         match self {
             Self::Console(c, x) => x.map(|x| BackendType::Console(c, x)),
             Self::Link(c, x) => Ok(BackendType::Link(c, x)),
@@ -138,31 +138,31 @@ impl<'a, T, D, E> BackendType<'a, Result<T, E>, D> {
     }
 }
 
-pub struct ComputeCredentials {
-    pub info: ComputeUserInfo,
-    pub keys: ComputeCredentialKeys,
+pub(crate) struct ComputeCredentials {
+    pub(crate) info: ComputeUserInfo,
+    pub(crate) keys: ComputeCredentialKeys,
 }
 
 #[derive(Debug, Clone)]
-pub struct ComputeUserInfoNoEndpoint {
-    pub user: RoleName,
-    pub options: NeonOptions,
+pub(crate) struct ComputeUserInfoNoEndpoint {
+    pub(crate) user: RoleName,
+    pub(crate) options: NeonOptions,
 }
 
 #[derive(Debug, Clone)]
-pub struct ComputeUserInfo {
-    pub endpoint: EndpointId,
-    pub user: RoleName,
-    pub options: NeonOptions,
+pub(crate) struct ComputeUserInfo {
+    pub(crate) endpoint: EndpointId,
+    pub(crate) user: RoleName,
+    pub(crate) options: NeonOptions,
 }
 
 impl ComputeUserInfo {
-    pub fn endpoint_cache_key(&self) -> EndpointCacheKey {
+    pub(crate) fn endpoint_cache_key(&self) -> EndpointCacheKey {
         self.options.get_cache_key(&self.endpoint)
     }
 }
 
-pub enum ComputeCredentialKeys {
+pub(crate) enum ComputeCredentialKeys {
     Password(Vec<u8>),
     AuthKeys(AuthKeys),
     None,
@@ -222,7 +222,7 @@ impl RateBucketInfo {
 }
 
 impl AuthenticationConfig {
-    pub fn check_rate_limit(
+    pub(crate) fn check_rate_limit(
         &self,
         ctx: &RequestMonitoring,
         config: &AuthenticationConfig,
@@ -404,17 +404,8 @@ async fn authenticate_with_secret(
 }
 
 impl<'a> BackendType<'a, ComputeUserInfoMaybeEndpoint, &()> {
-    /// Get compute endpoint name from the credentials.
-    pub fn get_endpoint(&self) -> Option<EndpointId> {
-        match self {
-            Self::Console(_, user_info) => user_info.endpoint_id.clone(),
-            Self::Link(_, ()) => Some("link".into()),
-            Self::Local(_) => Some("local".into()),
-        }
-    }
-
     /// Get username from the credentials.
-    pub fn get_user(&self) -> &str {
+    pub(crate) fn get_user(&self) -> &str {
         match self {
             Self::Console(_, user_info) => &user_info.user,
             Self::Link(_, ()) => "link",
@@ -424,7 +415,7 @@ impl<'a> BackendType<'a, ComputeUserInfoMaybeEndpoint, &()> {
 
     /// Authenticate the client via the requested backend, possibly using credentials.
     #[tracing::instrument(fields(allow_cleartext = allow_cleartext), skip_all)]
-    pub async fn authenticate(
+    pub(crate) async fn authenticate(
         self,
         ctx: &RequestMonitoring,
         client: &mut stream::PqStream<Stream<impl AsyncRead + AsyncWrite + Unpin>>,
@@ -471,7 +462,7 @@ impl<'a> BackendType<'a, ComputeUserInfoMaybeEndpoint, &()> {
 }
 
 impl BackendType<'_, ComputeUserInfo, &()> {
-    pub async fn get_role_secret(
+    pub(crate) async fn get_role_secret(
         &self,
         ctx: &RequestMonitoring,
     ) -> Result<CachedRoleSecret, GetAuthInfoError> {
@@ -482,7 +473,7 @@ impl BackendType<'_, ComputeUserInfo, &()> {
         }
     }
 
-    pub async fn get_allowed_ips_and_secret(
+    pub(crate) async fn get_allowed_ips_and_secret(
         &self,
         ctx: &RequestMonitoring,
     ) -> Result<(CachedAllowedIps, Option<CachedRoleSecret>), GetAuthInfoError> {
diff --git a/proxy/src/auth/backend/hacks.rs b/proxy/src/auth/backend/hacks.rs
index 56921dd949..e9019ce2cf 100644
--- a/proxy/src/auth/backend/hacks.rs
+++ b/proxy/src/auth/backend/hacks.rs
@@ -17,7 +17,7 @@ use tracing::{info, warn};
 /// one round trip and *expensive* computations (>= 4096 HMAC iterations).
 /// These properties are benefical for serverless JS workers, so we
 /// use this mechanism for websocket connections.
-pub async fn authenticate_cleartext(
+pub(crate) async fn authenticate_cleartext(
     ctx: &RequestMonitoring,
     info: ComputeUserInfo,
     client: &mut stream::PqStream<Stream<impl AsyncRead + AsyncWrite + Unpin>>,
@@ -59,7 +59,7 @@ pub async fn authenticate_cleartext(
 /// Workaround for clients which don't provide an endpoint (project) name.
 /// Similar to [`authenticate_cleartext`], but there's a specific password format,
 /// and passwords are not yet validated (we don't know how to validate them!)
-pub async fn password_hack_no_authentication(
+pub(crate) async fn password_hack_no_authentication(
     ctx: &RequestMonitoring,
     info: ComputeUserInfoNoEndpoint,
     client: &mut stream::PqStream<Stream<impl AsyncRead + AsyncWrite + Unpin>>,
diff --git a/proxy/src/auth/backend/jwt.rs b/proxy/src/auth/backend/jwt.rs
index 61833e19ed..e98da82053 100644
--- a/proxy/src/auth/backend/jwt.rs
+++ b/proxy/src/auth/backend/jwt.rs
@@ -22,27 +22,27 @@ const MAX_RENEW: Duration = Duration::from_secs(3600);
 const MAX_JWK_BODY_SIZE: usize = 64 * 1024;
 
 /// How to get the JWT auth rules
-pub trait FetchAuthRules: Clone + Send + Sync + 'static {
+pub(crate) trait FetchAuthRules: Clone + Send + Sync + 'static {
     fn fetch_auth_rules(
         &self,
         role_name: RoleName,
     ) -> impl Future<Output = anyhow::Result<Vec<AuthRule>>> + Send;
 }
 
-pub struct AuthRule {
-    pub id: String,
-    pub jwks_url: url::Url,
-    pub audience: Option<String>,
+pub(crate) struct AuthRule {
+    pub(crate) id: String,
+    pub(crate) jwks_url: url::Url,
+    pub(crate) audience: Option<String>,
 }
 
 #[derive(Default)]
-pub struct JwkCache {
+pub(crate) struct JwkCache {
     client: reqwest::Client,
 
     map: DashMap<(EndpointId, RoleName), Arc<JwkCacheEntryLock>>,
 }
 
-pub struct JwkCacheEntry {
+pub(crate) struct JwkCacheEntry {
     /// Should refetch at least every hour to verify when old keys have been removed.
     /// Should refetch when new key IDs are seen only every 5 minutes or so
     last_retrieved: Instant,
@@ -75,7 +75,7 @@ impl KeySet {
     }
 }
 
-pub struct JwkCacheEntryLock {
+pub(crate) struct JwkCacheEntryLock {
     cached: ArcSwapOption<JwkCacheEntry>,
     lookup: tokio::sync::Semaphore,
 }
@@ -309,7 +309,7 @@ impl JwkCacheEntryLock {
 }
 
 impl JwkCache {
-    pub async fn check_jwt<F: FetchAuthRules>(
+    pub(crate) async fn check_jwt<F: FetchAuthRules>(
         &self,
         ctx: &RequestMonitoring,
         endpoint: EndpointId,
diff --git a/proxy/src/auth/backend/link.rs b/proxy/src/auth/backend/link.rs
index 95f4614736..19515f95a8 100644
--- a/proxy/src/auth/backend/link.rs
+++ b/proxy/src/auth/backend/link.rs
@@ -13,7 +13,7 @@ use tokio_postgres::config::SslMode;
 use tracing::{info, info_span};
 
 #[derive(Debug, Error)]
-pub enum LinkAuthError {
+pub(crate) enum LinkAuthError {
     #[error(transparent)]
     WaiterRegister(#[from] waiters::RegisterError),
 
@@ -52,7 +52,7 @@ fn hello_message(redirect_uri: &reqwest::Url, session_id: &str) -> String {
     )
 }
 
-pub fn new_psql_session_id() -> String {
+pub(crate) fn new_psql_session_id() -> String {
     hex::encode(rand::random::<[u8; 8]>())
 }
 
diff --git a/proxy/src/auth/backend/local.rs b/proxy/src/auth/backend/local.rs
index 6d18564dd6..8124f568cf 100644
--- a/proxy/src/auth/backend/local.rs
+++ b/proxy/src/auth/backend/local.rs
@@ -16,16 +16,14 @@ use crate::{
 use super::jwt::{AuthRule, FetchAuthRules, JwkCache};
 
 pub struct LocalBackend {
-    pub jwks_cache: JwkCache,
-    pub postgres_addr: SocketAddr,
-    pub node_info: NodeInfo,
+    pub(crate) jwks_cache: JwkCache,
+    pub(crate) node_info: NodeInfo,
 }
 
 impl LocalBackend {
     pub fn new(postgres_addr: SocketAddr) -> Self {
         LocalBackend {
             jwks_cache: JwkCache::default(),
-            postgres_addr,
             node_info: NodeInfo {
                 config: {
                     let mut cfg = ConnCfg::new();
@@ -47,7 +45,7 @@ impl LocalBackend {
 }
 
 #[derive(Clone, Copy)]
-pub struct StaticAuthRules;
+pub(crate) struct StaticAuthRules;
 
 pub static JWKS_ROLE_MAP: ArcSwapOption<JwksRoleSettings> = ArcSwapOption::const_empty();
 
diff --git a/proxy/src/auth/credentials.rs b/proxy/src/auth/credentials.rs
index cb06fcaf55..0e91ae570a 100644
--- a/proxy/src/auth/credentials.rs
+++ b/proxy/src/auth/credentials.rs
@@ -16,7 +16,7 @@ use thiserror::Error;
 use tracing::{info, warn};
 
 #[derive(Debug, Error, PartialEq, Eq, Clone)]
-pub enum ComputeUserInfoParseError {
+pub(crate) enum ComputeUserInfoParseError {
     #[error("Parameter '{0}' is missing in startup packet.")]
     MissingKey(&'static str),
 
@@ -51,20 +51,20 @@ impl ReportableError for ComputeUserInfoParseError {
 /// Various client credentials which we use for authentication.
 /// Note that we don't store any kind of client key or password here.
 #[derive(Debug, Clone, PartialEq, Eq)]
-pub struct ComputeUserInfoMaybeEndpoint {
-    pub user: RoleName,
-    pub endpoint_id: Option<EndpointId>,
-    pub options: NeonOptions,
+pub(crate) struct ComputeUserInfoMaybeEndpoint {
+    pub(crate) user: RoleName,
+    pub(crate) endpoint_id: Option<EndpointId>,
+    pub(crate) options: NeonOptions,
 }
 
 impl ComputeUserInfoMaybeEndpoint {
     #[inline]
-    pub fn endpoint(&self) -> Option<&str> {
+    pub(crate) fn endpoint(&self) -> Option<&str> {
         self.endpoint_id.as_deref()
     }
 }
 
-pub fn endpoint_sni(
+pub(crate) fn endpoint_sni(
     sni: &str,
     common_names: &HashSet<String>,
 ) -> Result<Option<EndpointId>, ComputeUserInfoParseError> {
@@ -83,7 +83,7 @@ pub fn endpoint_sni(
 }
 
 impl ComputeUserInfoMaybeEndpoint {
-    pub fn parse(
+    pub(crate) fn parse(
         ctx: &RequestMonitoring,
         params: &StartupMessageParams,
         sni: Option<&str>,
@@ -173,12 +173,12 @@ impl ComputeUserInfoMaybeEndpoint {
     }
 }
 
-pub fn check_peer_addr_is_in_list(peer_addr: &IpAddr, ip_list: &[IpPattern]) -> bool {
+pub(crate) fn check_peer_addr_is_in_list(peer_addr: &IpAddr, ip_list: &[IpPattern]) -> bool {
     ip_list.is_empty() || ip_list.iter().any(|pattern| check_ip(peer_addr, pattern))
 }
 
 #[derive(Debug, Clone, Eq, PartialEq)]
-pub enum IpPattern {
+pub(crate) enum IpPattern {
     Subnet(ipnet::IpNet),
     Range(IpAddr, IpAddr),
     Single(IpAddr),
diff --git a/proxy/src/auth/flow.rs b/proxy/src/auth/flow.rs
index acf7b4f6b6..f7e2b5296e 100644
--- a/proxy/src/auth/flow.rs
+++ b/proxy/src/auth/flow.rs
@@ -17,17 +17,20 @@ use tokio::io::{AsyncRead, AsyncWrite};
 use tracing::info;
 
 /// Every authentication selector is supposed to implement this trait.
-pub trait AuthMethod {
+pub(crate) trait AuthMethod {
     /// Any authentication selector should provide initial backend message
     /// containing auth method name and parameters, e.g. md5 salt.
     fn first_message(&self, channel_binding: bool) -> BeMessage<'_>;
 }
 
 /// Initial state of [`AuthFlow`].
-pub struct Begin;
+pub(crate) struct Begin;
 
 /// Use [SCRAM](crate::scram)-based auth in [`AuthFlow`].
-pub struct Scram<'a>(pub &'a scram::ServerSecret, pub &'a RequestMonitoring);
+pub(crate) struct Scram<'a>(
+    pub(crate) &'a scram::ServerSecret,
+    pub(crate) &'a RequestMonitoring,
+);
 
 impl AuthMethod for Scram<'_> {
     #[inline(always)]
@@ -44,7 +47,7 @@ impl AuthMethod for Scram<'_> {
 
 /// Use an ad hoc auth flow (for clients which don't support SNI) proposed in
 /// <https://github.com/neondatabase/cloud/issues/1620#issuecomment-1165332290>.
-pub struct PasswordHack;
+pub(crate) struct PasswordHack;
 
 impl AuthMethod for PasswordHack {
     #[inline(always)]
@@ -55,10 +58,10 @@ impl AuthMethod for PasswordHack {
 
 /// Use clear-text password auth called `password` in docs
 /// <https://www.postgresql.org/docs/current/auth-password.html>
-pub struct CleartextPassword {
-    pub pool: Arc<ThreadPool>,
-    pub endpoint: EndpointIdInt,
-    pub secret: AuthSecret,
+pub(crate) struct CleartextPassword {
+    pub(crate) pool: Arc<ThreadPool>,
+    pub(crate) endpoint: EndpointIdInt,
+    pub(crate) secret: AuthSecret,
 }
 
 impl AuthMethod for CleartextPassword {
@@ -70,7 +73,7 @@ impl AuthMethod for CleartextPassword {
 
 /// This wrapper for [`PqStream`] performs client authentication.
 #[must_use]
-pub struct AuthFlow<'a, S, State> {
+pub(crate) struct AuthFlow<'a, S, State> {
     /// The underlying stream which implements libpq's protocol.
     stream: &'a mut PqStream<Stream<S>>,
     /// State might contain ancillary data (see [`Self::begin`]).
@@ -81,7 +84,7 @@ pub struct AuthFlow<'a, S, State> {
 /// Initial state of the stream wrapper.
 impl<'a, S: AsyncRead + AsyncWrite + Unpin> AuthFlow<'a, S, Begin> {
     /// Create a new wrapper for client authentication.
-    pub fn new(stream: &'a mut PqStream<Stream<S>>) -> Self {
+    pub(crate) fn new(stream: &'a mut PqStream<Stream<S>>) -> Self {
         let tls_server_end_point = stream.get_ref().tls_server_end_point();
 
         Self {
@@ -92,7 +95,7 @@ impl<'a, S: AsyncRead + AsyncWrite + Unpin> AuthFlow<'a, S, Begin> {
     }
 
     /// Move to the next step by sending auth method's name & params to client.
-    pub async fn begin<M: AuthMethod>(self, method: M) -> io::Result<AuthFlow<'a, S, M>> {
+    pub(crate) async fn begin<M: AuthMethod>(self, method: M) -> io::Result<AuthFlow<'a, S, M>> {
         self.stream
             .write_message(&method.first_message(self.tls_server_end_point.supported()))
             .await?;
@@ -107,7 +110,7 @@ impl<'a, S: AsyncRead + AsyncWrite + Unpin> AuthFlow<'a, S, Begin> {
 
 impl<S: AsyncRead + AsyncWrite + Unpin> AuthFlow<'_, S, PasswordHack> {
     /// Perform user authentication. Raise an error in case authentication failed.
-    pub async fn get_password(self) -> super::Result<PasswordHackPayload> {
+    pub(crate) async fn get_password(self) -> super::Result<PasswordHackPayload> {
         let msg = self.stream.read_password_message().await?;
         let password = msg
             .strip_suffix(&[0])
@@ -126,7 +129,7 @@ impl<S: AsyncRead + AsyncWrite + Unpin> AuthFlow<'_, S, PasswordHack> {
 
 impl<S: AsyncRead + AsyncWrite + Unpin> AuthFlow<'_, S, CleartextPassword> {
     /// Perform user authentication. Raise an error in case authentication failed.
-    pub async fn authenticate(self) -> super::Result<sasl::Outcome<ComputeCredentialKeys>> {
+    pub(crate) async fn authenticate(self) -> super::Result<sasl::Outcome<ComputeCredentialKeys>> {
         let msg = self.stream.read_password_message().await?;
         let password = msg
             .strip_suffix(&[0])
@@ -151,7 +154,7 @@ impl<S: AsyncRead + AsyncWrite + Unpin> AuthFlow<'_, S, CleartextPassword> {
 /// Stream wrapper for handling [SCRAM](crate::scram) auth.
 impl<S: AsyncRead + AsyncWrite + Unpin> AuthFlow<'_, S, Scram<'_>> {
     /// Perform user authentication. Raise an error in case authentication failed.
-    pub async fn authenticate(self) -> super::Result<sasl::Outcome<scram::ScramKey>> {
+    pub(crate) async fn authenticate(self) -> super::Result<sasl::Outcome<scram::ScramKey>> {
         let Scram(secret, ctx) = self.state;
 
         // pause the timer while we communicate with the client
diff --git a/proxy/src/auth/password_hack.rs b/proxy/src/auth/password_hack.rs
index 2ddf46fe25..3f5d006f7b 100644
--- a/proxy/src/auth/password_hack.rs
+++ b/proxy/src/auth/password_hack.rs
@@ -7,13 +7,13 @@ use bstr::ByteSlice;
 
 use crate::EndpointId;
 
-pub struct PasswordHackPayload {
-    pub endpoint: EndpointId,
-    pub password: Vec<u8>,
+pub(crate) struct PasswordHackPayload {
+    pub(crate) endpoint: EndpointId,
+    pub(crate) password: Vec<u8>,
 }
 
 impl PasswordHackPayload {
-    pub fn parse(bytes: &[u8]) -> Option<Self> {
+    pub(crate) fn parse(bytes: &[u8]) -> Option<Self> {
         // The format is `project=<utf-8>;<password-bytes>` or `project=<utf-8>$<password-bytes>`.
         let separators = [";", "$"];
         for sep in separators {
@@ -30,7 +30,7 @@ impl PasswordHackPayload {
     }
 }
 
-pub fn parse_endpoint_param(bytes: &str) -> Option<&str> {
+pub(crate) fn parse_endpoint_param(bytes: &str) -> Option<&str> {
     bytes
         .strip_prefix("project=")
         .or_else(|| bytes.strip_prefix("endpoint="))
diff --git a/proxy/src/cache.rs b/proxy/src/cache.rs
index d1d4087241..6c168144a7 100644
--- a/proxy/src/cache.rs
+++ b/proxy/src/cache.rs
@@ -1,7 +1,7 @@
-pub mod common;
-pub mod endpoints;
-pub mod project_info;
+pub(crate) mod common;
+pub(crate) mod endpoints;
+pub(crate) mod project_info;
 mod timed_lru;
 
-pub use common::{Cache, Cached};
-pub use timed_lru::TimedLru;
+pub(crate) use common::{Cache, Cached};
+pub(crate) use timed_lru::TimedLru;
diff --git a/proxy/src/cache/common.rs b/proxy/src/cache/common.rs
index 82c78e3eb2..b5caf94788 100644
--- a/proxy/src/cache/common.rs
+++ b/proxy/src/cache/common.rs
@@ -3,7 +3,7 @@ use std::ops::{Deref, DerefMut};
 /// A generic trait which exposes types of cache's key and value,
 /// as well as the notion of cache entry invalidation.
 /// This is useful for [`Cached`].
-pub trait Cache {
+pub(crate) trait Cache {
     /// Entry's key.
     type Key;
 
@@ -29,21 +29,21 @@ impl<C: Cache> Cache for &C {
 }
 
 /// Wrapper for convenient entry invalidation.
-pub struct Cached<C: Cache, V = <C as Cache>::Value> {
+pub(crate) struct Cached<C: Cache, V = <C as Cache>::Value> {
     /// Cache + lookup info.
-    pub token: Option<(C, C::LookupInfo<C::Key>)>,
+    pub(crate) token: Option<(C, C::LookupInfo<C::Key>)>,
 
     /// The value itself.
-    pub value: V,
+    pub(crate) value: V,
 }
 
 impl<C: Cache, V> Cached<C, V> {
     /// Place any entry into this wrapper; invalidation will be a no-op.
-    pub fn new_uncached(value: V) -> Self {
+    pub(crate) fn new_uncached(value: V) -> Self {
         Self { token: None, value }
     }
 
-    pub fn take_value(self) -> (Cached<C, ()>, V) {
+    pub(crate) fn take_value(self) -> (Cached<C, ()>, V) {
         (
             Cached {
                 token: self.token,
@@ -53,7 +53,7 @@ impl<C: Cache, V> Cached<C, V> {
         )
     }
 
-    pub fn map<U>(self, f: impl FnOnce(V) -> U) -> Cached<C, U> {
+    pub(crate) fn map<U>(self, f: impl FnOnce(V) -> U) -> Cached<C, U> {
         Cached {
             token: self.token,
             value: f(self.value),
@@ -61,7 +61,7 @@ impl<C: Cache, V> Cached<C, V> {
     }
 
     /// Drop this entry from a cache if it's still there.
-    pub fn invalidate(self) -> V {
+    pub(crate) fn invalidate(self) -> V {
         if let Some((cache, info)) = &self.token {
             cache.invalidate(info);
         }
@@ -69,7 +69,7 @@ impl<C: Cache, V> Cached<C, V> {
     }
 
     /// Tell if this entry is actually cached.
-    pub fn cached(&self) -> bool {
+    pub(crate) fn cached(&self) -> bool {
         self.token.is_some()
     }
 }
diff --git a/proxy/src/cache/endpoints.rs b/proxy/src/cache/endpoints.rs
index 8c851790c2..f4762232d8 100644
--- a/proxy/src/cache/endpoints.rs
+++ b/proxy/src/cache/endpoints.rs
@@ -28,7 +28,7 @@ use crate::{
 };
 
 #[derive(Deserialize, Debug, Clone)]
-pub struct ControlPlaneEventKey {
+pub(crate) struct ControlPlaneEventKey {
     endpoint_created: Option<EndpointCreated>,
     branch_created: Option<BranchCreated>,
     project_created: Option<ProjectCreated>,
@@ -56,7 +56,7 @@ pub struct EndpointsCache {
 }
 
 impl EndpointsCache {
-    pub fn new(config: EndpointCacheConfig) -> Self {
+    pub(crate) fn new(config: EndpointCacheConfig) -> Self {
         Self {
             limiter: Arc::new(Mutex::new(GlobalRateLimiter::new(
                 config.limiter_info.clone(),
@@ -68,7 +68,7 @@ impl EndpointsCache {
             ready: AtomicBool::new(false),
         }
     }
-    pub async fn is_valid(&self, ctx: &RequestMonitoring, endpoint: &EndpointId) -> bool {
+    pub(crate) async fn is_valid(&self, ctx: &RequestMonitoring, endpoint: &EndpointId) -> bool {
         if !self.ready.load(Ordering::Acquire) {
             return true;
         }
diff --git a/proxy/src/cache/project_info.rs b/proxy/src/cache/project_info.rs
index eda886a7af..ceae74a9a0 100644
--- a/proxy/src/cache/project_info.rs
+++ b/proxy/src/cache/project_info.rs
@@ -24,7 +24,7 @@ use crate::{
 use super::{Cache, Cached};
 
 #[async_trait]
-pub trait ProjectInfoCache {
+pub(crate) trait ProjectInfoCache {
     fn invalidate_allowed_ips_for_project(&self, project_id: ProjectIdInt);
     fn invalidate_role_secret_for_project(&self, project_id: ProjectIdInt, role_name: RoleNameInt);
     async fn decrement_active_listeners(&self);
@@ -37,7 +37,7 @@ struct Entry<T> {
 }
 
 impl<T> Entry<T> {
-    pub fn new(value: T) -> Self {
+    pub(crate) fn new(value: T) -> Self {
         Self {
             created_at: Instant::now(),
             value,
@@ -64,7 +64,7 @@ impl EndpointInfo {
             Some(t) => t < created_at,
         }
     }
-    pub fn get_role_secret(
+    pub(crate) fn get_role_secret(
         &self,
         role_name: RoleNameInt,
         valid_since: Instant,
@@ -81,7 +81,7 @@ impl EndpointInfo {
         None
     }
 
-    pub fn get_allowed_ips(
+    pub(crate) fn get_allowed_ips(
         &self,
         valid_since: Instant,
         ignore_cache_since: Option<Instant>,
@@ -96,10 +96,10 @@ impl EndpointInfo {
         }
         None
     }
-    pub fn invalidate_allowed_ips(&mut self) {
+    pub(crate) fn invalidate_allowed_ips(&mut self) {
         self.allowed_ips = None;
     }
-    pub fn invalidate_role_secret(&mut self, role_name: RoleNameInt) {
+    pub(crate) fn invalidate_role_secret(&mut self, role_name: RoleNameInt) {
         self.secret.remove(&role_name);
     }
 }
@@ -178,7 +178,7 @@ impl ProjectInfoCache for ProjectInfoCacheImpl {
 }
 
 impl ProjectInfoCacheImpl {
-    pub fn new(config: ProjectInfoCacheOptions) -> Self {
+    pub(crate) fn new(config: ProjectInfoCacheOptions) -> Self {
         Self {
             cache: DashMap::new(),
             project2ep: DashMap::new(),
@@ -189,7 +189,7 @@ impl ProjectInfoCacheImpl {
         }
     }
 
-    pub fn get_role_secret(
+    pub(crate) fn get_role_secret(
         &self,
         endpoint_id: &EndpointId,
         role_name: &RoleName,
@@ -212,7 +212,7 @@ impl ProjectInfoCacheImpl {
         }
         Some(Cached::new_uncached(value))
     }
-    pub fn get_allowed_ips(
+    pub(crate) fn get_allowed_ips(
         &self,
         endpoint_id: &EndpointId,
     ) -> Option<Cached<&Self, Arc<Vec<IpPattern>>>> {
@@ -230,7 +230,7 @@ impl ProjectInfoCacheImpl {
         }
         Some(Cached::new_uncached(value))
     }
-    pub fn insert_role_secret(
+    pub(crate) fn insert_role_secret(
         &self,
         project_id: ProjectIdInt,
         endpoint_id: EndpointIdInt,
@@ -247,7 +247,7 @@ impl ProjectInfoCacheImpl {
             entry.secret.insert(role_name, secret.into());
         }
     }
-    pub fn insert_allowed_ips(
+    pub(crate) fn insert_allowed_ips(
         &self,
         project_id: ProjectIdInt,
         endpoint_id: EndpointIdInt,
@@ -319,7 +319,7 @@ impl ProjectInfoCacheImpl {
 
 /// Lookup info for project info cache.
 /// This is used to invalidate cache entries.
-pub struct CachedLookupInfo {
+pub(crate) struct CachedLookupInfo {
     /// Search by this key.
     endpoint_id: EndpointIdInt,
     lookup_type: LookupType,
diff --git a/proxy/src/cache/timed_lru.rs b/proxy/src/cache/timed_lru.rs
index 07fad56643..8bb482f7c6 100644
--- a/proxy/src/cache/timed_lru.rs
+++ b/proxy/src/cache/timed_lru.rs
@@ -39,7 +39,7 @@ use super::{common::Cached, *};
 ///
 /// * It's possible for an entry that has not yet expired entry to be evicted
 ///   before expired items. That's a bit wasteful, but probably fine in practice.
-pub struct TimedLru<K, V> {
+pub(crate) struct TimedLru<K, V> {
     /// Cache's name for tracing.
     name: &'static str,
 
@@ -72,7 +72,7 @@ struct Entry<T> {
 
 impl<K: Hash + Eq, V> TimedLru<K, V> {
     /// Construct a new LRU cache with timed entries.
-    pub fn new(
+    pub(crate) fn new(
         name: &'static str,
         capacity: usize,
         ttl: Duration,
@@ -207,11 +207,11 @@ impl<K: Hash + Eq, V> TimedLru<K, V> {
 }
 
 impl<K: Hash + Eq + Clone, V: Clone> TimedLru<K, V> {
-    pub fn insert_ttl(&self, key: K, value: V, ttl: Duration) {
+    pub(crate) fn insert_ttl(&self, key: K, value: V, ttl: Duration) {
         self.insert_raw_ttl(key, value, ttl, false);
     }
 
-    pub fn insert_unit(&self, key: K, value: V) -> (Option<V>, Cached<&Self, ()>) {
+    pub(crate) fn insert_unit(&self, key: K, value: V) -> (Option<V>, Cached<&Self, ()>) {
         let (created_at, old) = self.insert_raw(key.clone(), value);
 
         let cached = Cached {
@@ -221,22 +221,11 @@ impl<K: Hash + Eq + Clone, V: Clone> TimedLru<K, V> {
 
         (old, cached)
     }
-
-    pub fn insert(&self, key: K, value: V) -> (Option<V>, Cached<&Self>) {
-        let (created_at, old) = self.insert_raw(key.clone(), value.clone());
-
-        let cached = Cached {
-            token: Some((self, LookupInfo { created_at, key })),
-            value,
-        };
-
-        (old, cached)
-    }
 }
 
 impl<K: Hash + Eq, V: Clone> TimedLru<K, V> {
     /// Retrieve a cached entry in convenient wrapper.
-    pub fn get<Q>(&self, key: &Q) -> Option<timed_lru::Cached<&Self>>
+    pub(crate) fn get<Q>(&self, key: &Q) -> Option<timed_lru::Cached<&Self>>
     where
         K: Borrow<Q> + Clone,
         Q: Hash + Eq + ?Sized,
@@ -253,32 +242,10 @@ impl<K: Hash + Eq, V: Clone> TimedLru<K, V> {
             }
         })
     }
-
-    /// Retrieve a cached entry in convenient wrapper, ignoring its TTL.
-    pub fn get_ignoring_ttl<Q>(&self, key: &Q) -> Option<timed_lru::Cached<&Self>>
-    where
-        K: Borrow<Q>,
-        Q: Hash + Eq + ?Sized,
-    {
-        let mut cache = self.cache.lock();
-        cache
-            .get(key)
-            .map(|entry| Cached::new_uncached(entry.value.clone()))
-    }
-
-    /// Remove an entry from the cache.
-    pub fn remove<Q>(&self, key: &Q) -> Option<V>
-    where
-        K: Borrow<Q> + Clone,
-        Q: Hash + Eq + ?Sized,
-    {
-        let mut cache = self.cache.lock();
-        cache.remove(key).map(|entry| entry.value)
-    }
 }
 
 /// Lookup information for key invalidation.
-pub struct LookupInfo<K> {
+pub(crate) struct LookupInfo<K> {
     /// Time of creation of a cache [`Entry`].
     /// We use this during invalidation lookups to prevent eviction of a newer
     /// entry sharing the same key (it might've been inserted by a different
diff --git a/proxy/src/cancellation.rs b/proxy/src/cancellation.rs
index ea8f7b4070..71a2a16af8 100644
--- a/proxy/src/cancellation.rs
+++ b/proxy/src/cancellation.rs
@@ -18,7 +18,7 @@ use crate::{
 
 pub type CancelMap = Arc<DashMap<CancelKeyData, Option<CancelClosure>>>;
 pub type CancellationHandlerMain = CancellationHandler<Option<Arc<Mutex<RedisPublisherClient>>>>;
-pub type CancellationHandlerMainInternal = Option<Arc<Mutex<RedisPublisherClient>>>;
+pub(crate) type CancellationHandlerMainInternal = Option<Arc<Mutex<RedisPublisherClient>>>;
 
 /// Enables serving `CancelRequest`s.
 ///
@@ -32,7 +32,7 @@ pub struct CancellationHandler<P> {
 }
 
 #[derive(Debug, Error)]
-pub enum CancelError {
+pub(crate) enum CancelError {
     #[error("{0}")]
     IO(#[from] std::io::Error),
     #[error("{0}")]
@@ -53,7 +53,7 @@ impl ReportableError for CancelError {
 
 impl<P: CancellationPublisher> CancellationHandler<P> {
     /// Run async action within an ephemeral session identified by [`CancelKeyData`].
-    pub fn get_session(self: Arc<Self>) -> Session<P> {
+    pub(crate) fn get_session(self: Arc<Self>) -> Session<P> {
         // HACK: We'd rather get the real backend_pid but tokio_postgres doesn't
         // expose it and we don't want to do another roundtrip to query
         // for it. The client will be able to notice that this is not the
@@ -81,7 +81,7 @@ impl<P: CancellationPublisher> CancellationHandler<P> {
     }
     /// Try to cancel a running query for the corresponding connection.
     /// If the cancellation key is not found, it will be published to Redis.
-    pub async fn cancel_session(
+    pub(crate) async fn cancel_session(
         &self,
         key: CancelKeyData,
         session_id: Uuid,
@@ -155,14 +155,14 @@ pub struct CancelClosure {
 }
 
 impl CancelClosure {
-    pub fn new(socket_addr: SocketAddr, cancel_token: CancelToken) -> Self {
+    pub(crate) fn new(socket_addr: SocketAddr, cancel_token: CancelToken) -> Self {
         Self {
             socket_addr,
             cancel_token,
         }
     }
     /// Cancels the query running on user's compute node.
-    pub async fn try_cancel_query(self) -> Result<(), CancelError> {
+    pub(crate) async fn try_cancel_query(self) -> Result<(), CancelError> {
         let socket = TcpStream::connect(self.socket_addr).await?;
         self.cancel_token.cancel_query_raw(socket, NoTls).await?;
         info!("query was cancelled");
@@ -171,7 +171,7 @@ impl CancelClosure {
 }
 
 /// Helper for registering query cancellation tokens.
-pub struct Session<P> {
+pub(crate) struct Session<P> {
     /// The user-facing key identifying this session.
     key: CancelKeyData,
     /// The [`CancelMap`] this session belongs to.
@@ -181,7 +181,7 @@ pub struct Session<P> {
 impl<P> Session<P> {
     /// Store the cancel token for the given session.
     /// This enables query cancellation in `crate::proxy::prepare_client_connection`.
-    pub fn enable_query_cancellation(&self, cancel_closure: CancelClosure) -> CancelKeyData {
+    pub(crate) fn enable_query_cancellation(&self, cancel_closure: CancelClosure) -> CancelKeyData {
         info!("enabling query cancellation for this session");
         self.cancellation_handler
             .map
diff --git a/proxy/src/compute.rs b/proxy/src/compute.rs
index b6659f5dd0..246501a21e 100644
--- a/proxy/src/compute.rs
+++ b/proxy/src/compute.rs
@@ -23,7 +23,7 @@ use tracing::{error, info, warn};
 const COULD_NOT_CONNECT: &str = "Couldn't connect to compute node";
 
 #[derive(Debug, Error)]
-pub enum ConnectionError {
+pub(crate) enum ConnectionError {
     /// This error doesn't seem to reveal any secrets; for instance,
     /// `tokio_postgres::error::Kind` doesn't contain ip addresses and such.
     #[error("{COULD_NOT_CONNECT}: {0}")]
@@ -86,22 +86,22 @@ impl ReportableError for ConnectionError {
 }
 
 /// A pair of `ClientKey` & `ServerKey` for `SCRAM-SHA-256`.
-pub type ScramKeys = tokio_postgres::config::ScramKeys<32>;
+pub(crate) type ScramKeys = tokio_postgres::config::ScramKeys<32>;
 
 /// A config for establishing a connection to compute node.
 /// Eventually, `tokio_postgres` will be replaced with something better.
 /// Newtype allows us to implement methods on top of it.
 #[derive(Clone, Default)]
-pub struct ConnCfg(Box<tokio_postgres::Config>);
+pub(crate) struct ConnCfg(Box<tokio_postgres::Config>);
 
 /// Creation and initialization routines.
 impl ConnCfg {
-    pub fn new() -> Self {
+    pub(crate) fn new() -> Self {
         Self::default()
     }
 
     /// Reuse password or auth keys from the other config.
-    pub fn reuse_password(&mut self, other: Self) {
+    pub(crate) fn reuse_password(&mut self, other: Self) {
         if let Some(password) = other.get_password() {
             self.password(password);
         }
@@ -111,7 +111,7 @@ impl ConnCfg {
         }
     }
 
-    pub fn get_host(&self) -> Result<Host, WakeComputeError> {
+    pub(crate) fn get_host(&self) -> Result<Host, WakeComputeError> {
         match self.0.get_hosts() {
             [tokio_postgres::config::Host::Tcp(s)] => Ok(s.into()),
             // we should not have multiple address or unix addresses.
@@ -122,7 +122,7 @@ impl ConnCfg {
     }
 
     /// Apply startup message params to the connection config.
-    pub fn set_startup_params(&mut self, params: &StartupMessageParams) {
+    pub(crate) fn set_startup_params(&mut self, params: &StartupMessageParams) {
         // Only set `user` if it's not present in the config.
         // Link auth flow takes username from the console's response.
         if let (None, Some(user)) = (self.get_user(), params.get("user")) {
@@ -255,25 +255,25 @@ impl ConnCfg {
     }
 }
 
-pub struct PostgresConnection {
+pub(crate) struct PostgresConnection {
     /// Socket connected to a compute node.
-    pub stream: tokio_postgres::maybe_tls_stream::MaybeTlsStream<
+    pub(crate) stream: tokio_postgres::maybe_tls_stream::MaybeTlsStream<
         tokio::net::TcpStream,
         tokio_postgres_rustls::RustlsStream<tokio::net::TcpStream>,
     >,
     /// PostgreSQL connection parameters.
-    pub params: std::collections::HashMap<String, String>,
+    pub(crate) params: std::collections::HashMap<String, String>,
     /// Query cancellation token.
-    pub cancel_closure: CancelClosure,
+    pub(crate) cancel_closure: CancelClosure,
     /// Labels for proxy's metrics.
-    pub aux: MetricsAuxInfo,
+    pub(crate) aux: MetricsAuxInfo,
 
     _guage: NumDbConnectionsGuard<'static>,
 }
 
 impl ConnCfg {
     /// Connect to a corresponding compute node.
-    pub async fn connect(
+    pub(crate) async fn connect(
         &self,
         ctx: &RequestMonitoring,
         allow_self_signed_compute: bool,
diff --git a/proxy/src/console.rs b/proxy/src/console.rs
index ea95e83437..87d8e781aa 100644
--- a/proxy/src/console.rs
+++ b/proxy/src/console.rs
@@ -10,7 +10,7 @@ pub(crate) use provider::{errors, Api, AuthSecret, CachedNodeInfo, NodeInfo};
 
 /// Various cache-related types.
 pub mod caches {
-    pub use super::provider::{ApiCaches, NodeInfoCache};
+    pub use super::provider::ApiCaches;
 }
 
 /// Various cache-related types.
diff --git a/proxy/src/console/messages.rs b/proxy/src/console/messages.rs
index a7ccf076b0..0df1a450ac 100644
--- a/proxy/src/console/messages.rs
+++ b/proxy/src/console/messages.rs
@@ -12,22 +12,22 @@ use crate::RoleName;
 /// Generic error response with human-readable description.
 /// Note that we can't always present it to user as is.
 #[derive(Debug, Deserialize, Clone)]
-pub struct ConsoleError {
-    pub error: Box<str>,
+pub(crate) struct ConsoleError {
+    pub(crate) error: Box<str>,
     #[serde(skip)]
-    pub http_status_code: http::StatusCode,
-    pub status: Option<Status>,
+    pub(crate) http_status_code: http::StatusCode,
+    pub(crate) status: Option<Status>,
 }
 
 impl ConsoleError {
-    pub fn get_reason(&self) -> Reason {
+    pub(crate) fn get_reason(&self) -> Reason {
         self.status
             .as_ref()
             .and_then(|s| s.details.error_info.as_ref())
             .map_or(Reason::Unknown, |e| e.reason)
     }
 
-    pub fn get_user_facing_message(&self) -> String {
+    pub(crate) fn get_user_facing_message(&self) -> String {
         use super::provider::errors::REQUEST_FAILED;
         self.status
             .as_ref()
@@ -88,27 +88,28 @@ impl CouldRetry for ConsoleError {
 }
 
 #[derive(Debug, Deserialize, Clone)]
-pub struct Status {
-    pub code: Box<str>,
-    pub message: Box<str>,
-    pub details: Details,
+#[allow(dead_code)]
+pub(crate) struct Status {
+    pub(crate) code: Box<str>,
+    pub(crate) message: Box<str>,
+    pub(crate) details: Details,
 }
 
 #[derive(Debug, Deserialize, Clone)]
-pub struct Details {
-    pub error_info: Option<ErrorInfo>,
-    pub retry_info: Option<RetryInfo>,
-    pub user_facing_message: Option<UserFacingMessage>,
+pub(crate) struct Details {
+    pub(crate) error_info: Option<ErrorInfo>,
+    pub(crate) retry_info: Option<RetryInfo>,
+    pub(crate) user_facing_message: Option<UserFacingMessage>,
 }
 
 #[derive(Copy, Clone, Debug, Deserialize)]
-pub struct ErrorInfo {
-    pub reason: Reason,
+pub(crate) struct ErrorInfo {
+    pub(crate) reason: Reason,
     // Schema could also have `metadata` field, but it's not structured. Skip it for now.
 }
 
 #[derive(Clone, Copy, Debug, Deserialize, Default)]
-pub enum Reason {
+pub(crate) enum Reason {
     /// RoleProtected indicates that the role is protected and the attempted operation is not permitted on protected roles.
     #[serde(rename = "ROLE_PROTECTED")]
     RoleProtected,
@@ -168,7 +169,7 @@ pub enum Reason {
 }
 
 impl Reason {
-    pub fn is_not_found(&self) -> bool {
+    pub(crate) fn is_not_found(self) -> bool {
         matches!(
             self,
             Reason::ResourceNotFound
@@ -178,7 +179,7 @@ impl Reason {
         )
     }
 
-    pub fn can_retry(&self) -> bool {
+    pub(crate) fn can_retry(self) -> bool {
         match self {
             // do not retry role protected errors
             // not a transitive error
@@ -208,22 +209,23 @@ impl Reason {
 }
 
 #[derive(Copy, Clone, Debug, Deserialize)]
-pub struct RetryInfo {
-    pub retry_delay_ms: u64,
+#[allow(dead_code)]
+pub(crate) struct RetryInfo {
+    pub(crate) retry_delay_ms: u64,
 }
 
 #[derive(Debug, Deserialize, Clone)]
-pub struct UserFacingMessage {
-    pub message: Box<str>,
+pub(crate) struct UserFacingMessage {
+    pub(crate) message: Box<str>,
 }
 
 /// Response which holds client's auth secret, e.g. [`crate::scram::ServerSecret`].
 /// Returned by the `/proxy_get_role_secret` API method.
 #[derive(Deserialize)]
-pub struct GetRoleSecret {
-    pub role_secret: Box<str>,
-    pub allowed_ips: Option<Vec<IpPattern>>,
-    pub project_id: Option<ProjectIdInt>,
+pub(crate) struct GetRoleSecret {
+    pub(crate) role_secret: Box<str>,
+    pub(crate) allowed_ips: Option<Vec<IpPattern>>,
+    pub(crate) project_id: Option<ProjectIdInt>,
 }
 
 // Manually implement debug to omit sensitive info.
@@ -236,21 +238,21 @@ impl fmt::Debug for GetRoleSecret {
 /// Response which holds compute node's `host:port` pair.
 /// Returned by the `/proxy_wake_compute` API method.
 #[derive(Debug, Deserialize)]
-pub struct WakeCompute {
-    pub address: Box<str>,
-    pub aux: MetricsAuxInfo,
+pub(crate) struct WakeCompute {
+    pub(crate) address: Box<str>,
+    pub(crate) aux: MetricsAuxInfo,
 }
 
 /// Async response which concludes the link auth flow.
 /// Also known as `kickResponse` in the console.
 #[derive(Debug, Deserialize)]
-pub struct KickSession<'a> {
+pub(crate) struct KickSession<'a> {
     /// Session ID is assigned by the proxy.
-    pub session_id: &'a str,
+    pub(crate) session_id: &'a str,
 
     /// Compute node connection params.
     #[serde(deserialize_with = "KickSession::parse_db_info")]
-    pub result: DatabaseInfo,
+    pub(crate) result: DatabaseInfo,
 }
 
 impl KickSession<'_> {
@@ -273,15 +275,15 @@ impl KickSession<'_> {
 
 /// Compute node connection params.
 #[derive(Deserialize)]
-pub struct DatabaseInfo {
-    pub host: Box<str>,
-    pub port: u16,
-    pub dbname: Box<str>,
-    pub user: Box<str>,
+pub(crate) struct DatabaseInfo {
+    pub(crate) host: Box<str>,
+    pub(crate) port: u16,
+    pub(crate) dbname: Box<str>,
+    pub(crate) user: Box<str>,
     /// Console always provides a password, but it might
     /// be inconvenient for debug with local PG instance.
-    pub password: Option<Box<str>>,
-    pub aux: MetricsAuxInfo,
+    pub(crate) password: Option<Box<str>>,
+    pub(crate) aux: MetricsAuxInfo,
 }
 
 // Manually implement debug to omit sensitive info.
@@ -299,12 +301,12 @@ impl fmt::Debug for DatabaseInfo {
 /// Various labels for prometheus metrics.
 /// Also known as `ProxyMetricsAuxInfo` in the console.
 #[derive(Debug, Deserialize, Clone)]
-pub struct MetricsAuxInfo {
-    pub endpoint_id: EndpointIdInt,
-    pub project_id: ProjectIdInt,
-    pub branch_id: BranchIdInt,
+pub(crate) struct MetricsAuxInfo {
+    pub(crate) endpoint_id: EndpointIdInt,
+    pub(crate) project_id: ProjectIdInt,
+    pub(crate) branch_id: BranchIdInt,
     #[serde(default)]
-    pub cold_start_info: ColdStartInfo,
+    pub(crate) cold_start_info: ColdStartInfo,
 }
 
 #[derive(Debug, Default, Serialize, Deserialize, Clone, Copy, FixedCardinalityLabel)]
@@ -331,7 +333,7 @@ pub enum ColdStartInfo {
 }
 
 impl ColdStartInfo {
-    pub fn as_str(&self) -> &'static str {
+    pub(crate) fn as_str(self) -> &'static str {
         match self {
             ColdStartInfo::Unknown => "unknown",
             ColdStartInfo::Warm => "warm",
diff --git a/proxy/src/console/mgmt.rs b/proxy/src/console/mgmt.rs
index 82d5033aab..f318ac529b 100644
--- a/proxy/src/console/mgmt.rs
+++ b/proxy/src/console/mgmt.rs
@@ -14,13 +14,13 @@ use tracing::{error, info, info_span, Instrument};
 static CPLANE_WAITERS: Lazy<Waiters<ComputeReady>> = Lazy::new(Default::default);
 
 /// Give caller an opportunity to wait for the cloud's reply.
-pub fn get_waiter(
+pub(crate) fn get_waiter(
     psql_session_id: impl Into<String>,
 ) -> Result<Waiter<'static, ComputeReady>, waiters::RegisterError> {
     CPLANE_WAITERS.register(psql_session_id.into())
 }
 
-pub fn notify(psql_session_id: &str, msg: ComputeReady) -> Result<(), waiters::NotifyError> {
+pub(crate) fn notify(psql_session_id: &str, msg: ComputeReady) -> Result<(), waiters::NotifyError> {
     CPLANE_WAITERS.notify(psql_session_id, msg)
 }
 
@@ -74,7 +74,7 @@ async fn handle_connection(socket: TcpStream) -> Result<(), QueryError> {
 }
 
 /// A message received by `mgmt` when a compute node is ready.
-pub type ComputeReady = DatabaseInfo;
+pub(crate) type ComputeReady = DatabaseInfo;
 
 // TODO: replace with an http-based protocol.
 struct MgmtHandler;
diff --git a/proxy/src/console/provider.rs b/proxy/src/console/provider.rs
index 4794527410..12a6e2f12a 100644
--- a/proxy/src/console/provider.rs
+++ b/proxy/src/console/provider.rs
@@ -23,7 +23,7 @@ use std::{hash::Hash, sync::Arc, time::Duration};
 use tokio::time::Instant;
 use tracing::info;
 
-pub mod errors {
+pub(crate) mod errors {
     use crate::{
         console::messages::{self, ConsoleError, Reason},
         error::{io_error, ErrorKind, ReportableError, UserFacingError},
@@ -34,11 +34,11 @@ pub mod errors {
     use super::ApiLockError;
 
     /// A go-to error message which doesn't leak any detail.
-    pub const REQUEST_FAILED: &str = "Console request failed";
+    pub(crate) const REQUEST_FAILED: &str = "Console request failed";
 
     /// Common console API error.
     #[derive(Debug, Error)]
-    pub enum ApiError {
+    pub(crate) enum ApiError {
         /// Error returned by the console itself.
         #[error("{REQUEST_FAILED} with {0}")]
         Console(ConsoleError),
@@ -50,7 +50,7 @@ pub mod errors {
 
     impl ApiError {
         /// Returns HTTP status code if it's the reason for failure.
-        pub fn get_reason(&self) -> messages::Reason {
+        pub(crate) fn get_reason(&self) -> messages::Reason {
             match self {
                 ApiError::Console(e) => e.get_reason(),
                 ApiError::Transport(_) => messages::Reason::Unknown,
@@ -146,7 +146,7 @@ pub mod errors {
     }
 
     #[derive(Debug, Error)]
-    pub enum GetAuthInfoError {
+    pub(crate) enum GetAuthInfoError {
         // We shouldn't include the actual secret here.
         #[error("Console responded with a malformed auth secret")]
         BadSecret,
@@ -183,7 +183,7 @@ pub mod errors {
     }
 
     #[derive(Debug, Error)]
-    pub enum WakeComputeError {
+    pub(crate) enum WakeComputeError {
         #[error("Console responded with a malformed compute address: {0}")]
         BadComputeAddress(Box<str>),
 
@@ -247,7 +247,7 @@ pub mod errors {
 
 /// Auth secret which is managed by the cloud.
 #[derive(Clone, Eq, PartialEq, Debug)]
-pub enum AuthSecret {
+pub(crate) enum AuthSecret {
     #[cfg(any(test, feature = "testing"))]
     /// Md5 hash of user's password.
     Md5([u8; 16]),
@@ -257,32 +257,32 @@ pub enum AuthSecret {
 }
 
 #[derive(Default)]
-pub struct AuthInfo {
-    pub secret: Option<AuthSecret>,
+pub(crate) struct AuthInfo {
+    pub(crate) secret: Option<AuthSecret>,
     /// List of IP addresses allowed for the autorization.
-    pub allowed_ips: Vec<IpPattern>,
+    pub(crate) allowed_ips: Vec<IpPattern>,
     /// Project ID. This is used for cache invalidation.
-    pub project_id: Option<ProjectIdInt>,
+    pub(crate) project_id: Option<ProjectIdInt>,
 }
 
 /// Info for establishing a connection to a compute node.
 /// This is what we get after auth succeeded, but not before!
 #[derive(Clone)]
-pub struct NodeInfo {
+pub(crate) struct NodeInfo {
     /// Compute node connection params.
     /// It's sad that we have to clone this, but this will improve
     /// once we migrate to a bespoke connection logic.
-    pub config: compute::ConnCfg,
+    pub(crate) config: compute::ConnCfg,
 
     /// Labels for proxy's metrics.
-    pub aux: MetricsAuxInfo,
+    pub(crate) aux: MetricsAuxInfo,
 
     /// Whether we should accept self-signed certificates (for testing)
-    pub allow_self_signed_compute: bool,
+    pub(crate) allow_self_signed_compute: bool,
 }
 
 impl NodeInfo {
-    pub async fn connect(
+    pub(crate) async fn connect(
         &self,
         ctx: &RequestMonitoring,
         timeout: Duration,
@@ -296,12 +296,12 @@ impl NodeInfo {
             )
             .await
     }
-    pub fn reuse_settings(&mut self, other: Self) {
+    pub(crate) fn reuse_settings(&mut self, other: Self) {
         self.allow_self_signed_compute = other.allow_self_signed_compute;
         self.config.reuse_password(other.config);
     }
 
-    pub fn set_keys(&mut self, keys: &ComputeCredentialKeys) {
+    pub(crate) fn set_keys(&mut self, keys: &ComputeCredentialKeys) {
         match keys {
             ComputeCredentialKeys::Password(password) => self.config.password(password),
             ComputeCredentialKeys::AuthKeys(auth_keys) => self.config.auth_keys(*auth_keys),
@@ -310,10 +310,10 @@ impl NodeInfo {
     }
 }
 
-pub type NodeInfoCache = TimedLru<EndpointCacheKey, Result<NodeInfo, Box<ConsoleError>>>;
-pub type CachedNodeInfo = Cached<&'static NodeInfoCache, NodeInfo>;
-pub type CachedRoleSecret = Cached<&'static ProjectInfoCacheImpl, Option<AuthSecret>>;
-pub type CachedAllowedIps = Cached<&'static ProjectInfoCacheImpl, Arc<Vec<IpPattern>>>;
+pub(crate) type NodeInfoCache = TimedLru<EndpointCacheKey, Result<NodeInfo, Box<ConsoleError>>>;
+pub(crate) type CachedNodeInfo = Cached<&'static NodeInfoCache, NodeInfo>;
+pub(crate) type CachedRoleSecret = Cached<&'static ProjectInfoCacheImpl, Option<AuthSecret>>;
+pub(crate) type CachedAllowedIps = Cached<&'static ProjectInfoCacheImpl, Arc<Vec<IpPattern>>>;
 
 /// This will allocate per each call, but the http requests alone
 /// already require a few allocations, so it should be fine.
@@ -350,6 +350,7 @@ pub enum ConsoleBackend {
     Postgres(mock::Api),
     /// Internal testing
     #[cfg(test)]
+    #[allow(private_interfaces)]
     Test(Box<dyn crate::auth::backend::TestBackend>),
 }
 
@@ -402,7 +403,7 @@ impl Api for ConsoleBackend {
 /// Various caches for [`console`](super).
 pub struct ApiCaches {
     /// Cache for the `wake_compute` API method.
-    pub node_info: NodeInfoCache,
+    pub(crate) node_info: NodeInfoCache,
     /// Cache which stores project_id -> endpoint_ids mapping.
     pub project_info: Arc<ProjectInfoCacheImpl>,
     /// List of all valid endpoints.
@@ -439,7 +440,7 @@ pub struct ApiLocks<K> {
 }
 
 #[derive(Debug, thiserror::Error)]
-pub enum ApiLockError {
+pub(crate) enum ApiLockError {
     #[error("timeout acquiring resource permit")]
     TimeoutError(#[from] tokio::time::error::Elapsed),
 }
@@ -471,7 +472,7 @@ impl<K: Hash + Eq + Clone> ApiLocks<K> {
         })
     }
 
-    pub async fn get_permit(&self, key: &K) -> Result<WakeComputePermit, ApiLockError> {
+    pub(crate) async fn get_permit(&self, key: &K) -> Result<WakeComputePermit, ApiLockError> {
         if self.config.initial_limit == 0 {
             return Ok(WakeComputePermit {
                 permit: Token::disabled(),
@@ -531,18 +532,18 @@ impl<K: Hash + Eq + Clone> ApiLocks<K> {
     }
 }
 
-pub struct WakeComputePermit {
+pub(crate) struct WakeComputePermit {
     permit: Token,
 }
 
 impl WakeComputePermit {
-    pub fn should_check_cache(&self) -> bool {
+    pub(crate) fn should_check_cache(&self) -> bool {
         !self.permit.is_disabled()
     }
-    pub fn release(self, outcome: Outcome) {
+    pub(crate) fn release(self, outcome: Outcome) {
         self.permit.release(outcome);
     }
-    pub fn release_result<T, E>(self, res: Result<T, E>) -> Result<T, E> {
+    pub(crate) fn release_result<T, E>(self, res: Result<T, E>) -> Result<T, E> {
         match res {
             Ok(_) => self.release(Outcome::Success),
             Err(_) => self.release(Outcome::Overload),
diff --git a/proxy/src/console/provider/mock.rs b/proxy/src/console/provider/mock.rs
index 4e8b7a9365..08b87cd87a 100644
--- a/proxy/src/console/provider/mock.rs
+++ b/proxy/src/console/provider/mock.rs
@@ -48,7 +48,7 @@ impl Api {
         Self { endpoint }
     }
 
-    pub fn url(&self) -> &str {
+    pub(crate) fn url(&self) -> &str {
         self.endpoint.as_str()
     }
 
diff --git a/proxy/src/console/provider/neon.rs b/proxy/src/console/provider/neon.rs
index a6c0e233fc..33eda72e65 100644
--- a/proxy/src/console/provider/neon.rs
+++ b/proxy/src/console/provider/neon.rs
@@ -25,8 +25,8 @@ use tracing::{debug, error, info, info_span, warn, Instrument};
 pub struct Api {
     endpoint: http::Endpoint,
     pub caches: &'static ApiCaches,
-    pub locks: &'static ApiLocks<EndpointCacheKey>,
-    pub wake_compute_endpoint_rate_limiter: Arc<WakeComputeRateLimiter>,
+    pub(crate) locks: &'static ApiLocks<EndpointCacheKey>,
+    pub(crate) wake_compute_endpoint_rate_limiter: Arc<WakeComputeRateLimiter>,
     jwt: String,
 }
 
@@ -51,7 +51,7 @@ impl Api {
         }
     }
 
-    pub fn url(&self) -> &str {
+    pub(crate) fn url(&self) -> &str {
         self.endpoint.url().as_str()
     }
 
diff --git a/proxy/src/context.rs b/proxy/src/context.rs
index cafbdedc15..9edba543fe 100644
--- a/proxy/src/context.rs
+++ b/proxy/src/context.rs
@@ -22,8 +22,9 @@ use self::parquet::RequestData;
 
 pub mod parquet;
 
-pub static LOG_CHAN: OnceCell<mpsc::WeakUnboundedSender<RequestData>> = OnceCell::new();
-pub static LOG_CHAN_DISCONNECT: OnceCell<mpsc::WeakUnboundedSender<RequestData>> = OnceCell::new();
+pub(crate) static LOG_CHAN: OnceCell<mpsc::WeakUnboundedSender<RequestData>> = OnceCell::new();
+pub(crate) static LOG_CHAN_DISCONNECT: OnceCell<mpsc::WeakUnboundedSender<RequestData>> =
+    OnceCell::new();
 
 /// Context data for a single request to connect to a database.
 ///
@@ -38,12 +39,12 @@ pub struct RequestMonitoring(
 );
 
 struct RequestMonitoringInner {
-    pub peer_addr: IpAddr,
-    pub session_id: Uuid,
-    pub protocol: Protocol,
+    pub(crate) peer_addr: IpAddr,
+    pub(crate) session_id: Uuid,
+    pub(crate) protocol: Protocol,
     first_packet: chrono::DateTime<Utc>,
     region: &'static str,
-    pub span: Span,
+    pub(crate) span: Span,
 
     // filled in as they are discovered
     project: Option<ProjectIdInt>,
@@ -63,14 +64,14 @@ struct RequestMonitoringInner {
     sender: Option<mpsc::UnboundedSender<RequestData>>,
     // This sender is only used to log the length of session in case of success.
     disconnect_sender: Option<mpsc::UnboundedSender<RequestData>>,
-    pub latency_timer: LatencyTimer,
+    pub(crate) latency_timer: LatencyTimer,
     // Whether proxy decided that it's not a valid endpoint end rejected it before going to cplane.
     rejected: Option<bool>,
     disconnect_timestamp: Option<chrono::DateTime<Utc>>,
 }
 
 #[derive(Clone, Debug)]
-pub enum AuthMethod {
+pub(crate) enum AuthMethod {
     // aka link aka passwordless
     Web,
     ScramSha256,
@@ -125,11 +126,11 @@ impl RequestMonitoring {
     }
 
     #[cfg(test)]
-    pub fn test() -> Self {
+    pub(crate) fn test() -> Self {
         RequestMonitoring::new(Uuid::now_v7(), [127, 0, 0, 1].into(), Protocol::Tcp, "test")
     }
 
-    pub fn console_application_name(&self) -> String {
+    pub(crate) fn console_application_name(&self) -> String {
         let this = self.0.try_lock().expect("should not deadlock");
         format!(
             "{}/{}",
@@ -138,19 +139,19 @@ impl RequestMonitoring {
         )
     }
 
-    pub fn set_rejected(&self, rejected: bool) {
+    pub(crate) fn set_rejected(&self, rejected: bool) {
         let mut this = self.0.try_lock().expect("should not deadlock");
         this.rejected = Some(rejected);
     }
 
-    pub fn set_cold_start_info(&self, info: ColdStartInfo) {
+    pub(crate) fn set_cold_start_info(&self, info: ColdStartInfo) {
         self.0
             .try_lock()
             .expect("should not deadlock")
             .set_cold_start_info(info);
     }
 
-    pub fn set_db_options(&self, options: StartupMessageParams) {
+    pub(crate) fn set_db_options(&self, options: StartupMessageParams) {
         let mut this = self.0.try_lock().expect("should not deadlock");
         this.set_application(options.get("application_name").map(SmolStr::from));
         if let Some(user) = options.get("user") {
@@ -163,7 +164,7 @@ impl RequestMonitoring {
         this.pg_options = Some(options);
     }
 
-    pub fn set_project(&self, x: MetricsAuxInfo) {
+    pub(crate) fn set_project(&self, x: MetricsAuxInfo) {
         let mut this = self.0.try_lock().expect("should not deadlock");
         if this.endpoint_id.is_none() {
             this.set_endpoint_id(x.endpoint_id.as_str().into());
@@ -173,33 +174,33 @@ impl RequestMonitoring {
         this.set_cold_start_info(x.cold_start_info);
     }
 
-    pub fn set_project_id(&self, project_id: ProjectIdInt) {
+    pub(crate) fn set_project_id(&self, project_id: ProjectIdInt) {
         let mut this = self.0.try_lock().expect("should not deadlock");
         this.project = Some(project_id);
     }
 
-    pub fn set_endpoint_id(&self, endpoint_id: EndpointId) {
+    pub(crate) fn set_endpoint_id(&self, endpoint_id: EndpointId) {
         self.0
             .try_lock()
             .expect("should not deadlock")
             .set_endpoint_id(endpoint_id);
     }
 
-    pub fn set_dbname(&self, dbname: DbName) {
+    pub(crate) fn set_dbname(&self, dbname: DbName) {
         self.0
             .try_lock()
             .expect("should not deadlock")
             .set_dbname(dbname);
     }
 
-    pub fn set_user(&self, user: RoleName) {
+    pub(crate) fn set_user(&self, user: RoleName) {
         self.0
             .try_lock()
             .expect("should not deadlock")
             .set_user(user);
     }
 
-    pub fn set_auth_method(&self, auth_method: AuthMethod) {
+    pub(crate) fn set_auth_method(&self, auth_method: AuthMethod) {
         let mut this = self.0.try_lock().expect("should not deadlock");
         this.auth_method = Some(auth_method);
     }
@@ -211,7 +212,7 @@ impl RequestMonitoring {
             .has_private_peer_addr()
     }
 
-    pub fn set_error_kind(&self, kind: ErrorKind) {
+    pub(crate) fn set_error_kind(&self, kind: ErrorKind) {
         let mut this = self.0.try_lock().expect("should not deadlock");
         // Do not record errors from the private address to metrics.
         if !this.has_private_peer_addr() {
@@ -237,30 +238,30 @@ impl RequestMonitoring {
             .log_connect();
     }
 
-    pub fn protocol(&self) -> Protocol {
+    pub(crate) fn protocol(&self) -> Protocol {
         self.0.try_lock().expect("should not deadlock").protocol
     }
 
-    pub fn span(&self) -> Span {
+    pub(crate) fn span(&self) -> Span {
         self.0.try_lock().expect("should not deadlock").span.clone()
     }
 
-    pub fn session_id(&self) -> Uuid {
+    pub(crate) fn session_id(&self) -> Uuid {
         self.0.try_lock().expect("should not deadlock").session_id
     }
 
-    pub fn peer_addr(&self) -> IpAddr {
+    pub(crate) fn peer_addr(&self) -> IpAddr {
         self.0.try_lock().expect("should not deadlock").peer_addr
     }
 
-    pub fn cold_start_info(&self) -> ColdStartInfo {
+    pub(crate) fn cold_start_info(&self) -> ColdStartInfo {
         self.0
             .try_lock()
             .expect("should not deadlock")
             .cold_start_info
     }
 
-    pub fn latency_timer_pause(&self, waiting_for: Waiting) -> LatencyTimerPause<'_> {
+    pub(crate) fn latency_timer_pause(&self, waiting_for: Waiting) -> LatencyTimerPause<'_> {
         LatencyTimerPause {
             ctx: self,
             start: tokio::time::Instant::now(),
@@ -268,7 +269,7 @@ impl RequestMonitoring {
         }
     }
 
-    pub fn success(&self) {
+    pub(crate) fn success(&self) {
         self.0
             .try_lock()
             .expect("should not deadlock")
@@ -277,7 +278,7 @@ impl RequestMonitoring {
     }
 }
 
-pub struct LatencyTimerPause<'a> {
+pub(crate) struct LatencyTimerPause<'a> {
     ctx: &'a RequestMonitoring,
     start: tokio::time::Instant,
     waiting_for: Waiting,
diff --git a/proxy/src/context/parquet.rs b/proxy/src/context/parquet.rs
index e5962b35fa..88caa9a316 100644
--- a/proxy/src/context/parquet.rs
+++ b/proxy/src/context/parquet.rs
@@ -62,8 +62,8 @@ pub struct ParquetUploadArgs {
 // But after FAILED_UPLOAD_WARN_THRESHOLD retries, we start to log it at WARN
 // level instead, as repeated failures can mean a more serious problem. If it
 // fails more than FAILED_UPLOAD_RETRIES times, we give up
-pub const FAILED_UPLOAD_WARN_THRESHOLD: u32 = 3;
-pub const FAILED_UPLOAD_MAX_RETRIES: u32 = 10;
+pub(crate) const FAILED_UPLOAD_WARN_THRESHOLD: u32 = 3;
+pub(crate) const FAILED_UPLOAD_MAX_RETRIES: u32 = 10;
 
 // the parquet crate leaves a lot to be desired...
 // what follows is an attempt to write parquet files with minimal allocs.
@@ -73,7 +73,7 @@ pub const FAILED_UPLOAD_MAX_RETRIES: u32 = 10;
 // * after each rowgroup write, we check the length of the file and upload to s3 if large enough
 
 #[derive(parquet_derive::ParquetRecordWriter)]
-pub struct RequestData {
+pub(crate) struct RequestData {
     region: &'static str,
     protocol: &'static str,
     /// Must be UTC. The derive macro doesn't like the timezones
diff --git a/proxy/src/error.rs b/proxy/src/error.rs
index fdfe50a494..53f9f75c5b 100644
--- a/proxy/src/error.rs
+++ b/proxy/src/error.rs
@@ -3,12 +3,12 @@ use std::{error::Error as StdError, fmt, io};
 use measured::FixedCardinalityLabel;
 
 /// Upcast (almost) any error into an opaque [`io::Error`].
-pub fn io_error(e: impl Into<Box<dyn StdError + Send + Sync>>) -> io::Error {
+pub(crate) fn io_error(e: impl Into<Box<dyn StdError + Send + Sync>>) -> io::Error {
     io::Error::new(io::ErrorKind::Other, e)
 }
 
 /// A small combinator for pluggable error logging.
-pub fn log_error<E: fmt::Display>(e: E) -> E {
+pub(crate) fn log_error<E: fmt::Display>(e: E) -> E {
     tracing::error!("{e}");
     e
 }
@@ -19,7 +19,7 @@ pub fn log_error<E: fmt::Display>(e: E) -> E {
 /// NOTE: This trait should not be implemented for [`anyhow::Error`], since it
 /// is way too convenient and tends to proliferate all across the codebase,
 /// ultimately leading to accidental leaks of sensitive data.
-pub trait UserFacingError: ReportableError {
+pub(crate) trait UserFacingError: ReportableError {
     /// Format the error for client, stripping all sensitive info.
     ///
     /// Although this might be a no-op for many types, it's highly
@@ -64,7 +64,7 @@ pub enum ErrorKind {
 }
 
 impl ErrorKind {
-    pub fn to_metric_label(&self) -> &'static str {
+    pub(crate) fn to_metric_label(self) -> &'static str {
         match self {
             ErrorKind::User => "user",
             ErrorKind::ClientDisconnect => "clientdisconnect",
@@ -78,7 +78,7 @@ impl ErrorKind {
     }
 }
 
-pub trait ReportableError: fmt::Display + Send + 'static {
+pub(crate) trait ReportableError: fmt::Display + Send + 'static {
     fn get_error_kind(&self) -> ErrorKind;
 }
 
diff --git a/proxy/src/http.rs b/proxy/src/http.rs
index 1f1dd8c415..fee634f67f 100644
--- a/proxy/src/http.rs
+++ b/proxy/src/http.rs
@@ -12,9 +12,9 @@ use http_body_util::BodyExt;
 use hyper1::body::Body;
 use serde::de::DeserializeOwned;
 
-pub use reqwest::{Request, Response, StatusCode};
-pub use reqwest_middleware::{ClientWithMiddleware, Error};
-pub use reqwest_retry::{policies::ExponentialBackoff, RetryTransientMiddleware};
+pub(crate) use reqwest::{Request, Response};
+pub(crate) use reqwest_middleware::{ClientWithMiddleware, Error};
+pub(crate) use reqwest_retry::{policies::ExponentialBackoff, RetryTransientMiddleware};
 
 use crate::{
     metrics::{ConsoleRequest, Metrics},
@@ -35,7 +35,7 @@ pub fn new_client() -> ClientWithMiddleware {
         .build()
 }
 
-pub fn new_client_with_timeout(default_timout: Duration) -> ClientWithMiddleware {
+pub(crate) fn new_client_with_timeout(default_timout: Duration) -> ClientWithMiddleware {
     let timeout_client = reqwest::ClientBuilder::new()
         .timeout(default_timout)
         .build()
@@ -77,20 +77,20 @@ impl Endpoint {
     }
 
     #[inline(always)]
-    pub fn url(&self) -> &ApiUrl {
+    pub(crate) fn url(&self) -> &ApiUrl {
         &self.endpoint
     }
 
     /// Return a [builder](RequestBuilder) for a `GET` request,
     /// appending a single `path` segment to the base endpoint URL.
-    pub fn get(&self, path: &str) -> RequestBuilder {
+    pub(crate) fn get(&self, path: &str) -> RequestBuilder {
         let mut url = self.endpoint.clone();
         url.path_segments_mut().push(path);
         self.client.get(url.into_inner())
     }
 
     /// Execute a [request](reqwest::Request).
-    pub async fn execute(&self, request: Request) -> Result<Response, Error> {
+    pub(crate) async fn execute(&self, request: Request) -> Result<Response, Error> {
         let _timer = Metrics::get()
             .proxy
             .console_request_latency
@@ -102,7 +102,7 @@ impl Endpoint {
     }
 }
 
-pub async fn parse_json_body_with_limit<D: DeserializeOwned>(
+pub(crate) async fn parse_json_body_with_limit<D: DeserializeOwned>(
     mut b: impl Body<Data = Bytes, Error = reqwest::Error> + Unpin,
     limit: usize,
 ) -> anyhow::Result<D> {
diff --git a/proxy/src/intern.rs b/proxy/src/intern.rs
index d418caa511..e5144cfe2e 100644
--- a/proxy/src/intern.rs
+++ b/proxy/src/intern.rs
@@ -29,10 +29,10 @@ impl<Id: InternId> std::fmt::Display for InternedString<Id> {
 }
 
 impl<Id: InternId> InternedString<Id> {
-    pub fn as_str(&self) -> &'static str {
+    pub(crate) fn as_str(&self) -> &'static str {
         Id::get_interner().inner.resolve(&self.inner)
     }
-    pub fn get(s: &str) -> Option<Self> {
+    pub(crate) fn get(s: &str) -> Option<Self> {
         Id::get_interner().get(s)
     }
 }
@@ -78,7 +78,7 @@ impl<Id: InternId> serde::Serialize for InternedString<Id> {
 }
 
 impl<Id: InternId> StringInterner<Id> {
-    pub fn new() -> Self {
+    pub(crate) fn new() -> Self {
         StringInterner {
             inner: ThreadedRodeo::with_capacity_memory_limits_and_hasher(
                 Capacity::new(2500, NonZeroUsize::new(1 << 16).unwrap()),
@@ -90,26 +90,24 @@ impl<Id: InternId> StringInterner<Id> {
         }
     }
 
-    pub fn is_empty(&self) -> bool {
-        self.inner.is_empty()
-    }
-
-    pub fn len(&self) -> usize {
+    #[cfg(test)]
+    fn len(&self) -> usize {
         self.inner.len()
     }
 
-    pub fn current_memory_usage(&self) -> usize {
+    #[cfg(test)]
+    fn current_memory_usage(&self) -> usize {
         self.inner.current_memory_usage()
     }
 
-    pub fn get_or_intern(&self, s: &str) -> InternedString<Id> {
+    pub(crate) fn get_or_intern(&self, s: &str) -> InternedString<Id> {
         InternedString {
             inner: self.inner.get_or_intern(s),
             _id: PhantomData,
         }
     }
 
-    pub fn get(&self, s: &str) -> Option<InternedString<Id>> {
+    pub(crate) fn get(&self, s: &str) -> Option<InternedString<Id>> {
         Some(InternedString {
             inner: self.inner.get(s)?,
             _id: PhantomData,
@@ -132,14 +130,14 @@ impl<Id: InternId> Default for StringInterner<Id> {
 }
 
 #[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)]
-pub struct RoleNameTag;
+pub(crate) struct RoleNameTag;
 impl InternId for RoleNameTag {
     fn get_interner() -> &'static StringInterner<Self> {
-        pub static ROLE_NAMES: OnceLock<StringInterner<RoleNameTag>> = OnceLock::new();
+        static ROLE_NAMES: OnceLock<StringInterner<RoleNameTag>> = OnceLock::new();
         ROLE_NAMES.get_or_init(Default::default)
     }
 }
-pub type RoleNameInt = InternedString<RoleNameTag>;
+pub(crate) type RoleNameInt = InternedString<RoleNameTag>;
 impl From<&RoleName> for RoleNameInt {
     fn from(value: &RoleName) -> Self {
         RoleNameTag::get_interner().get_or_intern(value)
@@ -150,7 +148,7 @@ impl From<&RoleName> for RoleNameInt {
 pub struct EndpointIdTag;
 impl InternId for EndpointIdTag {
     fn get_interner() -> &'static StringInterner<Self> {
-        pub static ROLE_NAMES: OnceLock<StringInterner<EndpointIdTag>> = OnceLock::new();
+        static ROLE_NAMES: OnceLock<StringInterner<EndpointIdTag>> = OnceLock::new();
         ROLE_NAMES.get_or_init(Default::default)
     }
 }
@@ -170,7 +168,7 @@ impl From<EndpointId> for EndpointIdInt {
 pub struct BranchIdTag;
 impl InternId for BranchIdTag {
     fn get_interner() -> &'static StringInterner<Self> {
-        pub static ROLE_NAMES: OnceLock<StringInterner<BranchIdTag>> = OnceLock::new();
+        static ROLE_NAMES: OnceLock<StringInterner<BranchIdTag>> = OnceLock::new();
         ROLE_NAMES.get_or_init(Default::default)
     }
 }
@@ -190,7 +188,7 @@ impl From<BranchId> for BranchIdInt {
 pub struct ProjectIdTag;
 impl InternId for ProjectIdTag {
     fn get_interner() -> &'static StringInterner<Self> {
-        pub static ROLE_NAMES: OnceLock<StringInterner<ProjectIdTag>> = OnceLock::new();
+        static ROLE_NAMES: OnceLock<StringInterner<ProjectIdTag>> = OnceLock::new();
         ROLE_NAMES.get_or_init(Default::default)
     }
 }
@@ -217,7 +215,7 @@ mod tests {
     struct MyId;
     impl InternId for MyId {
         fn get_interner() -> &'static StringInterner<Self> {
-            pub static ROLE_NAMES: OnceLock<StringInterner<MyId>> = OnceLock::new();
+            pub(crate) static ROLE_NAMES: OnceLock<StringInterner<MyId>> = OnceLock::new();
             ROLE_NAMES.get_or_init(Default::default)
         }
     }
diff --git a/proxy/src/lib.rs b/proxy/src/lib.rs
index 1e14ca59ec..8d7e586b3d 100644
--- a/proxy/src/lib.rs
+++ b/proxy/src/lib.rs
@@ -157,7 +157,8 @@ macro_rules! smol_str_wrapper {
         pub struct $name(smol_str::SmolStr);
 
         impl $name {
-            pub fn as_str(&self) -> &str {
+            #[allow(unused)]
+            pub(crate) fn as_str(&self) -> &str {
                 self.0.as_str()
             }
         }
@@ -252,19 +253,19 @@ smol_str_wrapper!(Host);
 
 // Endpoints are a bit tricky. Rare they might be branches or projects.
 impl EndpointId {
-    pub fn is_endpoint(&self) -> bool {
+    pub(crate) fn is_endpoint(&self) -> bool {
         self.0.starts_with("ep-")
     }
-    pub fn is_branch(&self) -> bool {
+    pub(crate) fn is_branch(&self) -> bool {
         self.0.starts_with("br-")
     }
-    pub fn is_project(&self) -> bool {
-        !self.is_endpoint() && !self.is_branch()
-    }
-    pub fn as_branch(&self) -> BranchId {
+    // pub(crate) fn is_project(&self) -> bool {
+    //     !self.is_endpoint() && !self.is_branch()
+    // }
+    pub(crate) fn as_branch(&self) -> BranchId {
         BranchId(self.0.clone())
     }
-    pub fn as_project(&self) -> ProjectId {
+    pub(crate) fn as_project(&self) -> ProjectId {
         ProjectId(self.0.clone())
     }
 }
diff --git a/proxy/src/parse.rs b/proxy/src/parse.rs
index 0d03574901..8c0f251066 100644
--- a/proxy/src/parse.rs
+++ b/proxy/src/parse.rs
@@ -2,14 +2,14 @@
 
 use std::ffi::CStr;
 
-pub fn split_cstr(bytes: &[u8]) -> Option<(&CStr, &[u8])> {
+pub(crate) fn split_cstr(bytes: &[u8]) -> Option<(&CStr, &[u8])> {
     let cstr = CStr::from_bytes_until_nul(bytes).ok()?;
     let (_, other) = bytes.split_at(cstr.to_bytes_with_nul().len());
     Some((cstr, other))
 }
 
 /// See <https://doc.rust-lang.org/std/primitive.slice.html#method.split_array_ref>.
-pub fn split_at_const<const N: usize>(bytes: &[u8]) -> Option<(&[u8; N], &[u8])> {
+pub(crate) fn split_at_const<const N: usize>(bytes: &[u8]) -> Option<(&[u8; N], &[u8])> {
     (bytes.len() >= N).then(|| {
         let (head, tail) = bytes.split_at(N);
         (head.try_into().unwrap(), tail)
diff --git a/proxy/src/protocol2.rs b/proxy/src/protocol2.rs
index 1dd4563514..17764f78d1 100644
--- a/proxy/src/protocol2.rs
+++ b/proxy/src/protocol2.rs
@@ -13,9 +13,9 @@ use tokio::io::{AsyncRead, AsyncReadExt, AsyncWrite, ReadBuf};
 
 pin_project! {
     /// A chained [`AsyncRead`] with [`AsyncWrite`] passthrough
-    pub struct ChainRW<T> {
+    pub(crate) struct ChainRW<T> {
         #[pin]
-        pub inner: T,
+        pub(crate) inner: T,
         buf: BytesMut,
     }
 }
@@ -60,7 +60,7 @@ const HEADER: [u8; 12] = [
     0x0D, 0x0A, 0x0D, 0x0A, 0x00, 0x0D, 0x0A, 0x51, 0x55, 0x49, 0x54, 0x0A,
 ];
 
-pub async fn read_proxy_protocol<T: AsyncRead + Unpin>(
+pub(crate) async fn read_proxy_protocol<T: AsyncRead + Unpin>(
     mut read: T,
 ) -> std::io::Result<(ChainRW<T>, Option<SocketAddr>)> {
     let mut buf = BytesMut::with_capacity(128);
diff --git a/proxy/src/proxy.rs b/proxy/src/proxy.rs
index aa1025a29f..ff199ac701 100644
--- a/proxy/src/proxy.rs
+++ b/proxy/src/proxy.rs
@@ -1,12 +1,12 @@
 #[cfg(test)]
 mod tests;
 
-pub mod connect_compute;
+pub(crate) mod connect_compute;
 mod copy_bidirectional;
-pub mod handshake;
-pub mod passthrough;
-pub mod retry;
-pub mod wake_compute;
+pub(crate) mod handshake;
+pub(crate) mod passthrough;
+pub(crate) mod retry;
+pub(crate) mod wake_compute;
 pub use copy_bidirectional::copy_bidirectional_client_compute;
 pub use copy_bidirectional::ErrorSource;
 
@@ -170,21 +170,21 @@ pub async fn task_main(
     Ok(())
 }
 
-pub enum ClientMode {
+pub(crate) enum ClientMode {
     Tcp,
     Websockets { hostname: Option<String> },
 }
 
 /// Abstracts the logic of handling TCP vs WS clients
 impl ClientMode {
-    pub fn allow_cleartext(&self) -> bool {
+    pub(crate) fn allow_cleartext(&self) -> bool {
         match self {
             ClientMode::Tcp => false,
             ClientMode::Websockets { .. } => true,
         }
     }
 
-    pub fn allow_self_signed_compute(&self, config: &ProxyConfig) -> bool {
+    pub(crate) fn allow_self_signed_compute(&self, config: &ProxyConfig) -> bool {
         match self {
             ClientMode::Tcp => config.allow_self_signed_compute,
             ClientMode::Websockets { .. } => false,
@@ -213,7 +213,7 @@ impl ClientMode {
 // 2. Handshake: handshake reports errors if it can, otherwise if the handshake fails due to protocol violation,
 //    we cannot be sure the client even understands our error message
 // 3. PrepareClient: The client disconnected, so we can't tell them anyway...
-pub enum ClientRequestError {
+pub(crate) enum ClientRequestError {
     #[error("{0}")]
     Cancellation(#[from] cancellation::CancelError),
     #[error("{0}")]
@@ -238,7 +238,7 @@ impl ReportableError for ClientRequestError {
     }
 }
 
-pub async fn handle_client<S: AsyncRead + AsyncWrite + Unpin>(
+pub(crate) async fn handle_client<S: AsyncRead + AsyncWrite + Unpin>(
     config: &'static ProxyConfig,
     ctx: &RequestMonitoring,
     cancellation_handler: Arc<CancellationHandlerMain>,
@@ -340,9 +340,9 @@ pub async fn handle_client<S: AsyncRead + AsyncWrite + Unpin>(
         client: stream,
         aux: node.aux.clone(),
         compute: node,
-        req: request_gauge,
-        conn: conn_gauge,
-        cancel: session,
+        _req: request_gauge,
+        _conn: conn_gauge,
+        _cancel: session,
     }))
 }
 
@@ -377,20 +377,20 @@ async fn prepare_client_connection<P>(
 }
 
 #[derive(Debug, Clone, PartialEq, Eq, Default)]
-pub struct NeonOptions(Vec<(SmolStr, SmolStr)>);
+pub(crate) struct NeonOptions(Vec<(SmolStr, SmolStr)>);
 
 impl NeonOptions {
-    pub fn parse_params(params: &StartupMessageParams) -> Self {
+    pub(crate) fn parse_params(params: &StartupMessageParams) -> Self {
         params
             .options_raw()
             .map(Self::parse_from_iter)
             .unwrap_or_default()
     }
-    pub fn parse_options_raw(options: &str) -> Self {
+    pub(crate) fn parse_options_raw(options: &str) -> Self {
         Self::parse_from_iter(StartupMessageParams::parse_options_raw(options))
     }
 
-    pub fn is_ephemeral(&self) -> bool {
+    pub(crate) fn is_ephemeral(&self) -> bool {
         // Currently, neon endpoint options are all reserved for ephemeral endpoints.
         !self.0.is_empty()
     }
@@ -404,7 +404,7 @@ impl NeonOptions {
         Self(options)
     }
 
-    pub fn get_cache_key(&self, prefix: &str) -> EndpointCacheKey {
+    pub(crate) fn get_cache_key(&self, prefix: &str) -> EndpointCacheKey {
         // prefix + format!(" {k}:{v}")
         // kinda jank because SmolStr is immutable
         std::iter::once(prefix)
@@ -415,7 +415,7 @@ impl NeonOptions {
 
     /// <https://swagger.io/docs/specification/serialization/> DeepObject format
     /// `paramName[prop1]=value1&paramName[prop2]=value2&...`
-    pub fn to_deep_object(&self) -> Vec<(SmolStr, SmolStr)> {
+    pub(crate) fn to_deep_object(&self) -> Vec<(SmolStr, SmolStr)> {
         self.0
             .iter()
             .map(|(k, v)| (format_smolstr!("options[{}]", k), v.clone()))
@@ -423,7 +423,7 @@ impl NeonOptions {
     }
 }
 
-pub fn neon_option(bytes: &str) -> Option<(&str, &str)> {
+pub(crate) fn neon_option(bytes: &str) -> Option<(&str, &str)> {
     static RE: OnceCell<Regex> = OnceCell::new();
     let re = RE.get_or_init(|| Regex::new(r"^neon_(\w+):(.+)").unwrap());
 
diff --git a/proxy/src/proxy/connect_compute.rs b/proxy/src/proxy/connect_compute.rs
index 6305dc204e..613548d4a0 100644
--- a/proxy/src/proxy/connect_compute.rs
+++ b/proxy/src/proxy/connect_compute.rs
@@ -25,7 +25,7 @@ const CONNECT_TIMEOUT: time::Duration = time::Duration::from_secs(2);
 /// (e.g. the compute node's address might've changed at the wrong time).
 /// Invalidate the cache entry (if any) to prevent subsequent errors.
 #[tracing::instrument(name = "invalidate_cache", skip_all)]
-pub fn invalidate_cache(node_info: console::CachedNodeInfo) -> NodeInfo {
+pub(crate) fn invalidate_cache(node_info: console::CachedNodeInfo) -> NodeInfo {
     let is_cached = node_info.cached();
     if is_cached {
         warn!("invalidating stalled compute node info cache entry");
@@ -41,7 +41,7 @@ pub fn invalidate_cache(node_info: console::CachedNodeInfo) -> NodeInfo {
 }
 
 #[async_trait]
-pub trait ConnectMechanism {
+pub(crate) trait ConnectMechanism {
     type Connection;
     type ConnectError: ReportableError;
     type Error: From<Self::ConnectError>;
@@ -56,7 +56,7 @@ pub trait ConnectMechanism {
 }
 
 #[async_trait]
-pub trait ComputeConnectBackend {
+pub(crate) trait ComputeConnectBackend {
     async fn wake_compute(
         &self,
         ctx: &RequestMonitoring,
@@ -65,12 +65,12 @@ pub trait ComputeConnectBackend {
     fn get_keys(&self) -> &ComputeCredentialKeys;
 }
 
-pub struct TcpMechanism<'a> {
+pub(crate) struct TcpMechanism<'a> {
     /// KV-dictionary with PostgreSQL connection params.
-    pub params: &'a StartupMessageParams,
+    pub(crate) params: &'a StartupMessageParams,
 
     /// connect_to_compute concurrency lock
-    pub locks: &'static ApiLocks<Host>,
+    pub(crate) locks: &'static ApiLocks<Host>,
 }
 
 #[async_trait]
@@ -98,7 +98,7 @@ impl ConnectMechanism for TcpMechanism<'_> {
 
 /// Try to connect to the compute node, retrying if necessary.
 #[tracing::instrument(skip_all)]
-pub async fn connect_to_compute<M: ConnectMechanism, B: ComputeConnectBackend>(
+pub(crate) async fn connect_to_compute<M: ConnectMechanism, B: ComputeConnectBackend>(
     ctx: &RequestMonitoring,
     mechanism: &M,
     user_info: &B,
diff --git a/proxy/src/proxy/copy_bidirectional.rs b/proxy/src/proxy/copy_bidirectional.rs
index f8c8e8bc4b..4ebda013ac 100644
--- a/proxy/src/proxy/copy_bidirectional.rs
+++ b/proxy/src/proxy/copy_bidirectional.rs
@@ -14,7 +14,7 @@ enum TransferState {
 }
 
 #[derive(Debug)]
-pub enum ErrorDirection {
+pub(crate) enum ErrorDirection {
     Read(io::Error),
     Write(io::Error),
 }
diff --git a/proxy/src/proxy/handshake.rs b/proxy/src/proxy/handshake.rs
index 27a72f8072..5996b11c11 100644
--- a/proxy/src/proxy/handshake.rs
+++ b/proxy/src/proxy/handshake.rs
@@ -18,7 +18,7 @@ use crate::{
 };
 
 #[derive(Error, Debug)]
-pub enum HandshakeError {
+pub(crate) enum HandshakeError {
     #[error("data is sent before server replied with EncryptionResponse")]
     EarlyData,
 
@@ -57,7 +57,7 @@ impl ReportableError for HandshakeError {
     }
 }
 
-pub enum HandshakeData<S> {
+pub(crate) enum HandshakeData<S> {
     Startup(PqStream<Stream<S>>, StartupMessageParams),
     Cancel(CancelKeyData),
 }
@@ -67,7 +67,7 @@ pub enum HandshakeData<S> {
 /// It's easier to work with owned `stream` here as we need to upgrade it to TLS;
 /// we also take an extra care of propagating only the select handshake errors to client.
 #[tracing::instrument(skip_all)]
-pub async fn handshake<S: AsyncRead + AsyncWrite + Unpin>(
+pub(crate) async fn handshake<S: AsyncRead + AsyncWrite + Unpin>(
     ctx: &RequestMonitoring,
     stream: S,
     mut tls: Option<&TlsConfig>,
diff --git a/proxy/src/proxy/passthrough.rs b/proxy/src/proxy/passthrough.rs
index 9942fac383..c17108de0a 100644
--- a/proxy/src/proxy/passthrough.rs
+++ b/proxy/src/proxy/passthrough.rs
@@ -14,7 +14,7 @@ use super::copy_bidirectional::ErrorSource;
 
 /// Forward bytes in both directions (client <-> compute).
 #[tracing::instrument(skip_all)]
-pub async fn proxy_pass(
+pub(crate) async fn proxy_pass(
     client: impl AsyncRead + AsyncWrite + Unpin,
     compute: impl AsyncRead + AsyncWrite + Unpin,
     aux: MetricsAuxInfo,
@@ -57,18 +57,18 @@ pub async fn proxy_pass(
     Ok(())
 }
 
-pub struct ProxyPassthrough<P, S> {
-    pub client: Stream<S>,
-    pub compute: PostgresConnection,
-    pub aux: MetricsAuxInfo,
+pub(crate) struct ProxyPassthrough<P, S> {
+    pub(crate) client: Stream<S>,
+    pub(crate) compute: PostgresConnection,
+    pub(crate) aux: MetricsAuxInfo,
 
-    pub req: NumConnectionRequestsGuard<'static>,
-    pub conn: NumClientConnectionsGuard<'static>,
-    pub cancel: cancellation::Session<P>,
+    pub(crate) _req: NumConnectionRequestsGuard<'static>,
+    pub(crate) _conn: NumClientConnectionsGuard<'static>,
+    pub(crate) _cancel: cancellation::Session<P>,
 }
 
 impl<P, S: AsyncRead + AsyncWrite + Unpin> ProxyPassthrough<P, S> {
-    pub async fn proxy_pass(self) -> Result<(), ErrorSource> {
+    pub(crate) async fn proxy_pass(self) -> Result<(), ErrorSource> {
         let res = proxy_pass(self.client, self.compute.stream, self.aux).await;
         if let Err(err) = self.compute.cancel_closure.try_cancel_query().await {
             tracing::error!(?err, "could not cancel the query in the database");
diff --git a/proxy/src/proxy/retry.rs b/proxy/src/proxy/retry.rs
index 644b183a91..15895d37e6 100644
--- a/proxy/src/proxy/retry.rs
+++ b/proxy/src/proxy/retry.rs
@@ -2,18 +2,18 @@ use crate::{compute, config::RetryConfig};
 use std::{error::Error, io};
 use tokio::time;
 
-pub trait CouldRetry {
+pub(crate) trait CouldRetry {
     /// Returns true if the error could be retried
     fn could_retry(&self) -> bool;
 }
 
-pub trait ShouldRetryWakeCompute {
+pub(crate) trait ShouldRetryWakeCompute {
     /// Returns true if we need to invalidate the cache for this node.
     /// If false, we can continue retrying with the current node cache.
     fn should_retry_wake_compute(&self) -> bool;
 }
 
-pub fn should_retry(err: &impl CouldRetry, num_retries: u32, config: RetryConfig) -> bool {
+pub(crate) fn should_retry(err: &impl CouldRetry, num_retries: u32, config: RetryConfig) -> bool {
     num_retries < config.max_retries && err.could_retry()
 }
 
@@ -101,7 +101,7 @@ impl ShouldRetryWakeCompute for compute::ConnectionError {
     }
 }
 
-pub fn retry_after(num_retries: u32, config: RetryConfig) -> time::Duration {
+pub(crate) fn retry_after(num_retries: u32, config: RetryConfig) -> time::Duration {
     config
         .base_delay
         .mul_f64(config.backoff_factor.powi((num_retries as i32) - 1))
diff --git a/proxy/src/proxy/tests.rs b/proxy/src/proxy/tests.rs
index 21c0641a7f..b3b284ef27 100644
--- a/proxy/src/proxy/tests.rs
+++ b/proxy/src/proxy/tests.rs
@@ -11,14 +11,14 @@ use crate::auth::backend::{
     ComputeCredentialKeys, ComputeCredentials, ComputeUserInfo, MaybeOwned, TestBackend,
 };
 use crate::config::{CertResolver, RetryConfig};
-use crate::console::caches::NodeInfoCache;
 use crate::console::messages::{ConsoleError, Details, MetricsAuxInfo, Status};
-use crate::console::provider::{CachedAllowedIps, CachedRoleSecret, ConsoleBackend};
+use crate::console::provider::{CachedAllowedIps, CachedRoleSecret, ConsoleBackend, NodeInfoCache};
 use crate::console::{self, CachedNodeInfo, NodeInfo};
 use crate::error::ErrorKind;
-use crate::{http, sasl, scram, BranchId, EndpointId, ProjectId};
+use crate::{sasl, scram, BranchId, EndpointId, ProjectId};
 use anyhow::{bail, Context};
 use async_trait::async_trait;
+use http::StatusCode;
 use retry::{retry_after, ShouldRetryWakeCompute};
 use rstest::rstest;
 use rustls::pki_types;
@@ -491,7 +491,7 @@ impl TestBackend for TestConnectMechanism {
             ConnectAction::Wake => Ok(helper_create_cached_node_info(self.cache)),
             ConnectAction::WakeFail => {
                 let err = console::errors::ApiError::Console(ConsoleError {
-                    http_status_code: http::StatusCode::BAD_REQUEST,
+                    http_status_code: StatusCode::BAD_REQUEST,
                     error: "TEST".into(),
                     status: None,
                 });
@@ -500,7 +500,7 @@ impl TestBackend for TestConnectMechanism {
             }
             ConnectAction::WakeRetry => {
                 let err = console::errors::ApiError::Console(ConsoleError {
-                    http_status_code: http::StatusCode::BAD_REQUEST,
+                    http_status_code: StatusCode::BAD_REQUEST,
                     error: "TEST".into(),
                     status: Some(Status {
                         code: "error".into(),
@@ -525,9 +525,6 @@ impl TestBackend for TestConnectMechanism {
     {
         unimplemented!("not used in tests")
     }
-    fn get_role_secret(&self) -> Result<CachedRoleSecret, console::errors::GetAuthInfoError> {
-        unimplemented!("not used in tests")
-    }
 }
 
 fn helper_create_cached_node_info(cache: &'static NodeInfoCache) -> CachedNodeInfo {
diff --git a/proxy/src/proxy/tests/mitm.rs b/proxy/src/proxy/tests/mitm.rs
index 71f07f4682..33a2162bc7 100644
--- a/proxy/src/proxy/tests/mitm.rs
+++ b/proxy/src/proxy/tests/mitm.rs
@@ -102,7 +102,7 @@ async fn proxy_mitm(
 }
 
 /// taken from tokio-postgres
-pub async fn connect_tls<S, T>(mut stream: S, tls: T) -> T::Stream
+pub(crate) async fn connect_tls<S, T>(mut stream: S, tls: T) -> T::Stream
 where
     S: AsyncRead + AsyncWrite + Unpin,
     T: TlsConnect<S>,
diff --git a/proxy/src/proxy/wake_compute.rs b/proxy/src/proxy/wake_compute.rs
index 5b06e8f054..9b8ac6d29d 100644
--- a/proxy/src/proxy/wake_compute.rs
+++ b/proxy/src/proxy/wake_compute.rs
@@ -12,7 +12,7 @@ use tracing::{error, info, warn};
 
 use super::connect_compute::ComputeConnectBackend;
 
-pub async fn wake_compute<B: ComputeConnectBackend>(
+pub(crate) async fn wake_compute<B: ComputeConnectBackend>(
     num_retries: &mut u32,
     ctx: &RequestMonitoring,
     api: &B,
diff --git a/proxy/src/rate_limiter.rs b/proxy/src/rate_limiter.rs
index 222cd431d2..e5f5867998 100644
--- a/proxy/src/rate_limiter.rs
+++ b/proxy/src/rate_limiter.rs
@@ -1,10 +1,16 @@
+mod leaky_bucket;
 mod limit_algorithm;
 mod limiter;
-pub use limit_algorithm::{
-    aimd::Aimd, DynamicLimiter, Outcome, RateLimitAlgorithm, RateLimiterConfig, Token,
+
+#[cfg(test)]
+pub(crate) use limit_algorithm::aimd::Aimd;
+
+pub(crate) use limit_algorithm::{
+    DynamicLimiter, Outcome, RateLimitAlgorithm, RateLimiterConfig, Token,
 };
-pub use limiter::{BucketRateLimiter, GlobalRateLimiter, RateBucketInfo, WakeComputeRateLimiter};
-mod leaky_bucket;
+pub(crate) use limiter::GlobalRateLimiter;
+
 pub use leaky_bucket::{
     EndpointRateLimiter, LeakyBucketConfig, LeakyBucketRateLimiter, LeakyBucketState,
 };
+pub use limiter::{BucketRateLimiter, RateBucketInfo, WakeComputeRateLimiter};
diff --git a/proxy/src/rate_limiter/leaky_bucket.rs b/proxy/src/rate_limiter/leaky_bucket.rs
index f184e18f4c..fa8cb75256 100644
--- a/proxy/src/rate_limiter/leaky_bucket.rs
+++ b/proxy/src/rate_limiter/leaky_bucket.rs
@@ -35,7 +35,7 @@ impl<K: Hash + Eq> LeakyBucketRateLimiter<K> {
     }
 
     /// Check that number of connections to the endpoint is below `max_rps` rps.
-    pub fn check(&self, key: K, n: u32) -> bool {
+    pub(crate) fn check(&self, key: K, n: u32) -> bool {
         let now = Instant::now();
 
         if self.access_count.fetch_add(1, Ordering::AcqRel) % 2048 == 0 {
@@ -73,8 +73,9 @@ pub struct LeakyBucketState {
     time: Instant,
 }
 
+#[cfg(test)]
 impl LeakyBucketConfig {
-    pub fn new(rps: f64, max: f64) -> Self {
+    pub(crate) fn new(rps: f64, max: f64) -> Self {
         assert!(rps > 0.0, "rps must be positive");
         assert!(max > 0.0, "max must be positive");
         Self { rps, max }
@@ -82,7 +83,7 @@ impl LeakyBucketConfig {
 }
 
 impl LeakyBucketState {
-    pub fn new() -> Self {
+    pub(crate) fn new() -> Self {
         Self {
             filled: 0.0,
             time: Instant::now(),
@@ -100,7 +101,7 @@ impl LeakyBucketState {
         self.filled == 0.0
     }
 
-    pub fn check(&mut self, info: &LeakyBucketConfig, now: Instant, n: f64) -> bool {
+    pub(crate) fn check(&mut self, info: &LeakyBucketConfig, now: Instant, n: f64) -> bool {
         self.update(info, now);
 
         if self.filled + n > info.max {
diff --git a/proxy/src/rate_limiter/limit_algorithm.rs b/proxy/src/rate_limiter/limit_algorithm.rs
index bc16837f65..25607b7e10 100644
--- a/proxy/src/rate_limiter/limit_algorithm.rs
+++ b/proxy/src/rate_limiter/limit_algorithm.rs
@@ -8,13 +8,13 @@ use tokio::{
 
 use self::aimd::Aimd;
 
-pub mod aimd;
+pub(crate) mod aimd;
 
 /// Whether a job succeeded or failed as a result of congestion/overload.
 ///
 /// Errors not considered to be caused by overload should be ignored.
 #[derive(Debug, Clone, Copy, PartialEq, Eq)]
-pub enum Outcome {
+pub(crate) enum Outcome {
     /// The job succeeded, or failed in a way unrelated to overload.
     Success,
     /// The job failed because of overload, e.g. it timed out or an explicit backpressure signal
@@ -23,14 +23,14 @@ pub enum Outcome {
 }
 
 /// An algorithm for controlling a concurrency limit.
-pub trait LimitAlgorithm: Send + Sync + 'static {
+pub(crate) trait LimitAlgorithm: Send + Sync + 'static {
     /// Update the concurrency limit in response to a new job completion.
     fn update(&self, old_limit: usize, sample: Sample) -> usize;
 }
 
 /// The result of a job (or jobs), including the [`Outcome`] (loss) and latency (delay).
 #[derive(Debug, Clone, PartialEq, Eq, Copy)]
-pub struct Sample {
+pub(crate) struct Sample {
     pub(crate) latency: Duration,
     /// Jobs in flight when the sample was taken.
     pub(crate) in_flight: usize,
@@ -39,7 +39,7 @@ pub struct Sample {
 
 #[derive(Clone, Copy, Debug, Default, serde::Deserialize, PartialEq)]
 #[serde(rename_all = "snake_case")]
-pub enum RateLimitAlgorithm {
+pub(crate) enum RateLimitAlgorithm {
     #[default]
     Fixed,
     Aimd {
@@ -48,7 +48,7 @@ pub enum RateLimitAlgorithm {
     },
 }
 
-pub struct Fixed;
+pub(crate) struct Fixed;
 
 impl LimitAlgorithm for Fixed {
     fn update(&self, old_limit: usize, _sample: Sample) -> usize {
@@ -59,12 +59,12 @@ impl LimitAlgorithm for Fixed {
 #[derive(Clone, Copy, Debug, serde::Deserialize, PartialEq)]
 pub struct RateLimiterConfig {
     #[serde(flatten)]
-    pub algorithm: RateLimitAlgorithm,
-    pub initial_limit: usize,
+    pub(crate) algorithm: RateLimitAlgorithm,
+    pub(crate) initial_limit: usize,
 }
 
 impl RateLimiterConfig {
-    pub fn create_rate_limit_algorithm(self) -> Box<dyn LimitAlgorithm> {
+    pub(crate) fn create_rate_limit_algorithm(self) -> Box<dyn LimitAlgorithm> {
         match self.algorithm {
             RateLimitAlgorithm::Fixed => Box::new(Fixed),
             RateLimitAlgorithm::Aimd { conf } => Box::new(conf),
@@ -72,7 +72,7 @@ impl RateLimiterConfig {
     }
 }
 
-pub struct LimiterInner {
+pub(crate) struct LimiterInner {
     alg: Box<dyn LimitAlgorithm>,
     available: usize,
     limit: usize,
@@ -114,7 +114,7 @@ impl LimiterInner {
 ///
 /// The limit will be automatically adjusted based on observed latency (delay) and/or failures
 /// caused by overload (loss).
-pub struct DynamicLimiter {
+pub(crate) struct DynamicLimiter {
     config: RateLimiterConfig,
     inner: Mutex<LimiterInner>,
     // to notify when a token is available
@@ -124,7 +124,7 @@ pub struct DynamicLimiter {
 /// A concurrency token, required to run a job.
 ///
 /// Release the token back to the [`DynamicLimiter`] after the job is complete.
-pub struct Token {
+pub(crate) struct Token {
     start: Instant,
     limiter: Option<Arc<DynamicLimiter>>,
 }
@@ -133,14 +133,14 @@ pub struct Token {
 ///
 /// Not guaranteed to be consistent under high concurrency.
 #[derive(Debug, Clone, Copy)]
-pub struct LimiterState {
+#[cfg(test)]
+struct LimiterState {
     limit: usize,
-    in_flight: usize,
 }
 
 impl DynamicLimiter {
     /// Create a limiter with a given limit control algorithm.
-    pub fn new(config: RateLimiterConfig) -> Arc<Self> {
+    pub(crate) fn new(config: RateLimiterConfig) -> Arc<Self> {
         let ready = Notify::new();
         ready.notify_one();
 
@@ -157,7 +157,10 @@ impl DynamicLimiter {
     }
 
     /// Try to acquire a concurrency [Token], waiting for `duration` if there are none available.
-    pub async fn acquire_timeout(self: &Arc<Self>, duration: Duration) -> Result<Token, Elapsed> {
+    pub(crate) async fn acquire_timeout(
+        self: &Arc<Self>,
+        duration: Duration,
+    ) -> Result<Token, Elapsed> {
         tokio::time::timeout(duration, self.acquire()).await?
     }
 
@@ -208,12 +211,10 @@ impl DynamicLimiter {
     }
 
     /// The current state of the limiter.
-    pub fn state(&self) -> LimiterState {
+    #[cfg(test)]
+    fn state(&self) -> LimiterState {
         let inner = self.inner.lock();
-        LimiterState {
-            limit: inner.limit,
-            in_flight: inner.in_flight,
-        }
+        LimiterState { limit: inner.limit }
     }
 }
 
@@ -224,22 +225,22 @@ impl Token {
             limiter: Some(limiter),
         }
     }
-    pub fn disabled() -> Self {
+    pub(crate) fn disabled() -> Self {
         Self {
             start: Instant::now(),
             limiter: None,
         }
     }
 
-    pub fn is_disabled(&self) -> bool {
+    pub(crate) fn is_disabled(&self) -> bool {
         self.limiter.is_none()
     }
 
-    pub fn release(mut self, outcome: Outcome) {
+    pub(crate) fn release(mut self, outcome: Outcome) {
         self.release_mut(Some(outcome));
     }
 
-    pub fn release_mut(&mut self, outcome: Option<Outcome>) {
+    pub(crate) fn release_mut(&mut self, outcome: Option<Outcome>) {
         if let Some(limiter) = self.limiter.take() {
             limiter.release_inner(self.start, outcome);
         }
@@ -252,13 +253,10 @@ impl Drop for Token {
     }
 }
 
+#[cfg(test)]
 impl LimiterState {
     /// The current concurrency limit.
-    pub fn limit(&self) -> usize {
+    fn limit(self) -> usize {
         self.limit
     }
-    /// The number of jobs in flight.
-    pub fn in_flight(&self) -> usize {
-        self.in_flight
-    }
 }
diff --git a/proxy/src/rate_limiter/limit_algorithm/aimd.rs b/proxy/src/rate_limiter/limit_algorithm/aimd.rs
index d669492fa6..86b56e38fb 100644
--- a/proxy/src/rate_limiter/limit_algorithm/aimd.rs
+++ b/proxy/src/rate_limiter/limit_algorithm/aimd.rs
@@ -10,17 +10,17 @@ use super::{LimitAlgorithm, Outcome, Sample};
 ///
 /// Reduces available concurrency by a factor when load-based errors are detected.
 #[derive(Clone, Copy, Debug, serde::Deserialize, PartialEq)]
-pub struct Aimd {
+pub(crate) struct Aimd {
     /// Minimum limit for AIMD algorithm.
-    pub min: usize,
+    pub(crate) min: usize,
     /// Maximum limit for AIMD algorithm.
-    pub max: usize,
+    pub(crate) max: usize,
     /// Decrease AIMD decrease by value in case of error.
-    pub dec: f32,
+    pub(crate) dec: f32,
     /// Increase AIMD increase by value in case of success.
-    pub inc: usize,
+    pub(crate) inc: usize,
     /// A threshold below which the limit won't be increased.
-    pub utilisation: f32,
+    pub(crate) utilisation: f32,
 }
 
 impl LimitAlgorithm for Aimd {
diff --git a/proxy/src/rate_limiter/limiter.rs b/proxy/src/rate_limiter/limiter.rs
index 5db4efed37..be529f174d 100644
--- a/proxy/src/rate_limiter/limiter.rs
+++ b/proxy/src/rate_limiter/limiter.rs
@@ -17,13 +17,13 @@ use tracing::info;
 
 use crate::intern::EndpointIdInt;
 
-pub struct GlobalRateLimiter {
+pub(crate) struct GlobalRateLimiter {
     data: Vec<RateBucket>,
     info: Vec<RateBucketInfo>,
 }
 
 impl GlobalRateLimiter {
-    pub fn new(info: Vec<RateBucketInfo>) -> Self {
+    pub(crate) fn new(info: Vec<RateBucketInfo>) -> Self {
         Self {
             data: vec![
                 RateBucket {
@@ -37,7 +37,7 @@ impl GlobalRateLimiter {
     }
 
     /// Check that number of connections is below `max_rps` rps.
-    pub fn check(&mut self) -> bool {
+    pub(crate) fn check(&mut self) -> bool {
         let now = Instant::now();
 
         let should_allow_request = self
@@ -96,9 +96,9 @@ impl RateBucket {
 
 #[derive(Clone, Copy, PartialEq)]
 pub struct RateBucketInfo {
-    pub interval: Duration,
+    pub(crate) interval: Duration,
     // requests per interval
-    pub max_rpi: u32,
+    pub(crate) max_rpi: u32,
 }
 
 impl std::fmt::Display for RateBucketInfo {
@@ -192,7 +192,7 @@ impl<K: Hash + Eq, R: Rng, S: BuildHasher + Clone> BucketRateLimiter<K, R, S> {
     }
 
     /// Check that number of connections to the endpoint is below `max_rps` rps.
-    pub fn check(&self, key: K, n: u32) -> bool {
+    pub(crate) fn check(&self, key: K, n: u32) -> bool {
         // do a partial GC every 2k requests. This cleans up ~ 1/64th of the map.
         // worst case memory usage is about:
         //    = 2 * 2048 * 64 * (48B + 72B)
@@ -228,7 +228,7 @@ impl<K: Hash + Eq, R: Rng, S: BuildHasher + Clone> BucketRateLimiter<K, R, S> {
     /// Clean the map. Simple strategy: remove all entries in a random shard.
     /// At worst, we'll double the effective max_rps during the cleanup.
     /// But that way deletion does not aquire mutex on each entry access.
-    pub fn do_gc(&self) {
+    pub(crate) fn do_gc(&self) {
         info!(
             "cleaning up bucket rate limiter, current size = {}",
             self.map.len()
diff --git a/proxy/src/redis/cancellation_publisher.rs b/proxy/src/redis/cancellation_publisher.rs
index c9a946fa4a..95bdfc0965 100644
--- a/proxy/src/redis/cancellation_publisher.rs
+++ b/proxy/src/redis/cancellation_publisher.rs
@@ -109,7 +109,7 @@ impl RedisPublisherClient {
         let _: () = self.client.publish(PROXY_CHANNEL_NAME, payload).await?;
         Ok(())
     }
-    pub async fn try_connect(&mut self) -> anyhow::Result<()> {
+    pub(crate) async fn try_connect(&mut self) -> anyhow::Result<()> {
         match self.client.connect().await {
             Ok(()) => {}
             Err(e) => {
diff --git a/proxy/src/redis/connection_with_credentials_provider.rs b/proxy/src/redis/connection_with_credentials_provider.rs
index c78ee166f1..7d222e2dec 100644
--- a/proxy/src/redis/connection_with_credentials_provider.rs
+++ b/proxy/src/redis/connection_with_credentials_provider.rs
@@ -81,7 +81,7 @@ impl ConnectionWithCredentialsProvider {
         redis::cmd("PING").query_async(con).await
     }
 
-    pub async fn connect(&mut self) -> anyhow::Result<()> {
+    pub(crate) async fn connect(&mut self) -> anyhow::Result<()> {
         let _guard = self.mutex.lock().await;
         if let Some(con) = self.con.as_mut() {
             match Self::ping(con).await {
@@ -149,7 +149,7 @@ impl ConnectionWithCredentialsProvider {
 
     // PubSub does not support credentials refresh.
     // Requires manual reconnection every 12h.
-    pub async fn get_async_pubsub(&self) -> anyhow::Result<redis::aio::PubSub> {
+    pub(crate) async fn get_async_pubsub(&self) -> anyhow::Result<redis::aio::PubSub> {
         Ok(self.get_client().await?.get_async_pubsub().await?)
     }
 
@@ -187,7 +187,10 @@ impl ConnectionWithCredentialsProvider {
     }
     /// Sends an already encoded (packed) command into the TCP socket and
     /// reads the single response from it.
-    pub async fn send_packed_command(&mut self, cmd: &redis::Cmd) -> RedisResult<redis::Value> {
+    pub(crate) async fn send_packed_command(
+        &mut self,
+        cmd: &redis::Cmd,
+    ) -> RedisResult<redis::Value> {
         // Clone connection to avoid having to lock the ArcSwap in write mode
         let con = self.con.as_mut().ok_or(redis::RedisError::from((
             redis::ErrorKind::IoError,
@@ -199,7 +202,7 @@ impl ConnectionWithCredentialsProvider {
     /// Sends multiple already encoded (packed) command into the TCP socket
     /// and reads `count` responses from it.  This is used to implement
     /// pipelining.
-    pub async fn send_packed_commands(
+    pub(crate) async fn send_packed_commands(
         &mut self,
         cmd: &redis::Pipeline,
         offset: usize,
diff --git a/proxy/src/redis/elasticache.rs b/proxy/src/redis/elasticache.rs
index eded8250af..d118c8f412 100644
--- a/proxy/src/redis/elasticache.rs
+++ b/proxy/src/redis/elasticache.rs
@@ -51,7 +51,7 @@ impl CredentialsProvider {
             credentials_provider,
         }
     }
-    pub async fn provide_credentials(&self) -> anyhow::Result<(String, String)> {
+    pub(crate) async fn provide_credentials(&self) -> anyhow::Result<(String, String)> {
         let aws_credentials = self
             .credentials_provider
             .provide_credentials()
diff --git a/proxy/src/redis/notifications.rs b/proxy/src/redis/notifications.rs
index 31c0e62c2c..36a3443603 100644
--- a/proxy/src/redis/notifications.rs
+++ b/proxy/src/redis/notifications.rs
@@ -58,9 +58,9 @@ pub(crate) struct PasswordUpdate {
 }
 #[derive(Clone, Debug, Serialize, Deserialize, Eq, PartialEq)]
 pub(crate) struct CancelSession {
-    pub region_id: Option<String>,
-    pub cancel_key_data: CancelKeyData,
-    pub session_id: Uuid,
+    pub(crate) region_id: Option<String>,
+    pub(crate) cancel_key_data: CancelKeyData,
+    pub(crate) session_id: Uuid,
 }
 
 fn deserialize_json_string<'de, D, T>(deserializer: D) -> Result<T, D::Error>
@@ -89,7 +89,7 @@ impl<C: ProjectInfoCache + Send + Sync + 'static> Clone for MessageHandler<C> {
 }
 
 impl<C: ProjectInfoCache + Send + Sync + 'static> MessageHandler<C> {
-    pub fn new(
+    pub(crate) fn new(
         cache: Arc<C>,
         cancellation_handler: Arc<CancellationHandler<()>>,
         region_id: String,
@@ -100,10 +100,10 @@ impl<C: ProjectInfoCache + Send + Sync + 'static> MessageHandler<C> {
             region_id,
         }
     }
-    pub async fn increment_active_listeners(&self) {
+    pub(crate) async fn increment_active_listeners(&self) {
         self.cache.increment_active_listeners().await;
     }
-    pub async fn decrement_active_listeners(&self) {
+    pub(crate) async fn decrement_active_listeners(&self) {
         self.cache.decrement_active_listeners().await;
     }
     #[tracing::instrument(skip(self, msg), fields(session_id = tracing::field::Empty))]
diff --git a/proxy/src/sasl.rs b/proxy/src/sasl.rs
index 60207fc824..0a36694359 100644
--- a/proxy/src/sasl.rs
+++ b/proxy/src/sasl.rs
@@ -14,13 +14,13 @@ use crate::error::{ReportableError, UserFacingError};
 use std::io;
 use thiserror::Error;
 
-pub use channel_binding::ChannelBinding;
-pub use messages::FirstMessage;
-pub use stream::{Outcome, SaslStream};
+pub(crate) use channel_binding::ChannelBinding;
+pub(crate) use messages::FirstMessage;
+pub(crate) use stream::{Outcome, SaslStream};
 
 /// Fine-grained auth errors help in writing tests.
 #[derive(Error, Debug)]
-pub enum Error {
+pub(crate) enum Error {
     #[error("Channel binding failed: {0}")]
     ChannelBindingFailed(&'static str),
 
@@ -64,11 +64,11 @@ impl ReportableError for Error {
 }
 
 /// A convenient result type for SASL exchange.
-pub type Result<T> = std::result::Result<T, Error>;
+pub(crate) type Result<T> = std::result::Result<T, Error>;
 
 /// A result of one SASL exchange.
 #[must_use]
-pub enum Step<T, R> {
+pub(crate) enum Step<T, R> {
     /// We should continue exchanging messages.
     Continue(T, String),
     /// The client has been authenticated successfully.
@@ -78,7 +78,7 @@ pub enum Step<T, R> {
 }
 
 /// Every SASL mechanism (e.g. [SCRAM](crate::scram)) is expected to implement this trait.
-pub trait Mechanism: Sized {
+pub(crate) trait Mechanism: Sized {
     /// What's produced as a result of successful authentication.
     type Output;
 
diff --git a/proxy/src/sasl/channel_binding.rs b/proxy/src/sasl/channel_binding.rs
index 6e2d3057ce..fdd011448e 100644
--- a/proxy/src/sasl/channel_binding.rs
+++ b/proxy/src/sasl/channel_binding.rs
@@ -2,7 +2,7 @@
 
 /// Channel binding flag (possibly with params).
 #[derive(Debug, PartialEq, Eq)]
-pub enum ChannelBinding<T> {
+pub(crate) enum ChannelBinding<T> {
     /// Client doesn't support channel binding.
     NotSupportedClient,
     /// Client thinks server doesn't support channel binding.
@@ -12,7 +12,10 @@ pub enum ChannelBinding<T> {
 }
 
 impl<T> ChannelBinding<T> {
-    pub fn and_then<R, E>(self, f: impl FnOnce(T) -> Result<R, E>) -> Result<ChannelBinding<R>, E> {
+    pub(crate) fn and_then<R, E>(
+        self,
+        f: impl FnOnce(T) -> Result<R, E>,
+    ) -> Result<ChannelBinding<R>, E> {
         Ok(match self {
             Self::NotSupportedClient => ChannelBinding::NotSupportedClient,
             Self::NotSupportedServer => ChannelBinding::NotSupportedServer,
@@ -23,7 +26,7 @@ impl<T> ChannelBinding<T> {
 
 impl<'a> ChannelBinding<&'a str> {
     // NB: FromStr doesn't work with lifetimes
-    pub fn parse(input: &'a str) -> Option<Self> {
+    pub(crate) fn parse(input: &'a str) -> Option<Self> {
         Some(match input {
             "n" => Self::NotSupportedClient,
             "y" => Self::NotSupportedServer,
@@ -34,7 +37,7 @@ impl<'a> ChannelBinding<&'a str> {
 
 impl<T: std::fmt::Display> ChannelBinding<T> {
     /// Encode channel binding data as base64 for subsequent checks.
-    pub fn encode<'a, E>(
+    pub(crate) fn encode<'a, E>(
         &self,
         get_cbind_data: impl FnOnce(&T) -> Result<&'a [u8], E>,
     ) -> Result<std::borrow::Cow<'static, str>, E> {
diff --git a/proxy/src/sasl/messages.rs b/proxy/src/sasl/messages.rs
index 2b5ae1785d..6c9a42b2db 100644
--- a/proxy/src/sasl/messages.rs
+++ b/proxy/src/sasl/messages.rs
@@ -5,16 +5,16 @@ use pq_proto::{BeAuthenticationSaslMessage, BeMessage};
 
 /// SASL-specific payload of [`PasswordMessage`](pq_proto::FeMessage::PasswordMessage).
 #[derive(Debug)]
-pub struct FirstMessage<'a> {
+pub(crate) struct FirstMessage<'a> {
     /// Authentication method, e.g. `"SCRAM-SHA-256"`.
-    pub method: &'a str,
+    pub(crate) method: &'a str,
     /// Initial client message.
-    pub message: &'a str,
+    pub(crate) message: &'a str,
 }
 
 impl<'a> FirstMessage<'a> {
     // NB: FromStr doesn't work with lifetimes
-    pub fn parse(bytes: &'a [u8]) -> Option<Self> {
+    pub(crate) fn parse(bytes: &'a [u8]) -> Option<Self> {
         let (method_cstr, tail) = split_cstr(bytes)?;
         let method = method_cstr.to_str().ok()?;
 
diff --git a/proxy/src/sasl/stream.rs b/proxy/src/sasl/stream.rs
index 9115b0f61a..b6becd28e1 100644
--- a/proxy/src/sasl/stream.rs
+++ b/proxy/src/sasl/stream.rs
@@ -7,7 +7,7 @@ use tokio::io::{AsyncRead, AsyncWrite};
 use tracing::info;
 
 /// Abstracts away all peculiarities of the libpq's protocol.
-pub struct SaslStream<'a, S> {
+pub(crate) struct SaslStream<'a, S> {
     /// The underlying stream.
     stream: &'a mut PqStream<S>,
     /// Current password message we received from client.
@@ -17,7 +17,7 @@ pub struct SaslStream<'a, S> {
 }
 
 impl<'a, S> SaslStream<'a, S> {
-    pub fn new(stream: &'a mut PqStream<S>, first: &'a str) -> Self {
+    pub(crate) fn new(stream: &'a mut PqStream<S>, first: &'a str) -> Self {
         Self {
             stream,
             current: bytes::Bytes::new(),
@@ -53,7 +53,7 @@ impl<S: AsyncWrite + Unpin> SaslStream<'_, S> {
 /// It's much easier to match on those two variants
 /// than to peek into a noisy protocol error type.
 #[must_use = "caller must explicitly check for success"]
-pub enum Outcome<R> {
+pub(crate) enum Outcome<R> {
     /// Authentication succeeded and produced some value.
     Success(R),
     /// Authentication failed (reason attached).
@@ -63,7 +63,7 @@ pub enum Outcome<R> {
 impl<S: AsyncRead + AsyncWrite + Unpin> SaslStream<'_, S> {
     /// Perform SASL message exchange according to the underlying algorithm
     /// until user is either authenticated or denied access.
-    pub async fn authenticate<M: Mechanism>(
+    pub(crate) async fn authenticate<M: Mechanism>(
         mut self,
         mut mechanism: M,
     ) -> super::Result<Outcome<M::Output>> {
diff --git a/proxy/src/scram.rs b/proxy/src/scram.rs
index 145e727a74..d058f1c3f8 100644
--- a/proxy/src/scram.rs
+++ b/proxy/src/scram.rs
@@ -15,9 +15,9 @@ mod secret;
 mod signature;
 pub mod threadpool;
 
-pub use exchange::{exchange, Exchange};
-pub use key::ScramKey;
-pub use secret::ServerSecret;
+pub(crate) use exchange::{exchange, Exchange};
+pub(crate) use key::ScramKey;
+pub(crate) use secret::ServerSecret;
 
 use hmac::{Hmac, Mac};
 use sha2::{Digest, Sha256};
@@ -26,8 +26,8 @@ const SCRAM_SHA_256: &str = "SCRAM-SHA-256";
 const SCRAM_SHA_256_PLUS: &str = "SCRAM-SHA-256-PLUS";
 
 /// A list of supported SCRAM methods.
-pub const METHODS: &[&str] = &[SCRAM_SHA_256_PLUS, SCRAM_SHA_256];
-pub const METHODS_WITHOUT_PLUS: &[&str] = &[SCRAM_SHA_256];
+pub(crate) const METHODS: &[&str] = &[SCRAM_SHA_256_PLUS, SCRAM_SHA_256];
+pub(crate) const METHODS_WITHOUT_PLUS: &[&str] = &[SCRAM_SHA_256];
 
 /// Decode base64 into array without any heap allocations
 fn base64_decode_array<const N: usize>(input: impl AsRef<[u8]>) -> Option<[u8; N]> {
diff --git a/proxy/src/scram/countmin.rs b/proxy/src/scram/countmin.rs
index 944bb3c83e..255694b33e 100644
--- a/proxy/src/scram/countmin.rs
+++ b/proxy/src/scram/countmin.rs
@@ -2,7 +2,7 @@ use std::hash::Hash;
 
 /// estimator of hash jobs per second.
 /// <https://en.wikipedia.org/wiki/Count%E2%80%93min_sketch>
-pub struct CountMinSketch {
+pub(crate) struct CountMinSketch {
     // one for each depth
     hashers: Vec<ahash::RandomState>,
     width: usize,
@@ -20,7 +20,7 @@ impl CountMinSketch {
     /// actual <= estimate
     /// estimate <= actual + ε * N with probability 1 - δ
     /// where N is the cardinality of the stream
-    pub fn with_params(epsilon: f64, delta: f64) -> Self {
+    pub(crate) fn with_params(epsilon: f64, delta: f64) -> Self {
         CountMinSketch::new(
             (std::f64::consts::E / epsilon).ceil() as usize,
             (1.0_f64 / delta).ln().ceil() as usize,
@@ -49,7 +49,7 @@ impl CountMinSketch {
         }
     }
 
-    pub fn inc_and_return<T: Hash>(&mut self, t: &T, x: u32) -> u32 {
+    pub(crate) fn inc_and_return<T: Hash>(&mut self, t: &T, x: u32) -> u32 {
         let mut min = u32::MAX;
         for row in 0..self.depth {
             let col = (self.hashers[row].hash_one(t) as usize) % self.width;
@@ -61,7 +61,7 @@ impl CountMinSketch {
         min
     }
 
-    pub fn reset(&mut self) {
+    pub(crate) fn reset(&mut self) {
         self.buckets.clear();
         self.buckets.resize(self.width * self.depth, 0);
     }
diff --git a/proxy/src/scram/exchange.rs b/proxy/src/scram/exchange.rs
index f2494379a5..7fdadc7038 100644
--- a/proxy/src/scram/exchange.rs
+++ b/proxy/src/scram/exchange.rs
@@ -56,14 +56,14 @@ enum ExchangeState {
 }
 
 /// Server's side of SCRAM auth algorithm.
-pub struct Exchange<'a> {
+pub(crate) struct Exchange<'a> {
     state: ExchangeState,
     secret: &'a ServerSecret,
     tls_server_end_point: config::TlsServerEndPoint,
 }
 
 impl<'a> Exchange<'a> {
-    pub fn new(
+    pub(crate) fn new(
         secret: &'a ServerSecret,
         nonce: fn() -> [u8; SCRAM_RAW_NONCE_LEN],
         tls_server_end_point: config::TlsServerEndPoint,
@@ -101,7 +101,7 @@ async fn derive_client_key(
     make_key(b"Client Key").into()
 }
 
-pub async fn exchange(
+pub(crate) async fn exchange(
     pool: &ThreadPool,
     endpoint: EndpointIdInt,
     secret: &ServerSecret,
diff --git a/proxy/src/scram/key.rs b/proxy/src/scram/key.rs
index 32a3dbd203..fe55ff493b 100644
--- a/proxy/src/scram/key.rs
+++ b/proxy/src/scram/key.rs
@@ -3,14 +3,14 @@
 use subtle::ConstantTimeEq;
 
 /// Faithfully taken from PostgreSQL.
-pub const SCRAM_KEY_LEN: usize = 32;
+pub(crate) const SCRAM_KEY_LEN: usize = 32;
 
 /// One of the keys derived from the user's password.
 /// We use the same structure for all keys, i.e.
 /// `ClientKey`, `StoredKey`, and `ServerKey`.
 #[derive(Clone, Default, Eq, Debug)]
 #[repr(transparent)]
-pub struct ScramKey {
+pub(crate) struct ScramKey {
     bytes: [u8; SCRAM_KEY_LEN],
 }
 
@@ -27,11 +27,11 @@ impl ConstantTimeEq for ScramKey {
 }
 
 impl ScramKey {
-    pub fn sha256(&self) -> Self {
+    pub(crate) fn sha256(&self) -> Self {
         super::sha256([self.as_ref()]).into()
     }
 
-    pub fn as_bytes(&self) -> [u8; SCRAM_KEY_LEN] {
+    pub(crate) fn as_bytes(&self) -> [u8; SCRAM_KEY_LEN] {
         self.bytes
     }
 }
diff --git a/proxy/src/scram/messages.rs b/proxy/src/scram/messages.rs
index 54157e450d..fd9e77764c 100644
--- a/proxy/src/scram/messages.rs
+++ b/proxy/src/scram/messages.rs
@@ -8,7 +8,7 @@ use std::fmt;
 use std::ops::Range;
 
 /// Faithfully taken from PostgreSQL.
-pub const SCRAM_RAW_NONCE_LEN: usize = 18;
+pub(crate) const SCRAM_RAW_NONCE_LEN: usize = 18;
 
 /// Although we ignore all extensions, we still have to validate the message.
 fn validate_sasl_extensions<'a>(parts: impl Iterator<Item = &'a str>) -> Option<()> {
@@ -27,18 +27,18 @@ fn validate_sasl_extensions<'a>(parts: impl Iterator<Item = &'a str>) -> Option<
 }
 
 #[derive(Debug)]
-pub struct ClientFirstMessage<'a> {
+pub(crate) struct ClientFirstMessage<'a> {
     /// `client-first-message-bare`.
-    pub bare: &'a str,
+    pub(crate) bare: &'a str,
     /// Channel binding mode.
-    pub cbind_flag: ChannelBinding<&'a str>,
+    pub(crate) cbind_flag: ChannelBinding<&'a str>,
     /// Client nonce.
-    pub nonce: &'a str,
+    pub(crate) nonce: &'a str,
 }
 
 impl<'a> ClientFirstMessage<'a> {
     // NB: FromStr doesn't work with lifetimes
-    pub fn parse(input: &'a str) -> Option<Self> {
+    pub(crate) fn parse(input: &'a str) -> Option<Self> {
         let mut parts = input.split(',');
 
         let cbind_flag = ChannelBinding::parse(parts.next()?)?;
@@ -77,7 +77,7 @@ impl<'a> ClientFirstMessage<'a> {
     }
 
     /// Build a response to [`ClientFirstMessage`].
-    pub fn build_server_first_message(
+    pub(crate) fn build_server_first_message(
         &self,
         nonce: &[u8; SCRAM_RAW_NONCE_LEN],
         salt_base64: &str,
@@ -101,20 +101,20 @@ impl<'a> ClientFirstMessage<'a> {
 }
 
 #[derive(Debug)]
-pub struct ClientFinalMessage<'a> {
+pub(crate) struct ClientFinalMessage<'a> {
     /// `client-final-message-without-proof`.
-    pub without_proof: &'a str,
+    pub(crate) without_proof: &'a str,
     /// Channel binding data (base64).
-    pub channel_binding: &'a str,
+    pub(crate) channel_binding: &'a str,
     /// Combined client & server nonce.
-    pub nonce: &'a str,
+    pub(crate) nonce: &'a str,
     /// Client auth proof.
-    pub proof: [u8; SCRAM_KEY_LEN],
+    pub(crate) proof: [u8; SCRAM_KEY_LEN],
 }
 
 impl<'a> ClientFinalMessage<'a> {
     // NB: FromStr doesn't work with lifetimes
-    pub fn parse(input: &'a str) -> Option<Self> {
+    pub(crate) fn parse(input: &'a str) -> Option<Self> {
         let (without_proof, proof) = input.rsplit_once(',')?;
 
         let mut parts = without_proof.split(',');
@@ -135,7 +135,7 @@ impl<'a> ClientFinalMessage<'a> {
     }
 
     /// Build a response to [`ClientFinalMessage`].
-    pub fn build_server_final_message(
+    pub(crate) fn build_server_final_message(
         &self,
         signature_builder: SignatureBuilder<'_>,
         server_key: &ScramKey,
@@ -153,7 +153,7 @@ impl<'a> ClientFinalMessage<'a> {
 
 /// We need to keep a convenient representation of this
 /// message for the next authentication step.
-pub struct OwnedServerFirstMessage {
+pub(crate) struct OwnedServerFirstMessage {
     /// Owned `server-first-message`.
     message: String,
     /// Slice into `message`.
@@ -163,13 +163,13 @@ pub struct OwnedServerFirstMessage {
 impl OwnedServerFirstMessage {
     /// Extract combined nonce from the message.
     #[inline(always)]
-    pub fn nonce(&self) -> &str {
+    pub(crate) fn nonce(&self) -> &str {
         &self.message[self.nonce.clone()]
     }
 
     /// Get reference to a text representation of the message.
     #[inline(always)]
-    pub fn as_str(&self) -> &str {
+    pub(crate) fn as_str(&self) -> &str {
         &self.message
     }
 }
diff --git a/proxy/src/scram/pbkdf2.rs b/proxy/src/scram/pbkdf2.rs
index f690cc7738..d5ed9002ad 100644
--- a/proxy/src/scram/pbkdf2.rs
+++ b/proxy/src/scram/pbkdf2.rs
@@ -4,7 +4,7 @@ use hmac::{
 };
 use sha2::Sha256;
 
-pub struct Pbkdf2 {
+pub(crate) struct Pbkdf2 {
     hmac: Hmac<Sha256>,
     prev: GenericArray<u8, U32>,
     hi: GenericArray<u8, U32>,
@@ -13,7 +13,7 @@ pub struct Pbkdf2 {
 
 // inspired from <https://github.com/neondatabase/rust-postgres/blob/20031d7a9ee1addeae6e0968e3899ae6bf01cee2/postgres-protocol/src/authentication/sasl.rs#L36-L61>
 impl Pbkdf2 {
-    pub fn start(str: &[u8], salt: &[u8], iterations: u32) -> Self {
+    pub(crate) fn start(str: &[u8], salt: &[u8], iterations: u32) -> Self {
         let hmac =
             Hmac::<Sha256>::new_from_slice(str).expect("HMAC is able to accept all key sizes");
 
@@ -33,11 +33,11 @@ impl Pbkdf2 {
         }
     }
 
-    pub fn cost(&self) -> u32 {
+    pub(crate) fn cost(&self) -> u32 {
         (self.iterations).clamp(0, 4096)
     }
 
-    pub fn turn(&mut self) -> std::task::Poll<[u8; 32]> {
+    pub(crate) fn turn(&mut self) -> std::task::Poll<[u8; 32]> {
         let Self {
             hmac,
             prev,
diff --git a/proxy/src/scram/secret.rs b/proxy/src/scram/secret.rs
index a08cb943c3..8c6a08d432 100644
--- a/proxy/src/scram/secret.rs
+++ b/proxy/src/scram/secret.rs
@@ -8,22 +8,22 @@ use super::key::ScramKey;
 /// Server secret is produced from user's password,
 /// and is used throughout the authentication process.
 #[derive(Clone, Eq, PartialEq, Debug)]
-pub struct ServerSecret {
+pub(crate) struct ServerSecret {
     /// Number of iterations for `PBKDF2` function.
-    pub iterations: u32,
+    pub(crate) iterations: u32,
     /// Salt used to hash user's password.
-    pub salt_base64: String,
+    pub(crate) salt_base64: String,
     /// Hashed `ClientKey`.
-    pub stored_key: ScramKey,
+    pub(crate) stored_key: ScramKey,
     /// Used by client to verify server's signature.
-    pub server_key: ScramKey,
+    pub(crate) server_key: ScramKey,
     /// Should auth fail no matter what?
     /// This is exactly the case for mocked secrets.
-    pub doomed: bool,
+    pub(crate) doomed: bool,
 }
 
 impl ServerSecret {
-    pub fn parse(input: &str) -> Option<Self> {
+    pub(crate) fn parse(input: &str) -> Option<Self> {
         // SCRAM-SHA-256$<iterations>:<salt>$<storedkey>:<serverkey>
         let s = input.strip_prefix("SCRAM-SHA-256$")?;
         let (params, keys) = s.split_once('$')?;
@@ -42,7 +42,7 @@ impl ServerSecret {
         Some(secret)
     }
 
-    pub fn is_password_invalid(&self, client_key: &ScramKey) -> Choice {
+    pub(crate) fn is_password_invalid(&self, client_key: &ScramKey) -> Choice {
         // constant time to not leak partial key match
         client_key.sha256().ct_ne(&self.stored_key) | Choice::from(self.doomed as u8)
     }
@@ -50,7 +50,7 @@ impl ServerSecret {
     /// To avoid revealing information to an attacker, we use a
     /// mocked server secret even if the user doesn't exist.
     /// See `auth-scram.c : mock_scram_secret` for details.
-    pub fn mock(nonce: [u8; 32]) -> Self {
+    pub(crate) fn mock(nonce: [u8; 32]) -> Self {
         Self {
             // this doesn't reveal much information as we're going to use
             // iteration count 1 for our generated passwords going forward.
@@ -66,7 +66,7 @@ impl ServerSecret {
     /// Build a new server secret from the prerequisites.
     /// XXX: We only use this function in tests.
     #[cfg(test)]
-    pub async fn build(password: &str) -> Option<Self> {
+    pub(crate) async fn build(password: &str) -> Option<Self> {
         Self::parse(&postgres_protocol::password::scram_sha_256(password.as_bytes()).await)
     }
 }
diff --git a/proxy/src/scram/signature.rs b/proxy/src/scram/signature.rs
index 1c2811d757..d3255cf2ca 100644
--- a/proxy/src/scram/signature.rs
+++ b/proxy/src/scram/signature.rs
@@ -4,14 +4,14 @@ use super::key::{ScramKey, SCRAM_KEY_LEN};
 
 /// A collection of message parts needed to derive the client's signature.
 #[derive(Debug)]
-pub struct SignatureBuilder<'a> {
-    pub client_first_message_bare: &'a str,
-    pub server_first_message: &'a str,
-    pub client_final_message_without_proof: &'a str,
+pub(crate) struct SignatureBuilder<'a> {
+    pub(crate) client_first_message_bare: &'a str,
+    pub(crate) server_first_message: &'a str,
+    pub(crate) client_final_message_without_proof: &'a str,
 }
 
 impl SignatureBuilder<'_> {
-    pub fn build(&self, key: &ScramKey) -> Signature {
+    pub(crate) fn build(&self, key: &ScramKey) -> Signature {
         let parts = [
             self.client_first_message_bare.as_bytes(),
             b",",
@@ -28,13 +28,13 @@ impl SignatureBuilder<'_> {
 /// produces `ClientKey` that we need for authentication.
 #[derive(Debug)]
 #[repr(transparent)]
-pub struct Signature {
+pub(crate) struct Signature {
     bytes: [u8; SCRAM_KEY_LEN],
 }
 
 impl Signature {
     /// Derive `ClientKey` from client's signature and proof.
-    pub fn derive_client_key(&self, proof: &[u8; SCRAM_KEY_LEN]) -> ScramKey {
+    pub(crate) fn derive_client_key(&self, proof: &[u8; SCRAM_KEY_LEN]) -> ScramKey {
         // This is how the proof is calculated:
         //
         // 1. sha256(ClientKey) -> StoredKey
diff --git a/proxy/src/scram/threadpool.rs b/proxy/src/scram/threadpool.rs
index 8fbaecf93d..262c6d146e 100644
--- a/proxy/src/scram/threadpool.rs
+++ b/proxy/src/scram/threadpool.rs
@@ -68,7 +68,7 @@ impl ThreadPool {
         pool
     }
 
-    pub fn spawn_job(
+    pub(crate) fn spawn_job(
         &self,
         endpoint: EndpointIdInt,
         pbkdf2: Pbkdf2,
diff --git a/proxy/src/serverless.rs b/proxy/src/serverless.rs
index d9a9019746..84f98cb8ad 100644
--- a/proxy/src/serverless.rs
+++ b/proxy/src/serverless.rs
@@ -25,8 +25,6 @@ use hyper_util::rt::TokioExecutor;
 use hyper_util::server::conn::auto::Builder;
 use rand::rngs::StdRng;
 use rand::SeedableRng;
-pub use reqwest_middleware::{ClientWithMiddleware, Error};
-pub use reqwest_retry::{policies::ExponentialBackoff, RetryTransientMiddleware};
 use tokio::io::{AsyncRead, AsyncWrite};
 use tokio::time::timeout;
 use tokio_rustls::TlsAcceptor;
@@ -50,7 +48,7 @@ use tokio_util::sync::CancellationToken;
 use tracing::{error, info, warn, Instrument};
 use utils::http::error::ApiError;
 
-pub const SERVERLESS_DRIVER_SNI: &str = "api";
+pub(crate) const SERVERLESS_DRIVER_SNI: &str = "api";
 
 pub async fn task_main(
     config: &'static ProxyConfig,
@@ -178,9 +176,9 @@ pub async fn task_main(
     Ok(())
 }
 
-pub trait AsyncReadWrite: AsyncRead + AsyncWrite + Send + 'static {}
+pub(crate) trait AsyncReadWrite: AsyncRead + AsyncWrite + Send + 'static {}
 impl<T: AsyncRead + AsyncWrite + Send + 'static> AsyncReadWrite for T {}
-pub type AsyncRW = Pin<Box<dyn AsyncReadWrite>>;
+pub(crate) type AsyncRW = Pin<Box<dyn AsyncReadWrite>>;
 
 #[async_trait]
 trait MaybeTlsAcceptor: Send + Sync + 'static {
diff --git a/proxy/src/serverless/backend.rs b/proxy/src/serverless/backend.rs
index 9cc271c588..2699411c28 100644
--- a/proxy/src/serverless/backend.rs
+++ b/proxy/src/serverless/backend.rs
@@ -29,14 +29,14 @@ use crate::{
 
 use super::conn_pool::{poll_client, AuthData, Client, ConnInfo, GlobalConnPool};
 
-pub struct PoolingBackend {
-    pub pool: Arc<GlobalConnPool<tokio_postgres::Client>>,
-    pub config: &'static ProxyConfig,
-    pub endpoint_rate_limiter: Arc<EndpointRateLimiter>,
+pub(crate) struct PoolingBackend {
+    pub(crate) pool: Arc<GlobalConnPool<tokio_postgres::Client>>,
+    pub(crate) config: &'static ProxyConfig,
+    pub(crate) endpoint_rate_limiter: Arc<EndpointRateLimiter>,
 }
 
 impl PoolingBackend {
-    pub async fn authenticate_with_password(
+    pub(crate) async fn authenticate_with_password(
         &self,
         ctx: &RequestMonitoring,
         config: &AuthenticationConfig,
@@ -98,7 +98,7 @@ impl PoolingBackend {
         })
     }
 
-    pub async fn authenticate_with_jwt(
+    pub(crate) async fn authenticate_with_jwt(
         &self,
         ctx: &RequestMonitoring,
         user_info: &ComputeUserInfo,
@@ -135,7 +135,7 @@ impl PoolingBackend {
     // we reuse the code from the usual proxy and we need to prepare few structures
     // that this code expects.
     #[tracing::instrument(fields(pid = tracing::field::Empty), skip_all)]
-    pub async fn connect_to_compute(
+    pub(crate) async fn connect_to_compute(
         &self,
         ctx: &RequestMonitoring,
         conn_info: ConnInfo,
@@ -175,7 +175,7 @@ impl PoolingBackend {
 }
 
 #[derive(Debug, thiserror::Error)]
-pub enum HttpConnError {
+pub(crate) enum HttpConnError {
     #[error("pooled connection closed at inconsistent state")]
     ConnectionClosedAbruptly(#[from] tokio::sync::watch::error::SendError<uuid::Uuid>),
     #[error("could not connection to compute")]
diff --git a/proxy/src/serverless/cancel_set.rs b/proxy/src/serverless/cancel_set.rs
index 390df7f4f7..7659745473 100644
--- a/proxy/src/serverless/cancel_set.rs
+++ b/proxy/src/serverless/cancel_set.rs
@@ -22,7 +22,7 @@ pub struct CancelSet {
     hasher: Hasher,
 }
 
-pub struct CancelShard {
+pub(crate) struct CancelShard {
     tokens: IndexMap<uuid::Uuid, (Instant, CancellationToken), Hasher>,
 }
 
@@ -40,7 +40,7 @@ impl CancelSet {
         }
     }
 
-    pub fn take(&self) -> Option<CancellationToken> {
+    pub(crate) fn take(&self) -> Option<CancellationToken> {
         for _ in 0..4 {
             if let Some(token) = self.take_raw(thread_rng().gen()) {
                 return Some(token);
@@ -50,12 +50,12 @@ impl CancelSet {
         None
     }
 
-    pub fn take_raw(&self, rng: usize) -> Option<CancellationToken> {
+    pub(crate) fn take_raw(&self, rng: usize) -> Option<CancellationToken> {
         NonZeroUsize::new(self.shards.len())
             .and_then(|len| self.shards[rng % len].lock().take(rng / len))
     }
 
-    pub fn insert(&self, id: uuid::Uuid, token: CancellationToken) -> CancelGuard<'_> {
+    pub(crate) fn insert(&self, id: uuid::Uuid, token: CancellationToken) -> CancelGuard<'_> {
         let shard = NonZeroUsize::new(self.shards.len()).map(|len| {
             let hash = self.hasher.hash_one(id) as usize;
             let shard = &self.shards[hash % len];
@@ -88,7 +88,7 @@ impl CancelShard {
     }
 }
 
-pub struct CancelGuard<'a> {
+pub(crate) struct CancelGuard<'a> {
     shard: Option<&'a Mutex<CancelShard>>,
     id: Uuid,
 }
diff --git a/proxy/src/serverless/conn_pool.rs b/proxy/src/serverless/conn_pool.rs
index 476083d71e..bea599e9b9 100644
--- a/proxy/src/serverless/conn_pool.rs
+++ b/proxy/src/serverless/conn_pool.rs
@@ -30,25 +30,25 @@ use tracing::{info, info_span, Instrument};
 use super::backend::HttpConnError;
 
 #[derive(Debug, Clone)]
-pub struct ConnInfo {
-    pub user_info: ComputeUserInfo,
-    pub dbname: DbName,
-    pub auth: AuthData,
+pub(crate) struct ConnInfo {
+    pub(crate) user_info: ComputeUserInfo,
+    pub(crate) dbname: DbName,
+    pub(crate) auth: AuthData,
 }
 
 #[derive(Debug, Clone)]
-pub enum AuthData {
+pub(crate) enum AuthData {
     Password(SmallVec<[u8; 16]>),
     Jwt(String),
 }
 
 impl ConnInfo {
     // hm, change to hasher to avoid cloning?
-    pub fn db_and_user(&self) -> (DbName, RoleName) {
+    pub(crate) fn db_and_user(&self) -> (DbName, RoleName) {
         (self.dbname.clone(), self.user_info.user.clone())
     }
 
-    pub fn endpoint_cache_key(&self) -> Option<EndpointCacheKey> {
+    pub(crate) fn endpoint_cache_key(&self) -> Option<EndpointCacheKey> {
         // We don't want to cache http connections for ephemeral endpoints.
         if self.user_info.options.is_ephemeral() {
             None
@@ -79,7 +79,7 @@ struct ConnPoolEntry<C: ClientInnerExt> {
 
 // Per-endpoint connection pool, (dbname, username) -> DbUserConnPool
 // Number of open connections is limited by the `max_conns_per_endpoint`.
-pub struct EndpointConnPool<C: ClientInnerExt> {
+pub(crate) struct EndpointConnPool<C: ClientInnerExt> {
     pools: HashMap<(DbName, RoleName), DbUserConnPool<C>>,
     total_conns: usize,
     max_conns: usize,
@@ -198,7 +198,7 @@ impl<C: ClientInnerExt> Drop for EndpointConnPool<C> {
     }
 }
 
-pub struct DbUserConnPool<C: ClientInnerExt> {
+pub(crate) struct DbUserConnPool<C: ClientInnerExt> {
     conns: Vec<ConnPoolEntry<C>>,
 }
 
@@ -241,7 +241,7 @@ impl<C: ClientInnerExt> DbUserConnPool<C> {
     }
 }
 
-pub struct GlobalConnPool<C: ClientInnerExt> {
+pub(crate) struct GlobalConnPool<C: ClientInnerExt> {
     // endpoint -> per-endpoint connection pool
     //
     // That should be a fairly conteded map, so return reference to the per-endpoint
@@ -282,7 +282,7 @@ pub struct GlobalConnPoolOptions {
 }
 
 impl<C: ClientInnerExt> GlobalConnPool<C> {
-    pub fn new(config: &'static crate::config::HttpConfig) -> Arc<Self> {
+    pub(crate) fn new(config: &'static crate::config::HttpConfig) -> Arc<Self> {
         let shards = config.pool_options.pool_shards;
         Arc::new(Self {
             global_pool: DashMap::with_shard_amount(shards),
@@ -293,21 +293,21 @@ impl<C: ClientInnerExt> GlobalConnPool<C> {
     }
 
     #[cfg(test)]
-    pub fn get_global_connections_count(&self) -> usize {
+    pub(crate) fn get_global_connections_count(&self) -> usize {
         self.global_connections_count
             .load(atomic::Ordering::Relaxed)
     }
 
-    pub fn get_idle_timeout(&self) -> Duration {
+    pub(crate) fn get_idle_timeout(&self) -> Duration {
         self.config.pool_options.idle_timeout
     }
 
-    pub fn shutdown(&self) {
+    pub(crate) fn shutdown(&self) {
         // drops all strong references to endpoint-pools
         self.global_pool.clear();
     }
 
-    pub async fn gc_worker(&self, mut rng: impl Rng) {
+    pub(crate) async fn gc_worker(&self, mut rng: impl Rng) {
         let epoch = self.config.pool_options.gc_epoch;
         let mut interval = tokio::time::interval(epoch / (self.global_pool.shards().len()) as u32);
         loop {
@@ -381,7 +381,7 @@ impl<C: ClientInnerExt> GlobalConnPool<C> {
         }
     }
 
-    pub fn get(
+    pub(crate) fn get(
         self: &Arc<Self>,
         ctx: &RequestMonitoring,
         conn_info: &ConnInfo,
@@ -468,7 +468,7 @@ impl<C: ClientInnerExt> GlobalConnPool<C> {
     }
 }
 
-pub fn poll_client<C: ClientInnerExt>(
+pub(crate) fn poll_client<C: ClientInnerExt>(
     global_pool: Arc<GlobalConnPool<C>>,
     ctx: &RequestMonitoring,
     conn_info: ConnInfo,
@@ -596,7 +596,7 @@ impl<C: ClientInnerExt> Drop for ClientInner<C> {
     }
 }
 
-pub trait ClientInnerExt: Sync + Send + 'static {
+pub(crate) trait ClientInnerExt: Sync + Send + 'static {
     fn is_closed(&self) -> bool;
     fn get_process_id(&self) -> i32;
 }
@@ -611,13 +611,13 @@ impl ClientInnerExt for tokio_postgres::Client {
 }
 
 impl<C: ClientInnerExt> ClientInner<C> {
-    pub fn is_closed(&self) -> bool {
+    pub(crate) fn is_closed(&self) -> bool {
         self.inner.is_closed()
     }
 }
 
 impl<C: ClientInnerExt> Client<C> {
-    pub fn metrics(&self) -> Arc<MetricCounter> {
+    pub(crate) fn metrics(&self) -> Arc<MetricCounter> {
         let aux = &self.inner.as_ref().unwrap().aux;
         USAGE_METRICS.register(Ids {
             endpoint_id: aux.endpoint_id,
@@ -626,14 +626,14 @@ impl<C: ClientInnerExt> Client<C> {
     }
 }
 
-pub struct Client<C: ClientInnerExt> {
+pub(crate) struct Client<C: ClientInnerExt> {
     span: Span,
     inner: Option<ClientInner<C>>,
     conn_info: ConnInfo,
     pool: Weak<RwLock<EndpointConnPool<C>>>,
 }
 
-pub struct Discard<'a, C: ClientInnerExt> {
+pub(crate) struct Discard<'a, C: ClientInnerExt> {
     conn_info: &'a ConnInfo,
     pool: &'a mut Weak<RwLock<EndpointConnPool<C>>>,
 }
@@ -651,7 +651,7 @@ impl<C: ClientInnerExt> Client<C> {
             pool,
         }
     }
-    pub fn inner(&mut self) -> (&mut C, Discard<'_, C>) {
+    pub(crate) fn inner(&mut self) -> (&mut C, Discard<'_, C>) {
         let Self {
             inner,
             pool,
@@ -664,13 +664,13 @@ impl<C: ClientInnerExt> Client<C> {
 }
 
 impl<C: ClientInnerExt> Discard<'_, C> {
-    pub fn check_idle(&mut self, status: ReadyForQueryStatus) {
+    pub(crate) fn check_idle(&mut self, status: ReadyForQueryStatus) {
         let conn_info = &self.conn_info;
         if status != ReadyForQueryStatus::Idle && std::mem::take(self.pool).strong_count() > 0 {
             info!("pool: throwing away connection '{conn_info}' because connection is not idle");
         }
     }
-    pub fn discard(&mut self) {
+    pub(crate) fn discard(&mut self) {
         let conn_info = &self.conn_info;
         if std::mem::take(self.pool).strong_count() > 0 {
             info!("pool: throwing away connection '{conn_info}' because connection is potentially in a broken state");
diff --git a/proxy/src/serverless/http_util.rs b/proxy/src/serverless/http_util.rs
index 701ab58f63..abf0ffe290 100644
--- a/proxy/src/serverless/http_util.rs
+++ b/proxy/src/serverless/http_util.rs
@@ -11,7 +11,7 @@ use serde::Serialize;
 use utils::http::error::ApiError;
 
 /// Like [`ApiError::into_response`]
-pub fn api_error_into_response(this: ApiError) -> Response<Full<Bytes>> {
+pub(crate) fn api_error_into_response(this: ApiError) -> Response<Full<Bytes>> {
     match this {
         ApiError::BadRequest(err) => HttpErrorBody::response_from_msg_and_status(
             format!("{err:#?}"), // use debug printing so that we give the cause
@@ -59,7 +59,7 @@ pub fn api_error_into_response(this: ApiError) -> Response<Full<Bytes>> {
 /// Same as [`utils::http::error::HttpErrorBody`]
 #[derive(Serialize)]
 struct HttpErrorBody {
-    pub msg: String,
+    pub(crate) msg: String,
 }
 
 impl HttpErrorBody {
@@ -80,7 +80,7 @@ impl HttpErrorBody {
 }
 
 /// Same as [`utils::http::json::json_response`]
-pub fn json_response<T: Serialize>(
+pub(crate) fn json_response<T: Serialize>(
     status: StatusCode,
     data: T,
 ) -> Result<Response<Full<Bytes>>, ApiError> {
diff --git a/proxy/src/serverless/json.rs b/proxy/src/serverless/json.rs
index 3776971fa1..9f328a0e1d 100644
--- a/proxy/src/serverless/json.rs
+++ b/proxy/src/serverless/json.rs
@@ -8,7 +8,7 @@ use tokio_postgres::Row;
 // Convert json non-string types to strings, so that they can be passed to Postgres
 // as parameters.
 //
-pub fn json_to_pg_text(json: Vec<Value>) -> Vec<Option<String>> {
+pub(crate) fn json_to_pg_text(json: Vec<Value>) -> Vec<Option<String>> {
     json.iter().map(json_value_to_pg_text).collect()
 }
 
@@ -61,7 +61,7 @@ fn json_array_to_pg_array(value: &Value) -> Option<String> {
 }
 
 #[derive(Debug, thiserror::Error)]
-pub enum JsonConversionError {
+pub(crate) enum JsonConversionError {
     #[error("internal error compute returned invalid data: {0}")]
     AsTextError(tokio_postgres::Error),
     #[error("parse int error: {0}")]
@@ -77,7 +77,7 @@ pub enum JsonConversionError {
 //
 // Convert postgres row with text-encoded values to JSON object
 //
-pub fn pg_text_row_to_json(
+pub(crate) fn pg_text_row_to_json(
     row: &Row,
     columns: &[Type],
     raw_output: bool,
diff --git a/proxy/src/serverless/sql_over_http.rs b/proxy/src/serverless/sql_over_http.rs
index 9143469eea..5b36f5e91d 100644
--- a/proxy/src/serverless/sql_over_http.rs
+++ b/proxy/src/serverless/sql_over_http.rs
@@ -110,7 +110,7 @@ where
 }
 
 #[derive(Debug, thiserror::Error)]
-pub enum ConnInfoError {
+pub(crate) enum ConnInfoError {
     #[error("invalid header: {0}")]
     InvalidHeader(&'static HeaderName),
     #[error("invalid connection string: {0}")]
@@ -246,7 +246,7 @@ fn get_conn_info(
 }
 
 // TODO: return different http error codes
-pub async fn handle(
+pub(crate) async fn handle(
     config: &'static ProxyConfig,
     ctx: RequestMonitoring,
     request: Request<Incoming>,
@@ -359,7 +359,7 @@ pub async fn handle(
 }
 
 #[derive(Debug, thiserror::Error)]
-pub enum SqlOverHttpError {
+pub(crate) enum SqlOverHttpError {
     #[error("{0}")]
     ReadPayload(#[from] ReadPayloadError),
     #[error("{0}")]
@@ -413,7 +413,7 @@ impl UserFacingError for SqlOverHttpError {
 }
 
 #[derive(Debug, thiserror::Error)]
-pub enum ReadPayloadError {
+pub(crate) enum ReadPayloadError {
     #[error("could not read the HTTP request body: {0}")]
     Read(#[from] hyper1::Error),
     #[error("could not parse the HTTP request body: {0}")]
@@ -430,7 +430,7 @@ impl ReportableError for ReadPayloadError {
 }
 
 #[derive(Debug, thiserror::Error)]
-pub enum SqlOverHttpCancel {
+pub(crate) enum SqlOverHttpCancel {
     #[error("query was cancelled")]
     Postgres,
     #[error("query was cancelled while stuck trying to connect to the database")]
diff --git a/proxy/src/serverless/websocket.rs b/proxy/src/serverless/websocket.rs
index 4fba4d141c..3d257223b8 100644
--- a/proxy/src/serverless/websocket.rs
+++ b/proxy/src/serverless/websocket.rs
@@ -27,7 +27,7 @@ use tracing::warn;
 pin_project! {
     /// This is a wrapper around a [`WebSocketStream`] that
     /// implements [`AsyncRead`] and [`AsyncWrite`].
-    pub struct WebSocketRw<S> {
+    pub(crate) struct WebSocketRw<S> {
         #[pin]
         stream: WebSocketServer<S>,
         recv: Bytes,
@@ -36,7 +36,7 @@ pin_project! {
 }
 
 impl<S> WebSocketRw<S> {
-    pub fn new(stream: WebSocketServer<S>) -> Self {
+    pub(crate) fn new(stream: WebSocketServer<S>) -> Self {
         Self {
             stream,
             recv: Bytes::new(),
@@ -127,7 +127,7 @@ impl<S: AsyncRead + AsyncWrite + Unpin> AsyncBufRead for WebSocketRw<S> {
     }
 }
 
-pub async fn serve_websocket(
+pub(crate) async fn serve_websocket(
     config: &'static ProxyConfig,
     ctx: RequestMonitoring,
     websocket: OnUpgrade,
diff --git a/proxy/src/stream.rs b/proxy/src/stream.rs
index ef13f5fc1a..332dc27787 100644
--- a/proxy/src/stream.rs
+++ b/proxy/src/stream.rs
@@ -35,7 +35,7 @@ impl<S> PqStream<S> {
     }
 
     /// Get a shared reference to the underlying stream.
-    pub fn get_ref(&self) -> &S {
+    pub(crate) fn get_ref(&self) -> &S {
         self.framed.get_ref()
     }
 }
@@ -62,7 +62,7 @@ impl<S: AsyncRead + Unpin> PqStream<S> {
             .ok_or_else(err_connection)
     }
 
-    pub async fn read_password_message(&mut self) -> io::Result<bytes::Bytes> {
+    pub(crate) async fn read_password_message(&mut self) -> io::Result<bytes::Bytes> {
         match self.read_message().await? {
             FeMessage::PasswordMessage(msg) => Ok(msg),
             bad => Err(io::Error::new(
@@ -99,7 +99,10 @@ impl ReportableError for ReportedError {
 
 impl<S: AsyncWrite + Unpin> PqStream<S> {
     /// Write the message into an internal buffer, but don't flush the underlying stream.
-    pub fn write_message_noflush(&mut self, message: &BeMessage<'_>) -> io::Result<&mut Self> {
+    pub(crate) fn write_message_noflush(
+        &mut self,
+        message: &BeMessage<'_>,
+    ) -> io::Result<&mut Self> {
         self.framed
             .write_message(message)
             .map_err(ProtocolError::into_io_error)?;
@@ -114,7 +117,7 @@ impl<S: AsyncWrite + Unpin> PqStream<S> {
     }
 
     /// Flush the output buffer into the underlying stream.
-    pub async fn flush(&mut self) -> io::Result<&mut Self> {
+    pub(crate) async fn flush(&mut self) -> io::Result<&mut Self> {
         self.framed.flush().await?;
         Ok(self)
     }
@@ -146,7 +149,7 @@ impl<S: AsyncWrite + Unpin> PqStream<S> {
 
     /// Write the error message using [`Self::write_message`], then re-throw it.
     /// Trait [`UserFacingError`] acts as an allowlist for error types.
-    pub async fn throw_error<T, E>(&mut self, error: E) -> Result<T, ReportedError>
+    pub(crate) async fn throw_error<T, E>(&mut self, error: E) -> Result<T, ReportedError>
     where
         E: UserFacingError + Into<anyhow::Error>,
     {
@@ -200,7 +203,7 @@ impl<S> Stream<S> {
         }
     }
 
-    pub fn tls_server_end_point(&self) -> TlsServerEndPoint {
+    pub(crate) fn tls_server_end_point(&self) -> TlsServerEndPoint {
         match self {
             Stream::Raw { .. } => TlsServerEndPoint::Undefined,
             Stream::Tls {
diff --git a/proxy/src/url.rs b/proxy/src/url.rs
index 202fe8de1f..28ac7efdfc 100644
--- a/proxy/src/url.rs
+++ b/proxy/src/url.rs
@@ -7,12 +7,12 @@ pub struct ApiUrl(url::Url);
 
 impl ApiUrl {
     /// Consume the wrapper and return inner [url](url::Url).
-    pub fn into_inner(self) -> url::Url {
+    pub(crate) fn into_inner(self) -> url::Url {
         self.0
     }
 
     /// See [`url::Url::path_segments_mut`].
-    pub fn path_segments_mut(&mut self) -> url::PathSegmentsMut<'_> {
+    pub(crate) fn path_segments_mut(&mut self) -> url::PathSegmentsMut<'_> {
         // We've already verified that it works during construction.
         self.0.path_segments_mut().expect("bad API url")
     }
diff --git a/proxy/src/usage_metrics.rs b/proxy/src/usage_metrics.rs
index 4cf6da7e2d..aa8c7ba319 100644
--- a/proxy/src/usage_metrics.rs
+++ b/proxy/src/usage_metrics.rs
@@ -43,12 +43,12 @@ const DEFAULT_HTTP_REPORTING_TIMEOUT: Duration = Duration::from_secs(60);
 /// so while the project-id is unique across regions the whole pipeline will work correctly
 /// because we enrich the event with project_id in the control-plane endpoint.
 #[derive(Eq, Hash, PartialEq, Serialize, Deserialize, Debug, Clone)]
-pub struct Ids {
-    pub endpoint_id: EndpointIdInt,
-    pub branch_id: BranchIdInt,
+pub(crate) struct Ids {
+    pub(crate) endpoint_id: EndpointIdInt,
+    pub(crate) branch_id: BranchIdInt,
 }
 
-pub trait MetricCounterRecorder {
+pub(crate) trait MetricCounterRecorder {
     /// Record that some bytes were sent from the proxy to the client
     fn record_egress(&self, bytes: u64);
     /// Record that some connections were opened
@@ -92,7 +92,7 @@ impl MetricCounterReporter for MetricBackupCounter {
 }
 
 #[derive(Debug)]
-pub struct MetricCounter {
+pub(crate) struct MetricCounter {
     transmitted: AtomicU64,
     opened_connections: AtomicUsize,
     backup: Arc<MetricBackupCounter>,
@@ -173,14 +173,14 @@ impl<C: MetricCounterReporter> Clearable for C {
 type FastHasher = std::hash::BuildHasherDefault<rustc_hash::FxHasher>;
 
 #[derive(Default)]
-pub struct Metrics {
+pub(crate) struct Metrics {
     endpoints: DashMap<Ids, Arc<MetricCounter>, FastHasher>,
     backup_endpoints: DashMap<Ids, Arc<MetricBackupCounter>, FastHasher>,
 }
 
 impl Metrics {
     /// Register a new byte metrics counter for this endpoint
-    pub fn register(&self, ids: Ids) -> Arc<MetricCounter> {
+    pub(crate) fn register(&self, ids: Ids) -> Arc<MetricCounter> {
         let backup = if let Some(entry) = self.backup_endpoints.get(&ids) {
             entry.clone()
         } else {
@@ -215,7 +215,7 @@ impl Metrics {
     }
 }
 
-pub static USAGE_METRICS: Lazy<Metrics> = Lazy::new(Metrics::default);
+pub(crate) static USAGE_METRICS: Lazy<Metrics> = Lazy::new(Metrics::default);
 
 pub async fn task_main(config: &MetricCollectionConfig) -> anyhow::Result<Infallible> {
     info!("metrics collector config: {config:?}");
diff --git a/proxy/src/waiters.rs b/proxy/src/waiters.rs
index 9f78242ed3..86d0f9e8b2 100644
--- a/proxy/src/waiters.rs
+++ b/proxy/src/waiters.rs
@@ -7,13 +7,13 @@ use thiserror::Error;
 use tokio::sync::oneshot;
 
 #[derive(Debug, Error)]
-pub enum RegisterError {
+pub(crate) enum RegisterError {
     #[error("Waiter `{0}` already registered")]
     Occupied(String),
 }
 
 #[derive(Debug, Error)]
-pub enum NotifyError {
+pub(crate) enum NotifyError {
     #[error("Notify failed: waiter `{0}` not registered")]
     NotFound(String),
 
@@ -22,12 +22,12 @@ pub enum NotifyError {
 }
 
 #[derive(Debug, Error)]
-pub enum WaitError {
+pub(crate) enum WaitError {
     #[error("Wait failed: channel hangup")]
     Hangup,
 }
 
-pub struct Waiters<T>(pub(self) Mutex<HashMap<String, oneshot::Sender<T>>>);
+pub(crate) struct Waiters<T>(pub(self) Mutex<HashMap<String, oneshot::Sender<T>>>);
 
 impl<T> Default for Waiters<T> {
     fn default() -> Self {
@@ -36,7 +36,7 @@ impl<T> Default for Waiters<T> {
 }
 
 impl<T> Waiters<T> {
-    pub fn register(&self, key: String) -> Result<Waiter<'_, T>, RegisterError> {
+    pub(crate) fn register(&self, key: String) -> Result<Waiter<'_, T>, RegisterError> {
         let (tx, rx) = oneshot::channel();
 
         self.0
@@ -53,7 +53,7 @@ impl<T> Waiters<T> {
         })
     }
 
-    pub fn notify(&self, key: &str, value: T) -> Result<(), NotifyError>
+    pub(crate) fn notify(&self, key: &str, value: T) -> Result<(), NotifyError>
     where
         T: Send + Sync,
     {
@@ -79,7 +79,7 @@ impl<'a, T> Drop for DropKey<'a, T> {
 }
 
 pin_project! {
-    pub struct Waiter<'a, T> {
+    pub(crate) struct Waiter<'a, T> {
         #[pin]
         receiver: oneshot::Receiver<T>,
         guard: DropKey<'a, T>,

From 52cb33770b1f5e0215305e67582843f055e0b435 Mon Sep 17 00:00:00 2001
From: Folke Behrens <folke@neon.tech>
Date: Tue, 27 Aug 2024 14:12:42 +0200
Subject: [PATCH 039/159] proxy: Rename backend types and variants as prep for
 refactor (#8845)

* AuthBackend enum to AuthBackendType
* BackendType enum to Backend
* Link variants to Web
* Adjust messages, comments, etc.
---
 proxy/README.md                            |  2 +-
 proxy/src/auth.rs                          |  8 +--
 proxy/src/auth/backend.rs                  | 80 +++++++++++-----------
 proxy/src/auth/backend/{link.rs => web.rs} | 16 ++---
 proxy/src/auth/password_hack.rs            |  2 +-
 proxy/src/bin/local_proxy.rs               |  2 +-
 proxy/src/bin/proxy.rs                     | 27 ++++----
 proxy/src/compute.rs                       |  4 +-
 proxy/src/config.rs                        |  4 +-
 proxy/src/console/messages.rs              |  2 +-
 proxy/src/console/mgmt.rs                  |  2 +-
 proxy/src/context.rs                       |  2 +-
 proxy/src/proxy/tests.rs                   |  4 +-
 proxy/src/serverless/backend.rs            |  8 +--
 14 files changed, 84 insertions(+), 79 deletions(-)
 rename proxy/src/auth/backend/{link.rs => web.rs} (87%)

diff --git a/proxy/README.md b/proxy/README.md
index afc8b77db8..8d850737be 100644
--- a/proxy/README.md
+++ b/proxy/README.md
@@ -6,7 +6,7 @@ Proxy binary accepts `--auth-backend` CLI option, which determines auth scheme a
   new SCRAM-based console API; uses SNI info to select the destination project (endpoint soon)
 * postgres
   uses postgres to select auth secrets of existing roles. Useful for local testing
-* link
+* web (or link)
   sends login link for all usernames
 
 Also proxy can expose following services to the external world:
diff --git a/proxy/src/auth.rs b/proxy/src/auth.rs
index f3ecedb839..7c408f817c 100644
--- a/proxy/src/auth.rs
+++ b/proxy/src/auth.rs
@@ -1,7 +1,7 @@
 //! Client authentication mechanisms.
 
 pub mod backend;
-pub use backend::BackendType;
+pub use backend::Backend;
 
 mod credentials;
 pub(crate) use credentials::{
@@ -31,7 +31,7 @@ pub(crate) type Result<T> = std::result::Result<T, AuthError>;
 #[derive(Debug, Error)]
 pub(crate) enum AuthErrorImpl {
     #[error(transparent)]
-    Link(#[from] backend::LinkAuthError),
+    Web(#[from] backend::WebAuthError),
 
     #[error(transparent)]
     GetAuthInfo(#[from] console::errors::GetAuthInfoError),
@@ -114,7 +114,7 @@ impl<E: Into<AuthErrorImpl>> From<E> for AuthError {
 impl UserFacingError for AuthError {
     fn to_string_client(&self) -> String {
         match self.0.as_ref() {
-            AuthErrorImpl::Link(e) => e.to_string_client(),
+            AuthErrorImpl::Web(e) => e.to_string_client(),
             AuthErrorImpl::GetAuthInfo(e) => e.to_string_client(),
             AuthErrorImpl::Sasl(e) => e.to_string_client(),
             AuthErrorImpl::AuthFailed(_) => self.to_string(),
@@ -132,7 +132,7 @@ impl UserFacingError for AuthError {
 impl ReportableError for AuthError {
     fn get_error_kind(&self) -> crate::error::ErrorKind {
         match self.0.as_ref() {
-            AuthErrorImpl::Link(e) => e.get_error_kind(),
+            AuthErrorImpl::Web(e) => e.get_error_kind(),
             AuthErrorImpl::GetAuthInfo(e) => e.get_error_kind(),
             AuthErrorImpl::Sasl(e) => e.get_error_kind(),
             AuthErrorImpl::AuthFailed(_) => crate::error::ErrorKind::User,
diff --git a/proxy/src/auth/backend.rs b/proxy/src/auth/backend.rs
index 77dea39fdc..1d28c6df31 100644
--- a/proxy/src/auth/backend.rs
+++ b/proxy/src/auth/backend.rs
@@ -1,19 +1,19 @@
 mod classic;
 mod hacks;
 pub mod jwt;
-mod link;
 pub mod local;
+mod web;
 
 use std::net::IpAddr;
 use std::sync::Arc;
 use std::time::Duration;
 
 use ipnet::{Ipv4Net, Ipv6Net};
-pub(crate) use link::LinkAuthError;
 use local::LocalBackend;
 use tokio::io::{AsyncRead, AsyncWrite};
 use tokio_postgres::config::AuthKeys;
 use tracing::{info, warn};
+pub(crate) use web::WebAuthError;
 
 use crate::auth::credentials::check_peer_addr_is_in_list;
 use crate::auth::{validate_password_and_exchange, AuthError};
@@ -65,11 +65,11 @@ impl<T> std::ops::Deref for MaybeOwned<'_, T> {
 /// * However, when we substitute `T` with [`ComputeUserInfoMaybeEndpoint`],
 ///   this helps us provide the credentials only to those auth
 ///   backends which require them for the authentication process.
-pub enum BackendType<'a, T, D> {
+pub enum Backend<'a, T, D> {
     /// Cloud API (V2).
     Console(MaybeOwned<'a, ConsoleBackend>, T),
     /// Authentication via a web browser.
-    Link(MaybeOwned<'a, url::ApiUrl>, D),
+    Web(MaybeOwned<'a, url::ApiUrl>, D),
     /// Local proxy uses configured auth credentials and does not wake compute
     Local(MaybeOwned<'a, LocalBackend>),
 }
@@ -82,7 +82,7 @@ pub(crate) trait TestBackend: Send + Sync + 'static {
     ) -> Result<(CachedAllowedIps, Option<CachedRoleSecret>), console::errors::GetAuthInfoError>;
 }
 
-impl std::fmt::Display for BackendType<'_, (), ()> {
+impl std::fmt::Display for Backend<'_, (), ()> {
     fn fmt(&self, fmt: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
         match self {
             Self::Console(api, ()) => match &**api {
@@ -96,44 +96,44 @@ impl std::fmt::Display for BackendType<'_, (), ()> {
                 #[cfg(test)]
                 ConsoleBackend::Test(_) => fmt.debug_tuple("Test").finish(),
             },
-            Self::Link(url, ()) => fmt.debug_tuple("Link").field(&url.as_str()).finish(),
+            Self::Web(url, ()) => fmt.debug_tuple("Web").field(&url.as_str()).finish(),
             Self::Local(_) => fmt.debug_tuple("Local").finish(),
         }
     }
 }
 
-impl<T, D> BackendType<'_, T, D> {
+impl<T, D> Backend<'_, T, D> {
     /// Very similar to [`std::option::Option::as_ref`].
     /// This helps us pass structured config to async tasks.
-    pub(crate) fn as_ref(&self) -> BackendType<'_, &T, &D> {
+    pub(crate) fn as_ref(&self) -> Backend<'_, &T, &D> {
         match self {
-            Self::Console(c, x) => BackendType::Console(MaybeOwned::Borrowed(c), x),
-            Self::Link(c, x) => BackendType::Link(MaybeOwned::Borrowed(c), x),
-            Self::Local(l) => BackendType::Local(MaybeOwned::Borrowed(l)),
+            Self::Console(c, x) => Backend::Console(MaybeOwned::Borrowed(c), x),
+            Self::Web(c, x) => Backend::Web(MaybeOwned::Borrowed(c), x),
+            Self::Local(l) => Backend::Local(MaybeOwned::Borrowed(l)),
         }
     }
 }
 
-impl<'a, T, D> BackendType<'a, T, D> {
+impl<'a, T, D> Backend<'a, T, D> {
     /// Very similar to [`std::option::Option::map`].
-    /// Maps [`BackendType<T>`] to [`BackendType<R>`] by applying
+    /// Maps [`Backend<T>`] to [`Backend<R>`] by applying
     /// a function to a contained value.
-    pub(crate) fn map<R>(self, f: impl FnOnce(T) -> R) -> BackendType<'a, R, D> {
+    pub(crate) fn map<R>(self, f: impl FnOnce(T) -> R) -> Backend<'a, R, D> {
         match self {
-            Self::Console(c, x) => BackendType::Console(c, f(x)),
-            Self::Link(c, x) => BackendType::Link(c, x),
-            Self::Local(l) => BackendType::Local(l),
+            Self::Console(c, x) => Backend::Console(c, f(x)),
+            Self::Web(c, x) => Backend::Web(c, x),
+            Self::Local(l) => Backend::Local(l),
         }
     }
 }
-impl<'a, T, D, E> BackendType<'a, Result<T, E>, D> {
+impl<'a, T, D, E> Backend<'a, Result<T, E>, D> {
     /// Very similar to [`std::option::Option::transpose`].
     /// This is most useful for error handling.
-    pub(crate) fn transpose(self) -> Result<BackendType<'a, T, D>, E> {
+    pub(crate) fn transpose(self) -> Result<Backend<'a, T, D>, E> {
         match self {
-            Self::Console(c, x) => x.map(|x| BackendType::Console(c, x)),
-            Self::Link(c, x) => Ok(BackendType::Link(c, x)),
-            Self::Local(l) => Ok(BackendType::Local(l)),
+            Self::Console(c, x) => x.map(|x| Backend::Console(c, x)),
+            Self::Web(c, x) => Ok(Backend::Web(c, x)),
+            Self::Local(l) => Ok(Backend::Local(l)),
         }
     }
 }
@@ -403,12 +403,12 @@ async fn authenticate_with_secret(
     classic::authenticate(ctx, info, client, config, secret).await
 }
 
-impl<'a> BackendType<'a, ComputeUserInfoMaybeEndpoint, &()> {
+impl<'a> Backend<'a, ComputeUserInfoMaybeEndpoint, &()> {
     /// Get username from the credentials.
     pub(crate) fn get_user(&self) -> &str {
         match self {
             Self::Console(_, user_info) => &user_info.user,
-            Self::Link(_, ()) => "link",
+            Self::Web(_, ()) => "web",
             Self::Local(_) => "local",
         }
     }
@@ -422,7 +422,7 @@ impl<'a> BackendType<'a, ComputeUserInfoMaybeEndpoint, &()> {
         allow_cleartext: bool,
         config: &'static AuthenticationConfig,
         endpoint_rate_limiter: Arc<EndpointRateLimiter>,
-    ) -> auth::Result<BackendType<'a, ComputeCredentials, NodeInfo>> {
+    ) -> auth::Result<Backend<'a, ComputeCredentials, NodeInfo>> {
         let res = match self {
             Self::Console(api, user_info) => {
                 info!(
@@ -441,15 +441,15 @@ impl<'a> BackendType<'a, ComputeUserInfoMaybeEndpoint, &()> {
                     endpoint_rate_limiter,
                 )
                 .await?;
-                BackendType::Console(api, credentials)
+                Backend::Console(api, credentials)
             }
             // NOTE: this auth backend doesn't use client credentials.
-            Self::Link(url, ()) => {
-                info!("performing link authentication");
+            Self::Web(url, ()) => {
+                info!("performing web authentication");
 
-                let info = link::authenticate(ctx, &url, client).await?;
+                let info = web::authenticate(ctx, &url, client).await?;
 
-                BackendType::Link(url, info)
+                Backend::Web(url, info)
             }
             Self::Local(_) => {
                 return Err(auth::AuthError::bad_auth_method("invalid for local proxy"))
@@ -461,14 +461,14 @@ impl<'a> BackendType<'a, ComputeUserInfoMaybeEndpoint, &()> {
     }
 }
 
-impl BackendType<'_, ComputeUserInfo, &()> {
+impl Backend<'_, ComputeUserInfo, &()> {
     pub(crate) async fn get_role_secret(
         &self,
         ctx: &RequestMonitoring,
     ) -> Result<CachedRoleSecret, GetAuthInfoError> {
         match self {
             Self::Console(api, user_info) => api.get_role_secret(ctx, user_info).await,
-            Self::Link(_, ()) => Ok(Cached::new_uncached(None)),
+            Self::Web(_, ()) => Ok(Cached::new_uncached(None)),
             Self::Local(_) => Ok(Cached::new_uncached(None)),
         }
     }
@@ -479,21 +479,21 @@ impl BackendType<'_, ComputeUserInfo, &()> {
     ) -> Result<(CachedAllowedIps, Option<CachedRoleSecret>), GetAuthInfoError> {
         match self {
             Self::Console(api, user_info) => api.get_allowed_ips_and_secret(ctx, user_info).await,
-            Self::Link(_, ()) => Ok((Cached::new_uncached(Arc::new(vec![])), None)),
+            Self::Web(_, ()) => Ok((Cached::new_uncached(Arc::new(vec![])), None)),
             Self::Local(_) => Ok((Cached::new_uncached(Arc::new(vec![])), None)),
         }
     }
 }
 
 #[async_trait::async_trait]
-impl ComputeConnectBackend for BackendType<'_, ComputeCredentials, NodeInfo> {
+impl ComputeConnectBackend for Backend<'_, ComputeCredentials, NodeInfo> {
     async fn wake_compute(
         &self,
         ctx: &RequestMonitoring,
     ) -> Result<CachedNodeInfo, console::errors::WakeComputeError> {
         match self {
             Self::Console(api, creds) => api.wake_compute(ctx, &creds.info).await,
-            Self::Link(_, info) => Ok(Cached::new_uncached(info.clone())),
+            Self::Web(_, info) => Ok(Cached::new_uncached(info.clone())),
             Self::Local(local) => Ok(Cached::new_uncached(local.node_info.clone())),
         }
     }
@@ -501,21 +501,23 @@ impl ComputeConnectBackend for BackendType<'_, ComputeCredentials, NodeInfo> {
     fn get_keys(&self) -> &ComputeCredentialKeys {
         match self {
             Self::Console(_, creds) => &creds.keys,
-            Self::Link(_, _) => &ComputeCredentialKeys::None,
+            Self::Web(_, _) => &ComputeCredentialKeys::None,
             Self::Local(_) => &ComputeCredentialKeys::None,
         }
     }
 }
 
 #[async_trait::async_trait]
-impl ComputeConnectBackend for BackendType<'_, ComputeCredentials, &()> {
+impl ComputeConnectBackend for Backend<'_, ComputeCredentials, &()> {
     async fn wake_compute(
         &self,
         ctx: &RequestMonitoring,
     ) -> Result<CachedNodeInfo, console::errors::WakeComputeError> {
         match self {
             Self::Console(api, creds) => api.wake_compute(ctx, &creds.info).await,
-            Self::Link(_, ()) => unreachable!("link auth flow doesn't support waking the compute"),
+            Self::Web(_, ()) => {
+                unreachable!("web auth flow doesn't support waking the compute")
+            }
             Self::Local(local) => Ok(Cached::new_uncached(local.node_info.clone())),
         }
     }
@@ -523,7 +525,7 @@ impl ComputeConnectBackend for BackendType<'_, ComputeCredentials, &()> {
     fn get_keys(&self) -> &ComputeCredentialKeys {
         match self {
             Self::Console(_, creds) => &creds.keys,
-            Self::Link(_, ()) => &ComputeCredentialKeys::None,
+            Self::Web(_, ()) => &ComputeCredentialKeys::None,
             Self::Local(_) => &ComputeCredentialKeys::None,
         }
     }
diff --git a/proxy/src/auth/backend/link.rs b/proxy/src/auth/backend/web.rs
similarity index 87%
rename from proxy/src/auth/backend/link.rs
rename to proxy/src/auth/backend/web.rs
index 19515f95a8..58a4bef62e 100644
--- a/proxy/src/auth/backend/link.rs
+++ b/proxy/src/auth/backend/web.rs
@@ -13,7 +13,7 @@ use tokio_postgres::config::SslMode;
 use tracing::{info, info_span};
 
 #[derive(Debug, Error)]
-pub(crate) enum LinkAuthError {
+pub(crate) enum WebAuthError {
     #[error(transparent)]
     WaiterRegister(#[from] waiters::RegisterError),
 
@@ -24,18 +24,18 @@ pub(crate) enum LinkAuthError {
     Io(#[from] std::io::Error),
 }
 
-impl UserFacingError for LinkAuthError {
+impl UserFacingError for WebAuthError {
     fn to_string_client(&self) -> String {
         "Internal error".to_string()
     }
 }
 
-impl ReportableError for LinkAuthError {
+impl ReportableError for WebAuthError {
     fn get_error_kind(&self) -> crate::error::ErrorKind {
         match self {
-            LinkAuthError::WaiterRegister(_) => crate::error::ErrorKind::Service,
-            LinkAuthError::WaiterWait(_) => crate::error::ErrorKind::Service,
-            LinkAuthError::Io(_) => crate::error::ErrorKind::ClientDisconnect,
+            Self::WaiterRegister(_) => crate::error::ErrorKind::Service,
+            Self::WaiterWait(_) => crate::error::ErrorKind::Service,
+            Self::Io(_) => crate::error::ErrorKind::ClientDisconnect,
         }
     }
 }
@@ -74,7 +74,7 @@ pub(super) async fn authenticate(
         }
     };
 
-    let span = info_span!("link", psql_session_id = &psql_session_id);
+    let span = info_span!("web", psql_session_id = &psql_session_id);
     let greeting = hello_message(link_uri, &psql_session_id);
 
     // Give user a URL to spawn a new database.
@@ -87,7 +87,7 @@ pub(super) async fn authenticate(
 
     // Wait for web console response (see `mgmt`).
     info!(parent: &span, "waiting for console's reply...");
-    let db_info = waiter.await.map_err(LinkAuthError::from)?;
+    let db_info = waiter.await.map_err(WebAuthError::from)?;
 
     client.write_message_noflush(&Be::NoticeResponse("Connecting to database."))?;
 
diff --git a/proxy/src/auth/password_hack.rs b/proxy/src/auth/password_hack.rs
index 3f5d006f7b..8585b8ff48 100644
--- a/proxy/src/auth/password_hack.rs
+++ b/proxy/src/auth/password_hack.rs
@@ -1,5 +1,5 @@
 //! Payload for ad hoc authentication method for clients that don't support SNI.
-//! See the `impl` for [`super::backend::BackendType<ClientCredentials>`].
+//! See the `impl` for [`super::backend::Backend<ClientCredentials>`].
 //! Read more: <https://github.com/neondatabase/cloud/issues/1620#issuecomment-1165332290>.
 //! UPDATE (Mon Aug  8 13:20:34 UTC 2022): the payload format has been simplified.
 
diff --git a/proxy/src/bin/local_proxy.rs b/proxy/src/bin/local_proxy.rs
index 8acba33bac..08effeff99 100644
--- a/proxy/src/bin/local_proxy.rs
+++ b/proxy/src/bin/local_proxy.rs
@@ -212,7 +212,7 @@ fn build_config(args: &LocalProxyCliArgs) -> anyhow::Result<&'static ProxyConfig
 
     Ok(Box::leak(Box::new(ProxyConfig {
         tls_config: None,
-        auth_backend: proxy::auth::BackendType::Local(proxy::auth::backend::MaybeOwned::Owned(
+        auth_backend: proxy::auth::Backend::Local(proxy::auth::backend::MaybeOwned::Owned(
             LocalBackend::new(args.compute),
         )),
         metric_collection: None,
diff --git a/proxy/src/bin/proxy.rs b/proxy/src/bin/proxy.rs
index 1f45a33ed5..7706a1f7cd 100644
--- a/proxy/src/bin/proxy.rs
+++ b/proxy/src/bin/proxy.rs
@@ -60,11 +60,14 @@ use clap::{Parser, ValueEnum};
 static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc;
 
 #[derive(Clone, Debug, ValueEnum)]
-enum AuthBackend {
+enum AuthBackendType {
     Console,
     #[cfg(feature = "testing")]
     Postgres,
-    Link,
+    // clap only shows the name, not the alias, in usage text.
+    // TODO: swap name/alias and deprecate "link"
+    #[value(name("link"), alias("web"))]
+    Web,
 }
 
 /// Neon proxy/router
@@ -77,8 +80,8 @@ struct ProxyCliArgs {
     /// listen for incoming client connections on ip:port
     #[clap(short, long, default_value = "127.0.0.1:4432")]
     proxy: String,
-    #[clap(value_enum, long, default_value_t = AuthBackend::Link)]
-    auth_backend: AuthBackend,
+    #[clap(value_enum, long, default_value_t = AuthBackendType::Web)]
+    auth_backend: AuthBackendType,
     /// listen for management callback connection on ip:port
     #[clap(short, long, default_value = "127.0.0.1:7000")]
     mgmt: String,
@@ -88,7 +91,7 @@ struct ProxyCliArgs {
     /// listen for incoming wss connections on ip:port
     #[clap(long)]
     wss: Option<String>,
-    /// redirect unauthenticated users to the given uri in case of link auth
+    /// redirect unauthenticated users to the given uri in case of web auth
     #[clap(short, long, default_value = "http://localhost:3000/psql_session/")]
     uri: String,
     /// cloud API endpoint for authenticating users
@@ -470,7 +473,7 @@ async fn main() -> anyhow::Result<()> {
         ));
     }
 
-    if let auth::BackendType::Console(api, _) = &config.auth_backend {
+    if let auth::Backend::Console(api, _) = &config.auth_backend {
         if let proxy::console::provider::ConsoleBackend::Console(api) = &**api {
             match (redis_notifications_client, regional_redis_client.clone()) {
                 (None, None) => {}
@@ -575,7 +578,7 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> {
     }
 
     let auth_backend = match &args.auth_backend {
-        AuthBackend::Console => {
+        AuthBackendType::Console => {
             let wake_compute_cache_config: CacheOptions = args.wake_compute_cache.parse()?;
             let project_info_cache_config: ProjectInfoCacheOptions =
                 args.project_info_cache.parse()?;
@@ -624,18 +627,18 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> {
                 wake_compute_endpoint_rate_limiter,
             );
             let api = console::provider::ConsoleBackend::Console(api);
-            auth::BackendType::Console(MaybeOwned::Owned(api), ())
+            auth::Backend::Console(MaybeOwned::Owned(api), ())
         }
         #[cfg(feature = "testing")]
-        AuthBackend::Postgres => {
+        AuthBackendType::Postgres => {
             let url = args.auth_endpoint.parse()?;
             let api = console::provider::mock::Api::new(url);
             let api = console::provider::ConsoleBackend::Postgres(api);
-            auth::BackendType::Console(MaybeOwned::Owned(api), ())
+            auth::Backend::Console(MaybeOwned::Owned(api), ())
         }
-        AuthBackend::Link => {
+        AuthBackendType::Web => {
             let url = args.uri.parse()?;
-            auth::BackendType::Link(MaybeOwned::Owned(url), ())
+            auth::Backend::Web(MaybeOwned::Owned(url), ())
         }
     };
 
diff --git a/proxy/src/compute.rs b/proxy/src/compute.rs
index 246501a21e..8d3cb8ee3c 100644
--- a/proxy/src/compute.rs
+++ b/proxy/src/compute.rs
@@ -124,13 +124,13 @@ impl ConnCfg {
     /// Apply startup message params to the connection config.
     pub(crate) fn set_startup_params(&mut self, params: &StartupMessageParams) {
         // Only set `user` if it's not present in the config.
-        // Link auth flow takes username from the console's response.
+        // Web auth flow takes username from the console's response.
         if let (None, Some(user)) = (self.get_user(), params.get("user")) {
             self.user(user);
         }
 
         // Only set `dbname` if it's not present in the config.
-        // Link auth flow takes dbname from the console's response.
+        // Web auth flow takes dbname from the console's response.
         if let (None, Some(dbname)) = (self.get_dbname(), params.get("database")) {
             self.dbname(dbname);
         }
diff --git a/proxy/src/config.rs b/proxy/src/config.rs
index 6c42fb8d19..d7fc6eee22 100644
--- a/proxy/src/config.rs
+++ b/proxy/src/config.rs
@@ -25,7 +25,7 @@ use x509_parser::oid_registry;
 
 pub struct ProxyConfig {
     pub tls_config: Option<TlsConfig>,
-    pub auth_backend: auth::BackendType<'static, (), ()>,
+    pub auth_backend: auth::Backend<'static, (), ()>,
     pub metric_collection: Option<MetricCollectionConfig>,
     pub allow_self_signed_compute: bool,
     pub http_config: HttpConfig,
@@ -247,7 +247,7 @@ impl CertResolver {
 
         let common_name = pem.subject().to_string();
 
-        // We only use non-wildcard certificates in link proxy so it seems okay to treat them the same as
+        // We only use non-wildcard certificates in web auth proxy so it seems okay to treat them the same as
         // wildcard ones as we don't use SNI there. That treatment only affects certificate selection, so
         // verify-full will still check wildcard match. Old coding here just ignored non-wildcard common names
         // and passed None instead, which blows up number of cases downstream code should handle. Proper coding
diff --git a/proxy/src/console/messages.rs b/proxy/src/console/messages.rs
index 0df1a450ac..a48c7316f6 100644
--- a/proxy/src/console/messages.rs
+++ b/proxy/src/console/messages.rs
@@ -243,7 +243,7 @@ pub(crate) struct WakeCompute {
     pub(crate) aux: MetricsAuxInfo,
 }
 
-/// Async response which concludes the link auth flow.
+/// Async response which concludes the web auth flow.
 /// Also known as `kickResponse` in the console.
 #[derive(Debug, Deserialize)]
 pub(crate) struct KickSession<'a> {
diff --git a/proxy/src/console/mgmt.rs b/proxy/src/console/mgmt.rs
index f318ac529b..2ed4f5f206 100644
--- a/proxy/src/console/mgmt.rs
+++ b/proxy/src/console/mgmt.rs
@@ -25,7 +25,7 @@ pub(crate) fn notify(psql_session_id: &str, msg: ComputeReady) -> Result<(), wai
 }
 
 /// Console management API listener task.
-/// It spawns console response handlers needed for the link auth.
+/// It spawns console response handlers needed for the web auth.
 pub async fn task_main(listener: TcpListener) -> anyhow::Result<Infallible> {
     scopeguard::defer! {
         info!("mgmt has shut down");
diff --git a/proxy/src/context.rs b/proxy/src/context.rs
index 9edba543fe..72e1fa1cee 100644
--- a/proxy/src/context.rs
+++ b/proxy/src/context.rs
@@ -72,7 +72,7 @@ struct RequestMonitoringInner {
 
 #[derive(Clone, Debug)]
 pub(crate) enum AuthMethod {
-    // aka link aka passwordless
+    // aka passwordless, fka link
     Web,
     ScramSha256,
     ScramSha256Plus,
diff --git a/proxy/src/proxy/tests.rs b/proxy/src/proxy/tests.rs
index b3b284ef27..4264dbae0f 100644
--- a/proxy/src/proxy/tests.rs
+++ b/proxy/src/proxy/tests.rs
@@ -544,8 +544,8 @@ fn helper_create_cached_node_info(cache: &'static NodeInfoCache) -> CachedNodeIn
 
 fn helper_create_connect_info(
     mechanism: &TestConnectMechanism,
-) -> auth::BackendType<'static, ComputeCredentials, &()> {
-    let user_info = auth::BackendType::Console(
+) -> auth::Backend<'static, ComputeCredentials, &()> {
+    let user_info = auth::Backend::Console(
         MaybeOwned::Owned(ConsoleBackend::Test(Box::new(mechanism.clone()))),
         ComputeCredentials {
             info: ComputeUserInfo {
diff --git a/proxy/src/serverless/backend.rs b/proxy/src/serverless/backend.rs
index 2699411c28..f24e0478be 100644
--- a/proxy/src/serverless/backend.rs
+++ b/proxy/src/serverless/backend.rs
@@ -105,13 +105,13 @@ impl PoolingBackend {
         jwt: &str,
     ) -> Result<ComputeCredentials, AuthError> {
         match &self.config.auth_backend {
-            crate::auth::BackendType::Console(_, ()) => {
+            crate::auth::Backend::Console(_, ()) => {
                 Err(AuthError::auth_failed("JWT login is not yet supported"))
             }
-            crate::auth::BackendType::Link(_, ()) => Err(AuthError::auth_failed(
-                "JWT login over link proxy is not supported",
+            crate::auth::Backend::Web(_, ()) => Err(AuthError::auth_failed(
+                "JWT login over web auth proxy is not supported",
             )),
-            crate::auth::BackendType::Local(cache) => {
+            crate::auth::Backend::Local(cache) => {
                 cache
                     .jwks_cache
                     .check_jwt(

From 9b9f90c562d8b78628b66e1441fea7036ed66fc8 Mon Sep 17 00:00:00 2001
From: Alexey Kondratov <kondratov.aleksey@gmail.com>
Date: Tue, 27 Aug 2024 15:49:47 +0200
Subject: [PATCH 040/159] fix(walproposer): Do not restart on safekeepers
 reordering (#8840)

## Problem

Currently, we compare `neon.safekeepers` values as is, so we
unnecessarily restart walproposer even if safekeepers set didn't change.
This leads to errors like:
```log
FATAL:  [WP] restarting walproposer to change safekeeper list
from safekeeper-8.us-east-2.aws.neon.tech:6401,safekeeper-11.us-east-2.aws.neon.tech:6401,safekeeper-10.us-east-2.aws.neon.tech:6401
to safekeeper-11.us-east-2.aws.neon.tech:6401,safekeeper-8.us-east-2.aws.neon.tech:6401,safekeeper-10.us-east-2.aws.neon.tech:6401
```

## Summary of changes

Split the GUC into the list of individual safekeepers and properly
compare. We could've done that somewhere on the upper level, e.g.,
control plane, but I think it's still better when the actual config
consumer is smarter and doesn't rely on upper levels.
---
 pgxn/neon/walproposer_pg.c         | 69 +++++++++++++++++++++++++++++-
 test_runner/regress/test_config.py | 45 ++++++++++++++++++-
 2 files changed, 111 insertions(+), 3 deletions(-)

diff --git a/pgxn/neon/walproposer_pg.c b/pgxn/neon/walproposer_pg.c
index f3ddc64061..65ef588ba5 100644
--- a/pgxn/neon/walproposer_pg.c
+++ b/pgxn/neon/walproposer_pg.c
@@ -220,6 +220,64 @@ nwp_register_gucs(void)
 							NULL, NULL, NULL);
 }
 
+
+static int
+split_safekeepers_list(char *safekeepers_list, char *safekeepers[])
+{
+	int n_safekeepers = 0;
+	char *curr_sk = safekeepers_list;
+
+	for (char *coma = safekeepers_list; coma != NULL && *coma != '\0'; curr_sk = coma)
+	{
+		if (++n_safekeepers >= MAX_SAFEKEEPERS) {
+			wpg_log(FATAL, "too many safekeepers");
+		}
+
+		coma = strchr(coma, ',');
+		safekeepers[n_safekeepers-1] = curr_sk;
+
+		if (coma != NULL) {
+			*coma++ = '\0';
+		}
+	}
+
+	return n_safekeepers;
+}
+
+/*
+ * Accept two coma-separated strings with list of safekeeper host:port addresses.
+ * Split them into arrays and return false if two sets do not match, ignoring the order.
+ */
+static bool
+safekeepers_cmp(char *old, char *new)
+{
+	char *safekeepers_old[MAX_SAFEKEEPERS];
+	char *safekeepers_new[MAX_SAFEKEEPERS];
+	int len_old = 0;
+	int len_new = 0;
+
+	len_old = split_safekeepers_list(old, safekeepers_old);
+	len_new = split_safekeepers_list(new, safekeepers_new);
+
+	if (len_old != len_new)
+	{
+		return false;
+	}
+
+	qsort(&safekeepers_old, len_old, sizeof(char *), pg_qsort_strcmp);
+	qsort(&safekeepers_new, len_new, sizeof(char *), pg_qsort_strcmp);
+
+	for (int i = 0; i < len_new; i++)
+	{
+		if (strcmp(safekeepers_old[i], safekeepers_new[i]) != 0)
+		{
+			return false;
+		}
+	}
+
+	return true;
+}
+
 /*
  * GUC assign_hook for neon.safekeepers. Restarts walproposer through FATAL if
  * the list changed.
@@ -235,19 +293,26 @@ assign_neon_safekeepers(const char *newval, void *extra)
 		wpg_log(FATAL, "neon.safekeepers is empty");
 	}
 
+	/* Copy values because we will modify them in split_safekeepers_list() */
+	char *newval_copy = pstrdup(newval);
+	char *oldval = pstrdup(wal_acceptors_list);
+
 	/* 
 	 * TODO: restarting through FATAL is stupid and introduces 1s delay before
 	 * next bgw start. We should refactor walproposer to allow graceful exit and
 	 * thus remove this delay.
+	 * XXX: If you change anything here, sync with test_safekeepers_reconfigure_reorder.
 	 */
-	if (strcmp(wal_acceptors_list, newval) != 0)
+	if (!safekeepers_cmp(oldval, newval_copy))
 	{
 		wpg_log(FATAL, "restarting walproposer to change safekeeper list from %s to %s",
 				wal_acceptors_list, newval);
 	}
+	pfree(newval_copy);
+	pfree(oldval);
 }
 
-/*  Check if we need to suspend inserts because of lagging replication. */
+/* Check if we need to suspend inserts because of lagging replication. */
 static uint64
 backpressure_lag_impl(void)
 {
diff --git a/test_runner/regress/test_config.py b/test_runner/regress/test_config.py
index 4bb7df1e6a..2ef28eb94b 100644
--- a/test_runner/regress/test_config.py
+++ b/test_runner/regress/test_config.py
@@ -1,6 +1,7 @@
+import os
 from contextlib import closing
 
-from fixtures.neon_fixtures import NeonEnv
+from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder
 
 
 #
@@ -28,3 +29,45 @@ def test_config(neon_simple_env: NeonEnv):
 
             # check that config change was applied
             assert cur.fetchone() == ("debug1",)
+
+
+#
+# Test that reordering of safekeepers does not restart walproposer
+#
+def test_safekeepers_reconfigure_reorder(
+    neon_env_builder: NeonEnvBuilder,
+):
+    neon_env_builder.num_safekeepers = 3
+    env = neon_env_builder.init_start()
+    env.neon_cli.create_branch("test_safekeepers_reconfigure_reorder")
+
+    endpoint = env.endpoints.create_start("test_safekeepers_reconfigure_reorder")
+
+    old_sks = ""
+    with closing(endpoint.connect()) as conn:
+        with conn.cursor() as cur:
+            cur.execute("SHOW neon.safekeepers")
+            res = cur.fetchone()
+            assert res is not None, "neon.safekeepers GUC is set"
+            old_sks = res[0]
+
+    # Reorder safekeepers
+    safekeepers = endpoint.active_safekeepers
+    safekeepers = safekeepers[1:] + safekeepers[:1]
+
+    endpoint.reconfigure(safekeepers=safekeepers)
+
+    with closing(endpoint.connect()) as conn:
+        with conn.cursor() as cur:
+            cur.execute("SHOW neon.safekeepers")
+            res = cur.fetchone()
+            assert res is not None, "neon.safekeepers GUC is set"
+            new_sks = res[0]
+
+            assert new_sks != old_sks, "GUC changes were applied"
+
+    log_path = os.path.join(endpoint.endpoint_path(), "compute.log")
+    with open(log_path, "r") as log_file:
+        logs = log_file.read()
+        # Check that walproposer was not restarted
+        assert "restarting walproposer" not in logs

From 2d10306f7a596d1f2ad8c70d01b427600a29c622 Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki@neon.tech>
Date: Tue, 27 Aug 2024 18:36:33 +0300
Subject: [PATCH 041/159] Remove support for pageserver <-> compute protocol
 version 1 (#8774)

Protocol version 2 has been the default for a while now, and we no
longer have any computes running in production that used protocol
version 1. This completes the migration by removing support for v1 in
both the pageserver and the compute.

See issue #6211.
---
 libs/pageserver_api/src/models.rs           | 41 ++++--------------
 pageserver/src/metrics.rs                   |  1 -
 pageserver/src/page_service.rs              | 34 +--------------
 pgxn/neon/libpagestore.c                    |  5 +--
 pgxn/neon/pagestore_client.h                |  5 +--
 pgxn/neon/pagestore_smgr.c                  | 47 ++-------------------
 test_runner/regress/test_auth.py            |  2 +-
 test_runner/regress/test_read_validation.py | 10 -----
 8 files changed, 17 insertions(+), 128 deletions(-)

diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs
index 4cab56771b..d39ac75707 100644
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -1063,7 +1063,7 @@ impl TryFrom<u8> for PagestreamBeMessageTag {
     }
 }
 
-// In the V2 protocol version, a GetPage request contains two LSN values:
+// A GetPage request contains two LSN values:
 //
 // request_lsn: Get the page version at this point in time.  Lsn::Max is a special value that means
 // "get the latest version present". It's used by the primary server, which knows that no one else
@@ -1076,7 +1076,7 @@ impl TryFrom<u8> for PagestreamBeMessageTag {
 // passing an earlier LSN can speed up the request, by allowing the pageserver to process the
 // request without waiting for 'request_lsn' to arrive.
 //
-// The legacy V1 interface contained only one LSN, and a boolean 'latest' flag. The V1 interface was
+// The now-defunct V1 interface contained only one LSN, and a boolean 'latest' flag. The V1 interface was
 // sufficient for the primary; the 'lsn' was equivalent to the 'not_modified_since' value, and
 // 'latest' was set to true. The V2 interface was added because there was no correct way for a
 // standby to request a page at a particular non-latest LSN, and also include the
@@ -1084,15 +1084,11 @@ impl TryFrom<u8> for PagestreamBeMessageTag {
 // request, if the standby knows that the page hasn't been modified since, and risk getting an error
 // if that LSN has fallen behind the GC horizon, or requesting the current replay LSN, which could
 // require the pageserver unnecessarily to wait for the WAL to arrive up to that point. The new V2
-// interface allows sending both LSNs, and let the pageserver do the right thing. There is no
+// interface allows sending both LSNs, and let the pageserver do the right thing. There was no
 // difference in the responses between V1 and V2.
 //
-// The Request structs below reflect the V2 interface. If V1 is used, the parse function
-// maps the old format requests to the new format.
-//
 #[derive(Clone, Copy)]
 pub enum PagestreamProtocolVersion {
-    V1,
     V2,
 }
 
@@ -1231,36 +1227,17 @@ impl PagestreamFeMessage {
         bytes.into()
     }
 
-    pub fn parse<R: std::io::Read>(
-        body: &mut R,
-        protocol_version: PagestreamProtocolVersion,
-    ) -> anyhow::Result<PagestreamFeMessage> {
+    pub fn parse<R: std::io::Read>(body: &mut R) -> anyhow::Result<PagestreamFeMessage> {
         // these correspond to the NeonMessageTag enum in pagestore_client.h
         //
         // TODO: consider using protobuf or serde bincode for less error prone
         // serialization.
         let msg_tag = body.read_u8()?;
 
-        let (request_lsn, not_modified_since) = match protocol_version {
-            PagestreamProtocolVersion::V2 => (
-                Lsn::from(body.read_u64::<BigEndian>()?),
-                Lsn::from(body.read_u64::<BigEndian>()?),
-            ),
-            PagestreamProtocolVersion::V1 => {
-                // In the old protocol, each message starts with a boolean 'latest' flag,
-                // followed by 'lsn'. Convert that to the two LSNs, 'request_lsn' and
-                // 'not_modified_since', used in the new protocol version.
-                let latest = body.read_u8()? != 0;
-                let request_lsn = Lsn::from(body.read_u64::<BigEndian>()?);
-                if latest {
-                    (Lsn::MAX, request_lsn) // get latest version
-                } else {
-                    (request_lsn, request_lsn) // get version at specified LSN
-                }
-            }
-        };
+        // these two fields are the same for every request type
+        let request_lsn = Lsn::from(body.read_u64::<BigEndian>()?);
+        let not_modified_since = Lsn::from(body.read_u64::<BigEndian>()?);
 
-        // The rest of the messages are the same between V1 and V2
         match msg_tag {
             0 => Ok(PagestreamFeMessage::Exists(PagestreamExistsRequest {
                 request_lsn,
@@ -1468,9 +1445,7 @@ mod tests {
         ];
         for msg in messages {
             let bytes = msg.serialize();
-            let reconstructed =
-                PagestreamFeMessage::parse(&mut bytes.reader(), PagestreamProtocolVersion::V2)
-                    .unwrap();
+            let reconstructed = PagestreamFeMessage::parse(&mut bytes.reader()).unwrap();
             assert!(msg == reconstructed);
         }
     }
diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs
index 1f8634df93..c4011d593c 100644
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -1552,7 +1552,6 @@ pub(crate) static LIVE_CONNECTIONS: Lazy<IntCounterPairVec> = Lazy::new(|| {
 #[derive(Clone, Copy, enum_map::Enum, IntoStaticStr)]
 pub(crate) enum ComputeCommandKind {
     PageStreamV2,
-    PageStream,
     Basebackup,
     Fullbackup,
     LeaseLsn,
diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs
index 81294291a9..cb1ab70147 100644
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
@@ -557,7 +557,7 @@ impl PageServerHandler {
         pgb: &mut PostgresBackend<IO>,
         tenant_id: TenantId,
         timeline_id: TimelineId,
-        protocol_version: PagestreamProtocolVersion,
+        _protocol_version: PagestreamProtocolVersion,
         ctx: RequestContext,
     ) -> Result<(), QueryError>
     where
@@ -601,8 +601,7 @@ impl PageServerHandler {
             fail::fail_point!("ps::handle-pagerequest-message");
 
             // parse request
-            let neon_fe_msg =
-                PagestreamFeMessage::parse(&mut copy_data_bytes.reader(), protocol_version)?;
+            let neon_fe_msg = PagestreamFeMessage::parse(&mut copy_data_bytes.reader())?;
 
             // invoke handler function
             let (handler_result, span) = match neon_fe_msg {
@@ -1275,35 +1274,6 @@ where
                 ctx,
             )
             .await?;
-        } else if let Some(params) = parts.strip_prefix(&["pagestream"]) {
-            if params.len() != 2 {
-                return Err(QueryError::Other(anyhow::anyhow!(
-                    "invalid param number for pagestream command"
-                )));
-            }
-            let tenant_id = TenantId::from_str(params[0])
-                .with_context(|| format!("Failed to parse tenant id from {}", params[0]))?;
-            let timeline_id = TimelineId::from_str(params[1])
-                .with_context(|| format!("Failed to parse timeline id from {}", params[1]))?;
-
-            tracing::Span::current()
-                .record("tenant_id", field::display(tenant_id))
-                .record("timeline_id", field::display(timeline_id));
-
-            self.check_permission(Some(tenant_id))?;
-
-            COMPUTE_COMMANDS_COUNTERS
-                .for_command(ComputeCommandKind::PageStream)
-                .inc();
-
-            self.handle_pagerequests(
-                pgb,
-                tenant_id,
-                timeline_id,
-                PagestreamProtocolVersion::V1,
-                ctx,
-            )
-            .await?;
         } else if let Some(params) = parts.strip_prefix(&["basebackup"]) {
             if params.len() < 2 {
                 return Err(QueryError::Other(anyhow::anyhow!(
diff --git a/pgxn/neon/libpagestore.c b/pgxn/neon/libpagestore.c
index 73a001b6ba..5126c26c5d 100644
--- a/pgxn/neon/libpagestore.c
+++ b/pgxn/neon/libpagestore.c
@@ -550,9 +550,6 @@ pageserver_connect(shardno_t shard_no, int elevel)
 		case 2:
 			pagestream_query = psprintf("pagestream_v2 %s %s", neon_tenant, neon_timeline);
 			break;
-		case 1:
-			pagestream_query = psprintf("pagestream %s %s", neon_tenant, neon_timeline);
-			break;
 		default:
 			elog(ERROR, "unexpected neon_protocol_version %d", neon_protocol_version);
 		}
@@ -1063,7 +1060,7 @@ pg_init_libpagestore(void)
 							NULL,
 							&neon_protocol_version,
 							2, /* use protocol version 2 */
-							1, /* min */
+							2, /* min */
 							2, /* max */
 							PGC_SU_BACKEND,
 							0,	/* no flags required */
diff --git a/pgxn/neon/pagestore_client.h b/pgxn/neon/pagestore_client.h
index 8951e6607b..1f196d016c 100644
--- a/pgxn/neon/pagestore_client.h
+++ b/pgxn/neon/pagestore_client.h
@@ -87,9 +87,8 @@ typedef enum {
  * can skip traversing through recent layers which we know to not contain any
  * versions for the requested page.
  *
- * These structs describe the V2 of these requests. The old V1 protocol contained
- * just one LSN and a boolean 'latest' flag. If the neon_protocol_version GUC is
- * set to 1, we will convert these to the V1 requests before sending.
+ * These structs describe the V2 of these requests. (The old now-defunct V1
+ * protocol contained just one LSN and a boolean 'latest' flag.)
  */
 typedef struct
 {
diff --git a/pgxn/neon/pagestore_smgr.c b/pgxn/neon/pagestore_smgr.c
index 8edaf65639..7f39c7d026 100644
--- a/pgxn/neon/pagestore_smgr.c
+++ b/pgxn/neon/pagestore_smgr.c
@@ -1001,51 +1001,10 @@ nm_pack_request(NeonRequest *msg)
 
 	initStringInfo(&s);
 
-	if (neon_protocol_version >= 2)
-	{
-		pq_sendbyte(&s, msg->tag);
-		pq_sendint64(&s, msg->lsn);
-		pq_sendint64(&s, msg->not_modified_since);
-	}
-	else
-	{
-		bool		latest;
-		XLogRecPtr	lsn;
+	pq_sendbyte(&s, msg->tag);
+	pq_sendint64(&s, msg->lsn);
+	pq_sendint64(&s, msg->not_modified_since);
 
-		/*
-		 * In primary, we always request the latest page version.
-		 */
-		if (!RecoveryInProgress())
-		{
-			latest = true;
-			lsn = msg->not_modified_since;
-		}
-		else
-		{
-			/*
-			 * In the protocol V1, we cannot represent that we want to read
-			 * page at LSN X, and we know that it hasn't been modified since
-			 * Y. We can either use 'not_modified_lsn' as the request LSN, and
-			 * risk getting an error if that LSN is too old and has already
-			 * fallen out of the pageserver's GC horizon, or we can send
-			 * 'request_lsn', causing the pageserver to possibly wait for the
-			 * recent WAL to arrive unnecessarily. Or something in between. We
-			 * choose to use the old LSN and risk GC errors, because that's
-			 * what we've done historically.
-			 */
-			latest = false;
-			lsn = msg->not_modified_since;
-		}
-
-		pq_sendbyte(&s, msg->tag);
-		pq_sendbyte(&s, latest);
-		pq_sendint64(&s, lsn);
-	}
-
-	/*
-	 * The rest of the request messages are the same between protocol V1 and
-	 * V2
-	 */
 	switch (messageTag(msg))
 	{
 			/* pagestore_client -> pagestore */
diff --git a/test_runner/regress/test_auth.py b/test_runner/regress/test_auth.py
index 7cb85e3dd1..780c0e1602 100644
--- a/test_runner/regress/test_auth.py
+++ b/test_runner/regress/test_auth.py
@@ -211,7 +211,7 @@ def test_auth_failures(neon_env_builder: NeonEnvBuilder, auth_enabled: bool):
     def check_pageserver(expect_success: bool, **conn_kwargs):
         check_connection(
             env.pageserver,
-            f"pagestream {env.initial_tenant} {env.initial_timeline}",
+            f"pagestream_v2 {env.initial_tenant} {env.initial_timeline}",
             expect_success,
             **conn_kwargs,
         )
diff --git a/test_runner/regress/test_read_validation.py b/test_runner/regress/test_read_validation.py
index d128c60a99..1ac881553f 100644
--- a/test_runner/regress/test_read_validation.py
+++ b/test_runner/regress/test_read_validation.py
@@ -19,11 +19,6 @@ def test_read_validation(neon_simple_env: NeonEnv):
 
     endpoint = env.endpoints.create_start(
         "test_read_validation",
-        # Use protocol version 2, because the code that constructs the V1 messages
-        # assumes that a primary always wants to read the latest version of a page,
-        # and therefore doesn't work with the test functions below to read an older
-        # page version.
-        config_lines=["neon.protocol_version=2"],
     )
 
     with closing(endpoint.connect()) as con:
@@ -142,11 +137,6 @@ def test_read_validation_neg(neon_simple_env: NeonEnv):
 
     endpoint = env.endpoints.create_start(
         "test_read_validation_neg",
-        # Use protocol version 2, because the code that constructs the V1 messages
-        # assumes that a primary always wants to read the latest version of a page,
-        # and therefore doesn't work with the test functions below to read an older
-        # page version.
-        config_lines=["neon.protocol_version=2"],
     )
 
     with closing(endpoint.connect()) as con:

From c5ef779801f58b7faac96db123a1c4e3b6388678 Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki@neon.tech>
Date: Tue, 27 Aug 2024 19:47:05 +0300
Subject: [PATCH 042/159] tests: Remove unnecessary entries from list of
 allowed errors (#8199)

The "manual_gc" context was removed in commit be0c73f8e7. The code that
generated the other error was removed in commit 9a6c0be823.
---
 test_runner/fixtures/pageserver/allowed_errors.py | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/test_runner/fixtures/pageserver/allowed_errors.py b/test_runner/fixtures/pageserver/allowed_errors.py
index dff002bd4b..f8d9a51c91 100755
--- a/test_runner/fixtures/pageserver/allowed_errors.py
+++ b/test_runner/fixtures/pageserver/allowed_errors.py
@@ -52,9 +52,6 @@ DEFAULT_PAGESERVER_ALLOWED_ERRORS = (
     ".*Error processing HTTP request: Forbidden",
     # intentional failpoints
     ".*failpoint ",
-    # FIXME: These need investigation
-    ".*manual_gc.*is_shutdown_requested\\(\\) called in an unexpected task or thread.*",
-    ".*tenant_list: timeline is not found in remote index while it is present in the tenants registry.*",
     # Tenant::delete_timeline() can cause any of the four following errors.
     # FIXME: we shouldn't be considering it an error: https://github.com/neondatabase/neon/issues/2946
     ".*could not flush frozen layer.*queue is in state Stopped",  # when schedule layer upload fails because queued got closed before compaction got killed

From 992a951b5e989b27d2aee118c9180c1df0b7483d Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Wed, 28 Aug 2024 09:22:19 +0100
Subject: [PATCH 043/159] .github: direct feature requests to the feedback form
 (#8849)

## Problem

When folks open github issues for feature requests, they don't have a
clear recipient: engineers usually see them during bug triage, but that
doesn't necessarily get the work prioritized.

## Summary of changes

Give end users a clearer path to submitting feedback to Neon
---
 .github/ISSUE_TEMPLATE/config.yml | 6 ++++++
 1 file changed, 6 insertions(+)
 create mode 100644 .github/ISSUE_TEMPLATE/config.yml

diff --git a/.github/ISSUE_TEMPLATE/config.yml b/.github/ISSUE_TEMPLATE/config.yml
new file mode 100644
index 0000000000..c8fd1209de
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/config.yml
@@ -0,0 +1,6 @@
+
+blank_issues_enabled: true
+contact_links:
+  - name: Feature request
+    url: https://console.neon.tech/app/projects?modal=feedback
+    about: For feature requests in the Neon product, please submit via the feedback form on `https://console.neon.tech`

From c0ba18a112668438e5f2de5ae04d369c48976200 Mon Sep 17 00:00:00 2001
From: Joonas Koivunen <joonas@neon.tech>
Date: Wed, 28 Aug 2024 12:20:43 +0300
Subject: [PATCH 044/159] bench: flush before shutting down (#8844)

while driving by:
- remove the extra tenant
- remove the extra timelines

implement this by turning the pg_compare to a yielding fixture.

evidence:
https://neon-github-public-dev.s3.amazonaws.com/reports/main/10571779162/index.html#suites/9681106e61a1222669b9d22ab136d07b/3bbe9f007b3ffae1/
---
 test_runner/fixtures/compare_fixtures.py      | 16 ++++------------
 .../performance/test_wal_backpressure.py      | 19 ++++++++++++-------
 2 files changed, 16 insertions(+), 19 deletions(-)

diff --git a/test_runner/fixtures/compare_fixtures.py b/test_runner/fixtures/compare_fixtures.py
index 5fe544b3bd..98a9dd7184 100644
--- a/test_runner/fixtures/compare_fixtures.py
+++ b/test_runner/fixtures/compare_fixtures.py
@@ -102,7 +102,6 @@ class NeonCompare(PgCompare):
         zenbenchmark: NeonBenchmarker,
         neon_simple_env: NeonEnv,
         pg_bin: PgBin,
-        branch_name: str,
     ):
         self.env = neon_simple_env
         self._zenbenchmark = zenbenchmark
@@ -110,16 +109,11 @@ class NeonCompare(PgCompare):
         self.pageserver_http_client = self.env.pageserver.http_client()
 
         # note that neon_simple_env now uses LOCAL_FS remote storage
-
-        # Create tenant
-        tenant_conf: Dict[str, str] = {}
-        self.tenant, _ = self.env.neon_cli.create_tenant(conf=tenant_conf)
-
-        # Create timeline
-        self.timeline = self.env.neon_cli.create_timeline(branch_name, tenant_id=self.tenant)
+        self.tenant = self.env.initial_tenant
+        self.timeline = self.env.initial_timeline
 
         # Start pg
-        self._pg = self.env.endpoints.create_start(branch_name, "main", self.tenant)
+        self._pg = self.env.endpoints.create_start("main", "main", self.tenant)
 
     @property
     def pg(self) -> PgProtocol:
@@ -297,13 +291,11 @@ class RemoteCompare(PgCompare):
 
 @pytest.fixture(scope="function")
 def neon_compare(
-    request: FixtureRequest,
     zenbenchmark: NeonBenchmarker,
     pg_bin: PgBin,
     neon_simple_env: NeonEnv,
 ) -> NeonCompare:
-    branch_name = request.node.name
-    return NeonCompare(zenbenchmark, neon_simple_env, pg_bin, branch_name)
+    return NeonCompare(zenbenchmark, neon_simple_env, pg_bin)
 
 
 @pytest.fixture(scope="function")
diff --git a/test_runner/performance/test_wal_backpressure.py b/test_runner/performance/test_wal_backpressure.py
index 513ebc74c3..c824e60c29 100644
--- a/test_runner/performance/test_wal_backpressure.py
+++ b/test_runner/performance/test_wal_backpressure.py
@@ -2,14 +2,14 @@ import statistics
 import threading
 import time
 import timeit
-from typing import Any, Callable, List
+from typing import Any, Callable, Generator, List
 
 import pytest
 from fixtures.benchmark_fixture import MetricReport, NeonBenchmarker
 from fixtures.common_types import Lsn
 from fixtures.compare_fixtures import NeonCompare, PgCompare, VanillaCompare
 from fixtures.log_helper import log
-from fixtures.neon_fixtures import DEFAULT_BRANCH_NAME, NeonEnvBuilder, PgBin
+from fixtures.neon_fixtures import NeonEnvBuilder, PgBin, flush_ep_to_pageserver
 
 from performance.test_perf_pgbench import get_durations_matrix, get_scales_matrix
 
@@ -20,7 +20,7 @@ from performance.test_perf_pgbench import get_durations_matrix, get_scales_matri
 # For example, to build a `NeonCompare` interface, the corresponding fixture's param should have
 # a format of `neon_{safekeepers_enable_fsync}`.
 # Note that, here "_" is used to separate builder parameters.
-def pg_compare(request) -> PgCompare:
+def pg_compare(request) -> Generator[PgCompare, None, None]:
     x = request.param.split("_")
 
     if x[0] == "vanilla":
@@ -28,7 +28,7 @@ def pg_compare(request) -> PgCompare:
         fixture = request.getfixturevalue("vanilla_compare")
         assert isinstance(fixture, VanillaCompare)
 
-        return fixture
+        yield fixture
     else:
         assert (
             len(x) == 2
@@ -47,10 +47,15 @@ def pg_compare(request) -> PgCompare:
         neon_env_builder.safekeepers_enable_fsync = x[1] == "on"
 
         env = neon_env_builder.init_start()
-        env.neon_cli.create_branch("empty", ancestor_branch_name=DEFAULT_BRANCH_NAME)
 
-        branch_name = request.node.name
-        return NeonCompare(zenbenchmark, env, pg_bin, branch_name)
+        cmp = NeonCompare(zenbenchmark, env, pg_bin)
+
+        yield cmp
+
+        flush_ep_to_pageserver(env, cmp._pg, cmp.tenant, cmp.timeline)
+        env.pageserver.http_client().timeline_checkpoint(
+            cmp.tenant, cmp.timeline, compact=False, wait_until_uploaded=True
+        )
 
 
 def start_heavy_write_workload(env: PgCompare, n_tables: int, scale: int, num_iters: int):

From 5eb7322d08e93912653dd6ba02a4507e80c50aec Mon Sep 17 00:00:00 2001
From: Vlad Lazar <vlad@neon.tech>
Date: Wed, 28 Aug 2024 14:56:14 +0100
Subject: [PATCH 045/159] docs: rolling storage controller restarts RFC (#8310)

## Problem
Storage controller upgrades (restarts, more generally) can cause
multi-second availability gaps.
While the storage controller does not sit on the main data path, it's
generally not acceptable
to block management requests for extended periods of time (e.g.
https://github.com/neondatabase/neon/issues/8034).

## Summary of changes
This RFC describes the issues around the current storage controller
restart procedure
and describes an implementation which reduces downtime to a few
milliseconds on the happy path.

Related https://github.com/neondatabase/neon/issues/7797
---
 docs/rfcs/037-storage-controller-restarts.md | 259 +++++++++++++++++++
 1 file changed, 259 insertions(+)
 create mode 100644 docs/rfcs/037-storage-controller-restarts.md

diff --git a/docs/rfcs/037-storage-controller-restarts.md b/docs/rfcs/037-storage-controller-restarts.md
new file mode 100644
index 0000000000..bad422344f
--- /dev/null
+++ b/docs/rfcs/037-storage-controller-restarts.md
@@ -0,0 +1,259 @@
+# Rolling Storage Controller Restarts
+
+## Summary
+
+This RFC describes the issues around the current storage controller restart procedure
+and describes an implementation which reduces downtime to a few milliseconds on the happy path.
+
+## Motivation
+
+Storage controller upgrades (restarts, more generally) can cause multi-second availability gaps.
+While the storage controller does not sit on the main data path, it's generally not acceptable
+to block management requests for extended periods of time (e.g. https://github.com/neondatabase/neon/issues/8034).
+
+### Current Implementation
+
+The storage controller runs in a Kubernetes Deployment configured for one replica and strategy set to [Recreate](https://kubernetes.io/docs/concepts/workloads/controllers/deployment/#recreate-deployment).
+In non Kubernetes terms, during an upgrade, the currently running storage controller is stopped and, only after,
+a new instance is created.
+
+At start-up, the storage controller calls into all the pageservers it manages (retrieved from DB) to learn the
+latest locations of all tenant shards present on them. This is usually fast, but can push into tens of seconds
+under unfavourable circumstances: pageservers are heavily loaded or unavailable.
+
+## Prior Art
+
+There's probably as many ways of handling restarts gracefully as there are distributed systems. Some examples include:
+* Active/Standby architectures: Two or more instance of the same service run, but traffic is only routed to one of them.
+For fail-over, traffic is routed to one of the standbys (which becomes active).
+* Consensus Algorithms (Raft, Paxos and friends): The part of consensus we care about here is leader election: peers communicate to each other
+and use a voting scheme that ensures the existence of a single leader (e.g. Raft epochs).
+
+## Requirements
+
+* Reduce storage controller unavailability during upgrades to milliseconds
+* Minimize the interval in which it's possible for more than one storage controller
+to issue reconciles.
+* Have one uniform implementation for restarts and upgrades
+* Fit in with the current Kubernetes deployment scheme
+
+## Non Goals
+
+* Implement our own consensus algorithm from scratch
+* Completely eliminate downtime storage controller downtime. Instead we aim to reduce it to the point where it looks
+like a transient error to the control plane
+
+## Impacted Components
+
+* storage controller
+* deployment orchestration (i.e. Ansible)
+* helm charts
+
+## Terminology
+
+* Observed State: in-memory mapping between tenant shards and their current pageserver locations - currently built up
+at start-up by quering pageservers
+* Deployment: Kubernetes [primitive](https://kubernetes.io/docs/concepts/workloads/controllers/deployment/) that models
+a set of replicas
+
+## Implementation
+
+### High Level Flow
+
+At a very high level the proposed idea is to start a new storage controller instance while
+the previous one is still running and cut-over to it when it becomes ready. The new instance,
+should coordinate with the existing one and transition responsibility gracefully. While the controller
+has built in safety against split-brain situations (via generation numbers), we'd like to avoid such
+scenarios since they can lead to availability issues for tenants that underwent changes while two controllers
+were operating at the same time and require operator intervention to remedy.
+
+### Kubernetes Deployment Configuration
+
+On the Kubernetes configuration side, the proposal is to update the storage controller `Deployment`
+to use `spec.strategy.type = RollingUpdate`, `spec.strategy.rollingUpdate.maxSurge=1` and `spec.strategy.maxUnavailable=0`.
+Under the hood, Kubernetes creates a new replica set and adds one pod to it (`maxSurge=1`). The old replica set does not
+scale down until the new replica set has one replica in the ready state (`maxUnavailable=0`).
+
+The various possible failure scenarios are investigated in the [Handling Failures](#handling-failures) section.
+
+### Storage Controller Start-Up
+
+This section describes the primitives required on the storage controller side and the flow of the happy path.
+
+#### Database Table For Leader Synchronization
+
+A new table should be added to the storage controller database for leader synchronization during startup.
+This table will always contain at most one row. The proposed name for the table is `leader` and the schema
+contains two elements:
+* `hostname`: represents the hostname for the current storage controller leader - should be addressible
+from other pods in the deployment
+* `start_timestamp`: holds the start timestamp for the current storage controller leader (UTC timezone) - only required
+for failure case handling: see [Previous Leader Crashes Before New Leader Readiness](#previous-leader-crashes-before-new-leader-readiness)
+
+Storage controllers will read the leader row at start-up and then update it to mark themselves as the leader
+at the end of the start-up sequence. We want compare-and-exchange semantics for the update: avoid the
+situation where two concurrent updates succeed and overwrite each other. The default Postgres isolation
+level is `READ COMMITTED`, which isn't strict enough here. This update transaction should use at least `REPEATABLE
+READ` isolation level in order to [prevent lost updates](https://www.interdb.jp/pg/pgsql05/08.html). Currently,
+the storage controller uses the stricter `SERIALIZABLE` isolation level for all transactions. This more than suits
+our needs here.
+
+```
+START TRANSACTION ISOLATION LEVEL REPEATABLE READ
+UPDATE leader SET hostname=<new_hostname>, start_timestamp=<new_start_ts>
+WHERE hostname=<old_hostname>, start_timestampt=<old_start_ts>;
+```
+
+If the transaction fails or if no rows have been updated, then the compare-and-exchange is regarded as a failure.
+
+#### Step Down API
+
+A new HTTP endpoint should be added to the storage controller: `POST /control/v1/step_down`. Upon receiving this
+request the leader cancels any pending reconciles and goes into a mode where it replies with 503 to all other APIs
+and does not issue any location configurations to its pageservers. The successful HTTP response will return a serialized
+snapshot of the observed state.
+
+If other step down requests come in after the initial one, the request is handled and the observed state is returned (required
+for failure scenario handling - see [Handling Failures](#handling-failures)).
+
+#### Graceful Restart Happy Path
+
+At start-up, the first thing the storage controller does is retrieve the sole row from the new
+`leader` table. If such an entry exists, send a `/step_down` PUT API call to the current leader.
+This should be retried a few times with a short backoff (see [1]). The aspiring leader loads the
+observed state into memory and the start-up sequence proceeds as usual, but *without* querying the
+pageservers in order to build up the observed state.
+
+Before doing any reconciliations or persistence change, update the `leader` database table as described in the [Database Table For Leader Synchronization](database-table-for-leader-synchronization)
+section. If this step fails, the storage controller process exits.
+
+Note that no row will exist in the `leaders` table for the first graceful restart. In that case, force update the `leader` table
+(without the WHERE clause) and perform with the pre-existing start-up procedure (i.e. build observed state by querying pageservers).
+
+Summary of proposed new start-up sequence:
+1. Call `/step_down`
+2. Perform any pending database migrations
+3. Load state from database
+4. Load observed state returned in step (1) into memory
+5. Do initial heartbeat round (may be moved after 5)
+7. Mark self as leader by updating the database
+8. Reschedule and reconcile everything
+
+Some things to note from the steps above:
+* The storage controller makes no changes to the cluster state before step (5) (i.e. no location config
+calls to the pageserver and no compute notifications)
+* Ask the current leader to step down before loading state from database so we don't get a lost update
+if the transactions overlap.
+* Before loading the observed state at step (3), cross-validate against the database. If validation fails,
+fall back to asking the pageservers about their current locations.
+* Database migrations should only run **after** the previous instance steps down (or the step down times out).
+
+
+[1] The API call might fail because there's no storage controller running (i.e. [restart](#storage-controller-crash-or-restart)),
+so we don't want to extend the unavailability period by much. We still want to retry since that's not the common case.
+
+### Handling Failures
+
+#### Storage Controller Crash Or Restart
+
+The storage controller may crash or be restarted outside of roll-outs. When a new pod is created, its call to
+`/step_down` will fail since the previous leader is no longer reachable. In this case perform the pre-existing
+start-up procedure and update the leader table (with the WHERE clause). If the update fails, the storage controller
+exists and consistency is maintained.
+
+#### Previous Leader Crashes Before New Leader Readiness
+
+When the previous leader (P1) crashes before the new leader (P2) passses the readiness check, Kubernetes will
+reconcile the old replica set and create a new pod for it (P1'). The `/step_down` API call will fail for P1'
+(see [2]).
+
+Now we have two cases to consider:
+* P2 updates the `leader` table first: The database update from P1' will fail and P1' will exit, or be terminated
+by Kubernetes depending on timings.
+* P1' updates the `leader` table first: The `hostname` field of the `leader` row stays the same, but the `start_timestamp` field changes.
+The database update from P2 will fail (since `start_timestamp` does not match). P2 will exit and Kubernetes will
+create a new replacement pod for it (P2'). Now the entire dance starts again, but with P1' as the leader and P2' as the incumbent.
+
+[2] P1 and P1' may (more likely than not) be the same pod and have the same hostname. The implementation
+should avoid this self reference and fail the API call at the client if the persisted hostname matches
+the current one.
+
+#### Previous Leader Crashes After New Leader Readiness
+
+The deployment's replica sets already satisfy the deployment's replica count requirements and the
+Kubernetes deployment rollout will just clean up the dead pod.
+
+#### New Leader Crashes Before Pasing Readiness Check
+
+The deployment controller scales up the new replica sets by creating a new pod. The entire procedure is repeated
+with the new pod.
+
+#### Network Partition Between New Pod and Previous Leader
+
+This feels very unlikely, but should be considered in any case. P2 (the new aspiring leader) fails the `/step_down`
+API call into P1 (the current leader). P2 proceeds with the pre-existing startup procedure and updates the `leader` table.
+Kubernetes will terminate P1, but there may be a brief period where both storage controller can drive reconciles.
+
+### Dealing With Split Brain Scenarios
+
+As we've seen in the previous section, we can end up with two storage controller running at the same time. The split brain
+duration is not bounded since the Kubernetes controller might become partitioned from the pods (unlikely though). While these
+scenarios are not fatal, they can cause tenant unavailability, so we'd like to reduce the chances of this happening.
+The rest of this section sketches some safety measure. It's likely overkill to implement all of them however.
+
+### Ensure Leadership Before Producing Side Effects
+
+The storage controller has two types of side effects: location config requests into pageservers and compute notifications into the control plane.
+Before issuing either, the storage controller could check that it is indeed still the leader by querying the database. Side effects might still be
+applied if they race with the database updatem, but the situation will eventually be detected. The storage controller process should terminate in these cases.
+
+### Leadership Lease
+
+Up until now, the leadership defined by this RFC is static. In order to bound the length of the split brain scenario, we could require the leadership
+to be renewed periodically. Two new columns would be added to the leaders table:
+1. `last_renewed` - timestamp indicating when the lease was last renewed
+2. `lease_duration` - duration indicating the amount of time after which the lease expires
+
+The leader periodically attempts to renew the lease by checking that it is in fact still the legitimate leader and updating `last_renewed` in the
+same transaction. If the update fails, the process exits. New storage controller instances wishing to become leaders must wait for the current lease
+to expire before acquiring leadership if they have not succesfully received a response to the `/step_down` request.
+
+### Notify Pageserver Of Storage Controller Term
+
+Each time that leadership changes, we can bump a `term` integer column in the `leader` table. This term uniquely identifies a leader.
+Location config requests and re-attach responses can include this term. On the pageserver side, keep the latest term in memory and refuse
+anything which contains a stale term (i.e. smaller than the current one).
+
+### Observability
+
+* The storage controller should expose a metric which describes it's state (`Active | WarmingUp | SteppedDown`).
+Per region alerts should be added on this metric which triggers when:
+  + no storage controller has been in the `Active` state for an extended period of time
+  + more than one storage controllers are in the `Active` state
+
+* An alert that periodically verifies that the `leader` table is in sync with the metric above would be very useful.
+We'd have to expose the storage controller read only database to Grafana (perhaps it is already done).
+
+## Alternatives
+
+### Kubernetes Leases
+
+Kubernetes has a [lease primitive](https://kubernetes.io/docs/concepts/architecture/leases/) which can be used to implement leader election.
+Only one instance may hold a lease at any given time. This lease needs to be periodically renewed and has an expiration period.
+
+In our case, it would work something like this:
+* `/step_down` deletes the lease or stops it from renewing
+* lease acquisition becomes part of the start-up procedure
+
+The kubert crate implements a [lightweight lease API](https://docs.rs/kubert/latest/kubert/lease/struct.LeaseManager.html), but it's still
+not exactly trivial to implement.
+
+This approach has the benefit of baked in observability (`kubectl describe lease`), but:
+* We offload the responsibility to Kubernetes which makes it harder to debug when things go wrong.
+* More code surface than the simple "row in database" approach. Also, most of this code would be in
+a dependency not subject to code review, etc.
+* Hard to test. Our testing infra does not run the storage controller in Kubernetes and changing it do
+so is not simple and complictes and the test set-up.
+
+To my mind, the "row in database" approach is straightforward enough that we don't have to offload this
+to something external.

From a889a49e06101a91d548eb66d3ba1c0d89d7fb53 Mon Sep 17 00:00:00 2001
From: Yuchen Liang <70461588+yliang412@users.noreply.github.com>
Date: Wed, 28 Aug 2024 10:54:42 -0400
Subject: [PATCH 046/159] pageserver: do vectored read on each dio-aligned
 section once (#8763)

Part of #8130, closes #8719.

## Problem

Currently, vectored blob io only coalesce blocks if they are immediately
adjacent to each other. When we switch to Direct IO, we need a way to
coalesce blobs that are within the dio-aligned boundary but has gap
between them.

## Summary of changes

- Introduces a `VectoredReadCoalesceMode` for `VectoredReadPlanner` and
`StreamingVectoredReadPlanner` which has two modes:
  - `AdjacentOnly` (current implementation)
  - `Chunked(<alignment requirement>)`
- New `ChunkedVectorBuilder` that considers batching `dio-align`-sized
read, the start and end of the vectored read will respect
`stx_dio_offset_align` / `stx_dio_mem_align` (`vectored_read.start` and
`vectored_read.blobs_at.first().start_offset` will be two different
value).
- Since we break the assumption that blobs within single `VectoredRead`
are next to each other (implicit end offset), we start to store blob end
offsets in the `VectoredRead`.
- Adapted existing tests to run in both `VectoredReadCoalesceMode`.
- The io alignment can also be live configured at runtime.

Signed-off-by: Yuchen Liang <yuchen@neon.tech>
---
 .github/workflows/_build-and-test-locally.yml |   4 +-
 pageserver/benches/bench_ingest.rs            |   8 +-
 pageserver/client/src/mgmt_api.rs             |  10 +
 pageserver/ctl/src/layer_map_analyzer.rs      |   7 +-
 pageserver/ctl/src/layers.rs                  |   9 +-
 pageserver/ctl/src/main.rs                    |   7 +-
 .../pagebench/src/cmd/getpage_latest_lsn.rs   |   9 +
 pageserver/src/bin/pageserver.rs              |   7 +-
 pageserver/src/config.rs                      |  18 +
 pageserver/src/http/routes.rs                 |  17 +
 .../src/tenant/storage_layer/delta_layer.rs   |   6 +-
 .../src/tenant/storage_layer/image_layer.rs   |   2 +-
 pageserver/src/tenant/vectored_blob_io.rs     | 352 ++++++++++++++++--
 pageserver/src/virtual_file.rs                |  61 ++-
 test_runner/fixtures/neon_fixtures.py         |  10 +
 test_runner/fixtures/parametrize.py           |   5 +
 16 files changed, 480 insertions(+), 52 deletions(-)

diff --git a/.github/workflows/_build-and-test-locally.yml b/.github/workflows/_build-and-test-locally.yml
index 5e9fff0e6a..a8526fc6b1 100644
--- a/.github/workflows/_build-and-test-locally.yml
+++ b/.github/workflows/_build-and-test-locally.yml
@@ -217,7 +217,9 @@ jobs:
           ${cov_prefix} cargo test --doc $CARGO_FLAGS $CARGO_FEATURES
 
           for io_engine in std-fs tokio-epoll-uring ; do
-            NEON_PAGESERVER_UNIT_TEST_VIRTUAL_FILE_IOENGINE=$io_engine ${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_FEATURES
+            for io_buffer_alignment in 0 1 512 ; do
+              NEON_PAGESERVER_UNIT_TEST_VIRTUAL_FILE_IOENGINE=$io_engine NEON_PAGESERVER_UNIT_TEST_IO_BUFFER_ALIGNMENT=$io_buffer_alignment ${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_FEATURES
+            done
           done
 
           # Run separate tests for real S3
diff --git a/pageserver/benches/bench_ingest.rs b/pageserver/benches/bench_ingest.rs
index bd99f5289d..f450f46efa 100644
--- a/pageserver/benches/bench_ingest.rs
+++ b/pageserver/benches/bench_ingest.rs
@@ -4,7 +4,7 @@ use bytes::Bytes;
 use camino::Utf8PathBuf;
 use criterion::{criterion_group, criterion_main, Criterion};
 use pageserver::{
-    config::PageServerConf,
+    config::{defaults::DEFAULT_IO_BUFFER_ALIGNMENT, PageServerConf},
     context::{DownloadBehavior, RequestContext},
     l0_flush::{L0FlushConfig, L0FlushGlobalState},
     page_cache,
@@ -164,7 +164,11 @@ fn criterion_benchmark(c: &mut Criterion) {
     let conf: &'static PageServerConf = Box::leak(Box::new(
         pageserver::config::PageServerConf::dummy_conf(temp_dir.path().to_path_buf()),
     ));
-    virtual_file::init(16384, virtual_file::io_engine_for_bench());
+    virtual_file::init(
+        16384,
+        virtual_file::io_engine_for_bench(),
+        DEFAULT_IO_BUFFER_ALIGNMENT,
+    );
     page_cache::init(conf.page_cache_size);
 
     {
diff --git a/pageserver/client/src/mgmt_api.rs b/pageserver/client/src/mgmt_api.rs
index ac3ff1bb89..71d36f3113 100644
--- a/pageserver/client/src/mgmt_api.rs
+++ b/pageserver/client/src/mgmt_api.rs
@@ -506,6 +506,16 @@ impl Client {
             .map_err(Error::ReceiveBody)
     }
 
+    /// Configs io buffer alignment at runtime.
+    pub async fn put_io_alignment(&self, align: usize) -> Result<()> {
+        let uri = format!("{}/v1/io_alignment", self.mgmt_api_endpoint);
+        self.request(Method::PUT, uri, align)
+            .await?
+            .json()
+            .await
+            .map_err(Error::ReceiveBody)
+    }
+
     pub async fn get_utilization(&self) -> Result<PageserverUtilization> {
         let uri = format!("{}/v1/utilization", self.mgmt_api_endpoint);
         self.get(uri)
diff --git a/pageserver/ctl/src/layer_map_analyzer.rs b/pageserver/ctl/src/layer_map_analyzer.rs
index b4bb239f44..8092c203c3 100644
--- a/pageserver/ctl/src/layer_map_analyzer.rs
+++ b/pageserver/ctl/src/layer_map_analyzer.rs
@@ -4,6 +4,7 @@
 
 use anyhow::Result;
 use camino::{Utf8Path, Utf8PathBuf};
+use pageserver::config::defaults::DEFAULT_IO_BUFFER_ALIGNMENT;
 use pageserver::context::{DownloadBehavior, RequestContext};
 use pageserver::task_mgr::TaskKind;
 use pageserver::tenant::{TENANTS_SEGMENT_NAME, TIMELINES_SEGMENT_NAME};
@@ -144,7 +145,11 @@ pub(crate) async fn main(cmd: &AnalyzeLayerMapCmd) -> Result<()> {
     let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error);
 
     // Initialize virtual_file (file desriptor cache) and page cache which are needed to access layer persistent B-Tree.
-    pageserver::virtual_file::init(10, virtual_file::api::IoEngineKind::StdFs);
+    pageserver::virtual_file::init(
+        10,
+        virtual_file::api::IoEngineKind::StdFs,
+        DEFAULT_IO_BUFFER_ALIGNMENT,
+    );
     pageserver::page_cache::init(100);
 
     let mut total_delta_layers = 0usize;
diff --git a/pageserver/ctl/src/layers.rs b/pageserver/ctl/src/layers.rs
index 3611b0baab..a183a3968d 100644
--- a/pageserver/ctl/src/layers.rs
+++ b/pageserver/ctl/src/layers.rs
@@ -3,6 +3,7 @@ use std::path::{Path, PathBuf};
 use anyhow::Result;
 use camino::{Utf8Path, Utf8PathBuf};
 use clap::Subcommand;
+use pageserver::config::defaults::DEFAULT_IO_BUFFER_ALIGNMENT;
 use pageserver::context::{DownloadBehavior, RequestContext};
 use pageserver::task_mgr::TaskKind;
 use pageserver::tenant::block_io::BlockCursor;
@@ -59,7 +60,7 @@ pub(crate) enum LayerCmd {
 
 async fn read_delta_file(path: impl AsRef<Path>, ctx: &RequestContext) -> Result<()> {
     let path = Utf8Path::from_path(path.as_ref()).expect("non-Unicode path");
-    virtual_file::init(10, virtual_file::api::IoEngineKind::StdFs);
+    virtual_file::init(10, virtual_file::api::IoEngineKind::StdFs, 1);
     page_cache::init(100);
     let file = VirtualFile::open(path, ctx).await?;
     let file_id = page_cache::next_file_id();
@@ -189,7 +190,11 @@ pub(crate) async fn main(cmd: &LayerCmd) -> Result<()> {
             new_tenant_id,
             new_timeline_id,
         } => {
-            pageserver::virtual_file::init(10, virtual_file::api::IoEngineKind::StdFs);
+            pageserver::virtual_file::init(
+                10,
+                virtual_file::api::IoEngineKind::StdFs,
+                DEFAULT_IO_BUFFER_ALIGNMENT,
+            );
             pageserver::page_cache::init(100);
 
             let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error);
diff --git a/pageserver/ctl/src/main.rs b/pageserver/ctl/src/main.rs
index 3fabf62987..7a6c7675bb 100644
--- a/pageserver/ctl/src/main.rs
+++ b/pageserver/ctl/src/main.rs
@@ -20,6 +20,7 @@ use clap::{Parser, Subcommand};
 use index_part::IndexPartCmd;
 use layers::LayerCmd;
 use pageserver::{
+    config::defaults::DEFAULT_IO_BUFFER_ALIGNMENT,
     context::{DownloadBehavior, RequestContext},
     page_cache,
     task_mgr::TaskKind,
@@ -205,7 +206,11 @@ fn read_pg_control_file(control_file_path: &Utf8Path) -> anyhow::Result<()> {
 
 async fn print_layerfile(path: &Utf8Path) -> anyhow::Result<()> {
     // Basic initialization of things that don't change after startup
-    virtual_file::init(10, virtual_file::api::IoEngineKind::StdFs);
+    virtual_file::init(
+        10,
+        virtual_file::api::IoEngineKind::StdFs,
+        DEFAULT_IO_BUFFER_ALIGNMENT,
+    );
     page_cache::init(100);
     let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error);
     dump_layerfile_from_path(path, true, &ctx).await
diff --git a/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs b/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs
index 4992f37465..ac4a732377 100644
--- a/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs
+++ b/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs
@@ -58,6 +58,11 @@ pub(crate) struct Args {
     /// [`pageserver_api::models::virtual_file::IoEngineKind`].
     #[clap(long)]
     set_io_engine: Option<pageserver_api::models::virtual_file::IoEngineKind>,
+
+    /// Before starting the benchmark, live-reconfigure the pageserver to use specified alignment for io buffers.
+    #[clap(long)]
+    set_io_alignment: Option<usize>,
+
     targets: Option<Vec<TenantTimelineId>>,
 }
 
@@ -124,6 +129,10 @@ async fn main_impl(
         mgmt_api_client.put_io_engine(engine_str).await?;
     }
 
+    if let Some(align) = args.set_io_alignment {
+        mgmt_api_client.put_io_alignment(align).await?;
+    }
+
     // discover targets
     let timelines: Vec<TenantTimelineId> = crate::util::cli::targets::discover(
         &mgmt_api_client,
diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs
index 7d404e50a5..850bd87b95 100644
--- a/pageserver/src/bin/pageserver.rs
+++ b/pageserver/src/bin/pageserver.rs
@@ -125,6 +125,7 @@ fn main() -> anyhow::Result<()> {
     info!(?conf.virtual_file_io_engine, "starting with virtual_file IO engine");
     info!(?conf.virtual_file_direct_io, "starting with virtual_file Direct IO settings");
     info!(?conf.compact_level0_phase1_value_access, "starting with setting for compact_level0_phase1_value_access");
+    info!(?conf.io_buffer_alignment, "starting with setting for IO buffer alignment");
 
     // The tenants directory contains all the pageserver local disk state.
     // Create if not exists and make sure all the contents are durable before proceeding.
@@ -182,7 +183,11 @@ fn main() -> anyhow::Result<()> {
     let scenario = failpoint_support::init();
 
     // Basic initialization of things that don't change after startup
-    virtual_file::init(conf.max_file_descriptors, conf.virtual_file_io_engine);
+    virtual_file::init(
+        conf.max_file_descriptors,
+        conf.virtual_file_io_engine,
+        conf.io_buffer_alignment,
+    );
     page_cache::init(conf.page_cache_size);
 
     start_pageserver(launch_ts, conf).context("Failed to start pageserver")?;
diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs
index 0ebaf78840..ae473bcc5f 100644
--- a/pageserver/src/config.rs
+++ b/pageserver/src/config.rs
@@ -95,6 +95,8 @@ pub mod defaults {
 
     pub const DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB: usize = 0;
 
+    pub const DEFAULT_IO_BUFFER_ALIGNMENT: usize = 0;
+
     ///
     /// Default built-in configuration file.
     ///
@@ -289,6 +291,8 @@ pub struct PageServerConf {
 
     /// Direct IO settings
     pub virtual_file_direct_io: virtual_file::DirectIoMode,
+
+    pub io_buffer_alignment: usize,
 }
 
 /// We do not want to store this in a PageServerConf because the latter may be logged
@@ -393,6 +397,8 @@ struct PageServerConfigBuilder {
     compact_level0_phase1_value_access: BuilderValue<CompactL0Phase1ValueAccess>,
 
     virtual_file_direct_io: BuilderValue<virtual_file::DirectIoMode>,
+
+    io_buffer_alignment: BuilderValue<usize>,
 }
 
 impl PageServerConfigBuilder {
@@ -481,6 +487,7 @@ impl PageServerConfigBuilder {
             l0_flush: Set(L0FlushConfig::default()),
             compact_level0_phase1_value_access: Set(CompactL0Phase1ValueAccess::default()),
             virtual_file_direct_io: Set(virtual_file::DirectIoMode::default()),
+            io_buffer_alignment: Set(DEFAULT_IO_BUFFER_ALIGNMENT),
         }
     }
 }
@@ -660,6 +667,10 @@ impl PageServerConfigBuilder {
         self.virtual_file_direct_io = BuilderValue::Set(value);
     }
 
+    pub fn io_buffer_alignment(&mut self, value: usize) {
+        self.io_buffer_alignment = BuilderValue::Set(value);
+    }
+
     pub fn build(self, id: NodeId) -> anyhow::Result<PageServerConf> {
         let default = Self::default_values();
 
@@ -716,6 +727,7 @@ impl PageServerConfigBuilder {
                 l0_flush,
                 compact_level0_phase1_value_access,
                 virtual_file_direct_io,
+                io_buffer_alignment,
             }
             CUSTOM LOGIC
             {
@@ -985,6 +997,9 @@ impl PageServerConf {
                 "virtual_file_direct_io" => {
                     builder.virtual_file_direct_io(utils::toml_edit_ext::deserialize_item(item).context("virtual_file_direct_io")?)
                 }
+                "io_buffer_alignment" => {
+                    builder.io_buffer_alignment(parse_toml_u64("io_buffer_alignment", item)? as usize)
+                }
                 _ => bail!("unrecognized pageserver option '{key}'"),
             }
         }
@@ -1068,6 +1083,7 @@ impl PageServerConf {
             l0_flush: L0FlushConfig::default(),
             compact_level0_phase1_value_access: CompactL0Phase1ValueAccess::default(),
             virtual_file_direct_io: virtual_file::DirectIoMode::default(),
+            io_buffer_alignment: defaults::DEFAULT_IO_BUFFER_ALIGNMENT,
         }
     }
 }
@@ -1308,6 +1324,7 @@ background_task_maximum_delay = '334 s'
                 l0_flush: L0FlushConfig::default(),
                 compact_level0_phase1_value_access: CompactL0Phase1ValueAccess::default(),
                 virtual_file_direct_io: virtual_file::DirectIoMode::default(),
+                io_buffer_alignment: defaults::DEFAULT_IO_BUFFER_ALIGNMENT,
             },
             "Correct defaults should be used when no config values are provided"
         );
@@ -1381,6 +1398,7 @@ background_task_maximum_delay = '334 s'
                 l0_flush: L0FlushConfig::default(),
                 compact_level0_phase1_value_access: CompactL0Phase1ValueAccess::default(),
                 virtual_file_direct_io: virtual_file::DirectIoMode::default(),
+                io_buffer_alignment: defaults::DEFAULT_IO_BUFFER_ALIGNMENT,
             },
             "Should be able to parse all basic config values correctly"
         );
diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs
index cbcc162b32..a126136d20 100644
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -2344,6 +2344,20 @@ async fn put_io_engine_handler(
     json_response(StatusCode::OK, ())
 }
 
+async fn put_io_alignment_handler(
+    mut r: Request<Body>,
+    _cancel: CancellationToken,
+) -> Result<Response<Body>, ApiError> {
+    check_permission(&r, None)?;
+    let align: usize = json_request(&mut r).await?;
+    crate::virtual_file::set_io_buffer_alignment(align).map_err(|align| {
+        ApiError::PreconditionFailed(
+            format!("Requested io alignment ({align}) is not a power of two").into(),
+        )
+    })?;
+    json_response(StatusCode::OK, ())
+}
+
 /// Polled by control plane.
 ///
 /// See [`crate::utilization`].
@@ -3031,6 +3045,9 @@ pub fn make_router(
             |r| api_handler(r, timeline_collect_keyspace),
         )
         .put("/v1/io_engine", |r| api_handler(r, put_io_engine_handler))
+        .put("/v1/io_alignment", |r| {
+            api_handler(r, put_io_alignment_handler)
+        })
         .put(
             "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/force_aux_policy_switch",
             |r| api_handler(r, force_aux_policy_switch_handler),
diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs
index f4a2957972..c0508e13c0 100644
--- a/pageserver/src/tenant/storage_layer/delta_layer.rs
+++ b/pageserver/src/tenant/storage_layer/delta_layer.rs
@@ -40,7 +40,7 @@ use crate::tenant::storage_layer::layer::S3_UPLOAD_LIMIT;
 use crate::tenant::timeline::GetVectoredError;
 use crate::tenant::vectored_blob_io::{
     BlobFlag, MaxVectoredReadBytes, StreamingVectoredReadPlanner, VectoredBlobReader, VectoredRead,
-    VectoredReadPlanner,
+    VectoredReadCoalesceMode, VectoredReadPlanner,
 };
 use crate::tenant::PageReconstructError;
 use crate::virtual_file::owned_buffers_io::io_buf_ext::{FullSlice, IoBufExt};
@@ -1205,6 +1205,7 @@ impl DeltaLayerInner {
         let mut prev: Option<(Key, Lsn, BlobRef)> = None;
 
         let mut read_builder: Option<VectoredReadBuilder> = None;
+        let read_mode = VectoredReadCoalesceMode::get();
 
         let max_read_size = self
             .max_vectored_read_bytes
@@ -1253,6 +1254,7 @@ impl DeltaLayerInner {
                         offsets.end.pos(),
                         meta,
                         max_read_size,
+                        read_mode,
                     ))
                 }
             } else {
@@ -2281,7 +2283,7 @@ pub(crate) mod test {
             .await
             .unwrap();
         let delta_layer = resident_layer.get_as_delta(&ctx).await.unwrap();
-        for max_read_size in [1, 1024] {
+        for max_read_size in [1, 2048] {
             for batch_size in [1, 2, 4, 8, 3, 7, 13] {
                 println!("running with batch_size={batch_size} max_read_size={max_read_size}");
                 // Test if the batch size is correctly determined
diff --git a/pageserver/src/tenant/storage_layer/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs
index 3cb2b1c83a..38411e9d9e 100644
--- a/pageserver/src/tenant/storage_layer/image_layer.rs
+++ b/pageserver/src/tenant/storage_layer/image_layer.rs
@@ -1367,7 +1367,7 @@ mod test {
                 .await
                 .unwrap();
         let img_layer = resident_layer.get_as_image(&ctx).await.unwrap();
-        for max_read_size in [1, 1024] {
+        for max_read_size in [1, 2048] {
             for batch_size in [1, 2, 4, 8, 3, 7, 13] {
                 println!("running with batch_size={batch_size} max_read_size={max_read_size}");
                 // Test if the batch size is correctly determined
diff --git a/pageserver/src/tenant/vectored_blob_io.rs b/pageserver/src/tenant/vectored_blob_io.rs
index 54a3ad789b..80bc56092d 100644
--- a/pageserver/src/tenant/vectored_blob_io.rs
+++ b/pageserver/src/tenant/vectored_blob_io.rs
@@ -25,9 +25,10 @@ use tokio_epoll_uring::BoundedBuf;
 use utils::lsn::Lsn;
 use utils::vec_map::VecMap;
 
+use crate::config::defaults::DEFAULT_IO_BUFFER_ALIGNMENT;
 use crate::context::RequestContext;
 use crate::tenant::blob_io::{BYTE_UNCOMPRESSED, BYTE_ZSTD, LEN_COMPRESSION_BIT_MASK};
-use crate::virtual_file::VirtualFile;
+use crate::virtual_file::{self, VirtualFile};
 
 #[derive(Copy, Clone, Debug, PartialEq, Eq)]
 pub struct MaxVectoredReadBytes(pub NonZeroUsize);
@@ -60,7 +61,7 @@ pub struct VectoredBlobsBuf {
 pub struct VectoredRead {
     pub start: u64,
     pub end: u64,
-    /// Starting offsets and metadata for each blob in this read
+    /// Start offset and metadata for each blob in this read
     pub blobs_at: VecMap<u64, BlobMeta>,
 }
 
@@ -76,14 +77,109 @@ pub(crate) enum VectoredReadExtended {
     No,
 }
 
-pub(crate) struct VectoredReadBuilder {
+#[derive(Copy, Clone, Debug, PartialEq, Eq)]
+pub enum VectoredReadCoalesceMode {
+    /// Only coalesce exactly adjacent reads.
+    AdjacentOnly,
+    /// In addition to adjacent reads, also consider reads whose corresponding
+    /// `end` and `start` offsets reside at the same chunk.
+    Chunked(usize),
+}
+
+impl VectoredReadCoalesceMode {
+    /// [`AdjacentVectoredReadBuilder`] is used if alignment requirement is 0,
+    /// whereas [`ChunkedVectoredReadBuilder`] is used for alignment requirement 1 and higher.
+    pub(crate) fn get() -> Self {
+        let align = virtual_file::get_io_buffer_alignment_raw();
+        if align == DEFAULT_IO_BUFFER_ALIGNMENT {
+            VectoredReadCoalesceMode::AdjacentOnly
+        } else {
+            VectoredReadCoalesceMode::Chunked(align)
+        }
+    }
+}
+
+pub(crate) enum VectoredReadBuilder {
+    Adjacent(AdjacentVectoredReadBuilder),
+    Chunked(ChunkedVectoredReadBuilder),
+}
+
+impl VectoredReadBuilder {
+    fn new_impl(
+        start_offset: u64,
+        end_offset: u64,
+        meta: BlobMeta,
+        max_read_size: Option<usize>,
+        mode: VectoredReadCoalesceMode,
+    ) -> Self {
+        match mode {
+            VectoredReadCoalesceMode::AdjacentOnly => Self::Adjacent(
+                AdjacentVectoredReadBuilder::new(start_offset, end_offset, meta, max_read_size),
+            ),
+            VectoredReadCoalesceMode::Chunked(chunk_size) => {
+                Self::Chunked(ChunkedVectoredReadBuilder::new(
+                    start_offset,
+                    end_offset,
+                    meta,
+                    max_read_size,
+                    chunk_size,
+                ))
+            }
+        }
+    }
+
+    pub(crate) fn new(
+        start_offset: u64,
+        end_offset: u64,
+        meta: BlobMeta,
+        max_read_size: usize,
+        mode: VectoredReadCoalesceMode,
+    ) -> Self {
+        Self::new_impl(start_offset, end_offset, meta, Some(max_read_size), mode)
+    }
+
+    pub(crate) fn new_streaming(
+        start_offset: u64,
+        end_offset: u64,
+        meta: BlobMeta,
+        mode: VectoredReadCoalesceMode,
+    ) -> Self {
+        Self::new_impl(start_offset, end_offset, meta, None, mode)
+    }
+
+    pub(crate) fn extend(&mut self, start: u64, end: u64, meta: BlobMeta) -> VectoredReadExtended {
+        match self {
+            VectoredReadBuilder::Adjacent(builder) => builder.extend(start, end, meta),
+            VectoredReadBuilder::Chunked(builder) => builder.extend(start, end, meta),
+        }
+    }
+
+    pub(crate) fn build(self) -> VectoredRead {
+        match self {
+            VectoredReadBuilder::Adjacent(builder) => builder.build(),
+            VectoredReadBuilder::Chunked(builder) => builder.build(),
+        }
+    }
+
+    pub(crate) fn size(&self) -> usize {
+        match self {
+            VectoredReadBuilder::Adjacent(builder) => builder.size(),
+            VectoredReadBuilder::Chunked(builder) => builder.size(),
+        }
+    }
+}
+
+pub(crate) struct AdjacentVectoredReadBuilder {
+    /// Start offset of the read.
     start: u64,
+    // End offset of the read.
     end: u64,
+    /// Start offset and metadata for each blob in this read
     blobs_at: VecMap<u64, BlobMeta>,
     max_read_size: Option<usize>,
 }
 
-impl VectoredReadBuilder {
+impl AdjacentVectoredReadBuilder {
     /// Start building a new vectored read.
     ///
     /// Note that by design, this does not check against reading more than `max_read_size` to
@@ -93,7 +189,7 @@ impl VectoredReadBuilder {
         start_offset: u64,
         end_offset: u64,
         meta: BlobMeta,
-        max_read_size: usize,
+        max_read_size: Option<usize>,
     ) -> Self {
         let mut blobs_at = VecMap::default();
         blobs_at
@@ -104,7 +200,7 @@ impl VectoredReadBuilder {
             start: start_offset,
             end: end_offset,
             blobs_at,
-            max_read_size: Some(max_read_size),
+            max_read_size,
         }
     }
     /// Attempt to extend the current read with a new blob if the start
@@ -113,13 +209,15 @@ impl VectoredReadBuilder {
     pub(crate) fn extend(&mut self, start: u64, end: u64, meta: BlobMeta) -> VectoredReadExtended {
         tracing::trace!(start, end, "trying to extend");
         let size = (end - start) as usize;
-        if self.end == start && {
+        let not_limited_by_max_read_size = {
             if let Some(max_read_size) = self.max_read_size {
                 self.size() + size <= max_read_size
             } else {
                 true
             }
-        } {
+        };
+
+        if self.end == start && not_limited_by_max_read_size {
             self.end = end;
             self.blobs_at
                 .append(start, meta)
@@ -144,6 +242,107 @@ impl VectoredReadBuilder {
     }
 }
 
+pub(crate) struct ChunkedVectoredReadBuilder {
+    /// Start block number
+    start_blk_no: usize,
+    /// End block number (exclusive).
+    end_blk_no: usize,
+    /// Start offset and metadata for each blob in this read
+    blobs_at: VecMap<u64, BlobMeta>,
+    max_read_size: Option<usize>,
+    /// Chunk size reads are coalesced into.
+    chunk_size: usize,
+}
+
+/// Computes x / d rounded up.
+fn div_round_up(x: usize, d: usize) -> usize {
+    (x + (d - 1)) / d
+}
+
+impl ChunkedVectoredReadBuilder {
+    /// Start building a new vectored read.
+    ///
+    /// Note that by design, this does not check against reading more than `max_read_size` to
+    /// support reading larger blobs than the configuration value. The builder will be single use
+    /// however after that.
+    pub(crate) fn new(
+        start_offset: u64,
+        end_offset: u64,
+        meta: BlobMeta,
+        max_read_size: Option<usize>,
+        chunk_size: usize,
+    ) -> Self {
+        let mut blobs_at = VecMap::default();
+        blobs_at
+            .append(start_offset, meta)
+            .expect("First insertion always succeeds");
+
+        let start_blk_no = start_offset as usize / chunk_size;
+        let end_blk_no = div_round_up(end_offset as usize, chunk_size);
+        Self {
+            start_blk_no,
+            end_blk_no,
+            blobs_at,
+            max_read_size,
+            chunk_size,
+        }
+    }
+
+    /// Attempts to extend the current read with a new blob if the new blob resides in the same or the immediate next chunk.
+    ///
+    /// The resulting size also must be below the max read size.
+    pub(crate) fn extend(&mut self, start: u64, end: u64, meta: BlobMeta) -> VectoredReadExtended {
+        tracing::trace!(start, end, "trying to extend");
+        let start_blk_no = start as usize / self.chunk_size;
+        let end_blk_no = div_round_up(end as usize, self.chunk_size);
+
+        let not_limited_by_max_read_size = {
+            if let Some(max_read_size) = self.max_read_size {
+                let coalesced_size = (end_blk_no - self.start_blk_no) * self.chunk_size;
+                coalesced_size <= max_read_size
+            } else {
+                true
+            }
+        };
+
+        // True if the second block starts in the same block or the immediate next block where the first block ended.
+        //
+        // Note: This automatically handles the case where two blocks are adjacent to each other,
+        // whether they starts on chunk size boundary or not.
+        let is_adjacent_chunk_read = {
+            // 1. first.end & second.start are in the same block
+            self.end_blk_no == start_blk_no + 1 ||
+            // 2. first.end ends one block before second.start
+            self.end_blk_no == start_blk_no
+        };
+
+        if is_adjacent_chunk_read && not_limited_by_max_read_size {
+            self.end_blk_no = end_blk_no;
+            self.blobs_at
+                .append(start, meta)
+                .expect("LSNs are ordered within vectored reads");
+
+            return VectoredReadExtended::Yes;
+        }
+
+        VectoredReadExtended::No
+    }
+
+    pub(crate) fn size(&self) -> usize {
+        (self.end_blk_no - self.start_blk_no) * self.chunk_size
+    }
+
+    pub(crate) fn build(self) -> VectoredRead {
+        let start = (self.start_blk_no * self.chunk_size) as u64;
+        let end = (self.end_blk_no * self.chunk_size) as u64;
+        VectoredRead {
+            start,
+            end,
+            blobs_at: self.blobs_at,
+        }
+    }
+}
+
 #[derive(Copy, Clone, Debug)]
 pub enum BlobFlag {
     None,
@@ -166,14 +365,18 @@ pub struct VectoredReadPlanner {
     prev: Option<(Key, Lsn, u64, BlobFlag)>,
 
     max_read_size: usize,
+
+    mode: VectoredReadCoalesceMode,
 }
 
 impl VectoredReadPlanner {
     pub fn new(max_read_size: usize) -> Self {
+        let mode = VectoredReadCoalesceMode::get();
         Self {
             blobs: BTreeMap::new(),
             prev: None,
             max_read_size,
+            mode,
         }
     }
 
@@ -252,6 +455,7 @@ impl VectoredReadPlanner {
                         end_offset,
                         BlobMeta { key, lsn },
                         self.max_read_size,
+                        self.mode,
                     );
 
                     let prev_read_builder = current_read_builder.replace(next_read_builder);
@@ -303,6 +507,18 @@ impl<'a> VectoredBlobReader<'a> {
             read.size(),
             buf.capacity()
         );
+
+        if cfg!(debug_assertions) {
+            let align = virtual_file::get_io_buffer_alignment() as u64;
+            debug_assert_eq!(
+                read.start % align,
+                0,
+                "Read start at {} does not satisfy the required io buffer alignment ({} bytes)",
+                read.start,
+                align
+            );
+        }
+
         let mut buf = self
             .file
             .read_exact_at(buf.slice(0..read.size()), read.start, ctx)
@@ -310,27 +526,20 @@ impl<'a> VectoredBlobReader<'a> {
             .into_inner();
 
         let blobs_at = read.blobs_at.as_slice();
-        let start_offset = blobs_at.first().expect("VectoredRead is never empty").0;
+
+        let start_offset = read.start;
 
         let mut metas = Vec::with_capacity(blobs_at.len());
-
         // Blobs in `read` only provide their starting offset. The end offset
         // of a blob is implicit: the start of the next blob if one exists
         // or the end of the read.
-        let pairs = blobs_at.iter().zip(
-            blobs_at
-                .iter()
-                .map(Some)
-                .skip(1)
-                .chain(std::iter::once(None)),
-        );
 
         // Some scratch space, put here for reusing the allocation
         let mut decompressed_vec = Vec::new();
 
-        for ((offset, meta), next) in pairs {
-            let offset_in_buf = offset - start_offset;
-            let first_len_byte = buf[offset_in_buf as usize];
+        for (blob_start, meta) in blobs_at {
+            let blob_start_in_buf = blob_start - start_offset;
+            let first_len_byte = buf[blob_start_in_buf as usize];
 
             // Each blob is prefixed by a header containing its size and compression information.
             // Extract the size and skip that header to find the start of the data.
@@ -340,7 +549,7 @@ impl<'a> VectoredBlobReader<'a> {
                 (1, first_len_byte as u64, BYTE_UNCOMPRESSED)
             } else {
                 let mut blob_size_buf = [0u8; 4];
-                let offset_in_buf = offset_in_buf as usize;
+                let offset_in_buf = blob_start_in_buf as usize;
 
                 blob_size_buf.copy_from_slice(&buf[offset_in_buf..offset_in_buf + 4]);
                 blob_size_buf[0] &= !LEN_COMPRESSION_BIT_MASK;
@@ -353,12 +562,8 @@ impl<'a> VectoredBlobReader<'a> {
                 )
             };
 
-            let start_raw = offset_in_buf + size_length;
-            let end_raw = match next {
-                Some((next_blob_start_offset, _)) => next_blob_start_offset - start_offset,
-                None => start_raw + blob_size,
-            };
-            assert_eq!(end_raw - start_raw, blob_size);
+            let start_raw = blob_start_in_buf + size_length;
+            let end_raw = start_raw + blob_size;
             let (start, end);
             if compression_bits == BYTE_UNCOMPRESSED {
                 start = start_raw as usize;
@@ -407,18 +612,22 @@ pub struct StreamingVectoredReadPlanner {
     max_cnt: usize,
     /// Size of the current batch
     cnt: usize,
+
+    mode: VectoredReadCoalesceMode,
 }
 
 impl StreamingVectoredReadPlanner {
     pub fn new(max_read_size: u64, max_cnt: usize) -> Self {
         assert!(max_cnt > 0);
         assert!(max_read_size > 0);
+        let mode = VectoredReadCoalesceMode::get();
         Self {
             read_builder: None,
             prev: None,
             max_cnt,
             max_read_size,
             cnt: 0,
+            mode,
         }
     }
 
@@ -467,17 +676,12 @@ impl StreamingVectoredReadPlanner {
             }
             None => {
                 self.read_builder = {
-                    let mut blobs_at = VecMap::default();
-                    blobs_at
-                        .append(start_offset, BlobMeta { key, lsn })
-                        .expect("First insertion always succeeds");
-
-                    Some(VectoredReadBuilder {
-                        start: start_offset,
-                        end: end_offset,
-                        blobs_at,
-                        max_read_size: None,
-                    })
+                    Some(VectoredReadBuilder::new_streaming(
+                        start_offset,
+                        end_offset,
+                        BlobMeta { key, lsn },
+                        self.mode,
+                    ))
                 };
             }
         }
@@ -511,7 +715,9 @@ mod tests {
     use super::*;
 
     fn validate_read(read: &VectoredRead, offset_range: &[(Key, Lsn, u64, BlobFlag)]) {
-        assert_eq!(read.start, offset_range.first().unwrap().2);
+        let align = virtual_file::get_io_buffer_alignment() as u64;
+        assert_eq!(read.start % align, 0);
+        assert_eq!(read.start / align, offset_range.first().unwrap().2 / align);
 
         let expected_offsets_in_read: Vec<_> = offset_range.iter().map(|o| o.2).collect();
 
@@ -525,6 +731,63 @@ mod tests {
         assert_eq!(expected_offsets_in_read, offsets_in_read);
     }
 
+    #[test]
+    fn planner_chunked_coalesce_all_test() {
+        use crate::virtual_file;
+
+        const CHUNK_SIZE: u64 = 512;
+        virtual_file::set_io_buffer_alignment(CHUNK_SIZE as usize).unwrap();
+        let max_read_size = CHUNK_SIZE as usize * 8;
+        let key = Key::MIN;
+        let lsn = Lsn(0);
+
+        let blob_descriptions = [
+            (key, lsn, CHUNK_SIZE / 8, BlobFlag::None), // Read 1 BEGIN
+            (key, lsn, CHUNK_SIZE / 4, BlobFlag::Ignore), // Gap
+            (key, lsn, CHUNK_SIZE / 2, BlobFlag::None),
+            (key, lsn, CHUNK_SIZE - 2, BlobFlag::Ignore), // Gap
+            (key, lsn, CHUNK_SIZE, BlobFlag::None),
+            (key, lsn, CHUNK_SIZE * 2 - 1, BlobFlag::None),
+            (key, lsn, CHUNK_SIZE * 2 + 1, BlobFlag::Ignore), // Gap
+            (key, lsn, CHUNK_SIZE * 3 + 1, BlobFlag::None),
+            (key, lsn, CHUNK_SIZE * 5 + 1, BlobFlag::None),
+            (key, lsn, CHUNK_SIZE * 6 + 1, BlobFlag::Ignore), // skipped chunk size, but not a chunk: should coalesce.
+            (key, lsn, CHUNK_SIZE * 7 + 1, BlobFlag::None),
+            (key, lsn, CHUNK_SIZE * 8, BlobFlag::None), // Read 2 BEGIN (b/c max_read_size)
+            (key, lsn, CHUNK_SIZE * 9, BlobFlag::Ignore), // ==== skipped a chunk
+            (key, lsn, CHUNK_SIZE * 10, BlobFlag::None), // Read 3 BEGIN (cannot coalesce)
+        ];
+
+        let ranges = [
+            &[
+                blob_descriptions[0],
+                blob_descriptions[2],
+                blob_descriptions[4],
+                blob_descriptions[5],
+                blob_descriptions[7],
+                blob_descriptions[8],
+                blob_descriptions[10],
+            ],
+            &blob_descriptions[11..12],
+            &blob_descriptions[13..],
+        ];
+
+        let mut planner = VectoredReadPlanner::new(max_read_size);
+        for (key, lsn, offset, flag) in blob_descriptions {
+            planner.handle(key, lsn, offset, flag);
+        }
+
+        planner.handle_range_end(652 * 1024);
+
+        let reads = planner.finish();
+
+        assert_eq!(reads.len(), ranges.len());
+
+        for (idx, read) in reads.iter().enumerate() {
+            validate_read(read, ranges[idx]);
+        }
+    }
+
     #[test]
     fn planner_max_read_size_test() {
         let max_read_size = 128 * 1024;
@@ -737,6 +1000,7 @@ mod tests {
         let reserved_bytes = blobs.iter().map(|bl| bl.len()).max().unwrap() * 2 + 16;
         let mut buf = BytesMut::with_capacity(reserved_bytes);
 
+        let mode = VectoredReadCoalesceMode::get();
         let vectored_blob_reader = VectoredBlobReader::new(&file);
         let meta = BlobMeta {
             key: Key::MIN,
@@ -748,7 +1012,7 @@ mod tests {
             if idx + 1 == offsets.len() {
                 continue;
             }
-            let read_builder = VectoredReadBuilder::new(*offset, *end, meta, 16 * 4096);
+            let read_builder = VectoredReadBuilder::new(*offset, *end, meta, 16 * 4096, mode);
             let read = read_builder.build();
             let result = vectored_blob_reader.read_blobs(&read, buf, &ctx).await?;
             assert_eq!(result.blobs.len(), 1);
@@ -784,4 +1048,12 @@ mod tests {
         round_trip_test_compressed(&blobs, true).await?;
         Ok(())
     }
+
+    #[test]
+    fn test_div_round_up() {
+        const CHUNK_SIZE: usize = 512;
+        assert_eq!(1, div_round_up(200, CHUNK_SIZE));
+        assert_eq!(1, div_round_up(CHUNK_SIZE, CHUNK_SIZE));
+        assert_eq!(2, div_round_up(CHUNK_SIZE + 1, CHUNK_SIZE));
+    }
 }
diff --git a/pageserver/src/virtual_file.rs b/pageserver/src/virtual_file.rs
index c0017280fd..4b11dc1a94 100644
--- a/pageserver/src/virtual_file.rs
+++ b/pageserver/src/virtual_file.rs
@@ -10,6 +10,7 @@
 //! This is similar to PostgreSQL's virtual file descriptor facility in
 //! src/backend/storage/file/fd.c
 //!
+use crate::config::defaults::DEFAULT_IO_BUFFER_ALIGNMENT;
 use crate::context::RequestContext;
 use crate::metrics::{StorageIoOperation, STORAGE_IO_SIZE, STORAGE_IO_TIME_METRIC};
 
@@ -1140,10 +1141,13 @@ impl OpenFiles {
 /// server startup.
 ///
 #[cfg(not(test))]
-pub fn init(num_slots: usize, engine: IoEngineKind) {
+pub fn init(num_slots: usize, engine: IoEngineKind, io_buffer_alignment: usize) {
     if OPEN_FILES.set(OpenFiles::new(num_slots)).is_err() {
         panic!("virtual_file::init called twice");
     }
+    if set_io_buffer_alignment(io_buffer_alignment).is_err() {
+        panic!("IO buffer alignment ({io_buffer_alignment}) is not a power of two");
+    }
     io_engine::init(engine);
     crate::metrics::virtual_file_descriptor_cache::SIZE_MAX.set(num_slots as u64);
 }
@@ -1167,6 +1171,61 @@ fn get_open_files() -> &'static OpenFiles {
     }
 }
 
+static IO_BUFFER_ALIGNMENT: AtomicUsize = AtomicUsize::new(DEFAULT_IO_BUFFER_ALIGNMENT);
+
+/// Returns true if `x` is zero or a power of two.
+fn is_zero_or_power_of_two(x: usize) -> bool {
+    (x == 0) || ((x & (x - 1)) == 0)
+}
+
+#[allow(unused)]
+pub(crate) fn set_io_buffer_alignment(align: usize) -> Result<(), usize> {
+    if is_zero_or_power_of_two(align) {
+        IO_BUFFER_ALIGNMENT.store(align, std::sync::atomic::Ordering::Relaxed);
+        Ok(())
+    } else {
+        Err(align)
+    }
+}
+
+/// Gets the io buffer alignment requirement. Returns 0 if there is no requirement specified.
+///
+/// This function should be used to check the raw config value.
+pub(crate) fn get_io_buffer_alignment_raw() -> usize {
+    let align = IO_BUFFER_ALIGNMENT.load(std::sync::atomic::Ordering::Relaxed);
+
+    if cfg!(test) {
+        let env_var_name = "NEON_PAGESERVER_UNIT_TEST_IO_BUFFER_ALIGNMENT";
+        if align == DEFAULT_IO_BUFFER_ALIGNMENT {
+            if let Some(test_align) = utils::env::var(env_var_name) {
+                if is_zero_or_power_of_two(test_align) {
+                    test_align
+                } else {
+                    panic!("IO buffer alignment ({test_align}) is not a power of two");
+                }
+            } else {
+                crate::config::defaults::DEFAULT_IO_BUFFER_ALIGNMENT
+            }
+        } else {
+            align
+        }
+    } else {
+        align
+    }
+}
+
+/// Gets the io buffer alignment requirement. Returns 1 if the alignment config is set to zero.
+///
+/// This function should be used for getting the actual alignment value to use.
+pub(crate) fn get_io_buffer_alignment() -> usize {
+    let align = get_io_buffer_alignment_raw();
+    if align == DEFAULT_IO_BUFFER_ALIGNMENT {
+        1
+    } else {
+        align
+    }
+}
+
 #[cfg(test)]
 mod tests {
     use crate::context::DownloadBehavior;
diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index 92febfec9b..69a4234617 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -496,6 +496,7 @@ class NeonEnvBuilder:
         pageserver_default_tenant_config_compaction_algorithm: Optional[Dict[str, Any]] = None,
         safekeeper_extra_opts: Optional[list[str]] = None,
         storage_controller_port_override: Optional[int] = None,
+        pageserver_io_buffer_alignment: Optional[int] = None,
     ):
         self.repo_dir = repo_dir
         self.rust_log_override = rust_log_override
@@ -550,6 +551,8 @@ class NeonEnvBuilder:
 
         self.storage_controller_port_override = storage_controller_port_override
 
+        self.pageserver_io_buffer_alignment = pageserver_io_buffer_alignment
+
         assert test_name.startswith(
             "test_"
         ), "Unexpectedly instantiated from outside a test function"
@@ -1123,6 +1126,7 @@ class NeonEnv:
 
         self.pageserver_virtual_file_io_engine = config.pageserver_virtual_file_io_engine
         self.pageserver_aux_file_policy = config.pageserver_aux_file_policy
+        self.pageserver_io_buffer_alignment = config.pageserver_io_buffer_alignment
 
         # Create the neon_local's `NeonLocalInitConf`
         cfg: Dict[str, Any] = {
@@ -1184,6 +1188,8 @@ class NeonEnv:
                         for key, value in override.items():
                             ps_cfg[key] = value
 
+            ps_cfg["io_buffer_alignment"] = self.pageserver_io_buffer_alignment
+
             # Create a corresponding NeonPageserver object
             self.pageservers.append(
                 NeonPageserver(
@@ -1425,6 +1431,7 @@ def _shared_simple_env(
     pageserver_virtual_file_io_engine: str,
     pageserver_aux_file_policy: Optional[AuxFileStore],
     pageserver_default_tenant_config_compaction_algorithm: Optional[Dict[str, Any]],
+    pageserver_io_buffer_alignment: Optional[int],
 ) -> Iterator[NeonEnv]:
     """
     # Internal fixture backing the `neon_simple_env` fixture. If TEST_SHARED_FIXTURES
@@ -1457,6 +1464,7 @@ def _shared_simple_env(
         pageserver_virtual_file_io_engine=pageserver_virtual_file_io_engine,
         pageserver_aux_file_policy=pageserver_aux_file_policy,
         pageserver_default_tenant_config_compaction_algorithm=pageserver_default_tenant_config_compaction_algorithm,
+        pageserver_io_buffer_alignment=pageserver_io_buffer_alignment,
     ) as builder:
         env = builder.init_start()
 
@@ -1499,6 +1507,7 @@ def neon_env_builder(
     pageserver_default_tenant_config_compaction_algorithm: Optional[Dict[str, Any]],
     pageserver_aux_file_policy: Optional[AuxFileStore],
     record_property: Callable[[str, object], None],
+    pageserver_io_buffer_alignment: Optional[int],
 ) -> Iterator[NeonEnvBuilder]:
     """
     Fixture to create a Neon environment for test.
@@ -1534,6 +1543,7 @@ def neon_env_builder(
         test_overlay_dir=test_overlay_dir,
         pageserver_aux_file_policy=pageserver_aux_file_policy,
         pageserver_default_tenant_config_compaction_algorithm=pageserver_default_tenant_config_compaction_algorithm,
+        pageserver_io_buffer_alignment=pageserver_io_buffer_alignment,
     ) as builder:
         yield builder
         # Propogate `preserve_database_files` to make it possible to use in other fixtures,
diff --git a/test_runner/fixtures/parametrize.py b/test_runner/fixtures/parametrize.py
index 92c98763e3..e2dd51802c 100644
--- a/test_runner/fixtures/parametrize.py
+++ b/test_runner/fixtures/parametrize.py
@@ -34,6 +34,11 @@ def pageserver_virtual_file_io_engine() -> Optional[str]:
     return os.getenv("PAGESERVER_VIRTUAL_FILE_IO_ENGINE")
 
 
+@pytest.fixture(scope="function", autouse=True)
+def pageserver_io_buffer_alignment() -> Optional[int]:
+    return None
+
+
 @pytest.fixture(scope="function", autouse=True)
 def pageserver_aux_file_policy() -> Optional[AuxFileStore]:
     return None

From 793b5061ecb2ab20250c762248afd24e23ed1e16 Mon Sep 17 00:00:00 2001
From: Vlad Lazar <vlad@neon.tech>
Date: Wed, 28 Aug 2024 18:23:55 +0100
Subject: [PATCH 047/159] storcon: track pageserver availability zone (#8852)

## Problem
In order to build AZ aware scheduling, the storage controller needs to
know what AZ pageservers are in.

Related https://github.com/neondatabase/neon/issues/8848

## Summary of changes
This patch set adds a new nullable column to the `nodes` table:
`availability_zone_id`. The node registration
request is extended to include the AZ id (pageservers already have this
in their `metadata.json` file).

If the node is already registered, then we update the persistent and
in-memory state with the provided AZ.
Otherwise, we add the node with the AZ to begin with.

A couple assumptions are made here:
1. Pageserver AZ ids are stable
2. AZ ids do not change over time

Once all pageservers have a configured AZ, we can remove the optionals
in the code and make the database column not nullable.
---
 control_plane/storcon_cli/src/main.rs         |  4 +
 libs/pageserver_api/src/controller_api.rs     |  2 +
 pageserver/src/control_plane_client.rs        |  6 ++
 .../2024-08-27-184400_pageserver_az/down.sql  |  1 +
 .../2024-08-27-184400_pageserver_az/up.sql    |  1 +
 storage_controller/src/node.rs                | 23 ++++-
 storage_controller/src/persistence.rs         | 27 ++++++
 storage_controller/src/scheduler.rs           |  1 +
 storage_controller/src/schema.rs              |  1 +
 storage_controller/src/service.rs             | 93 +++++++++++++++----
 .../fixtures/pageserver/allowed_errors.py     |  3 +
 11 files changed, 143 insertions(+), 19 deletions(-)
 create mode 100644 storage_controller/migrations/2024-08-27-184400_pageserver_az/down.sql
 create mode 100644 storage_controller/migrations/2024-08-27-184400_pageserver_az/up.sql

diff --git a/control_plane/storcon_cli/src/main.rs b/control_plane/storcon_cli/src/main.rs
index 35510ccbca..5cce6cf3ae 100644
--- a/control_plane/storcon_cli/src/main.rs
+++ b/control_plane/storcon_cli/src/main.rs
@@ -41,6 +41,8 @@ enum Command {
         listen_http_addr: String,
         #[arg(long)]
         listen_http_port: u16,
+        #[arg(long)]
+        availability_zone_id: String,
     },
 
     /// Modify a node's configuration in the storage controller
@@ -322,6 +324,7 @@ async fn main() -> anyhow::Result<()> {
             listen_pg_port,
             listen_http_addr,
             listen_http_port,
+            availability_zone_id,
         } => {
             storcon_client
                 .dispatch::<_, ()>(
@@ -333,6 +336,7 @@ async fn main() -> anyhow::Result<()> {
                         listen_pg_port,
                         listen_http_addr,
                         listen_http_port,
+                        availability_zone_id: Some(availability_zone_id),
                     }),
                 )
                 .await?;
diff --git a/libs/pageserver_api/src/controller_api.rs b/libs/pageserver_api/src/controller_api.rs
index a9a57d77ce..345abd69b6 100644
--- a/libs/pageserver_api/src/controller_api.rs
+++ b/libs/pageserver_api/src/controller_api.rs
@@ -56,6 +56,8 @@ pub struct NodeRegisterRequest {
 
     pub listen_http_addr: String,
     pub listen_http_port: u16,
+
+    pub availability_zone_id: Option<String>,
 }
 
 #[derive(Serialize, Deserialize)]
diff --git a/pageserver/src/control_plane_client.rs b/pageserver/src/control_plane_client.rs
index b5d9267d79..56a536c387 100644
--- a/pageserver/src/control_plane_client.rs
+++ b/pageserver/src/control_plane_client.rs
@@ -141,12 +141,18 @@ impl ControlPlaneGenerationsApi for ControlPlaneClient {
                         m.other
                     );
 
+                    let az_id = m
+                        .other
+                        .get("availability_zone_id")
+                        .and_then(|jv| jv.as_str().map(|str| str.to_owned()));
+
                     Some(NodeRegisterRequest {
                         node_id: conf.id,
                         listen_pg_addr: m.postgres_host,
                         listen_pg_port: m.postgres_port,
                         listen_http_addr: m.http_host,
                         listen_http_port: m.http_port,
+                        availability_zone_id: az_id,
                     })
                 }
                 Err(e) => {
diff --git a/storage_controller/migrations/2024-08-27-184400_pageserver_az/down.sql b/storage_controller/migrations/2024-08-27-184400_pageserver_az/down.sql
new file mode 100644
index 0000000000..22df81c83c
--- /dev/null
+++ b/storage_controller/migrations/2024-08-27-184400_pageserver_az/down.sql
@@ -0,0 +1 @@
+ALTER TABLE nodes DROP availability_zone_id;
diff --git a/storage_controller/migrations/2024-08-27-184400_pageserver_az/up.sql b/storage_controller/migrations/2024-08-27-184400_pageserver_az/up.sql
new file mode 100644
index 0000000000..7112f92bf2
--- /dev/null
+++ b/storage_controller/migrations/2024-08-27-184400_pageserver_az/up.sql
@@ -0,0 +1 @@
+ALTER TABLE nodes ADD availability_zone_id VARCHAR;
diff --git a/storage_controller/src/node.rs b/storage_controller/src/node.rs
index 61a44daca9..73cecc491d 100644
--- a/storage_controller/src/node.rs
+++ b/storage_controller/src/node.rs
@@ -36,6 +36,8 @@ pub(crate) struct Node {
     listen_pg_addr: String,
     listen_pg_port: u16,
 
+    availability_zone_id: Option<String>,
+
     // This cancellation token means "stop any RPCs in flight to this node, and don't start
     // any more". It is not related to process shutdown.
     #[serde(skip)]
@@ -61,6 +63,10 @@ impl Node {
         self.id
     }
 
+    pub(crate) fn get_availability_zone_id(&self) -> Option<&str> {
+        self.availability_zone_id.as_deref()
+    }
+
     pub(crate) fn get_scheduling(&self) -> NodeSchedulingPolicy {
         self.scheduling
     }
@@ -72,7 +78,18 @@ impl Node {
     /// Does this registration request match `self`?  This is used when deciding whether a registration
     /// request should be allowed to update an existing record with the same node ID.
     pub(crate) fn registration_match(&self, register_req: &NodeRegisterRequest) -> bool {
-        self.id == register_req.node_id
+        let az_ids_match = {
+            match (
+                self.availability_zone_id.as_deref(),
+                register_req.availability_zone_id.as_deref(),
+            ) {
+                (Some(current_az), Some(register_req_az)) => current_az == register_req_az,
+                _ => true,
+            }
+        };
+
+        az_ids_match
+            && self.id == register_req.node_id
             && self.listen_http_addr == register_req.listen_http_addr
             && self.listen_http_port == register_req.listen_http_port
             && self.listen_pg_addr == register_req.listen_pg_addr
@@ -173,6 +190,7 @@ impl Node {
         listen_http_port: u16,
         listen_pg_addr: String,
         listen_pg_port: u16,
+        availability_zone_id: Option<String>,
     ) -> Self {
         Self {
             id,
@@ -182,6 +200,7 @@ impl Node {
             listen_pg_port,
             scheduling: NodeSchedulingPolicy::Active,
             availability: NodeAvailability::Offline,
+            availability_zone_id,
             cancel: CancellationToken::new(),
         }
     }
@@ -194,6 +213,7 @@ impl Node {
             listen_http_port: self.listen_http_port as i32,
             listen_pg_addr: self.listen_pg_addr.clone(),
             listen_pg_port: self.listen_pg_port as i32,
+            availability_zone_id: self.availability_zone_id.clone(),
         }
     }
 
@@ -208,6 +228,7 @@ impl Node {
             listen_http_port: np.listen_http_port as u16,
             listen_pg_addr: np.listen_pg_addr,
             listen_pg_port: np.listen_pg_port as u16,
+            availability_zone_id: np.availability_zone_id,
             cancel: CancellationToken::new(),
         }
     }
diff --git a/storage_controller/src/persistence.rs b/storage_controller/src/persistence.rs
index 1a905753a1..a842079ce7 100644
--- a/storage_controller/src/persistence.rs
+++ b/storage_controller/src/persistence.rs
@@ -103,6 +103,7 @@ pub(crate) enum DatabaseOperation {
     ListMetadataHealthOutdated,
     GetLeader,
     UpdateLeader,
+    SetNodeAzId,
 }
 
 #[must_use]
@@ -315,6 +316,31 @@ impl Persistence {
         }
     }
 
+    pub(crate) async fn set_node_availability_zone_id(
+        &self,
+        input_node_id: NodeId,
+        input_az_id: String,
+    ) -> DatabaseResult<()> {
+        use crate::schema::nodes::dsl::*;
+        let updated = self
+            .with_measured_conn(DatabaseOperation::SetNodeAzId, move |conn| {
+                let updated = diesel::update(nodes)
+                    .filter(node_id.eq(input_node_id.0 as i64))
+                    .set((availability_zone_id.eq(input_az_id.clone()),))
+                    .execute(conn)?;
+                Ok(updated)
+            })
+            .await?;
+
+        if updated != 1 {
+            Err(DatabaseError::Logical(format!(
+                "Node {node_id:?} not found for setting az id",
+            )))
+        } else {
+            Ok(())
+        }
+    }
+
     /// At startup, load the high level state for shards, such as their config + policy.  This will
     /// be enriched at runtime with state discovered on pageservers.
     pub(crate) async fn list_tenant_shards(&self) -> DatabaseResult<Vec<TenantShardPersistence>> {
@@ -974,6 +1000,7 @@ pub(crate) struct NodePersistence {
     pub(crate) listen_http_port: i32,
     pub(crate) listen_pg_addr: String,
     pub(crate) listen_pg_port: i32,
+    pub(crate) availability_zone_id: Option<String>,
 }
 
 /// Tenant metadata health status that are stored durably.
diff --git a/storage_controller/src/scheduler.rs b/storage_controller/src/scheduler.rs
index 060e3cc6ca..ef4da6861c 100644
--- a/storage_controller/src/scheduler.rs
+++ b/storage_controller/src/scheduler.rs
@@ -528,6 +528,7 @@ pub(crate) mod test_utils {
                         80 + i as u16,
                         format!("pghost-{i}"),
                         5432 + i as u16,
+                        None,
                     );
                     node.set_availability(NodeAvailability::Active(test_utilization::simple(0, 0)));
                     assert!(node.is_available());
diff --git a/storage_controller/src/schema.rs b/storage_controller/src/schema.rs
index 77ba47e114..1e8379500c 100644
--- a/storage_controller/src/schema.rs
+++ b/storage_controller/src/schema.rs
@@ -25,6 +25,7 @@ diesel::table! {
         listen_http_port -> Int4,
         listen_pg_addr -> Varchar,
         listen_pg_port -> Int4,
+        availability_zone_id -> Nullable<Varchar>,
     }
 }
 
diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs
index 7daa1e4f5f..1f221a9b45 100644
--- a/storage_controller/src/service.rs
+++ b/storage_controller/src/service.rs
@@ -1257,6 +1257,7 @@ impl Service {
                     123,
                     "".to_string(),
                     123,
+                    None,
                 );
 
                 scheduler.node_upsert(&node);
@@ -4683,29 +4684,84 @@ impl Service {
         )
         .await;
 
-        {
+        if register_req.availability_zone_id.is_none() {
+            tracing::warn!(
+                "Node {} registering without specific availability zone id",
+                register_req.node_id
+            );
+        }
+
+        enum RegistrationStatus {
+            Matched(Node),
+            Mismatched,
+            New,
+        }
+
+        let registration_status = {
             let locked = self.inner.read().unwrap();
             if let Some(node) = locked.nodes.get(&register_req.node_id) {
-                // Note that we do not do a total equality of the struct, because we don't require
-                // the availability/scheduling states to agree for a POST to be idempotent.
                 if node.registration_match(&register_req) {
-                    tracing::info!(
-                        "Node {} re-registered with matching address",
-                        register_req.node_id
-                    );
-                    return Ok(());
+                    RegistrationStatus::Matched(node.clone())
                 } else {
-                    // TODO: decide if we want to allow modifying node addresses without removing and re-adding
-                    // the node.  Safest/simplest thing is to refuse it, and usually we deploy with
-                    // a fixed address through the lifetime of a node.
-                    tracing::warn!(
-                        "Node {} tried to register with different address",
-                        register_req.node_id
-                    );
-                    return Err(ApiError::Conflict(
-                        "Node is already registered with different address".to_string(),
-                    ));
+                    RegistrationStatus::Mismatched
                 }
+            } else {
+                RegistrationStatus::New
+            }
+        };
+
+        match registration_status {
+            RegistrationStatus::Matched(node) => {
+                tracing::info!(
+                    "Node {} re-registered with matching address",
+                    register_req.node_id
+                );
+
+                if node.get_availability_zone_id().is_none() {
+                    if let Some(az_id) = register_req.availability_zone_id.clone() {
+                        tracing::info!("Extracting availability zone id from registration request for node {}: {}",
+                                       register_req.node_id, az_id);
+
+                        // Persist to the database and update in memory state. See comment below
+                        // on ordering.
+                        self.persistence
+                            .set_node_availability_zone_id(register_req.node_id, az_id)
+                            .await?;
+                        let node_with_az = Node::new(
+                            register_req.node_id,
+                            register_req.listen_http_addr,
+                            register_req.listen_http_port,
+                            register_req.listen_pg_addr,
+                            register_req.listen_pg_port,
+                            register_req.availability_zone_id,
+                        );
+
+                        let mut locked = self.inner.write().unwrap();
+                        let mut new_nodes = (*locked.nodes).clone();
+
+                        locked.scheduler.node_upsert(&node_with_az);
+                        new_nodes.insert(register_req.node_id, node_with_az);
+
+                        locked.nodes = Arc::new(new_nodes);
+                    }
+                }
+
+                return Ok(());
+            }
+            RegistrationStatus::Mismatched => {
+                // TODO: decide if we want to allow modifying node addresses without removing and re-adding
+                // the node.  Safest/simplest thing is to refuse it, and usually we deploy with
+                // a fixed address through the lifetime of a node.
+                tracing::warn!(
+                    "Node {} tried to register with different address",
+                    register_req.node_id
+                );
+                return Err(ApiError::Conflict(
+                    "Node is already registered with different address".to_string(),
+                ));
+            }
+            RegistrationStatus::New => {
+                // fallthrough
             }
         }
 
@@ -4742,6 +4798,7 @@ impl Service {
             register_req.listen_http_port,
             register_req.listen_pg_addr,
             register_req.listen_pg_port,
+            register_req.availability_zone_id,
         );
 
         // TODO: idempotency if the node already exists in the database
diff --git a/test_runner/fixtures/pageserver/allowed_errors.py b/test_runner/fixtures/pageserver/allowed_errors.py
index f8d9a51c91..70f2676245 100755
--- a/test_runner/fixtures/pageserver/allowed_errors.py
+++ b/test_runner/fixtures/pageserver/allowed_errors.py
@@ -109,6 +109,9 @@ DEFAULT_STORAGE_CONTROLLER_ALLOWED_ERRORS = [
     # controller's attempts to notify the endpoint).
     ".*reconciler.*neon_local notification hook failed.*",
     ".*reconciler.*neon_local error.*",
+    # Neon local does not provide pageserver with an AZ
+    # TODO: remove this once neon local does so
+    ".*registering without specific availability zone id.*",
 ]
 
 
From 63a0d0d0397218ed9e830a35d8939da28ad5b6ee Mon Sep 17 00:00:00 2001
From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com>
Date: Thu, 29 Aug 2024 01:39:21 +0800
Subject: [PATCH 048/159] fix(storage-scrubber): make retry error into warnings
 (#8851)

We get many HTTP connect timeout errors from scrubber logs, and it
turned out that the scrubber is retrying, and this is not an actual
error. In the future, we should revisit all places where we log errors
in the storage scrubber, and only error when necessary (i.e., errors
that might need manual fixing)

Signed-off-by: Alex Chi Z <chi@neon.tech>
---
 storage_scrubber/src/lib.rs | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/storage_scrubber/src/lib.rs b/storage_scrubber/src/lib.rs
index 112f052e07..3c21d2f8cf 100644
--- a/storage_scrubber/src/lib.rs
+++ b/storage_scrubber/src/lib.rs
@@ -36,7 +36,7 @@ use serde::{Deserialize, Serialize};
 use storage_controller_client::control_api;
 use tokio::io::AsyncReadExt;
 use tokio_util::sync::CancellationToken;
-use tracing::error;
+use tracing::{error, warn};
 use tracing_appender::non_blocking::WorkerGuard;
 use tracing_subscriber::{fmt, prelude::*, EnvFilter};
 use utils::fs_ext;
@@ -466,7 +466,7 @@ async fn list_objects_with_retries(
                     return Err(e)
                         .with_context(|| format!("Failed to list objects {MAX_RETRIES} times"));
                 }
-                error!(
+                warn!(
                     "list_objects_v2 query failed: bucket_name={}, prefix={}, delimiter={}, error={}",
                     s3_target.bucket_name,
                     s3_target.prefix_in_bucket,

From 9627747d35dd9a5b7ceec099eb8f9604a95408dc Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Wed, 28 Aug 2024 20:31:41 +0200
Subject: [PATCH 049/159] bypass `PageCache` for `InMemoryLayer` + avoid
 `Value::deser` on L0 flush (#8537)

Part of [Epic: Bypass PageCache for user data
blocks](https://github.com/neondatabase/neon/issues/7386).

# Problem

`InMemoryLayer` still uses the `PageCache` for all data stored in the
`VirtualFile` that underlies the `EphemeralFile`.

# Background

Before this PR, `EphemeralFile` is a fancy and (code-bloated) buffered
writer around a `VirtualFile` that supports `blob_io`.

The `InMemoryLayerInner::index` stores offsets into the `EphemeralFile`.
At those offset, we find a varint length followed by the serialized
`Value`.

Vectored reads (`get_values_reconstruct_data`) are not in fact vectored
- each `Value` that needs to be read is read sequentially.

The `will_init` bit of information which we use to early-exit the
`get_values_reconstruct_data` for a given key is stored in the
serialized `Value`, meaning we have to read & deserialize the `Value`
from the `EphemeralFile`.

The L0 flushing **also** needs to re-determine the `will_init` bit of
information, by deserializing each value during L0 flush.

# Changes

1. Store the value length and `will_init` information in the
`InMemoryLayer::index`. The `EphemeralFile` thus only needs to store the
values.
2. For `get_values_reconstruct_data`:
- Use the in-memory `index` figures out which values need to be read.
Having the `will_init` stored in the index enables us to do that.
- View the EphemeralFile as a byte array of "DIO chunks", each 512 bytes
in size (adjustable constant). A "DIO chunk" is the minimal unit that we
can read under direct IO.
- Figure out which chunks need to be read to retrieve the serialized
bytes for thes values we need to read.
- Coalesce chunk reads such that each DIO chunk is only read once to
serve all value reads that need data from that chunk.
- Merge adjacent chunk reads into larger
`EphemeralFile::read_exact_at_eof_ok` of up to 128k (adjustable
constant).
3. The new `EphemeralFile::read_exact_at_eof_ok` fills the IO buffer
from the underlying VirtualFile and/or its in-memory buffer.
4. The L0 flush code is changed to use the `index` directly, `blob_io`
5. We can remove the `ephemeral_file::page_caching` construct now.

The `get_values_reconstruct_data` changes seem like a bit overkill but
they are necessary so we issue the equivalent amount of read system
calls compared to before this PR where it was highly likely that even if
the first PageCache access was a miss, remaining reads within the same
`get_values_reconstruct_data` call from the same `EphemeralFile` page
were a hit.

The "DIO chunk" stuff is truly unnecessary for page cache bypass, but,
since we're working on [direct
IO](https://github.com/neondatabase/neon/issues/8130) and
https://github.com/neondatabase/neon/issues/8719 specifically, we need
to do _something_ like this anyways in the near future.

# Alternative Design

The original plan was to use the `vectored_blob_io` code it relies on
the invariant of Delta&Image layers that `index order == values order`.

Further, `vectored_blob_io` code's strategy for merging IOs is limited
to adjacent reads. However, with direct IO, there is another level of
merging that should be done, specifically, if multiple reads map to the
same "DIO chunk" (=alignment-requirement-sized and -aligned region of
the file), then it's "free" to read the chunk into an IO buffer and
serve the two reads from that buffer.
=> https://github.com/neondatabase/neon/issues/8719

# Testing / Performance

Correctness of the IO merging code is ensured by unit tests.

Additionally, minimal tests are added for the `EphemeralFile`
implementation and the bit-packed `InMemoryLayerIndexValue`.

Performance testing results are presented below.
All pref testing done on my M2 MacBook Pro, running a Linux VM.
It's a release build without `--features testing`.

We see definitive improvement in ingest performance microbenchmark and
an ad-hoc microbenchmark for getpage against InMemoryLayer.

```
baseline: commit 7c74112b2a6e23c07bfd9cc62c240cd6bbdd3bd9 origin/main
HEAD: ef1c55c52e0c313be4d302794d29534591f9cdc5
```

<details>

```
cargo bench --bench bench_ingest -- 'ingest 128MB/100b seq, no delta'

baseline

ingest-small-values/ingest 128MB/100b seq, no delta
                        time:   [483.50 ms 498.73 ms 522.53 ms]
                        thrpt:  [244.96 MiB/s 256.65 MiB/s 264.73 MiB/s]

HEAD

ingest-small-values/ingest 128MB/100b seq, no delta
                        time:   [479.22 ms 482.92 ms 487.35 ms]
                        thrpt:  [262.64 MiB/s 265.06 MiB/s 267.10 MiB/s]
```

</details>

We don't have a micro-benchmark for InMemoryLayer and it's quite
cumbersome to add one. So, I did manual testing in `neon_local`.

<details>

```

  ./target/release/neon_local stop
  rm -rf .neon
  ./target/release/neon_local init
  ./target/release/neon_local start
  ./target/release/neon_local tenant create --set-default
  ./target/release/neon_local endpoint create foo
  ./target/release/neon_local endpoint start foo
  psql 'postgresql://cloud_admin@127.0.0.1:55432/postgres'
psql (13.16 (Debian 13.16-0+deb11u1), server 15.7)

CREATE TABLE wal_test (
    id SERIAL PRIMARY KEY,
    data TEXT
);

DO $$
DECLARE
    i INTEGER := 1;
BEGIN
    WHILE i <= 500000 LOOP
        INSERT INTO wal_test (data) VALUES ('data');
        i := i + 1;
    END LOOP;
END $$;

-- => result is one L0 from initdb and one 137M-sized ephemeral-2

DO $$
DECLARE
    i INTEGER := 1;
    random_id INTEGER;
    random_record wal_test%ROWTYPE;
    start_time TIMESTAMP := clock_timestamp();
    selects_completed INTEGER := 0;
    min_id INTEGER := 1;  -- Minimum ID value
    max_id INTEGER := 100000;  -- Maximum ID value, based on your insert range
    iters INTEGER := 100000000;  -- Number of iterations to run
BEGIN
    WHILE i <= iters LOOP
        -- Generate a random ID within the known range
        random_id := min_id + floor(random() * (max_id - min_id + 1))::int;

        -- Select the row with the generated random ID
        SELECT * INTO random_record
        FROM wal_test
        WHERE id = random_id;

        -- Increment the select counter
        selects_completed := selects_completed + 1;

        -- Check if a second has passed
        IF EXTRACT(EPOCH FROM clock_timestamp() - start_time) >= 1 THEN
            -- Print the number of selects completed in the last second
            RAISE NOTICE 'Selects completed in last second: %', selects_completed;

            -- Reset counters for the next second
            selects_completed := 0;
            start_time := clock_timestamp();
        END IF;

        -- Increment the loop counter
        i := i + 1;
    END LOOP;
END $$;

./target/release/neon_local stop

baseline: commit 7c74112b2a6e23c07bfd9cc62c240cd6bbdd3bd9 origin/main

NOTICE:  Selects completed in last second: 1864
NOTICE:  Selects completed in last second: 1850
NOTICE:  Selects completed in last second: 1851
NOTICE:  Selects completed in last second: 1918
NOTICE:  Selects completed in last second: 1911
NOTICE:  Selects completed in last second: 1879
NOTICE:  Selects completed in last second: 1858
NOTICE:  Selects completed in last second: 1827
NOTICE:  Selects completed in last second: 1933

ours

NOTICE:  Selects completed in last second: 1915
NOTICE:  Selects completed in last second: 1928
NOTICE:  Selects completed in last second: 1913
NOTICE:  Selects completed in last second: 1932
NOTICE:  Selects completed in last second: 1846
NOTICE:  Selects completed in last second: 1955
NOTICE:  Selects completed in last second: 1991
NOTICE:  Selects completed in last second: 1973
```

NB: the ephemeral file sizes differ by ca 1MiB, ours being 1MiB smaller.

</details>

# Rollout

This PR changes the code in-place and  is not gated by a feature flag.
---
 Cargo.lock                                    |  14 +
 Cargo.toml                                    |   2 +
 pageserver/Cargo.toml                         |   2 +
 pageserver/benches/bench_ingest.rs            |   4 +-
 pageserver/src/assert_u64_eq_usize.rs         |  39 +
 pageserver/src/config.rs                      |  10 +
 pageserver/src/lib.rs                         |   1 +
 pageserver/src/tenant.rs                      |   6 +
 pageserver/src/tenant/blob_io.rs              |   4 +-
 pageserver/src/tenant/block_io.rs             |  23 -
 pageserver/src/tenant/ephemeral_file.rs       | 430 +++++---
 .../src/tenant/ephemeral_file/page_caching.rs | 153 ---
 .../ephemeral_file/zero_padded_read_write.rs  | 145 ---
 .../zero_padded_read_write/zero_padded.rs     | 110 --
 .../src/tenant/storage_layer/delta_layer.rs   |   6 +-
 .../tenant/storage_layer/inmemory_layer.rs    | 509 ++++++++--
 .../inmemory_layer/vectored_dio_read.rs       | 937 ++++++++++++++++++
 pageserver/src/tenant/timeline.rs             |   6 +-
 .../virtual_file/owned_buffers_io/write.rs    |   1 +
 .../regress/test_pageserver_layer_rolling.py  |   9 +-
 20 files changed, 1757 insertions(+), 654 deletions(-)
 create mode 100644 pageserver/src/assert_u64_eq_usize.rs
 delete mode 100644 pageserver/src/tenant/ephemeral_file/page_caching.rs
 delete mode 100644 pageserver/src/tenant/ephemeral_file/zero_padded_read_write.rs
 delete mode 100644 pageserver/src/tenant/ephemeral_file/zero_padded_read_write/zero_padded.rs
 create mode 100644 pageserver/src/tenant/storage_layer/inmemory_layer/vectored_dio_read.rs

diff --git a/Cargo.lock b/Cargo.lock
index 441ca1ff86..c514625518 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -936,6 +936,12 @@ dependencies = [
  "which",
 ]
 
+[[package]]
+name = "bit_field"
+version = "0.10.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "dc827186963e592360843fb5ba4b973e145841266c1357f7180c43526f2e5b61"
+
 [[package]]
 name = "bitflags"
 version = "1.3.2"
@@ -3683,6 +3689,7 @@ dependencies = [
  "async-compression",
  "async-stream",
  "async-trait",
+ "bit_field",
  "byteorder",
  "bytes",
  "camino",
@@ -3732,6 +3739,7 @@ dependencies = [
  "reqwest 0.12.4",
  "rpds",
  "scopeguard",
+ "send-future",
  "serde",
  "serde_json",
  "serde_path_to_error",
@@ -5455,6 +5463,12 @@ version = "1.0.17"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "bebd363326d05ec3e2f532ab7660680f3b02130d780c299bca73469d521bc0ed"
 
+[[package]]
+name = "send-future"
+version = "0.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "224e328af6e080cddbab3c770b1cf50f0351ba0577091ef2410c3951d835ff87"
+
 [[package]]
 name = "sentry"
 version = "0.32.3"
diff --git a/Cargo.toml b/Cargo.toml
index e038c0b4ff..7bd9a26394 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -65,6 +65,7 @@ axum = { version = "0.6.20", features = ["ws"] }
 base64 = "0.13.0"
 bincode = "1.3"
 bindgen = "0.65"
+bit_field = "0.10.2"
 bstr = "1.0"
 byteorder = "1.4"
 bytes = "1.0"
@@ -145,6 +146,7 @@ rustls-split = "0.3"
 scopeguard = "1.1"
 sysinfo = "0.29.2"
 sd-notify = "0.4.1"
+send-future = "0.1.0"
 sentry = { version = "0.32", default-features = false, features = ["backtrace", "contexts", "panic", "rustls", "reqwest" ] }
 serde = { version = "1.0", features = ["derive"] }
 serde_json = "1"
diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml
index 0e748ee3db..85c5e24afc 100644
--- a/pageserver/Cargo.toml
+++ b/pageserver/Cargo.toml
@@ -16,6 +16,7 @@ arc-swap.workspace = true
 async-compression.workspace = true
 async-stream.workspace = true
 async-trait.workspace = true
+bit_field.workspace = true
 byteorder.workspace = true
 bytes.workspace = true
 camino.workspace = true
@@ -52,6 +53,7 @@ rand.workspace = true
 range-set-blaze = { version = "0.1.16", features = ["alloc"] }
 regex.workspace = true
 scopeguard.workspace = true
+send-future.workspace = true
 serde.workspace = true
 serde_json = { workspace = true, features = ["raw_value"] }
 serde_path_to_error.workspace = true
diff --git a/pageserver/benches/bench_ingest.rs b/pageserver/benches/bench_ingest.rs
index f450f46efa..1be4391d81 100644
--- a/pageserver/benches/bench_ingest.rs
+++ b/pageserver/benches/bench_ingest.rs
@@ -103,13 +103,13 @@ async fn ingest(
         batch.push((key.to_compact(), lsn, data_ser_size, data.clone()));
         if batch.len() >= BATCH_SIZE {
             let this_batch = std::mem::take(&mut batch);
-            let serialized = SerializedBatch::from_values(this_batch);
+            let serialized = SerializedBatch::from_values(this_batch).unwrap();
             layer.put_batch(serialized, &ctx).await?;
         }
     }
     if !batch.is_empty() {
         let this_batch = std::mem::take(&mut batch);
-        let serialized = SerializedBatch::from_values(this_batch);
+        let serialized = SerializedBatch::from_values(this_batch).unwrap();
         layer.put_batch(serialized, &ctx).await?;
     }
     layer.freeze(lsn + 1).await;
diff --git a/pageserver/src/assert_u64_eq_usize.rs b/pageserver/src/assert_u64_eq_usize.rs
new file mode 100644
index 0000000000..66ca7fd057
--- /dev/null
+++ b/pageserver/src/assert_u64_eq_usize.rs
@@ -0,0 +1,39 @@
+//! `u64`` and `usize`` aren't guaranteed to be identical in Rust, but life is much simpler if that's the case.
+
+pub(crate) const _ASSERT_U64_EQ_USIZE: () = {
+    if std::mem::size_of::<usize>() != std::mem::size_of::<u64>() {
+        panic!("the traits defined in this module assume that usize and u64 can be converted to each other without loss of information");
+    }
+};
+
+pub(crate) trait U64IsUsize {
+    fn into_usize(self) -> usize;
+}
+
+impl U64IsUsize for u64 {
+    #[inline(always)]
+    fn into_usize(self) -> usize {
+        #[allow(clippy::let_unit_value)]
+        let _ = _ASSERT_U64_EQ_USIZE;
+        self as usize
+    }
+}
+
+pub(crate) trait UsizeIsU64 {
+    fn into_u64(self) -> u64;
+}
+
+impl UsizeIsU64 for usize {
+    #[inline(always)]
+    fn into_u64(self) -> u64 {
+        #[allow(clippy::let_unit_value)]
+        let _ = _ASSERT_U64_EQ_USIZE;
+        self as u64
+    }
+}
+
+pub const fn u64_to_usize(x: u64) -> usize {
+    #[allow(clippy::let_unit_value)]
+    let _ = _ASSERT_U64_EQ_USIZE;
+    x as usize
+}
diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs
index ae473bcc5f..994075bef6 100644
--- a/pageserver/src/config.rs
+++ b/pageserver/src/config.rs
@@ -31,6 +31,7 @@ use utils::{
 
 use crate::l0_flush::L0FlushConfig;
 use crate::tenant::config::TenantConfOpt;
+use crate::tenant::storage_layer::inmemory_layer::IndexEntry;
 use crate::tenant::timeline::compaction::CompactL0Phase1ValueAccess;
 use crate::tenant::vectored_blob_io::MaxVectoredReadBytes;
 use crate::tenant::{TENANTS_SEGMENT_NAME, TIMELINES_SEGMENT_NAME};
@@ -1020,6 +1021,15 @@ impl PageServerConf {
 
         conf.default_tenant_conf = t_conf.merge(TenantConf::default());
 
+        IndexEntry::validate_checkpoint_distance(conf.default_tenant_conf.checkpoint_distance)
+            .map_err(|msg| anyhow::anyhow!("{msg}"))
+            .with_context(|| {
+                format!(
+                    "effective checkpoint distance is unsupported: {}",
+                    conf.default_tenant_conf.checkpoint_distance
+                )
+            })?;
+
         Ok(conf)
     }
 
diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs
index dbfc9f3544..7a9cf495c7 100644
--- a/pageserver/src/lib.rs
+++ b/pageserver/src/lib.rs
@@ -16,6 +16,7 @@ pub mod l0_flush;
 use futures::{stream::FuturesUnordered, StreamExt};
 pub use pageserver_api::keyspace;
 use tokio_util::sync::CancellationToken;
+mod assert_u64_eq_usize;
 pub mod aux_file;
 pub mod metrics;
 pub mod page_cache;
diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index 0364d521b6..60ab242ffc 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -877,6 +877,12 @@ impl Tenant {
                         });
                     };
 
+                // TODO: should also be rejecting tenant conf changes that violate this check.
+                if let Err(e) = crate::tenant::storage_layer::inmemory_layer::IndexEntry::validate_checkpoint_distance(tenant_clone.get_checkpoint_distance()) {
+                    make_broken(&tenant_clone, anyhow::anyhow!(e), BrokenVerbosity::Error);
+                    return Ok(());
+                }
+
                 let mut init_order = init_order;
                 // take the completion because initial tenant loading will complete when all of
                 // these tasks complete.
diff --git a/pageserver/src/tenant/blob_io.rs b/pageserver/src/tenant/blob_io.rs
index a245c99a88..dd70f6bbff 100644
--- a/pageserver/src/tenant/blob_io.rs
+++ b/pageserver/src/tenant/blob_io.rs
@@ -148,7 +148,7 @@ pub(super) const LEN_COMPRESSION_BIT_MASK: u8 = 0xf0;
 
 /// The maximum size of blobs we support. The highest few bits
 /// are reserved for compression and other further uses.
-const MAX_SUPPORTED_LEN: usize = 0x0fff_ffff;
+pub(crate) const MAX_SUPPORTED_BLOB_LEN: usize = 0x0fff_ffff;
 
 pub(super) const BYTE_UNCOMPRESSED: u8 = 0x80;
 pub(super) const BYTE_ZSTD: u8 = BYTE_UNCOMPRESSED | 0x10;
@@ -326,7 +326,7 @@ impl<const BUFFERED: bool> BlobWriter<BUFFERED> {
                 (self.write_all(io_buf.slice_len(), ctx).await, srcbuf)
             } else {
                 // Write a 4-byte length header
-                if len > MAX_SUPPORTED_LEN {
+                if len > MAX_SUPPORTED_BLOB_LEN {
                     return (
                         (
                             io_buf.slice_len(),
diff --git a/pageserver/src/tenant/block_io.rs b/pageserver/src/tenant/block_io.rs
index 601b095155..3afa3a86b9 100644
--- a/pageserver/src/tenant/block_io.rs
+++ b/pageserver/src/tenant/block_io.rs
@@ -2,7 +2,6 @@
 //! Low-level Block-oriented I/O functions
 //!
 
-use super::ephemeral_file::EphemeralFile;
 use super::storage_layer::delta_layer::{Adapter, DeltaLayerInner};
 use crate::context::RequestContext;
 use crate::page_cache::{self, FileId, PageReadGuard, PageWriteGuard, ReadBufResult, PAGE_SZ};
@@ -81,9 +80,7 @@ impl<'a> Deref for BlockLease<'a> {
 /// Unlike traits, we also support the read function to be async though.
 pub(crate) enum BlockReaderRef<'a> {
     FileBlockReader(&'a FileBlockReader<'a>),
-    EphemeralFile(&'a EphemeralFile),
     Adapter(Adapter<&'a DeltaLayerInner>),
-    Slice(&'a [u8]),
     #[cfg(test)]
     TestDisk(&'a super::disk_btree::tests::TestDisk),
     #[cfg(test)]
@@ -100,9 +97,7 @@ impl<'a> BlockReaderRef<'a> {
         use BlockReaderRef::*;
         match self {
             FileBlockReader(r) => r.read_blk(blknum, ctx).await,
-            EphemeralFile(r) => r.read_blk(blknum, ctx).await,
             Adapter(r) => r.read_blk(blknum, ctx).await,
-            Slice(s) => Self::read_blk_slice(s, blknum),
             #[cfg(test)]
             TestDisk(r) => r.read_blk(blknum),
             #[cfg(test)]
@@ -111,24 +106,6 @@ impl<'a> BlockReaderRef<'a> {
     }
 }
 
-impl<'a> BlockReaderRef<'a> {
-    fn read_blk_slice(slice: &[u8], blknum: u32) -> std::io::Result<BlockLease> {
-        let start = (blknum as usize).checked_mul(PAGE_SZ).unwrap();
-        let end = start.checked_add(PAGE_SZ).unwrap();
-        if end > slice.len() {
-            return Err(std::io::Error::new(
-                std::io::ErrorKind::UnexpectedEof,
-                format!("slice too short, len={} end={}", slice.len(), end),
-            ));
-        }
-        let slice = &slice[start..end];
-        let page_sized: &[u8; PAGE_SZ] = slice
-            .try_into()
-            .expect("we add PAGE_SZ to start, so the slice must have PAGE_SZ");
-        Ok(BlockLease::Slice(page_sized))
-    }
-}
-
 ///
 /// A "cursor" for efficiently reading multiple pages from a BlockReader
 ///
diff --git a/pageserver/src/tenant/ephemeral_file.rs b/pageserver/src/tenant/ephemeral_file.rs
index 44f0fc7ab1..5324e1807d 100644
--- a/pageserver/src/tenant/ephemeral_file.rs
+++ b/pageserver/src/tenant/ephemeral_file.rs
@@ -1,13 +1,21 @@
 //! Implementation of append-only file data structure
 //! used to keep in-memory layers spilled on disk.
 
+use crate::assert_u64_eq_usize::{U64IsUsize, UsizeIsU64};
 use crate::config::PageServerConf;
 use crate::context::RequestContext;
 use crate::page_cache;
-use crate::tenant::block_io::{BlockCursor, BlockLease, BlockReader};
-use crate::virtual_file::{self, VirtualFile};
+use crate::tenant::storage_layer::inmemory_layer::vectored_dio_read::File;
+use crate::virtual_file::owned_buffers_io::slice::SliceMutExt;
+use crate::virtual_file::owned_buffers_io::util::size_tracking_writer;
+use crate::virtual_file::owned_buffers_io::write::Buffer;
+use crate::virtual_file::{self, owned_buffers_io, VirtualFile};
+use bytes::BytesMut;
 use camino::Utf8PathBuf;
+use num_traits::Num;
 use pageserver_api::shard::TenantShardId;
+use tokio_epoll_uring::{BoundedBuf, Slice};
+use tracing::error;
 
 use std::io;
 use std::sync::atomic::AtomicU64;
@@ -16,12 +24,17 @@ use utils::id::TimelineId;
 pub struct EphemeralFile {
     _tenant_shard_id: TenantShardId,
     _timeline_id: TimelineId,
-
-    rw: page_caching::RW,
+    page_cache_file_id: page_cache::FileId,
+    bytes_written: u64,
+    buffered_writer: owned_buffers_io::write::BufferedWriter<
+        BytesMut,
+        size_tracking_writer::Writer<VirtualFile>,
+    >,
+    /// Gate guard is held on as long as we need to do operations in the path (delete on drop)
+    _gate_guard: utils::sync::gate::GateGuard,
 }
 
-mod page_caching;
-mod zero_padded_read_write;
+const TAIL_SZ: usize = 64 * 1024;
 
 impl EphemeralFile {
     pub async fn create(
@@ -51,75 +64,178 @@ impl EphemeralFile {
         )
         .await?;
 
+        let page_cache_file_id = page_cache::next_file_id(); // XXX get rid, we're not page-caching anymore
+
         Ok(EphemeralFile {
             _tenant_shard_id: tenant_shard_id,
             _timeline_id: timeline_id,
-            rw: page_caching::RW::new(file, gate_guard),
+            page_cache_file_id,
+            bytes_written: 0,
+            buffered_writer: owned_buffers_io::write::BufferedWriter::new(
+                size_tracking_writer::Writer::new(file),
+                BytesMut::with_capacity(TAIL_SZ),
+            ),
+            _gate_guard: gate_guard,
         })
     }
+}
 
+impl Drop for EphemeralFile {
+    fn drop(&mut self) {
+        // unlink the file
+        // we are clear to do this, because we have entered a gate
+        let path = &self.buffered_writer.as_inner().as_inner().path;
+        let res = std::fs::remove_file(path);
+        if let Err(e) = res {
+            if e.kind() != std::io::ErrorKind::NotFound {
+                // just never log the not found errors, we cannot do anything for them; on detach
+                // the tenant directory is already gone.
+                //
+                // not found files might also be related to https://github.com/neondatabase/neon/issues/2442
+                error!("could not remove ephemeral file '{path}': {e}");
+            }
+        }
+    }
+}
+
+impl EphemeralFile {
     pub(crate) fn len(&self) -> u64 {
-        self.rw.bytes_written()
+        self.bytes_written
     }
 
     pub(crate) fn page_cache_file_id(&self) -> page_cache::FileId {
-        self.rw.page_cache_file_id()
+        self.page_cache_file_id
     }
 
-    /// See [`self::page_caching::RW::load_to_vec`].
     pub(crate) async fn load_to_vec(&self, ctx: &RequestContext) -> Result<Vec<u8>, io::Error> {
-        self.rw.load_to_vec(ctx).await
-    }
-
-    pub(crate) async fn read_blk(
-        &self,
-        blknum: u32,
-        ctx: &RequestContext,
-    ) -> Result<BlockLease, io::Error> {
-        self.rw.read_blk(blknum, ctx).await
-    }
-
-    #[cfg(test)]
-    // This is a test helper: outside of tests, we are always written to via a pre-serialized batch.
-    pub(crate) async fn write_blob(
-        &mut self,
-        srcbuf: &[u8],
-        ctx: &RequestContext,
-    ) -> Result<u64, io::Error> {
-        let pos = self.rw.bytes_written();
-
-        let mut len_bytes = std::io::Cursor::new(Vec::new());
-        crate::tenant::storage_layer::inmemory_layer::SerializedBatch::write_blob_length(
-            srcbuf.len(),
-            &mut len_bytes,
-        );
-        let len_bytes = len_bytes.into_inner();
-
-        // Write the length field
-        self.rw.write_all_borrowed(&len_bytes, ctx).await?;
-
-        // Write the payload
-        self.rw.write_all_borrowed(srcbuf, ctx).await?;
-
-        Ok(pos)
+        let size = self.len().into_usize();
+        let vec = Vec::with_capacity(size);
+        let (slice, nread) = self.read_exact_at_eof_ok(0, vec.slice_full(), ctx).await?;
+        assert_eq!(nread, size);
+        let vec = slice.into_inner();
+        assert_eq!(vec.len(), nread);
+        assert_eq!(vec.capacity(), size, "we shouldn't be reallocating");
+        Ok(vec)
     }
 
     /// Returns the offset at which the first byte of the input was written, for use
     /// in constructing indices over the written value.
+    ///
+    /// Panics if the write is short because there's no way we can recover from that.
+    /// TODO: make upstack handle this as an error.
     pub(crate) async fn write_raw(
         &mut self,
         srcbuf: &[u8],
         ctx: &RequestContext,
-    ) -> Result<u64, io::Error> {
-        let pos = self.rw.bytes_written();
+    ) -> std::io::Result<u64> {
+        let pos = self.bytes_written;
+
+        let new_bytes_written = pos.checked_add(srcbuf.len().into_u64()).ok_or_else(|| {
+            std::io::Error::new(
+                std::io::ErrorKind::Other,
+                format!(
+                    "write would grow EphemeralFile beyond u64::MAX: len={pos} writen={srcbuf_len}",
+                    srcbuf_len = srcbuf.len(),
+                ),
+            )
+        })?;
 
         // Write the payload
-        self.rw.write_all_borrowed(srcbuf, ctx).await?;
+        let nwritten = self
+            .buffered_writer
+            .write_buffered_borrowed(srcbuf, ctx)
+            .await?;
+        assert_eq!(
+            nwritten,
+            srcbuf.len(),
+            "buffered writer has no short writes"
+        );
+
+        self.bytes_written = new_bytes_written;
 
         Ok(pos)
     }
 }
 
+impl super::storage_layer::inmemory_layer::vectored_dio_read::File for EphemeralFile {
+    async fn read_exact_at_eof_ok<'a, 'b, B: tokio_epoll_uring::IoBufMut + Send>(
+        &'b self,
+        start: u64,
+        dst: tokio_epoll_uring::Slice<B>,
+        ctx: &'a RequestContext,
+    ) -> std::io::Result<(tokio_epoll_uring::Slice<B>, usize)> {
+        let file_size_tracking_writer = self.buffered_writer.as_inner();
+        let flushed_offset = file_size_tracking_writer.bytes_written();
+
+        let buffer = self.buffered_writer.inspect_buffer();
+        let buffered = &buffer[0..buffer.pending()];
+
+        let dst_cap = dst.bytes_total().into_u64();
+        let end = {
+            // saturating_add is correct here because the max file size is u64::MAX, so,
+            // if start + dst.len() > u64::MAX, then we know it will be a short read
+            let mut end: u64 = start.saturating_add(dst_cap);
+            if end > self.bytes_written {
+                end = self.bytes_written;
+            }
+            end
+        };
+
+        // inclusive, exclusive
+        #[derive(Debug)]
+        struct Range<N>(N, N);
+        impl<N: Num + Clone + Copy + PartialOrd + Ord> Range<N> {
+            fn len(&self) -> N {
+                if self.0 > self.1 {
+                    N::zero()
+                } else {
+                    self.1 - self.0
+                }
+            }
+        }
+        let written_range = Range(start, std::cmp::min(end, flushed_offset));
+        let buffered_range = Range(std::cmp::max(start, flushed_offset), end);
+
+        let dst = if written_range.len() > 0 {
+            let file: &VirtualFile = file_size_tracking_writer.as_inner();
+            let bounds = dst.bounds();
+            let slice = file
+                .read_exact_at(dst.slice(0..written_range.len().into_usize()), start, ctx)
+                .await?;
+            Slice::from_buf_bounds(Slice::into_inner(slice), bounds)
+        } else {
+            dst
+        };
+
+        let dst = if buffered_range.len() > 0 {
+            let offset_in_buffer = buffered_range
+                .0
+                .checked_sub(flushed_offset)
+                .unwrap()
+                .into_usize();
+            let to_copy =
+                &buffered[offset_in_buffer..(offset_in_buffer + buffered_range.len().into_usize())];
+            let bounds = dst.bounds();
+            let mut view = dst.slice({
+                let start = written_range.len().into_usize();
+                let end = start
+                    .checked_add(buffered_range.len().into_usize())
+                    .unwrap();
+                start..end
+            });
+            view.as_mut_rust_slice_full_zeroed()
+                .copy_from_slice(to_copy);
+            Slice::from_buf_bounds(Slice::into_inner(view), bounds)
+        } else {
+            dst
+        };
+
+        // TODO: in debug mode, randomize the remaining bytes in `dst` to catch bugs
+
+        Ok((dst, (end - start).into_usize()))
+    }
+}
+
 /// Does the given filename look like an ephemeral file?
 pub fn is_ephemeral_file(filename: &str) -> bool {
     if let Some(rest) = filename.strip_prefix("ephemeral-") {
@@ -129,19 +245,13 @@ pub fn is_ephemeral_file(filename: &str) -> bool {
     }
 }
 
-impl BlockReader for EphemeralFile {
-    fn block_cursor(&self) -> super::block_io::BlockCursor<'_> {
-        BlockCursor::new(super::block_io::BlockReaderRef::EphemeralFile(self))
-    }
-}
-
 #[cfg(test)]
 mod tests {
+    use rand::Rng;
+
     use super::*;
     use crate::context::DownloadBehavior;
     use crate::task_mgr::TaskKind;
-    use crate::tenant::block_io::BlockReaderRef;
-    use rand::{thread_rng, RngCore};
     use std::fs;
     use std::str::FromStr;
 
@@ -172,69 +282,6 @@ mod tests {
         Ok((conf, tenant_shard_id, timeline_id, ctx))
     }
 
-    #[tokio::test]
-    async fn test_ephemeral_blobs() -> Result<(), io::Error> {
-        let (conf, tenant_id, timeline_id, ctx) = harness("ephemeral_blobs")?;
-
-        let gate = utils::sync::gate::Gate::default();
-
-        let entered = gate.enter().unwrap();
-
-        let mut file = EphemeralFile::create(conf, tenant_id, timeline_id, entered, &ctx).await?;
-
-        let pos_foo = file.write_blob(b"foo", &ctx).await?;
-        assert_eq!(
-            b"foo",
-            file.block_cursor()
-                .read_blob(pos_foo, &ctx)
-                .await?
-                .as_slice()
-        );
-        let pos_bar = file.write_blob(b"bar", &ctx).await?;
-        assert_eq!(
-            b"foo",
-            file.block_cursor()
-                .read_blob(pos_foo, &ctx)
-                .await?
-                .as_slice()
-        );
-        assert_eq!(
-            b"bar",
-            file.block_cursor()
-                .read_blob(pos_bar, &ctx)
-                .await?
-                .as_slice()
-        );
-
-        let mut blobs = Vec::new();
-        for i in 0..10000 {
-            let data = Vec::from(format!("blob{}", i).as_bytes());
-            let pos = file.write_blob(&data, &ctx).await?;
-            blobs.push((pos, data));
-        }
-        // also test with a large blobs
-        for i in 0..100 {
-            let data = format!("blob{}", i).as_bytes().repeat(100);
-            let pos = file.write_blob(&data, &ctx).await?;
-            blobs.push((pos, data));
-        }
-
-        let cursor = BlockCursor::new(BlockReaderRef::EphemeralFile(&file));
-        for (pos, expected) in blobs {
-            let actual = cursor.read_blob(pos, &ctx).await?;
-            assert_eq!(actual, expected);
-        }
-
-        // Test a large blob that spans multiple pages
-        let mut large_data = vec![0; 20000];
-        thread_rng().fill_bytes(&mut large_data);
-        let pos_large = file.write_blob(&large_data, &ctx).await?;
-        let result = file.block_cursor().read_blob(pos_large, &ctx).await?;
-        assert_eq!(result, large_data);
-
-        Ok(())
-    }
-
     #[tokio::test]
     async fn ephemeral_file_holds_gate_open() {
         const FOREVER: std::time::Duration = std::time::Duration::from_secs(5);
@@ -268,4 +315,151 @@ mod tests {
             .expect("closing completes right away")
             .expect("closing does not panic");
     }
+
+    #[tokio::test]
+    async fn test_ephemeral_file_basics() {
+        let (conf, tenant_id, timeline_id, ctx) = harness("test_ephemeral_file_basics").unwrap();
+
+        let gate = utils::sync::gate::Gate::default();
+
+        let mut file =
+            EphemeralFile::create(conf, tenant_id, timeline_id, gate.enter().unwrap(), &ctx)
+                .await
+                .unwrap();
+
+        let cap = file.buffered_writer.inspect_buffer().capacity();
+
+        let write_nbytes = cap + cap / 2;
+
+        let content: Vec<u8> = rand::thread_rng()
+            .sample_iter(rand::distributions::Standard)
+            .take(write_nbytes)
+            .collect();
+
+        let mut value_offsets = Vec::new();
+        for i in 0..write_nbytes {
+            let off = file.write_raw(&content[i..i + 1], &ctx).await.unwrap();
+            value_offsets.push(off);
+        }
+
+        assert!(file.len() as usize == write_nbytes);
+        for i in 0..write_nbytes {
+            assert_eq!(value_offsets[i], i.into_u64());
+            let buf = Vec::with_capacity(1);
+            let (buf_slice, nread) = file
+                .read_exact_at_eof_ok(i.into_u64(), buf.slice_full(), &ctx)
+                .await
+                .unwrap();
+            let buf = buf_slice.into_inner();
+            assert_eq!(nread, 1);
+            assert_eq!(&buf, &content[i..i + 1]);
+        }
+
+        let file_contents =
+            std::fs::read(&file.buffered_writer.as_inner().as_inner().path).unwrap();
+        assert_eq!(file_contents, &content[0..cap]);
+
+        let buffer_contents = file.buffered_writer.inspect_buffer();
+        assert_eq!(buffer_contents, &content[cap..write_nbytes]);
+    }
+
+    #[tokio::test]
+    async fn test_flushes_do_happen() {
+        let (conf, tenant_id, timeline_id, ctx) = harness("test_flushes_do_happen").unwrap();
+
+        let gate = utils::sync::gate::Gate::default();
+
+        let mut file =
+            EphemeralFile::create(conf, tenant_id, timeline_id, gate.enter().unwrap(), &ctx)
+                .await
+                .unwrap();
+
+        let cap = file.buffered_writer.inspect_buffer().capacity();
+
+        let content: Vec<u8> = rand::thread_rng()
+            .sample_iter(rand::distributions::Standard)
+            .take(cap + cap / 2)
+            .collect();
+
+        file.write_raw(&content, &ctx).await.unwrap();
+
+        // assert the state is as this test expects it to be
+        assert_eq!(
+            &file.load_to_vec(&ctx).await.unwrap(),
+            &content[0..cap + cap / 2]
+        );
+        let md = file
+            .buffered_writer
+            .as_inner()
+            .as_inner()
+            .path
+            .metadata()
+            .unwrap();
+        assert_eq!(
+            md.len(),
+            cap.into_u64(),
+            "buffered writer does one write if we write 1.5x buffer capacity"
+        );
+        assert_eq!(
+            &file.buffered_writer.inspect_buffer()[0..cap / 2],
+            &content[cap..cap + cap / 2]
+        );
+    }
+
+    #[tokio::test]
+    async fn test_read_split_across_file_and_buffer() {
+        // This test exercises the logic on the read path that splits the logical read
+        // into a read from the flushed part (= the file) and a copy from the buffered writer's buffer.
+        //
+        // This test build on the assertions in test_flushes_do_happen
+
+        let (conf, tenant_id, timeline_id, ctx) =
+            harness("test_read_split_across_file_and_buffer").unwrap();
+
+        let gate = utils::sync::gate::Gate::default();
+
+        let mut file =
+            EphemeralFile::create(conf, tenant_id, timeline_id, gate.enter().unwrap(), &ctx)
+                .await
+                .unwrap();
+
+        let cap = file.buffered_writer.inspect_buffer().capacity();
+
+        let content: Vec<u8> = rand::thread_rng()
+            .sample_iter(rand::distributions::Standard)
+            .take(cap + cap / 2)
+            .collect();
+
+        file.write_raw(&content, &ctx).await.unwrap();
+
+        let test_read = |start: usize, len: usize| {
+            let file = &file;
+            let ctx = &ctx;
+            let content = &content;
+            async move {
+                let (buf, nread) = file
+                    .read_exact_at_eof_ok(
+                        start.into_u64(),
+                        Vec::with_capacity(len).slice_full(),
+                        ctx,
+                    )
+                    .await
+                    .unwrap();
+                assert_eq!(nread, len);
+                assert_eq!(&buf.into_inner(), &content[start..(start + len)]);
+            }
+        };
+
+        // completely within the file range
+        assert!(20 < cap, "test assumption");
+        test_read(10, 10).await;
+        // border onto edge of file
+        test_read(cap - 10, 10).await;
+        // read across file and buffer
+        test_read(cap - 10, 20).await;
+        // stay from start of buffer
+        test_read(cap, 10).await;
+        // completely within buffer
+        test_read(cap + 10, 10).await;
+    }
 }
diff --git a/pageserver/src/tenant/ephemeral_file/page_caching.rs b/pageserver/src/tenant/ephemeral_file/page_caching.rs
deleted file mode 100644
index 48926354f1..0000000000
--- a/pageserver/src/tenant/ephemeral_file/page_caching.rs
+++ /dev/null
@@ -1,153 +0,0 @@
-//! Wrapper around [`super::zero_padded_read_write::RW`] that uses the
-//! [`crate::page_cache`] to serve reads that need to go to the underlying [`VirtualFile`].
-//!
-//! Subject to removal in <https://github.com/neondatabase/neon/pull/8537>
-
-use crate::context::RequestContext;
-use crate::page_cache::{self, PAGE_SZ};
-use crate::tenant::block_io::BlockLease;
-use crate::virtual_file::owned_buffers_io::util::size_tracking_writer;
-use crate::virtual_file::VirtualFile;
-
-use std::io::{self};
-use tokio_epoll_uring::BoundedBuf;
-use tracing::*;
-
-use super::zero_padded_read_write;
-
-/// See module-level comment.
-pub struct RW {
-    page_cache_file_id: page_cache::FileId,
-    rw: super::zero_padded_read_write::RW<size_tracking_writer::Writer<VirtualFile>>,
-    /// Gate guard is held on as long as we need to do operations in the path (delete on drop).
-    _gate_guard: utils::sync::gate::GateGuard,
-}
-
-impl RW {
-    pub fn new(file: VirtualFile, _gate_guard: utils::sync::gate::GateGuard) -> Self {
-        let page_cache_file_id = page_cache::next_file_id();
-        Self {
-            page_cache_file_id,
-            rw: super::zero_padded_read_write::RW::new(size_tracking_writer::Writer::new(file)),
-            _gate_guard,
-        }
-    }
-
-    pub fn page_cache_file_id(&self) -> page_cache::FileId {
-        self.page_cache_file_id
-    }
-
-    pub(crate) async fn write_all_borrowed(
-        &mut self,
-        srcbuf: &[u8],
-        ctx: &RequestContext,
-    ) -> Result<usize, io::Error> {
-        // It doesn't make sense to proactively fill the page cache on the Pageserver write path
-        // because Compute is unlikely to access recently written data.
-        self.rw.write_all_borrowed(srcbuf, ctx).await
-    }
-
-    pub(crate) fn bytes_written(&self) -> u64 {
-        self.rw.bytes_written()
-    }
-
-    /// Load all blocks that can be read via [`Self::read_blk`] into a contiguous memory buffer.
-    ///
-    /// This includes the blocks that aren't yet flushed to disk by the internal buffered writer.
-    /// The last block is zero-padded to [`PAGE_SZ`], so, the returned buffer is always a multiple of [`PAGE_SZ`].
-    pub(super) async fn load_to_vec(&self, ctx: &RequestContext) -> Result<Vec<u8>, io::Error> {
-        // round up to the next PAGE_SZ multiple, required by blob_io
-        let size = {
-            let s = usize::try_from(self.bytes_written()).unwrap();
-            if s % PAGE_SZ == 0 {
-                s
-            } else {
-                s.checked_add(PAGE_SZ - (s % PAGE_SZ)).unwrap()
-            }
-        };
-        let vec = Vec::with_capacity(size);
-
-        // read from disk what we've already flushed
-        let file_size_tracking_writer = self.rw.as_writer();
-        let flushed_range = 0..usize::try_from(file_size_tracking_writer.bytes_written()).unwrap();
-        let mut vec = file_size_tracking_writer
-            .as_inner()
-            .read_exact_at(
-                vec.slice(0..(flushed_range.end - flushed_range.start)),
-                u64::try_from(flushed_range.start).unwrap(),
-                ctx,
-            )
-            .await?
-            .into_inner();
-
-        // copy from in-memory buffer what we haven't flushed yet but would return when accessed via read_blk
-        let buffered = self.rw.get_tail_zero_padded();
-        vec.extend_from_slice(buffered);
-        assert_eq!(vec.len(), size);
-        assert_eq!(vec.len() % PAGE_SZ, 0);
-        Ok(vec)
-    }
-
-    pub(crate) async fn read_blk(
-        &self,
-        blknum: u32,
-        ctx: &RequestContext,
-    ) -> Result<BlockLease, io::Error> {
-        match self.rw.read_blk(blknum).await? {
-            zero_padded_read_write::ReadResult::NeedsReadFromWriter { writer } => {
-                let cache = page_cache::get();
-                match cache
-                    .read_immutable_buf(self.page_cache_file_id, blknum, ctx)
-                    .await
-                    .map_err(|e| {
-                        std::io::Error::new(
-                            std::io::ErrorKind::Other,
-                            // order path before error because error is anyhow::Error => might have many contexts
-                            format!(
-                                "ephemeral file: read immutable page #{}: {}: {:#}",
-                                blknum,
-                                self.rw.as_writer().as_inner().path,
-                                e,
-                            ),
-                        )
-                    })? {
-                    page_cache::ReadBufResult::Found(guard) => {
-                        return Ok(BlockLease::PageReadGuard(guard))
-                    }
-                    page_cache::ReadBufResult::NotFound(write_guard) => {
-                        let write_guard = writer
-                            .as_inner()
-                            .read_exact_at_page(write_guard, blknum as u64 * PAGE_SZ as u64, ctx)
-                            .await?;
-                        let read_guard = write_guard.mark_valid();
-                        return Ok(BlockLease::PageReadGuard(read_guard));
-                    }
-                }
-            }
-            zero_padded_read_write::ReadResult::ServedFromZeroPaddedMutableTail { buffer } => {
-                Ok(BlockLease::EphemeralFileMutableTail(buffer))
-            }
-        }
-    }
-}
-
-impl Drop for RW {
-    fn drop(&mut self) {
-        // There might still be pages in the [`crate::page_cache`] for this file.
-        // We leave them there, [`crate::page_cache::PageCache::find_victim`] will evict them when needed.
-
-        // unlink the file
-        // we are clear to do this, because we have entered a gate
-        let path = &self.rw.as_writer().as_inner().path;
-        let res = std::fs::remove_file(path);
-        if let Err(e) = res {
-            if e.kind() != std::io::ErrorKind::NotFound {
-                // just never log the not found errors, we cannot do anything for them; on detach
-                // the tenant directory is already gone.
-                //
-                // not found files might also be related to https://github.com/neondatabase/neon/issues/2442
-                error!("could not remove ephemeral file '{path}': {e}");
-            }
-        }
-    }
-}
diff --git a/pageserver/src/tenant/ephemeral_file/zero_padded_read_write.rs b/pageserver/src/tenant/ephemeral_file/zero_padded_read_write.rs
deleted file mode 100644
index fe310acab8..0000000000
--- a/pageserver/src/tenant/ephemeral_file/zero_padded_read_write.rs
+++ /dev/null
@@ -1,145 +0,0 @@
-//! The heart of how [`super::EphemeralFile`] does its reads and writes.
-//!
-//! # Writes
-//!
-//! [`super::EphemeralFile`] writes small, borrowed buffers using [`RW::write_all_borrowed`].
-//! The [`RW`] batches these into [`TAIL_SZ`] bigger writes, using [`owned_buffers_io::write::BufferedWriter`].
-//!
-//! # Reads
-//!
-//! [`super::EphemeralFile`] always reads full [`PAGE_SZ`]ed blocks using [`RW::read_blk`].
-//!
-//! The [`RW`] serves these reads either from the buffered writer's in-memory buffer
-//! or redirects the caller to read from the underlying [`OwnedAsyncWriter`]
-//! if the read is for the prefix that has already been flushed.
-//!
-//! # Current Usage
-//!
-//! The current user of this module is [`super::page_caching::RW`].
-
-mod zero_padded;
-
-use crate::{
-    context::RequestContext,
-    page_cache::PAGE_SZ,
-    virtual_file::owned_buffers_io::{
-        self,
-        write::{Buffer, OwnedAsyncWriter},
-    },
-};
-
-const TAIL_SZ: usize = 64 * 1024;
-
-/// See module-level comment.
-pub struct RW<W: OwnedAsyncWriter> {
-    buffered_writer: owned_buffers_io::write::BufferedWriter<
-        zero_padded::Buffer<TAIL_SZ>,
-        owned_buffers_io::util::size_tracking_writer::Writer<W>,
-    >,
-}
-
-pub enum ReadResult<'a, W> {
-    NeedsReadFromWriter { writer: &'a W },
-    ServedFromZeroPaddedMutableTail { buffer: &'a [u8; PAGE_SZ] },
-}
-
-impl<W> RW<W>
-where
-    W: OwnedAsyncWriter,
-{
-    pub fn new(writer: W) -> Self {
-        let bytes_flushed_tracker =
-            owned_buffers_io::util::size_tracking_writer::Writer::new(writer);
-        let buffered_writer = owned_buffers_io::write::BufferedWriter::new(
-            bytes_flushed_tracker,
-            zero_padded::Buffer::default(),
-        );
-        Self { buffered_writer }
-    }
-
-    pub(crate) fn as_writer(&self) -> &W {
-        self.buffered_writer.as_inner().as_inner()
-    }
-
-    pub async fn write_all_borrowed(
-        &mut self,
-        buf: &[u8],
-        ctx: &RequestContext,
-    ) -> std::io::Result<usize> {
-        self.buffered_writer.write_buffered_borrowed(buf, ctx).await
-    }
-
-    pub fn bytes_written(&self) -> u64 {
-        let flushed_offset = self.buffered_writer.as_inner().bytes_written();
-        let buffer: &zero_padded::Buffer<TAIL_SZ> = self.buffered_writer.inspect_buffer();
-        flushed_offset + u64::try_from(buffer.pending()).unwrap()
-    }
-
-    /// Get a slice of all blocks that [`Self::read_blk`] would return as [`ReadResult::ServedFromZeroPaddedMutableTail`].
-    pub fn get_tail_zero_padded(&self) -> &[u8] {
-        let buffer: &zero_padded::Buffer<TAIL_SZ> = self.buffered_writer.inspect_buffer();
-        let buffer_written_up_to = buffer.pending();
-        // pad to next page boundary
-        let read_up_to = if buffer_written_up_to % PAGE_SZ == 0 {
-            buffer_written_up_to
-        } else {
-            buffer_written_up_to
-                .checked_add(PAGE_SZ - (buffer_written_up_to % PAGE_SZ))
-                .unwrap()
-        };
-        &buffer.as_zero_padded_slice()[0..read_up_to]
-    }
-
-    pub(crate) async fn read_blk(&self, blknum: u32) -> Result<ReadResult<'_, W>, std::io::Error> {
-        let flushed_offset = self.buffered_writer.as_inner().bytes_written();
-        let buffer: &zero_padded::Buffer<TAIL_SZ> = self.buffered_writer.inspect_buffer();
-        let buffered_offset = flushed_offset + u64::try_from(buffer.pending()).unwrap();
-        let read_offset = (blknum as u64) * (PAGE_SZ as u64);
-
-        // The trailing page ("block") might only be partially filled,
-        // yet the blob_io code relies on us to return a full PAGE_SZed slice anyway.
-        // Moreover, it has to be zero-padded, because when we still had
-        // a write-back page cache, it provided pre-zeroed pages, and blob_io came to rely on it.
-        // DeltaLayer probably has the same issue, not sure why it needs no special treatment.
-        // => check here that the read doesn't go beyond this potentially trailing
-        // => the zero-padding is done in the `else` branch below
-        let blocks_written = if buffered_offset % (PAGE_SZ as u64) == 0 {
-            buffered_offset / (PAGE_SZ as u64)
-        } else {
-            (buffered_offset / (PAGE_SZ as u64)) + 1
-        };
-        if (blknum as u64) >= blocks_written {
-            return Err(std::io::Error::new(std::io::ErrorKind::Other, anyhow::anyhow!("read past end of ephemeral_file: read=0x{read_offset:x} buffered=0x{buffered_offset:x} flushed=0x{flushed_offset}")));
-        }
-
-        // assertions for the `if-else` below
-        assert_eq!(
-            flushed_offset % (TAIL_SZ as u64), 0,
-            "we only use write_buffered_borrowed to write to the buffered writer, so it's guaranteed that flushes happen buffer.cap()-sized chunks"
-        );
-        assert_eq!(
-            flushed_offset % (PAGE_SZ as u64),
-            0,
-            "the logic below can't handle if the page is spread across the flushed part and the buffer"
-        );
-
-        if read_offset < flushed_offset {
-            assert!(read_offset + (PAGE_SZ as u64) <= flushed_offset);
-            Ok(ReadResult::NeedsReadFromWriter {
-                writer: self.as_writer(),
-            })
-        } else {
-            let read_offset_in_buffer = read_offset
-                .checked_sub(flushed_offset)
-                .expect("would have taken `if` branch instead of this one");
-            let read_offset_in_buffer = usize::try_from(read_offset_in_buffer).unwrap();
-            let zero_padded_slice = buffer.as_zero_padded_slice();
-            let page = &zero_padded_slice[read_offset_in_buffer..(read_offset_in_buffer + PAGE_SZ)];
-            Ok(ReadResult::ServedFromZeroPaddedMutableTail {
-                buffer: page
-                    .try_into()
-                    .expect("the slice above got it as page-size slice"),
-            })
-        }
-    }
-}
diff --git a/pageserver/src/tenant/ephemeral_file/zero_padded_read_write/zero_padded.rs b/pageserver/src/tenant/ephemeral_file/zero_padded_read_write/zero_padded.rs
deleted file mode 100644
index 2dc0277638..0000000000
--- a/pageserver/src/tenant/ephemeral_file/zero_padded_read_write/zero_padded.rs
+++ /dev/null
@@ -1,110 +0,0 @@
-//! A [`crate::virtual_file::owned_buffers_io::write::Buffer`] whose
-//! unwritten range is guaranteed to be zero-initialized.
-//! This is used by [`crate::tenant::ephemeral_file::zero_padded_read_write::RW::read_blk`]
-//! to serve page-sized reads of the trailing page when the trailing page has only been partially filled.
-
-use std::mem::MaybeUninit;
-
-use crate::virtual_file::owned_buffers_io::io_buf_ext::FullSlice;
-
-/// See module-level comment.
-pub struct Buffer<const N: usize> {
-    allocation: Box<[u8; N]>,
-    written: usize,
-}
-
-impl<const N: usize> Default for Buffer<N> {
-    fn default() -> Self {
-        Self {
-            allocation: Box::new(
-                // SAFETY: zeroed memory is a valid [u8; N]
-                unsafe { MaybeUninit::zeroed().assume_init() },
-            ),
-            written: 0,
-        }
-    }
-}
-
-impl<const N: usize> Buffer<N> {
-    #[inline(always)]
-    fn invariants(&self) {
-        // don't check by default, unoptimized is too expensive even for debug mode
-        if false {
-            debug_assert!(self.written <= N, "{}", self.written);
-            debug_assert!(self.allocation[self.written..N].iter().all(|v| *v == 0));
-        }
-    }
-
-    pub fn as_zero_padded_slice(&self) -> &[u8; N] {
-        &self.allocation
-    }
-}
-
-impl<const N: usize> crate::virtual_file::owned_buffers_io::write::Buffer for Buffer<N> {
-    type IoBuf = Self;
-
-    fn cap(&self) -> usize {
-        self.allocation.len()
-    }
-
-    fn extend_from_slice(&mut self, other: &[u8]) {
-        self.invariants();
-        let remaining = self.allocation.len() - self.written;
-        if other.len() > remaining {
-            panic!("calling extend_from_slice() with insufficient remaining capacity");
-        }
-        self.allocation[self.written..(self.written + other.len())].copy_from_slice(other);
-        self.written += other.len();
-        self.invariants();
-    }
-
-    fn pending(&self) -> usize {
-        self.written
-    }
-
-    fn flush(self) -> FullSlice<Self> {
-        self.invariants();
-        let written = self.written;
-        FullSlice::must_new(tokio_epoll_uring::BoundedBuf::slice(self, 0..written))
-    }
-
-    fn reuse_after_flush(iobuf: Self::IoBuf) -> Self {
-        let Self {
-            mut allocation,
-            written,
-        } = iobuf;
-        allocation[0..written].fill(0);
-        let new = Self {
-            allocation,
-            written: 0,
-        };
-        new.invariants();
-        new
-    }
-}
-
-/// We have this trait impl so that the `flush` method in the `Buffer` impl above can produce a
-/// [`tokio_epoll_uring::BoundedBuf::slice`] of the [`Self::written`] range of the data.
-///
-/// Remember that bytes_init is generally _not_ a tracker of the amount
-/// of valid data in the io buffer; we use `Slice` for that.
-/// The `IoBuf` is _only_ for keeping track of uninitialized memory, a bit like MaybeUninit.
-///
-/// SAFETY:
-///
-/// The [`Self::allocation`] is stable becauses boxes are stable.
-/// The memory is zero-initialized, so, bytes_init is always N.
-unsafe impl<const N: usize> tokio_epoll_uring::IoBuf for Buffer<N> {
-    fn stable_ptr(&self) -> *const u8 {
-        self.allocation.as_ptr()
-    }
-
-    fn bytes_init(&self) -> usize {
-        // Yes, N, not self.written; Read the full comment of this impl block!
-        N
-    }
-
-    fn bytes_total(&self) -> usize {
-        N
-    }
-}
diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs
index c0508e13c0..00ef5b0afd 100644
--- a/pageserver/src/tenant/storage_layer/delta_layer.rs
+++ b/pageserver/src/tenant/storage_layer/delta_layer.rs
@@ -65,7 +65,7 @@ use std::os::unix::fs::FileExt;
 use std::str::FromStr;
 use std::sync::Arc;
 use tokio::sync::OnceCell;
-use tokio_epoll_uring::IoBufMut;
+use tokio_epoll_uring::IoBuf;
 use tracing::*;
 
 use utils::{
@@ -471,7 +471,7 @@ impl DeltaLayerWriterInner {
         ctx: &RequestContext,
     ) -> (FullSlice<Buf>, anyhow::Result<()>)
     where
-        Buf: IoBufMut + Send,
+        Buf: IoBuf + Send,
     {
         assert!(
             self.lsn_range.start <= lsn,
@@ -678,7 +678,7 @@ impl DeltaLayerWriter {
         ctx: &RequestContext,
     ) -> (FullSlice<Buf>, anyhow::Result<()>)
     where
-        Buf: IoBufMut + Send,
+        Buf: IoBuf + Send,
     {
         self.inner
             .as_mut()
diff --git a/pageserver/src/tenant/storage_layer/inmemory_layer.rs b/pageserver/src/tenant/storage_layer/inmemory_layer.rs
index a71b4dd83b..f31ab4b1e8 100644
--- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs
+++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs
@@ -4,23 +4,23 @@
 //! held in an ephemeral file, not in memory. The metadata for each page version, i.e.
 //! its position in the file, is kept in memory, though.
 //!
+use crate::assert_u64_eq_usize::{u64_to_usize, U64IsUsize, UsizeIsU64};
 use crate::config::PageServerConf;
 use crate::context::{PageContentKind, RequestContext, RequestContextBuilder};
-use crate::page_cache::PAGE_SZ;
 use crate::repository::{Key, Value};
-use crate::tenant::block_io::{BlockCursor, BlockReader, BlockReaderRef};
 use crate::tenant::ephemeral_file::EphemeralFile;
 use crate::tenant::timeline::GetVectoredError;
 use crate::tenant::PageReconstructError;
 use crate::virtual_file::owned_buffers_io::io_buf_ext::IoBufExt;
 use crate::{l0_flush, page_cache};
-use anyhow::{anyhow, Result};
+use anyhow::{anyhow, Context, Result};
+use bytes::Bytes;
 use camino::Utf8PathBuf;
 use pageserver_api::key::CompactKey;
 use pageserver_api::keyspace::KeySpace;
 use pageserver_api::models::InMemoryLayerInfo;
 use pageserver_api::shard::TenantShardId;
-use std::collections::BTreeMap;
+use std::collections::{BTreeMap, HashMap};
 use std::sync::{Arc, OnceLock};
 use std::time::Instant;
 use tracing::*;
@@ -39,6 +39,8 @@ use super::{
     DeltaLayerWriter, PersistentLayerDesc, ValueReconstructSituation, ValuesReconstructState,
 };
 
+pub(crate) mod vectored_dio_read;
+
 #[derive(Debug, PartialEq, Eq, Clone, Copy, Hash)]
 pub(crate) struct InMemoryLayerFileId(page_cache::FileId);
 
@@ -78,9 +80,9 @@ impl std::fmt::Debug for InMemoryLayer {
 
 pub struct InMemoryLayerInner {
     /// All versions of all pages in the layer are kept here. Indexed
-    /// by block number and LSN. The value is an offset into the
+    /// by block number and LSN. The [`IndexEntry`] is an offset into the
     /// ephemeral file where the page version is stored.
-    index: BTreeMap<CompactKey, VecMap<Lsn, u64>>,
+    index: BTreeMap<CompactKey, VecMap<Lsn, IndexEntry>>,
 
     /// The values are stored in a serialized format in this file.
     /// Each serialized Value is preceded by a 'u32' length field.
@@ -90,6 +92,154 @@ pub struct InMemoryLayerInner {
     resource_units: GlobalResourceUnits,
 }
 
+/// Support the same max blob length as blob_io, because ultimately
+/// all the InMemoryLayer contents end up being written into a delta layer,
+/// using the [`crate::tenant::blob_io`].
+const MAX_SUPPORTED_BLOB_LEN: usize = crate::tenant::blob_io::MAX_SUPPORTED_BLOB_LEN;
+const MAX_SUPPORTED_BLOB_LEN_BITS: usize = {
+    let trailing_ones = MAX_SUPPORTED_BLOB_LEN.trailing_ones() as usize;
+    let leading_zeroes = MAX_SUPPORTED_BLOB_LEN.leading_zeros() as usize;
+    assert!(trailing_ones + leading_zeroes == std::mem::size_of::<usize>() * 8);
+    trailing_ones
+};
+
+/// See [`InMemoryLayerInner::index`].
+///
+/// For memory efficiency, the data is packed into a u64.
+///
+/// Layout:
+/// - 1 bit: `will_init`
+/// - [`MAX_SUPPORTED_BLOB_LEN_BITS`]: `len`
+/// - [`MAX_SUPPORTED_POS_BITS`]: `pos`
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub struct IndexEntry(u64);
+
+impl IndexEntry {
+    /// See [`Self::MAX_SUPPORTED_POS`].
+    const MAX_SUPPORTED_POS_BITS: usize = {
+        let remainder = 64 - 1 - MAX_SUPPORTED_BLOB_LEN_BITS;
+        if remainder < 32 {
+            panic!("pos can be u32 as per type system, support that");
+        }
+        remainder
+    };
+    /// The maximum supported blob offset that can be represented by [`Self`].
+    /// See also [`Self::validate_checkpoint_distance`].
+    const MAX_SUPPORTED_POS: usize = (1 << Self::MAX_SUPPORTED_POS_BITS) - 1;
+
+    // Layout
+    const WILL_INIT_RANGE: Range<usize> = 0..1;
+    const LEN_RANGE: Range<usize> =
+        Self::WILL_INIT_RANGE.end..Self::WILL_INIT_RANGE.end + MAX_SUPPORTED_BLOB_LEN_BITS;
+    const POS_RANGE: Range<usize> =
+        Self::LEN_RANGE.end..Self::LEN_RANGE.end + Self::MAX_SUPPORTED_POS_BITS;
+    const _ASSERT: () = {
+        if Self::POS_RANGE.end != 64 {
+            panic!("we don't want undefined bits for our own sanity")
+        }
+    };
+
+    /// Fails if and only if the offset or length encoded in `arg` is too large to be represented by [`Self`].
+    ///
+    /// The only reason why that can happen in the system is if the [`InMemoryLayer`] grows too long.
+    /// The [`InMemoryLayer`] size is determined by the checkpoint distance, enforced by [`crate::tenant::Timeline::should_roll`].
+    ///
+    /// Thus, to avoid failure of this function, whenever we start up and/or change checkpoint distance,
+    /// call [`Self::validate_checkpoint_distance`] with the new checkpoint distance value.
+    ///
+    /// TODO: this check should happen ideally at config parsing time (and in the request handler when a change to checkpoint distance is requested)
+    /// When cleaning this up, also look into the s3 max file size check that is performed in delta layer writer.
+    #[inline(always)]
+    fn new(arg: IndexEntryNewArgs) -> anyhow::Result<Self> {
+        let IndexEntryNewArgs {
+            base_offset,
+            batch_offset,
+            len,
+            will_init,
+        } = arg;
+
+        let pos = base_offset
+            .checked_add(batch_offset)
+            .ok_or_else(|| anyhow::anyhow!("base_offset + batch_offset overflows u64: base_offset={base_offset} batch_offset={batch_offset}"))?;
+
+        if pos.into_usize() > Self::MAX_SUPPORTED_POS {
+            anyhow::bail!(
+                "base_offset+batch_offset exceeds the maximum supported value: base_offset={base_offset} batch_offset={batch_offset} (+)={pos} max={max}",
+                max = Self::MAX_SUPPORTED_POS
+            );
+        }
+
+        if len > MAX_SUPPORTED_BLOB_LEN {
+            anyhow::bail!(
+                "len exceeds the maximum supported length: len={len} max={MAX_SUPPORTED_BLOB_LEN}",
+            );
+        }
+
+        let mut data: u64 = 0;
+        use bit_field::BitField;
+        data.set_bits(Self::WILL_INIT_RANGE, if will_init { 1 } else { 0 });
+        data.set_bits(Self::LEN_RANGE, len.into_u64());
+        data.set_bits(Self::POS_RANGE, pos);
+
+        Ok(Self(data))
+    }
+
+    #[inline(always)]
+    fn unpack(&self) -> IndexEntryUnpacked {
+        use bit_field::BitField;
+        IndexEntryUnpacked {
+            will_init: self.0.get_bits(Self::WILL_INIT_RANGE) != 0,
+            len: self.0.get_bits(Self::LEN_RANGE),
+            pos: self.0.get_bits(Self::POS_RANGE),
+        }
+    }
+
+    /// See [`Self::new`].
+    pub(crate) const fn validate_checkpoint_distance(
+        checkpoint_distance: u64,
+    ) -> Result<(), &'static str> {
+        if checkpoint_distance > Self::MAX_SUPPORTED_POS as u64 {
+            return Err("exceeds the maximum supported value");
+        }
+        let res = u64_to_usize(checkpoint_distance).checked_add(MAX_SUPPORTED_BLOB_LEN);
+        if res.is_none() {
+            return Err(
+                "checkpoint distance + max supported blob len overflows in-memory addition",
+            );
+        }
+
+        // NB: it is ok for the result of the addition to be larger than MAX_SUPPORTED_POS
+
+        Ok(())
+    }
+
+    const _ASSERT_DEFAULT_CHECKPOINT_DISTANCE_IS_VALID: () = {
+        let res = Self::validate_checkpoint_distance(
+            crate::tenant::config::defaults::DEFAULT_CHECKPOINT_DISTANCE,
+        );
+        if res.is_err() {
+            panic!("default checkpoint distance is valid")
+        }
+    };
+}
+
+/// Args to [`IndexEntry::new`].
+#[derive(Clone, Copy)]
+struct IndexEntryNewArgs {
+    base_offset: u64,
+    batch_offset: u64,
+    len: usize,
+    will_init: bool,
+}
+
+/// Unpacked representation of the bitfielded [`IndexEntry`].
+#[derive(Clone, Copy, PartialEq, Eq, Debug)]
+struct IndexEntryUnpacked {
+    will_init: bool,
+    len: u64,
+    pos: u64,
+}
+
 impl std::fmt::Debug for InMemoryLayerInner {
     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
         f.debug_struct("InMemoryLayerInner").finish()
@@ -276,7 +426,12 @@ impl InMemoryLayer {
             .build();
 
         let inner = self.inner.read().await;
-        let reader = inner.file.block_cursor();
+
+        struct ValueRead {
+            entry_lsn: Lsn,
+            read: vectored_dio_read::LogicalRead<Vec<u8>>,
+        }
+        let mut reads: HashMap<Key, Vec<ValueRead>> = HashMap::new();
 
         for range in keyspace.ranges.iter() {
             for (key, vec_map) in inner
@@ -291,24 +446,62 @@ impl InMemoryLayer {
 
                 let slice = vec_map.slice_range(lsn_range);
 
-                for (entry_lsn, pos) in slice.iter().rev() {
-                    // TODO: this uses the page cache => https://github.com/neondatabase/neon/issues/8183
-                    let buf = reader.read_blob(*pos, &ctx).await;
-                    if let Err(e) = buf {
-                        reconstruct_state.on_key_error(key, PageReconstructError::from(anyhow!(e)));
+                for (entry_lsn, index_entry) in slice.iter().rev() {
+                    let IndexEntryUnpacked {
+                        pos,
+                        len,
+                        will_init,
+                    } = index_entry.unpack();
+                    reads.entry(key).or_default().push(ValueRead {
+                        entry_lsn: *entry_lsn,
+                        read: vectored_dio_read::LogicalRead::new(
+                            pos,
+                            Vec::with_capacity(len as usize),
+                        ),
+                    });
+                    if will_init {
                         break;
                     }
+                }
+            }
+        }
 
-                    let value = Value::des(&buf.unwrap());
-                    if let Err(e) = value {
+        // Execute the reads.
+
+        let f = vectored_dio_read::execute(
+            &inner.file,
+            reads
+                .iter()
+                .flat_map(|(_, value_reads)| value_reads.iter().map(|v| &v.read)),
+            &ctx,
+        );
+        send_future::SendFuture::send(f) // https://github.com/rust-lang/rust/issues/96865
+            .await;
+
+        // Process results into the reconstruct state
+        'next_key: for (key, value_reads) in reads {
+            for ValueRead { entry_lsn, read } in value_reads {
+                match read.into_result().expect("we run execute() above") {
+                    Err(e) => {
                         reconstruct_state.on_key_error(key, PageReconstructError::from(anyhow!(e)));
-                        break;
+                        continue 'next_key;
                     }
+                    Ok(value_buf) => {
+                        let value = Value::des(&value_buf);
+                        if let Err(e) = value {
+                            reconstruct_state
+                                .on_key_error(key, PageReconstructError::from(anyhow!(e)));
+                            continue 'next_key;
+                        }
 
-                    let key_situation =
-                        reconstruct_state.update_key(&key, *entry_lsn, value.unwrap());
-                    if key_situation == ValueReconstructSituation::Complete {
-                        break;
+                        let key_situation =
+                            reconstruct_state.update_key(&key, entry_lsn, value.unwrap());
+                        if key_situation == ValueReconstructSituation::Complete {
+                            // TODO: metric to see if we fetched more values than necessary
+                            continue 'next_key;
+                        }
+
+                        // process the next value in the next iteration of the loop
                     }
                 }
             }
@@ -324,8 +517,9 @@ impl InMemoryLayer {
 struct SerializedBatchOffset {
     key: CompactKey,
     lsn: Lsn,
-    /// offset in bytes from the start of the batch's buffer to the Value's serialized size header.
-    offset: u64,
+    // TODO: separate type when we start serde-serializing this value, to avoid coupling
+    // in-memory representation to serialization format.
+    index_entry: IndexEntry,
 }
 
 pub struct SerializedBatch {
@@ -340,30 +534,10 @@ pub struct SerializedBatch {
 }
 
 impl SerializedBatch {
-    /// Write a blob length in the internal format of the EphemeralFile
-    pub(crate) fn write_blob_length(len: usize, cursor: &mut std::io::Cursor<Vec<u8>>) {
-        use std::io::Write;
-
-        if len < 0x80 {
-            // short one-byte length header
-            let len_buf = [len as u8];
-
-            cursor
-                .write_all(&len_buf)
-                .expect("Writing to Vec is infallible");
-        } else {
-            let mut len_buf = u32::to_be_bytes(len as u32);
-            len_buf[0] |= 0x80;
-            cursor
-                .write_all(&len_buf)
-                .expect("Writing to Vec is infallible");
-        }
-    }
-
-    pub fn from_values(batch: Vec<(CompactKey, Lsn, usize, Value)>) -> Self {
+    pub fn from_values(batch: Vec<(CompactKey, Lsn, usize, Value)>) -> anyhow::Result<Self> {
         // Pre-allocate a big flat buffer to write into. This should be large but not huge: it is soft-limited in practice by
         // [`crate::pgdatadir_mapping::DatadirModification::MAX_PENDING_BYTES`]
-        let buffer_size = batch.iter().map(|i| i.2).sum::<usize>() + 4 * batch.len();
+        let buffer_size = batch.iter().map(|i| i.2).sum::<usize>();
         let mut cursor = std::io::Cursor::new(Vec::<u8>::with_capacity(buffer_size));
 
         let mut offsets: Vec<SerializedBatchOffset> = Vec::with_capacity(batch.len());
@@ -371,14 +545,19 @@ impl SerializedBatch {
         for (key, lsn, val_ser_size, val) in batch {
             let relative_off = cursor.position();
 
-            Self::write_blob_length(val_ser_size, &mut cursor);
             val.ser_into(&mut cursor)
                 .expect("Writing into in-memory buffer is infallible");
 
             offsets.push(SerializedBatchOffset {
                 key,
                 lsn,
-                offset: relative_off,
+                index_entry: IndexEntry::new(IndexEntryNewArgs {
+                    base_offset: 0,
+                    batch_offset: relative_off,
+                    len: val_ser_size,
+                    will_init: val.will_init(),
+                })
+                .context("higher-level code ensures that values are within supported ranges")?,
             });
             max_lsn = std::cmp::max(max_lsn, lsn);
         }
@@ -388,11 +567,11 @@ impl SerializedBatch {
         // Assert that we didn't do any extra allocations while building buffer.
         debug_assert!(buffer.len() <= buffer_size);
 
-        Self {
+        Ok(Self {
             raw: buffer,
             offsets,
             max_lsn,
-        }
+        })
     }
 }
 
@@ -456,44 +635,69 @@ impl InMemoryLayer {
         })
     }
 
-    // Write path.
+    /// Write path.
+    ///
+    /// Errors are not retryable, the [`InMemoryLayer`] must be discarded, and not be read from.
+    /// The reason why it's not retryable is that the [`EphemeralFile`] writes are not retryable.
+    /// TODO: it can be made retryable if we aborted the process on EphemeralFile write errors.
     pub async fn put_batch(
         &self,
         serialized_batch: SerializedBatch,
         ctx: &RequestContext,
-    ) -> Result<()> {
+    ) -> anyhow::Result<()> {
         let mut inner = self.inner.write().await;
         self.assert_writable();
 
-        let base_off = {
-            inner
-                .file
-                .write_raw(
-                    &serialized_batch.raw,
-                    &RequestContextBuilder::extend(ctx)
-                        .page_content_kind(PageContentKind::InMemoryLayer)
-                        .build(),
-                )
-                .await?
-        };
+        let base_offset = inner.file.len();
 
+        let SerializedBatch {
+            raw,
+            mut offsets,
+            max_lsn: _,
+        } = serialized_batch;
+
+        // Add the base_offset to the batch's index entries which are relative to the batch start.
+        for offset in &mut offsets {
+            let IndexEntryUnpacked {
+                will_init,
+                len,
+                pos,
+            } = offset.index_entry.unpack();
+            offset.index_entry = IndexEntry::new(IndexEntryNewArgs {
+                base_offset,
+                batch_offset: pos,
+                len: len.into_usize(),
+                will_init,
+            })?;
+        }
+
+        // Write the batch to the file
+        inner.file.write_raw(&raw, ctx).await?;
+        let new_size = inner.file.len();
+        let expected_new_len = base_offset
+            .checked_add(raw.len().into_u64())
+            // write_raw would error if we were to overflow u64.
+            // also IndexEntry and higher levels in
+            //the code don't allow the file to grow that large
+            .unwrap();
+        assert_eq!(new_size, expected_new_len);
+
+        // Update the index with the new entries
         for SerializedBatchOffset {
             key,
             lsn,
-            offset: relative_off,
-        } in serialized_batch.offsets
+            index_entry,
+        } in offsets
         {
-            let off = base_off + relative_off;
             let vec_map = inner.index.entry(key).or_default();
-            let old = vec_map.append_or_update_last(lsn, off).unwrap().0;
+            let old = vec_map.append_or_update_last(lsn, index_entry).unwrap().0;
             if old.is_some() {
                 // We already had an entry for this LSN. That's odd..
                 warn!("Key {} at {} already exists", key, lsn);
             }
         }
 
-        let size = inner.file.len();
-        inner.resource_units.maybe_publish_size(size);
+        inner.resource_units.maybe_publish_size(new_size);
 
         Ok(())
     }
@@ -537,7 +741,7 @@ impl InMemoryLayer {
         {
             let inner = self.inner.write().await;
             for vec_map in inner.index.values() {
-                for (lsn, _pos) in vec_map.as_slice() {
+                for (lsn, _) in vec_map.as_slice() {
                     assert!(*lsn < end_lsn);
                 }
             }
@@ -601,36 +805,23 @@ impl InMemoryLayer {
         match l0_flush_global_state {
             l0_flush::Inner::Direct { .. } => {
                 let file_contents: Vec<u8> = inner.file.load_to_vec(ctx).await?;
-                assert_eq!(
-                    file_contents.len() % PAGE_SZ,
-                    0,
-                    "needed by BlockReaderRef::Slice"
-                );
-                assert_eq!(file_contents.len(), {
-                    let written = usize::try_from(inner.file.len()).unwrap();
-                    if written % PAGE_SZ == 0 {
-                        written
-                    } else {
-                        written.checked_add(PAGE_SZ - (written % PAGE_SZ)).unwrap()
-                    }
-                });
 
-                let cursor = BlockCursor::new(BlockReaderRef::Slice(&file_contents));
-
-                let mut buf = Vec::new();
+                let file_contents = Bytes::from(file_contents);
 
                 for (key, vec_map) in inner.index.iter() {
                     // Write all page versions
-                    for (lsn, pos) in vec_map.as_slice() {
-                        // TODO: once we have blob lengths in the in-memory index, we can
-                        // 1. get rid of the blob_io / BlockReaderRef::Slice business and
-                        // 2. load the file contents into a Bytes and
-                        // 3. the use `Bytes::slice` to get the `buf` that is our blob
-                        // 4. pass that `buf` into `put_value_bytes`
-                        // => https://github.com/neondatabase/neon/issues/8183
-                        cursor.read_blob_into_buf(*pos, &mut buf, ctx).await?;
-                        let will_init = Value::des(&buf)?.will_init();
-                        let (tmp, res) = delta_layer_writer
+                    for (lsn, entry) in vec_map
+                        .as_slice()
+                        .iter()
+                        .map(|(lsn, entry)| (lsn, entry.unpack()))
+                    {
+                        let IndexEntryUnpacked {
+                            pos,
+                            len,
+                            will_init,
+                        } = entry;
+                        let buf = Bytes::slice(&file_contents, pos as usize..(pos + len) as usize);
+                        let (_buf, res) = delta_layer_writer
                             .put_value_bytes(
                                 Key::from_compact(*key),
                                 *lsn,
@@ -640,7 +831,6 @@ impl InMemoryLayer {
                             )
                             .await;
                         res?;
-                        buf = tmp.into_raw_slice().into_inner();
                     }
                 }
             }
@@ -662,3 +852,134 @@ impl InMemoryLayer {
         Ok(Some((desc, path)))
     }
 }
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_index_entry() {
+        const MAX_SUPPORTED_POS: usize = IndexEntry::MAX_SUPPORTED_POS;
+        use IndexEntryNewArgs as Args;
+        use IndexEntryUnpacked as Unpacked;
+
+        let roundtrip = |args, expect: Unpacked| {
+            let res = IndexEntry::new(args).expect("this tests expects no errors");
+            let IndexEntryUnpacked {
+                will_init,
+                len,
+                pos,
+            } = res.unpack();
+            assert_eq!(will_init, expect.will_init);
+            assert_eq!(len, expect.len);
+            assert_eq!(pos, expect.pos);
+        };
+
+        // basic roundtrip
+        for pos in [0, MAX_SUPPORTED_POS] {
+            for len in [0, MAX_SUPPORTED_BLOB_LEN] {
+                for will_init in [true, false] {
+                    let expect = Unpacked {
+                        will_init,
+                        len: len.into_u64(),
+                        pos: pos.into_u64(),
+                    };
+                    roundtrip(
+                        Args {
+                            will_init,
+                            base_offset: pos.into_u64(),
+                            batch_offset: 0,
+                            len,
+                        },
+                        expect,
+                    );
+                    roundtrip(
+                        Args {
+                            will_init,
+                            base_offset: 0,
+                            batch_offset: pos.into_u64(),
+                            len,
+                        },
+                        expect,
+                    );
+                }
+            }
+        }
+
+        // too-large len
+        let too_large = Args {
+            will_init: false,
+            len: MAX_SUPPORTED_BLOB_LEN + 1,
+            base_offset: 0,
+            batch_offset: 0,
+        };
+        assert!(IndexEntry::new(too_large).is_err());
+
+        // too-large pos
+        {
+            let too_large = Args {
+                will_init: false,
+                len: 0,
+                base_offset: MAX_SUPPORTED_POS.into_u64() + 1,
+                batch_offset: 0,
+            };
+            assert!(IndexEntry::new(too_large).is_err());
+            let too_large = Args {
+                will_init: false,
+                len: 0,
+                base_offset: 0,
+                batch_offset: MAX_SUPPORTED_POS.into_u64() + 1,
+            };
+            assert!(IndexEntry::new(too_large).is_err());
+        }
+
+        // too large (base_offset + batch_offset)
+        {
+            let too_large = Args {
+                will_init: false,
+                len: 0,
+                base_offset: MAX_SUPPORTED_POS.into_u64(),
+                batch_offset: 1,
+            };
+            assert!(IndexEntry::new(too_large).is_err());
+            let too_large = Args {
+                will_init: false,
+                len: 0,
+                base_offset: MAX_SUPPORTED_POS.into_u64() - 1,
+                batch_offset: MAX_SUPPORTED_POS.into_u64() - 1,
+            };
+            assert!(IndexEntry::new(too_large).is_err());
+        }
+
+        // valid special cases
+        // - area past the max supported pos that is accessible by len
+        for len in [1, MAX_SUPPORTED_BLOB_LEN] {
+            roundtrip(
+                Args {
+                    will_init: false,
+                    len,
+                    base_offset: MAX_SUPPORTED_POS.into_u64(),
+                    batch_offset: 0,
+                },
+                Unpacked {
+                    will_init: false,
+                    len: len as u64,
+                    pos: MAX_SUPPORTED_POS.into_u64(),
+                },
+            );
+            roundtrip(
+                Args {
+                    will_init: false,
+                    len,
+                    base_offset: 0,
+                    batch_offset: MAX_SUPPORTED_POS.into_u64(),
+                },
+                Unpacked {
+                    will_init: false,
+                    len: len as u64,
+                    pos: MAX_SUPPORTED_POS.into_u64(),
+                },
+            );
+        }
+    }
+}
diff --git a/pageserver/src/tenant/storage_layer/inmemory_layer/vectored_dio_read.rs b/pageserver/src/tenant/storage_layer/inmemory_layer/vectored_dio_read.rs
new file mode 100644
index 0000000000..0683e15659
--- /dev/null
+++ b/pageserver/src/tenant/storage_layer/inmemory_layer/vectored_dio_read.rs
@@ -0,0 +1,937 @@
+use std::{
+    collections::BTreeMap,
+    sync::{Arc, RwLock},
+};
+
+use itertools::Itertools;
+use tokio_epoll_uring::{BoundedBuf, IoBufMut, Slice};
+
+use crate::{
+    assert_u64_eq_usize::{U64IsUsize, UsizeIsU64},
+    context::RequestContext,
+};
+
+/// The file interface we require. At runtime, this is a [`crate::tenant::ephemeral_file::EphemeralFile`].
+pub trait File: Send {
+    /// Attempt to read the bytes in `self` in range `[start,start+dst.bytes_total())`
+    /// and return the number of bytes read (let's call it `nread`).
+    /// The bytes read are placed in `dst`, i.e., `&dst[..nread]` will contain the read bytes.
+    ///
+    /// The only reason why the read may be short (i.e., `nread != dst.bytes_total()`)
+    /// is if the file is shorter than `start+dst.len()`.
+    ///
+    /// This is unlike [`std::os::unix::fs::FileExt::read_exact_at`] which returns an
+    /// [`std::io::ErrorKind::UnexpectedEof`] error if the file is shorter than `start+dst.len()`.
+    ///
+    /// No guarantees are made about the remaining bytes in `dst` in case of a short read.
+    async fn read_exact_at_eof_ok<'a, 'b, B: IoBufMut + Send>(
+        &'b self,
+        start: u64,
+        dst: Slice<B>,
+        ctx: &'a RequestContext,
+    ) -> std::io::Result<(Slice<B>, usize)>;
+}
+
+/// A logical read from [`File`]. See [`Self::new`].
+pub struct LogicalRead<B: Buffer> {
+    pos: u64,
+    state: RwLockRefCell<LogicalReadState<B>>,
+}
+
+enum LogicalReadState<B: Buffer> {
+    NotStarted(B),
+    Ongoing(B),
+    Ok(B),
+    Error(Arc<std::io::Error>),
+    Undefined,
+}
+
+impl<B: Buffer> LogicalRead<B> {
+    /// Create a new [`LogicalRead`] from [`File`] of the data in the file in range `[ pos, pos + buf.cap() )`.
+    pub fn new(pos: u64, buf: B) -> Self {
+        Self {
+            pos,
+            state: RwLockRefCell::new(LogicalReadState::NotStarted(buf)),
+        }
+    }
+    pub fn into_result(self) -> Option<Result<B, Arc<std::io::Error>>> {
+        match self.state.into_inner() {
+            LogicalReadState::Ok(buf) => Some(Ok(buf)),
+            LogicalReadState::Error(e) => Some(Err(e)),
+            LogicalReadState::NotStarted(_) | LogicalReadState::Ongoing(_) => None,
+            LogicalReadState::Undefined => unreachable!(),
+        }
+    }
+}
+
+/// The buffer into which a [`LogicalRead`] result is placed.
+pub trait Buffer: std::ops::Deref<Target = [u8]> {
+    /// Immutable.
+    fn cap(&self) -> usize;
+    /// Changes only through [`Self::extend_from_slice`].
+    fn len(&self) -> usize;
+    /// Panics if the total length would exceed the initialized capacity.
+    fn extend_from_slice(&mut self, src: &[u8]);
+}
+
+/// The minimum alignment and size requirement for disk offsets and memory buffer size for direct IO.
+const DIO_CHUNK_SIZE: usize = 512;
+
+/// If multiple chunks need to be read, merge adjacent chunk reads into batches of max size `MAX_CHUNK_BATCH_SIZE`.
+/// (The unit is the number of chunks.)
+const MAX_CHUNK_BATCH_SIZE: usize = {
+    let desired = 128 * 1024; // 128k
+    if desired % DIO_CHUNK_SIZE != 0 {
+        panic!("MAX_CHUNK_BATCH_SIZE must be a multiple of DIO_CHUNK_SIZE")
+        // compile-time error
+    }
+    desired / DIO_CHUNK_SIZE
+};
+
+/// Execute the given logical `reads` against `file`.
+/// The results are placed in the buffers of the [`LogicalRead`]s.
+/// Retrieve the results by calling [`LogicalRead::into_result`] on each [`LogicalRead`].
+///
+/// The [`LogicalRead`]s must be freshly created using [`LogicalRead::new`] when calling this function.
+/// Otherwise, this function panics.
+pub async fn execute<'a, I, F, B>(file: &F, reads: I, ctx: &RequestContext)
+where
+    I: IntoIterator<Item = &'a LogicalRead<B>>,
+    F: File,
+    B: Buffer + IoBufMut + Send,
+{
+    // Terminology:
+    // logical read = a request to read an arbitrary range of bytes from `file`; byte-level granularity
+    // chunk = we conceptually divide up the byte range of `file` into DIO_CHUNK_SIZEs ranges
+    // interest = a range within a chunk that a logical read is interested in; one logical read gets turned into many interests
+    // physical read = the read request we're going to issue to the OS; covers a range of chunks; chunk-level granularity
+
+    // Preserve a copy of the logical reads for debug assertions at the end
+    #[cfg(debug_assertions)]
+    let (reads, assert_logical_reads) = {
+        let (reads, assert) = reads.into_iter().tee();
+        (reads, Some(Vec::from_iter(assert)))
+    };
+    #[cfg(not(debug_assertions))]
+    let (reads, assert_logical_reads): (_, Option<Vec<&'a LogicalRead<B>>>) = (reads, None);
+
+    // Plan which parts of which chunks need to be appended to which buffer
+    let mut by_chunk: BTreeMap<u64, Vec<Interest<B>>> = BTreeMap::new();
+    struct Interest<'a, B: Buffer> {
+        logical_read: &'a LogicalRead<B>,
+        offset_in_chunk: u64,
+        len: u64,
+    }
+    for logical_read in reads {
+        let LogicalRead { pos, state } = logical_read;
+        let mut state = state.borrow_mut();
+
+        // transition from NotStarted to Ongoing
+        let cur = std::mem::replace(&mut *state, LogicalReadState::Undefined);
+        let req_len = match cur {
+            LogicalReadState::NotStarted(buf) => {
+                if buf.len() != 0 {
+                    panic!("The `LogicalRead`s that are passed in must be freshly created using `LogicalRead::new`");
+                }
+                // buf.cap() == 0 is ok
+
+                // transition into Ongoing state
+                let req_len = buf.cap();
+                *state = LogicalReadState::Ongoing(buf);
+                req_len
+            }
+            x => panic!("must only call with fresh LogicalReads, got another state, leaving Undefined state behind state={x:?}"),
+        };
+
+        // plan which chunks we need to read from
+        let mut remaining = req_len;
+        let mut chunk_no = *pos / (DIO_CHUNK_SIZE.into_u64());
+        let mut offset_in_chunk = pos.into_usize() % DIO_CHUNK_SIZE;
+        while remaining > 0 {
+            let remaining_in_chunk = std::cmp::min(remaining, DIO_CHUNK_SIZE - offset_in_chunk);
+            by_chunk.entry(chunk_no).or_default().push(Interest {
+                logical_read,
+                offset_in_chunk: offset_in_chunk.into_u64(),
+                len: remaining_in_chunk.into_u64(),
+            });
+            offset_in_chunk = 0;
+            chunk_no += 1;
+            remaining -= remaining_in_chunk;
+        }
+    }
+
+    // At this point, we could iterate over by_chunk, in chunk order,
+    // read each chunk from disk, and fill the buffers.
+    // However, we can merge adjacent chunks into batches of MAX_CHUNK_BATCH_SIZE
+    // so we issue fewer IOs = fewer roundtrips = lower overall latency.
+    struct PhysicalRead<'a, B: Buffer> {
+        start_chunk_no: u64,
+        nchunks: usize,
+        dsts: Vec<PhysicalInterest<'a, B>>,
+    }
+    struct PhysicalInterest<'a, B: Buffer> {
+        logical_read: &'a LogicalRead<B>,
+        offset_in_physical_read: u64,
+        len: u64,
+    }
+    let mut physical_reads: Vec<PhysicalRead<B>> = Vec::new();
+    let mut by_chunk = by_chunk.into_iter().peekable();
+    loop {
+        let mut last_chunk_no = None;
+        let to_merge: Vec<(u64, Vec<Interest<B>>)> = by_chunk
+            .peeking_take_while(|(chunk_no, _)| {
+                if let Some(last_chunk_no) = last_chunk_no {
+                    if *chunk_no != last_chunk_no + 1 {
+                        return false;
+                    }
+                }
+                last_chunk_no = Some(*chunk_no);
+                true
+            })
+            .take(MAX_CHUNK_BATCH_SIZE)
+            .collect(); // TODO: avoid this .collect()
+        let Some(start_chunk_no) = to_merge.first().map(|(chunk_no, _)| *chunk_no) else {
+            break;
+        };
+        let nchunks = to_merge.len();
+        let dsts = to_merge
+            .into_iter()
+            .enumerate()
+            .flat_map(|(i, (_, dsts))| {
+                dsts.into_iter().map(
+                    move |Interest {
+                              logical_read,
+                              offset_in_chunk,
+                              len,
+                          }| {
+                        PhysicalInterest {
+                            logical_read,
+                            offset_in_physical_read: i
+                                .checked_mul(DIO_CHUNK_SIZE)
+                                .unwrap()
+                                .into_u64()
+                                + offset_in_chunk,
+                            len,
+                        }
+                    },
+                )
+            })
+            .collect();
+        physical_reads.push(PhysicalRead {
+            start_chunk_no,
+            nchunks,
+            dsts,
+        });
+    }
+    drop(by_chunk);
+
+    // Execute physical reads and fill the logical read buffers
+    // TODO: pipelined reads; prefetch;
+    let get_io_buffer = |nchunks| Vec::with_capacity(nchunks * DIO_CHUNK_SIZE);
+    for PhysicalRead {
+        start_chunk_no,
+        nchunks,
+        dsts,
+    } in physical_reads
+    {
+        let all_done = dsts
+            .iter()
+            .all(|PhysicalInterest { logical_read, .. }| logical_read.state.borrow().is_terminal());
+        if all_done {
+            continue;
+        }
+        let read_offset = start_chunk_no
+            .checked_mul(DIO_CHUNK_SIZE.into_u64())
+            .expect("we produce chunk_nos by dividing by DIO_CHUNK_SIZE earlier");
+        let io_buf = get_io_buffer(nchunks).slice_full();
+        let req_len = io_buf.len();
+        let (io_buf_slice, nread) = match file.read_exact_at_eof_ok(read_offset, io_buf, ctx).await
+        {
+            Ok(t) => t,
+            Err(e) => {
+                let e = Arc::new(e);
+                for PhysicalInterest { logical_read, .. } in dsts {
+                    *logical_read.state.borrow_mut() = LogicalReadState::Error(Arc::clone(&e));
+                    // this will make later reads for the given LogicalRead short-circuit, see top of loop body
+                }
+                continue;
+            }
+        };
+        let io_buf = io_buf_slice.into_inner();
+        assert!(
+            nread <= io_buf.len(),
+            "the last chunk in the file can be a short read, so, no =="
+        );
+        let io_buf = &io_buf[..nread];
+        for PhysicalInterest {
+            logical_read,
+            offset_in_physical_read,
+            len,
+        } in dsts
+        {
+            let mut logical_read_state_borrow = logical_read.state.borrow_mut();
+            let logical_read_buf = match &mut *logical_read_state_borrow {
+                LogicalReadState::NotStarted(_) => {
+                    unreachable!("we transition it into Ongoing at function entry")
+                }
+                LogicalReadState::Ongoing(buf) => buf,
+                LogicalReadState::Ok(_) | LogicalReadState::Error(_) => {
+                    continue;
+                }
+                LogicalReadState::Undefined => unreachable!(),
+            };
+            let range_in_io_buf = std::ops::Range {
+                start: offset_in_physical_read as usize,
+                end: offset_in_physical_read as usize + len as usize,
+            };
+            assert!(range_in_io_buf.end >= range_in_io_buf.start);
+            if range_in_io_buf.end > nread {
+                let msg = format!(
+                    "physical read returned EOF where this logical read expected more data in the file: offset=0x{read_offset:x} req_len=0x{req_len:x} nread=0x{nread:x} {:?}",
+                    &*logical_read_state_borrow
+                );
+                logical_read_state_borrow.transition_to_terminal(Err(std::io::Error::new(
+                    std::io::ErrorKind::UnexpectedEof,
+                    msg,
+                )));
+                continue;
+            }
+            let data = &io_buf[range_in_io_buf];
+
+            // Copy data from io buffer into the logical read buffer.
+            // (And in debug mode, validate that the buffer impl adheres to the Buffer trait spec.)
+            let pre = if cfg!(debug_assertions) {
+                Some((logical_read_buf.len(), logical_read_buf.cap()))
+            } else {
+                None
+            };
+            logical_read_buf.extend_from_slice(data);
+            let post = if cfg!(debug_assertions) {
+                Some((logical_read_buf.len(), logical_read_buf.cap()))
+            } else {
+                None
+            };
+            match (pre, post) {
+                (None, None) => {}
+                (Some(_), None) | (None, Some(_)) => unreachable!(),
+                (Some((pre_len, pre_cap)), Some((post_len, post_cap))) => {
+                    assert_eq!(pre_len + len as usize, post_len);
+                    assert_eq!(pre_cap, post_cap);
+                }
+            }
+
+            if logical_read_buf.len() == logical_read_buf.cap() {
+                logical_read_state_borrow.transition_to_terminal(Ok(()));
+            }
+        }
+    }
+
+    if let Some(assert_logical_reads) = assert_logical_reads {
+        for logical_read in assert_logical_reads {
+            assert!(logical_read.state.borrow().is_terminal());
+        }
+    }
+}
+
+impl<B: Buffer> LogicalReadState<B> {
+    fn is_terminal(&self) -> bool {
+        match self {
+            LogicalReadState::NotStarted(_) | LogicalReadState::Ongoing(_) => false,
+            LogicalReadState::Ok(_) | LogicalReadState::Error(_) => true,
+            LogicalReadState::Undefined => unreachable!(),
+        }
+    }
+    fn transition_to_terminal(&mut self, err: std::io::Result<()>) {
+        let cur = std::mem::replace(self, LogicalReadState::Undefined);
+        let buf = match cur {
+            LogicalReadState::Ongoing(buf) => buf,
+            x => panic!("must only call in state Ongoing, got {x:?}"),
+        };
+        *self = match err {
+            Ok(()) => LogicalReadState::Ok(buf),
+            Err(e) => LogicalReadState::Error(Arc::new(e)),
+        };
+    }
+}
+
+impl<B: Buffer> std::fmt::Debug for LogicalReadState<B> {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        #[derive(Debug)]
+        #[allow(unused)]
+        struct BufferDebug {
+            len: usize,
+            cap: usize,
+        }
+        impl<'a> From<&'a dyn Buffer> for BufferDebug {
+            fn from(buf: &'a dyn Buffer) -> Self {
+                Self {
+                    len: buf.len(),
+                    cap: buf.cap(),
+                }
+            }
+        }
+        match self {
+            LogicalReadState::NotStarted(b) => {
+                write!(f, "NotStarted({:?})", BufferDebug::from(b as &dyn Buffer))
+            }
+            LogicalReadState::Ongoing(b) => {
+                write!(f, "Ongoing({:?})", BufferDebug::from(b as &dyn Buffer))
+            }
+            LogicalReadState::Ok(b) => write!(f, "Ok({:?})", BufferDebug::from(b as &dyn Buffer)),
+            LogicalReadState::Error(e) => write!(f, "Error({:?})", e),
+            LogicalReadState::Undefined => write!(f, "Undefined"),
+        }
+    }
+}
+
+#[derive(Debug)]
+struct RwLockRefCell<T>(RwLock<T>);
+impl<T> RwLockRefCell<T> {
+    fn new(value: T) -> Self {
+        Self(RwLock::new(value))
+    }
+    fn borrow(&self) -> impl std::ops::Deref<Target = T> + '_ {
+        self.0.try_read().unwrap()
+    }
+    fn borrow_mut(&self) -> impl std::ops::DerefMut<Target = T> + '_ {
+        self.0.try_write().unwrap()
+    }
+    fn into_inner(self) -> T {
+        self.0.into_inner().unwrap()
+    }
+}
+
+impl Buffer for Vec<u8> {
+    fn cap(&self) -> usize {
+        self.capacity()
+    }
+
+    fn len(&self) -> usize {
+        self.len()
+    }
+
+    fn extend_from_slice(&mut self, src: &[u8]) {
+        if self.len() + src.len() > self.cap() {
+            panic!("Buffer capacity exceeded");
+        }
+        Vec::extend_from_slice(self, src);
+    }
+}
+
+#[cfg(test)]
+#[allow(clippy::assertions_on_constants)]
+mod tests {
+    use rand::Rng;
+
+    use crate::{
+        context::DownloadBehavior, task_mgr::TaskKind,
+        virtual_file::owned_buffers_io::slice::SliceMutExt,
+    };
+
+    use super::*;
+    use std::{cell::RefCell, collections::VecDeque};
+
+    struct InMemoryFile {
+        content: Vec<u8>,
+    }
+
+    impl InMemoryFile {
+        fn new_random(len: usize) -> Self {
+            Self {
+                content: rand::thread_rng()
+                    .sample_iter(rand::distributions::Standard)
+                    .take(len)
+                    .collect(),
+            }
+        }
+        fn test_logical_read(&self, pos: u64, len: usize) -> TestLogicalRead {
+            let expected_result = if pos as usize + len > self.content.len() {
+                Err("InMemoryFile short read".to_string())
+            } else {
+                Ok(self.content[pos as usize..pos as usize + len].to_vec())
+            };
+            TestLogicalRead::new(pos, len, expected_result)
+        }
+    }
+
+    #[test]
+    fn test_in_memory_file() {
+        let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error);
+        let file = InMemoryFile::new_random(10);
+        let test_read = |pos, len| {
+            let buf = vec![0; len];
+            let fut = file.read_exact_at_eof_ok(pos, buf.slice_full(), &ctx);
+            use futures::FutureExt;
+            let (slice, nread) = fut
+                .now_or_never()
+                .expect("impl never awaits")
+                .expect("impl never errors");
+            let mut buf = slice.into_inner();
+            buf.truncate(nread);
+            buf
+        };
+        assert_eq!(test_read(0, 1), &file.content[0..1]);
+        assert_eq!(test_read(1, 2), &file.content[1..3]);
+        assert_eq!(test_read(9, 2), &file.content[9..]);
+        assert!(test_read(10, 2).is_empty());
+        assert!(test_read(11, 2).is_empty());
+    }
+
+    impl File for InMemoryFile {
+        async fn read_exact_at_eof_ok<'a, 'b, B: IoBufMut + Send>(
+            &'b self,
+            start: u64,
+            mut dst: Slice<B>,
+            _ctx: &'a RequestContext,
+        ) -> std::io::Result<(Slice<B>, usize)> {
+            let dst_slice: &mut [u8] = dst.as_mut_rust_slice_full_zeroed();
+            let nread = {
+                let req_len = dst_slice.len();
+                let len = std::cmp::min(req_len, self.content.len().saturating_sub(start as usize));
+                if start as usize >= self.content.len() {
+                    0
+                } else {
+                    dst_slice[..len]
+                        .copy_from_slice(&self.content[start as usize..start as usize + len]);
+                    len
+                }
+            };
+            rand::Rng::fill(&mut rand::thread_rng(), &mut dst_slice[nread..]); // to discover bugs
+            Ok((dst, nread))
+        }
+    }
+
+    #[derive(Clone)]
+    struct TestLogicalRead {
+        pos: u64,
+        len: usize,
+        expected_result: Result<Vec<u8>, String>,
+    }
+
+    impl TestLogicalRead {
+        fn new(pos: u64, len: usize, expected_result: Result<Vec<u8>, String>) -> Self {
+            Self {
+                pos,
+                len,
+                expected_result,
+            }
+        }
+        fn make_logical_read(&self) -> LogicalRead<Vec<u8>> {
+            LogicalRead::new(self.pos, Vec::with_capacity(self.len))
+        }
+    }
+
+    async fn execute_and_validate_test_logical_reads<I, F>(
+        file: &F,
+        test_logical_reads: I,
+        ctx: &RequestContext,
+    ) where
+        I: IntoIterator<Item = TestLogicalRead>,
+        F: File,
+    {
+        let (tmp, test_logical_reads) = test_logical_reads.into_iter().tee();
+        let logical_reads = tmp.map(|tr| tr.make_logical_read()).collect::<Vec<_>>();
+        execute(file, logical_reads.iter(), ctx).await;
+        for (logical_read, test_logical_read) in logical_reads.into_iter().zip(test_logical_reads) {
+            let actual = logical_read.into_result().expect("we call execute()");
+            match (actual, test_logical_read.expected_result) {
+                (Ok(actual), Ok(expected)) if actual == expected => {}
+                (Err(actual), Err(expected)) => {
+                    assert_eq!(actual.to_string(), expected);
+                }
+                (actual, expected) => panic!("expected {expected:?}\nactual {actual:?}"),
+            }
+        }
+    }
+
+    #[tokio::test]
+    async fn test_blackbox() {
+        let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error);
+        let cs = DIO_CHUNK_SIZE;
+        let cs_u64 = cs.into_u64();
+
+        let file = InMemoryFile::new_random(10 * cs);
+
+        let test_logical_reads = vec![
+            file.test_logical_read(0, 1),
+            // adjacent to logical_read0
+            file.test_logical_read(1, 2),
+            // gap
+            // spans adjacent chunks
+            file.test_logical_read(cs_u64 - 1, 2),
+            // gap
+            //  tail of chunk 3, all of chunk 4, and 2 bytes of chunk 5
+            file.test_logical_read(3 * cs_u64 - 1, cs + 2),
+            // gap
+            file.test_logical_read(5 * cs_u64, 1),
+        ];
+        let num_test_logical_reads = test_logical_reads.len();
+        let test_logical_reads_perms = test_logical_reads
+            .into_iter()
+            .permutations(num_test_logical_reads);
+
+        // test all orderings of LogicalReads, the order shouldn't matter for the results
+        for test_logical_reads in test_logical_reads_perms {
+            execute_and_validate_test_logical_reads(&file, test_logical_reads, &ctx).await;
+        }
+    }
+
+    #[tokio::test]
+    #[should_panic]
+    async fn test_reusing_logical_reads_panics() {
+        let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error);
+        let file = InMemoryFile::new_random(DIO_CHUNK_SIZE);
+        let a = file.test_logical_read(23, 10);
+        let logical_reads = vec![a.make_logical_read()];
+        execute(&file, &logical_reads, &ctx).await;
+        // reuse pancis
+        execute(&file, &logical_reads, &ctx).await;
+    }
+
+    struct RecorderFile<'a> {
+        recorded: RefCell<Vec<RecordedRead>>,
+        file: &'a InMemoryFile,
+    }
+
+    struct RecordedRead {
+        pos: u64,
+        req_len: usize,
+        res: Vec<u8>,
+    }
+
+    impl<'a> RecorderFile<'a> {
+        fn new(file: &'a InMemoryFile) -> RecorderFile<'a> {
+            Self {
+                recorded: Default::default(),
+                file,
+            }
+        }
+    }
+
+    impl<'x> File for RecorderFile<'x> {
+        async fn read_exact_at_eof_ok<'a, 'b, B: IoBufMut + Send>(
+            &'b self,
+            start: u64,
+            dst: Slice<B>,
+            ctx: &'a RequestContext,
+        ) -> std::io::Result<(Slice<B>, usize)> {
+            let (dst, nread) = self.file.read_exact_at_eof_ok(start, dst, ctx).await?;
+            self.recorded.borrow_mut().push(RecordedRead {
+                pos: start,
+                req_len: dst.bytes_total(),
+                res: Vec::from(&dst[..nread]),
+            });
+            Ok((dst, nread))
+        }
+    }
+
+    #[tokio::test]
+    async fn test_logical_reads_to_same_chunk_are_merged_into_one_chunk_read() {
+        let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error);
+
+        let file = InMemoryFile::new_random(2 * DIO_CHUNK_SIZE);
+
+        let a = file.test_logical_read(DIO_CHUNK_SIZE.into_u64(), 10);
+        let b = file.test_logical_read(DIO_CHUNK_SIZE.into_u64() + 30, 20);
+
+        let recorder = RecorderFile::new(&file);
+
+        execute_and_validate_test_logical_reads(&recorder, vec![a, b], &ctx).await;
+
+        let recorded = recorder.recorded.borrow();
+        assert_eq!(recorded.len(), 1);
+        let RecordedRead { pos, req_len, .. } = &recorded[0];
+        assert_eq!(*pos, DIO_CHUNK_SIZE.into_u64());
+        assert_eq!(*req_len, DIO_CHUNK_SIZE);
+    }
+
+    #[tokio::test]
+    async fn test_max_chunk_batch_size_is_respected() {
+        let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error);
+
+        let file = InMemoryFile::new_random(4 * MAX_CHUNK_BATCH_SIZE * DIO_CHUNK_SIZE);
+
+        // read the 10th byte of each chunk 3 .. 3+2*MAX_CHUNK_BATCH_SIZE
+        assert!(3 < MAX_CHUNK_BATCH_SIZE, "test assumption");
+        assert!(10 < DIO_CHUNK_SIZE, "test assumption");
+        let mut test_logical_reads = Vec::new();
+        for i in 3..3 + MAX_CHUNK_BATCH_SIZE + MAX_CHUNK_BATCH_SIZE / 2 {
+            test_logical_reads
+                .push(file.test_logical_read(i.into_u64() * DIO_CHUNK_SIZE.into_u64() + 10, 1));
+        }
+
+        let recorder = RecorderFile::new(&file);
+
+        execute_and_validate_test_logical_reads(&recorder, test_logical_reads, &ctx).await;
+
+        let recorded = recorder.recorded.borrow();
+        assert_eq!(recorded.len(), 2);
+        {
+            let RecordedRead { pos, req_len, .. } = &recorded[0];
+            assert_eq!(*pos as usize, 3 * DIO_CHUNK_SIZE);
+            assert_eq!(*req_len, MAX_CHUNK_BATCH_SIZE * DIO_CHUNK_SIZE);
+        }
+        {
+            let RecordedRead { pos, req_len, .. } = &recorded[1];
+            assert_eq!(*pos as usize, (3 + MAX_CHUNK_BATCH_SIZE) * DIO_CHUNK_SIZE);
+            assert_eq!(*req_len, MAX_CHUNK_BATCH_SIZE / 2 * DIO_CHUNK_SIZE);
+        }
+    }
+
+    #[tokio::test]
+    async fn test_batch_breaks_if_chunk_is_not_interesting() {
+        let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error);
+
+        assert!(MAX_CHUNK_BATCH_SIZE > 10, "test assumption");
+        let file = InMemoryFile::new_random(3 * DIO_CHUNK_SIZE);
+
+        let a = file.test_logical_read(0, 1); // chunk 0
+        let b = file.test_logical_read(2 * DIO_CHUNK_SIZE.into_u64(), 1); // chunk 2
+
+        let recorder = RecorderFile::new(&file);
+
+        execute_and_validate_test_logical_reads(&recorder, vec![a, b], &ctx).await;
+
+        let recorded = recorder.recorded.borrow();
+
+        assert_eq!(recorded.len(), 2);
+        {
+            let RecordedRead { pos, req_len, .. } = &recorded[0];
+            assert_eq!(*pos, 0);
+            assert_eq!(*req_len, DIO_CHUNK_SIZE);
+        }
+        {
+            let RecordedRead { pos, req_len, .. } = &recorded[1];
+            assert_eq!(*pos, 2 * DIO_CHUNK_SIZE.into_u64());
+            assert_eq!(*req_len, DIO_CHUNK_SIZE);
+        }
+    }
+
+    struct ExpectedRead {
+        expect_pos: u64,
+        expect_len: usize,
+        respond: Result<Vec<u8>, String>,
+    }
+
+    struct MockFile {
+        expected: RefCell<VecDeque<ExpectedRead>>,
+    }
+
+    impl Drop for MockFile {
+        fn drop(&mut self) {
+            assert!(
+                self.expected.borrow().is_empty(),
+                "expected reads not satisfied"
+            );
+        }
+    }
+
+    macro_rules! mock_file {
+        ($($pos:expr , $len:expr => $respond:expr),* $(,)?) => {{
+            MockFile {
+                expected: RefCell::new(VecDeque::from(vec![$(ExpectedRead {
+                    expect_pos: $pos,
+                    expect_len: $len,
+                    respond: $respond,
+                }),*])),
+            }
+        }};
+    }
+
+    impl File for MockFile {
+        async fn read_exact_at_eof_ok<'a, 'b, B: IoBufMut + Send>(
+            &'b self,
+            start: u64,
+            mut dst: Slice<B>,
+            _ctx: &'a RequestContext,
+        ) -> std::io::Result<(Slice<B>, usize)> {
+            let ExpectedRead {
+                expect_pos,
+                expect_len,
+                respond,
+            } = self
+                .expected
+                .borrow_mut()
+                .pop_front()
+                .expect("unexpected read");
+            assert_eq!(start, expect_pos);
+            assert_eq!(dst.bytes_total(), expect_len);
+            match respond {
+                Ok(mocked_bytes) => {
+                    let len = std::cmp::min(dst.bytes_total(), mocked_bytes.len());
+                    let dst_slice: &mut [u8] = dst.as_mut_rust_slice_full_zeroed();
+                    dst_slice[..len].copy_from_slice(&mocked_bytes[..len]);
+                    rand::Rng::fill(&mut rand::thread_rng(), &mut dst_slice[len..]); // to discover bugs
+                    Ok((dst, len))
+                }
+                Err(e) => Err(std::io::Error::new(std::io::ErrorKind::Other, e)),
+            }
+        }
+    }
+
+    #[tokio::test]
+    async fn test_mock_file() {
+        // Self-test to ensure the relevant features of mock file work as expected.
+
+        let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error);
+
+        let mock_file = mock_file! {
+            0    , 512 => Ok(vec![0; 512]),
+            512  , 512 => Ok(vec![1; 512]),
+            1024 , 512 => Ok(vec![2; 10]),
+            2048,  1024 => Err("foo".to_owned()),
+        };
+
+        let buf = Vec::with_capacity(512);
+        let (buf, nread) = mock_file
+            .read_exact_at_eof_ok(0, buf.slice_full(), &ctx)
+            .await
+            .unwrap();
+        assert_eq!(nread, 512);
+        assert_eq!(&buf.into_inner()[..nread], &[0; 512]);
+
+        let buf = Vec::with_capacity(512);
+        let (buf, nread) = mock_file
+            .read_exact_at_eof_ok(512, buf.slice_full(), &ctx)
+            .await
+            .unwrap();
+        assert_eq!(nread, 512);
+        assert_eq!(&buf.into_inner()[..nread], &[1; 512]);
+
+        let buf = Vec::with_capacity(512);
+        let (buf, nread) = mock_file
+            .read_exact_at_eof_ok(1024, buf.slice_full(), &ctx)
+            .await
+            .unwrap();
+        assert_eq!(nread, 10);
+        assert_eq!(&buf.into_inner()[..nread], &[2; 10]);
+
+        let buf = Vec::with_capacity(1024);
+        let err = mock_file
+            .read_exact_at_eof_ok(2048, buf.slice_full(), &ctx)
+            .await
+            .err()
+            .unwrap();
+        assert_eq!(err.to_string(), "foo");
+    }
+
+    #[tokio::test]
+    async fn test_error_on_one_chunk_read_fails_only_dependent_logical_reads() {
+        let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error);
+
+        let test_logical_reads = vec![
+            // read spanning two batches
+            TestLogicalRead::new(
+                DIO_CHUNK_SIZE.into_u64() / 2,
+                MAX_CHUNK_BATCH_SIZE * DIO_CHUNK_SIZE,
+                Err("foo".to_owned()),
+            ),
+            // second read in failing chunk
+            TestLogicalRead::new(
+                (MAX_CHUNK_BATCH_SIZE * DIO_CHUNK_SIZE).into_u64() + DIO_CHUNK_SIZE.into_u64() - 10,
+                5,
+                Err("foo".to_owned()),
+            ),
+            // read unaffected
+            TestLogicalRead::new(
+                (MAX_CHUNK_BATCH_SIZE * DIO_CHUNK_SIZE).into_u64()
+                    + 2 * DIO_CHUNK_SIZE.into_u64()
+                    + 10,
+                5,
+                Ok(vec![1; 5]),
+            ),
+        ];
+        let (tmp, test_logical_reads) = test_logical_reads.into_iter().tee();
+        let test_logical_read_perms = tmp.permutations(test_logical_reads.len());
+
+        for test_logical_reads in test_logical_read_perms {
+            let file = mock_file!(
+                0, MAX_CHUNK_BATCH_SIZE*DIO_CHUNK_SIZE => Ok(vec![0; MAX_CHUNK_BATCH_SIZE*DIO_CHUNK_SIZE]),
+                (MAX_CHUNK_BATCH_SIZE*DIO_CHUNK_SIZE).into_u64(), DIO_CHUNK_SIZE => Err("foo".to_owned()),
+                (MAX_CHUNK_BATCH_SIZE*DIO_CHUNK_SIZE + 2*DIO_CHUNK_SIZE).into_u64(), DIO_CHUNK_SIZE => Ok(vec![1; DIO_CHUNK_SIZE]),
+            );
+            execute_and_validate_test_logical_reads(&file, test_logical_reads, &ctx).await;
+        }
+    }
+
+    struct TestShortReadsSetup {
+        ctx: RequestContext,
+        file: InMemoryFile,
+        written: u64,
+    }
+    fn setup_short_chunk_read_tests() -> TestShortReadsSetup {
+        let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error);
+        assert!(DIO_CHUNK_SIZE > 20, "test assumption");
+        let written = (2 * DIO_CHUNK_SIZE - 10).into_u64();
+        let file = InMemoryFile::new_random(written as usize);
+        TestShortReadsSetup { ctx, file, written }
+    }
+
+    #[tokio::test]
+    async fn test_short_chunk_read_from_written_range() {
+        // Test what happens if there are logical reads
+        // that start within the last chunk, and
+        // the last chunk is not the full chunk length.
+        //
+        // The read should succeed despite the short chunk length.
+        let TestShortReadsSetup { ctx, file, written } = setup_short_chunk_read_tests();
+
+        let a = file.test_logical_read(written - 10, 5);
+        let recorder = RecorderFile::new(&file);
+
+        execute_and_validate_test_logical_reads(&recorder, vec![a], &ctx).await;
+
+        let recorded = recorder.recorded.borrow();
+        assert_eq!(recorded.len(), 1);
+        let RecordedRead { pos, req_len, res } = &recorded[0];
+        assert_eq!(*pos, DIO_CHUNK_SIZE.into_u64());
+        assert_eq!(*req_len, DIO_CHUNK_SIZE);
+        assert_eq!(res, &file.content[DIO_CHUNK_SIZE..(written as usize)]);
+    }
+
+    #[tokio::test]
+    async fn test_short_chunk_read_and_logical_read_from_unwritten_range() {
+        // Test what happens if there are logical reads
+        // that start within the last chunk, and
+        // the last chunk is not the full chunk length, and
+        // the logical reads end in the unwritten range.
+        //
+        // All should fail with UnexpectedEof and have the same IO pattern.
+        async fn the_impl(offset_delta: i64) {
+            let TestShortReadsSetup { ctx, file, written } = setup_short_chunk_read_tests();
+
+            let offset = u64::try_from(
+                i64::try_from(written)
+                    .unwrap()
+                    .checked_add(offset_delta)
+                    .unwrap(),
+            )
+            .unwrap();
+            let a = file.test_logical_read(offset, 5);
+            let recorder = RecorderFile::new(&file);
+            let a_vr = a.make_logical_read();
+            execute(&recorder, vec![&a_vr], &ctx).await;
+
+            // validate the LogicalRead result
+            let a_res = a_vr.into_result().unwrap();
+            let a_err = a_res.unwrap_err();
+            assert_eq!(a_err.kind(), std::io::ErrorKind::UnexpectedEof);
+
+            // validate the IO pattern
+            let recorded = recorder.recorded.borrow();
+            assert_eq!(recorded.len(), 1);
+            let RecordedRead { pos, req_len, res } = &recorded[0];
+            assert_eq!(*pos, DIO_CHUNK_SIZE.into_u64());
+            assert_eq!(*req_len, DIO_CHUNK_SIZE);
+            assert_eq!(res, &file.content[DIO_CHUNK_SIZE..(written as usize)]);
+        }
+
+        the_impl(-1).await; // start == length - 1
+        the_impl(0).await; // start == length
+        the_impl(1).await; // start == length + 1
+    }
+
+    // TODO: mixed: some valid, some UnexpectedEof
+
+    // TODO: same tests but with merges
+}
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index 098c196ee8..e1dd80fbf2 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -69,7 +69,7 @@ use crate::{
         config::defaults::DEFAULT_PITR_INTERVAL,
         layer_map::{LayerMap, SearchResult},
         metadata::TimelineMetadata,
-        storage_layer::PersistentLayerDesc,
+        storage_layer::{inmemory_layer::IndexEntry, PersistentLayerDesc},
     },
     walredo,
 };
@@ -1907,6 +1907,8 @@ impl Timeline {
 
             true
         } else if projected_layer_size >= checkpoint_distance {
+            // NB: this check is relied upon by:
+            let _ = IndexEntry::validate_checkpoint_distance;
             info!(
                 "Will roll layer at {} with layer size {} due to layer size ({})",
                 projected_lsn, layer_size, projected_layer_size
@@ -5702,7 +5704,7 @@ impl<'a> TimelineWriter<'a> {
             return Ok(());
         }
 
-        let serialized_batch = inmemory_layer::SerializedBatch::from_values(batch);
+        let serialized_batch = inmemory_layer::SerializedBatch::from_values(batch)?;
         let batch_max_lsn = serialized_batch.max_lsn;
         let buf_size: u64 = serialized_batch.raw.len() as u64;
 
diff --git a/pageserver/src/virtual_file/owned_buffers_io/write.rs b/pageserver/src/virtual_file/owned_buffers_io/write.rs
index f8f37b17e3..568cf62e56 100644
--- a/pageserver/src/virtual_file/owned_buffers_io/write.rs
+++ b/pageserver/src/virtual_file/owned_buffers_io/write.rs
@@ -78,6 +78,7 @@ where
             .expect("must not use after we returned an error")
     }
 
+    /// Guarantees that if Ok() is returned, all bytes in `chunk` have been accepted.
     #[cfg_attr(target_os = "macos", allow(dead_code))]
     pub async fn write_buffered<S: IoBuf + Send>(
         &mut self,
diff --git a/test_runner/regress/test_pageserver_layer_rolling.py b/test_runner/regress/test_pageserver_layer_rolling.py
index 66b6185aaa..f6404d68ac 100644
--- a/test_runner/regress/test_pageserver_layer_rolling.py
+++ b/test_runner/regress/test_pageserver_layer_rolling.py
@@ -247,9 +247,10 @@ def test_total_size_limit(neon_env_builder: NeonEnvBuilder):
 
     compaction_period_s = 10
 
+    checkpoint_distance = 1024**3
     tenant_conf = {
         # Large space + time thresholds: effectively disable these limits
-        "checkpoint_distance": f"{1024 ** 4}",
+        "checkpoint_distance": f"{checkpoint_distance}",
         "checkpoint_timeout": "3600s",
         "compaction_period": f"{compaction_period_s}s",
     }
@@ -269,7 +270,11 @@ def test_total_size_limit(neon_env_builder: NeonEnvBuilder):
     for tenant, timeline, last_flush_lsn in last_flush_lsns:
         http_client = env.pageserver.http_client()
         initdb_lsn = Lsn(http_client.timeline_detail(tenant, timeline)["initdb_lsn"])
-        total_bytes_ingested += last_flush_lsn - initdb_lsn
+        this_timeline_ingested = last_flush_lsn - initdb_lsn
+        assert (
+            this_timeline_ingested < checkpoint_distance * 0.8
+        ), "this test is supposed to fill InMemoryLayer"
+        total_bytes_ingested += this_timeline_ingested
 
     log.info(f"Ingested {total_bytes_ingested} bytes since initdb (vs max dirty {max_dirty_data})")
     assert total_bytes_ingested > max_dirty_data

From acc075071dbb5f365f809fcf5372216e17adae6f Mon Sep 17 00:00:00 2001
From: Andrew Rudenko <me@prepor.dev>
Date: Wed, 28 Aug 2024 21:09:26 +0200
Subject: [PATCH 050/159] feat(compute_ctl): add periodic `lease lsn` requests
 for static computes (#7994)

Part of #7497

## Problem

Static computes pinned at some fix LSN could be created initially within
PITR interval but eventually go out it. To make sure that Static
computes are not affected by GC, we need to start using the LSN lease
API (introduced in #8084) in compute_ctl.

## Summary of changes

**compute_ctl**
- Spawn a thread for when a static compute starts to periodically ping
pageserver(s) to make LSN lease requests.
- Add `test_readonly_node_gc` to test if static compute can read all
pages without error.
  - (test will fail on main without the code change here)

**page_service**
- `wait_or_get_last_lsn` will now allow `request_lsn` less than
`latest_gc_cutoff_lsn` to proceed if there is a lease on `request_lsn`.

Signed-off-by: Yuchen Liang <yuchen@neon.tech>
Co-authored-by: Alexey Kondratov <kondratov.aleksey@gmail.com>
---
 compute_tools/src/bin/compute_ctl.rs      |   3 +
 compute_tools/src/lib.rs                  |   1 +
 compute_tools/src/lsn_lease.rs            | 186 ++++++++++++++++++++++
 pageserver/src/page_service.rs            |  45 ++++--
 test_runner/regress/test_readonly_node.py | 114 ++++++++++++-
 5 files changed, 331 insertions(+), 18 deletions(-)
 create mode 100644 compute_tools/src/lsn_lease.rs

diff --git a/compute_tools/src/bin/compute_ctl.rs b/compute_tools/src/bin/compute_ctl.rs
index 0ba2c1aeb4..9499a7186e 100644
--- a/compute_tools/src/bin/compute_ctl.rs
+++ b/compute_tools/src/bin/compute_ctl.rs
@@ -44,6 +44,7 @@ use std::{thread, time::Duration};
 use anyhow::{Context, Result};
 use chrono::Utc;
 use clap::Arg;
+use compute_tools::lsn_lease::launch_lsn_lease_bg_task_for_static;
 use signal_hook::consts::{SIGQUIT, SIGTERM};
 use signal_hook::{consts::SIGINT, iterator::Signals};
 use tracing::{error, info, warn};
@@ -366,6 +367,8 @@ fn wait_spec(
         state.start_time = now;
     }
 
+    launch_lsn_lease_bg_task_for_static(&compute);
+
     Ok(WaitSpecResult {
         compute,
         http_port,
diff --git a/compute_tools/src/lib.rs b/compute_tools/src/lib.rs
index 543d4462ed..c402d63305 100644
--- a/compute_tools/src/lib.rs
+++ b/compute_tools/src/lib.rs
@@ -11,6 +11,7 @@ pub mod logger;
 pub mod catalog;
 pub mod compute;
 pub mod extension_server;
+pub mod lsn_lease;
 mod migration;
 pub mod monitor;
 pub mod params;
diff --git a/compute_tools/src/lsn_lease.rs b/compute_tools/src/lsn_lease.rs
new file mode 100644
index 0000000000..7e5917c55f
--- /dev/null
+++ b/compute_tools/src/lsn_lease.rs
@@ -0,0 +1,186 @@
+use anyhow::bail;
+use anyhow::Result;
+use postgres::{NoTls, SimpleQueryMessage};
+use std::time::SystemTime;
+use std::{str::FromStr, sync::Arc, thread, time::Duration};
+use utils::id::TenantId;
+use utils::id::TimelineId;
+
+use compute_api::spec::ComputeMode;
+use tracing::{info, warn};
+use utils::{
+    lsn::Lsn,
+    shard::{ShardCount, ShardNumber, TenantShardId},
+};
+
+use crate::compute::ComputeNode;
+
+/// Spawns a background thread to periodically renew LSN leases for static compute.
+/// Do nothing if the compute is not in static mode.
+pub fn launch_lsn_lease_bg_task_for_static(compute: &Arc<ComputeNode>) {
+    let (tenant_id, timeline_id, lsn) = {
+        let state = compute.state.lock().unwrap();
+        let spec = state.pspec.as_ref().expect("Spec must be set");
+        match spec.spec.mode {
+            ComputeMode::Static(lsn) => (spec.tenant_id, spec.timeline_id, lsn),
+            _ => return,
+        }
+    };
+    let compute = compute.clone();
+
+    let span = tracing::info_span!("lsn_lease_bg_task", %tenant_id, %timeline_id, %lsn);
+    thread::spawn(move || {
+        let _entered = span.entered();
+        if let Err(e) = lsn_lease_bg_task(compute, tenant_id, timeline_id, lsn) {
+            // TODO: might need stronger error feedback than logging an warning.
+            warn!("Exited with error: {e}");
+        }
+    });
+}
+
+/// Renews lsn lease periodically so static compute are not affected by GC.
+fn lsn_lease_bg_task(
+    compute: Arc<ComputeNode>,
+    tenant_id: TenantId,
+    timeline_id: TimelineId,
+    lsn: Lsn,
+) -> Result<()> {
+    loop {
+        let valid_until = acquire_lsn_lease_with_retry(&compute, tenant_id, timeline_id, lsn)?;
+        let valid_duration = valid_until
+            .duration_since(SystemTime::now())
+            .unwrap_or(Duration::ZERO);
+
+        // Sleep for 60 seconds less than the valid duration but no more than half of the valid duration.
+        let sleep_duration = valid_duration
+            .saturating_sub(Duration::from_secs(60))
+            .max(valid_duration / 2);
+
+        info!(
+            "Succeeded, sleeping for {} seconds",
+            sleep_duration.as_secs()
+        );
+        thread::sleep(sleep_duration);
+    }
+}
+
+/// Acquires lsn lease in a retry loop. Returns the expiration time if a lease is granted.
+/// Returns an error if a lease is explicitly not granted. Otherwise, we keep sending requests.
+fn acquire_lsn_lease_with_retry(
+    compute: &Arc<ComputeNode>,
+    tenant_id: TenantId,
+    timeline_id: TimelineId,
+    lsn: Lsn,
+) -> Result<SystemTime> {
+    let mut attempts = 0usize;
+    let mut retry_period_ms: f64 = 500.0;
+    const MAX_RETRY_PERIOD_MS: f64 = 60.0 * 1000.0;
+
+    loop {
+        // Note: List of pageservers is dynamic, need to re-read configs before each attempt.
+        let configs = {
+            let state = compute.state.lock().unwrap();
+
+            let spec = state.pspec.as_ref().expect("spec must be set");
+
+            let conn_strings = spec.pageserver_connstr.split(',');
+
+            conn_strings
+                .map(|connstr| {
+                    let mut config = postgres::Config::from_str(connstr).expect("Invalid connstr");
+                    if let Some(storage_auth_token) = &spec.storage_auth_token {
+                        info!("Got storage auth token from spec file");
+                        config.password(storage_auth_token.clone());
+                    } else {
+                        info!("Storage auth token not set");
+                    }
+                    config
+                })
+                .collect::<Vec<_>>()
+        };
+
+        let result = try_acquire_lsn_lease(tenant_id, timeline_id, lsn, &configs);
+        match result {
+            Ok(Some(res)) => {
+                return Ok(res);
+            }
+            Ok(None) => {
+                bail!("Permanent error: lease could not be obtained, LSN is behind the GC cutoff");
+            }
+            Err(e) => {
+                warn!("Failed to acquire lsn lease: {e} (attempt {attempts}");
+
+                thread::sleep(Duration::from_millis(retry_period_ms as u64));
+                retry_period_ms *= 1.5;
+                retry_period_ms = retry_period_ms.min(MAX_RETRY_PERIOD_MS);
+            }
+        }
+        attempts += 1;
+    }
+}
+
+/// Tries to acquire an LSN lease through PS page_service API.
+fn try_acquire_lsn_lease(
+    tenant_id: TenantId,
+    timeline_id: TimelineId,
+    lsn: Lsn,
+    configs: &[postgres::Config],
+) -> Result<Option<SystemTime>> {
+    fn get_valid_until(
+        config: &postgres::Config,
+        tenant_shard_id: TenantShardId,
+        timeline_id: TimelineId,
+        lsn: Lsn,
+    ) -> Result<Option<SystemTime>> {
+        let mut client = config.connect(NoTls)?;
+        let cmd = format!("lease lsn {} {} {} ", tenant_shard_id, timeline_id, lsn);
+        let res = client.simple_query(&cmd)?;
+        let msg = match res.first() {
+            Some(msg) => msg,
+            None => bail!("empty response"),
+        };
+        let row = match msg {
+            SimpleQueryMessage::Row(row) => row,
+            _ => bail!("error parsing lsn lease response"),
+        };
+
+        // Note: this will be None if a lease is explicitly not granted.
+        let valid_until_str = row.get("valid_until");
+
+        let valid_until = valid_until_str.map(|s| {
+            SystemTime::UNIX_EPOCH
+                .checked_add(Duration::from_millis(u128::from_str(s).unwrap() as u64))
+                .expect("Time larger than max SystemTime could handle")
+        });
+        Ok(valid_until)
+    }
+
+    let shard_count = configs.len();
+
+    let valid_until = if shard_count > 1 {
+        configs
+            .iter()
+            .enumerate()
+            .map(|(shard_number, config)| {
+                let tenant_shard_id = TenantShardId {
+                    tenant_id,
+                    shard_count: ShardCount::new(shard_count as u8),
+                    shard_number: ShardNumber(shard_number as u8),
+                };
+                get_valid_until(config, tenant_shard_id, timeline_id, lsn)
+            })
+            .collect::<Result<Vec<Option<SystemTime>>>>()?
+            .into_iter()
+            .min()
+            .unwrap()
+    } else {
+        get_valid_until(
+            &configs[0],
+            TenantShardId::unsharded(tenant_id),
+            timeline_id,
+            lsn,
+        )?
+    };
+
+    Ok(valid_until)
+}
diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs
index cb1ab70147..39c6a6fb74 100644
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
@@ -753,16 +753,21 @@ impl PageServerHandler {
         }
 
         if request_lsn < **latest_gc_cutoff_lsn {
-            // Check explicitly for INVALID just to get a less scary error message if the
-            // request is obviously bogus
-            return Err(if request_lsn == Lsn::INVALID {
-                PageStreamError::BadRequest("invalid LSN(0) in request".into())
-            } else {
-                PageStreamError::BadRequest(format!(
+            let gc_info = &timeline.gc_info.read().unwrap();
+            if !gc_info.leases.contains_key(&request_lsn) {
+                // The requested LSN is below gc cutoff and is not guarded by a lease.
+
+                // Check explicitly for INVALID just to get a less scary error message if the
+                // request is obviously bogus
+                return Err(if request_lsn == Lsn::INVALID {
+                    PageStreamError::BadRequest("invalid LSN(0) in request".into())
+                } else {
+                    PageStreamError::BadRequest(format!(
                         "tried to request a page version that was garbage collected. requested at {} gc cutoff {}",
                         request_lsn, **latest_gc_cutoff_lsn
                     ).into())
-            });
+                });
+            }
         }
 
         // Wait for WAL up to 'not_modified_since' to arrive, if necessary
@@ -789,6 +794,8 @@ impl PageServerHandler {
         }
     }
 
+    /// Handles the lsn lease request.
+    /// If a lease cannot be obtained, the client will receive NULL.
     #[instrument(skip_all, fields(shard_id, %lsn))]
     async fn handle_make_lsn_lease<IO>(
         &mut self,
@@ -811,19 +818,25 @@ impl PageServerHandler {
             .await?;
         set_tracing_field_shard_id(&timeline);
 
-        let lease = timeline.make_lsn_lease(lsn, timeline.get_lsn_lease_length(), ctx)?;
-        let valid_until = lease
-            .valid_until
-            .duration_since(SystemTime::UNIX_EPOCH)
-            .map_err(|e| QueryError::Other(e.into()))?;
+        let lease = timeline
+            .make_lsn_lease(lsn, timeline.get_lsn_lease_length(), ctx)
+            .inspect_err(|e| {
+                warn!("{e}");
+            })
+            .ok();
+        let valid_until_str = lease.map(|l| {
+            l.valid_until
+                .duration_since(SystemTime::UNIX_EPOCH)
+                .expect("valid_until is earlier than UNIX_EPOCH")
+                .as_millis()
+                .to_string()
+        });
+        let bytes = valid_until_str.as_ref().map(|x| x.as_bytes());
 
         pgb.write_message_noflush(&BeMessage::RowDescription(&[RowDescriptor::text_col(
             b"valid_until",
         )]))?
-        .write_message_noflush(&BeMessage::DataRow(&[Some(
-            &valid_until.as_millis().to_be_bytes(),
-        )]))?
-        .write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
+        .write_message_noflush(&BeMessage::DataRow(&[bytes]))?;
 
         Ok(())
     }
diff --git a/test_runner/regress/test_readonly_node.py b/test_runner/regress/test_readonly_node.py
index ba8b91e84d..368f60127e 100644
--- a/test_runner/regress/test_readonly_node.py
+++ b/test_runner/regress/test_readonly_node.py
@@ -1,7 +1,15 @@
+import time
+
 import pytest
 from fixtures.common_types import Lsn
 from fixtures.log_helper import log
-from fixtures.neon_fixtures import NeonEnv
+from fixtures.neon_fixtures import (
+    Endpoint,
+    NeonEnv,
+    NeonEnvBuilder,
+    last_flush_lsn_upload,
+    tenant_get_shards,
+)
 from fixtures.pageserver.utils import wait_for_last_record_lsn
 from fixtures.utils import query_scalar
 
@@ -17,7 +25,12 @@ def test_readonly_node(neon_simple_env: NeonEnv):
     env.neon_cli.create_branch("test_readonly_node", "empty")
     endpoint_main = env.endpoints.create_start("test_readonly_node")
 
-    env.pageserver.allowed_errors.append(".*basebackup .* failed: invalid basebackup lsn.*")
+    env.pageserver.allowed_errors.extend(
+        [
+            ".*basebackup .* failed: invalid basebackup lsn.*",
+            ".*page_service.*handle_make_lsn_lease.*.*tried to request a page version that was garbage collected",
+        ]
+    )
 
     main_pg_conn = endpoint_main.connect()
     main_cur = main_pg_conn.cursor()
@@ -105,6 +118,103 @@ def test_readonly_node(neon_simple_env: NeonEnv):
         )
 
 
+def test_readonly_node_gc(neon_env_builder: NeonEnvBuilder):
+    """
+    Test static endpoint is protected from GC by acquiring and renewing lsn leases.
+    """
+
+    neon_env_builder.num_pageservers = 2
+    # GC is manual triggered.
+    env = neon_env_builder.init_start(
+        initial_tenant_conf={
+            # small checkpointing and compaction targets to ensure we generate many upload operations
+            "checkpoint_distance": f"{128 * 1024}",
+            "compaction_threshold": "1",
+            "compaction_target_size": f"{128 * 1024}",
+            # no PITR horizon, we specify the horizon when we request on-demand GC
+            "pitr_interval": "0s",
+            # disable background compaction and GC. We invoke it manually when we want it to happen.
+            "gc_period": "0s",
+            "compaction_period": "0s",
+            # create image layers eagerly, so that GC can remove some layers
+            "image_creation_threshold": "1",
+            "image_layer_creation_check_threshold": "0",
+            # Short lease length to fit test.
+            "lsn_lease_length": "3s",
+        },
+        initial_tenant_shard_count=2,
+    )
+
+    ROW_COUNT = 500
+
+    def generate_updates_on_main(
+        env: NeonEnv,
+        ep_main: Endpoint,
+        data: int,
+        start=1,
+        end=ROW_COUNT,
+    ) -> Lsn:
+        """
+        Generates some load on main branch that results in some uploads.
+        """
+        with ep_main.cursor() as cur:
+            cur.execute(
+                f"INSERT INTO t0 (v0, v1) SELECT g, '{data}' FROM generate_series({start}, {end}) g ON CONFLICT (v0) DO UPDATE SET v1 = EXCLUDED.v1"
+            )
+            cur.execute("VACUUM t0")
+            last_flush_lsn = last_flush_lsn_upload(
+                env, ep_main, env.initial_tenant, env.initial_timeline
+            )
+        return last_flush_lsn
+
+    # Insert some records on main branch
+    with env.endpoints.create_start("main") as ep_main:
+        with ep_main.cursor() as cur:
+            cur.execute("CREATE TABLE t0(v0 int primary key, v1 text)")
+        lsn = None
+        for i in range(2):
+            lsn = generate_updates_on_main(env, ep_main, i)
+
+        with env.endpoints.create_start(
+            branch_name="main",
+            endpoint_id="static",
+            lsn=lsn,
+        ) as ep_static:
+            with ep_static.cursor() as cur:
+                cur.execute("SELECT count(*) FROM t0")
+                assert cur.fetchone() == (ROW_COUNT,)
+
+            time.sleep(3)
+
+            generate_updates_on_main(env, ep_main, i, end=100)
+
+            # Trigger GC
+            for shard, ps in tenant_get_shards(env, env.initial_tenant):
+                client = ps.http_client()
+                gc_result = client.timeline_gc(shard, env.initial_timeline, 0)
+                log.info(f"{gc_result=}")
+
+                assert (
+                    gc_result["layers_removed"] == 0
+                ), "No layers should be removed, old layers are guarded by leases."
+
+            with ep_static.cursor() as cur:
+                cur.execute("SELECT count(*) FROM t0")
+                assert cur.fetchone() == (ROW_COUNT,)
+
+        # Do some update so we can increment latest_gc_cutoff
+        generate_updates_on_main(env, ep_main, i, end=100)
+
+    # Now trigger GC again, layers should be removed.
+    time.sleep(4)
+    for shard, ps in tenant_get_shards(env, env.initial_tenant):
+        client = ps.http_client()
+        gc_result = client.timeline_gc(shard, env.initial_timeline, 0)
+        log.info(f"{gc_result=}")
+
+        assert gc_result["layers_removed"] > 0, "Old layers should be removed after leases expired."
+
+
 # Similar test, but with more data, and we force checkpoints
 def test_timetravel(neon_simple_env: NeonEnv):
     env = neon_simple_env

From cfa45ff5eee33a46f54ab2571fddf5e47925f363 Mon Sep 17 00:00:00 2001
From: Konstantin Knizhnik <knizhnik@garret.ru>
Date: Thu, 29 Aug 2024 07:45:33 +0300
Subject: [PATCH 051/159] Undo walloging replorgin file on checkpoint (#8794)

## Problem

See #8620

## Summary of changes

Remove walloping of replorigin file because it is reconstructed by PS

## Checklist before requesting a review

- [ ] I have performed a self-review of my code.
- [ ] If it is a core feature, I have added thorough tests.
- [ ] Do we need to implement analytics? if so did you add the relevant
metrics to the dashboard?
- [ ] If this PR requires public announcement, mark it with
/release-notes label and add several sentences in this section.

## Checklist before merging

- [ ] Do not forget to reformat commit message to not include the above
checklist

---------

Co-authored-by: Konstantin Knizhnik <knizhnik@neon.tech>
---
 vendor/postgres-v14   | 2 +-
 vendor/postgres-v15   | 2 +-
 vendor/postgres-v16   | 2 +-
 vendor/revisions.json | 6 +++---
 4 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/vendor/postgres-v14 b/vendor/postgres-v14
index b6910406e2..48388a5b59 160000
--- a/vendor/postgres-v14
+++ b/vendor/postgres-v14
@@ -1 +1 @@
-Subproject commit b6910406e2d05a2c94baa2e530ec882733047759
+Subproject commit 48388a5b597c81c09e28c016650a7156b48717a1
diff --git a/vendor/postgres-v15 b/vendor/postgres-v15
index 76063bff63..8aa1ded772 160000
--- a/vendor/postgres-v15
+++ b/vendor/postgres-v15
@@ -1 +1 @@
-Subproject commit 76063bff638ccce7afa99fc9037ac51338b9823d
+Subproject commit 8aa1ded7726d416ac8e02600aad387a353478fc7
diff --git a/vendor/postgres-v16 b/vendor/postgres-v16
index 8efa089aa7..95132feffe 160000
--- a/vendor/postgres-v16
+++ b/vendor/postgres-v16
@@ -1 +1 @@
-Subproject commit 8efa089aa7786381543a4f9efc69b92d43eab8c0
+Subproject commit 95132feffe277ce84309d93a42e9aadfd2cb0437
diff --git a/vendor/revisions.json b/vendor/revisions.json
index 50cc99c2f1..319e648488 100644
--- a/vendor/revisions.json
+++ b/vendor/revisions.json
@@ -1,14 +1,14 @@
 {
   "v16": [
     "16.4",
-    "8efa089aa7786381543a4f9efc69b92d43eab8c0"
+    "95132feffe277ce84309d93a42e9aadfd2cb0437"
   ],
   "v15": [
     "15.8",
-    "76063bff638ccce7afa99fc9037ac51338b9823d"
+    "8aa1ded7726d416ac8e02600aad387a353478fc7"
   ],
   "v14": [
     "14.13",
-    "b6910406e2d05a2c94baa2e530ec882733047759"
+    "48388a5b597c81c09e28c016650a7156b48717a1"
   ]
 }

From c2f8fdccd79b89e14dcef072d6169691f8d49f5a Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Thu, 29 Aug 2024 13:06:00 +0200
Subject: [PATCH 052/159] ingest: rate-limited warning if WAL commit timestamps
 lags for > wait_lsn_timeout (#8839)

refs https://github.com/neondatabase/cloud/issues/13750

The logging in this commit will make it easier to detect lagging ingest.

We're trusting compute timestamps --- ideally we'd use SK timestmaps
instead.
But trusting the compute timestamp is ok for now.
---
 libs/postgres_ffi/src/lib.rs              |  2 +-
 libs/postgres_ffi/src/xlog_utils.rs       | 14 ++---
 libs/utils/src/rate_limit.rs              | 18 ++++++-
 pageserver/src/http/routes.rs             |  5 +-
 pageserver/src/tenant/timeline.rs         |  2 +-
 pageserver/src/walingest.rs               | 66 +++++++++++++++++++++++
 test_runner/regress/test_compatibility.py |  8 +++
 test_runner/regress/test_wal_receiver.py  |  6 +++
 8 files changed, 111 insertions(+), 10 deletions(-)

diff --git a/libs/postgres_ffi/src/lib.rs b/libs/postgres_ffi/src/lib.rs
index 0940ad207f..9acb105e9b 100644
--- a/libs/postgres_ffi/src/lib.rs
+++ b/libs/postgres_ffi/src/lib.rs
@@ -136,9 +136,9 @@ pub const MAX_SEND_SIZE: usize = XLOG_BLCKSZ * 16;
 
 // Export some version independent functions that are used outside of this mod
 pub use v14::xlog_utils::encode_logical_message;
-pub use v14::xlog_utils::from_pg_timestamp;
 pub use v14::xlog_utils::get_current_timestamp;
 pub use v14::xlog_utils::to_pg_timestamp;
+pub use v14::xlog_utils::try_from_pg_timestamp;
 pub use v14::xlog_utils::XLogFileName;
 
 pub use v14::bindings::DBState_DB_SHUTDOWNED;
diff --git a/libs/postgres_ffi/src/xlog_utils.rs b/libs/postgres_ffi/src/xlog_utils.rs
index 9fe7e8198b..0cfd56962e 100644
--- a/libs/postgres_ffi/src/xlog_utils.rs
+++ b/libs/postgres_ffi/src/xlog_utils.rs
@@ -135,6 +135,8 @@ pub fn get_current_timestamp() -> TimestampTz {
 mod timestamp_conversions {
     use std::time::Duration;
 
+    use anyhow::Context;
+
     use super::*;
 
     const UNIX_EPOCH_JDATE: u64 = 2440588; // == date2j(1970, 1, 1)
@@ -154,18 +156,18 @@ mod timestamp_conversions {
         }
     }
 
-    pub fn from_pg_timestamp(time: TimestampTz) -> SystemTime {
+    pub fn try_from_pg_timestamp(time: TimestampTz) -> anyhow::Result<SystemTime> {
         let time: u64 = time
             .try_into()
-            .expect("timestamp before millenium (postgres epoch)");
+            .context("timestamp before millenium (postgres epoch)")?;
         let since_unix_epoch = time + SECS_DIFF_UNIX_TO_POSTGRES_EPOCH * USECS_PER_SEC;
         SystemTime::UNIX_EPOCH
             .checked_add(Duration::from_micros(since_unix_epoch))
-            .expect("SystemTime overflow")
+            .context("SystemTime overflow")
     }
 }
 
-pub use timestamp_conversions::{from_pg_timestamp, to_pg_timestamp};
+pub use timestamp_conversions::{to_pg_timestamp, try_from_pg_timestamp};
 
 // Returns (aligned) end_lsn of the last record in data_dir with WAL segments.
 // start_lsn must point to some previously known record boundary (beginning of
@@ -545,14 +547,14 @@ mod tests {
     #[test]
     fn test_ts_conversion() {
         let now = SystemTime::now();
-        let round_trip = from_pg_timestamp(to_pg_timestamp(now));
+        let round_trip = try_from_pg_timestamp(to_pg_timestamp(now)).unwrap();
 
         let now_since = now.duration_since(SystemTime::UNIX_EPOCH).unwrap();
         let round_trip_since = round_trip.duration_since(SystemTime::UNIX_EPOCH).unwrap();
         assert_eq!(now_since.as_micros(), round_trip_since.as_micros());
 
         let now_pg = get_current_timestamp();
-        let round_trip_pg = to_pg_timestamp(from_pg_timestamp(now_pg));
+        let round_trip_pg = to_pg_timestamp(try_from_pg_timestamp(now_pg).unwrap());
 
         assert_eq!(now_pg, round_trip_pg);
     }
diff --git a/libs/utils/src/rate_limit.rs b/libs/utils/src/rate_limit.rs
index 557955bb88..f3f8f219e3 100644
--- a/libs/utils/src/rate_limit.rs
+++ b/libs/utils/src/rate_limit.rs
@@ -5,6 +5,15 @@ use std::time::{Duration, Instant};
 pub struct RateLimit {
     last: Option<Instant>,
     interval: Duration,
+    dropped: u64,
+}
+
+pub struct RateLimitStats(u64);
+
+impl std::fmt::Display for RateLimitStats {
+    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
+        write!(f, "{} dropped calls", self.0)
+    }
 }
 
 impl RateLimit {
@@ -12,20 +21,27 @@ impl RateLimit {
         Self {
             last: None,
             interval,
+            dropped: 0,
         }
     }
 
     /// Call `f` if the rate limit allows.
     /// Don't call it otherwise.
     pub fn call<F: FnOnce()>(&mut self, f: F) {
+        self.call2(|_| f())
+    }
+
+    pub fn call2<F: FnOnce(RateLimitStats)>(&mut self, f: F) {
         let now = Instant::now();
         match self.last {
             Some(last) if now - last <= self.interval => {
                 // ratelimit
+                self.dropped += 1;
             }
             _ => {
                 self.last = Some(now);
-                f();
+                f(RateLimitStats(self.dropped));
+                self.dropped = 0;
             }
         }
     }
diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs
index a126136d20..cb7c2b60ef 100644
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -871,7 +871,10 @@ async fn get_timestamp_of_lsn_handler(
 
     match result {
         Some(time) => {
-            let time = format_rfc3339(postgres_ffi::from_pg_timestamp(time)).to_string();
+            let time = format_rfc3339(
+                postgres_ffi::try_from_pg_timestamp(time).map_err(ApiError::InternalServerError)?,
+            )
+            .to_string();
             json_response(StatusCode::OK, time)
         }
         None => Err(ApiError::NotFound(
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index e1dd80fbf2..8096a0d18c 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -218,7 +218,7 @@ pub(crate) struct RelSizeCache {
 }
 
 pub struct Timeline {
-    conf: &'static PageServerConf,
+    pub(crate) conf: &'static PageServerConf,
     tenant_conf: Arc<ArcSwap<AttachedTenantConf>>,
 
     myself: Weak<Self>,
diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs
index 8425528740..8ccd20adb1 100644
--- a/pageserver/src/walingest.rs
+++ b/pageserver/src/walingest.rs
@@ -21,19 +21,25 @@
 //! redo Postgres process, but some records it can handle directly with
 //! bespoken Rust code.
 
+use std::time::Duration;
+use std::time::SystemTime;
+
 use pageserver_api::shard::ShardIdentity;
 use postgres_ffi::v14::nonrelfile_utils::clogpage_precedes;
 use postgres_ffi::v14::nonrelfile_utils::slru_may_delete_clogsegment;
+use postgres_ffi::TimestampTz;
 use postgres_ffi::{fsm_logical_to_physical, page_is_new, page_set_lsn};
 
 use anyhow::{bail, Context, Result};
 use bytes::{Buf, Bytes, BytesMut};
 use tracing::*;
 use utils::failpoint_support;
+use utils::rate_limit::RateLimit;
 
 use crate::context::RequestContext;
 use crate::metrics::WAL_INGEST;
 use crate::pgdatadir_mapping::{DatadirModification, Version};
+use crate::span::debug_assert_current_span_has_tenant_and_timeline_id;
 use crate::tenant::PageReconstructError;
 use crate::tenant::Timeline;
 use crate::walrecord::*;
@@ -53,6 +59,13 @@ pub struct WalIngest {
     shard: ShardIdentity,
     checkpoint: CheckPoint,
     checkpoint_modified: bool,
+    warn_ingest_lag: WarnIngestLag,
+}
+
+struct WarnIngestLag {
+    lag_msg_ratelimit: RateLimit,
+    future_lsn_msg_ratelimit: RateLimit,
+    timestamp_invalid_msg_ratelimit: RateLimit,
 }
 
 impl WalIngest {
@@ -71,6 +84,11 @@ impl WalIngest {
             shard: *timeline.get_shard_identity(),
             checkpoint,
             checkpoint_modified: false,
+            warn_ingest_lag: WarnIngestLag {
+                lag_msg_ratelimit: RateLimit::new(std::time::Duration::from_secs(10)),
+                future_lsn_msg_ratelimit: RateLimit::new(std::time::Duration::from_secs(10)),
+                timestamp_invalid_msg_ratelimit: RateLimit::new(std::time::Duration::from_secs(10)),
+            },
         })
     }
 
@@ -1212,6 +1230,48 @@ impl WalIngest {
         Ok(())
     }
 
+    fn warn_on_ingest_lag(
+        &mut self,
+        conf: &crate::config::PageServerConf,
+        wal_timestmap: TimestampTz,
+    ) {
+        debug_assert_current_span_has_tenant_and_timeline_id();
+        let now = SystemTime::now();
+        let rate_limits = &mut self.warn_ingest_lag;
+        match try_from_pg_timestamp(wal_timestmap) {
+            Ok(ts) => {
+                match now.duration_since(ts) {
+                    Ok(lag) => {
+                        if lag > conf.wait_lsn_timeout {
+                            rate_limits.lag_msg_ratelimit.call2(|rate_limit_stats| {
+                                let lag = humantime::format_duration(lag);
+                                warn!(%rate_limit_stats, %lag, "ingesting record with timestamp lagging more than wait_lsn_timeout");
+                            })
+                        }
+                    },
+                    Err(e) => {
+                        let delta_t = e.duration();
+                        // determined by prod victoriametrics query: 1000 * (timestamp(node_time_seconds{neon_service="pageserver"}) - node_time_seconds)
+                        // => https://www.robustperception.io/time-metric-from-the-node-exporter/
+                        const IGNORED_DRIFT: Duration = Duration::from_millis(100);
+                        if delta_t > IGNORED_DRIFT {
+                            let delta_t = humantime::format_duration(delta_t);
+                            rate_limits.future_lsn_msg_ratelimit.call2(|rate_limit_stats| {
+                                warn!(%rate_limit_stats, %delta_t, "ingesting record with timestamp from future");
+                            })
+                        }
+                    }
+                };
+
+            }
+            Err(error) => {
+                rate_limits.timestamp_invalid_msg_ratelimit.call2(|rate_limit_stats| {
+                    warn!(%rate_limit_stats, %error, "ingesting record with invalid timestamp, cannot calculate lag and will fail find-lsn-for-timestamp type queries");
+                })
+            }
+        }
+    }
+
     /// Subroutine of ingest_record(), to handle an XLOG_XACT_* records.
     ///
     async fn ingest_xact_record(
@@ -1228,6 +1288,8 @@ impl WalIngest {
         let mut rpageno = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT;
         let mut page_xids: Vec<TransactionId> = vec![parsed.xid];
 
+        self.warn_on_ingest_lag(modification.tline.conf, parsed.xact_time);
+
         for subxact in &parsed.subxacts {
             let subxact_pageno = subxact / pg_constants::CLOG_XACTS_PER_PAGE;
             if subxact_pageno != pageno {
@@ -2303,6 +2365,9 @@ mod tests {
         let _endpoint = Lsn::from_hex("1FFFF98").unwrap();
 
         let harness = TenantHarness::create("test_ingest_real_wal").await.unwrap();
+        let span = harness
+            .span()
+            .in_scope(|| info_span!("timeline_span", timeline_id=%TIMELINE_ID));
         let (tenant, ctx) = harness.load().await;
 
         let remote_initdb_path =
@@ -2354,6 +2419,7 @@ mod tests {
             while let Some((lsn, recdata)) = decoder.poll_decode().unwrap() {
                 walingest
                     .ingest_record(recdata, lsn, &mut modification, &mut decoded, &ctx)
+                    .instrument(span.clone())
                     .await
                     .unwrap();
             }
diff --git a/test_runner/regress/test_compatibility.py b/test_runner/regress/test_compatibility.py
index c361efe90a..cd3f405a86 100644
--- a/test_runner/regress/test_compatibility.py
+++ b/test_runner/regress/test_compatibility.py
@@ -173,6 +173,11 @@ def test_backward_compatibility(
     try:
         neon_env_builder.num_safekeepers = 3
         env = neon_env_builder.from_repo_dir(compatibility_snapshot_dir / "repo")
+        # check_neon_works does recovery from WAL => the compatibility snapshot's WAL is old => will log this warning
+        ingest_lag_log_line = (
+            ".*ingesting record with timestamp lagging more than wait_lsn_timeout.*"
+        )
+        env.pageserver.allowed_errors.append(ingest_lag_log_line)
         neon_env_builder.start()
 
         check_neon_works(
@@ -181,6 +186,9 @@ def test_backward_compatibility(
             sql_dump_path=compatibility_snapshot_dir / "dump.sql",
             repo_dir=env.repo_dir,
         )
+
+        env.pageserver.assert_log_contains(ingest_lag_log_line)
+
     except Exception:
         if breaking_changes_allowed:
             pytest.xfail(
diff --git a/test_runner/regress/test_wal_receiver.py b/test_runner/regress/test_wal_receiver.py
index 6582b34218..229d3efd8e 100644
--- a/test_runner/regress/test_wal_receiver.py
+++ b/test_runner/regress/test_wal_receiver.py
@@ -62,6 +62,12 @@ def test_pageserver_lsn_wait_error_safekeeper_stop(neon_env_builder: NeonEnvBuil
     elements_to_insert = 1_000_000
     expected_timeout_error = f"Timed out while waiting for WAL record at LSN {future_lsn} to arrive"
     env.pageserver.allowed_errors.append(f".*{expected_timeout_error}.*")
+    # we configure wait_lsn_timeout to a shorter value than the lagging_wal_timeout / walreceiver_connect_timeout
+    # => after we run into a timeout and reconnect to a different SK, more time than wait_lsn_timeout has passed
+    # ==> we log this error
+    env.pageserver.allowed_errors.append(
+        ".*ingesting record with timestamp lagging more than wait_lsn_timeout.*"
+    )
 
     insert_test_elements(env, tenant_id, start=0, count=elements_to_insert)
 

From a644f01b6af2d414f877a78bddb928f0b033762d Mon Sep 17 00:00:00 2001
From: Conrad Ludgate <conrad@neon.tech>
Date: Thu, 29 Aug 2024 12:26:52 +0100
Subject: [PATCH 053/159] proxy+pageserver: shared leaky bucket impl (#8539)

In proxy I switched to a leaky-bucket impl using the GCRA algorithm. I
figured I could share the code with pageserver and remove the
leaky_bucket crate dependency with some very basic tokio timers and
queues for fairness.

The underlying algorithm should be fairly clear how it works from the
comments I have left in the code.

---

In benchmarking pageserver, @problame found that the new implementation
fixes a getpage throughput discontinuity in pageserver under the
`pagebench get-page-latest-lsn` benchmark with the clickbench dataset
(`test_perf_olap.py`).
The discontinuity is that for any of `--num-clients={2,3,4}`, getpage
throughput remains 10k.
With `--num-clients=5` and greater, getpage throughput then jumps to the
configured 20k rate limit.
With the changes in this PR, the discontinuity is gone, and we scale
throughput linearly to `--num-clients` until the configured rate limit.

More context in
https://github.com/neondatabase/cloud/issues/16886#issuecomment-2315257641.

closes https://github.com/neondatabase/cloud/issues/16886

---------

Co-authored-by: Joonas Koivunen <joonas@neon.tech>
Co-authored-by: Christian Schwarz <christian@neon.tech>
---
 Cargo.lock                                    |  13 -
 Cargo.toml                                    |   1 -
 libs/pageserver_api/src/models.rs             |  12 +-
 libs/utils/Cargo.toml                         |   1 -
 libs/utils/src/leaky_bucket.rs                | 280 ++++++++++++++++++
 libs/utils/src/lib.rs                         |   1 +
 pageserver/Cargo.toml                         |   1 -
 pageserver/src/tenant/throttle.rs             |  47 ++-
 proxy/src/rate_limiter.rs                     |   4 +-
 proxy/src/rate_limiter/leaky_bucket.rs        |  92 ++----
 .../regress/test_attach_tenant_config.py      |   1 -
 .../test_pageserver_getpage_throttle.py       |  56 ++++
 12 files changed, 395 insertions(+), 114 deletions(-)
 create mode 100644 libs/utils/src/leaky_bucket.rs

diff --git a/Cargo.lock b/Cargo.lock
index c514625518..0c246bd258 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -2950,17 +2950,6 @@ version = "1.3.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "830d08ce1d1d941e6b30645f1a0eb5643013d835ce3779a5fc208261dbe10f55"
 
-[[package]]
-name = "leaky-bucket"
-version = "1.0.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8eb491abd89e9794d50f93c8db610a29509123e3fbbc9c8c67a528e9391cd853"
-dependencies = [
- "parking_lot 0.12.1",
- "tokio",
- "tracing",
-]
-
 [[package]]
 name = "libc"
 version = "0.2.150"
@@ -3714,7 +3703,6 @@ dependencies = [
  "humantime-serde",
  "hyper 0.14.26",
  "itertools 0.10.5",
- "leaky-bucket",
  "md5",
  "metrics",
  "nix 0.27.1",
@@ -6983,7 +6971,6 @@ dependencies = [
  "humantime",
  "hyper 0.14.26",
  "jsonwebtoken",
- "leaky-bucket",
  "metrics",
  "nix 0.27.1",
  "once_cell",
diff --git a/Cargo.toml b/Cargo.toml
index 7bd9a26394..fa949f9757 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -108,7 +108,6 @@ ipnet = "2.9.0"
 itertools = "0.10"
 jsonwebtoken = "9"
 lasso = "0.7"
-leaky-bucket = "1.0.1"
 libc = "0.2"
 md5 = "0.7.0"
 measured = { version = "0.0.22", features=["lasso"] }
diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs
index d39ac75707..1d896863df 100644
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -7,7 +7,7 @@ pub use utilization::PageserverUtilization;
 use std::{
     collections::HashMap,
     io::{BufRead, Read},
-    num::{NonZeroU64, NonZeroUsize},
+    num::{NonZeroU32, NonZeroU64, NonZeroUsize},
     str::FromStr,
     sync::atomic::AtomicUsize,
     time::{Duration, SystemTime},
@@ -486,12 +486,11 @@ pub struct EvictionPolicyLayerAccessThreshold {
 #[derive(Debug, Serialize, Deserialize, Clone, PartialEq, Eq)]
 pub struct ThrottleConfig {
     pub task_kinds: Vec<String>, // TaskKind
-    pub initial: usize,
+    pub initial: u32,
     #[serde(with = "humantime_serde")]
     pub refill_interval: Duration,
-    pub refill_amount: NonZeroUsize,
-    pub max: usize,
-    pub fair: bool,
+    pub refill_amount: NonZeroU32,
+    pub max: u32,
 }
 
 impl ThrottleConfig {
@@ -501,9 +500,8 @@ impl ThrottleConfig {
             // other values don't matter with emtpy `task_kinds`.
             initial: 0,
             refill_interval: Duration::from_millis(1),
-            refill_amount: NonZeroUsize::new(1).unwrap(),
+            refill_amount: NonZeroU32::new(1).unwrap(),
             max: 1,
-            fair: true,
         }
     }
     /// The requests per second allowed  by the given config.
diff --git a/libs/utils/Cargo.toml b/libs/utils/Cargo.toml
index 6e593eeac1..777fb95ece 100644
--- a/libs/utils/Cargo.toml
+++ b/libs/utils/Cargo.toml
@@ -26,7 +26,6 @@ hyper = { workspace = true, features = ["full"] }
 fail.workspace = true
 futures = { workspace = true}
 jsonwebtoken.workspace = true
-leaky-bucket.workspace = true
 nix.workspace = true
 once_cell.workspace = true
 pin-project-lite.workspace = true
diff --git a/libs/utils/src/leaky_bucket.rs b/libs/utils/src/leaky_bucket.rs
new file mode 100644
index 0000000000..a120dc0ac5
--- /dev/null
+++ b/libs/utils/src/leaky_bucket.rs
@@ -0,0 +1,280 @@
+//! This module implements the Generic Cell Rate Algorithm for a simplified
+//! version of the Leaky Bucket rate limiting system.
+//!
+//! # Leaky Bucket
+//!
+//! If the bucket is full, no new requests are allowed and are throttled/errored.
+//! If the bucket is partially full/empty, new requests are added to the bucket in
+//! terms of "tokens".
+//!
+//! Over time, tokens are removed from the bucket, naturally allowing new requests at a steady rate.
+//!
+//! The bucket size tunes the burst support. The drain rate tunes the steady-rate requests per second.
+//!
+//! # [GCRA](https://en.wikipedia.org/wiki/Generic_cell_rate_algorithm)
+//!
+//! GCRA is a continuous rate leaky-bucket impl that stores minimal state and requires
+//! no background jobs to drain tokens, as the design utilises timestamps to drain automatically over time.
+//!
+//! We store an "empty_at" timestamp as the only state. As time progresses, we will naturally approach
+//! the empty state. The full-bucket state is calculated from `empty_at - config.bucket_width`.
+//!
+//! Another explaination can be found here: <https://brandur.org/rate-limiting>
+
+use std::{sync::Mutex, time::Duration};
+
+use tokio::{sync::Notify, time::Instant};
+
+pub struct LeakyBucketConfig {
+    /// This is the "time cost" of a single request unit.
+    /// Should loosely represent how long it takes to handle a request unit in active resource time.
+    /// Loosely speaking this is the inverse of the steady-rate requests-per-second
+    pub cost: Duration,
+
+    /// total size of the bucket
+    pub bucket_width: Duration,
+}
+
+impl LeakyBucketConfig {
+    pub fn new(rps: f64, bucket_size: f64) -> Self {
+        let cost = Duration::from_secs_f64(rps.recip());
+        let bucket_width = cost.mul_f64(bucket_size);
+        Self { cost, bucket_width }
+    }
+}
+
+pub struct LeakyBucketState {
+    /// Bucket is represented by `allow_at..empty_at` where `allow_at = empty_at - config.bucket_width`.
+    ///
+    /// At any given time, `empty_at - now` represents the number of tokens in the bucket, multiplied by the "time_cost".
+    /// Adding `n` tokens to the bucket is done by moving `empty_at` forward by `n * config.time_cost`.
+    /// If `now < allow_at`, the bucket is considered filled and cannot accept any more tokens.
+    /// Draining the bucket will happen naturally as `now` moves forward.
+    ///
+    /// Let `n` be some "time cost" for the request,
+    /// If now is after empty_at, the bucket is empty and the empty_at is reset to now,
+    /// If now is within the `bucket window + n`, we are within time budget.
+    /// If now is before the `bucket window + n`, we have run out of budget.
+    ///
+    /// This is inspired by the generic cell rate algorithm (GCRA) and works
+    /// exactly the same as a leaky-bucket.
+    pub empty_at: Instant,
+}
+
+impl LeakyBucketState {
+    pub fn with_initial_tokens(config: &LeakyBucketConfig, initial_tokens: f64) -> Self {
+        LeakyBucketState {
+            empty_at: Instant::now() + config.cost.mul_f64(initial_tokens),
+        }
+    }
+
+    pub fn bucket_is_empty(&self, now: Instant) -> bool {
+        // if self.end is after now, the bucket is not empty
+        self.empty_at <= now
+    }
+
+    /// Immediately adds tokens to the bucket, if there is space.
+    ///
+    /// In a scenario where you are waiting for available rate,
+    /// rather than just erroring immediately, `started` corresponds to when this waiting started.
+    ///
+    /// `n` is the number of tokens that will be filled in the bucket.
+    ///
+    /// # Errors
+    ///
+    /// If there is not enough space, no tokens are added. Instead, an error is returned with the time when
+    /// there will be space again.
+    pub fn add_tokens(
+        &mut self,
+        config: &LeakyBucketConfig,
+        started: Instant,
+        n: f64,
+    ) -> Result<(), Instant> {
+        let now = Instant::now();
+
+        // invariant: started <= now
+        debug_assert!(started <= now);
+
+        // If the bucket was empty when we started our search,
+        // we should update the `empty_at` value accordingly.
+        // this prevents us from having negative tokens in the bucket.
+        let mut empty_at = self.empty_at;
+        if empty_at < started {
+            empty_at = started;
+        }
+
+        let n = config.cost.mul_f64(n);
+        let new_empty_at = empty_at + n;
+        let allow_at = new_empty_at.checked_sub(config.bucket_width);
+
+        //                     empty_at
+        //          allow_at    |   new_empty_at
+        //           /          |   /
+        // -------o-[---------o-|--]---------
+        //   now1 ^      now2 ^
+        //
+        // at now1, the bucket would be completely filled if we add n tokens.
+        // at now2, the bucket would be partially filled if we add n tokens.
+
+        match allow_at {
+            Some(allow_at) if now < allow_at => Err(allow_at),
+            _ => {
+                self.empty_at = new_empty_at;
+                Ok(())
+            }
+        }
+    }
+}
+
+pub struct RateLimiter {
+    pub config: LeakyBucketConfig,
+    pub state: Mutex<LeakyBucketState>,
+    /// a queue to provide this fair ordering.
+    pub queue: Notify,
+}
+
+struct Requeue<'a>(&'a Notify);
+
+impl Drop for Requeue<'_> {
+    fn drop(&mut self) {
+        self.0.notify_one();
+    }
+}
+
+impl RateLimiter {
+    pub fn with_initial_tokens(config: LeakyBucketConfig, initial_tokens: f64) -> Self {
+        RateLimiter {
+            state: Mutex::new(LeakyBucketState::with_initial_tokens(
+                &config,
+                initial_tokens,
+            )),
+            config,
+            queue: {
+                let queue = Notify::new();
+                queue.notify_one();
+                queue
+            },
+        }
+    }
+
+    pub fn steady_rps(&self) -> f64 {
+        self.config.cost.as_secs_f64().recip()
+    }
+
+    /// returns true if we did throttle
+    pub async fn acquire(&self, count: usize) -> bool {
+        let mut throttled = false;
+
+        let start = tokio::time::Instant::now();
+
+        // wait until we are the first in the queue
+        let mut notified = std::pin::pin!(self.queue.notified());
+        if !notified.as_mut().enable() {
+            throttled = true;
+            notified.await;
+        }
+
+        // notify the next waiter in the queue when we are done.
+        let _guard = Requeue(&self.queue);
+
+        loop {
+            let res = self
+                .state
+                .lock()
+                .unwrap()
+                .add_tokens(&self.config, start, count as f64);
+            match res {
+                Ok(()) => return throttled,
+                Err(ready_at) => {
+                    throttled = true;
+                    tokio::time::sleep_until(ready_at).await;
+                }
+            }
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use std::time::Duration;
+
+    use tokio::time::Instant;
+
+    use super::{LeakyBucketConfig, LeakyBucketState};
+
+    #[tokio::test(start_paused = true)]
+    async fn check() {
+        let config = LeakyBucketConfig {
+            // average 100rps
+            cost: Duration::from_millis(10),
+            // burst up to 100 requests
+            bucket_width: Duration::from_millis(1000),
+        };
+
+        let mut state = LeakyBucketState {
+            empty_at: Instant::now(),
+        };
+
+        // supports burst
+        {
+            // should work for 100 requests this instant
+            for _ in 0..100 {
+                state.add_tokens(&config, Instant::now(), 1.0).unwrap();
+            }
+            let ready = state.add_tokens(&config, Instant::now(), 1.0).unwrap_err();
+            assert_eq!(ready - Instant::now(), Duration::from_millis(10));
+        }
+
+        // doesn't overfill
+        {
+            // after 1s we should have an empty bucket again.
+            tokio::time::advance(Duration::from_secs(1)).await;
+            assert!(state.bucket_is_empty(Instant::now()));
+
+            // after 1s more, we should not over count the tokens and allow more than 200 requests.
+            tokio::time::advance(Duration::from_secs(1)).await;
+            for _ in 0..100 {
+                state.add_tokens(&config, Instant::now(), 1.0).unwrap();
+            }
+            let ready = state.add_tokens(&config, Instant::now(), 1.0).unwrap_err();
+            assert_eq!(ready - Instant::now(), Duration::from_millis(10));
+        }
+
+        // supports sustained rate over a long period
+        {
+            tokio::time::advance(Duration::from_secs(1)).await;
+
+            // should sustain 100rps
+            for _ in 0..2000 {
+                tokio::time::advance(Duration::from_millis(10)).await;
+                state.add_tokens(&config, Instant::now(), 1.0).unwrap();
+            }
+        }
+
+        // supports requesting more tokens than can be stored in the bucket
+        // we just wait a little bit longer upfront.
+        {
+            // start the bucket completely empty
+            tokio::time::advance(Duration::from_secs(5)).await;
+            assert!(state.bucket_is_empty(Instant::now()));
+
+            // requesting 200 tokens of space should take 200*cost = 2s
+            // but we already have 1s available, so we wait 1s from start.
+            let start = Instant::now();
+
+            let ready = state.add_tokens(&config, start, 200.0).unwrap_err();
+            assert_eq!(ready - Instant::now(), Duration::from_secs(1));
+
+            tokio::time::advance(Duration::from_millis(500)).await;
+            let ready = state.add_tokens(&config, start, 200.0).unwrap_err();
+            assert_eq!(ready - Instant::now(), Duration::from_millis(500));
+
+            tokio::time::advance(Duration::from_millis(500)).await;
+            state.add_tokens(&config, start, 200.0).unwrap();
+
+            // bucket should be completely full now
+            let ready = state.add_tokens(&config, Instant::now(), 1.0).unwrap_err();
+            assert_eq!(ready - Instant::now(), Duration::from_millis(10));
+        }
+    }
+}
diff --git a/libs/utils/src/lib.rs b/libs/utils/src/lib.rs
index f4fc0ba57b..218dd468b1 100644
--- a/libs/utils/src/lib.rs
+++ b/libs/utils/src/lib.rs
@@ -71,6 +71,7 @@ pub mod postgres_client;
 
 pub mod tracing_span_assert;
 
+pub mod leaky_bucket;
 pub mod rate_limit;
 
 /// Simple once-barrier and a guard which keeps barrier awaiting.
diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml
index 85c5e24afc..9c02ce3fbc 100644
--- a/pageserver/Cargo.toml
+++ b/pageserver/Cargo.toml
@@ -37,7 +37,6 @@ humantime.workspace = true
 humantime-serde.workspace = true
 hyper.workspace = true
 itertools.workspace = true
-leaky-bucket.workspace = true
 md5.workspace = true
 nix.workspace = true
 # hack to get the number of worker threads tokio uses
diff --git a/pageserver/src/tenant/throttle.rs b/pageserver/src/tenant/throttle.rs
index f3f3d5e3ae..f222e708e1 100644
--- a/pageserver/src/tenant/throttle.rs
+++ b/pageserver/src/tenant/throttle.rs
@@ -10,6 +10,7 @@ use std::{
 use arc_swap::ArcSwap;
 use enumset::EnumSet;
 use tracing::{error, warn};
+use utils::leaky_bucket::{LeakyBucketConfig, RateLimiter};
 
 use crate::{context::RequestContext, task_mgr::TaskKind};
 
@@ -33,8 +34,7 @@ pub struct Throttle<M: Metric> {
 
 pub struct Inner {
     task_kinds: EnumSet<TaskKind>,
-    rate_limiter: Arc<leaky_bucket::RateLimiter>,
-    config: Config,
+    rate_limiter: Arc<RateLimiter>,
 }
 
 pub type Config = pageserver_api::models::ThrottleConfig;
@@ -77,8 +77,7 @@ where
             refill_interval,
             refill_amount,
             max,
-            fair,
-        } = &config;
+        } = config;
         let task_kinds: EnumSet<TaskKind> = task_kinds
             .iter()
             .filter_map(|s| match TaskKind::from_str(s) {
@@ -93,18 +92,21 @@ where
                 }
             })
             .collect();
+
+        // steady rate, we expect `refill_amount` requests per `refill_interval`.
+        // dividing gives us the rps.
+        let rps = f64::from(refill_amount.get()) / refill_interval.as_secs_f64();
+        let config = LeakyBucketConfig::new(rps, f64::from(max));
+
+        // initial tracks how many tokens are available to put in the bucket
+        // we want how many tokens are currently in the bucket
+        let initial_tokens = max - initial;
+
+        let rate_limiter = RateLimiter::with_initial_tokens(config, f64::from(initial_tokens));
+
         Inner {
             task_kinds,
-            rate_limiter: Arc::new(
-                leaky_bucket::RateLimiter::builder()
-                    .initial(*initial)
-                    .interval(*refill_interval)
-                    .refill(refill_amount.get())
-                    .max(*max)
-                    .fair(*fair)
-                    .build(),
-            ),
-            config,
+            rate_limiter: Arc::new(rate_limiter),
         }
     }
     pub fn reconfigure(&self, config: Config) {
@@ -127,7 +129,7 @@ where
 
     /// See [`Config::steady_rps`].
     pub fn steady_rps(&self) -> f64 {
-        self.inner.load().config.steady_rps()
+        self.inner.load().rate_limiter.steady_rps()
     }
 
     pub async fn throttle(&self, ctx: &RequestContext, key_count: usize) -> Option<Duration> {
@@ -136,18 +138,9 @@ where
             return None;
         };
         let start = std::time::Instant::now();
-        let mut did_throttle = false;
-        let acquire = inner.rate_limiter.acquire(key_count);
-        // turn off runtime-induced preemption (aka coop) so our `did_throttle` is accurate
-        let acquire = tokio::task::unconstrained(acquire);
-        let mut acquire = std::pin::pin!(acquire);
-        std::future::poll_fn(|cx| {
-            use std::future::Future;
-            let poll = acquire.as_mut().poll(cx);
-            did_throttle = did_throttle || poll.is_pending();
-            poll
-        })
-        .await;
+
+        let did_throttle = inner.rate_limiter.acquire(key_count).await;
+
         self.count_accounted.fetch_add(1, Ordering::Relaxed);
         if did_throttle {
             self.count_throttled.fetch_add(1, Ordering::Relaxed);
diff --git a/proxy/src/rate_limiter.rs b/proxy/src/rate_limiter.rs
index e5f5867998..6e38f89458 100644
--- a/proxy/src/rate_limiter.rs
+++ b/proxy/src/rate_limiter.rs
@@ -10,7 +10,5 @@ pub(crate) use limit_algorithm::{
 };
 pub(crate) use limiter::GlobalRateLimiter;
 
-pub use leaky_bucket::{
-    EndpointRateLimiter, LeakyBucketConfig, LeakyBucketRateLimiter, LeakyBucketState,
-};
+pub use leaky_bucket::{EndpointRateLimiter, LeakyBucketConfig, LeakyBucketRateLimiter};
 pub use limiter::{BucketRateLimiter, RateBucketInfo, WakeComputeRateLimiter};
diff --git a/proxy/src/rate_limiter/leaky_bucket.rs b/proxy/src/rate_limiter/leaky_bucket.rs
index fa8cb75256..bf4d85f2e4 100644
--- a/proxy/src/rate_limiter/leaky_bucket.rs
+++ b/proxy/src/rate_limiter/leaky_bucket.rs
@@ -8,6 +8,7 @@ use dashmap::DashMap;
 use rand::{thread_rng, Rng};
 use tokio::time::Instant;
 use tracing::info;
+use utils::leaky_bucket::LeakyBucketState;
 
 use crate::intern::EndpointIdInt;
 
@@ -16,7 +17,7 @@ pub type EndpointRateLimiter = LeakyBucketRateLimiter<EndpointIdInt>;
 
 pub struct LeakyBucketRateLimiter<Key> {
     map: DashMap<Key, LeakyBucketState, RandomState>,
-    config: LeakyBucketConfig,
+    config: utils::leaky_bucket::LeakyBucketConfig,
     access_count: AtomicUsize,
 }
 
@@ -29,7 +30,7 @@ impl<K: Hash + Eq> LeakyBucketRateLimiter<K> {
     pub fn new_with_shards(config: LeakyBucketConfig, shards: usize) -> Self {
         Self {
             map: DashMap::with_hasher_and_shard_amount(RandomState::new(), shards),
-            config,
+            config: config.into(),
             access_count: AtomicUsize::new(0),
         }
     }
@@ -42,12 +43,12 @@ impl<K: Hash + Eq> LeakyBucketRateLimiter<K> {
             self.do_gc(now);
         }
 
-        let mut entry = self.map.entry(key).or_insert_with(|| LeakyBucketState {
-            time: now,
-            filled: 0.0,
-        });
+        let mut entry = self
+            .map
+            .entry(key)
+            .or_insert_with(|| LeakyBucketState { empty_at: now });
 
-        entry.check(&self.config, now, n as f64)
+        entry.add_tokens(&self.config, now, n as f64).is_ok()
     }
 
     fn do_gc(&self, now: Instant) {
@@ -59,7 +60,7 @@ impl<K: Hash + Eq> LeakyBucketRateLimiter<K> {
         let shard = thread_rng().gen_range(0..n);
         self.map.shards()[shard]
             .write()
-            .retain(|_, value| !value.get_mut().update(&self.config, now));
+            .retain(|_, value| !value.get().bucket_is_empty(now));
     }
 }
 
@@ -68,11 +69,6 @@ pub struct LeakyBucketConfig {
     pub max: f64,
 }
 
-pub struct LeakyBucketState {
-    filled: f64,
-    time: Instant,
-}
-
 #[cfg(test)]
 impl LeakyBucketConfig {
     pub(crate) fn new(rps: f64, max: f64) -> Self {
@@ -82,40 +78,9 @@ impl LeakyBucketConfig {
     }
 }
 
-impl LeakyBucketState {
-    pub(crate) fn new() -> Self {
-        Self {
-            filled: 0.0,
-            time: Instant::now(),
-        }
-    }
-
-    /// updates the timer and returns true if the bucket is empty
-    fn update(&mut self, info: &LeakyBucketConfig, now: Instant) -> bool {
-        let drain = now.duration_since(self.time);
-        let drain = drain.as_secs_f64() * info.rps;
-
-        self.filled = (self.filled - drain).clamp(0.0, info.max);
-        self.time = now;
-
-        self.filled == 0.0
-    }
-
-    pub(crate) fn check(&mut self, info: &LeakyBucketConfig, now: Instant, n: f64) -> bool {
-        self.update(info, now);
-
-        if self.filled + n > info.max {
-            return false;
-        }
-        self.filled += n;
-
-        true
-    }
-}
-
-impl Default for LeakyBucketState {
-    fn default() -> Self {
-        Self::new()
+impl From<LeakyBucketConfig> for utils::leaky_bucket::LeakyBucketConfig {
+    fn from(config: LeakyBucketConfig) -> Self {
+        utils::leaky_bucket::LeakyBucketConfig::new(config.rps, config.max)
     }
 }
 
@@ -125,48 +90,55 @@ mod tests {
     use std::time::Duration;
 
     use tokio::time::Instant;
+    use utils::leaky_bucket::LeakyBucketState;
 
-    use super::{LeakyBucketConfig, LeakyBucketState};
+    use super::LeakyBucketConfig;
 
     #[tokio::test(start_paused = true)]
     async fn check() {
-        let info = LeakyBucketConfig::new(500.0, 2000.0);
-        let mut bucket = LeakyBucketState::new();
+        let config: utils::leaky_bucket::LeakyBucketConfig =
+            LeakyBucketConfig::new(500.0, 2000.0).into();
+        assert_eq!(config.cost, Duration::from_millis(2));
+        assert_eq!(config.bucket_width, Duration::from_secs(4));
+
+        let mut bucket = LeakyBucketState {
+            empty_at: Instant::now(),
+        };
 
         // should work for 2000 requests this second
         for _ in 0..2000 {
-            assert!(bucket.check(&info, Instant::now(), 1.0));
+            bucket.add_tokens(&config, Instant::now(), 1.0).unwrap();
         }
-        assert!(!bucket.check(&info, Instant::now(), 1.0));
-        assert_eq!(bucket.filled, 2000.0);
+        bucket.add_tokens(&config, Instant::now(), 1.0).unwrap_err();
+        assert_eq!(bucket.empty_at - Instant::now(), config.bucket_width);
 
         // in 1ms we should drain 0.5 tokens.
         // make sure we don't lose any tokens
         tokio::time::advance(Duration::from_millis(1)).await;
-        assert!(!bucket.check(&info, Instant::now(), 1.0));
+        bucket.add_tokens(&config, Instant::now(), 1.0).unwrap_err();
         tokio::time::advance(Duration::from_millis(1)).await;
-        assert!(bucket.check(&info, Instant::now(), 1.0));
+        bucket.add_tokens(&config, Instant::now(), 1.0).unwrap();
 
         // in 10ms we should drain 5 tokens
         tokio::time::advance(Duration::from_millis(10)).await;
         for _ in 0..5 {
-            assert!(bucket.check(&info, Instant::now(), 1.0));
+            bucket.add_tokens(&config, Instant::now(), 1.0).unwrap();
         }
-        assert!(!bucket.check(&info, Instant::now(), 1.0));
+        bucket.add_tokens(&config, Instant::now(), 1.0).unwrap_err();
 
         // in 10s we should drain 5000 tokens
         // but cap is only 2000
         tokio::time::advance(Duration::from_secs(10)).await;
         for _ in 0..2000 {
-            assert!(bucket.check(&info, Instant::now(), 1.0));
+            bucket.add_tokens(&config, Instant::now(), 1.0).unwrap();
         }
-        assert!(!bucket.check(&info, Instant::now(), 1.0));
+        bucket.add_tokens(&config, Instant::now(), 1.0).unwrap_err();
 
         // should sustain 500rps
         for _ in 0..2000 {
             tokio::time::advance(Duration::from_millis(10)).await;
             for _ in 0..5 {
-                assert!(bucket.check(&info, Instant::now(), 1.0));
+                bucket.add_tokens(&config, Instant::now(), 1.0).unwrap();
             }
         }
     }
diff --git a/test_runner/regress/test_attach_tenant_config.py b/test_runner/regress/test_attach_tenant_config.py
index a7eda73d4c..bb337d9cc1 100644
--- a/test_runner/regress/test_attach_tenant_config.py
+++ b/test_runner/regress/test_attach_tenant_config.py
@@ -162,7 +162,6 @@ def test_fully_custom_config(positive_env: NeonEnv):
         "min_resident_size_override": 23,
         "timeline_get_throttle": {
             "task_kinds": ["PageRequestHandler"],
-            "fair": True,
             "initial": 0,
             "refill_interval": "1s",
             "refill_amount": 1000,
diff --git a/test_runner/regress/test_pageserver_getpage_throttle.py b/test_runner/regress/test_pageserver_getpage_throttle.py
index 111285b40c..4c9eac5cd7 100644
--- a/test_runner/regress/test_pageserver_getpage_throttle.py
+++ b/test_runner/regress/test_pageserver_getpage_throttle.py
@@ -1,3 +1,4 @@
+import copy
 import json
 import uuid
 
@@ -116,3 +117,58 @@ def test_pageserver_getpage_throttle(neon_env_builder: NeonEnvBuilder, pg_bin: P
     assert (
         duration_secs >= 10 * actual_smgr_query_seconds
     ), "smgr metrics should not include throttle wait time"
+
+
+throttle_config_with_field_fair_set = {
+    "task_kinds": ["PageRequestHandler"],
+    "fair": True,
+    "initial": 27,
+    "refill_interval": "43s",
+    "refill_amount": 23,
+    "max": 42,
+}
+
+
+def assert_throttle_config_with_field_fair_set(conf):
+    """
+    Field `fair` is ignored, so, responses don't contain it
+    """
+    without_fair = copy.deepcopy(throttle_config_with_field_fair_set)
+    without_fair.pop("fair")
+
+    assert conf == without_fair
+
+
+def test_throttle_fair_config_is_settable_but_ignored_in_mgmt_api(neon_env_builder: NeonEnvBuilder):
+    """
+    To be removed after https://github.com/neondatabase/neon/pull/8539 is rolled out.
+    """
+    env = neon_env_builder.init_start()
+    ps_http = env.pageserver.http_client()
+    # with_fair config should still be settable
+    ps_http.set_tenant_config(
+        env.initial_tenant,
+        {"timeline_get_throttle": throttle_config_with_field_fair_set},
+    )
+    conf = ps_http.tenant_config(env.initial_tenant)
+    assert_throttle_config_with_field_fair_set(conf.effective_config["timeline_get_throttle"])
+    assert_throttle_config_with_field_fair_set(
+        conf.tenant_specific_overrides["timeline_get_throttle"]
+    )
+
+
+def test_throttle_fair_config_is_settable_but_ignored_in_config_toml(
+    neon_env_builder: NeonEnvBuilder,
+):
+    """
+    To be removed after https://github.com/neondatabase/neon/pull/8539 is rolled out.
+    """
+
+    def set_tenant_config(ps_cfg):
+        ps_cfg["tenant_config"] = {"timeline_get_throttle": throttle_config_with_field_fair_set}
+
+    neon_env_builder.pageserver_config_override = set_tenant_config
+    env = neon_env_builder.init_start()
+    ps_http = env.pageserver.http_client()
+    conf = ps_http.tenant_config(env.initial_tenant)
+    assert_throttle_config_with_field_fair_set(conf.effective_config["timeline_get_throttle"])

From c7481402a0654f919faeb633d8c07ba17607d2f5 Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Thu, 29 Aug 2024 14:02:27 +0200
Subject: [PATCH 054/159] pageserver: default to 4MiB stack size and add env
 var to control it (#8862)

# Motivation

In https://github.com/neondatabase/neon/pull/8832 I get tokio runtime
worker stack overflow errors in debug builds.

In a similar vein, I had tokio runtimer worker stack overflow when
trying to eliminate `async_trait`
(https://github.com/neondatabase/neon/pull/8296).

The 2MiB default is kind of arbitrary - so this PR bumps it to 4MiB.

It also adds an env var to control it.

# Risk Assessment

With our 4 runtimes, the worst case stack memory usage is `4 (runtimes)
* ($num_cpus (executor threads) + 512 (blocking pool threads)) * 4MiB`.

On i3en.3xlarge, that's `8384 MiB`.
On im4gn.2xlarge, that's `8320 MiB`.
Before this change, it was half that.

Looking at production metrics, we _do_ have the headroom to accomodate
this worst case case.

# Alternatives

The problems only occur with debug builds, so technically we could only
raise the stack size for debug builds.

However, it would be another configuration where `debug != release`.

# Future Work

If we ever enable single runtime mode in prod (=>
https://github.com/neondatabase/neon/issues/7312 ) then the worst case
will drop to 25% of its current value.

Eliminating the use of `tokio::spawn_blocking` / `tokio::fs` in favor of
`tokio-epoll-uring` (=> https://github.com/neondatabase/neon/issues/7370
) would reduce the worst case to `4 (runtimes) * $num_cpus (executor
threads) * 4 MiB`.
---
 pageserver/src/task_mgr.rs | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/pageserver/src/task_mgr.rs b/pageserver/src/task_mgr.rs
index ed9e001fd2..6a4e90dd55 100644
--- a/pageserver/src/task_mgr.rs
+++ b/pageserver/src/task_mgr.rs
@@ -146,6 +146,12 @@ impl FromStr for TokioRuntimeMode {
     }
 }
 
+static TOKIO_THREAD_STACK_SIZE: Lazy<NonZeroUsize> = Lazy::new(|| {
+    env::var("NEON_PAGESERVER_TOKIO_THREAD_STACK_SIZE")
+        // the default 2MiB are insufficent, especially in debug mode
+        .unwrap_or_else(|| NonZeroUsize::new(4 * 1024 * 1024).unwrap())
+});
+
 static ONE_RUNTIME: Lazy<Option<tokio::runtime::Runtime>> = Lazy::new(|| {
     let thread_name = "pageserver-tokio";
     let Some(mode) = env::var("NEON_PAGESERVER_USE_ONE_RUNTIME") else {
@@ -164,6 +170,7 @@ static ONE_RUNTIME: Lazy<Option<tokio::runtime::Runtime>> = Lazy::new(|| {
             tokio::runtime::Builder::new_current_thread()
                 .thread_name(thread_name)
                 .enable_all()
+                .thread_stack_size(TOKIO_THREAD_STACK_SIZE.get())
                 .build()
                 .expect("failed to create one single runtime")
         }
@@ -173,6 +180,7 @@ static ONE_RUNTIME: Lazy<Option<tokio::runtime::Runtime>> = Lazy::new(|| {
                 .thread_name(thread_name)
                 .enable_all()
                 .worker_threads(num_workers.get())
+                .thread_stack_size(TOKIO_THREAD_STACK_SIZE.get())
                 .build()
                 .expect("failed to create one multi-threaded runtime")
         }
@@ -199,6 +207,7 @@ macro_rules! pageserver_runtime {
                     .thread_name($name)
                     .worker_threads(TOKIO_WORKER_THREADS.get())
                     .enable_all()
+                    .thread_stack_size(TOKIO_THREAD_STACK_SIZE.get())
                     .build()
                     .expect(std::concat!("Failed to create runtime ", $name))
             });

From 96b5c4d33dc76583d1d52fd254a36ee47f6b312a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Arpad=20M=C3=BCller?= <arpad-m@users.noreply.github.com>
Date: Thu, 29 Aug 2024 14:54:02 +0200
Subject: [PATCH 055/159] Don't unarchive a timeline if its ancestor is
 archived (#8853)

If a timeline unarchival request comes in, give an error if the parent
timeline is archived. This prevents us from the situation of having an
archived timeline with children that are not archived.

Follow up of #8824

Part of #8088

---------

Co-authored-by: Joonas Koivunen <joonas@neon.tech>
---
 pageserver/src/http/routes.rs                |  3 +++
 pageserver/src/tenant.rs                     | 19 +++++++++++---
 pageserver/src/tenant/timeline.rs            |  5 ++++
 test_runner/regress/test_timeline_archive.py | 26 ++++++++++++++++++++
 4 files changed, 50 insertions(+), 3 deletions(-)

diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs
index cb7c2b60ef..f18f0b730c 100644
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -324,6 +324,9 @@ impl From<crate::tenant::TimelineArchivalError> for ApiError {
         match value {
             NotFound => ApiError::NotFound(anyhow::anyhow!("timeline not found").into()),
             Timeout => ApiError::Timeout("hit pageserver internal timeout".into()),
+            e @ HasArchivedParent(_) => {
+                ApiError::PreconditionFailed(e.to_string().into_boxed_str())
+            }
             HasUnarchivedChildren(children) => ApiError::PreconditionFailed(
                 format!(
                     "Cannot archive timeline which has non-archived child timelines: {children:?}"
diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index 60ab242ffc..fb30857ddf 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -509,6 +509,9 @@ pub enum TimelineArchivalError {
     #[error("Timeout")]
     Timeout,
 
+    #[error("ancestor is archived: {}", .0)]
+    HasArchivedParent(TimelineId),
+
     #[error("HasUnarchivedChildren")]
     HasUnarchivedChildren(Vec<TimelineId>),
 
@@ -524,6 +527,7 @@ impl Debug for TimelineArchivalError {
         match self {
             Self::NotFound => write!(f, "NotFound"),
             Self::Timeout => write!(f, "Timeout"),
+            Self::HasArchivedParent(p) => f.debug_tuple("HasArchivedParent").field(p).finish(),
             Self::HasUnarchivedChildren(c) => {
                 f.debug_tuple("HasUnarchivedChildren").field(c).finish()
             }
@@ -1369,11 +1373,20 @@ impl Tenant {
         let timeline = {
             let timelines = self.timelines.lock().unwrap();
 
-            let timeline = match timelines.get(&timeline_id) {
-                Some(t) => t,
-                None => return Err(TimelineArchivalError::NotFound),
+            let Some(timeline) = timelines.get(&timeline_id) else {
+                return Err(TimelineArchivalError::NotFound);
             };
 
+            if state == TimelineArchivalState::Unarchived {
+                if let Some(ancestor_timeline) = timeline.ancestor_timeline() {
+                    if ancestor_timeline.is_archived() == Some(true) {
+                        return Err(TimelineArchivalError::HasArchivedParent(
+                            ancestor_timeline.timeline_id,
+                        ));
+                    }
+                }
+            }
+
             // Ensure that there are no non-archived child timelines
             let children: Vec<TimelineId> = timelines
                 .iter()
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index 8096a0d18c..63d59e06a5 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -867,6 +867,11 @@ impl Timeline {
             .map(|ancestor| ancestor.timeline_id)
     }
 
+    /// Get the ancestor timeline
+    pub(crate) fn ancestor_timeline(&self) -> Option<&Arc<Timeline>> {
+        self.ancestor_timeline.as_ref()
+    }
+
     /// Get the bytes written since the PITR cutoff on this branch, and
     /// whether this branch's ancestor_lsn is within its parent's PITR.
     pub(crate) fn get_pitr_history_stats(&self) -> (u64, bool) {
diff --git a/test_runner/regress/test_timeline_archive.py b/test_runner/regress/test_timeline_archive.py
index b774c7c9fe..7f158ad251 100644
--- a/test_runner/regress/test_timeline_archive.py
+++ b/test_runner/regress/test_timeline_archive.py
@@ -94,3 +94,29 @@ def test_timeline_archive(neon_simple_env: NeonEnv):
         timeline_id=parent_timeline_id,
         state=TimelineArchivalState.ARCHIVED,
     )
+
+    # Test that the leaf can't be unarchived
+    with pytest.raises(
+        PageserverApiException,
+        match="ancestor is archived",
+    ) as exc:
+        assert timeline_path.exists()
+
+        ps_http.timeline_archival_config(
+            tenant_id=env.initial_tenant,
+            timeline_id=leaf_timeline_id,
+            state=TimelineArchivalState.UNARCHIVED,
+        )
+
+    # Unarchive works for the leaf if the parent gets unarchived first
+    ps_http.timeline_archival_config(
+        tenant_id=env.initial_tenant,
+        timeline_id=parent_timeline_id,
+        state=TimelineArchivalState.UNARCHIVED,
+    )
+
+    ps_http.timeline_archival_config(
+        tenant_id=env.initial_tenant,
+        timeline_id=leaf_timeline_id,
+        state=TimelineArchivalState.UNARCHIVED,
+    )

From a8fbc63be2a628297102fe1d85557f3423308117 Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Thu, 29 Aug 2024 15:06:13 +0200
Subject: [PATCH 056/159] tenant background loops: periodic log message if
 long-running iteration (#8832)

refs https://github.com/neondatabase/neon/issues/7524

Problem
-------

When browsing Pageserver logs, background loop iterations that take a
long time are hard to spot / easy to miss because they tend to not
produce any log messages unless:

- they overrun their period, but that's only one message when the
iteration completes late
- they do something that produces logs (e.g., create image layers)

Further, a slow iteration that is still running does will not
log nor bump the metrics of `warn_when_period_overrun`until _after_
it has finished. Again, that makes a still-running iteration hard to
spot.

Solution
--------

This PR adds a wrapper around the per-tenant background loops
that, while a slow iteration is ongoing, emit a log message
every $period.
---
 pageserver/src/tenant/tasks.rs | 112 ++++++++++++++++++++++++---------
 1 file changed, 83 insertions(+), 29 deletions(-)

diff --git a/pageserver/src/tenant/tasks.rs b/pageserver/src/tenant/tasks.rs
index 12f080f3c1..f5680ced90 100644
--- a/pageserver/src/tenant/tasks.rs
+++ b/pageserver/src/tenant/tasks.rs
@@ -192,20 +192,28 @@ async fn compaction_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
                 }
             }
 
-            let started_at = Instant::now();
 
-            let sleep_duration = if period == Duration::ZERO {
+
+            let sleep_duration;
+            if period == Duration::ZERO {
                 #[cfg(not(feature = "testing"))]
                 info!("automatic compaction is disabled");
                 // check again in 10 seconds, in case it's been enabled again.
-                Duration::from_secs(10)
+                sleep_duration = Duration::from_secs(10)
             } else {
+                let iteration = Iteration {
+                    started_at: Instant::now(),
+                    period,
+                    kind: BackgroundLoopKind::Compaction,
+                };
+
                 // Run compaction
-                match tenant.compaction_iteration(&cancel, &ctx).await {
+                let IterationResult { output, elapsed } = iteration.run(tenant.compaction_iteration(&cancel, &ctx)).await;
+                match output {
                     Ok(has_pending_task) => {
                         error_run_count = 0;
                         // schedule the next compaction immediately in case there is a pending compaction task
-                        if has_pending_task { Duration::ZERO } else { period }
+                        sleep_duration = if has_pending_task { Duration::ZERO } else { period };
                     }
                     Err(e) => {
                         let wait_duration = backoff::exponential_backoff_duration_seconds(
@@ -221,16 +229,14 @@ async fn compaction_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
                             &wait_duration,
                             cancel.is_cancelled(),
                         );
-                        wait_duration
+                        sleep_duration = wait_duration;
                     }
                 }
+
+                // the duration is recorded by performance tests by enabling debug in this function
+                tracing::debug!(elapsed_ms=elapsed.as_millis(), "compaction iteration complete");
             };
 
-            let elapsed = started_at.elapsed();
-            warn_when_period_overrun(elapsed, period, BackgroundLoopKind::Compaction);
-
-            // the duration is recorded by performance tests by enabling debug in this function
-            tracing::debug!(elapsed_ms=elapsed.as_millis(), "compaction iteration complete");
 
             // Perhaps we did no work and the walredo process has been idle for some time:
             // give it a chance to shut down to avoid leaving walredo process running indefinitely.
@@ -368,23 +374,27 @@ async fn gc_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
                 }
             }
 
-            let started_at = Instant::now();
-
             let gc_horizon = tenant.get_gc_horizon();
-            let sleep_duration = if period == Duration::ZERO || gc_horizon == 0 {
+            let sleep_duration;
+            if period == Duration::ZERO || gc_horizon == 0 {
                 #[cfg(not(feature = "testing"))]
                 info!("automatic GC is disabled");
                 // check again in 10 seconds, in case it's been enabled again.
-                Duration::from_secs(10)
+                sleep_duration = Duration::from_secs(10);
             } else {
+                let iteration = Iteration {
+                    started_at: Instant::now(),
+                    period,
+                    kind: BackgroundLoopKind::Gc,
+                };
                 // Run gc
-                let res = tenant
-                    .gc_iteration(None, gc_horizon, tenant.get_pitr_interval(), &cancel, &ctx)
+                let IterationResult { output, elapsed: _ } =
+                    iteration.run(tenant.gc_iteration(None, gc_horizon, tenant.get_pitr_interval(), &cancel, &ctx))
                     .await;
-                match res {
+                match output {
                     Ok(_) => {
                         error_run_count = 0;
-                        period
+                        sleep_duration = period;
                     }
                     Err(crate::tenant::GcError::TenantCancelled) => {
                         return;
@@ -408,13 +418,11 @@ async fn gc_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
                             error!("Gc failed {error_run_count} times, retrying in {wait_duration:?}: {e:?}");
                         }
 
-                        wait_duration
+                        sleep_duration = wait_duration;
                     }
                 }
             };
 
-            warn_when_period_overrun(started_at.elapsed(), period, BackgroundLoopKind::Gc);
-
             if tokio::time::timeout(sleep_duration, cancel.cancelled())
                 .await
                 .is_ok()
@@ -468,14 +476,12 @@ async fn ingest_housekeeping_loop(tenant: Arc<Tenant>, cancel: CancellationToken
                 break;
             }
 
-            let started_at = Instant::now();
-            tenant.ingest_housekeeping().await;
-
-            warn_when_period_overrun(
-                started_at.elapsed(),
+            let iteration = Iteration {
+                started_at: Instant::now(),
                 period,
-                BackgroundLoopKind::IngestHouseKeeping,
-            );
+                kind: BackgroundLoopKind::IngestHouseKeeping,
+            };
+            iteration.run(tenant.ingest_housekeeping()).await;
         }
     }
     .await;
@@ -553,6 +559,54 @@ pub(crate) async fn delay_by_lease_length(
     }
 }
 
+struct Iteration {
+    started_at: Instant,
+    period: Duration,
+    kind: BackgroundLoopKind,
+}
+
+struct IterationResult<O> {
+    output: O,
+    elapsed: Duration,
+}
+
+impl Iteration {
+    #[instrument(skip_all)]
+    pub(crate) async fn run<Fut, O>(self, fut: Fut) -> IterationResult<O>
+    where
+        Fut: std::future::Future<Output = O>,
+    {
+        let Self {
+            started_at,
+            period,
+            kind,
+        } = self;
+
+        let mut fut = std::pin::pin!(fut);
+
+        // Wrap `fut` into a future that logs a message every `period` so that we get a
+        // very obvious breadcrumb in the logs _while_ a slow iteration is happening.
+        let liveness_logger = async move {
+            loop {
+                match tokio::time::timeout(period, &mut fut).await {
+                    Ok(x) => return x,
+                    Err(_) => {
+                        // info level as per the same rationale why warn_when_period_overrun is info
+                        // =>  https://github.com/neondatabase/neon/pull/5724
+                        info!("still running");
+                    }
+                }
+            }
+        };
+
+        let output = liveness_logger.await;
+
+        let elapsed = started_at.elapsed();
+        warn_when_period_overrun(elapsed, period, kind);
+
+        IterationResult { output, elapsed }
+    }
+}
 /// Attention: the `task` and `period` beocme labels of a pageserver-wide prometheus metric.
 pub(crate) fn warn_when_period_overrun(
     elapsed: Duration,

From 7ce49fe6e312d0bbfcf27fe3f41b8ad70d8725b0 Mon Sep 17 00:00:00 2001
From: Conrad Ludgate <conrad@neon.tech>
Date: Thu, 29 Aug 2024 14:20:15 +0100
Subject: [PATCH 057/159] proxy: improve test performance (#8863)

Some tests were very slow and some tests occasionally stalled. This PR
improves some test performance and replaces the custom threadpool in
order to fix the stalling of tests.
---
 proxy/src/auth/backend/jwt.rs |  69 ++++++-
 proxy/src/context/parquet.rs  |  34 ----
 proxy/src/metrics.rs          |   8 +-
 proxy/src/scram/countmin.rs   |  26 +--
 proxy/src/scram/pbkdf2.rs     |   4 +-
 proxy/src/scram/threadpool.rs | 363 +++++++++++-----------------------
 6 files changed, 199 insertions(+), 305 deletions(-)

diff --git a/proxy/src/auth/backend/jwt.rs b/proxy/src/auth/backend/jwt.rs
index e98da82053..1f44e4af5d 100644
--- a/proxy/src/auth/backend/jwt.rs
+++ b/proxy/src/auth/backend/jwt.rs
@@ -500,6 +500,7 @@ mod tests {
     use hyper1::service::service_fn;
     use hyper_util::rt::TokioIo;
     use rand::rngs::OsRng;
+    use rsa::pkcs8::DecodePrivateKey;
     use signature::Signer;
     use tokio::net::TcpListener;
 
@@ -517,8 +518,8 @@ mod tests {
         (sk, jwk)
     }
 
-    fn new_rsa_jwk(kid: String) -> (rsa::RsaPrivateKey, jose_jwk::Jwk) {
-        let sk = rsa::RsaPrivateKey::new(&mut OsRng, 2048).unwrap();
+    fn new_rsa_jwk(key: &str, kid: String) -> (rsa::RsaPrivateKey, jose_jwk::Jwk) {
+        let sk = rsa::RsaPrivateKey::from_pkcs8_pem(key).unwrap();
         let pk = sk.to_public_key().into();
         let jwk = jose_jwk::Jwk {
             key: jose_jwk::Key::Rsa(pk),
@@ -569,10 +570,70 @@ mod tests {
         format!("{payload}.{sig}")
     }
 
+    // RSA key gen is slow....
+    const RS1: &str = "-----BEGIN PRIVATE KEY-----
+MIIEvwIBADANBgkqhkiG9w0BAQEFAASCBKkwggSlAgEAAoIBAQDNuWBIWTlo+54Y
+aifpGInIrpv6LlsbI/2/2CC81Arlx4RsABORklgA9XSGwaCbHTshHsfd1S916JwA
+SpjyPQYWfqo6iAV8a4MhjIeJIkRr74prDCSzOGZvIc6VaGeCIb9clf3HSrPHm3hA
+cfLMB8/p5MgoxERPDOIn3XYoS9SEEuP7l0LkmEZMerg6W6lDjQRDny0Lb50Jky9X
+mDqnYXBhs99ranbwL5vjy0ba6OIeCWFJme5u+rv5C/P0BOYrJfGxIcEoKa8Ukw5s
+PlM+qrz9ope1eOuXMNNdyFDReNBUyaM1AwBAayU5rz57crer7K/UIofaJ42T4cMM
+nx/SWfBNAgMBAAECggEACqdpBxYn1PoC6/zDaFzu9celKEWyTiuE/qRwvZa1ocS9
+ZOJ0IPvVNud/S2NHsADJiSOQ8joSJScQvSsf1Ju4bv3MTw+wSQtAVUJz2nQ92uEi
+5/xPAkEPfP3hNvebNLAOuvrBk8qYmOPCTIQaMNrOt6wzeXkAmJ9wLuRXNCsJLHW+
+KLpf2WdgTYxqK06ZiJERFgJ2r1MsC2IgTydzjOAdEIrtMarerTLqqCpwFrk/l0cz
+1O2OAb17ZxmhuzMhjNMin81c8F2fZAGMeOjn92Jl5kUsYw/pG+0S8QKlbveR/fdP
+We2tJsgXw2zD0q7OJpp8NXS2yddrZGyysYsof983wQKBgQD2McqNJqo+eWL5zony
+UbL19loYw0M15EjhzIuzW1Jk0rPj65yQyzpJ6pqicRuWr34MvzCx+ZHM2b3jSiNu
+GES2fnC7xLIKyeRxfqsXF71xz+6UStEGRQX27r1YWEtyQVuBhvlqB+AGWP3PYAC+
+HecZecnZ+vcihJ2K3+l5O3paVQKBgQDV6vKH5h2SY9vgO8obx0P7XSS+djHhmPuU
+f8C/Fq6AuRbIA1g04pzuLU2WS9T26eIjgM173uVNg2TuqJveWzz+CAAp6nCR6l24
+DBg49lMGCWrMo4FqPG46QkUqvK8uSj42GkX/e5Rut1Gyu0209emeM6h2d2K15SvY
+9563tYSmGQKBgQDwcH5WTi20KA7e07TroJi8GKWzS3gneNUpGQBS4VxdtV4UuXXF
+/4TkzafJ/9cm2iurvUmMd6XKP9lw0mY5zp/E70WgTCBp4vUlVsU3H2tYbO+filYL
+3ntNx6nKTykX4/a/UJfj0t8as+zli+gNxNx/h+734V9dKdFG4Rl+2fTLpQKBgQCE
+qJkTEe+Q0wCOBEYICADupwqcWqwAXWDW7IrZdfVtulqYWwqecVIkmk+dPxWosc4d
+ekjz4nyNH0i+gC15LVebqdaAJ/T7aD4KXuW+nXNLMRfcJCGjgipRUruWD0EMEdqW
+rqBuGXMpXeH6VxGPgVkJVLvKC6tZZe9VM+pnvteuMQKBgQC8GaL+Lz+al4biyZBf
+JE8ekWrIotq/gfUBLP7x70+PB9bNtXtlgmTvjgYg4jiu3KR/ZIYYQ8vfVgkb6tDI
+rWGZw86Pzuoi1ppg/pYhKk9qrmCIT4HPEXbHl7ATahu2BOCIU3hybjTh2lB6LbX9
+8LMFlz1QPqSZYN/A/kOcLBfa3A==
+-----END PRIVATE KEY-----
+";
+    const RS2: &str = "-----BEGIN PRIVATE KEY-----
+MIIEvgIBADANBgkqhkiG9w0BAQEFAASCBKgwggSkAgEAAoIBAQDipm6FIKSRab3J
+HwmK18t7hp+pohllxIDUSPi7S5mIhN/JG2Plq2Lp746E/fuT8dcBF2R4sJlG2L0J
+zmxOvBU/i/sQF9s1i4CEfg05k2//gKENIEsF3pMMmrH+mcZi0TTD6rezHpdVxPHk
+qWxSyOCtIJV29X+wxPwAB59kQFHzy2ooPB1isZcpE8tO0KthAM+oZ3KuCwE0++cO
+IWLeq9aPwyKhtip/xjTMxd1kzdKh592mGSyzr9D0QSWOYFGvgJXANDdiPdhSSOLt
+ECWPNPlm2FQvGGvYYBafUqz7VumKHE6x8J6lKdYa2J0ZdDzCIo2IHzlxe+RZNgwy
+uAD2jhVxAgMBAAECggEAbsZHWBu3MzcKQiVARbLoygvnN0J5xUqAaMDtiKUPejDv
+K1yOu67DXnDuKEP2VL2rhuYG/hHaKE1AP227c9PrUq6424m9YvM2sgrlrdFIuQkG
+LeMtp8W7+zoUasp/ssZrUqICfLIj5xCl5UuFHQT/Ar7dLlIYwa3VOLKBDb9+Dnfe
+QH5/So4uMXG6vw34JN9jf+eAc8Yt0PeIz62ycvRwdpTJQ0MxZN9ZKpCAQp+VTuXT
+zlzNvDMilabEdqUvAyGyz8lBLNl0wdaVrqPqAEWM5U45QXsdFZknWammP7/tijeX
+0z+Bi0J0uSEU5X502zm7GArj/NNIiWMcjmDjwUUhwQKBgQD9C2GoqxOxuVPYqwYR
++Jz7f2qMjlSP8adA5Lzuh8UKXDp8JCEQC8ryweLzaOKS9C5MAw+W4W2wd4nJoQI1
+P1dgGvBlfvEeRHMgqWtq7FuTsjSe7e0uSEkC4ngDb4sc0QOpv15cMuEz+4+aFLPL
+x29EcHWAaBX+rkid3zpQHFU4eQKBgQDlTCEqRuXwwa3V+Sq+mNWzD9QIGtD87TH/
+FPO/Ij/cK2+GISgFDqhetiGTH4qrvPL0psPT+iH5zGFYcoFmTtwLdWQJdxhxz0bg
+iX/AceyX5e1Bm+ThT36sU83NrxKPkrdk6jNmr2iUF1OTzTwUKOYdHOPZqdMPfF4M
+4XAaWVT2uQKBgQD4nKcNdU+7LE9Rr+4d1/o8Klp/0BMK/ayK2HE7lc8kt6qKb2DA
+iCWUTqPw7Fq3cQrPia5WWhNP7pJEtFkcAaiR9sW7onW5fBz0uR+dhK0QtmR2xWJj
+N4fsOp8ZGQ0/eae0rh1CTobucLkM9EwV6VLLlgYL67e4anlUCo8bSEr+WQKBgQCB
+uf6RgqcY/RqyklPCnYlZ0zyskS9nyXKd1GbK3j+u+swP4LZZlh9f5j88k33LCA2U
+qLzmMwAB6cWxWqcnELqhqPq9+ClWSmTZKDGk2U936NfAZMirSGRsbsVi9wfTPriP
+WYlXMSpDjqb0WgsBhNob4npubQxCGKTFOM5Jufy90QKBgB0Lte1jX144uaXx6dtB
+rjXNuWNir0Jy31wHnQuCA+XnfUgPcrKmRLm8taMbXgZwxkNvgFkpUWU8aPEK08Ne
+X0n5X2/pBLJzxZc62ccvZYVnctBiFs6HbSnxpuMQCfkt/BcR/ttIepBQQIW86wHL
+5JiconnI5aLek0QVPoFaVXFa
+-----END PRIVATE KEY-----
+";
+
     #[tokio::test]
     async fn renew() {
-        let (rs1, jwk1) = new_rsa_jwk("1".into());
-        let (rs2, jwk2) = new_rsa_jwk("2".into());
+        let (rs1, jwk1) = new_rsa_jwk(RS1, "1".into());
+        let (rs2, jwk2) = new_rsa_jwk(RS2, "2".into());
         let (ec1, jwk3) = new_ec_jwk("3".into());
         let (ec2, jwk4) = new_ec_jwk("4".into());
 
diff --git a/proxy/src/context/parquet.rs b/proxy/src/context/parquet.rs
index 88caa9a316..c6f83fd069 100644
--- a/proxy/src/context/parquet.rs
+++ b/proxy/src/context/parquet.rs
@@ -613,40 +613,6 @@ mod tests {
         tmpdir.close().unwrap();
     }
 
-    #[tokio::test]
-    async fn verify_parquet_min_compression() {
-        let tmpdir = camino_tempfile::tempdir().unwrap();
-
-        let config = ParquetConfig {
-            propeties: Arc::new(
-                WriterProperties::builder()
-                    .set_compression(parquet::basic::Compression::ZSTD(ZstdLevel::default()))
-                    .build(),
-            ),
-            rows_per_group: 2_000,
-            file_size: 1_000_000,
-            max_duration: time::Duration::from_secs(20 * 60),
-            test_remote_failures: 0,
-        };
-
-        let rx = random_stream(50_000);
-        let file_stats = run_test(tmpdir.path(), config, rx).await;
-
-        // with compression, there are fewer files with more rows per file
-        assert_eq!(
-            file_stats,
-            [
-                (1223214, 5, 10000),
-                (1229364, 5, 10000),
-                (1231158, 5, 10000),
-                (1230520, 5, 10000),
-                (1221798, 5, 10000)
-            ]
-        );
-
-        tmpdir.close().unwrap();
-    }
-
     #[tokio::test]
     async fn verify_parquet_strong_compression() {
         let tmpdir = camino_tempfile::tempdir().unwrap();
diff --git a/proxy/src/metrics.rs b/proxy/src/metrics.rs
index ccef88231b..2da7eac580 100644
--- a/proxy/src/metrics.rs
+++ b/proxy/src/metrics.rs
@@ -4,8 +4,8 @@ use lasso::ThreadedRodeo;
 use measured::{
     label::{FixedCardinalitySet, LabelGroupSet, LabelName, LabelSet, LabelValue, StaticLabelSet},
     metric::{histogram::Thresholds, name::MetricName},
-    Counter, CounterVec, FixedCardinalityLabel, Gauge, GaugeVec, Histogram, HistogramVec,
-    LabelGroup, MetricGroup,
+    Counter, CounterVec, FixedCardinalityLabel, Gauge, Histogram, HistogramVec, LabelGroup,
+    MetricGroup,
 };
 use metrics::{CounterPairAssoc, CounterPairVec, HyperLogLog, HyperLogLogVec};
 
@@ -548,6 +548,7 @@ pub enum RedisEventsCount {
 }
 
 pub struct ThreadPoolWorkers(usize);
+#[derive(Copy, Clone)]
 pub struct ThreadPoolWorkerId(pub usize);
 
 impl LabelValue for ThreadPoolWorkerId {
@@ -613,9 +614,6 @@ impl FixedCardinalitySet for ThreadPoolWorkers {
 #[derive(MetricGroup)]
 #[metric(new(workers: usize))]
 pub struct ThreadPoolMetrics {
-    pub injector_queue_depth: Gauge,
-    #[metric(init = GaugeVec::with_label_set(ThreadPoolWorkers(workers)))]
-    pub worker_queue_depth: GaugeVec<ThreadPoolWorkers>,
     #[metric(init = CounterVec::with_label_set(ThreadPoolWorkers(workers)))]
     pub worker_task_turns_total: CounterVec<ThreadPoolWorkers>,
     #[metric(init = CounterVec::with_label_set(ThreadPoolWorkers(workers)))]
diff --git a/proxy/src/scram/countmin.rs b/proxy/src/scram/countmin.rs
index 255694b33e..64ee0135e1 100644
--- a/proxy/src/scram/countmin.rs
+++ b/proxy/src/scram/countmin.rs
@@ -83,10 +83,10 @@ mod tests {
         let mut ids = vec![];
 
         for _ in 0..n {
-            // number of insert operations
-            let n = rng.gen_range(1..100);
             // number to insert at once
-            let m = rng.gen_range(1..4096);
+            let n = rng.gen_range(1..4096);
+            // number of insert operations
+            let m = rng.gen_range(1..100);
 
             let id = uuid::Builder::from_random_bytes(rng.gen()).into_uuid();
             ids.push((id, n, m));
@@ -102,17 +102,11 @@ mod tests {
         let mut ids2 = ids.clone();
         while !ids2.is_empty() {
             ids2.shuffle(&mut rng);
-
-            let mut i = 0;
-            while i < ids2.len() {
-                sketch.inc_and_return(&ids2[i].0, ids2[i].1);
-                ids2[i].2 -= 1;
-                if ids2[i].2 == 0 {
-                    ids2.remove(i);
-                } else {
-                    i += 1;
-                }
-            }
+            ids2.retain_mut(|id| {
+                sketch.inc_and_return(&id.0, id.1);
+                id.2 -= 1;
+                id.2 > 0
+            });
         }
 
         let mut within_p = 0;
@@ -144,8 +138,8 @@ mod tests {
         // probably numbers are too small to truly represent the probabilities.
         assert_eq!(eval_precision(100, 4096.0, 0.90), 100);
         assert_eq!(eval_precision(1000, 4096.0, 0.90), 1000);
-        assert_eq!(eval_precision(100, 4096.0, 0.1), 98);
-        assert_eq!(eval_precision(1000, 4096.0, 0.1), 991);
+        assert_eq!(eval_precision(100, 4096.0, 0.1), 96);
+        assert_eq!(eval_precision(1000, 4096.0, 0.1), 988);
     }
 
     // returns memory usage in bytes, and the time complexity per insert.
diff --git a/proxy/src/scram/pbkdf2.rs b/proxy/src/scram/pbkdf2.rs
index d5ed9002ad..4cf76c8452 100644
--- a/proxy/src/scram/pbkdf2.rs
+++ b/proxy/src/scram/pbkdf2.rs
@@ -75,7 +75,7 @@ mod tests {
         let salt = b"sodium chloride";
         let pass = b"Ne0n_!5_50_C007";
 
-        let mut job = Pbkdf2::start(pass, salt, 600000);
+        let mut job = Pbkdf2::start(pass, salt, 60000);
         let hash = loop {
             let std::task::Poll::Ready(hash) = job.turn() else {
                 continue;
@@ -83,7 +83,7 @@ mod tests {
             break hash;
         };
 
-        let expected = pbkdf2_hmac_array::<Sha256, 32>(pass, salt, 600000);
+        let expected = pbkdf2_hmac_array::<Sha256, 32>(pass, salt, 60000);
         assert_eq!(hash, expected);
     }
 }
diff --git a/proxy/src/scram/threadpool.rs b/proxy/src/scram/threadpool.rs
index 262c6d146e..d73a927995 100644
--- a/proxy/src/scram/threadpool.rs
+++ b/proxy/src/scram/threadpool.rs
@@ -4,17 +4,19 @@
 //! 1. Fairness per endpoint.
 //! 2. Yield support for high iteration counts.
 
-use std::sync::{
-    atomic::{AtomicU64, Ordering},
-    Arc,
+use std::{
+    cell::RefCell,
+    future::Future,
+    pin::Pin,
+    sync::{
+        atomic::{AtomicUsize, Ordering},
+        Arc, Weak,
+    },
+    task::{Context, Poll},
 };
 
-use crossbeam_deque::{Injector, Stealer, Worker};
-use itertools::Itertools;
-use parking_lot::{Condvar, Mutex};
 use rand::Rng;
 use rand::{rngs::SmallRng, SeedableRng};
-use tokio::sync::oneshot;
 
 use crate::{
     intern::EndpointIdInt,
@@ -25,273 +27,146 @@ use crate::{
 use super::pbkdf2::Pbkdf2;
 
 pub struct ThreadPool {
-    queue: Injector<JobSpec>,
-    stealers: Vec<Stealer<JobSpec>>,
-    parkers: Vec<(Condvar, Mutex<ThreadState>)>,
-    /// bitpacked representation.
-    /// lower 8 bits = number of sleeping threads
-    /// next 8 bits = number of idle threads (searching for work)
-    counters: AtomicU64,
-
+    runtime: Option<tokio::runtime::Runtime>,
     pub metrics: Arc<ThreadPoolMetrics>,
 }
 
-#[derive(PartialEq)]
-enum ThreadState {
-    Parked,
-    Active,
+/// How often to reset the sketch values
+const SKETCH_RESET_INTERVAL: u64 = 1021;
+
+thread_local! {
+    static STATE: RefCell<Option<ThreadRt>> = const { RefCell::new(None) };
 }
 
 impl ThreadPool {
     pub fn new(n_workers: u8) -> Arc<Self> {
-        let workers = (0..n_workers).map(|_| Worker::new_fifo()).collect_vec();
-        let stealers = workers.iter().map(|w| w.stealer()).collect_vec();
+        // rayon would be nice here, but yielding in rayon does not work well afaict.
 
-        let parkers = (0..n_workers)
-            .map(|_| (Condvar::new(), Mutex::new(ThreadState::Active)))
-            .collect_vec();
+        Arc::new_cyclic(|pool| {
+            let pool = pool.clone();
+            let worker_id = AtomicUsize::new(0);
 
-        let pool = Arc::new(Self {
-            queue: Injector::new(),
-            stealers,
-            parkers,
-            // threads start searching for work
-            counters: AtomicU64::new((n_workers as u64) << 8),
-            metrics: Arc::new(ThreadPoolMetrics::new(n_workers as usize)),
-        });
+            let runtime = tokio::runtime::Builder::new_multi_thread()
+                .worker_threads(n_workers as usize)
+                .on_thread_start(move || {
+                    STATE.with_borrow_mut(|state| {
+                        *state = Some(ThreadRt {
+                            pool: pool.clone(),
+                            id: ThreadPoolWorkerId(worker_id.fetch_add(1, Ordering::Relaxed)),
+                            rng: SmallRng::from_entropy(),
+                            // used to determine whether we should temporarily skip tasks for fairness.
+                            // 99% of estimates will overcount by no more than 4096 samples
+                            countmin: CountMinSketch::with_params(
+                                1.0 / (SKETCH_RESET_INTERVAL as f64),
+                                0.01,
+                            ),
+                            tick: 0,
+                        });
+                    });
+                })
+                .build()
+                .unwrap();
 
-        for (i, worker) in workers.into_iter().enumerate() {
-            let pool = Arc::clone(&pool);
-            std::thread::spawn(move || thread_rt(pool, worker, i));
-        }
-
-        pool
+            Self {
+                runtime: Some(runtime),
+                metrics: Arc::new(ThreadPoolMetrics::new(n_workers as usize)),
+            }
+        })
     }
 
     pub(crate) fn spawn_job(
         &self,
         endpoint: EndpointIdInt,
         pbkdf2: Pbkdf2,
-    ) -> oneshot::Receiver<[u8; 32]> {
-        let (tx, rx) = oneshot::channel();
-
-        let queue_was_empty = self.queue.is_empty();
-
-        self.metrics.injector_queue_depth.inc();
-        self.queue.push(JobSpec {
-            response: tx,
-            pbkdf2,
-            endpoint,
-        });
-
-        // inspired from <https://github.com/rayon-rs/rayon/blob/3e3962cb8f7b50773bcc360b48a7a674a53a2c77/rayon-core/src/sleep/mod.rs#L242>
-        let counts = self.counters.load(Ordering::SeqCst);
-        let num_awake_but_idle = (counts >> 8) & 0xff;
-        let num_sleepers = counts & 0xff;
-
-        // If the queue is non-empty, then we always wake up a worker
-        // -- clearly the existing idle jobs aren't enough. Otherwise,
-        // check to see if we have enough idle workers.
-        if !queue_was_empty || num_awake_but_idle == 0 {
-            let num_to_wake = Ord::min(1, num_sleepers);
-            self.wake_any_threads(num_to_wake);
-        }
-
-        rx
-    }
-
-    #[cold]
-    fn wake_any_threads(&self, mut num_to_wake: u64) {
-        if num_to_wake > 0 {
-            for i in 0..self.parkers.len() {
-                if self.wake_specific_thread(i) {
-                    num_to_wake -= 1;
-                    if num_to_wake == 0 {
-                        return;
-                    }
-                }
-            }
-        }
-    }
-
-    fn wake_specific_thread(&self, index: usize) -> bool {
-        let (condvar, lock) = &self.parkers[index];
-
-        let mut state = lock.lock();
-        if *state == ThreadState::Parked {
-            condvar.notify_one();
-
-            // When the thread went to sleep, it will have incremented
-            // this value. When we wake it, its our job to decrement
-            // it. We could have the thread do it, but that would
-            // introduce a delay between when the thread was
-            // *notified* and when this counter was decremented. That
-            // might mislead people with new work into thinking that
-            // there are sleeping threads that they should try to
-            // wake, when in fact there is nothing left for them to
-            // do.
-            self.counters.fetch_sub(1, Ordering::SeqCst);
-            *state = ThreadState::Active;
-
-            true
-        } else {
-            false
-        }
-    }
-
-    fn steal(&self, rng: &mut impl Rng, skip: usize, worker: &Worker<JobSpec>) -> Option<JobSpec> {
-        // announce thread as idle
-        self.counters.fetch_add(256, Ordering::SeqCst);
-
-        // try steal from the global queue
-        loop {
-            match self.queue.steal_batch_and_pop(worker) {
-                crossbeam_deque::Steal::Success(job) => {
-                    self.metrics
-                        .injector_queue_depth
-                        .set(self.queue.len() as i64);
-                    // no longer idle
-                    self.counters.fetch_sub(256, Ordering::SeqCst);
-                    return Some(job);
-                }
-                crossbeam_deque::Steal::Retry => continue,
-                crossbeam_deque::Steal::Empty => break,
-            }
-        }
-
-        // try steal from our neighbours
-        loop {
-            let mut retry = false;
-            let start = rng.gen_range(0..self.stealers.len());
-            let job = (start..self.stealers.len())
-                .chain(0..start)
-                .filter(|i| *i != skip)
-                .find_map(
-                    |victim| match self.stealers[victim].steal_batch_and_pop(worker) {
-                        crossbeam_deque::Steal::Success(job) => Some(job),
-                        crossbeam_deque::Steal::Empty => None,
-                        crossbeam_deque::Steal::Retry => {
-                            retry = true;
-                            None
-                        }
-                    },
-                );
-            if job.is_some() {
-                // no longer idle
-                self.counters.fetch_sub(256, Ordering::SeqCst);
-                return job;
-            }
-            if !retry {
-                return None;
-            }
-        }
+    ) -> tokio::task::JoinHandle<[u8; 32]> {
+        self.runtime
+            .as_ref()
+            .unwrap()
+            .spawn(JobSpec { pbkdf2, endpoint })
     }
 }
 
-fn thread_rt(pool: Arc<ThreadPool>, worker: Worker<JobSpec>, index: usize) {
-    /// interval when we should steal from the global queue
-    /// so that tail latencies are managed appropriately
-    const STEAL_INTERVAL: usize = 61;
+impl Drop for ThreadPool {
+    fn drop(&mut self) {
+        self.runtime.take().unwrap().shutdown_background();
+    }
+}
 
-    /// How often to reset the sketch values
-    const SKETCH_RESET_INTERVAL: usize = 1021;
+struct ThreadRt {
+    pool: Weak<ThreadPool>,
+    id: ThreadPoolWorkerId,
+    rng: SmallRng,
+    countmin: CountMinSketch,
+    tick: u64,
+}
 
-    let mut rng = SmallRng::from_entropy();
+impl ThreadRt {
+    fn should_run(&mut self, job: &JobSpec) -> bool {
+        let rate = self
+            .countmin
+            .inc_and_return(&job.endpoint, job.pbkdf2.cost());
 
-    // used to determine whether we should temporarily skip tasks for fairness.
-    // 99% of estimates will overcount by no more than 4096 samples
-    let mut sketch = CountMinSketch::with_params(1.0 / (SKETCH_RESET_INTERVAL as f64), 0.01);
-
-    let (condvar, lock) = &pool.parkers[index];
-
-    'wait: loop {
-        // wait for notification of work
-        {
-            let mut lock = lock.lock();
-
-            // queue is empty
-            pool.metrics
-                .worker_queue_depth
-                .set(ThreadPoolWorkerId(index), 0);
-
-            // subtract 1 from idle count, add 1 to sleeping count.
-            pool.counters.fetch_sub(255, Ordering::SeqCst);
-
-            *lock = ThreadState::Parked;
-            condvar.wait(&mut lock);
-        }
-
-        for i in 0.. {
-            let Some(mut job) = worker
-                .pop()
-                .or_else(|| pool.steal(&mut rng, index, &worker))
-            else {
-                continue 'wait;
-            };
-
-            pool.metrics
-                .worker_queue_depth
-                .set(ThreadPoolWorkerId(index), worker.len() as i64);
-
-            // receiver is closed, cancel the task
-            if !job.response.is_closed() {
-                let rate = sketch.inc_and_return(&job.endpoint, job.pbkdf2.cost());
-
-                const P: f64 = 2000.0;
-                // probability decreases as rate increases.
-                // lower probability, higher chance of being skipped
-                //
-                // estimates (rate in terms of 4096 rounds):
-                // rate = 0    => probability = 100%
-                // rate = 10   => probability = 71.3%
-                // rate = 50   => probability = 62.1%
-                // rate = 500  => probability = 52.3%
-                // rate = 1021 => probability = 49.8%
-                //
-                // My expectation is that the pool queue will only begin backing up at ~1000rps
-                // in which case the SKETCH_RESET_INTERVAL represents 1 second. Thus, the rates above
-                // are in requests per second.
-                let probability = P.ln() / (P + rate as f64).ln();
-                if pool.queue.len() > 32 || rng.gen_bool(probability) {
-                    pool.metrics
-                        .worker_task_turns_total
-                        .inc(ThreadPoolWorkerId(index));
-
-                    match job.pbkdf2.turn() {
-                        std::task::Poll::Ready(result) => {
-                            let _ = job.response.send(result);
-                        }
-                        std::task::Poll::Pending => worker.push(job),
-                    }
-                } else {
-                    pool.metrics
-                        .worker_task_skips_total
-                        .inc(ThreadPoolWorkerId(index));
-
-                    // skip for now
-                    worker.push(job);
-                }
-            }
-
-            // if we get stuck with a few long lived jobs in the queue
-            // it's better to try and steal from the queue too for fairness
-            if i % STEAL_INTERVAL == 0 {
-                let _ = pool.queue.steal_batch(&worker);
-            }
-
-            if i % SKETCH_RESET_INTERVAL == 0 {
-                sketch.reset();
-            }
-        }
+        const P: f64 = 2000.0;
+        // probability decreases as rate increases.
+        // lower probability, higher chance of being skipped
+        //
+        // estimates (rate in terms of 4096 rounds):
+        // rate = 0    => probability = 100%
+        // rate = 10   => probability = 71.3%
+        // rate = 50   => probability = 62.1%
+        // rate = 500  => probability = 52.3%
+        // rate = 1021 => probability = 49.8%
+        //
+        // My expectation is that the pool queue will only begin backing up at ~1000rps
+        // in which case the SKETCH_RESET_INTERVAL represents 1 second. Thus, the rates above
+        // are in requests per second.
+        let probability = P.ln() / (P + rate as f64).ln();
+        self.rng.gen_bool(probability)
     }
 }
 
 struct JobSpec {
-    response: oneshot::Sender<[u8; 32]>,
     pbkdf2: Pbkdf2,
     endpoint: EndpointIdInt,
 }
 
+impl Future for JobSpec {
+    type Output = [u8; 32];
+
+    fn poll(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Self::Output> {
+        STATE.with_borrow_mut(|state| {
+            let state = state.as_mut().expect("should be set on thread startup");
+
+            state.tick = state.tick.wrapping_add(1);
+            if state.tick % SKETCH_RESET_INTERVAL == 0 {
+                state.countmin.reset();
+            }
+
+            if state.should_run(&self) {
+                if let Some(pool) = state.pool.upgrade() {
+                    pool.metrics.worker_task_turns_total.inc(state.id);
+                }
+
+                match self.pbkdf2.turn() {
+                    Poll::Ready(result) => Poll::Ready(result),
+                    // more to do, we shall requeue
+                    Poll::Pending => {
+                        cx.waker().wake_by_ref();
+                        Poll::Pending
+                    }
+                }
+            } else {
+                if let Some(pool) = state.pool.upgrade() {
+                    pool.metrics.worker_task_skips_total.inc(state.id);
+                }
+
+                cx.waker().wake_by_ref();
+                Poll::Pending
+            }
+        })
+    }
+}
+
 #[cfg(test)]
 mod tests {
     use crate::EndpointId;

From 18bfc43fa706fc6e550d29c539f30c7e5deb1d2b Mon Sep 17 00:00:00 2001
From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com>
Date: Thu, 29 Aug 2024 22:01:54 +0800
Subject: [PATCH 058/159] fix(pageserver): add dry-run to force compact API
 (#8859)

Add `dry-run` flag to the compact API

Signed-off-by: Alex Chi Z <chi@neon.tech>
---
 pageserver/src/http/routes.rs | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs
index f18f0b730c..8cf2c99c09 100644
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -1733,6 +1733,10 @@ async fn timeline_compact_handler(
     if Some(true) == parse_query_param::<_, bool>(&request, "enhanced_gc_bottom_most_compaction")? {
         flags |= CompactFlags::EnhancedGcBottomMostCompaction;
     }
+    if Some(true) == parse_query_param::<_, bool>(&request, "dry_run")? {
+        flags |= CompactFlags::DryRun;
+    }
+
     let wait_until_uploaded =
         parse_query_param::<_, bool>(&request, "wait_until_uploaded")?.unwrap_or(false);
 

From 653a6532a229038683256b08bc6ab5c1b270f52a Mon Sep 17 00:00:00 2001
From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com>
Date: Thu, 29 Aug 2024 22:07:05 +0800
Subject: [PATCH 059/159] fix(pageserver): reject non-i128 key on the write
 path (#8648)

It's better to reject invalid keys on the write path than storing it and
panic-ing the pageserver.
https://github.com/neondatabase/neon/issues/8636

## Summary of changes

If a key cannot be represented using i128, we don't allow writing that
key into the pageserver.

There are two versions of the check valid function: the normal one that
simply rejects i128 keys, and the stronger one that rejects all keys
that we don't support.

The current behavior when a key gets rejected is that safekeeper will
keep retrying streaming that key to the pageserver. And once such key
gets written, no new computes can be started. Therefore, there could be
a large amount of pageserver warnings if a key cannot be ingested. To
validate this behavior by yourself, the reviewer can (1) use the
stronger version of the valid check (2) run the following SQL.

```
set neon.regress_test_mode = true;
CREATE TABLESPACE regress_tblspace LOCATION '/Users/skyzh/Work/neon-test/tablespace';
CREATE SCHEMA testschema;
CREATE TABLE testschema.foo (i int) TABLESPACE regress_tblspace;
insert into testschema.foo values (1), (2), (3);
```

For now, I'd like to merge the patch with only rejecting non-i128 keys.
It's still unknown whether the stronger version covers all the cases
that basebackup doesn't support. Furthermore, the behavior of rejecting
a key will produce large amounts of warnings due to safekeeper retry.
Therefore, I'd like to reject the minimum set of keys that we don't
support (i128 ones) for now. (well, erroring out is better than panic on
`to_compact_key`)

The next step is to fix the safekeeper behavior (i.e., on such key
rejections, stop streaming WAL), so that we can properly stop writing.
An alternative solution is to simply drop these keys on the write path.

---------

Signed-off-by: Alex Chi Z <chi@neon.tech>
---
 libs/pageserver_api/src/key.rs      | 35 +++++++++++++++++++++++++----
 pageserver/ctl/src/layers.rs        |  1 +
 pageserver/src/pgdatadir_mapping.rs | 14 +++++++++---
 pageserver/src/tenant/timeline.rs   |  6 +++++
 4 files changed, 49 insertions(+), 7 deletions(-)

diff --git a/libs/pageserver_api/src/key.rs b/libs/pageserver_api/src/key.rs
index 77da58d63e..77d744e4da 100644
--- a/libs/pageserver_api/src/key.rs
+++ b/libs/pageserver_api/src/key.rs
@@ -108,14 +108,41 @@ impl Key {
         }
     }
 
+    /// This function checks more extensively what keys we can take on the write path.
+    /// If a key beginning with 00 does not have a global/default tablespace OID, it
+    /// will be rejected on the write path.
+    #[allow(dead_code)]
+    pub fn is_valid_key_on_write_path_strong(&self) -> bool {
+        use postgres_ffi::pg_constants::{DEFAULTTABLESPACE_OID, GLOBALTABLESPACE_OID};
+        if !self.is_i128_representable() {
+            return false;
+        }
+        if self.field1 == 0
+            && !(self.field2 == GLOBALTABLESPACE_OID
+                || self.field2 == DEFAULTTABLESPACE_OID
+                || self.field2 == 0)
+        {
+            return false; // User defined tablespaces are not supported
+        }
+        true
+    }
+
+    /// This is a weaker version of `is_valid_key_on_write_path_strong` that simply
+    /// checks if the key is i128 representable. Note that some keys can be successfully
+    /// ingested into the pageserver, but will cause errors on generating basebackup.
+    pub fn is_valid_key_on_write_path(&self) -> bool {
+        self.is_i128_representable()
+    }
+
+    pub fn is_i128_representable(&self) -> bool {
+        self.field2 <= 0xFFFF || self.field2 == 0xFFFFFFFF || self.field2 == 0x22222222
+    }
+
     /// 'field2' is used to store tablespaceid for relations and small enum numbers for other relish.
     /// As long as Neon does not support tablespace (because of lack of access to local file system),
     /// we can assume that only some predefined namespace OIDs are used which can fit in u16
     pub fn to_i128(&self) -> i128 {
-        assert!(
-            self.field2 <= 0xFFFF || self.field2 == 0xFFFFFFFF || self.field2 == 0x22222222,
-            "invalid key: {self}",
-        );
+        assert!(self.is_i128_representable(), "invalid key: {self}");
         (((self.field1 & 0x7F) as i128) << 120)
             | (((self.field2 & 0xFFFF) as i128) << 104)
             | ((self.field3 as i128) << 72)
diff --git a/pageserver/ctl/src/layers.rs b/pageserver/ctl/src/layers.rs
index a183a3968d..e0f978eaa2 100644
--- a/pageserver/ctl/src/layers.rs
+++ b/pageserver/ctl/src/layers.rs
@@ -90,6 +90,7 @@ async fn read_delta_file(path: impl AsRef<Path>, ctx: &RequestContext) -> Result
     for (k, v) in all {
         let value = cursor.read_blob(v.pos(), ctx).await?;
         println!("key:{} value_len:{}", k, value.len());
+        assert!(k.is_i128_representable(), "invalid key: ");
     }
     // TODO(chi): special handling for last key?
     Ok(())
diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs
index b7110d69b6..edcbac970b 100644
--- a/pageserver/src/pgdatadir_mapping.rs
+++ b/pageserver/src/pgdatadir_mapping.rs
@@ -12,7 +12,7 @@ use crate::keyspace::{KeySpace, KeySpaceAccum};
 use crate::span::debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id;
 use crate::walrecord::NeonWalRecord;
 use crate::{aux_file, repository::*};
-use anyhow::{ensure, Context};
+use anyhow::{bail, ensure, Context};
 use bytes::{Buf, Bytes, BytesMut};
 use enum_map::Enum;
 use pageserver_api::key::{
@@ -1791,6 +1791,11 @@ impl<'a> DatadirModification<'a> {
         // Flush relation and  SLRU data blocks, keep metadata.
         let mut retained_pending_updates = HashMap::<_, Vec<_>>::new();
         for (key, values) in self.pending_updates.drain() {
+            if !key.is_valid_key_on_write_path() {
+                bail!(
+                    "the request contains data not supported by pageserver at TimelineWriter::put: {}", key
+                );
+            }
             let mut write_batch = Vec::new();
             for (lsn, value_ser_size, value) in values {
                 if key.is_rel_block_key() || key.is_slru_block_key() {
@@ -1843,10 +1848,13 @@ impl<'a> DatadirModification<'a> {
                 .drain()
                 .flat_map(|(key, values)| {
                     values.into_iter().map(move |(lsn, val_ser_size, value)| {
-                        (key.to_compact(), lsn, val_ser_size, value)
+                        if !key.is_valid_key_on_write_path() {
+                            bail!("the request contains data not supported by pageserver at TimelineWriter::put: {}", key);
+                        }
+                        Ok((key.to_compact(), lsn, val_ser_size, value))
                     })
                 })
-                .collect::<Vec<_>>();
+                .collect::<anyhow::Result<Vec<_>>>()?;
 
             writer.put_batch(batch, ctx).await?;
         }
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index 63d59e06a5..35e0825bac 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -5746,6 +5746,12 @@ impl<'a> TimelineWriter<'a> {
         ctx: &RequestContext,
     ) -> anyhow::Result<()> {
         use utils::bin_ser::BeSer;
+        if !key.is_valid_key_on_write_path() {
+            bail!(
+                "the request contains data not supported by pageserver at TimelineWriter::put: {}",
+                key
+            );
+        }
         let val_ser_size = value.serialized_size().unwrap() as usize;
         self.put_batch(
             vec![(key.to_compact(), lsn, val_ser_size, value.clone())],

From 8eaa8ad3582b28b67a927f9d40ddab74feb13713 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Arpad=20M=C3=BCller?= <arpad-m@users.noreply.github.com>
Date: Thu, 29 Aug 2024 18:24:25 +0200
Subject: [PATCH 060/159] Remove async_trait usages from safekeeper and
 neon_local (#8864)

Removes additional async_trait usages from safekeeper and neon_local.

Also removes now redundant dependencies of the `async_trait` crate.

cc earlier work: #6305, #6464, #7303, #7342, #7212, #8296
---
 Cargo.lock                                         |  4 ----
 control_plane/Cargo.toml                           |  1 -
 control_plane/src/safekeeper.rs                    |  7 +++----
 libs/utils/Cargo.toml                              |  1 -
 pageserver/client/Cargo.toml                       |  1 -
 safekeeper/src/control_file.rs                     |  5 ++---
 safekeeper/src/safekeeper.rs                       |  2 --
 safekeeper/src/wal_storage.rs                      | 14 ++++++++------
 .../tests/walproposer_sim/safekeeper_disk.rs       |  2 --
 storage_controller/client/Cargo.toml               |  1 -
 10 files changed, 13 insertions(+), 25 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index 0c246bd258..5af3ef3804 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1333,7 +1333,6 @@ name = "control_plane"
 version = "0.1.0"
 dependencies = [
  "anyhow",
- "async-trait",
  "camino",
  "clap",
  "comfy-table",
@@ -3790,7 +3789,6 @@ name = "pageserver_client"
 version = "0.1.0"
 dependencies = [
  "anyhow",
- "async-trait",
  "bytes",
  "futures",
  "pageserver_api",
@@ -5952,7 +5950,6 @@ name = "storage_controller_client"
 version = "0.1.0"
 dependencies = [
  "anyhow",
- "async-trait",
  "bytes",
  "futures",
  "pageserver_api",
@@ -6955,7 +6952,6 @@ dependencies = [
  "anyhow",
  "arc-swap",
  "async-compression",
- "async-trait",
  "bincode",
  "byteorder",
  "bytes",
diff --git a/control_plane/Cargo.toml b/control_plane/Cargo.toml
index 487ac8f047..6fca59b368 100644
--- a/control_plane/Cargo.toml
+++ b/control_plane/Cargo.toml
@@ -6,7 +6,6 @@ license.workspace = true
 
 [dependencies]
 anyhow.workspace = true
-async-trait.workspace = true
 camino.workspace = true
 clap.workspace = true
 comfy-table.workspace = true
diff --git a/control_plane/src/safekeeper.rs b/control_plane/src/safekeeper.rs
index a0a73f5609..573f1688d5 100644
--- a/control_plane/src/safekeeper.rs
+++ b/control_plane/src/safekeeper.rs
@@ -5,6 +5,7 @@
 //! ```text
 //!   .neon/safekeepers/<safekeeper id>
 //! ```
+use std::future::Future;
 use std::io::Write;
 use std::path::PathBuf;
 use std::time::Duration;
@@ -34,12 +35,10 @@ pub enum SafekeeperHttpError {
 
 type Result<T> = result::Result<T, SafekeeperHttpError>;
 
-#[async_trait::async_trait]
-pub trait ResponseErrorMessageExt: Sized {
-    async fn error_from_body(self) -> Result<Self>;
+pub(crate) trait ResponseErrorMessageExt: Sized {
+    fn error_from_body(self) -> impl Future<Output = Result<Self>> + Send;
 }
 
-#[async_trait::async_trait]
 impl ResponseErrorMessageExt for reqwest::Response {
     async fn error_from_body(self) -> Result<Self> {
         let status = self.status();
diff --git a/libs/utils/Cargo.toml b/libs/utils/Cargo.toml
index 777fb95ece..19deaab63f 100644
--- a/libs/utils/Cargo.toml
+++ b/libs/utils/Cargo.toml
@@ -14,7 +14,6 @@ testing = ["fail/failpoints"]
 arc-swap.workspace = true
 sentry.workspace = true
 async-compression.workspace = true
-async-trait.workspace = true
 anyhow.workspace = true
 bincode.workspace = true
 bytes.workspace = true
diff --git a/pageserver/client/Cargo.toml b/pageserver/client/Cargo.toml
index a938367334..d9b36bf3d4 100644
--- a/pageserver/client/Cargo.toml
+++ b/pageserver/client/Cargo.toml
@@ -7,7 +7,6 @@ license.workspace = true
 [dependencies]
 pageserver_api.workspace = true
 thiserror.workspace = true
-async-trait.workspace = true
 reqwest = { workspace = true, features = [ "stream" ] }
 utils.workspace = true
 serde.workspace = true
diff --git a/safekeeper/src/control_file.rs b/safekeeper/src/control_file.rs
index c551cd3122..8b252b4ab4 100644
--- a/safekeeper/src/control_file.rs
+++ b/safekeeper/src/control_file.rs
@@ -7,6 +7,7 @@ use tokio::fs::File;
 use tokio::io::AsyncWriteExt;
 use utils::crashsafe::durable_rename;
 
+use std::future::Future;
 use std::io::Read;
 use std::ops::Deref;
 use std::path::Path;
@@ -31,10 +32,9 @@ pub const CHECKSUM_SIZE: usize = size_of::<u32>();
 
 /// Storage should keep actual state inside of it. It should implement Deref
 /// trait to access state fields and have persist method for updating that state.
-#[async_trait::async_trait]
 pub trait Storage: Deref<Target = TimelinePersistentState> {
     /// Persist safekeeper state on disk and update internal state.
-    async fn persist(&mut self, s: &TimelinePersistentState) -> Result<()>;
+    fn persist(&mut self, s: &TimelinePersistentState) -> impl Future<Output = Result<()>> + Send;
 
     /// Timestamp of last persist.
     fn last_persist_at(&self) -> Instant;
@@ -188,7 +188,6 @@ impl TimelinePersistentState {
     }
 }
 
-#[async_trait::async_trait]
 impl Storage for FileStorage {
     /// Persists state durably to the underlying storage.
     ///
diff --git a/safekeeper/src/safekeeper.rs b/safekeeper/src/safekeeper.rs
index 0814d9ba67..486954c7b9 100644
--- a/safekeeper/src/safekeeper.rs
+++ b/safekeeper/src/safekeeper.rs
@@ -971,7 +971,6 @@ mod tests {
         persisted_state: TimelinePersistentState,
     }
 
-    #[async_trait::async_trait]
     impl control_file::Storage for InMemoryState {
         async fn persist(&mut self, s: &TimelinePersistentState) -> Result<()> {
             self.persisted_state = s.clone();
@@ -1003,7 +1002,6 @@ mod tests {
         lsn: Lsn,
     }
 
-    #[async_trait::async_trait]
     impl wal_storage::Storage for DummyWalStore {
         fn flush_lsn(&self) -> Lsn {
             self.lsn
diff --git a/safekeeper/src/wal_storage.rs b/safekeeper/src/wal_storage.rs
index ded8571a3e..6fd7c91a68 100644
--- a/safekeeper/src/wal_storage.rs
+++ b/safekeeper/src/wal_storage.rs
@@ -15,6 +15,7 @@ use postgres_ffi::v14::xlog_utils::{IsPartialXLogFileName, IsXLogFileName, XLogF
 use postgres_ffi::{dispatch_pgversion, XLogSegNo, PG_TLI};
 use remote_storage::RemotePath;
 use std::cmp::{max, min};
+use std::future::Future;
 use std::io::{self, SeekFrom};
 use std::pin::Pin;
 use tokio::fs::{self, remove_file, File, OpenOptions};
@@ -35,7 +36,6 @@ use postgres_ffi::XLOG_BLCKSZ;
 use pq_proto::SystemId;
 use utils::{id::TenantTimelineId, lsn::Lsn};
 
-#[async_trait::async_trait]
 pub trait Storage {
     /// LSN of last durably stored WAL record.
     fn flush_lsn(&self) -> Lsn;
@@ -44,16 +44,19 @@ pub trait Storage {
     /// the segment and short header at the page of given LSN. This is only used
     /// for timeline initialization because compute will stream data only since
     /// init_lsn. Other segment headers are included in compute stream.
-    async fn initialize_first_segment(&mut self, init_lsn: Lsn) -> Result<()>;
+    fn initialize_first_segment(
+        &mut self,
+        init_lsn: Lsn,
+    ) -> impl Future<Output = Result<()>> + Send;
 
     /// Write piece of WAL from buf to disk, but not necessarily sync it.
-    async fn write_wal(&mut self, startpos: Lsn, buf: &[u8]) -> Result<()>;
+    fn write_wal(&mut self, startpos: Lsn, buf: &[u8]) -> impl Future<Output = Result<()>> + Send;
 
     /// Truncate WAL at specified LSN, which must be the end of WAL record.
-    async fn truncate_wal(&mut self, end_pos: Lsn) -> Result<()>;
+    fn truncate_wal(&mut self, end_pos: Lsn) -> impl Future<Output = Result<()>> + Send;
 
     /// Durably store WAL on disk, up to the last written WAL record.
-    async fn flush_wal(&mut self) -> Result<()>;
+    fn flush_wal(&mut self) -> impl Future<Output = Result<()>> + Send;
 
     /// Remove all segments <= given segno. Returns function doing that as we
     /// want to perform it without timeline lock.
@@ -325,7 +328,6 @@ impl PhysicalStorage {
     }
 }
 
-#[async_trait::async_trait]
 impl Storage for PhysicalStorage {
     /// flush_lsn returns LSN of last durably stored WAL record.
     fn flush_lsn(&self) -> Lsn {
diff --git a/safekeeper/tests/walproposer_sim/safekeeper_disk.rs b/safekeeper/tests/walproposer_sim/safekeeper_disk.rs
index c2db9de78a..6b31edb1f2 100644
--- a/safekeeper/tests/walproposer_sim/safekeeper_disk.rs
+++ b/safekeeper/tests/walproposer_sim/safekeeper_disk.rs
@@ -83,7 +83,6 @@ impl DiskStateStorage {
     }
 }
 
-#[async_trait::async_trait]
 impl control_file::Storage for DiskStateStorage {
     /// Persist safekeeper state on disk and update internal state.
     async fn persist(&mut self, s: &TimelinePersistentState) -> Result<()> {
@@ -175,7 +174,6 @@ impl DiskWALStorage {
     }
 }
 
-#[async_trait::async_trait]
 impl wal_storage::Storage for DiskWALStorage {
     /// LSN of last durably stored WAL record.
     fn flush_lsn(&self) -> Lsn {
diff --git a/storage_controller/client/Cargo.toml b/storage_controller/client/Cargo.toml
index c3bfe2bfd2..e7a4264fd0 100644
--- a/storage_controller/client/Cargo.toml
+++ b/storage_controller/client/Cargo.toml
@@ -8,7 +8,6 @@ license.workspace = true
 pageserver_api.workspace = true
 pageserver_client.workspace = true
 thiserror.workspace = true
-async-trait.workspace = true
 reqwest.workspace = true
 utils.workspace = true
 serde.workspace = true

From 022fad65eba4a89e5356096aebf4517e46d9416c Mon Sep 17 00:00:00 2001
From: Conrad Ludgate <conrad@neon.tech>
Date: Thu, 29 Aug 2024 20:16:44 +0100
Subject: [PATCH 061/159] proxy: fix password hash cancellation (#8868)

In #8863 I replaced the threadpool with tokio tasks, but there was a
behaviour I missed regarding cancellation. Adding the JoinHandle wrapper
that triggers abort on drop should fix this.

Another change, any panics that occur in password hashing will be
propagated through the resume_unwind functionality.
---
 proxy/src/scram/exchange.rs   |  3 +--
 proxy/src/scram/threadpool.rs | 40 +++++++++++++++++++++++++----------
 2 files changed, 30 insertions(+), 13 deletions(-)

diff --git a/proxy/src/scram/exchange.rs b/proxy/src/scram/exchange.rs
index 7fdadc7038..786cbcaa19 100644
--- a/proxy/src/scram/exchange.rs
+++ b/proxy/src/scram/exchange.rs
@@ -86,8 +86,7 @@ async fn derive_client_key(
 ) -> ScramKey {
     let salted_password = pool
         .spawn_job(endpoint, Pbkdf2::start(password, salt, iterations))
-        .await
-        .expect("job should not be cancelled");
+        .await;
 
     let make_key = |name| {
         let key = Hmac::<Sha256>::new_from_slice(&salted_password)
diff --git a/proxy/src/scram/threadpool.rs b/proxy/src/scram/threadpool.rs
index d73a927995..2702aeebfe 100644
--- a/proxy/src/scram/threadpool.rs
+++ b/proxy/src/scram/threadpool.rs
@@ -15,6 +15,7 @@ use std::{
     task::{Context, Poll},
 };
 
+use futures::FutureExt;
 use rand::Rng;
 use rand::{rngs::SmallRng, SeedableRng};
 
@@ -74,15 +75,13 @@ impl ThreadPool {
         })
     }
 
-    pub(crate) fn spawn_job(
-        &self,
-        endpoint: EndpointIdInt,
-        pbkdf2: Pbkdf2,
-    ) -> tokio::task::JoinHandle<[u8; 32]> {
-        self.runtime
-            .as_ref()
-            .unwrap()
-            .spawn(JobSpec { pbkdf2, endpoint })
+    pub(crate) fn spawn_job(&self, endpoint: EndpointIdInt, pbkdf2: Pbkdf2) -> JobHandle {
+        JobHandle(
+            self.runtime
+                .as_ref()
+                .unwrap()
+                .spawn(JobSpec { pbkdf2, endpoint }),
+        )
     }
 }
 
@@ -167,6 +166,26 @@ impl Future for JobSpec {
     }
 }
 
+pub(crate) struct JobHandle(tokio::task::JoinHandle<[u8; 32]>);
+
+impl Future for JobHandle {
+    type Output = [u8; 32];
+
+    fn poll(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Self::Output> {
+        match self.0.poll_unpin(cx) {
+            Poll::Ready(Ok(ok)) => Poll::Ready(ok),
+            Poll::Ready(Err(err)) => std::panic::resume_unwind(err.into_panic()),
+            Poll::Pending => Poll::Pending,
+        }
+    }
+}
+
+impl Drop for JobHandle {
+    fn drop(&mut self) {
+        self.0.abort();
+    }
+}
+
 #[cfg(test)]
 mod tests {
     use crate::EndpointId;
@@ -183,8 +202,7 @@ mod tests {
         let salt = [0x55; 32];
         let actual = pool
             .spawn_job(ep, Pbkdf2::start(b"password", &salt, 4096))
-            .await
-            .unwrap();
+            .await;
 
         let expected = [
             10, 114, 73, 188, 140, 222, 196, 156, 214, 184, 79, 157, 119, 242, 16, 31, 53, 242,

From 72aa6b02dab6a8d0748fa79eac59f10f1d4dc4f1 Mon Sep 17 00:00:00 2001
From: Conrad Ludgate <conrad@neon.tech>
Date: Fri, 30 Aug 2024 11:34:23 +0100
Subject: [PATCH 062/159] chore: speed up testing (#8874)

`safekeeper::random_test test_random_schedules` debug test takes over 2
minutes to run on our arm runners. Running it 6 times with pageserver
settings seems redundant.
---
 .github/workflows/_build-and-test-locally.yml | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/_build-and-test-locally.yml b/.github/workflows/_build-and-test-locally.yml
index a8526fc6b1..e18e6a1201 100644
--- a/.github/workflows/_build-and-test-locally.yml
+++ b/.github/workflows/_build-and-test-locally.yml
@@ -216,9 +216,13 @@ jobs:
           #nextest does not yet support running doctests
           ${cov_prefix} cargo test --doc $CARGO_FLAGS $CARGO_FEATURES
 
+          # run all non-pageserver tests
+          ${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_FEATURES -E '!package(pageserver)'
+
+          # run pageserver tests with different settings
           for io_engine in std-fs tokio-epoll-uring ; do
             for io_buffer_alignment in 0 1 512 ; do
-              NEON_PAGESERVER_UNIT_TEST_VIRTUAL_FILE_IOENGINE=$io_engine NEON_PAGESERVER_UNIT_TEST_IO_BUFFER_ALIGNMENT=$io_buffer_alignment ${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_FEATURES
+              NEON_PAGESERVER_UNIT_TEST_VIRTUAL_FILE_IOENGINE=$io_engine NEON_PAGESERVER_UNIT_TEST_IO_BUFFER_ALIGNMENT=$io_buffer_alignment ${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_FEATURES  -E 'package(pageserver)'
             done
           done
 

From 20f82f91698fc64265b18e12cd7482b141e0832c Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Fri, 30 Aug 2024 11:44:13 +0100
Subject: [PATCH 063/159] storage controller: sleep between compute notify
 retries (#8869)

## Problem

Live migration retries when it fails to notify the compute of the new
location. It should sleep between attempts.

Closes: https://github.com/neondatabase/neon/issues/8820

## Summary of changes

- Do an `exponential_backoff` in the retry loop for compute
notifications
---
 storage_controller/src/reconciler.rs | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/storage_controller/src/reconciler.rs b/storage_controller/src/reconciler.rs
index 94db879ade..102a3124d2 100644
--- a/storage_controller/src/reconciler.rs
+++ b/storage_controller/src/reconciler.rs
@@ -12,6 +12,7 @@ use std::collections::HashMap;
 use std::sync::Arc;
 use std::time::{Duration, Instant};
 use tokio_util::sync::CancellationToken;
+use utils::backoff::exponential_backoff;
 use utils::failpoint_support;
 use utils::generation::Generation;
 use utils::id::{NodeId, TimelineId};
@@ -568,6 +569,7 @@ impl Reconciler {
 
         // During a live migration it is unhelpful to proceed if we couldn't notify compute: if we detach
         // the origin without notifying compute, we will render the tenant unavailable.
+        let mut notify_attempts = 0;
         while let Err(e) = self.compute_notify().await {
             match e {
                 NotifyError::Fatal(_) => return Err(ReconcileError::Notify(e)),
@@ -578,6 +580,17 @@ impl Reconciler {
                     );
                 }
             }
+
+            exponential_backoff(
+                notify_attempts,
+                // Generous waits: control plane operations which might be blocking us usually complete on the order
+                // of hundreds to thousands of milliseconds, so no point busy polling.
+                1.0,
+                10.0,
+                &self.cancel,
+            )
+            .await;
+            notify_attempts += 1;
         }
 
         // Downgrade the origin to secondary.  If the tenant's policy is PlacementPolicy::Attached(0), then

From e58e045ebb80940f8fa05c8c75fdb118978fa14a Mon Sep 17 00:00:00 2001
From: Alexander Bayandin <alexander@neon.tech>
Date: Fri, 30 Aug 2024 13:18:30 +0100
Subject: [PATCH 064/159] CI(promote-compatibility-data): fix job (#8871)

## Problem

`promote-compatibility-data` job got broken and slightly outdated after
- https://github.com/neondatabase/neon/pull/8552 -- we don't upload
artifacts for ARM64
- https://github.com/neondatabase/neon/pull/8561 -- we don't prepare
`debug` artifacts in the release branch anymore

## Summary of changes
- Promote artifacts from release PRs to the latest version (but do it
from `release` branch)
- Upload artifacts for both X64 and ARM64
---
 .../actions/run-python-test-set/action.yml    |  8 +-
 .github/workflows/build_and_test.yml          | 97 ++++++++++++++-----
 2 files changed, 75 insertions(+), 30 deletions(-)

diff --git a/.github/actions/run-python-test-set/action.yml b/.github/actions/run-python-test-set/action.yml
index 6c2cee0971..4008cd0d36 100644
--- a/.github/actions/run-python-test-set/action.yml
+++ b/.github/actions/run-python-test-set/action.yml
@@ -71,7 +71,7 @@ runs:
       if: inputs.build_type != 'remote'
       uses: ./.github/actions/download
       with:
-        name: compatibility-snapshot-${{ inputs.build_type }}-pg${{ inputs.pg_version }}
+        name: compatibility-snapshot-${{ runner.arch }}-${{ inputs.build_type }}-pg${{ inputs.pg_version }}
         path: /tmp/compatibility_snapshot_pg${{ inputs.pg_version }}
         prefix: latest
         # The lack of compatibility snapshot (for example, for the new Postgres version)
@@ -211,13 +211,13 @@ runs:
         fi
 
     - name: Upload compatibility snapshot
-      if: github.ref_name == 'release'
+      # Note, that we use `github.base_ref` which is a target branch for a PR
+      if: github.event_name == 'pull_request' && github.base_ref == 'release'
       uses: ./.github/actions/upload
       with:
-        name: compatibility-snapshot-${{ inputs.build_type }}-pg${{ inputs.pg_version }}-${{ github.run_id }}
+        name: compatibility-snapshot-${{ runner.arch }}-${{ inputs.build_type }}-pg${{ inputs.pg_version }}
         # Directory is created by test_compatibility.py::test_create_snapshot, keep the path in sync with the test
         path: /tmp/test_output/compatibility_snapshot_pg${{ inputs.pg_version }}/
-        prefix: latest
 
     - name: Upload test results
       if: ${{ !cancelled() }}
diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml
index 1e7f3598c2..53d33b420f 100644
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -1055,43 +1055,88 @@ jobs:
               generate_release_notes: true,
             })
 
+  # The job runs on `release` branch and copies compatibility data and Neon artifact from the last *release PR* to the latest directory
   promote-compatibility-data:
-    needs: [ check-permissions, promote-images, tag, build-and-test-locally ]
+    needs: [ deploy ]
     if: github.ref_name == 'release'
 
-    runs-on: [ self-hosted, small ]
-    container:
-      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:pinned
-      options: --init
+    runs-on: ubuntu-22.04
     steps:
-      - name: Promote compatibility snapshot for the release
+      - name: Fetch GITHUB_RUN_ID and COMMIT_SHA for the last merged release PR
+        id: fetch-last-release-pr-info
+        env:
+          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+        run: |
+          branch_name_and_pr_number=$(gh pr list \
+            --repo "${GITHUB_REPOSITORY}" \
+            --base release \
+            --state merged \
+            --limit 10 \
+            --json mergeCommit,headRefName,number \
+            --jq ".[] | select(.mergeCommit.oid==\"${GITHUB_SHA}\") | { branch_name: .headRefName, pr_number: .number }")
+          branch_name=$(echo "${branch_name_and_pr_number}" | jq -r '.branch_name')
+          pr_number=$(echo "${branch_name_and_pr_number}" | jq -r '.pr_number')
+
+          run_id=$(gh run list \
+            --repo "${GITHUB_REPOSITORY}" \
+            --workflow build_and_test.yml \
+            --branch "${branch_name}" \
+            --json databaseId \
+            --limit 1 \
+            --jq '.[].databaseId')
+
+          last_commit_sha=$(gh pr view "${pr_number}" \
+            --repo "${GITHUB_REPOSITORY}" \
+            --json commits \
+            --jq '.commits[-1].oid')
+
+          echo "run-id=${run_id}" | tee -a ${GITHUB_OUTPUT}
+          echo "commit-sha=${last_commit_sha}" | tee -a ${GITHUB_OUTPUT}
+
+      - name: Promote compatibility snapshot and Neon artifact
         env:
           BUCKET: neon-github-public-dev
-          PREFIX: artifacts/latest
-          COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }}
+          AWS_REGION: eu-central-1
+          COMMIT_SHA: ${{ steps.fetch-last-release-pr-info.outputs.commit-sha }}
+          RUN_ID: ${{ steps.fetch-last-release-pr-info.outputs.run-id }}
         run: |
-          # Update compatibility snapshot for the release
-          for pg_version in v14 v15 v16; do
-            for build_type in debug release; do
-              OLD_FILENAME=compatibility-snapshot-${build_type}-pg${pg_version}-${GITHUB_RUN_ID}.tar.zst
-              NEW_FILENAME=compatibility-snapshot-${build_type}-pg${pg_version}.tar.zst
+          old_prefix="artifacts/${COMMIT_SHA}/${RUN_ID}"
+          new_prefix="artifacts/latest"
 
-              time aws s3 mv --only-show-errors s3://${BUCKET}/${PREFIX}/${OLD_FILENAME} s3://${BUCKET}/${PREFIX}/${NEW_FILENAME}
+          files_to_promote=()
+          files_on_s3=$(aws s3api list-objects-v2 --bucket ${BUCKET} --prefix ${old_prefix} | jq -r '.Contents[]?.Key' || true)
+
+          for arch in X64 ARM64; do
+            for build_type in debug release; do
+              neon_artifact_filename="neon-Linux-${arch}-${build_type}-artifact.tar.zst"
+              s3_key=$(echo "${files_on_s3}" | grep ${neon_artifact_filename} | sort --version-sort | tail -1 || true)
+              if [ -z "${s3_key}" ]; then
+                echo >&2 "Neither s3://${BUCKET}/${old_prefix}/${neon_artifact_filename} nor its version from previous attempts exist"
+                exit 1
+              fi
+
+              files_to_promote+=("s3://${BUCKET}/${s3_key}")
+
+              for pg_version in v14 v15 v16; do
+                # We run less tests for debug builds, so we don't need to promote them
+                if [ "${build_type}" == "debug" ] && { [ "${arch}" == "ARM64" ] || [ "${pg_version}" != "v16" ] ; }; then
+                  continue
+                fi
+
+                compatibility_data_filename="compatibility-snapshot-${arch}-${build_type}-pg${pg_version}.tar.zst"
+                s3_key=$(echo "${files_on_s3}" | grep ${compatibility_data_filename} | sort --version-sort | tail -1 || true)
+                if [ -z "${s3_key}" ]; then
+                  echo >&2 "Neither s3://${BUCKET}/${old_prefix}/${compatibility_data_filename} nor its version from previous attempts exist"
+                  exit 1
+                fi
+
+                files_to_promote+=("s3://${BUCKET}/${s3_key}")
+              done
             done
           done
 
-          # Update Neon artifact for the release (reuse already uploaded artifact)
-          for build_type in debug release; do
-            OLD_PREFIX=artifacts/${COMMIT_SHA}/${GITHUB_RUN_ID}
-            FILENAME=neon-${{ runner.os }}-${{ runner.arch }}-${build_type}-artifact.tar.zst
-
-            S3_KEY=$(aws s3api list-objects-v2 --bucket ${BUCKET} --prefix ${OLD_PREFIX} | jq -r '.Contents[]?.Key' | grep ${FILENAME} | sort --version-sort | tail -1 || true)
-            if [ -z "${S3_KEY}" ]; then
-              echo >&2 "Neither s3://${BUCKET}/${OLD_PREFIX}/${FILENAME} nor its version from previous attempts exist"
-              exit 1
-            fi
-
-            time aws s3 cp --only-show-errors s3://${BUCKET}/${S3_KEY} s3://${BUCKET}/${PREFIX}/${FILENAME}
+          for f in "${files_to_promote[@]}"; do
+            time aws s3 cp --only-show-errors ${f} s3://${BUCKET}/${new_prefix}/
           done
 
   pin-build-tools-image:

From df971f995c3d1ab864426190382e297654938500 Mon Sep 17 00:00:00 2001
From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com>
Date: Sat, 31 Aug 2024 02:12:39 +0800
Subject: [PATCH 065/159] feat(storage-scrubber): check layer map validity
 (#8867)

When implementing bottom-most gc-compaction, we analyzed the structure
of layer maps that the current compaction algorithm could produce, and
decided to only support structures without delta layer overlaps and LSN
intersections with the exception of single key layers.

## Summary of changes

This patch adds the layer map valid check in the storage scrubber.

---------

Signed-off-by: Alex Chi Z <chi@neon.tech>
---
 storage_scrubber/src/checks.rs | 59 +++++++++++++++++++++++++++++++++-
 1 file changed, 58 insertions(+), 1 deletion(-)

diff --git a/storage_scrubber/src/checks.rs b/storage_scrubber/src/checks.rs
index 08b0f06ebf..15dfb101b5 100644
--- a/storage_scrubber/src/checks.rs
+++ b/storage_scrubber/src/checks.rs
@@ -1,6 +1,7 @@
-use std::collections::{HashMap, HashSet};
+use std::collections::{BTreeSet, HashMap, HashSet};
 
 use anyhow::Context;
+use itertools::Itertools;
 use pageserver::tenant::layer_map::LayerMap;
 use pageserver::tenant::remote_timeline_client::index::LayerFileMetadata;
 use pageserver_api::shard::ShardIndex;
@@ -47,6 +48,56 @@ impl TimelineAnalysis {
     }
 }
 
+/// Checks whether a layer map is valid (i.e., is a valid result of the current compaction algorithm if nothing goes wrong).
+/// The function checks if we can split the LSN range of a delta layer only at the LSNs of the delta layers. For example,
+///
+/// ```plain
+/// |       |                 |       |
+/// |   1   |    |   2   |    |   3   |
+/// |       |    |       |    |       |
+/// ```
+///
+/// This is not a valid layer map because the LSN range of layer 1 intersects with the LSN range of layer 2. 1 and 2 should have
+/// the same LSN range.
+///
+/// The exception is that when layer 2 only contains a single key, it could be split over the LSN range. For example,
+///
+/// ```plain
+/// |       |    |   2   |    |       |
+/// |   1   |    |-------|    |   3   |
+/// |       |    |   4   |    |       |
+///
+/// If layer 2 and 4 contain the same single key, this is also a valid layer map.
+fn check_valid_layermap(metadata: &HashMap<LayerName, LayerFileMetadata>) -> Option<String> {
+    let mut lsn_split_point = BTreeSet::new(); // TODO: use a better data structure (range tree / range set?)
+    let mut all_delta_layers = Vec::new();
+    for (name, _) in metadata.iter() {
+        if let LayerName::Delta(layer) = name {
+            if layer.key_range.start.next() != layer.key_range.end {
+                all_delta_layers.push(layer.clone());
+            }
+        }
+    }
+    for layer in &all_delta_layers {
+        let lsn_range = &layer.lsn_range;
+        lsn_split_point.insert(lsn_range.start);
+        lsn_split_point.insert(lsn_range.end);
+    }
+    for layer in &all_delta_layers {
+        let lsn_range = layer.lsn_range.clone();
+        let intersects = lsn_split_point.range(lsn_range).collect_vec();
+        if intersects.len() > 1 {
+            let err = format!(
+                        "layer violates the layer map LSN split assumption: layer {} intersects with LSN [{}]",
+                        layer,
+                        intersects.into_iter().map(|lsn| lsn.to_string()).join(", ")
+                    );
+            return Some(err);
+        }
+    }
+    None
+}
+
 pub(crate) async fn branch_cleanup_and_check_errors(
     remote_client: &GenericRemoteStorage,
     id: &TenantShardTimelineId,
@@ -126,6 +177,12 @@ pub(crate) async fn branch_cleanup_and_check_errors(
                         }
                     }
 
+                    if let Some(err) = check_valid_layermap(&index_part.layer_metadata) {
+                        result.errors.push(format!(
+                            "index_part.json contains invalid layer map structure: {err}"
+                        ));
+                    }
+
                     for (layer, metadata) in index_part.layer_metadata {
                         if metadata.file_size == 0 {
                             result.errors.push(format!(

From cacb1ae3331873f2b34c56b03596caabad830f14 Mon Sep 17 00:00:00 2001
From: Yuchen Liang <70461588+yliang412@users.noreply.github.com>
Date: Fri, 30 Aug 2024 14:53:52 -0400
Subject: [PATCH 066/159] pageserver: set default io_buffer_alignment to 512
 bytes (#8878)

## Summary of changes

- Setting default io_buffer_alignment to 512 bytes.
- Fix places that assumed `DEFAULT_IO_BUFFER_ALIGNMENT=0`
- Adapt unit tests to handle merge with `chunk size <= 4096`.

## Testing and Performance

We have done sufficient performance de-risking.

Enabling it by default completes our correctness de-risking before the
next release.

Context: https://neondb.slack.com/archives/C07BZ38E6SD/p1725026845455259

Signed-off-by: Yuchen Liang <yuchen@neon.tech>
Co-authored-by: Christian Schwarz <christian@neon.tech>
---
 pageserver/src/config.rs                      |  2 +-
 .../src/tenant/storage_layer/delta_layer.rs   |  4 +-
 .../src/tenant/storage_layer/image_layer.rs   |  4 +-
 pageserver/src/tenant/vectored_blob_io.rs     | 59 ++++++++++---------
 pageserver/src/virtual_file.rs                | 18 ++----
 5 files changed, 42 insertions(+), 45 deletions(-)

diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs
index 994075bef6..9e4530ba3c 100644
--- a/pageserver/src/config.rs
+++ b/pageserver/src/config.rs
@@ -96,7 +96,7 @@ pub mod defaults {
 
     pub const DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB: usize = 0;
 
-    pub const DEFAULT_IO_BUFFER_ALIGNMENT: usize = 0;
+    pub const DEFAULT_IO_BUFFER_ALIGNMENT: usize = 512;
 
     ///
     /// Default built-in configuration file.
diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs
index 00ef5b0afd..885eb13b29 100644
--- a/pageserver/src/tenant/storage_layer/delta_layer.rs
+++ b/pageserver/src/tenant/storage_layer/delta_layer.rs
@@ -2283,7 +2283,7 @@ pub(crate) mod test {
             .await
             .unwrap();
         let delta_layer = resident_layer.get_as_delta(&ctx).await.unwrap();
-        for max_read_size in [1, 2048] {
+        for max_read_size in [1, 1024] {
             for batch_size in [1, 2, 4, 8, 3, 7, 13] {
                 println!("running with batch_size={batch_size} max_read_size={max_read_size}");
                 // Test if the batch size is correctly determined
@@ -2297,7 +2297,7 @@ pub(crate) mod test {
                         // every key should be a batch b/c the value is larger than max_read_size
                         assert_eq!(iter.key_values_batch.len(), 1);
                     } else {
-                        assert_eq!(iter.key_values_batch.len(), batch_size);
+                        assert!(iter.key_values_batch.len() <= batch_size);
                     }
                     if num_items >= N {
                         break;
diff --git a/pageserver/src/tenant/storage_layer/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs
index 38411e9d9e..4c22541e02 100644
--- a/pageserver/src/tenant/storage_layer/image_layer.rs
+++ b/pageserver/src/tenant/storage_layer/image_layer.rs
@@ -1367,7 +1367,7 @@ mod test {
                 .await
                 .unwrap();
         let img_layer = resident_layer.get_as_image(&ctx).await.unwrap();
-        for max_read_size in [1, 2048] {
+        for max_read_size in [1, 1024] {
             for batch_size in [1, 2, 4, 8, 3, 7, 13] {
                 println!("running with batch_size={batch_size} max_read_size={max_read_size}");
                 // Test if the batch size is correctly determined
@@ -1381,7 +1381,7 @@ mod test {
                         // every key should be a batch b/c the value is larger than max_read_size
                         assert_eq!(iter.key_values_batch.len(), 1);
                     } else {
-                        assert_eq!(iter.key_values_batch.len(), batch_size);
+                        assert!(iter.key_values_batch.len() <= batch_size);
                     }
                     if num_items >= N {
                         break;
diff --git a/pageserver/src/tenant/vectored_blob_io.rs b/pageserver/src/tenant/vectored_blob_io.rs
index 80bc56092d..146bcf0e35 100644
--- a/pageserver/src/tenant/vectored_blob_io.rs
+++ b/pageserver/src/tenant/vectored_blob_io.rs
@@ -25,7 +25,6 @@ use tokio_epoll_uring::BoundedBuf;
 use utils::lsn::Lsn;
 use utils::vec_map::VecMap;
 
-use crate::config::defaults::DEFAULT_IO_BUFFER_ALIGNMENT;
 use crate::context::RequestContext;
 use crate::tenant::blob_io::{BYTE_UNCOMPRESSED, BYTE_ZSTD, LEN_COMPRESSION_BIT_MASK};
 use crate::virtual_file::{self, VirtualFile};
@@ -91,7 +90,7 @@ impl VectoredReadCoalesceMode {
     /// whereas [`ChunkedVectoredReadBuilder`] is used for alignment requirement 1 and higher.
     pub(crate) fn get() -> Self {
         let align = virtual_file::get_io_buffer_alignment_raw();
-        if align == DEFAULT_IO_BUFFER_ALIGNMENT {
+        if align == 0 {
             VectoredReadCoalesceMode::AdjacentOnly
         } else {
             VectoredReadCoalesceMode::Chunked(align)
@@ -735,27 +734,32 @@ mod tests {
     fn planner_chunked_coalesce_all_test() {
         use crate::virtual_file;
 
-        const CHUNK_SIZE: u64 = 512;
-        virtual_file::set_io_buffer_alignment(CHUNK_SIZE as usize).unwrap();
-        let max_read_size = CHUNK_SIZE as usize * 8;
+        let chunk_size = virtual_file::get_io_buffer_alignment() as u64;
+
+        // The test explicitly does not check chunk size < 512
+        if chunk_size < 512 {
+            return;
+        }
+
+        let max_read_size = chunk_size as usize * 8;
         let key = Key::MIN;
         let lsn = Lsn(0);
 
         let blob_descriptions = [
-            (key, lsn, CHUNK_SIZE / 8, BlobFlag::None), // Read 1 BEGIN
-            (key, lsn, CHUNK_SIZE / 4, BlobFlag::Ignore), // Gap
-            (key, lsn, CHUNK_SIZE / 2, BlobFlag::None),
-            (key, lsn, CHUNK_SIZE - 2, BlobFlag::Ignore), // Gap
-            (key, lsn, CHUNK_SIZE, BlobFlag::None),
-            (key, lsn, CHUNK_SIZE * 2 - 1, BlobFlag::None),
-            (key, lsn, CHUNK_SIZE * 2 + 1, BlobFlag::Ignore), // Gap
-            (key, lsn, CHUNK_SIZE * 3 + 1, BlobFlag::None),
-            (key, lsn, CHUNK_SIZE * 5 + 1, BlobFlag::None),
-            (key, lsn, CHUNK_SIZE * 6 + 1, BlobFlag::Ignore), // skipped chunk size, but not a chunk: should coalesce.
-            (key, lsn, CHUNK_SIZE * 7 + 1, BlobFlag::None),
-            (key, lsn, CHUNK_SIZE * 8, BlobFlag::None), // Read 2 BEGIN (b/c max_read_size)
-            (key, lsn, CHUNK_SIZE * 9, BlobFlag::Ignore), // ==== skipped a chunk
-            (key, lsn, CHUNK_SIZE * 10, BlobFlag::None), // Read 3 BEGIN (cannot coalesce)
+            (key, lsn, chunk_size / 8, BlobFlag::None), // Read 1 BEGIN
+            (key, lsn, chunk_size / 4, BlobFlag::Ignore), // Gap
+            (key, lsn, chunk_size / 2, BlobFlag::None),
+            (key, lsn, chunk_size - 2, BlobFlag::Ignore), // Gap
+            (key, lsn, chunk_size, BlobFlag::None),
+            (key, lsn, chunk_size * 2 - 1, BlobFlag::None),
+            (key, lsn, chunk_size * 2 + 1, BlobFlag::Ignore), // Gap
+            (key, lsn, chunk_size * 3 + 1, BlobFlag::None),
+            (key, lsn, chunk_size * 5 + 1, BlobFlag::None),
+            (key, lsn, chunk_size * 6 + 1, BlobFlag::Ignore), // skipped chunk size, but not a chunk: should coalesce.
+            (key, lsn, chunk_size * 7 + 1, BlobFlag::None),
+            (key, lsn, chunk_size * 8, BlobFlag::None), // Read 2 BEGIN (b/c max_read_size)
+            (key, lsn, chunk_size * 9, BlobFlag::Ignore), // ==== skipped a chunk
+            (key, lsn, chunk_size * 10, BlobFlag::None), // Read 3 BEGIN (cannot coalesce)
         ];
 
         let ranges = [
@@ -834,18 +838,19 @@ mod tests {
 
     #[test]
     fn planner_replacement_test() {
-        let max_read_size = 128 * 1024;
+        let chunk_size = virtual_file::get_io_buffer_alignment() as u64;
+        let max_read_size = 128 * chunk_size as usize;
         let first_key = Key::MIN;
         let second_key = first_key.next();
         let lsn = Lsn(0);
 
         let blob_descriptions = vec![
-            (first_key, lsn, 0, BlobFlag::None),    // First in read 1
-            (first_key, lsn, 1024, BlobFlag::None), // Last in read 1
-            (second_key, lsn, 2 * 1024, BlobFlag::ReplaceAll),
-            (second_key, lsn, 3 * 1024, BlobFlag::None),
-            (second_key, lsn, 4 * 1024, BlobFlag::ReplaceAll), // First in read 2
-            (second_key, lsn, 5 * 1024, BlobFlag::None),       // Last in read 2
+            (first_key, lsn, 0, BlobFlag::None),          // First in read 1
+            (first_key, lsn, chunk_size, BlobFlag::None), // Last in read 1
+            (second_key, lsn, 2 * chunk_size, BlobFlag::ReplaceAll),
+            (second_key, lsn, 3 * chunk_size, BlobFlag::None),
+            (second_key, lsn, 4 * chunk_size, BlobFlag::ReplaceAll), // First in read 2
+            (second_key, lsn, 5 * chunk_size, BlobFlag::None),       // Last in read 2
         ];
 
         let ranges = [&blob_descriptions[0..2], &blob_descriptions[4..]];
@@ -855,7 +860,7 @@ mod tests {
             planner.handle(key, lsn, offset, flag);
         }
 
-        planner.handle_range_end(6 * 1024);
+        planner.handle_range_end(6 * chunk_size);
 
         let reads = planner.finish();
         assert_eq!(reads.len(), 2);
diff --git a/pageserver/src/virtual_file.rs b/pageserver/src/virtual_file.rs
index 4b11dc1a94..97d966e2da 100644
--- a/pageserver/src/virtual_file.rs
+++ b/pageserver/src/virtual_file.rs
@@ -1196,15 +1196,11 @@ pub(crate) fn get_io_buffer_alignment_raw() -> usize {
 
     if cfg!(test) {
         let env_var_name = "NEON_PAGESERVER_UNIT_TEST_IO_BUFFER_ALIGNMENT";
-        if align == DEFAULT_IO_BUFFER_ALIGNMENT {
-            if let Some(test_align) = utils::env::var(env_var_name) {
-                if is_zero_or_power_of_two(test_align) {
-                    test_align
-                } else {
-                    panic!("IO buffer alignment ({test_align}) is not a power of two");
-                }
+        if let Some(test_align) = utils::env::var(env_var_name) {
+            if is_zero_or_power_of_two(test_align) {
+                test_align
             } else {
-                crate::config::defaults::DEFAULT_IO_BUFFER_ALIGNMENT
+                panic!("IO buffer alignment ({test_align}) is not a power of two");
             }
         } else {
             align
@@ -1219,11 +1215,7 @@ pub(crate) fn get_io_buffer_alignment_raw() -> usize {
 /// This function should be used for getting the actual alignment value to use.
 pub(crate) fn get_io_buffer_alignment() -> usize {
     let align = get_io_buffer_alignment_raw();
-    if align == DEFAULT_IO_BUFFER_ALIGNMENT {
-        1
-    } else {
-        align
-    }
+    align.max(1)
 }
 
 #[cfg(test)]

From 05caaab8504093f708c81fd01454c8da45a4901d Mon Sep 17 00:00:00 2001
From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com>
Date: Sat, 31 Aug 2024 05:22:26 +0800
Subject: [PATCH 067/159] fix(pageserver): fire layer eviction alert only when
 it's visible (#8882)

The pull request https://github.com/neondatabase/neon/pull/8679
explicitly mentioned that it will evict layers earlier than before.
Given that the eviction metrics is solely based on eviction threshold
(which is 86400s now), we should consider the early eviction and do not
fire alert if it's a covered layer.

## Summary of changes

Record eviction timer only when the layer is visible + accessed.

Signed-off-by: Alex Chi Z <chi@neon.tech>
---
 pageserver/src/tenant/storage_layer/layer.rs | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs
index 53bb66b95e..86a200ce28 100644
--- a/pageserver/src/tenant/storage_layer/layer.rs
+++ b/pageserver/src/tenant/storage_layer/layer.rs
@@ -1494,8 +1494,9 @@ impl LayerInner {
                 let duration = SystemTime::now().duration_since(local_layer_mtime);
                 match duration {
                     Ok(elapsed) => {
-                        let accessed = self.access_stats.accessed();
-                        if accessed {
+                        let accessed_and_visible = self.access_stats.accessed()
+                            && self.access_stats.visibility() == LayerVisibilityHint::Visible;
+                        if accessed_and_visible {
                             // Only layers used for reads contribute to our "low residence" metric that is used
                             // to detect thrashing.  Layers promoted for other reasons (e.g. compaction) are allowed
                             // to be rapidly evicted without contributing to this metric.
@@ -1509,7 +1510,7 @@ impl LayerInner {
 
                         tracing::info!(
                             residence_millis = elapsed.as_millis(),
-                            accessed,
+                            accessed_and_visible,
                             "evicted layer after known residence period"
                         );
                     }

From 3ec785f30d248739daba93d10353187ca733da0b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Arpad=20M=C3=BCller?= <arpad-m@users.noreply.github.com>
Date: Sat, 31 Aug 2024 01:12:25 +0200
Subject: [PATCH 068/159] Add safekeeper scrubber test (#8785)

The test is very rudimentary, it only checks that before and after
tenant deletion, we can run `scan_metadata` for the safekeeper node
kind. Also, we don't actually expect any uploaded data, for that we
don't have enough WAL (needs to create at least one S3-uploaded file,
the scrubber doesn't recognize partial files yet).

The `scan_metadata` scrubber subcommand is extended to support either
specifying a database connection string, which was previously the only
way, and required a database to be present, or specifying the timeline
information manually via json. This is ideal for testing scenarios
because in those, the number of timelines is usually limited,
but it is involved to spin up a database just to write the timeline
information.
---
 storage_scrubber/src/main.rs                  |  32 ++--
 .../src/scan_safekeeper_metadata.rs           | 156 ++++++++++++------
 test_runner/fixtures/neon_fixtures.py         |  46 +++++-
 test_runner/regress/test_tenant_delete.py     |  52 +++++-
 4 files changed, 213 insertions(+), 73 deletions(-)

diff --git a/storage_scrubber/src/main.rs b/storage_scrubber/src/main.rs
index 3935e513e3..c5961753c5 100644
--- a/storage_scrubber/src/main.rs
+++ b/storage_scrubber/src/main.rs
@@ -1,4 +1,4 @@
-use anyhow::{anyhow, bail};
+use anyhow::{anyhow, bail, Context};
 use camino::Utf8PathBuf;
 use pageserver_api::controller_api::{MetadataHealthUpdateRequest, MetadataHealthUpdateResponse};
 use pageserver_api::shard::TenantShardId;
@@ -7,6 +7,7 @@ use storage_controller_client::control_api;
 use storage_scrubber::garbage::{find_garbage, purge_garbage, PurgeMode};
 use storage_scrubber::pageserver_physical_gc::GcMode;
 use storage_scrubber::scan_pageserver_metadata::scan_pageserver_metadata;
+use storage_scrubber::scan_safekeeper_metadata::DatabaseOrList;
 use storage_scrubber::tenant_snapshot::SnapshotDownloader;
 use storage_scrubber::{find_large_objects, ControllerClientConfig};
 use storage_scrubber::{
@@ -76,6 +77,9 @@ enum Command {
         /// For safekeeper node_kind only, table in the db with debug dump
         #[arg(long, default_value = None)]
         dump_db_table: Option<String>,
+        /// For safekeeper node_kind only, json list of timelines and their lsn info
+        #[arg(long, default_value = None)]
+        timeline_lsns: Option<String>,
     },
     TenantSnapshot {
         #[arg(long = "tenant-id")]
@@ -155,20 +159,22 @@ async fn main() -> anyhow::Result<()> {
             post_to_storcon,
             dump_db_connstr,
             dump_db_table,
+            timeline_lsns,
         } => {
             if let NodeKind::Safekeeper = node_kind {
-                let dump_db_connstr =
-                    dump_db_connstr.ok_or(anyhow::anyhow!("dump_db_connstr not specified"))?;
-                let dump_db_table =
-                    dump_db_table.ok_or(anyhow::anyhow!("dump_db_table not specified"))?;
-
-                let summary = scan_safekeeper_metadata(
-                    bucket_config.clone(),
-                    tenant_ids.iter().map(|tshid| tshid.tenant_id).collect(),
-                    dump_db_connstr,
-                    dump_db_table,
-                )
-                .await?;
+                let db_or_list = match (timeline_lsns, dump_db_connstr) {
+                    (Some(timeline_lsns), _) => {
+                        let timeline_lsns = serde_json::from_str(&timeline_lsns).context("parsing timeline_lsns")?;
+                        DatabaseOrList::List(timeline_lsns)
+                    }
+                    (None, Some(dump_db_connstr)) => {
+                        let dump_db_table = dump_db_table.ok_or_else(|| anyhow::anyhow!("dump_db_table not specified"))?;
+                        let tenant_ids = tenant_ids.iter().map(|tshid| tshid.tenant_id).collect();
+                        DatabaseOrList::Database { tenant_ids, connstr: dump_db_connstr, table: dump_db_table }
+                    }
+                    (None, None) => anyhow::bail!("neither `timeline_lsns` specified, nor `dump_db_connstr` and `dump_db_table`"),
+                };
+                let summary = scan_safekeeper_metadata(bucket_config.clone(), db_or_list).await?;
                 if json {
                     println!("{}", serde_json::to_string(&summary).unwrap())
                 } else {
diff --git a/storage_scrubber/src/scan_safekeeper_metadata.rs b/storage_scrubber/src/scan_safekeeper_metadata.rs
index 1a9f3d0ef5..15f3665fac 100644
--- a/storage_scrubber/src/scan_safekeeper_metadata.rs
+++ b/storage_scrubber/src/scan_safekeeper_metadata.rs
@@ -7,7 +7,7 @@ use postgres_ffi::{XLogFileName, PG_TLI};
 use remote_storage::GenericRemoteStorage;
 use serde::Serialize;
 use tokio_postgres::types::PgLsn;
-use tracing::{error, info, trace};
+use tracing::{debug, error, info};
 use utils::{
     id::{TenantId, TenantTimelineId, TimelineId},
     lsn::Lsn,
@@ -54,6 +54,23 @@ impl MetadataSummary {
     }
 }
 
+#[derive(serde::Deserialize)]
+pub struct TimelineLsnData {
+    tenant_id: String,
+    timeline_id: String,
+    timeline_start_lsn: Lsn,
+    backup_lsn: Lsn,
+}
+
+pub enum DatabaseOrList {
+    Database {
+        tenant_ids: Vec<TenantId>,
+        connstr: String,
+        table: String,
+    },
+    List(Vec<TimelineLsnData>),
+}
+
 /// Scan the safekeeper metadata in an S3 bucket, reporting errors and
 /// statistics.
 ///
@@ -63,68 +80,39 @@ impl MetadataSummary {
 /// the project wasn't deleted in the meanwhile.
 pub async fn scan_safekeeper_metadata(
     bucket_config: BucketConfig,
-    tenant_ids: Vec<TenantId>,
-    dump_db_connstr: String,
-    dump_db_table: String,
+    db_or_list: DatabaseOrList,
 ) -> anyhow::Result<MetadataSummary> {
     info!(
-        "checking bucket {}, region {}, dump_db_table {}",
-        bucket_config.bucket, bucket_config.region, dump_db_table
+        "checking bucket {}, region {}",
+        bucket_config.bucket, bucket_config.region
     );
-    // Use rustls (Neon requires TLS)
-    let root_store = TLS_ROOTS.get_or_try_init(load_certs)?.clone();
-    let client_config = rustls::ClientConfig::builder()
-        .with_root_certificates(root_store)
-        .with_no_client_auth();
-    let tls_connector = tokio_postgres_rustls::MakeRustlsConnect::new(client_config);
-    let (client, connection) = tokio_postgres::connect(&dump_db_connstr, tls_connector).await?;
-    // The connection object performs the actual communication with the database,
-    // so spawn it off to run on its own.
-    tokio::spawn(async move {
-        if let Err(e) = connection.await {
-            eprintln!("connection error: {}", e);
-        }
-    });
-
-    let tenant_filter_clause = if !tenant_ids.is_empty() {
-        format!(
-            "and tenant_id in ({})",
-            tenant_ids
-                .iter()
-                .map(|t| format!("'{}'", t))
-                .collect::<Vec<_>>()
-                .join(", ")
-        )
-    } else {
-        "".to_owned()
-    };
-    let query = format!(
-        "select tenant_id, timeline_id, min(timeline_start_lsn), max(backup_lsn) from \"{}\" where not is_cancelled {} group by tenant_id, timeline_id;",
-        dump_db_table, tenant_filter_clause,
-    );
-    info!("query is {}", query);
-    let timelines = client.query(&query, &[]).await?;
-    info!("loaded {} timelines", timelines.len());
 
     let (remote_client, target) = init_remote(bucket_config, NodeKind::Safekeeper).await?;
     let console_config = ConsoleConfig::from_env()?;
     let cloud_admin_api_client = CloudAdminApiClient::new(console_config);
 
-    let checks = futures::stream::iter(timelines.iter().map(Ok)).map_ok(|row| {
-        let tenant_id = TenantId::from_str(row.get(0)).expect("failed to parse tenant_id");
-        let timeline_id = TimelineId::from_str(row.get(1)).expect("failed to parse tenant_id");
-        let timeline_start_lsn_pg: PgLsn = row.get(2);
-        let timeline_start_lsn: Lsn = Lsn(u64::from(timeline_start_lsn_pg));
-        let backup_lsn_pg: PgLsn = row.get(3);
-        let backup_lsn: Lsn = Lsn(u64::from(backup_lsn_pg));
+    let timelines = match db_or_list {
+        DatabaseOrList::Database {
+            tenant_ids,
+            connstr,
+            table,
+        } => load_timelines_from_db(tenant_ids, connstr, table).await?,
+        DatabaseOrList::List(list) => list,
+    };
+    info!("loaded {} timelines", timelines.len());
+
+    let checks = futures::stream::iter(timelines.into_iter().map(Ok)).map_ok(|timeline| {
+        let tenant_id = TenantId::from_str(&timeline.tenant_id).expect("failed to parse tenant_id");
+        let timeline_id =
+            TimelineId::from_str(&timeline.timeline_id).expect("failed to parse tenant_id");
         let ttid = TenantTimelineId::new(tenant_id, timeline_id);
         check_timeline(
             &remote_client,
             &target,
             &cloud_admin_api_client,
             ttid,
-            timeline_start_lsn,
-            backup_lsn,
+            timeline.timeline_start_lsn,
+            timeline.backup_lsn,
         )
     });
     // Run multiple check_timeline's concurrently.
@@ -163,11 +151,9 @@ async fn check_timeline(
     timeline_start_lsn: Lsn,
     backup_lsn: Lsn,
 ) -> anyhow::Result<TimelineCheckResult> {
-    trace!(
+    debug!(
         "checking ttid {}, should contain WAL [{}-{}]",
-        ttid,
-        timeline_start_lsn,
-        backup_lsn
+        ttid, timeline_start_lsn, backup_lsn
     );
     // calculate expected segfiles
     let expected_first_segno = timeline_start_lsn.segment_number(WAL_SEGSIZE);
@@ -177,7 +163,7 @@ async fn check_timeline(
             .map(|segno| XLogFileName(PG_TLI, segno, WAL_SEGSIZE)),
     );
     let expected_files_num = expected_segfiles.len();
-    trace!("expecting {} files", expected_segfiles.len(),);
+    debug!("expecting {} files", expected_segfiles.len(),);
 
     // now list s3 and check if it misses something
     let ttshid =
@@ -252,3 +238,65 @@ fn load_certs() -> Result<Arc<rustls::RootCertStore>, std::io::Error> {
     Ok(Arc::new(store))
 }
 static TLS_ROOTS: OnceCell<Arc<rustls::RootCertStore>> = OnceCell::new();
+
+async fn load_timelines_from_db(
+    tenant_ids: Vec<TenantId>,
+    dump_db_connstr: String,
+    dump_db_table: String,
+) -> anyhow::Result<Vec<TimelineLsnData>> {
+    info!("loading from table {dump_db_table}");
+
+    // Use rustls (Neon requires TLS)
+    let root_store = TLS_ROOTS.get_or_try_init(load_certs)?.clone();
+    let client_config = rustls::ClientConfig::builder()
+        .with_root_certificates(root_store)
+        .with_no_client_auth();
+    let tls_connector = tokio_postgres_rustls::MakeRustlsConnect::new(client_config);
+    let (client, connection) = tokio_postgres::connect(&dump_db_connstr, tls_connector).await?;
+    // The connection object performs the actual communication with the database,
+    // so spawn it off to run on its own.
+    tokio::spawn(async move {
+        if let Err(e) = connection.await {
+            eprintln!("connection error: {}", e);
+        }
+    });
+
+    let tenant_filter_clause = if !tenant_ids.is_empty() {
+        format!(
+            "and tenant_id in ({})",
+            tenant_ids
+                .iter()
+                .map(|t| format!("'{}'", t))
+                .collect::<Vec<_>>()
+                .join(", ")
+        )
+    } else {
+        "".to_owned()
+    };
+    let query = format!(
+        "select tenant_id, timeline_id, min(timeline_start_lsn), max(backup_lsn) \
+        from \"{dump_db_table}\" \
+        where not is_cancelled {tenant_filter_clause} \
+        group by tenant_id, timeline_id;"
+    );
+    info!("query is {}", query);
+    let timelines = client.query(&query, &[]).await?;
+
+    let timelines = timelines
+        .into_iter()
+        .map(|row| {
+            let tenant_id = row.get(0);
+            let timeline_id = row.get(1);
+            let timeline_start_lsn_pg: PgLsn = row.get(2);
+            let backup_lsn_pg: PgLsn = row.get(3);
+
+            TimelineLsnData {
+                tenant_id,
+                timeline_id,
+                timeline_start_lsn: Lsn(u64::from(timeline_start_lsn_pg)),
+                backup_lsn: Lsn(u64::from(backup_lsn_pg)),
+            }
+        })
+        .collect::<Vec<TimelineLsnData>>();
+    Ok(timelines)
+}
diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index 69a4234617..800ae03d13 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -4625,12 +4625,20 @@ class Safekeeper(LogUtils):
         wait_until(20, 0.5, paused)
 
 
+# TODO: Replace with `StrEnum` when we upgrade to python 3.11
+class NodeKind(str, Enum):
+    PAGESERVER = "pageserver"
+    SAFEKEEPER = "safekeeper"
+
+
 class StorageScrubber:
     def __init__(self, env: NeonEnv, log_dir: Path):
         self.env = env
         self.log_dir = log_dir
 
-    def scrubber_cli(self, args: list[str], timeout) -> str:
+    def scrubber_cli(
+        self, args: list[str], timeout, extra_env: Optional[Dict[str, str]] = None
+    ) -> str:
         assert isinstance(self.env.pageserver_remote_storage, S3Storage)
         s3_storage = self.env.pageserver_remote_storage
 
@@ -4645,6 +4653,9 @@ class StorageScrubber:
         if s3_storage.endpoint is not None:
             env.update({"AWS_ENDPOINT_URL": s3_storage.endpoint})
 
+        if extra_env is not None:
+            env.update(extra_env)
+
         base_args = [
             str(self.env.neon_binpath / "storage_scrubber"),
             f"--controller-api={self.env.storage_controller.api_root()}",
@@ -4672,18 +4683,43 @@ class StorageScrubber:
         assert stdout is not None
         return stdout
 
-    def scan_metadata(self, post_to_storage_controller: bool = False) -> Tuple[bool, Any]:
+    def scan_metadata_safekeeper(
+        self,
+        timeline_lsns: List[Dict[str, Any]],
+        cloud_admin_api_url: str,
+        cloud_admin_api_token: str,
+    ) -> Tuple[bool, Any]:
+        extra_env = {
+            "CLOUD_ADMIN_API_URL": cloud_admin_api_url,
+            "CLOUD_ADMIN_API_TOKEN": cloud_admin_api_token,
+        }
+        return self.scan_metadata(
+            node_kind=NodeKind.SAFEKEEPER, timeline_lsns=timeline_lsns, extra_env=extra_env
+        )
+
+    def scan_metadata(
+        self,
+        post_to_storage_controller: bool = False,
+        node_kind: NodeKind = NodeKind.PAGESERVER,
+        timeline_lsns: Optional[List[Dict[str, Any]]] = None,
+        extra_env: Optional[Dict[str, str]] = None,
+    ) -> Tuple[bool, Any]:
         """
         Returns the health status and the metadata summary.
         """
-        args = ["scan-metadata", "--node-kind", "pageserver", "--json"]
+        args = ["scan-metadata", "--node-kind", node_kind.value, "--json"]
         if post_to_storage_controller:
             args.append("--post")
-        stdout = self.scrubber_cli(args, timeout=30)
+        if timeline_lsns is not None:
+            args.append("--timeline-lsns")
+            args.append(json.dumps(timeline_lsns))
+        stdout = self.scrubber_cli(args, timeout=30, extra_env=extra_env)
 
         try:
             summary = json.loads(stdout)
-            healthy = not summary["with_errors"] and not summary["with_warnings"]
+            # summary does not contain "with_warnings" if node_kind is the safekeeper
+            no_warnings = "with_warnings" not in summary or not summary["with_warnings"]
+            healthy = not summary["with_errors"] and no_warnings
             return healthy, summary
         except:
             log.error("Failed to decode JSON output from `scan-metadata`.  Dumping stdout:")
diff --git a/test_runner/regress/test_tenant_delete.py b/test_runner/regress/test_tenant_delete.py
index 448a28dc31..7ee949e8d3 100644
--- a/test_runner/regress/test_tenant_delete.py
+++ b/test_runner/regress/test_tenant_delete.py
@@ -1,7 +1,9 @@
+import json
 from threading import Thread
 
 import pytest
 from fixtures.common_types import Lsn, TenantId, TimelineId
+from fixtures.log_helper import log
 from fixtures.neon_fixtures import (
     NeonEnvBuilder,
     PgBin,
@@ -17,6 +19,8 @@ from fixtures.pageserver.utils import (
 from fixtures.remote_storage import RemoteStorageKind, s3_storage
 from fixtures.utils import run_pg_bench_small, wait_until
 from requests.exceptions import ReadTimeout
+from werkzeug.wrappers.request import Request
+from werkzeug.wrappers.response import Response
 
 
 def error_tolerant_delete(ps_http, tenant_id):
@@ -322,7 +326,7 @@ def test_tenant_delete_races_timeline_creation(neon_env_builder: NeonEnvBuilder)
     env.pageserver.stop()
 
 
-def test_tenant_delete_scrubber(pg_bin: PgBin, neon_env_builder: NeonEnvBuilder):
+def test_tenant_delete_scrubber(pg_bin: PgBin, make_httpserver, neon_env_builder: NeonEnvBuilder):
     """
     Validate that creating and then deleting the tenant both survives the scrubber,
     and that one can run the scrubber without problems.
@@ -347,6 +351,45 @@ def test_tenant_delete_scrubber(pg_bin: PgBin, neon_env_builder: NeonEnvBuilder)
     healthy, _ = env.storage_scrubber.scan_metadata()
     assert healthy
 
+    timeline_lsns = {
+        "tenant_id": f"{tenant_id}",
+        "timeline_id": f"{timeline_id}",
+        "timeline_start_lsn": f"{last_flush_lsn}",
+        "backup_lsn": f"{last_flush_lsn}",
+    }
+
+    cloud_admin_url = f"http://{make_httpserver.host}:{make_httpserver.port}/"
+    cloud_admin_token = ""
+
+    def get_branches(request: Request):
+        # Compare definition with `BranchData` struct
+        dummy_data = {
+            "id": "test-branch-id",
+            "created_at": "",  # TODO
+            "updated_at": "",  # TODO
+            "name": "testbranchname",
+            "project_id": "test-project-id",
+            "timeline_id": f"{timeline_id}",
+            "default": False,
+            "deleted": False,
+            "logical_size": 42000,
+            "physical_size": 42000,
+            "written_size": 42000,
+        }
+        # This test does all its own compute configuration (by passing explicit pageserver ID to Workload functions),
+        # so we send controller notifications to /dev/null to prevent it fighting the test for control of the compute.
+        log.info(f"got get_branches request: {request.json}")
+        return Response(json.dumps(dummy_data), content_type="application/json", status=200)
+
+    make_httpserver.expect_request("/branches", method="GET").respond_with_handler(get_branches)
+
+    healthy, _ = env.storage_scrubber.scan_metadata_safekeeper(
+        timeline_lsns=[timeline_lsns],
+        cloud_admin_api_url=cloud_admin_url,
+        cloud_admin_api_token=cloud_admin_token,
+    )
+    assert healthy
+
     env.start()
     ps_http = env.pageserver.http_client()
     ps_http.tenant_delete(tenant_id)
@@ -354,3 +397,10 @@ def test_tenant_delete_scrubber(pg_bin: PgBin, neon_env_builder: NeonEnvBuilder)
 
     healthy, _ = env.storage_scrubber.scan_metadata()
     assert healthy
+
+    healthy, _ = env.storage_scrubber.scan_metadata_safekeeper(
+        timeline_lsns=[timeline_lsns],
+        cloud_admin_api_url=cloud_admin_url,
+        cloud_admin_api_token=cloud_admin_token,
+    )
+    assert healthy

From 516ac0591e762142ca0ce85f212192c5af59a097 Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Mon, 2 Sep 2024 12:36:57 +0100
Subject: [PATCH 069/159] storage controller: eliminate ensure_attached (#8875)

## Problem

This is a followup to #8783

- The old blocking ensure_attached function had been retained to handle
the case where a shard had a None generation_pageserver, but this wasn't
really necessary.
- There was a subtle `.1` in the code where a struct would have been
clearer

Closes #8819

## Summary of changes

- Add ShardGenerationState to represent the results of peek_generation
- Instead of calling ensure_attached when a tenant has a non-attached
shard, check the shard's policy and return 409 if it isn't Attached,
else return 503 if the shard's policy is attached but it hasn't been
reconciled yet (i.e. has a None generation_pageserver)
---
 storage_controller/src/persistence.rs |  22 +++--
 storage_controller/src/service.rs     | 124 +++++++++-----------------
 2 files changed, 58 insertions(+), 88 deletions(-)

diff --git a/storage_controller/src/persistence.rs b/storage_controller/src/persistence.rs
index a842079ce7..6e1c2016ff 100644
--- a/storage_controller/src/persistence.rs
+++ b/storage_controller/src/persistence.rs
@@ -122,6 +122,13 @@ pub(crate) enum TenantFilter {
     Shard(TenantShardId),
 }
 
+/// Represents the results of looking up generation+pageserver for the shards of a tenant
+pub(crate) struct ShardGenerationState {
+    pub(crate) tenant_shard_id: TenantShardId,
+    pub(crate) generation: Option<Generation>,
+    pub(crate) generation_pageserver: Option<NodeId>,
+}
+
 impl Persistence {
     // The default postgres connection limit is 100.  We use up to 99, to leave one free for a human admin under
     // normal circumstances.  This assumes we have exclusive use of the database cluster to which we connect.
@@ -540,7 +547,7 @@ impl Persistence {
     pub(crate) async fn peek_generations(
         &self,
         filter_tenant_id: TenantId,
-    ) -> Result<Vec<(TenantShardId, Option<Generation>, Option<NodeId>)>, DatabaseError> {
+    ) -> Result<Vec<ShardGenerationState>, DatabaseError> {
         use crate::schema::tenant_shards::dsl::*;
         let rows = self
             .with_measured_conn(DatabaseOperation::PeekGenerations, move |conn| {
@@ -555,13 +562,12 @@ impl Persistence {
 
         Ok(rows
             .into_iter()
-            .map(|p| {
-                (
-                    p.get_tenant_shard_id()
-                        .expect("Corrupt tenant shard id in database"),
-                    p.generation.map(|g| Generation::new(g as u32)),
-                    p.generation_pageserver.map(|n| NodeId(n as u64)),
-                )
+            .map(|p| ShardGenerationState {
+                tenant_shard_id: p
+                    .get_tenant_shard_id()
+                    .expect("Corrupt tenant shard id in database"),
+                generation: p.generation.map(|g| Generation::new(g as u32)),
+                generation_pageserver: p.generation_pageserver.map(|n| NodeId(n as u64)),
             })
             .collect())
     }
diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs
index 1f221a9b45..78627953d0 100644
--- a/storage_controller/src/service.rs
+++ b/storage_controller/src/service.rs
@@ -22,7 +22,7 @@ use crate::{
     peer_client::GlobalObservedState,
     persistence::{
         AbortShardSplitStatus, ControllerPersistence, DatabaseResult, MetadataHealthPersistence,
-        TenantFilter,
+        ShardGenerationState, TenantFilter,
     },
     reconciler::{ReconcileError, ReconcileUnits, ReconcilerConfig, ReconcilerConfigBuilder},
     scheduler::{MaySchedule, ScheduleContext, ScheduleMode},
@@ -3106,20 +3106,44 @@ impl Service {
             // will still be the latest when we're done: we will check generations again at the end of
             // this function to handle that.
             let generations = self.persistence.peek_generations(tenant_id).await?;
-            let generations = if generations.iter().any(|i| i.1.is_none()) {
-                // One or more shards is not attached to anything: maybe this is a new tenant?  Wait for
-                // it to reconcile.
-                self.ensure_attached_wait(tenant_id).await?;
-                self.persistence.peek_generations(tenant_id).await?
-            } else {
-                generations
-            };
+
+            if generations
+                .iter()
+                .any(|i| i.generation.is_none() || i.generation_pageserver.is_none())
+            {
+                // One or more shards has not been attached to a pageserver.  Check if this is because it's configured
+                // to be detached (409: caller should give up), or because it's meant to be attached but isn't yet (503: caller should retry)
+                let locked = self.inner.read().unwrap();
+                for (shard_id, shard) in
+                    locked.tenants.range(TenantShardId::tenant_range(tenant_id))
+                {
+                    match shard.policy {
+                        PlacementPolicy::Attached(_) => {
+                            // This shard is meant to be attached: the caller is not wrong to try and
+                            // use this function, but we can't service the request right now.
+                        }
+                        PlacementPolicy::Secondary | PlacementPolicy::Detached => {
+                            return Err(ApiError::Conflict(format!(
+                                "Shard {shard_id} tenant has policy {:?}",
+                                shard.policy
+                            )));
+                        }
+                    }
+                }
+
+                return Err(ApiError::ResourceUnavailable(
+                    "One or more shards in tenant is not yet attached".into(),
+                ));
+            }
 
             let locked = self.inner.read().unwrap();
-            for (tenant_shard_id, generation, generation_pageserver) in generations {
-                let node_id = generation_pageserver.ok_or(ApiError::Conflict(
-                    "Tenant not currently attached".to_string(),
-                ))?;
+            for ShardGenerationState {
+                tenant_shard_id,
+                generation,
+                generation_pageserver,
+            } in generations
+            {
+                let node_id = generation_pageserver.expect("We checked for None above");
                 let node = locked
                     .nodes
                     .get(&node_id)
@@ -3141,7 +3165,13 @@ impl Service {
             let latest_generations = self.persistence.peek_generations(tenant_id).await?;
             if latest_generations
                 .into_iter()
-                .map(|g| (g.0, g.1))
+                .map(
+                    |ShardGenerationState {
+                         tenant_shard_id,
+                         generation,
+                         generation_pageserver: _,
+                     }| (tenant_shard_id, generation),
+                )
                 .collect::<Vec<_>>()
                 != target_gens
                     .into_iter()
@@ -5280,72 +5310,6 @@ impl Service {
         ))
     }
 
-    /// Helper for methods that will try and call pageserver APIs for
-    /// a tenant, such as timeline CRUD: they cannot proceed unless the tenant
-    /// is attached somewhere.
-    fn ensure_attached_schedule(
-        &self,
-        mut locked: std::sync::RwLockWriteGuard<'_, ServiceState>,
-        tenant_id: TenantId,
-    ) -> Result<Vec<ReconcilerWaiter>, anyhow::Error> {
-        let mut waiters = Vec::new();
-        let (nodes, tenants, scheduler) = locked.parts_mut();
-
-        let mut schedule_context = ScheduleContext::default();
-        for (tenant_shard_id, shard) in tenants.range_mut(TenantShardId::tenant_range(tenant_id)) {
-            shard.schedule(scheduler, &mut schedule_context)?;
-
-            // The shard's policies may not result in an attached location being scheduled: this
-            // is an error because our caller needs it attached somewhere.
-            if shard.intent.get_attached().is_none() {
-                return Err(anyhow::anyhow!(
-                    "Tenant {tenant_id} not scheduled to be attached"
-                ));
-            };
-
-            if shard.stably_attached().is_some() {
-                // We do not require the shard to be totally up to date on reconciliation: we just require
-                // that it has been attached on the intended node.   Other dirty state such as unattached secondary
-                // locations, or compute hook notifications can be ignored.
-                continue;
-            }
-
-            if let Some(waiter) = self.maybe_reconcile_shard(shard, nodes) {
-                tracing::info!("Waiting for shard {tenant_shard_id} to reconcile, in order to ensure it is attached");
-                waiters.push(waiter);
-            }
-        }
-        Ok(waiters)
-    }
-
-    async fn ensure_attached_wait(&self, tenant_id: TenantId) -> Result<(), ApiError> {
-        let ensure_waiters = {
-            let locked = self.inner.write().unwrap();
-
-            // Check if the tenant is splitting: in this case, even if it is attached,
-            // we must act as if it is not: this blocks e.g. timeline creation/deletion
-            // operations during the split.
-            for (_shard_id, shard) in locked.tenants.range(TenantShardId::tenant_range(tenant_id)) {
-                if !matches!(shard.splitting, SplitState::Idle) {
-                    return Err(ApiError::ResourceUnavailable(
-                        "Tenant shards are currently splitting".into(),
-                    ));
-                }
-            }
-
-            self.ensure_attached_schedule(locked, tenant_id)
-                .map_err(ApiError::InternalServerError)?
-        };
-
-        let deadline = Instant::now().checked_add(Duration::from_secs(5)).unwrap();
-        for waiter in ensure_waiters {
-            let timeout = deadline.duration_since(Instant::now());
-            waiter.wait_timeout(timeout).await?;
-        }
-
-        Ok(())
-    }
-
     /// Like [`Self::maybe_configured_reconcile_shard`], but uses the default reconciler
     /// configuration
     fn maybe_reconcile_shard(

From 9746b6ea312a15b2d607d5a124ca3899ec953d06 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Arpad=20M=C3=BCller?= <arpad-m@users.noreply.github.com>
Date: Mon, 2 Sep 2024 13:51:45 +0200
Subject: [PATCH 070/159] Implement archival_config timeline endpoint in the
 storage controller (#8680)

Implement the timeline specific `archival_config` endpoint also in the
storage controller.

It's mostly a copy-paste of the detach handler: the task is the same: do
the same operation on all shards.

Part of #8088.
---
 pageserver/client/src/mgmt_api.rs            | 18 +++++
 storage_controller/src/http.rs               | 30 ++++++-
 storage_controller/src/pageserver_client.rs  | 20 ++++-
 storage_controller/src/service.rs            | 73 ++++++++++++++++-
 test_runner/regress/test_timeline_archive.py | 83 +++++++++-----------
 5 files changed, 174 insertions(+), 50 deletions(-)

diff --git a/pageserver/client/src/mgmt_api.rs b/pageserver/client/src/mgmt_api.rs
index 71d36f3113..737cb00835 100644
--- a/pageserver/client/src/mgmt_api.rs
+++ b/pageserver/client/src/mgmt_api.rs
@@ -419,6 +419,24 @@ impl Client {
         }
     }
 
+    pub async fn timeline_archival_config(
+        &self,
+        tenant_shard_id: TenantShardId,
+        timeline_id: TimelineId,
+        req: &TimelineArchivalConfigRequest,
+    ) -> Result<()> {
+        let uri = format!(
+            "{}/v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/archival_config",
+            self.mgmt_api_endpoint
+        );
+
+        self.request(Method::POST, &uri, req)
+            .await?
+            .json()
+            .await
+            .map_err(Error::ReceiveBody)
+    }
+
     pub async fn timeline_detach_ancestor(
         &self,
         tenant_shard_id: TenantShardId,
diff --git a/storage_controller/src/http.rs b/storage_controller/src/http.rs
index 207bd5a1e6..d3eb081be4 100644
--- a/storage_controller/src/http.rs
+++ b/storage_controller/src/http.rs
@@ -17,7 +17,7 @@ use pageserver_api::controller_api::{
 };
 use pageserver_api::models::{
     TenantConfigRequest, TenantLocationConfigRequest, TenantShardSplitRequest,
-    TenantTimeTravelRequest, TimelineCreateRequest,
+    TenantTimeTravelRequest, TimelineArchivalConfigRequest, TimelineCreateRequest,
 };
 use pageserver_api::shard::TenantShardId;
 use pageserver_client::mgmt_api;
@@ -334,6 +334,24 @@ async fn handle_tenant_timeline_delete(
     .await
 }
 
+async fn handle_tenant_timeline_archival_config(
+    service: Arc<Service>,
+    mut req: Request<Body>,
+) -> Result<Response<Body>, ApiError> {
+    let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
+    check_permissions(&req, Scope::PageServerApi)?;
+
+    let timeline_id: TimelineId = parse_request_param(&req, "timeline_id")?;
+
+    let create_req = json_request::<TimelineArchivalConfigRequest>(&mut req).await?;
+
+    service
+        .tenant_timeline_archival_config(tenant_id, timeline_id, create_req)
+        .await?;
+
+    json_response(StatusCode::OK, ())
+}
+
 async fn handle_tenant_timeline_detach_ancestor(
     service: Arc<Service>,
     req: Request<Body>,
@@ -1160,6 +1178,16 @@ pub fn make_router(
                 RequestName("v1_tenant_timeline"),
             )
         })
+        .post(
+            "/v1/tenant/:tenant_id/timeline/:timeline_id/archival_config",
+            |r| {
+                tenant_service_handler(
+                    r,
+                    handle_tenant_timeline_archival_config,
+                    RequestName("v1_tenant_timeline_archival_config"),
+                )
+            },
+        )
         .put(
             "/v1/tenant/:tenant_id/timeline/:timeline_id/detach_ancestor",
             |r| {
diff --git a/storage_controller/src/pageserver_client.rs b/storage_controller/src/pageserver_client.rs
index 8d64201cd9..20770ed703 100644
--- a/storage_controller/src/pageserver_client.rs
+++ b/storage_controller/src/pageserver_client.rs
@@ -2,8 +2,8 @@ use pageserver_api::{
     models::{
         detach_ancestor::AncestorDetached, LocationConfig, LocationConfigListResponse,
         PageserverUtilization, SecondaryProgress, TenantScanRemoteStorageResponse,
-        TenantShardSplitRequest, TenantShardSplitResponse, TimelineCreateRequest, TimelineInfo,
-        TopTenantShardsRequest, TopTenantShardsResponse,
+        TenantShardSplitRequest, TenantShardSplitResponse, TimelineArchivalConfigRequest,
+        TimelineCreateRequest, TimelineInfo, TopTenantShardsRequest, TopTenantShardsResponse,
     },
     shard::TenantShardId,
 };
@@ -227,6 +227,22 @@ impl PageserverClient {
         )
     }
 
+    pub(crate) async fn timeline_archival_config(
+        &self,
+        tenant_shard_id: TenantShardId,
+        timeline_id: TimelineId,
+        req: &TimelineArchivalConfigRequest,
+    ) -> Result<()> {
+        measured_request!(
+            "timeline_archival_config",
+            crate::metrics::Method::Post,
+            &self.node_id_label,
+            self.inner
+                .timeline_archival_config(tenant_shard_id, timeline_id, req)
+                .await
+        )
+    }
+
     pub(crate) async fn timeline_detach_ancestor(
         &self,
         tenant_shard_id: TenantShardId,
diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs
index 78627953d0..95821827e2 100644
--- a/storage_controller/src/service.rs
+++ b/storage_controller/src/service.rs
@@ -46,7 +46,10 @@ use pageserver_api::{
         TenantDescribeResponseShard, TenantLocateResponse, TenantPolicyRequest,
         TenantShardMigrateRequest, TenantShardMigrateResponse,
     },
-    models::{SecondaryProgress, TenantConfigRequest, TopTenantShardsRequest},
+    models::{
+        SecondaryProgress, TenantConfigRequest, TimelineArchivalConfigRequest,
+        TopTenantShardsRequest,
+    },
 };
 use reqwest::StatusCode;
 use tracing::{instrument, Instrument};
@@ -131,6 +134,7 @@ enum TenantOperations {
     TimelineCreate,
     TimelineDelete,
     AttachHook,
+    TimelineArchivalConfig,
     TimelineDetachAncestor,
 }
 
@@ -2918,6 +2922,73 @@ impl Service {
         .await?
     }
 
+    pub(crate) async fn tenant_timeline_archival_config(
+        &self,
+        tenant_id: TenantId,
+        timeline_id: TimelineId,
+        req: TimelineArchivalConfigRequest,
+    ) -> Result<(), ApiError> {
+        tracing::info!(
+            "Setting archival config of timeline {tenant_id}/{timeline_id} to '{:?}'",
+            req.state
+        );
+
+        let _tenant_lock = trace_shared_lock(
+            &self.tenant_op_locks,
+            tenant_id,
+            TenantOperations::TimelineArchivalConfig,
+        )
+        .await;
+
+        self.tenant_remote_mutation(tenant_id, move |targets| async move {
+            if targets.is_empty() {
+                return Err(ApiError::NotFound(
+                    anyhow::anyhow!("Tenant not found").into(),
+                ));
+            }
+            async fn config_one(
+                tenant_shard_id: TenantShardId,
+                timeline_id: TimelineId,
+                node: Node,
+                jwt: Option<String>,
+                req: TimelineArchivalConfigRequest,
+            ) -> Result<(), ApiError> {
+                tracing::info!(
+                    "Setting archival config of timeline on shard {tenant_shard_id}/{timeline_id}, attached to node {node}",
+                );
+
+                let client = PageserverClient::new(node.get_id(), node.base_url(), jwt.as_deref());
+
+                client
+                    .timeline_archival_config(tenant_shard_id, timeline_id, &req)
+                    .await
+                    .map_err(|e| match e {
+                        mgmt_api::Error::ApiError(StatusCode::PRECONDITION_FAILED, msg) => {
+                            ApiError::PreconditionFailed(msg.into_boxed_str())
+                        }
+                        _ => passthrough_api_error(&node, e),
+                    })
+            }
+
+            // no shard needs to go first/last; the operation should be idempotent
+            // TODO: it would be great to ensure that all shards return the same error
+            let results = self
+                .tenant_for_shards(targets, |tenant_shard_id, node| {
+                    futures::FutureExt::boxed(config_one(
+                        tenant_shard_id,
+                        timeline_id,
+                        node,
+                        self.config.jwt_token.clone(),
+                        req.clone(),
+                    ))
+                })
+                .await?;
+            assert!(!results.is_empty(), "must have at least one result");
+
+            Ok(())
+        }).await?
+    }
+
     pub(crate) async fn tenant_timeline_detach_ancestor(
         &self,
         tenant_id: TenantId,
diff --git a/test_runner/regress/test_timeline_archive.py b/test_runner/regress/test_timeline_archive.py
index 7f158ad251..de43e51c9e 100644
--- a/test_runner/regress/test_timeline_archive.py
+++ b/test_runner/regress/test_timeline_archive.py
@@ -1,97 +1,90 @@
 import pytest
 from fixtures.common_types import TenantId, TimelineArchivalState, TimelineId
 from fixtures.neon_fixtures import (
-    NeonEnv,
+    NeonEnvBuilder,
 )
 from fixtures.pageserver.http import PageserverApiException
 
 
-def test_timeline_archive(neon_simple_env: NeonEnv):
-    env = neon_simple_env
+@pytest.mark.parametrize("shard_count", [0, 4])
+def test_timeline_archive(neon_env_builder: NeonEnvBuilder, shard_count: int):
+    unsharded = shard_count == 0
+    if unsharded:
+        env = neon_env_builder.init_start()
+        # If we run the unsharded version, talk to the pageserver directly
+        ps_http = env.pageserver.http_client()
+    else:
+        neon_env_builder.num_pageservers = shard_count
+        env = neon_env_builder.init_start(initial_tenant_shard_count=shard_count)
+        # If we run the unsharded version, talk to the storage controller
+        ps_http = env.storage_controller.pageserver_api()
 
-    env.pageserver.allowed_errors.extend(
-        [
-            ".*Timeline .* was not found.*",
-            ".*timeline not found.*",
-            ".*Cannot archive timeline which has unarchived child timelines.*",
-            ".*Precondition failed: Requested tenant is missing.*",
-        ]
-    )
-
-    ps_http = env.pageserver.http_client()
-
-    # first try to archive non existing timeline
-    # for existing tenant:
+    # first try to archive a non existing timeline for an existing tenant:
     invalid_timeline_id = TimelineId.generate()
     with pytest.raises(PageserverApiException, match="timeline not found") as exc:
         ps_http.timeline_archival_config(
-            tenant_id=env.initial_tenant,
-            timeline_id=invalid_timeline_id,
+            env.initial_tenant,
+            invalid_timeline_id,
             state=TimelineArchivalState.ARCHIVED,
         )
 
     assert exc.value.status_code == 404
 
-    # for non existing tenant:
+    # for a non existing tenant:
     invalid_tenant_id = TenantId.generate()
     with pytest.raises(
         PageserverApiException,
-        match=f"NotFound: tenant {invalid_tenant_id}",
+        match="NotFound: [tT]enant",
     ) as exc:
         ps_http.timeline_archival_config(
-            tenant_id=invalid_tenant_id,
-            timeline_id=invalid_timeline_id,
+            invalid_tenant_id,
+            invalid_timeline_id,
             state=TimelineArchivalState.ARCHIVED,
         )
 
     assert exc.value.status_code == 404
 
-    # construct pair of branches to validate that pageserver prohibits
+    # construct a pair of branches to validate that pageserver prohibits
     # archival of ancestor timelines when they have non-archived child branches
-    parent_timeline_id = env.neon_cli.create_branch("test_ancestor_branch_archive_parent", "empty")
+    parent_timeline_id = env.neon_cli.create_branch("test_ancestor_branch_archive_parent")
 
     leaf_timeline_id = env.neon_cli.create_branch(
         "test_ancestor_branch_archive_branch1", "test_ancestor_branch_archive_parent"
     )
 
-    timeline_path = env.pageserver.timeline_dir(env.initial_tenant, parent_timeline_id)
-
     with pytest.raises(
         PageserverApiException,
         match="Cannot archive timeline which has non-archived child timelines",
     ) as exc:
-        assert timeline_path.exists()
-
         ps_http.timeline_archival_config(
-            tenant_id=env.initial_tenant,
-            timeline_id=parent_timeline_id,
+            env.initial_tenant,
+            parent_timeline_id,
             state=TimelineArchivalState.ARCHIVED,
         )
 
     assert exc.value.status_code == 412
 
-    # Test timeline_detail
     leaf_detail = ps_http.timeline_detail(
-        tenant_id=env.initial_tenant,
+        env.initial_tenant,
         timeline_id=leaf_timeline_id,
     )
     assert leaf_detail["is_archived"] is False
 
     # Test that archiving the leaf timeline and then the parent works
     ps_http.timeline_archival_config(
-        tenant_id=env.initial_tenant,
-        timeline_id=leaf_timeline_id,
+        env.initial_tenant,
+        leaf_timeline_id,
         state=TimelineArchivalState.ARCHIVED,
     )
     leaf_detail = ps_http.timeline_detail(
-        tenant_id=env.initial_tenant,
-        timeline_id=leaf_timeline_id,
+        env.initial_tenant,
+        leaf_timeline_id,
     )
     assert leaf_detail["is_archived"] is True
 
     ps_http.timeline_archival_config(
-        tenant_id=env.initial_tenant,
-        timeline_id=parent_timeline_id,
+        env.initial_tenant,
+        parent_timeline_id,
         state=TimelineArchivalState.ARCHIVED,
     )
 
@@ -100,23 +93,21 @@ def test_timeline_archive(neon_simple_env: NeonEnv):
         PageserverApiException,
         match="ancestor is archived",
     ) as exc:
-        assert timeline_path.exists()
-
         ps_http.timeline_archival_config(
-            tenant_id=env.initial_tenant,
-            timeline_id=leaf_timeline_id,
+            env.initial_tenant,
+            leaf_timeline_id,
             state=TimelineArchivalState.UNARCHIVED,
         )
 
     # Unarchive works for the leaf if the parent gets unarchived first
     ps_http.timeline_archival_config(
-        tenant_id=env.initial_tenant,
-        timeline_id=parent_timeline_id,
+        env.initial_tenant,
+        parent_timeline_id,
         state=TimelineArchivalState.UNARCHIVED,
     )
 
     ps_http.timeline_archival_config(
-        tenant_id=env.initial_tenant,
-        timeline_id=leaf_timeline_id,
+        env.initial_tenant,
+        leaf_timeline_id,
         state=TimelineArchivalState.UNARCHIVED,
     )

From 15e90cc427aad8f9ded4e0c13a283217631cbd07 Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Mon, 2 Sep 2024 15:45:17 +0200
Subject: [PATCH 071/159] bottommost-compaction: remove dead code / rectify
 cfg!()s (#8884)

part of https://github.com/neondatabase/neon/issues/8002
---
 .../src/tenant/storage_layer/delta_layer.rs   | 40 -------------------
 .../src/tenant/storage_layer/image_layer.rs   | 32 ++-------------
 pageserver/src/tenant/storage_layer/layer.rs  | 31 --------------
 .../src/tenant/storage_layer/layer/tests.rs   |  2 +-
 .../src/tenant/storage_layer/split_writer.rs  |  2 +-
 pageserver/src/tenant/timeline.rs             |  1 -
 6 files changed, 5 insertions(+), 103 deletions(-)

diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs
index 885eb13b29..b8e9a98149 100644
--- a/pageserver/src/tenant/storage_layer/delta_layer.rs
+++ b/pageserver/src/tenant/storage_layer/delta_layer.rs
@@ -225,9 +225,7 @@ pub struct DeltaLayerInner {
     file: VirtualFile,
     file_id: FileId,
 
-    #[allow(dead_code)]
     layer_key_range: Range<Key>,
-    #[allow(dead_code)]
     layer_lsn_range: Range<Lsn>,
 
     max_vectored_read_bytes: Option<MaxVectoredReadBytes>,
@@ -882,44 +880,6 @@ impl DeltaLayerInner {
         Ok(())
     }
 
-    /// Load all key-values in the delta layer, should be replaced by an iterator-based interface in the future.
-    pub(super) async fn load_key_values(
-        &self,
-        ctx: &RequestContext,
-    ) -> anyhow::Result<Vec<(Key, Lsn, Value)>> {
-        let block_reader = FileBlockReader::new(&self.file, self.file_id);
-        let index_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
-            self.index_start_blk,
-            self.index_root_blk,
-            block_reader,
-        );
-        let mut result = Vec::new();
-        let mut stream =
-            Box::pin(self.stream_index_forwards(index_reader, &[0; DELTA_KEY_SIZE], ctx));
-        let block_reader = FileBlockReader::new(&self.file, self.file_id);
-        let cursor = block_reader.block_cursor();
-        let mut buf = Vec::new();
-        while let Some(item) = stream.next().await {
-            let (key, lsn, pos) = item?;
-            // TODO: dedup code with get_reconstruct_value
-            // TODO: ctx handling and sharding
-            cursor
-                .read_blob_into_buf(pos.pos(), &mut buf, ctx)
-                .await
-                .with_context(|| {
-                    format!("Failed to read blob from virtual file {}", self.file.path)
-                })?;
-            let val = Value::des(&buf).with_context(|| {
-                format!(
-                    "Failed to deserialize file blob from virtual file {}",
-                    self.file.path
-                )
-            })?;
-            result.push((key, lsn, val));
-        }
-        Ok(result)
-    }
-
     async fn plan_reads<Reader>(
         keyspace: &KeySpace,
         lsn_range: Range<Lsn>,
diff --git a/pageserver/src/tenant/storage_layer/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs
index 4c22541e02..4a095c564d 100644
--- a/pageserver/src/tenant/storage_layer/image_layer.rs
+++ b/pageserver/src/tenant/storage_layer/image_layer.rs
@@ -28,7 +28,7 @@ use crate::context::{PageContentKind, RequestContext, RequestContextBuilder};
 use crate::page_cache::{self, FileId, PAGE_SZ};
 use crate::repository::{Key, Value, KEY_SIZE};
 use crate::tenant::blob_io::BlobWriter;
-use crate::tenant::block_io::{BlockBuf, BlockReader, FileBlockReader};
+use crate::tenant::block_io::{BlockBuf, FileBlockReader};
 use crate::tenant::disk_btree::{
     DiskBtreeBuilder, DiskBtreeIterator, DiskBtreeReader, VisitDirection,
 };
@@ -453,33 +453,6 @@ impl ImageLayerInner {
         Ok(())
     }
 
-    /// Load all key-values in the delta layer, should be replaced by an iterator-based interface in the future.
-    pub(super) async fn load_key_values(
-        &self,
-        ctx: &RequestContext,
-    ) -> anyhow::Result<Vec<(Key, Lsn, Value)>> {
-        let block_reader = FileBlockReader::new(&self.file, self.file_id);
-        let tree_reader =
-            DiskBtreeReader::new(self.index_start_blk, self.index_root_blk, &block_reader);
-        let mut result = Vec::new();
-        let mut stream = Box::pin(tree_reader.into_stream(&[0; KEY_SIZE], ctx));
-        let block_reader = FileBlockReader::new(&self.file, self.file_id);
-        let cursor = block_reader.block_cursor();
-        while let Some(item) = stream.next().await {
-            // TODO: dedup code with get_reconstruct_value
-            let (raw_key, offset) = item?;
-            let key = Key::from_slice(&raw_key[..KEY_SIZE]);
-            // TODO: ctx handling and sharding
-            let blob = cursor
-                .read_blob(offset, ctx)
-                .await
-                .with_context(|| format!("failed to read value from offset {}", offset))?;
-            let value = Bytes::from(blob);
-            result.push((key, self.lsn, Value::Image(value)));
-        }
-        Ok(result)
-    }
-
     /// Traverse the layer's index to build read operations on the overlap of the input keyspace
     /// and the keys in this layer.
     ///
@@ -711,7 +684,7 @@ struct ImageLayerWriterInner {
     blob_writer: BlobWriter<false>,
     tree: DiskBtreeBuilder<BlockBuf, KEY_SIZE>,
 
-    #[cfg_attr(not(feature = "testing"), allow(dead_code))]
+    #[cfg(feature = "testing")]
     last_written_key: Key,
 }
 
@@ -770,6 +743,7 @@ impl ImageLayerWriterInner {
             uncompressed_bytes_eligible: 0,
             uncompressed_bytes_chosen: 0,
             num_keys: 0,
+            #[cfg(feature = "testing")]
             last_written_key: Key::MIN,
         };
 
diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs
index 86a200ce28..56f5cc556d 100644
--- a/pageserver/src/tenant/storage_layer/layer.rs
+++ b/pageserver/src/tenant/storage_layer/layer.rs
@@ -14,7 +14,6 @@ use utils::sync::{gate, heavier_once_cell};
 
 use crate::config::PageServerConf;
 use crate::context::{DownloadBehavior, RequestContext};
-use crate::repository::Key;
 use crate::span::debug_assert_current_span_has_tenant_and_timeline_id;
 use crate::task_mgr::TaskKind;
 use crate::tenant::timeline::{CompactionError, GetVectoredError};
@@ -334,23 +333,6 @@ impl Layer {
             })
     }
 
-    /// Get all key/values in the layer. Should be replaced with an iterator-based API in the future.
-    #[allow(dead_code)]
-    pub(crate) async fn load_key_values(
-        &self,
-        ctx: &RequestContext,
-    ) -> anyhow::Result<Vec<(Key, Lsn, crate::repository::Value)>> {
-        let layer = self
-            .0
-            .get_or_maybe_download(true, Some(ctx))
-            .await
-            .map_err(|err| match err {
-                DownloadError::DownloadCancelled => GetVectoredError::Cancelled,
-                other => GetVectoredError::Other(anyhow::anyhow!(other)),
-            })?;
-        layer.load_key_values(&self.0, ctx).await
-    }
-
     /// Download the layer if evicted.
     ///
     /// Will not error when the layer is already downloaded.
@@ -1777,19 +1759,6 @@ impl DownloadedLayer {
         }
     }
 
-    async fn load_key_values(
-        &self,
-        owner: &Arc<LayerInner>,
-        ctx: &RequestContext,
-    ) -> anyhow::Result<Vec<(Key, Lsn, crate::repository::Value)>> {
-        use LayerKind::*;
-
-        match self.get(owner, ctx).await? {
-            Delta(d) => d.load_key_values(ctx).await,
-            Image(i) => i.load_key_values(ctx).await,
-        }
-    }
-
     async fn dump(&self, owner: &Arc<LayerInner>, ctx: &RequestContext) -> anyhow::Result<()> {
         use LayerKind::*;
         match self.get(owner, ctx).await? {
diff --git a/pageserver/src/tenant/storage_layer/layer/tests.rs b/pageserver/src/tenant/storage_layer/layer/tests.rs
index bffd2db800..0b9bde4f57 100644
--- a/pageserver/src/tenant/storage_layer/layer/tests.rs
+++ b/pageserver/src/tenant/storage_layer/layer/tests.rs
@@ -782,7 +782,7 @@ async fn eviction_cancellation_on_drop() {
         let mut writer = timeline.writer().await;
         writer
             .put(
-                Key::from_i128(5),
+                crate::repository::Key::from_i128(5),
                 Lsn(0x20),
                 &Value::Image(Bytes::from_static(b"this does not matter either")),
                 &ctx,
diff --git a/pageserver/src/tenant/storage_layer/split_writer.rs b/pageserver/src/tenant/storage_layer/split_writer.rs
index df910b5ad9..e8deb0a1e5 100644
--- a/pageserver/src/tenant/storage_layer/split_writer.rs
+++ b/pageserver/src/tenant/storage_layer/split_writer.rs
@@ -353,7 +353,7 @@ impl SplitDeltaLayerWriter {
         Ok(generated_layers)
     }
 
-    #[allow(dead_code)]
+    #[cfg(test)]
     pub(crate) async fn finish(
         self,
         tline: &Arc<Timeline>,
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index 35e0825bac..6eadf9a564 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -4537,7 +4537,6 @@ pub struct DeltaLayerTestDesc {
 
 #[cfg(test)]
 impl DeltaLayerTestDesc {
-    #[allow(dead_code)]
     pub fn new(lsn_range: Range<Lsn>, key_range: Range<Key>, data: Vec<(Key, Lsn, Value)>) -> Self {
         Self {
             lsn_range,

From bf0531d10703e1f6cd92e29ca69a9bb68503121e Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Mon, 2 Sep 2024 16:10:10 +0200
Subject: [PATCH 072/159] fixup(#8839): `test_forward_compatibility` needs to
 allow lag warning as well (#8891)

Found in
https://neon-github-public-dev.s3.amazonaws.com/reports/pr-8885/10665614629/index.html#suites/0fbaeb107ef328d03993d44a1fb15690/ea10ba1c140fba1d
---
 test_runner/regress/test_compatibility.py | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/test_runner/regress/test_compatibility.py b/test_runner/regress/test_compatibility.py
index cd3f405a86..467e5b1734 100644
--- a/test_runner/regress/test_compatibility.py
+++ b/test_runner/regress/test_compatibility.py
@@ -149,6 +149,10 @@ def test_create_snapshot(
     )
 
 
+# check_neon_works does recovery from WAL => the compatibility snapshot's WAL is old => will log this warning
+ingest_lag_log_line = ".*ingesting record with timestamp lagging more than wait_lsn_timeout.*"
+
+
 @check_ondisk_data_compatibility_if_enabled
 @pytest.mark.xdist_group("compatibility")
 @pytest.mark.order(after="test_create_snapshot")
@@ -173,10 +177,6 @@ def test_backward_compatibility(
     try:
         neon_env_builder.num_safekeepers = 3
         env = neon_env_builder.from_repo_dir(compatibility_snapshot_dir / "repo")
-        # check_neon_works does recovery from WAL => the compatibility snapshot's WAL is old => will log this warning
-        ingest_lag_log_line = (
-            ".*ingesting record with timestamp lagging more than wait_lsn_timeout.*"
-        )
         env.pageserver.allowed_errors.append(ingest_lag_log_line)
         neon_env_builder.start()
 
@@ -246,6 +246,8 @@ def test_forward_compatibility(
         env = neon_env_builder.from_repo_dir(
             compatibility_snapshot_dir / "repo",
         )
+        # there may be an arbitrary number of unrelated tests run between create_snapshot and here
+        env.pageserver.allowed_errors.append(ingest_lag_log_line)
 
         # not using env.pageserver.version because it was initialized before
         prev_pageserver_version_str = env.get_binary_version("pageserver")

From 3b317cae071a7eb84247e616e15541868e292bd3 Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Mon, 2 Sep 2024 17:09:26 +0200
Subject: [PATCH 073/159] page_cache/layer load: correctly classify layer
 summary block reads (#8885)

Before this PR, we would classify layer summary block reads as "Unknown"
content kind.

<img width="1267" alt="image"
src="https://github.com/user-attachments/assets/508af034-5c2a-4c89-80db-2899967b337c">
---
 pageserver/src/context.rs                    |  2 ++
 pageserver/src/tenant/storage_layer/layer.rs | 12 +++++++++---
 2 files changed, 11 insertions(+), 3 deletions(-)

diff --git a/pageserver/src/context.rs b/pageserver/src/context.rs
index 0b07e07524..012cb8d96f 100644
--- a/pageserver/src/context.rs
+++ b/pageserver/src/context.rs
@@ -105,8 +105,10 @@ pub struct RequestContext {
 #[derive(Clone, Copy, PartialEq, Eq, Debug, enum_map::Enum, strum_macros::IntoStaticStr)]
 pub enum PageContentKind {
     Unknown,
+    DeltaLayerSummary,
     DeltaLayerBtreeNode,
     DeltaLayerValue,
+    ImageLayerSummary,
     ImageLayerBtreeNode,
     ImageLayerValue,
     InMemoryLayer,
diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs
index 56f5cc556d..b15cd4da39 100644
--- a/pageserver/src/tenant/storage_layer/layer.rs
+++ b/pageserver/src/tenant/storage_layer/layer.rs
@@ -13,7 +13,7 @@ use utils::lsn::Lsn;
 use utils::sync::{gate, heavier_once_cell};
 
 use crate::config::PageServerConf;
-use crate::context::{DownloadBehavior, RequestContext};
+use crate::context::{DownloadBehavior, RequestContext, RequestContextBuilder};
 use crate::span::debug_assert_current_span_has_tenant_and_timeline_id;
 use crate::task_mgr::TaskKind;
 use crate::tenant::timeline::{CompactionError, GetVectoredError};
@@ -1678,6 +1678,9 @@ impl DownloadedLayer {
             );
 
             let res = if owner.desc.is_delta {
+                let ctx = RequestContextBuilder::extend(ctx)
+                    .page_content_kind(crate::context::PageContentKind::DeltaLayerSummary)
+                    .build();
                 let summary = Some(delta_layer::Summary::expected(
                     owner.desc.tenant_shard_id.tenant_id,
                     owner.desc.timeline_id,
@@ -1688,11 +1691,14 @@ impl DownloadedLayer {
                     &owner.path,
                     summary,
                     Some(owner.conf.max_vectored_read_bytes),
-                    ctx,
+                    &ctx,
                 )
                 .await
                 .map(LayerKind::Delta)
             } else {
+                let ctx = RequestContextBuilder::extend(ctx)
+                    .page_content_kind(crate::context::PageContentKind::ImageLayerSummary)
+                    .build();
                 let lsn = owner.desc.image_layer_lsn();
                 let summary = Some(image_layer::Summary::expected(
                     owner.desc.tenant_shard_id.tenant_id,
@@ -1705,7 +1711,7 @@ impl DownloadedLayer {
                     lsn,
                     summary,
                     Some(owner.conf.max_vectored_read_bytes),
-                    ctx,
+                    &ctx,
                 )
                 .await
                 .map(LayerKind::Image)

From b37da32c6f56f31f39661c9364a7a662df59dbbc Mon Sep 17 00:00:00 2001
From: Erik Grinaker <erik@grinaker.org>
Date: Tue, 3 Sep 2024 10:05:24 +0200
Subject: [PATCH 074/159] pageserver: reuse idempotency keys across metrics
 sinks (#8876)

## Problem

Metrics event idempotency keys differ across S3 and Vector. The events
should be identical.

Resolves #8605.

## Summary of changes

Pre-generate the idempotency keys and pass the same set into both
metrics sinks.

Co-authored-by: John Spray <john@neon.tech>
---
 pageserver/src/consumption_metrics.rs        | 24 +++++++--
 pageserver/src/consumption_metrics/upload.rs | 52 +++++++++++---------
 2 files changed, 47 insertions(+), 29 deletions(-)

diff --git a/pageserver/src/consumption_metrics.rs b/pageserver/src/consumption_metrics.rs
index f94d945d46..64a267e0e4 100644
--- a/pageserver/src/consumption_metrics.rs
+++ b/pageserver/src/consumption_metrics.rs
@@ -1,6 +1,8 @@
 //! Periodically collect consumption metrics for all active tenants
 //! and push them to a HTTP endpoint.
 use crate::config::PageServerConf;
+use crate::consumption_metrics::metrics::MetricsKey;
+use crate::consumption_metrics::upload::KeyGen as _;
 use crate::context::{DownloadBehavior, RequestContext};
 use crate::task_mgr::{self, TaskKind, BACKGROUND_RUNTIME};
 use crate::tenant::size::CalculateSyntheticSizeError;
@@ -8,6 +10,7 @@ use crate::tenant::tasks::BackgroundLoopKind;
 use crate::tenant::{mgr::TenantManager, LogicalSizeCalculationCause, Tenant};
 use camino::Utf8PathBuf;
 use consumption_metrics::EventType;
+use itertools::Itertools as _;
 use pageserver_api::models::TenantState;
 use remote_storage::{GenericRemoteStorage, RemoteStorageConfig};
 use reqwest::Url;
@@ -19,9 +22,8 @@ use tokio_util::sync::CancellationToken;
 use tracing::*;
 use utils::id::NodeId;
 
-mod metrics;
-use crate::consumption_metrics::metrics::MetricsKey;
 mod disk_cache;
+mod metrics;
 mod upload;
 
 const DEFAULT_HTTP_REPORTING_TIMEOUT: Duration = Duration::from_secs(60);
@@ -143,6 +145,12 @@ async fn collect_metrics(
         // these are point in time, with variable "now"
         let metrics = metrics::collect_all_metrics(&tenant_manager, &cached_metrics, &ctx).await;
 
+        // Pre-generate event idempotency keys, to reuse them across the bucket
+        // and HTTP sinks.
+        let idempotency_keys = std::iter::repeat_with(|| node_id.as_str().generate())
+            .take(metrics.len())
+            .collect_vec();
+
         let metrics = Arc::new(metrics);
 
         // why not race cancellation here? because we are one of the last tasks, and if we are
@@ -161,8 +169,14 @@ async fn collect_metrics(
             }
 
             if let Some(bucket_client) = &bucket_client {
-                let res =
-                    upload::upload_metrics_bucket(bucket_client, &cancel, &node_id, &metrics).await;
+                let res = upload::upload_metrics_bucket(
+                    bucket_client,
+                    &cancel,
+                    &node_id,
+                    &metrics,
+                    &idempotency_keys,
+                )
+                .await;
                 if let Err(e) = res {
                     tracing::error!("failed to upload to S3: {e:#}");
                 }
@@ -174,9 +188,9 @@ async fn collect_metrics(
                 &client,
                 metric_collection_endpoint,
                 &cancel,
-                &node_id,
                 &metrics,
                 &mut cached_metrics,
+                &idempotency_keys,
             )
             .await;
             if let Err(e) = res {
diff --git a/pageserver/src/consumption_metrics/upload.rs b/pageserver/src/consumption_metrics/upload.rs
index 4e8283c3e4..0325ee403a 100644
--- a/pageserver/src/consumption_metrics/upload.rs
+++ b/pageserver/src/consumption_metrics/upload.rs
@@ -24,16 +24,16 @@ pub(super) async fn upload_metrics_http(
     client: &reqwest::Client,
     metric_collection_endpoint: &reqwest::Url,
     cancel: &CancellationToken,
-    node_id: &str,
     metrics: &[RawMetric],
     cached_metrics: &mut Cache,
+    idempotency_keys: &[IdempotencyKey<'_>],
 ) -> anyhow::Result<()> {
     let mut uploaded = 0;
     let mut failed = 0;
 
     let started_at = std::time::Instant::now();
 
-    let mut iter = serialize_in_chunks(CHUNK_SIZE, metrics, node_id);
+    let mut iter = serialize_in_chunks(CHUNK_SIZE, metrics, idempotency_keys);
 
     while let Some(res) = iter.next() {
         let (chunk, body) = res?;
@@ -87,6 +87,7 @@ pub(super) async fn upload_metrics_bucket(
     cancel: &CancellationToken,
     node_id: &str,
     metrics: &[RawMetric],
+    idempotency_keys: &[IdempotencyKey<'_>],
 ) -> anyhow::Result<()> {
     if metrics.is_empty() {
         // Skip uploads if we have no metrics, so that readers don't have to handle the edge case
@@ -106,7 +107,7 @@ pub(super) async fn upload_metrics_bucket(
 
     // Serialize and write into compressed buffer
     let started_at = std::time::Instant::now();
-    for res in serialize_in_chunks(CHUNK_SIZE, metrics, node_id) {
+    for res in serialize_in_chunks(CHUNK_SIZE, metrics, idempotency_keys) {
         let (_chunk, body) = res?;
         gzip_writer.write_all(&body).await?;
     }
@@ -134,29 +135,31 @@ pub(super) async fn upload_metrics_bucket(
     Ok(())
 }
 
-// The return type is quite ugly, but we gain testability in isolation
-fn serialize_in_chunks<'a, F>(
+/// Serializes the input metrics as JSON in chunks of chunk_size. The provided
+/// idempotency keys are injected into the corresponding metric events (reused
+/// across different metrics sinks), and must have the same length as input.
+fn serialize_in_chunks<'a>(
     chunk_size: usize,
     input: &'a [RawMetric],
-    factory: F,
+    idempotency_keys: &'a [IdempotencyKey<'a>],
 ) -> impl ExactSizeIterator<Item = Result<(&'a [RawMetric], bytes::Bytes), serde_json::Error>> + 'a
-where
-    F: KeyGen<'a> + 'a,
 {
     use bytes::BufMut;
 
-    struct Iter<'a, F> {
+    assert_eq!(input.len(), idempotency_keys.len());
+
+    struct Iter<'a> {
         inner: std::slice::Chunks<'a, RawMetric>,
+        idempotency_keys: std::slice::Iter<'a, IdempotencyKey<'a>>,
         chunk_size: usize,
 
         // write to a BytesMut so that we can cheaply clone the frozen Bytes for retries
         buffer: bytes::BytesMut,
         // chunk amount of events are reused to produce the serialized document
         scratch: Vec<Event<Ids, Name>>,
-        factory: F,
     }
 
-    impl<'a, F: KeyGen<'a>> Iterator for Iter<'a, F> {
+    impl<'a> Iterator for Iter<'a> {
         type Item = Result<(&'a [RawMetric], bytes::Bytes), serde_json::Error>;
 
         fn next(&mut self) -> Option<Self::Item> {
@@ -167,17 +170,14 @@ where
                 self.scratch.extend(
                     chunk
                         .iter()
-                        .map(|raw_metric| raw_metric.as_event(&self.factory.generate())),
+                        .zip(&mut self.idempotency_keys)
+                        .map(|(raw_metric, key)| raw_metric.as_event(key)),
                 );
             } else {
                 // next rounds: update_in_place to reuse allocations
                 assert_eq!(self.scratch.len(), self.chunk_size);
-                self.scratch
-                    .iter_mut()
-                    .zip(chunk.iter())
-                    .for_each(|(slot, raw_metric)| {
-                        raw_metric.update_in_place(slot, &self.factory.generate())
-                    });
+                itertools::izip!(self.scratch.iter_mut(), chunk, &mut self.idempotency_keys)
+                    .for_each(|(slot, raw_metric, key)| raw_metric.update_in_place(slot, key));
             }
 
             let res = serde_json::to_writer(
@@ -198,18 +198,19 @@ where
         }
     }
 
-    impl<'a, F: KeyGen<'a>> ExactSizeIterator for Iter<'a, F> {}
+    impl<'a> ExactSizeIterator for Iter<'a> {}
 
     let buffer = bytes::BytesMut::new();
     let inner = input.chunks(chunk_size);
+    let idempotency_keys = idempotency_keys.iter();
     let scratch = Vec::new();
 
     Iter {
         inner,
+        idempotency_keys,
         chunk_size,
         buffer,
         scratch,
-        factory,
     }
 }
 
@@ -268,7 +269,7 @@ impl RawMetricExt for RawMetric {
     }
 }
 
-trait KeyGen<'a>: Copy {
+pub(crate) trait KeyGen<'a> {
     fn generate(&self) -> IdempotencyKey<'a>;
 }
 
@@ -389,7 +390,10 @@ mod tests {
         let examples = metric_samples();
         assert!(examples.len() > 1);
 
-        let factory = FixedGen::new(Utc::now(), "1", 42);
+        let now = Utc::now();
+        let idempotency_keys = (0..examples.len())
+            .map(|i| FixedGen::new(now, "1", i as u16).generate())
+            .collect::<Vec<_>>();
 
         // need to use Event here because serde_json::Value uses default hashmap, not linked
         // hashmap
@@ -398,13 +402,13 @@ mod tests {
             events: Vec<Event<Ids, Name>>,
         }
 
-        let correct = serialize_in_chunks(examples.len(), &examples, factory)
+        let correct = serialize_in_chunks(examples.len(), &examples, &idempotency_keys)
             .map(|res| res.unwrap().1)
             .flat_map(|body| serde_json::from_slice::<EventChunk>(&body).unwrap().events)
             .collect::<Vec<_>>();
 
         for chunk_size in 1..examples.len() {
-            let actual = serialize_in_chunks(chunk_size, &examples, factory)
+            let actual = serialize_in_chunks(chunk_size, &examples, &idempotency_keys)
                 .map(|res| res.unwrap().1)
                 .flat_map(|body| serde_json::from_slice::<EventChunk>(&body).unwrap().events)
                 .collect::<Vec<_>>();

From c43e664ff577d4568722e4e7a2b2c6267b609607 Mon Sep 17 00:00:00 2001
From: Vlad Lazar <vlad@neon.tech>
Date: Tue, 3 Sep 2024 15:11:30 +0100
Subject: [PATCH 075/159] storcon: provide an az id in metadata.json from neon
 local (#8897)

## Problem
Neon local set-up does not inject an az id in `metadata.json`. See real
change in https://github.com/neondatabase/neon/pull/8852.

## Summary of changes
We piggyback on the existing `availability_zone` pageserver
configuration in order to avoid making neon local even more complex.
---
 control_plane/src/pageserver.rs               | 23 ++++++++++++++++++-
 test_runner/fixtures/neon_fixtures.py         | 12 +++++-----
 .../fixtures/pageserver/allowed_errors.py     |  3 ---
 3 files changed, 28 insertions(+), 10 deletions(-)

diff --git a/control_plane/src/pageserver.rs b/control_plane/src/pageserver.rs
index 399b1c2653..31777eb7a5 100644
--- a/control_plane/src/pageserver.rs
+++ b/control_plane/src/pageserver.rs
@@ -181,6 +181,23 @@ impl PageServerNode {
         );
         io::stdout().flush()?;
 
+        // If the config file we got as a CLI argument includes the `availability_zone`
+        // config, then use that to populate the `metadata.json` file for the pageserver.
+        // In production the deployment orchestrator does this for us.
+        let az_id = conf
+            .other
+            .get("availability_zone")
+            .map(|toml| {
+                let az_str = toml.to_string();
+                // Trim the (") chars from the toml representation
+                if az_str.starts_with('"') && az_str.ends_with('"') {
+                    az_str[1..az_str.len() - 1].to_string()
+                } else {
+                    az_str
+                }
+            })
+            .unwrap_or("local".to_string());
+
         let config = self
             .pageserver_init_make_toml(conf)
             .context("make pageserver toml")?;
@@ -216,6 +233,7 @@ impl PageServerNode {
         let (_http_host, http_port) =
             parse_host_port(&self.conf.listen_http_addr).expect("Unable to parse listen_http_addr");
         let http_port = http_port.unwrap_or(9898);
+
         // Intentionally hand-craft JSON: this acts as an implicit format compat test
         // in case the pageserver-side structure is edited, and reflects the real life
         // situation: the metadata is written by some other script.
@@ -226,7 +244,10 @@ impl PageServerNode {
                 postgres_port: self.pg_connection_config.port(),
                 http_host: "localhost".to_string(),
                 http_port,
-                other: HashMap::new(),
+                other: HashMap::from([(
+                    "availability_zone_id".to_string(),
+                    serde_json::json!(az_id),
+                )]),
             })
             .unwrap(),
         )
diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index 800ae03d13..0cbab71cc3 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -1164,6 +1164,8 @@ class NeonEnv:
                 "listen_http_addr": f"localhost:{pageserver_port.http}",
                 "pg_auth_type": pg_auth_type,
                 "http_auth_type": http_auth_type,
+                # Default which can be overriden with `NeonEnvBuilder.pageserver_config_override`
+                "availability_zone": "us-east-2a",
             }
             if self.pageserver_virtual_file_io_engine is not None:
                 ps_cfg["virtual_file_io_engine"] = self.pageserver_virtual_file_io_engine
@@ -1192,11 +1194,7 @@ class NeonEnv:
 
             # Create a corresponding NeonPageserver object
             self.pageservers.append(
-                NeonPageserver(
-                    self,
-                    ps_id,
-                    port=pageserver_port,
-                )
+                NeonPageserver(self, ps_id, port=pageserver_port, az_id=ps_cfg["availability_zone"])
             )
             cfg["pageservers"].append(ps_cfg)
 
@@ -2400,6 +2398,7 @@ class NeonStorageController(MetricsGetter, LogUtils):
             "listen_http_port": node.service_port.http,
             "listen_pg_addr": "localhost",
             "listen_pg_port": node.service_port.pg,
+            "availability_zone_id": node.az_id,
         }
         log.info(f"node_register({body})")
         self.request(
@@ -2923,10 +2922,11 @@ class NeonPageserver(PgProtocol, LogUtils):
 
     TEMP_FILE_SUFFIX = "___temp"
 
-    def __init__(self, env: NeonEnv, id: int, port: PageserverPort):
+    def __init__(self, env: NeonEnv, id: int, port: PageserverPort, az_id: str):
         super().__init__(host="localhost", port=port.pg, user="cloud_admin")
         self.env = env
         self.id = id
+        self.az_id = az_id
         self.running = False
         self.service_port = port
         self.version = env.get_binary_version("pageserver")
diff --git a/test_runner/fixtures/pageserver/allowed_errors.py b/test_runner/fixtures/pageserver/allowed_errors.py
index 70f2676245..f8d9a51c91 100755
--- a/test_runner/fixtures/pageserver/allowed_errors.py
+++ b/test_runner/fixtures/pageserver/allowed_errors.py
@@ -109,9 +109,6 @@ DEFAULT_STORAGE_CONTROLLER_ALLOWED_ERRORS = [
     # controller's attempts to notify the endpoint).
     ".*reconciler.*neon_local notification hook failed.*",
     ".*reconciler.*neon_local error.*",
-    # Neon local does not provide pageserver with an AZ
-    # TODO: remove this once neon local does so
-    ".*registering without specific availability zone id.*",
 ]
 
 
From 3916810f203cb086d4d6f6db760a39e5cffb0223 Mon Sep 17 00:00:00 2001
From: Arseny Sher <sher-ars@yandex.ru>
Date: Wed, 28 Aug 2024 17:39:13 +0300
Subject: [PATCH 076/159] safekeeper: add remote_path to Timeline

It is used in many places, let's reduce number of ? on construction
results.
---
 safekeeper/src/pull_timeline.rs      |  6 +++---
 safekeeper/src/timeline.rs           |  8 +++++++-
 safekeeper/src/timeline_eviction.rs  | 12 ++++--------
 safekeeper/src/wal_backup.rs         |  2 +-
 safekeeper/src/wal_backup_partial.rs | 10 ++--------
 5 files changed, 17 insertions(+), 21 deletions(-)

diff --git a/safekeeper/src/pull_timeline.rs b/safekeeper/src/pull_timeline.rs
index 1eacec9981..600a6bd8f0 100644
--- a/safekeeper/src/pull_timeline.rs
+++ b/safekeeper/src/pull_timeline.rs
@@ -183,10 +183,10 @@ impl WalResidentTimeline {
                 "Replacing uploaded partial segment in in-mem control file: {replace:?}"
             );
 
-            let remote_timeline_path = wal_backup::remote_timeline_path(&self.tli.ttid)?;
+            let remote_timeline_path = &self.tli.remote_path;
             wal_backup::copy_partial_segment(
-                &replace.previous.remote_path(&remote_timeline_path),
-                &replace.current.remote_path(&remote_timeline_path),
+                &replace.previous.remote_path(remote_timeline_path),
+                &replace.current.remote_path(remote_timeline_path),
             )
             .await?;
         }
diff --git a/safekeeper/src/timeline.rs b/safekeeper/src/timeline.rs
index 57935d879f..f7c96d4f02 100644
--- a/safekeeper/src/timeline.rs
+++ b/safekeeper/src/timeline.rs
@@ -3,6 +3,7 @@
 
 use anyhow::{anyhow, bail, Result};
 use camino::Utf8PathBuf;
+use remote_storage::RemotePath;
 use serde::{Deserialize, Serialize};
 use tokio::fs::{self};
 use tokio_util::sync::CancellationToken;
@@ -36,7 +37,7 @@ use crate::state::{EvictionState, TimelineMemState, TimelinePersistentState, Tim
 use crate::timeline_guard::ResidenceGuard;
 use crate::timeline_manager::{AtomicStatus, ManagerCtl};
 use crate::timelines_set::TimelinesSet;
-use crate::wal_backup::{self};
+use crate::wal_backup::{self, remote_timeline_path};
 use crate::wal_backup_partial::PartialRemoteSegment;
 use crate::{control_file, safekeeper::UNKNOWN_SERVER_VERSION};
 
@@ -469,6 +470,7 @@ impl From<TimelineError> for ApiError {
 /// It also holds SharedState and provides mutually exclusive access to it.
 pub struct Timeline {
     pub ttid: TenantTimelineId,
+    pub remote_path: RemotePath,
 
     /// Used to broadcast commit_lsn updates to all background jobs.
     commit_lsn_watch_tx: watch::Sender<Lsn>,
@@ -519,8 +521,10 @@ impl Timeline {
         let (shared_state_version_tx, shared_state_version_rx) = watch::channel(0);
 
         let walreceivers = WalReceivers::new();
+        let remote_path = remote_timeline_path(&ttid)?;
         Ok(Timeline {
             ttid,
+            remote_path,
             commit_lsn_watch_tx,
             commit_lsn_watch_rx,
             term_flush_lsn_watch_tx,
@@ -557,8 +561,10 @@ impl Timeline {
             TimelinePersistentState::new(&ttid, server_info, vec![], commit_lsn, local_start_lsn);
 
         let walreceivers = WalReceivers::new();
+        let remote_path = remote_timeline_path(&ttid)?;
         Ok(Timeline {
             ttid,
+            remote_path,
             commit_lsn_watch_tx,
             commit_lsn_watch_rx,
             term_flush_lsn_watch_tx,
diff --git a/safekeeper/src/timeline_eviction.rs b/safekeeper/src/timeline_eviction.rs
index ae6f3f4b7e..2ccb058720 100644
--- a/safekeeper/src/timeline_eviction.rs
+++ b/safekeeper/src/timeline_eviction.rs
@@ -167,7 +167,7 @@ async fn redownload_partial_segment(
     partial: &PartialRemoteSegment,
 ) -> anyhow::Result<()> {
     let tmp_file = mgr.tli.timeline_dir().join("remote_partial.tmp");
-    let remote_segfile = remote_segment_path(mgr, partial)?;
+    let remote_segfile = remote_segment_path(mgr, partial);
 
     debug!(
         "redownloading partial segment: {} -> {}",
@@ -252,7 +252,7 @@ async fn do_validation(
         );
     }
 
-    let remote_segfile = remote_segment_path(mgr, partial)?;
+    let remote_segfile = remote_segment_path(mgr, partial);
     let mut remote_reader: std::pin::Pin<Box<dyn AsyncRead + Send + Sync>> =
         wal_backup::read_object(&remote_segfile, 0).await?;
 
@@ -279,12 +279,8 @@ fn local_segment_path(mgr: &Manager, partial: &PartialRemoteSegment) -> Utf8Path
     local_partial_segfile
 }
 
-fn remote_segment_path(
-    mgr: &Manager,
-    partial: &PartialRemoteSegment,
-) -> anyhow::Result<RemotePath> {
-    let remote_timeline_path = wal_backup::remote_timeline_path(&mgr.tli.ttid)?;
-    Ok(partial.remote_path(&remote_timeline_path))
+fn remote_segment_path(mgr: &Manager, partial: &PartialRemoteSegment) -> RemotePath {
+    partial.remote_path(&mgr.tli.remote_path)
 }
 
 /// Compare first `n` bytes of two readers. If the bytes differ, return an error.
diff --git a/safekeeper/src/wal_backup.rs b/safekeeper/src/wal_backup.rs
index aa1a6696a1..1c9ec5c007 100644
--- a/safekeeper/src/wal_backup.rs
+++ b/safekeeper/src/wal_backup.rs
@@ -315,7 +315,7 @@ async fn backup_lsn_range(
         anyhow::bail!("parallel_jobs must be >= 1");
     }
 
-    let remote_timeline_path = remote_timeline_path(&timeline.ttid)?;
+    let remote_timeline_path = &timeline.remote_path;
     let start_lsn = *backup_lsn;
     let segments = get_segments(start_lsn, end_lsn, wal_seg_size);
 
diff --git a/safekeeper/src/wal_backup_partial.rs b/safekeeper/src/wal_backup_partial.rs
index 675a051887..4022c9409b 100644
--- a/safekeeper/src/wal_backup_partial.rs
+++ b/safekeeper/src/wal_backup_partial.rs
@@ -31,7 +31,7 @@ use crate::{
     safekeeper::Term,
     timeline::WalResidentTimeline,
     timeline_manager::StateSnapshot,
-    wal_backup::{self, remote_timeline_path},
+    wal_backup::{self},
     SafeKeeperConf,
 };
 
@@ -388,13 +388,7 @@ pub async fn main_task(
     let wal_seg_size = tli.get_wal_seg_size().await;
 
     let local_prefix = tli.get_timeline_dir();
-    let remote_timeline_path = match remote_timeline_path(&tli.ttid) {
-        Ok(path) => path,
-        Err(e) => {
-            error!("failed to create remote path: {:?}", e);
-            return None;
-        }
-    };
+    let remote_timeline_path = tli.remote_path.clone();
 
     let mut backup = PartialBackup {
         wal_seg_size,

From 80512e2779f40af7602fe3221ccc7eaa0499e61e Mon Sep 17 00:00:00 2001
From: Arseny Sher <sher-ars@yandex.ru>
Date: Fri, 30 Aug 2024 12:35:41 +0300
Subject: [PATCH 077/159] safekeeper: add endpoint resetting uploaded partial
 segment state.

Endpoint implementation sends msg to manager requesting to do the
reset. Manager stops current partial backup upload task if it exists and
performs the reset.

Also slightly tweak eviction condition: all full segments before
flush_lsn must be uploaded (and committed) and there must be only one
segment left on disk (partial). This allows to evict timelines which
started not on the first segment and didn't fill the whole
segment (previous condition wasn't good because last_removed_segno was
0).

ref https://github.com/neondatabase/neon/issues/8759
---
 safekeeper/src/http/routes.rs            |  23 ++++++
 safekeeper/src/timeline.rs               |   4 +
 safekeeper/src/timeline_eviction.rs      |  21 +++--
 safekeeper/src/timeline_manager.rs       |  88 ++++++++++++++++++--
 safekeeper/src/wal_backup.rs             |   6 +-
 safekeeper/src/wal_backup_partial.rs     | 101 +++++++++++++++++------
 test_runner/fixtures/neon_fixtures.py    |   6 +-
 test_runner/fixtures/safekeeper/http.py  |  24 ++++++
 test_runner/regress/test_wal_acceptor.py |  99 +++++++++++++++++++++-
 9 files changed, 325 insertions(+), 47 deletions(-)

diff --git a/safekeeper/src/http/routes.rs b/safekeeper/src/http/routes.rs
index 91ffa95c21..9b7424a818 100644
--- a/safekeeper/src/http/routes.rs
+++ b/safekeeper/src/http/routes.rs
@@ -389,6 +389,25 @@ async fn timeline_digest_handler(request: Request<Body>) -> Result<Response<Body
     json_response(StatusCode::OK, response)
 }
 
+/// Unevict timeline and remove uploaded partial segment(s) from the remote storage.
+/// Successfull response returns list of segments existed before the deletion.
+/// Aimed for one-off usage not normally needed.
+async fn timeline_backup_partial_reset(request: Request<Body>) -> Result<Response<Body>, ApiError> {
+    let ttid = TenantTimelineId::new(
+        parse_request_param(&request, "tenant_id")?,
+        parse_request_param(&request, "timeline_id")?,
+    );
+    check_permission(&request, Some(ttid.tenant_id))?;
+
+    let tli = GlobalTimelines::get(ttid).map_err(ApiError::from)?;
+
+    let response = tli
+        .backup_partial_reset()
+        .await
+        .map_err(ApiError::InternalServerError)?;
+    json_response(StatusCode::OK, response)
+}
+
 /// Used only in tests to hand craft required data.
 async fn record_safekeeper_info(mut request: Request<Body>) -> Result<Response<Body>, ApiError> {
     let ttid = TenantTimelineId::new(
@@ -607,6 +626,10 @@ pub fn make_router(conf: SafeKeeperConf) -> RouterBuilder<hyper::Body, ApiError>
         .get("/v1/tenant/:tenant_id/timeline/:timeline_id/digest", |r| {
             request_span(r, timeline_digest_handler)
         })
+        .post(
+            "/v1/tenant/:tenant_id/timeline/:timeline_id/backup_partial_reset",
+            |r| request_span(r, timeline_backup_partial_reset),
+        )
         .post("/v1/record_safekeeper_info/:tenant_id/:timeline_id", |r| {
             request_span(r, record_safekeeper_info)
         })
diff --git a/safekeeper/src/timeline.rs b/safekeeper/src/timeline.rs
index f7c96d4f02..95ee925e1a 100644
--- a/safekeeper/src/timeline.rs
+++ b/safekeeper/src/timeline.rs
@@ -908,6 +908,10 @@ impl Timeline {
 
         Ok(WalResidentTimeline::new(self.clone(), guard))
     }
+
+    pub async fn backup_partial_reset(self: &Arc<Self>) -> Result<Vec<String>> {
+        self.manager_ctl.backup_partial_reset().await
+    }
 }
 
 /// This is a guard that allows to read/write disk timeline state.
diff --git a/safekeeper/src/timeline_eviction.rs b/safekeeper/src/timeline_eviction.rs
index 2ccb058720..5d0567575c 100644
--- a/safekeeper/src/timeline_eviction.rs
+++ b/safekeeper/src/timeline_eviction.rs
@@ -28,28 +28,38 @@ impl Manager {
     /// - control file is flushed (no next event scheduled)
     /// - no WAL residence guards
     /// - no pushes to the broker
-    /// - partial WAL backup is uploaded
+    /// - last partial WAL segment is uploaded
+    /// - all local segments before the uploaded partial are committed and uploaded
     pub(crate) fn ready_for_eviction(
         &self,
         next_event: &Option<tokio::time::Instant>,
         state: &StateSnapshot,
     ) -> bool {
-        self.backup_task.is_none()
+        let ready = self.backup_task.is_none()
             && self.recovery_task.is_none()
             && self.wal_removal_task.is_none()
             && self.partial_backup_task.is_none()
-            && self.partial_backup_uploaded.is_some()
             && next_event.is_none()
             && self.access_service.is_empty()
             && !self.tli_broker_active.get()
+            // Partial segment of current flush_lsn is uploaded up to this flush_lsn.
             && !wal_backup_partial::needs_uploading(state, &self.partial_backup_uploaded)
+            // And it is the next one after the last removed. Given that local
+            // WAL is removed only after it is uploaded to s3 (and pageserver
+            // advancing remote_consistent_lsn) which happens only after WAL is
+            // committed, true means all this is done.
+            //
+            // This also works for the first segment despite last_removed_segno
+            // being 0 on init because this 0 triggers run of wal_removal_task
+            // on success of which manager updates the horizon.
             && self
                 .partial_backup_uploaded
                 .as_ref()
                 .unwrap()
                 .flush_lsn
                 .segment_number(self.wal_seg_size)
-                == self.last_removed_segno + 1
+                == self.last_removed_segno + 1;
+        ready
     }
 
     /// Evict the timeline to remote storage.
@@ -83,7 +93,8 @@ impl Manager {
         info!("successfully evicted timeline");
     }
 
-    /// Restore evicted timeline from remote storage.
+    /// Attempt to restore evicted timeline from remote storage; it must be
+    /// offloaded.
     #[instrument(name = "unevict_timeline", skip_all)]
     pub(crate) async fn unevict_timeline(&mut self) {
         assert!(self.is_offloaded);
diff --git a/safekeeper/src/timeline_manager.rs b/safekeeper/src/timeline_manager.rs
index 482614fac7..f997f48454 100644
--- a/safekeeper/src/timeline_manager.rs
+++ b/safekeeper/src/timeline_manager.rs
@@ -11,12 +11,14 @@ use std::{
     time::Duration,
 };
 
+use futures::channel::oneshot;
 use postgres_ffi::XLogSegNo;
 use serde::{Deserialize, Serialize};
 use tokio::{
     task::{JoinError, JoinHandle},
     time::Instant,
 };
+use tokio_util::sync::CancellationToken;
 use tracing::{debug, info, info_span, instrument, warn, Instrument};
 use utils::lsn::Lsn;
 
@@ -33,7 +35,7 @@ use crate::{
     timeline_guard::{AccessService, GuardId, ResidenceGuard},
     timelines_set::{TimelineSetGuard, TimelinesSet},
     wal_backup::{self, WalBackupTaskHandle},
-    wal_backup_partial::{self, PartialRemoteSegment},
+    wal_backup_partial::{self, PartialBackup, PartialRemoteSegment},
     SafeKeeperConf,
 };
 
@@ -96,6 +98,8 @@ pub enum ManagerCtlMessage {
     GuardRequest(tokio::sync::oneshot::Sender<anyhow::Result<ResidenceGuard>>),
     /// Request to drop the guard.
     GuardDrop(GuardId),
+    /// Request to reset uploaded partial backup state.
+    BackupPartialReset(oneshot::Sender<anyhow::Result<Vec<String>>>),
 }
 
 impl std::fmt::Debug for ManagerCtlMessage {
@@ -103,6 +107,7 @@ impl std::fmt::Debug for ManagerCtlMessage {
         match self {
             ManagerCtlMessage::GuardRequest(_) => write!(f, "GuardRequest"),
             ManagerCtlMessage::GuardDrop(id) => write!(f, "GuardDrop({:?})", id),
+            ManagerCtlMessage::BackupPartialReset(_) => write!(f, "BackupPartialReset"),
         }
     }
 }
@@ -143,6 +148,19 @@ impl ManagerCtl {
             .and_then(std::convert::identity)
     }
 
+    /// Request timeline manager to reset uploaded partial segment state and
+    /// wait for the result.
+    pub async fn backup_partial_reset(&self) -> anyhow::Result<Vec<String>> {
+        let (tx, rx) = oneshot::channel();
+        self.manager_tx
+            .send(ManagerCtlMessage::BackupPartialReset(tx))
+            .expect("manager task is not running");
+        match rx.await {
+            Ok(res) => res,
+            Err(_) => anyhow::bail!("timeline manager is gone"),
+        }
+    }
+
     /// Must be called exactly once to bootstrap the manager.
     pub fn bootstrap_manager(
         &self,
@@ -181,7 +199,8 @@ pub(crate) struct Manager {
     pub(crate) wal_removal_task: Option<JoinHandle<anyhow::Result<u64>>>,
 
     // partial backup
-    pub(crate) partial_backup_task: Option<JoinHandle<Option<PartialRemoteSegment>>>,
+    pub(crate) partial_backup_task:
+        Option<(JoinHandle<Option<PartialRemoteSegment>>, CancellationToken)>,
     pub(crate) partial_backup_uploaded: Option<PartialRemoteSegment>,
 
     // misc
@@ -302,12 +321,12 @@ pub async fn main_task(
             _ = sleep_until(&next_event) => {
                 // we were waiting for some event (e.g. cfile save)
             }
-            res = await_task_finish(&mut mgr.wal_removal_task) => {
+            res = await_task_finish(mgr.wal_removal_task.as_mut()) => {
                 // WAL removal task finished
                 mgr.wal_removal_task = None;
                 mgr.update_wal_removal_end(res);
             }
-            res = await_task_finish(&mut mgr.partial_backup_task) => {
+            res = await_task_finish(mgr.partial_backup_task.as_mut().map(|(handle, _)| handle)) => {
                 // partial backup task finished
                 mgr.partial_backup_task = None;
                 mgr.update_partial_backup_end(res);
@@ -335,8 +354,9 @@ pub async fn main_task(
         }
     }
 
-    if let Some(partial_backup_task) = &mut mgr.partial_backup_task {
-        if let Err(e) = partial_backup_task.await {
+    if let Some((handle, cancel)) = &mut mgr.partial_backup_task {
+        cancel.cancel();
+        if let Err(e) = handle.await {
             warn!("partial backup task failed: {:?}", e);
         }
     }
@@ -560,11 +580,14 @@ impl Manager {
         }
 
         // Get WalResidentTimeline and start partial backup task.
-        self.partial_backup_task = Some(tokio::spawn(wal_backup_partial::main_task(
+        let cancel = CancellationToken::new();
+        let handle = tokio::spawn(wal_backup_partial::main_task(
             self.wal_resident_timeline(),
             self.conf.clone(),
             self.global_rate_limiter.clone(),
-        )));
+            cancel.clone(),
+        ));
+        self.partial_backup_task = Some((handle, cancel));
     }
 
     /// Update the state after partial WAL backup task finished.
@@ -579,6 +602,39 @@ impl Manager {
         }
     }
 
+    /// Reset partial backup state and remove its remote storage data. Since it
+    /// might concurrently uploading something, cancel the task first.
+    async fn backup_partial_reset(&mut self) -> anyhow::Result<Vec<String>> {
+        info!("resetting partial backup state");
+        // Force unevict timeline if it is evicted before erasing partial backup
+        // state. The intended use of this function is to drop corrupted remote
+        // state; we haven't enabled local files deletion yet anywhere,
+        // so direct switch is safe.
+        if self.is_offloaded {
+            self.tli.switch_to_present().await?;
+            // switch manager state as soon as possible
+            self.is_offloaded = false;
+        }
+
+        if let Some((handle, cancel)) = &mut self.partial_backup_task {
+            cancel.cancel();
+            info!("cancelled partial backup task, awaiting it");
+            // we're going to reset .partial_backup_uploaded to None anyway, so ignore the result
+            handle.await.ok();
+            self.partial_backup_task = None;
+        }
+
+        let tli = self.wal_resident_timeline();
+        let mut partial_backup = PartialBackup::new(tli, self.conf.clone()).await;
+        // Reset might fail e.g. when cfile is already reset but s3 removal
+        // failed, so set manager state to None beforehand. In any case caller
+        // is expected to retry until success.
+        self.partial_backup_uploaded = None;
+        let res = partial_backup.reset().await?;
+        info!("reset is done");
+        Ok(res)
+    }
+
     /// Handle message arrived from ManagerCtl.
     async fn handle_message(&mut self, msg: Option<ManagerCtlMessage>) {
         debug!("received manager message: {:?}", msg);
@@ -602,6 +658,16 @@ impl Manager {
             Some(ManagerCtlMessage::GuardDrop(guard_id)) => {
                 self.access_service.drop_guard(guard_id);
             }
+            Some(ManagerCtlMessage::BackupPartialReset(tx)) => {
+                info!("resetting uploaded partial backup state");
+                let res = self.backup_partial_reset().await;
+                if let Err(ref e) = res {
+                    warn!("failed to reset partial backup state: {:?}", e);
+                }
+                if tx.send(res).is_err() {
+                    warn!("failed to send partial backup reset result, receiver dropped");
+                }
+            }
             None => {
                 // can't happen, we're holding the sender
                 unreachable!();
@@ -619,7 +685,11 @@ async fn sleep_until(option: &Option<tokio::time::Instant>) {
     }
 }
 
-async fn await_task_finish<T>(option: &mut Option<JoinHandle<T>>) -> Result<T, JoinError> {
+/// Future that resolves when the task is finished or never if the task is None.
+///
+/// Note: it accepts Option<&mut> instead of &mut Option<> because mapping the
+/// option to get the latter is hard.
+async fn await_task_finish<T>(option: Option<&mut JoinHandle<T>>) -> Result<T, JoinError> {
     if let Some(task) = option {
         task.await
     } else {
diff --git a/safekeeper/src/wal_backup.rs b/safekeeper/src/wal_backup.rs
index 1c9ec5c007..95012bb004 100644
--- a/safekeeper/src/wal_backup.rs
+++ b/safekeeper/src/wal_backup.rs
@@ -328,11 +328,7 @@ async fn backup_lsn_range(
     loop {
         let added_task = match iter.next() {
             Some(s) => {
-                uploads.push_back(backup_single_segment(
-                    s,
-                    timeline_dir,
-                    &remote_timeline_path,
-                ));
+                uploads.push_back(backup_single_segment(s, timeline_dir, remote_timeline_path));
                 true
             }
             None => false,
diff --git a/safekeeper/src/wal_backup_partial.rs b/safekeeper/src/wal_backup_partial.rs
index 4022c9409b..4f320f43f8 100644
--- a/safekeeper/src/wal_backup_partial.rs
+++ b/safekeeper/src/wal_backup_partial.rs
@@ -22,6 +22,7 @@ use postgres_ffi::{XLogFileName, XLogSegNo, PG_TLI};
 use remote_storage::RemotePath;
 use serde::{Deserialize, Serialize};
 
+use tokio_util::sync::CancellationToken;
 use tracing::{debug, error, info, instrument, warn};
 use utils::{id::NodeId, lsn::Lsn};
 
@@ -145,7 +146,7 @@ impl State {
     }
 }
 
-struct PartialBackup {
+pub struct PartialBackup {
     wal_seg_size: usize,
     tli: WalResidentTimeline,
     conf: SafeKeeperConf,
@@ -155,8 +156,25 @@ struct PartialBackup {
     state: State,
 }
 
-// Read-only methods for getting segment names
 impl PartialBackup {
+    pub async fn new(tli: WalResidentTimeline, conf: SafeKeeperConf) -> PartialBackup {
+        let (_, persistent_state) = tli.get_state().await;
+        let wal_seg_size = tli.get_wal_seg_size().await;
+
+        let local_prefix = tli.get_timeline_dir();
+        let remote_timeline_path = tli.remote_path.clone();
+
+        PartialBackup {
+            wal_seg_size,
+            tli,
+            state: persistent_state.partial_backup,
+            conf,
+            local_prefix,
+            remote_timeline_path,
+        }
+    }
+
+    // Read-only methods for getting segment names
     fn segno(&self, lsn: Lsn) -> XLogSegNo {
         lsn.segment_number(self.wal_seg_size)
     }
@@ -297,6 +315,18 @@ impl PartialBackup {
         Ok(())
     }
 
+    // Prepend to the given segments remote prefix and delete them from the
+    // remote storage.
+    async fn delete_segments(&self, segments_to_delete: &Vec<String>) -> anyhow::Result<()> {
+        info!("deleting objects: {:?}", segments_to_delete);
+        let mut objects_to_delete = vec![];
+        for seg in segments_to_delete.iter() {
+            let remote_path = self.remote_timeline_path.join(seg);
+            objects_to_delete.push(remote_path);
+        }
+        wal_backup::delete_objects(&objects_to_delete).await
+    }
+
     /// Delete all non-Uploaded segments from the remote storage. There should be only one
     /// Uploaded segment at a time.
     #[instrument(name = "gc", skip_all)]
@@ -329,15 +359,8 @@ impl PartialBackup {
             );
         }
 
-        info!("deleting objects: {:?}", segments_to_delete);
-        let mut objects_to_delete = vec![];
-        for seg in segments_to_delete.iter() {
-            let remote_path = self.remote_timeline_path.join(seg);
-            objects_to_delete.push(remote_path);
-        }
-
-        // removing segments from remote storage
-        wal_backup::delete_objects(&objects_to_delete).await?;
+        // execute the deletion
+        self.delete_segments(&segments_to_delete).await?;
 
         // now we can update the state on disk
         let new_state = {
@@ -349,6 +372,27 @@ impl PartialBackup {
 
         Ok(())
     }
+
+    /// Remove uploaded segment(s) from the state and remote storage. Aimed for
+    /// manual intervention, not normally needed.
+    /// Returns list of segments which potentially existed in the remote storage.
+    pub async fn reset(&mut self) -> anyhow::Result<Vec<String>> {
+        let segments_to_delete = self
+            .state
+            .segments
+            .iter()
+            .map(|seg| seg.name.clone())
+            .collect();
+
+        // First reset cfile state, and only then objects themselves. If the
+        // later fails we might leave some garbage behind; that's ok for this
+        // single time usage.
+        let new_state = State { segments: vec![] };
+        self.commit_state(new_state).await?;
+
+        self.delete_segments(&segments_to_delete).await?;
+        Ok(segments_to_delete)
+    }
 }
 
 /// Check if everything is uploaded and partial backup task doesn't need to run.
@@ -377,27 +421,16 @@ pub async fn main_task(
     tli: WalResidentTimeline,
     conf: SafeKeeperConf,
     limiter: RateLimiter,
+    cancel: CancellationToken,
 ) -> Option<PartialRemoteSegment> {
     debug!("started");
     let await_duration = conf.partial_backup_timeout;
     let mut first_iteration = true;
 
-    let (_, persistent_state) = tli.get_state().await;
     let mut commit_lsn_rx = tli.get_commit_lsn_watch_rx();
     let mut flush_lsn_rx = tli.get_term_flush_lsn_watch_rx();
-    let wal_seg_size = tli.get_wal_seg_size().await;
 
-    let local_prefix = tli.get_timeline_dir();
-    let remote_timeline_path = tli.remote_path.clone();
-
-    let mut backup = PartialBackup {
-        wal_seg_size,
-        tli,
-        state: persistent_state.partial_backup,
-        conf,
-        local_prefix,
-        remote_timeline_path,
-    };
+    let mut backup = PartialBackup::new(tli, conf).await;
 
     debug!("state: {:?}", backup.state);
 
@@ -427,6 +460,10 @@ pub async fn main_task(
                 && flush_lsn_rx.borrow().term == seg.term
             {
                 // we have nothing to do, the last segment is already uploaded
+                debug!(
+                    "exiting, uploaded up to term={} flush_lsn={} commit_lsn={}",
+                    seg.term, seg.flush_lsn, seg.commit_lsn
+                );
                 return Some(seg.clone());
             }
         }
@@ -438,6 +475,10 @@ pub async fn main_task(
                     info!("timeline canceled");
                     return None;
                 }
+                _ = cancel.cancelled() => {
+                    info!("task canceled");
+                    return None;
+                }
                 _ = flush_lsn_rx.changed() => {}
             }
         }
@@ -464,6 +505,10 @@ pub async fn main_task(
                     info!("timeline canceled");
                     return None;
                 }
+                _ = cancel.cancelled() => {
+                    info!("task canceled");
+                    return None;
+                }
                 _ = commit_lsn_rx.changed() => {}
                 _ = flush_lsn_rx.changed() => {
                     let segno = backup.segno(flush_lsn_rx.borrow().lsn);
@@ -486,7 +531,13 @@ pub async fn main_task(
         }
 
         // limit concurrent uploads
-        let _upload_permit = limiter.acquire_partial_backup().await;
+        let _upload_permit = tokio::select! {
+            acq = limiter.acquire_partial_backup() => acq,
+            _ = cancel.cancelled() => {
+                info!("task canceled");
+                return None;
+            }
+        };
 
         let prepared = backup.prepare_upload().await;
         if let Some(seg) = &uploaded_segment {
diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index 0cbab71cc3..8c99408cfb 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -4553,6 +4553,8 @@ class Safekeeper(LogUtils):
     def timeline_dir(self, tenant_id, timeline_id) -> Path:
         return self.data_dir / str(tenant_id) / str(timeline_id)
 
+    # List partial uploaded segments of this safekeeper. Works only for
+    # RemoteStorageKind.LOCAL_FS.
     def list_uploaded_segments(self, tenant_id: TenantId, timeline_id: TimelineId):
         tline_path = (
             self.env.repo_dir
@@ -4562,9 +4564,11 @@ class Safekeeper(LogUtils):
             / str(timeline_id)
         )
         assert isinstance(self.env.safekeepers_remote_storage, LocalFsStorage)
-        return self._list_segments_in_dir(
+        segs = self._list_segments_in_dir(
             tline_path, lambda name: ".metadata" not in name and ".___temp" not in name
         )
+        mysegs = [s for s in segs if f"sk{self.id}" in s]
+        return mysegs
 
     def list_segments(self, tenant_id, timeline_id) -> List[str]:
         """
diff --git a/test_runner/fixtures/safekeeper/http.py b/test_runner/fixtures/safekeeper/http.py
index 05b43cfb72..9bf03554e7 100644
--- a/test_runner/fixtures/safekeeper/http.py
+++ b/test_runner/fixtures/safekeeper/http.py
@@ -174,6 +174,22 @@ class SafekeeperHttpClient(requests.Session, MetricsGetter):
         assert isinstance(res_json, dict)
         return res_json
 
+    def debug_dump_timeline(
+        self, timeline_id: TimelineId, params: Optional[Dict[str, str]] = None
+    ) -> Any:
+        params = params or {}
+        params["timeline_id"] = str(timeline_id)
+        dump = self.debug_dump(params)
+        return dump["timelines"][0]
+
+    def get_partial_backup(self, timeline_id: TimelineId) -> Any:
+        dump = self.debug_dump_timeline(timeline_id, {"dump_control_file": "true"})
+        return dump["control_file"]["partial_backup"]
+
+    def get_eviction_state(self, timeline_id: TimelineId) -> Any:
+        dump = self.debug_dump_timeline(timeline_id, {"dump_control_file": "true"})
+        return dump["control_file"]["eviction_state"]
+
     def pull_timeline(self, body: Dict[str, Any]) -> Dict[str, Any]:
         res = self.post(f"http://localhost:{self.port}/v1/pull_timeline", json=body)
         res.raise_for_status()
@@ -228,6 +244,14 @@ class SafekeeperHttpClient(requests.Session, MetricsGetter):
         assert isinstance(res_json, dict)
         return res_json
 
+    def backup_partial_reset(self, tenant_id: TenantId, timeline_id: TimelineId):
+        res = self.post(
+            f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/backup_partial_reset",
+            json={},
+        )
+        res.raise_for_status()
+        return res.json()
+
     def record_safekeeper_info(self, tenant_id: TenantId, timeline_id: TimelineId, body):
         res = self.post(
             f"http://localhost:{self.port}/v1/record_safekeeper_info/{tenant_id}/{timeline_id}",
diff --git a/test_runner/regress/test_wal_acceptor.py b/test_runner/regress/test_wal_acceptor.py
index 19df834b81..3785651aed 100644
--- a/test_runner/regress/test_wal_acceptor.py
+++ b/test_runner/regress/test_wal_acceptor.py
@@ -72,6 +72,17 @@ def wait_lsn_force_checkpoint(
     wait_lsn_force_checkpoint_at(lsn, tenant_id, timeline_id, ps, pageserver_conn_options)
 
 
+def wait_lsn_force_checkpoint_at_sk(
+    safekeeper: Safekeeper,
+    tenant_id: TenantId,
+    timeline_id: TimelineId,
+    ps: NeonPageserver,
+    pageserver_conn_options=None,
+):
+    sk_flush_lsn = safekeeper.get_flush_lsn(tenant_id, timeline_id)
+    wait_lsn_force_checkpoint_at(sk_flush_lsn, tenant_id, timeline_id, ps, pageserver_conn_options)
+
+
 def wait_lsn_force_checkpoint_at(
     lsn: Lsn,
     tenant_id: TenantId,
@@ -79,6 +90,10 @@ def wait_lsn_force_checkpoint_at(
     ps: NeonPageserver,
     pageserver_conn_options=None,
 ):
+    """
+    Wait until pageserver receives given lsn, force checkpoint and wait for
+    upload, i.e. remote_consistent_lsn advancement.
+    """
     pageserver_conn_options = pageserver_conn_options or {}
 
     auth_token = None
@@ -2330,6 +2345,77 @@ def test_s3_eviction(
     assert event_metrics_seen
 
 
+# Test resetting uploaded partial segment state.
+def test_backup_partial_reset(neon_env_builder: NeonEnvBuilder):
+    neon_env_builder.num_safekeepers = 1
+    neon_env_builder.enable_safekeeper_remote_storage(default_remote_storage())
+    # We want to upload/evict quickly, but not too quickly to check that s3 is
+    # empty before next round of upload happens.
+    # Note: this test fails with --delete-offloaded-wal, this is expected.
+    neon_env_builder.safekeeper_extra_opts = [
+        "--enable-offload",
+        "--partial-backup-timeout",
+        "1s",
+        "--control-file-save-interval",
+        "1s",
+        "--eviction-min-resident=1s",
+    ]
+    # XXX: pageserver currently connects to safekeeper as long as connection
+    # manager doesn't remove its entry (default lagging_wal_timeout is 10s),
+    # causing uneviction. It should be fixed to not reconnect if last
+    # remote_consistent_lsn is communicated and there is nothing to fetch. Make
+    # value lower to speed up the test.
+    initial_tenant_conf = {
+        "lagging_wal_timeout": "1s",
+    }
+    env = neon_env_builder.init_start(initial_tenant_conf)
+
+    tenant_id = env.initial_tenant
+    timeline_id = env.initial_timeline
+
+    endpoint = env.endpoints.create("main")
+    endpoint.start()
+    endpoint.safe_psql("create table t(key int, value text)")
+    endpoint.stop()
+    sk = env.safekeepers[0]
+    # eviction won't happen until remote_consistent_lsn catches up.
+    wait_lsn_force_checkpoint_at_sk(sk, tenant_id, timeline_id, env.pageserver)
+
+    http_cli = env.safekeepers[0].http_client()
+
+    # wait until eviction happens
+    def evicted():
+        eviction_state = http_cli.get_eviction_state(timeline_id)
+        log.info(f"eviction_state: {eviction_state}")
+        if isinstance(eviction_state, str) and eviction_state == "Present":
+            raise Exception("eviction didn't happen yet")
+
+    wait_until(30, 1, evicted)
+    # it must have uploaded something
+    uploaded_segs = sk.list_uploaded_segments(tenant_id, timeline_id)
+    log.info(f"uploaded segments before reset: {uploaded_segs}")
+    assert len(uploaded_segs) > 0
+
+    reset_res = http_cli.backup_partial_reset(tenant_id, timeline_id)
+    log.info(f"reset res: {reset_res}")
+
+    # Backup_partial_reset must have reset the state and dropped s3 segment.
+    #
+    # Note: if listing takes more than --partial-backup-timeout test becomes
+    # flaky because file might be reuploaded. With local fs it shouldn't be an
+    # issue, but can add retry if this appears.
+    uploaded_segs = sk.list_uploaded_segments(tenant_id, timeline_id)
+    log.info(f"uploaded segments after reset: {uploaded_segs}")
+    assert len(uploaded_segs) == 0
+
+    # calling second time should be ok
+    http_cli.backup_partial_reset(tenant_id, timeline_id)
+
+    # inserting data should be ok
+    endpoint.start()
+    endpoint.safe_psql("insert into t values(1, 'hehe')")
+
+
 def test_pull_timeline_partial_segment_integrity(neon_env_builder: NeonEnvBuilder):
     """
     Verify that pulling timeline from a SK with an uploaded partial segment
@@ -2357,7 +2443,16 @@ def test_pull_timeline_partial_segment_integrity(neon_env_builder: NeonEnvBuilde
         "--eviction-min-resident=500ms",
     ]
 
-    env = neon_env_builder.init_start(initial_tenant_conf={"checkpoint_timeout": "100ms"})
+    # XXX: pageserver currently connects to safekeeper as long as connection
+    # manager doesn't remove its entry (default lagging_wal_timeout is 10s),
+    # causing uneviction. It should be fixed to not reconnect if last
+    # remote_consistent_lsn is communicated and there is nothing to fetch. Until
+    # this is fixed make value lower to speed up the test.
+    initial_tenant_conf = {
+        "lagging_wal_timeout": "1s",
+        "checkpoint_timeout": "100ms",
+    }
+    env = neon_env_builder.init_start(initial_tenant_conf=initial_tenant_conf)
     tenant_id = env.initial_tenant
     timeline_id = env.initial_timeline
 
@@ -2421,7 +2516,7 @@ def test_pull_timeline_partial_segment_integrity(neon_env_builder: NeonEnvBuilde
     endpoint.start(safekeepers=[2, 3])
 
     def new_partial_segment_uploaded():
-        segs = src_sk.list_uploaded_segments(tenant_id, timeline_id)
+        segs = dst_sk.list_uploaded_segments(tenant_id, timeline_id)
         for seg in segs:
             if "partial" in seg and "sk3" in seg:
                 return seg

From 83dd7f559c16aa0ed546b9fa6d78e04d32a01de1 Mon Sep 17 00:00:00 2001
From: Arseny Sher <sher-ars@yandex.ru>
Date: Tue, 3 Sep 2024 15:35:59 +0300
Subject: [PATCH 078/159] safekeeper: more consistent task naming.

Make all them snake case.
---
 safekeeper/src/broker.rs             | 2 +-
 safekeeper/src/recovery.rs           | 2 +-
 safekeeper/src/wal_backup.rs         | 2 +-
 safekeeper/src/wal_backup_partial.rs | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/safekeeper/src/broker.rs b/safekeeper/src/broker.rs
index 7cc2142291..485816408f 100644
--- a/safekeeper/src/broker.rs
+++ b/safekeeper/src/broker.rs
@@ -86,7 +86,7 @@ async fn push_loop(conf: SafeKeeperConf) -> anyhow::Result<()> {
 }
 
 /// Subscribe and fetch all the interesting data from the broker.
-#[instrument(name = "broker pull", skip_all)]
+#[instrument(name = "broker_pull", skip_all)]
 async fn pull_loop(conf: SafeKeeperConf, stats: Arc<BrokerStats>) -> Result<()> {
     let mut client = storage_broker::connect(conf.broker_endpoint, conf.broker_keepalive_interval)?;
 
diff --git a/safekeeper/src/recovery.rs b/safekeeper/src/recovery.rs
index a59ff07b96..9c4149d8f1 100644
--- a/safekeeper/src/recovery.rs
+++ b/safekeeper/src/recovery.rs
@@ -35,7 +35,7 @@ use crate::{
 
 /// Entrypoint for per timeline task which always runs, checking whether
 /// recovery for this safekeeper is needed and starting it if so.
-#[instrument(name = "recovery task", skip_all, fields(ttid = %tli.ttid))]
+#[instrument(name = "recovery", skip_all, fields(ttid = %tli.ttid))]
 pub async fn recovery_main(tli: WalResidentTimeline, conf: SafeKeeperConf) {
     info!("started");
 
diff --git a/safekeeper/src/wal_backup.rs b/safekeeper/src/wal_backup.rs
index 95012bb004..ef26ac99c5 100644
--- a/safekeeper/src/wal_backup.rs
+++ b/safekeeper/src/wal_backup.rs
@@ -203,7 +203,7 @@ struct WalBackupTask {
 }
 
 /// Offload single timeline.
-#[instrument(name = "WAL backup", skip_all, fields(ttid = %tli.ttid))]
+#[instrument(name = "wal_backup", skip_all, fields(ttid = %tli.ttid))]
 async fn backup_task_main(
     tli: WalResidentTimeline,
     parallel_jobs: usize,
diff --git a/safekeeper/src/wal_backup_partial.rs b/safekeeper/src/wal_backup_partial.rs
index 4f320f43f8..4050a82fff 100644
--- a/safekeeper/src/wal_backup_partial.rs
+++ b/safekeeper/src/wal_backup_partial.rs
@@ -416,7 +416,7 @@ pub(crate) fn needs_uploading(
 ///
 /// When there is nothing more to do and the last segment was successfully uploaded, the task
 /// returns PartialRemoteSegment, to signal readiness for offloading the timeline.
-#[instrument(name = "Partial backup", skip_all, fields(ttid = %tli.ttid))]
+#[instrument(name = "partial_backup", skip_all, fields(ttid = %tli.ttid))]
 pub async fn main_task(
     tli: WalResidentTimeline,
     conf: SafeKeeperConf,

From c7187be8a11a43a0bc74d8745912df4a6c5c1db7 Mon Sep 17 00:00:00 2001
From: Arseny Sher <sher-ars@yandex.ru>
Date: Wed, 7 Aug 2024 19:26:06 +0300
Subject: [PATCH 079/159] safekeeper: check for non-consecutive writes in
 safekeeper.rs

wal_storage.rs already checks this, but since this is a quite legit scenario
check it at safekeeper.rs (consensus level) as well.

ref https://github.com/neondatabase/neon/issues/8212

This is a take 2; previous PR #8640 had been reverted because interplay
with another change broke test_last_log_term_switch.
---
 safekeeper/src/safekeeper.rs                  | 126 ++++++++++++++----
 safekeeper/src/wal_storage.rs                 |   6 +
 .../tests/walproposer_sim/safekeeper_disk.rs  |   4 +
 3 files changed, 113 insertions(+), 23 deletions(-)

diff --git a/safekeeper/src/safekeeper.rs b/safekeeper/src/safekeeper.rs
index 486954c7b9..dbe0034de2 100644
--- a/safekeeper/src/safekeeper.rs
+++ b/safekeeper/src/safekeeper.rs
@@ -875,6 +875,29 @@ where
             return Ok(Some(AcceptorProposerMessage::AppendResponse(resp)));
         }
 
+        // Disallow any non-sequential writes, which can result in gaps or
+        // overwrites. If we need to move the pointer, ProposerElected message
+        // should have truncated WAL first accordingly. Note that the first
+        // condition (WAL rewrite) is quite expected in real world; it happens
+        // when walproposer reconnects to safekeeper and writes some more data
+        // while first connection still gets some packets later. It might be
+        // better to not log this as error! above.
+        let write_lsn = self.wal_store.write_lsn();
+        if write_lsn > msg.h.begin_lsn {
+            bail!(
+                "append request rewrites WAL written before, write_lsn={}, msg lsn={}",
+                write_lsn,
+                msg.h.begin_lsn
+            );
+        }
+        if write_lsn < msg.h.begin_lsn && write_lsn != Lsn(0) {
+            bail!(
+                "append request creates gap in written WAL, write_lsn={}, msg lsn={}",
+                write_lsn,
+                msg.h.begin_lsn,
+            );
+        }
+
         // Now we know that we are in the same term as the proposer,
         // processing the message.
 
@@ -960,10 +983,7 @@ mod tests {
     use postgres_ffi::{XLogSegNo, WAL_SEGMENT_SIZE};
 
     use super::*;
-    use crate::{
-        state::{EvictionState, PersistedPeers, TimelinePersistentState},
-        wal_storage::Storage,
-    };
+    use crate::state::{EvictionState, PersistedPeers, TimelinePersistentState};
     use std::{ops::Deref, str::FromStr, time::Instant};
 
     // fake storage for tests
@@ -1003,6 +1023,10 @@ mod tests {
     }
 
     impl wal_storage::Storage for DummyWalStore {
+        fn write_lsn(&self) -> Lsn {
+            self.lsn
+        }
+
         fn flush_lsn(&self) -> Lsn {
             self.lsn
         }
@@ -1076,7 +1100,7 @@ mod tests {
         let mut sk = SafeKeeper::new(TimelineState::new(storage), wal_store, NodeId(0)).unwrap();
 
         let mut ar_hdr = AppendRequestHeader {
-            term: 1,
+            term: 2,
             term_start_lsn: Lsn(3),
             begin_lsn: Lsn(1),
             end_lsn: Lsn(2),
@@ -1090,24 +1114,29 @@ mod tests {
         };
 
         let pem = ProposerElected {
-            term: 1,
-            start_streaming_at: Lsn(3),
-            term_history: TermHistory(vec![TermLsn {
-                term: 1,
-                lsn: Lsn(3),
-            }]),
-            timeline_start_lsn: Lsn(0),
+            term: 2,
+            start_streaming_at: Lsn(1),
+            term_history: TermHistory(vec![
+                TermLsn {
+                    term: 1,
+                    lsn: Lsn(1),
+                },
+                TermLsn {
+                    term: 2,
+                    lsn: Lsn(3),
+                },
+            ]),
+            timeline_start_lsn: Lsn(1),
         };
         sk.process_msg(&ProposerAcceptorMessage::Elected(pem))
             .await
             .unwrap();
 
         // check that AppendRequest before term_start_lsn doesn't switch last_log_term.
-        let resp = sk
-            .process_msg(&ProposerAcceptorMessage::AppendRequest(append_request))
-            .await;
-        assert!(resp.is_ok());
-        assert_eq!(sk.get_last_log_term(), 0);
+        sk.process_msg(&ProposerAcceptorMessage::AppendRequest(append_request))
+            .await
+            .unwrap();
+        assert_eq!(sk.get_last_log_term(), 1);
 
         // but record at term_start_lsn does the switch
         ar_hdr.begin_lsn = Lsn(2);
@@ -1116,12 +1145,63 @@ mod tests {
             h: ar_hdr,
             wal_data: Bytes::from_static(b"b"),
         };
-        let resp = sk
-            .process_msg(&ProposerAcceptorMessage::AppendRequest(append_request))
-            .await;
-        assert!(resp.is_ok());
-        sk.wal_store.truncate_wal(Lsn(3)).await.unwrap(); // imitate the complete record at 3 %)
-        assert_eq!(sk.get_last_log_term(), 1);
+        sk.process_msg(&ProposerAcceptorMessage::AppendRequest(append_request))
+            .await
+            .unwrap();
+        assert_eq!(sk.get_last_log_term(), 2);
+    }
+
+    #[tokio::test]
+    async fn test_non_consecutive_write() {
+        let storage = InMemoryState {
+            persisted_state: test_sk_state(),
+        };
+        let wal_store = DummyWalStore { lsn: Lsn(0) };
+
+        let mut sk = SafeKeeper::new(TimelineState::new(storage), wal_store, NodeId(0)).unwrap();
+
+        let pem = ProposerElected {
+            term: 1,
+            start_streaming_at: Lsn(1),
+            term_history: TermHistory(vec![TermLsn {
+                term: 1,
+                lsn: Lsn(1),
+            }]),
+            timeline_start_lsn: Lsn(1),
+        };
+        sk.process_msg(&ProposerAcceptorMessage::Elected(pem))
+            .await
+            .unwrap();
+
+        let ar_hdr = AppendRequestHeader {
+            term: 1,
+            term_start_lsn: Lsn(3),
+            begin_lsn: Lsn(1),
+            end_lsn: Lsn(2),
+            commit_lsn: Lsn(0),
+            truncate_lsn: Lsn(0),
+            proposer_uuid: [0; 16],
+        };
+        let append_request = AppendRequest {
+            h: ar_hdr.clone(),
+            wal_data: Bytes::from_static(b"b"),
+        };
+
+        // do write ending at 2, it should be ok
+        sk.process_msg(&ProposerAcceptorMessage::AppendRequest(append_request))
+            .await
+            .unwrap();
+        let mut ar_hrd2 = ar_hdr.clone();
+        ar_hrd2.begin_lsn = Lsn(4);
+        ar_hrd2.end_lsn = Lsn(5);
+        let append_request = AppendRequest {
+            h: ar_hdr,
+            wal_data: Bytes::from_static(b"b"),
+        };
+        // and now starting at 4, it must fail
+        sk.process_msg(&ProposerAcceptorMessage::AppendRequest(append_request))
+            .await
+            .unwrap_err();
     }
 
     #[test]
diff --git a/safekeeper/src/wal_storage.rs b/safekeeper/src/wal_storage.rs
index 6fd7c91a68..89c2e98a94 100644
--- a/safekeeper/src/wal_storage.rs
+++ b/safekeeper/src/wal_storage.rs
@@ -37,6 +37,8 @@ use pq_proto::SystemId;
 use utils::{id::TenantTimelineId, lsn::Lsn};
 
 pub trait Storage {
+    // Last written LSN.
+    fn write_lsn(&self) -> Lsn;
     /// LSN of last durably stored WAL record.
     fn flush_lsn(&self) -> Lsn;
 
@@ -329,6 +331,10 @@ impl PhysicalStorage {
 }
 
 impl Storage for PhysicalStorage {
+    // Last written LSN.
+    fn write_lsn(&self) -> Lsn {
+        self.write_lsn
+    }
     /// flush_lsn returns LSN of last durably stored WAL record.
     fn flush_lsn(&self) -> Lsn {
         self.flush_record_lsn
diff --git a/safekeeper/tests/walproposer_sim/safekeeper_disk.rs b/safekeeper/tests/walproposer_sim/safekeeper_disk.rs
index 6b31edb1f2..b854754ecf 100644
--- a/safekeeper/tests/walproposer_sim/safekeeper_disk.rs
+++ b/safekeeper/tests/walproposer_sim/safekeeper_disk.rs
@@ -175,6 +175,10 @@ impl DiskWALStorage {
 }
 
 impl wal_storage::Storage for DiskWALStorage {
+    // Last written LSN.
+    fn write_lsn(&self) -> Lsn {
+        self.write_lsn
+    }
     /// LSN of last durably stored WAL record.
     fn flush_lsn(&self) -> Lsn {
         self.flush_record_lsn

From c4fe6641c1695b1d7c450358af2cec6018fb2359 Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Tue, 3 Sep 2024 18:16:49 +0100
Subject: [PATCH 080/159] pageserver: separate metadata and data pages in
 DatadirModification (#8621)

## Problem

Currently, DatadirModification keeps a key-indexed map of all pending
writes, even though we (almost) never need to read back dirty pages for
anything other than metadata pages (e.g. relation sizes).

Related: https://github.com/neondatabase/neon/issues/6345

## Summary of changes

- commit() modifications before ingesting database creation wal records,
so that they are guaranteed to be able to get() everything they need
directly from the underlying Timeline.
- Split dirty pages in DatadirModification into pending_metadata_pages
and pending_data_pages. The data ones don't need to be in a
key-addressable format, so they just go in a Vec instead.
- Special case handling of zero-page writes in DatadirModification,
putting them in a map which is flushed on the end of a WAL record. This
handles the case where during ingest, we might first write a zero page,
and then ingest a postgres write to that page. We used to do this via
the key-indexed map of writes, but in this PR we change the data page
write path to not bother indexing these by key.

My least favorite thing about this PR is that I needed to change the
DatadirModification interface to add the on_record_end call. This is not
very invasive because there's really only one place we use it, but it
changes the object's behaviour from being clearly an aggregation of many
records to having some per-record state. I could avoid this by
implicitly doing the work when someone calls set_lsn or commit -- I'm
open to opinions on whether that's cleaner or dirtier.

## Performance

There may be some efficiency improvement here, but the primary
motivation is to enable an earlier stage of ingest to operate without
access to a Timeline. The `pending_data_pages` part is the "fast path"
bulk write data that can in principle be generated without a Timeline,
in parallel with other ingest batches, and ultimately on the safekeeper.

`test_bulk_insert` on AX102 shows approximately the same results as in
the previous PR #8591:

```
------------------------------ Benchmark results -------------------------------
test_bulk_insert[neon-release-pg16].insert: 23.577 s
test_bulk_insert[neon-release-pg16].pageserver_writes: 5,428 MB
test_bulk_insert[neon-release-pg16].peak_mem: 637 MB
test_bulk_insert[neon-release-pg16].size: 0 MB
test_bulk_insert[neon-release-pg16].data_uploaded: 1,922 MB
test_bulk_insert[neon-release-pg16].num_files_uploaded: 8
test_bulk_insert[neon-release-pg16].wal_written: 1,382 MB
test_bulk_insert[neon-release-pg16].wal_recovery: 18.264 s
test_bulk_insert[neon-release-pg16].compaction: 0.052 s
```
---
 pageserver/src/import_datadir.rs              |  12 +-
 pageserver/src/pgdatadir_mapping.rs           | 228 +++++++++++++-----
 .../tenant/storage_layer/inmemory_layer.rs    |   9 +-
 .../walreceiver/walreceiver_connection.rs     |  64 ++++-
 pageserver/src/walingest.rs                   |  42 +++-
 pageserver/src/walrecord.rs                   |  24 ++
 6 files changed, 281 insertions(+), 98 deletions(-)

diff --git a/pageserver/src/import_datadir.rs b/pageserver/src/import_datadir.rs
index ed409d3130..5a0894cd1b 100644
--- a/pageserver/src/import_datadir.rs
+++ b/pageserver/src/import_datadir.rs
@@ -19,6 +19,7 @@ use crate::metrics::WAL_INGEST;
 use crate::pgdatadir_mapping::*;
 use crate::tenant::Timeline;
 use crate::walingest::WalIngest;
+use crate::walrecord::decode_wal_record;
 use crate::walrecord::DecodedWALRecord;
 use pageserver_api::reltag::{RelTag, SlruKind};
 use postgres_ffi::pg_constants;
@@ -310,11 +311,13 @@ async fn import_wal(
 
         let mut nrecords = 0;
         let mut modification = tline.begin_modification(last_lsn);
-        let mut decoded = DecodedWALRecord::default();
         while last_lsn <= endpoint {
             if let Some((lsn, recdata)) = waldecoder.poll_decode()? {
+                let mut decoded = DecodedWALRecord::default();
+                decode_wal_record(recdata, &mut decoded, tline.pg_version)?;
+
                 walingest
-                    .ingest_record(recdata, lsn, &mut modification, &mut decoded, ctx)
+                    .ingest_record(decoded, lsn, &mut modification, ctx)
                     .await?;
                 WAL_INGEST.records_committed.inc();
 
@@ -449,11 +452,12 @@ pub async fn import_wal_from_tar(
         waldecoder.feed_bytes(&bytes[offset..]);
 
         let mut modification = tline.begin_modification(last_lsn);
-        let mut decoded = DecodedWALRecord::default();
         while last_lsn <= end_lsn {
             if let Some((lsn, recdata)) = waldecoder.poll_decode()? {
+                let mut decoded = DecodedWALRecord::default();
+                decode_wal_record(recdata, &mut decoded, tline.pg_version)?;
                 walingest
-                    .ingest_record(recdata, lsn, &mut modification, &mut decoded, ctx)
+                    .ingest_record(decoded, lsn, &mut modification, ctx)
                     .await?;
                 modification.commit(ctx).await?;
                 last_lsn = lsn;
diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs
index edcbac970b..c26abca1f7 100644
--- a/pageserver/src/pgdatadir_mapping.rs
+++ b/pageserver/src/pgdatadir_mapping.rs
@@ -12,7 +12,7 @@ use crate::keyspace::{KeySpace, KeySpaceAccum};
 use crate::span::debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id;
 use crate::walrecord::NeonWalRecord;
 use crate::{aux_file, repository::*};
-use anyhow::{bail, ensure, Context};
+use anyhow::{ensure, Context};
 use bytes::{Buf, Bytes, BytesMut};
 use enum_map::Enum;
 use pageserver_api::key::{
@@ -168,7 +168,9 @@ impl Timeline {
         DatadirModification {
             tline: self,
             pending_lsns: Vec::new(),
-            pending_updates: HashMap::new(),
+            pending_metadata_pages: HashMap::new(),
+            pending_data_pages: Vec::new(),
+            pending_zero_data_pages: Default::default(),
             pending_deletions: Vec::new(),
             pending_nblocks: 0,
             pending_directory_entries: Vec::new(),
@@ -1031,10 +1033,24 @@ pub struct DatadirModification<'a> {
     // The put-functions add the modifications here, and they are flushed to the
     // underlying key-value store by the 'finish' function.
     pending_lsns: Vec<Lsn>,
-    pending_updates: HashMap<Key, Vec<(Lsn, usize, Value)>>,
     pending_deletions: Vec<(Range<Key>, Lsn)>,
     pending_nblocks: i64,
 
+    /// Metadata writes, indexed by key so that they can be read from not-yet-committed modifications
+    /// while ingesting subsequent records. See [`Self::is_data_key`] for the definition of 'metadata'.
+    pending_metadata_pages: HashMap<CompactKey, Vec<(Lsn, usize, Value)>>,
+
+    /// Data writes, ready to be flushed into an ephemeral layer. See [`Self::is_data_key`] for
+    /// which keys are stored here.
+    pending_data_pages: Vec<(CompactKey, Lsn, usize, Value)>,
+
+    // Sometimes during ingest, for example when extending a relation, we would like to write a zero page.  However,
+    // if we encounter a write from postgres in the same wal record, we will drop this entry.
+    //
+    // Unlike other 'pending' fields, this does not last until the next call to commit(): it is flushed
+    // at the end of each wal record, and all these writes implicitly are at lsn Self::lsn
+    pending_zero_data_pages: HashSet<CompactKey>,
+
     /// For special "directory" keys that store key-value maps, track the size of the map
     /// if it was updated in this modification.
     pending_directory_entries: Vec<(DirectoryKind, usize)>,
@@ -1058,6 +1074,10 @@ impl<'a> DatadirModification<'a> {
         self.pending_bytes
     }
 
+    pub(crate) fn has_dirty_data_pages(&self) -> bool {
+        (!self.pending_data_pages.is_empty()) || (!self.pending_zero_data_pages.is_empty())
+    }
+
     /// Set the current lsn
     pub(crate) fn set_lsn(&mut self, lsn: Lsn) -> anyhow::Result<()> {
         ensure!(
@@ -1066,6 +1086,10 @@ impl<'a> DatadirModification<'a> {
             lsn,
             self.lsn
         );
+
+        // If we are advancing LSN, then state from previous wal record should have been flushed.
+        assert!(self.pending_zero_data_pages.is_empty());
+
         if lsn > self.lsn {
             self.pending_lsns.push(self.lsn);
             self.lsn = lsn;
@@ -1073,6 +1097,17 @@ impl<'a> DatadirModification<'a> {
         Ok(())
     }
 
+    /// In this context, 'metadata' means keys that are only read by the pageserver internally, and 'data' means
+    /// keys that represent literal blocks that postgres can read.  So data includes relation blocks and
+    /// SLRU blocks, which are read directly by postgres, and everything else is considered metadata.
+    ///
+    /// The distinction is important because data keys are handled on a fast path where dirty writes are
+    /// not readable until this modification is committed, whereas metadata keys are visible for read
+    /// via [`Self::get`] as soon as their record has been ingested.
+    fn is_data_key(key: &Key) -> bool {
+        key.is_rel_block_key() || key.is_slru_block_key()
+    }
+
     /// Initialize a completely new repository.
     ///
     /// This inserts the directory metadata entries that are assumed to
@@ -1180,6 +1215,31 @@ impl<'a> DatadirModification<'a> {
         Ok(())
     }
 
+    pub(crate) fn put_rel_page_image_zero(&mut self, rel: RelTag, blknum: BlockNumber) {
+        self.pending_zero_data_pages
+            .insert(rel_block_to_key(rel, blknum).to_compact());
+        self.pending_bytes += ZERO_PAGE.len();
+    }
+
+    pub(crate) fn put_slru_page_image_zero(
+        &mut self,
+        kind: SlruKind,
+        segno: u32,
+        blknum: BlockNumber,
+    ) {
+        self.pending_zero_data_pages
+            .insert(slru_block_to_key(kind, segno, blknum).to_compact());
+        self.pending_bytes += ZERO_PAGE.len();
+    }
+
+    /// Call this at the end of each WAL record.
+    pub(crate) fn on_record_end(&mut self) {
+        let pending_zero_data_pages = std::mem::take(&mut self.pending_zero_data_pages);
+        for key in pending_zero_data_pages {
+            self.put_data(key, Value::Image(ZERO_PAGE.clone()));
+        }
+    }
+
     /// Store a relmapper file (pg_filenode.map) in the repository
     pub async fn put_relmap_file(
         &mut self,
@@ -1778,7 +1838,7 @@ impl<'a> DatadirModification<'a> {
     /// retains all the metadata, but data pages are flushed. That's again OK
     /// for bulk import, where you are just loading data pages and won't try to
     /// modify the same pages twice.
-    pub async fn flush(&mut self, ctx: &RequestContext) -> anyhow::Result<()> {
+    pub(crate) async fn flush(&mut self, ctx: &RequestContext) -> anyhow::Result<()> {
         // Unless we have accumulated a decent amount of changes, it's not worth it
         // to scan through the pending_updates list.
         let pending_nblocks = self.pending_nblocks;
@@ -1789,31 +1849,11 @@ impl<'a> DatadirModification<'a> {
         let mut writer = self.tline.writer().await;
 
         // Flush relation and  SLRU data blocks, keep metadata.
-        let mut retained_pending_updates = HashMap::<_, Vec<_>>::new();
-        for (key, values) in self.pending_updates.drain() {
-            if !key.is_valid_key_on_write_path() {
-                bail!(
-                    "the request contains data not supported by pageserver at TimelineWriter::put: {}", key
-                );
-            }
-            let mut write_batch = Vec::new();
-            for (lsn, value_ser_size, value) in values {
-                if key.is_rel_block_key() || key.is_slru_block_key() {
-                    // This bails out on first error without modifying pending_updates.
-                    // That's Ok, cf this function's doc comment.
-                    write_batch.push((key.to_compact(), lsn, value_ser_size, value));
-                } else {
-                    retained_pending_updates.entry(key).or_default().push((
-                        lsn,
-                        value_ser_size,
-                        value,
-                    ));
-                }
-            }
-            writer.put_batch(write_batch, ctx).await?;
-        }
+        let pending_data_pages = std::mem::take(&mut self.pending_data_pages);
 
-        self.pending_updates = retained_pending_updates;
+        // This bails out on first error without modifying pending_updates.
+        // That's Ok, cf this function's doc comment.
+        writer.put_batch(pending_data_pages, ctx).await?;
         self.pending_bytes = 0;
 
         if pending_nblocks != 0 {
@@ -1834,29 +1874,31 @@ impl<'a> DatadirModification<'a> {
     /// All the modifications in this atomic update are stamped by the specified LSN.
     ///
     pub async fn commit(&mut self, ctx: &RequestContext) -> anyhow::Result<()> {
+        // Commit should never be called mid-wal-record
+        assert!(self.pending_zero_data_pages.is_empty());
+
         let mut writer = self.tline.writer().await;
 
         let pending_nblocks = self.pending_nblocks;
         self.pending_nblocks = 0;
 
-        if !self.pending_updates.is_empty() {
-            // Ordering: the items in this batch do not need to be in any global order, but values for
-            // a particular Key must be in Lsn order relative to one another.  InMemoryLayer relies on
-            // this to do efficient updates to its index.
-            let batch: Vec<(CompactKey, Lsn, usize, Value)> = self
-                .pending_updates
+        // Ordering: the items in this batch do not need to be in any global order, but values for
+        // a particular Key must be in Lsn order relative to one another.  InMemoryLayer relies on
+        // this to do efficient updates to its index.
+        let mut write_batch = std::mem::take(&mut self.pending_data_pages);
+
+        write_batch.extend(
+            self.pending_metadata_pages
                 .drain()
                 .flat_map(|(key, values)| {
-                    values.into_iter().map(move |(lsn, val_ser_size, value)| {
-                        if !key.is_valid_key_on_write_path() {
-                            bail!("the request contains data not supported by pageserver at TimelineWriter::put: {}", key);
-                        }
-                        Ok((key.to_compact(), lsn, val_ser_size, value))
-                    })
-                })
-                .collect::<anyhow::Result<Vec<_>>>()?;
+                    values
+                        .into_iter()
+                        .map(move |(lsn, value_size, value)| (key, lsn, value_size, value))
+                }),
+        );
 
-            writer.put_batch(batch, ctx).await?;
+        if !write_batch.is_empty() {
+            writer.put_batch(write_batch, ctx).await?;
         }
 
         if !self.pending_deletions.is_empty() {
@@ -1887,33 +1929,58 @@ impl<'a> DatadirModification<'a> {
     }
 
     pub(crate) fn len(&self) -> usize {
-        self.pending_updates.len() + self.pending_deletions.len()
+        self.pending_metadata_pages.len()
+            + self.pending_data_pages.len()
+            + self.pending_deletions.len()
     }
 
-    // Internal helper functions to batch the modifications
-
+    /// Read a page from the Timeline we are writing to.  For metadata pages, this passes through
+    /// a cache in Self, which makes writes earlier in this modification visible to WAL records later
+    /// in the modification.
+    ///
+    /// For data pages, reads pass directly to the owning Timeline: any ingest code which reads a data
+    /// page must ensure that the pages they read are already committed in Timeline, for example
+    /// DB create operations are always preceded by a call to commit().  This is special cased because
+    /// it's rare: all the 'normal' WAL operations will only read metadata pages such as relation sizes,
+    /// and not data pages.
     async fn get(&self, key: Key, ctx: &RequestContext) -> Result<Bytes, PageReconstructError> {
-        // Have we already updated the same key? Read the latest pending updated
-        // version in that case.
-        //
-        // Note: we don't check pending_deletions. It is an error to request a
-        // value that has been removed, deletion only avoids leaking storage.
-        if let Some(values) = self.pending_updates.get(&key) {
-            if let Some((_, _, value)) = values.last() {
-                return if let Value::Image(img) = value {
-                    Ok(img.clone())
-                } else {
-                    // Currently, we never need to read back a WAL record that we
-                    // inserted in the same "transaction". All the metadata updates
-                    // work directly with Images, and we never need to read actual
-                    // data pages. We could handle this if we had to, by calling
-                    // the walredo manager, but let's keep it simple for now.
-                    Err(PageReconstructError::Other(anyhow::anyhow!(
-                        "unexpected pending WAL record"
-                    )))
-                };
+        if !Self::is_data_key(&key) {
+            // Have we already updated the same key? Read the latest pending updated
+            // version in that case.
+            //
+            // Note: we don't check pending_deletions. It is an error to request a
+            // value that has been removed, deletion only avoids leaking storage.
+            if let Some(values) = self.pending_metadata_pages.get(&key.to_compact()) {
+                if let Some((_, _, value)) = values.last() {
+                    return if let Value::Image(img) = value {
+                        Ok(img.clone())
+                    } else {
+                        // Currently, we never need to read back a WAL record that we
+                        // inserted in the same "transaction". All the metadata updates
+                        // work directly with Images, and we never need to read actual
+                        // data pages. We could handle this if we had to, by calling
+                        // the walredo manager, but let's keep it simple for now.
+                        Err(PageReconstructError::Other(anyhow::anyhow!(
+                            "unexpected pending WAL record"
+                        )))
+                    };
+                }
+            }
+        } else {
+            // This is an expensive check, so we only do it in debug mode. If reading a data key,
+            // this key should never be present in pending_data_pages. We ensure this by committing
+            // modifications before ingesting DB create operations, which are the only kind that reads
+            // data pages during ingest.
+            if cfg!(debug_assertions) {
+                for (dirty_key, _, _, _) in &self.pending_data_pages {
+                    debug_assert!(&key.to_compact() != dirty_key);
+                }
+
+                debug_assert!(!self.pending_zero_data_pages.contains(&key.to_compact()))
             }
         }
+
+        // Metadata page cache miss, or we're reading a data page.
         let lsn = Lsn::max(self.tline.get_last_record_lsn(), self.lsn);
         self.tline.get(key, lsn, ctx).await
     }
@@ -1925,11 +1992,40 @@ impl<'a> DatadirModification<'a> {
     }
 
     fn put(&mut self, key: Key, val: Value) {
-        let values = self.pending_updates.entry(key).or_default();
+        if Self::is_data_key(&key) {
+            self.put_data(key.to_compact(), val)
+        } else {
+            self.put_metadata(key.to_compact(), val)
+        }
+    }
+
+    fn put_data(&mut self, key: CompactKey, val: Value) {
+        let val_serialized_size = val.serialized_size().unwrap() as usize;
+
+        // If this page was previously zero'd in the same WalRecord, then drop the previous zero page write.  This
+        // is an optimization that avoids persisting both the zero page generated by us (e.g. during a relation extend),
+        // and the subsequent postgres-originating write
+        if self.pending_zero_data_pages.remove(&key) {
+            self.pending_bytes -= ZERO_PAGE.len();
+        }
+
+        self.pending_bytes += val_serialized_size;
+        self.pending_data_pages
+            .push((key, self.lsn, val_serialized_size, val))
+    }
+
+    fn put_metadata(&mut self, key: CompactKey, val: Value) {
+        let values = self.pending_metadata_pages.entry(key).or_default();
         // Replace the previous value if it exists at the same lsn
         if let Some((last_lsn, last_value_ser_size, last_value)) = values.last_mut() {
             if *last_lsn == self.lsn {
+                // Update the pending_bytes contribution from this entry, and update the serialized size in place
+                self.pending_bytes -= *last_value_ser_size;
                 *last_value_ser_size = val.serialized_size().unwrap() as usize;
+                self.pending_bytes += *last_value_ser_size;
+
+                // Use the latest value, this replaces any earlier write to the same (key,lsn), such as much
+                // have been generated by synthesized zero page writes prior to the first real write to a page.
                 *last_value = val;
                 return;
             }
diff --git a/pageserver/src/tenant/storage_layer/inmemory_layer.rs b/pageserver/src/tenant/storage_layer/inmemory_layer.rs
index f31ab4b1e8..2c19e5b19f 100644
--- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs
+++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs
@@ -692,8 +692,13 @@ impl InMemoryLayer {
             let vec_map = inner.index.entry(key).or_default();
             let old = vec_map.append_or_update_last(lsn, index_entry).unwrap().0;
             if old.is_some() {
-                // We already had an entry for this LSN. That's odd..
-                warn!("Key {} at {} already exists", key, lsn);
+                // This should not break anything, but is unexpected: ingestion code aims to filter out
+                // multiple writes to the same key at the same LSN.  This happens in cases where our
+                // ingenstion code generates some write like an empty page, and we see a write from postgres
+                // to the same key in the same wal record.  If one such write makes it through, we
+                // index the most recent write, implicitly ignoring the earlier write.  We log a warning
+                // because this case is unexpected, and we would like tests to fail if this happens.
+                warn!("Key {} at {} written twice at same LSN", key, lsn);
             }
         }
 
diff --git a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs
index 0114473eda..cee259e2e0 100644
--- a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs
+++ b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs
@@ -31,7 +31,7 @@ use crate::{
     task_mgr::{TaskKind, WALRECEIVER_RUNTIME},
     tenant::{debug_assert_current_span_has_tenant_and_timeline_id, Timeline, WalReceiverInfo},
     walingest::WalIngest,
-    walrecord::DecodedWALRecord,
+    walrecord::{decode_wal_record, DecodedWALRecord},
 };
 use postgres_backend::is_expected_io_error;
 use postgres_connection::PgConnectionConfig;
@@ -312,10 +312,25 @@ pub(super) async fn handle_walreceiver_connection(
                 waldecoder.feed_bytes(data);
 
                 {
-                    let mut decoded = DecodedWALRecord::default();
                     let mut modification = timeline.begin_modification(startlsn);
                     let mut uncommitted_records = 0;
                     let mut filtered_records = 0;
+
+                    async fn commit(
+                        modification: &mut DatadirModification<'_>,
+                        uncommitted: &mut u64,
+                        filtered: &mut u64,
+                        ctx: &RequestContext,
+                    ) -> anyhow::Result<()> {
+                        WAL_INGEST
+                            .records_committed
+                            .inc_by(*uncommitted - *filtered);
+                        modification.commit(ctx).await?;
+                        *uncommitted = 0;
+                        *filtered = 0;
+                        Ok(())
+                    }
+
                     while let Some((lsn, recdata)) = waldecoder.poll_decode()? {
                         // It is important to deal with the aligned records as lsn in getPage@LSN is
                         // aligned and can be several bytes bigger. Without this alignment we are
@@ -324,9 +339,28 @@ pub(super) async fn handle_walreceiver_connection(
                             return Err(WalReceiverError::Other(anyhow!("LSN not aligned")));
                         }
 
+                        // Deserialize WAL record
+                        let mut decoded = DecodedWALRecord::default();
+                        decode_wal_record(recdata, &mut decoded, modification.tline.pg_version)?;
+
+                        if decoded.is_dbase_create_copy(timeline.pg_version)
+                            && uncommitted_records > 0
+                        {
+                            // Special case: legacy PG database creations operate by reading pages from a 'template' database:
+                            // these are the only kinds of WAL record that require reading data blocks while ingesting.  Ensure
+                            // all earlier writes of data blocks are visible by committing any modification in flight.
+                            commit(
+                                &mut modification,
+                                &mut uncommitted_records,
+                                &mut filtered_records,
+                                &ctx,
+                            )
+                            .await?;
+                        }
+
                         // Ingest the records without immediately committing them.
                         let ingested = walingest
-                            .ingest_record(recdata, lsn, &mut modification, &mut decoded, &ctx)
+                            .ingest_record(decoded, lsn, &mut modification, &ctx)
                             .await
                             .with_context(|| format!("could not ingest record at {lsn}"))?;
                         if !ingested {
@@ -349,21 +383,25 @@ pub(super) async fn handle_walreceiver_connection(
                             || modification.approx_pending_bytes()
                                 > DatadirModification::MAX_PENDING_BYTES
                         {
-                            WAL_INGEST
-                                .records_committed
-                                .inc_by(uncommitted_records - filtered_records);
-                            modification.commit(&ctx).await?;
-                            uncommitted_records = 0;
-                            filtered_records = 0;
+                            commit(
+                                &mut modification,
+                                &mut uncommitted_records,
+                                &mut filtered_records,
+                                &ctx,
+                            )
+                            .await?;
                         }
                     }
 
                     // Commit the remaining records.
                     if uncommitted_records > 0 {
-                        WAL_INGEST
-                            .records_committed
-                            .inc_by(uncommitted_records - filtered_records);
-                        modification.commit(&ctx).await?;
+                        commit(
+                            &mut modification,
+                            &mut uncommitted_records,
+                            &mut filtered_records,
+                            &ctx,
+                        )
+                        .await?;
                     }
                 }
 
diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs
index 8ccd20adb1..2d3841881b 100644
--- a/pageserver/src/walingest.rs
+++ b/pageserver/src/walingest.rs
@@ -57,6 +57,7 @@ use utils::lsn::Lsn;
 
 pub struct WalIngest {
     shard: ShardIdentity,
+    pg_version: u32,
     checkpoint: CheckPoint,
     checkpoint_modified: bool,
     warn_ingest_lag: WarnIngestLag,
@@ -82,6 +83,7 @@ impl WalIngest {
 
         Ok(WalIngest {
             shard: *timeline.get_shard_identity(),
+            pg_version: timeline.pg_version,
             checkpoint,
             checkpoint_modified: false,
             warn_ingest_lag: WarnIngestLag {
@@ -104,10 +106,9 @@ impl WalIngest {
     ///
     pub async fn ingest_record(
         &mut self,
-        recdata: Bytes,
+        decoded: DecodedWALRecord,
         lsn: Lsn,
         modification: &mut DatadirModification<'_>,
-        decoded: &mut DecodedWALRecord,
         ctx: &RequestContext,
     ) -> anyhow::Result<bool> {
         WAL_INGEST.records_received.inc();
@@ -115,7 +116,12 @@ impl WalIngest {
         let prev_len = modification.len();
 
         modification.set_lsn(lsn)?;
-        decode_wal_record(recdata, decoded, pg_version)?;
+
+        if decoded.is_dbase_create_copy(self.pg_version) {
+            // Records of this type should always be preceded by a commit(), as they
+            // rely on reading data pages back from the Timeline.
+            assert!(!modification.has_dirty_data_pages());
+        }
 
         let mut buf = decoded.record.clone();
         buf.advance(decoded.main_data_offset);
@@ -133,11 +139,11 @@ impl WalIngest {
             pg_constants::RM_HEAP_ID | pg_constants::RM_HEAP2_ID => {
                 // Heap AM records need some special handling, because they modify VM pages
                 // without registering them with the standard mechanism.
-                self.ingest_heapam_record(&mut buf, modification, decoded, ctx)
+                self.ingest_heapam_record(&mut buf, modification, &decoded, ctx)
                     .await?;
             }
             pg_constants::RM_NEON_ID => {
-                self.ingest_neonrmgr_record(&mut buf, modification, decoded, ctx)
+                self.ingest_neonrmgr_record(&mut buf, modification, &decoded, ctx)
                     .await?;
             }
             // Handle other special record types
@@ -325,7 +331,7 @@ impl WalIngest {
             }
             pg_constants::RM_RELMAP_ID => {
                 let xlrec = XlRelmapUpdate::decode(&mut buf);
-                self.ingest_relmap_page(modification, &xlrec, decoded, ctx)
+                self.ingest_relmap_page(modification, &xlrec, &decoded, ctx)
                     .await?;
             }
             pg_constants::RM_XLOG_ID => {
@@ -470,7 +476,7 @@ impl WalIngest {
 
                 continue;
             }
-            self.ingest_decoded_block(modification, lsn, decoded, blk, ctx)
+            self.ingest_decoded_block(modification, lsn, &decoded, blk, ctx)
                 .await?;
         }
 
@@ -486,6 +492,8 @@ impl WalIngest {
         // until commit() is called to flush the data into the repository and update
         // the latest LSN.
 
+        modification.on_record_end();
+
         Ok(modification.len() > prev_len)
     }
 
@@ -557,6 +565,7 @@ impl WalIngest {
                 page_set_lsn(&mut image, lsn)
             }
             assert_eq!(image.len(), BLCKSZ as usize);
+
             self.put_rel_page_image(modification, rel, blk.blkno, image.freeze(), ctx)
                 .await?;
         } else {
@@ -1195,7 +1204,7 @@ impl WalIngest {
             if rec.blkno % pg_constants::SLOTS_PER_FSM_PAGE != 0 {
                 // Tail of last remaining FSM page has to be zeroed.
                 // We are not precise here and instead of digging in FSM bitmap format just clear the whole page.
-                modification.put_rel_page_image(rel, fsm_physical_page_no, ZERO_PAGE.clone())?;
+                modification.put_rel_page_image_zero(rel, fsm_physical_page_no);
                 fsm_physical_page_no += 1;
             }
             let nblocks = get_relsize(modification, rel, ctx).await?;
@@ -1217,7 +1226,7 @@ impl WalIngest {
             if rec.blkno % pg_constants::VM_HEAPBLOCKS_PER_PAGE != 0 {
                 // Tail of last remaining vm page has to be zeroed.
                 // We are not precise here and instead of digging in VM bitmap format just clear the whole page.
-                modification.put_rel_page_image(rel, vm_page_no, ZERO_PAGE.clone())?;
+                modification.put_rel_page_image_zero(rel, vm_page_no);
                 vm_page_no += 1;
             }
             let nblocks = get_relsize(modification, rel, ctx).await?;
@@ -1687,7 +1696,7 @@ impl WalIngest {
                     continue;
                 }
 
-                modification.put_rel_page_image(rel, gap_blknum, ZERO_PAGE.clone())?;
+                modification.put_rel_page_image_zero(rel, gap_blknum);
             }
         }
         Ok(())
@@ -1753,7 +1762,7 @@ impl WalIngest {
 
             // fill the gap with zeros
             for gap_blknum in old_nblocks..blknum {
-                modification.put_slru_page_image(kind, segno, gap_blknum, ZERO_PAGE.clone())?;
+                modification.put_slru_page_image_zero(kind, segno, gap_blknum);
             }
         }
         Ok(())
@@ -1827,21 +1836,25 @@ mod tests {
         walingest
             .put_rel_page_image(&mut m, TESTREL_A, 0, test_img("foo blk 0 at 2"), &ctx)
             .await?;
+        m.on_record_end();
         m.commit(&ctx).await?;
         let mut m = tline.begin_modification(Lsn(0x30));
         walingest
             .put_rel_page_image(&mut m, TESTREL_A, 0, test_img("foo blk 0 at 3"), &ctx)
             .await?;
+        m.on_record_end();
         m.commit(&ctx).await?;
         let mut m = tline.begin_modification(Lsn(0x40));
         walingest
             .put_rel_page_image(&mut m, TESTREL_A, 1, test_img("foo blk 1 at 4"), &ctx)
             .await?;
+        m.on_record_end();
         m.commit(&ctx).await?;
         let mut m = tline.begin_modification(Lsn(0x50));
         walingest
             .put_rel_page_image(&mut m, TESTREL_A, 2, test_img("foo blk 2 at 5"), &ctx)
             .await?;
+        m.on_record_end();
         m.commit(&ctx).await?;
 
         assert_current_logical_size(&tline, Lsn(0x50));
@@ -1983,6 +1996,7 @@ mod tests {
         walingest
             .put_rel_page_image(&mut m, TESTREL_A, 1, test_img("foo blk 1"), &ctx)
             .await?;
+        m.on_record_end();
         m.commit(&ctx).await?;
         assert_eq!(
             tline
@@ -2008,6 +2022,7 @@ mod tests {
         walingest
             .put_rel_page_image(&mut m, TESTREL_A, 1500, test_img("foo blk 1500"), &ctx)
             .await?;
+        m.on_record_end();
         m.commit(&ctx).await?;
         assert_eq!(
             tline
@@ -2409,7 +2424,6 @@ mod tests {
             .await
             .unwrap();
         let mut modification = tline.begin_modification(startpoint);
-        let mut decoded = DecodedWALRecord::default();
         println!("decoding {} bytes", bytes.len() - xlogoff);
 
         // Decode and ingest wal. We process the wal in chunks because
@@ -2417,8 +2431,10 @@ mod tests {
         for chunk in bytes[xlogoff..].chunks(50) {
             decoder.feed_bytes(chunk);
             while let Some((lsn, recdata)) = decoder.poll_decode().unwrap() {
+                let mut decoded = DecodedWALRecord::default();
+                decode_wal_record(recdata, &mut decoded, modification.tline.pg_version).unwrap();
                 walingest
-                    .ingest_record(recdata, lsn, &mut modification, &mut decoded, &ctx)
+                    .ingest_record(decoded, lsn, &mut modification, &ctx)
                     .instrument(span.clone())
                     .await
                     .unwrap();
diff --git a/pageserver/src/walrecord.rs b/pageserver/src/walrecord.rs
index edddcefbe1..0c4d575de8 100644
--- a/pageserver/src/walrecord.rs
+++ b/pageserver/src/walrecord.rs
@@ -160,6 +160,30 @@ pub struct DecodedWALRecord {
     pub origin_id: u16,
 }
 
+impl DecodedWALRecord {
+    /// Check if this WAL record represents a legacy "copy" database creation, which populates new relations
+    /// by reading other existing relations' data blocks.  This is more complex to apply than new-style database
+    /// creations which simply include all the desired blocks in the WAL, so we need a helper function to detect this case.
+    pub(crate) fn is_dbase_create_copy(&self, pg_version: u32) -> bool {
+        if self.xl_rmid == pg_constants::RM_DBASE_ID {
+            let info = self.xl_info & pg_constants::XLR_RMGR_INFO_MASK;
+            match pg_version {
+                14 => {
+                    // Postgres 14 database creations are always the legacy kind
+                    info == postgres_ffi::v14::bindings::XLOG_DBASE_CREATE
+                }
+                15 => info == postgres_ffi::v15::bindings::XLOG_DBASE_CREATE_FILE_COPY,
+                16 => info == postgres_ffi::v16::bindings::XLOG_DBASE_CREATE_FILE_COPY,
+                _ => {
+                    panic!("Unsupported postgres version {pg_version}")
+                }
+            }
+        } else {
+            false
+        }
+    }
+}
+
 #[repr(C)]
 #[derive(Debug, Clone, Copy)]
 pub struct RelFileNode {

From 1a874a3e863ac613f52eb0bbfe5e8d83bcfaba55 Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Tue, 3 Sep 2024 17:31:42 +0000
Subject: [PATCH 081/159] build(deps): bump flask-cors from 4.0.1 to 5.0.0
 (#8899)

---
 poetry.lock | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/poetry.lock b/poetry.lock
index 7db91e51f7..b8ef08b02d 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -1110,13 +1110,13 @@ dotenv = ["python-dotenv"]
 
 [[package]]
 name = "flask-cors"
-version = "4.0.1"
+version = "5.0.0"
 description = "A Flask extension adding a decorator for CORS support"
 optional = false
 python-versions = "*"
 files = [
-    {file = "Flask_Cors-4.0.1-py2.py3-none-any.whl", hash = "sha256:f2a704e4458665580c074b714c4627dd5a306b333deb9074d0b1794dfa2fb677"},
-    {file = "flask_cors-4.0.1.tar.gz", hash = "sha256:eeb69b342142fdbf4766ad99357a7f3876a2ceb77689dc10ff912aac06c389e4"},
+    {file = "Flask_Cors-5.0.0-py2.py3-none-any.whl", hash = "sha256:b9e307d082a9261c100d8fb0ba909eec6a228ed1b60a8315fd85f783d61910bc"},
+    {file = "flask_cors-5.0.0.tar.gz", hash = "sha256:5aadb4b950c4e93745034594d9f3ea6591f734bb3662e16e255ffbf5e89c88ef"},
 ]
 
 [package.dependencies]

From 3d9001d83ff54e8bd6a297c3328408323c4e21ff Mon Sep 17 00:00:00 2001
From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com>
Date: Wed, 4 Sep 2024 02:05:06 +0800
Subject: [PATCH 082/159] fix(pageserver): is_archived should be optional
 (#8902)

Set the field to optional, otherwise there will be decode errors when
newer version of the storage controller receives the JSON from older
version of the pageservers.

Signed-off-by: Alex Chi Z <chi@neon.tech>
---
 libs/pageserver_api/src/models.rs | 7 ++++++-
 pageserver/src/http/routes.rs     | 2 +-
 2 files changed, 7 insertions(+), 2 deletions(-)

diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs
index 1d896863df..87e8f8305a 100644
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -716,12 +716,17 @@ pub struct TimelineInfo {
     pub pg_version: u32,
 
     pub state: TimelineState,
-    pub is_archived: bool,
 
     pub walreceiver_status: String,
 
+    // ALWAYS add new fields at the end of the struct with `Option` to ensure forward/backward compatibility.
+    // Backward compatibility: you will get a JSON not containing the newly-added field.
+    // Forward compatibility: a previous version of the pageserver will receive a JSON. serde::Deserialize does
+    // not deny unknown fields by default so it's safe to set the field to some value, though it won't be
+    // read.
     /// The last aux file policy being used on this timeline
     pub last_aux_file_policy: Option<AuxFilePolicy>,
+    pub is_archived: Option<bool>,
 }
 
 #[derive(Debug, Clone, Serialize, Deserialize)]
diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs
index 8cf2c99c09..90ae6c5557 100644
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -468,7 +468,7 @@ async fn build_timeline_info_common(
         pg_version: timeline.pg_version,
 
         state,
-        is_archived,
+        is_archived: Some(is_archived),
 
         walreceiver_status,
 

From ecfa3d9de9eec824800db55f5e9592fe0502c96e Mon Sep 17 00:00:00 2001
From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com>
Date: Wed, 4 Sep 2024 05:39:56 +0800
Subject: [PATCH 083/159] fix(storage-scrubber): wrong trial condition (#8905)

ref https://github.com/neondatabase/neon/issues/8872

## Summary of changes

We saw stuck storage scrubber in staging caused by infinite retries. I
believe here we should use `min` instead of `max` to avoid getting
minutes or hours of retry backoff.

Signed-off-by: Alex Chi Z <chi@neon.tech>
---
 storage_scrubber/src/lib.rs | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/storage_scrubber/src/lib.rs b/storage_scrubber/src/lib.rs
index 3c21d2f8cf..3f08cddf50 100644
--- a/storage_scrubber/src/lib.rs
+++ b/storage_scrubber/src/lib.rs
@@ -422,7 +422,7 @@ fn stream_objects_with_retries<'a>(
                     let yield_err = if err.is_permanent() {
                         true
                     } else {
-                        let backoff_time = 1 << trial.max(5);
+                        let backoff_time = 1 << trial.min(5);
                         tokio::time::sleep(Duration::from_secs(backoff_time)).await;
                         trial += 1;
                         trial == MAX_RETRIES - 1
@@ -473,7 +473,7 @@ async fn list_objects_with_retries(
                     s3_target.delimiter,
                     DisplayErrorContext(e),
                 );
-                let backoff_time = 1 << trial.max(5);
+                let backoff_time = 1 << trial.min(5);
                 tokio::time::sleep(Duration::from_secs(backoff_time)).await;
             }
         }
@@ -492,7 +492,7 @@ async fn download_object_with_retries(
             Ok(response) => response,
             Err(e) => {
                 error!("Failed to download object for key {key}: {e}");
-                let backoff_time = 1 << trial.max(5);
+                let backoff_time = 1 << trial.min(5);
                 tokio::time::sleep(Duration::from_secs(backoff_time)).await;
                 continue;
             }
@@ -508,7 +508,7 @@ async fn download_object_with_retries(
             }
             Err(e) => {
                 error!("Failed to stream object body for key {key}: {e}");
-                let backoff_time = 1 << trial.max(5);
+                let backoff_time = 1 << trial.min(5);
                 tokio::time::sleep(Duration::from_secs(backoff_time)).await;
             }
         }

From 75310fe441b87d399213e365f1364aa9f08aa40d Mon Sep 17 00:00:00 2001
From: Vlad Lazar <vlad@neon.tech>
Date: Wed, 4 Sep 2024 10:09:41 +0100
Subject: [PATCH 084/159] storcon: make hb interval an argument and speed up
 tests (#8880)

## Problem
Each test might wait for up to 5s in order to HB the pageserver.

## Summary of changes
Make the heartbeat interval configurable and use a really tight one for
neon local => startup quicker
---
 control_plane/src/local_env.rs          |  7 +++++++
 control_plane/src/storage_controller.rs |  2 ++
 storage_controller/src/main.rs          | 12 ++++++++++--
 storage_controller/src/service.rs       |  9 ++++++---
 test_runner/regress/test_tenants.py     |  4 +++-
 5 files changed, 28 insertions(+), 6 deletions(-)

diff --git a/control_plane/src/local_env.rs b/control_plane/src/local_env.rs
index 74caba2b56..5dbc3bcbbc 100644
--- a/control_plane/src/local_env.rs
+++ b/control_plane/src/local_env.rs
@@ -165,6 +165,9 @@ pub struct NeonStorageControllerConf {
     pub split_threshold: Option<u64>,
 
     pub max_secondary_lag_bytes: Option<u64>,
+
+    #[serde(with = "humantime_serde")]
+    pub heartbeat_interval: Duration,
 }
 
 impl NeonStorageControllerConf {
@@ -172,6 +175,9 @@ impl NeonStorageControllerConf {
     const DEFAULT_MAX_OFFLINE_INTERVAL: std::time::Duration = std::time::Duration::from_secs(10);
 
     const DEFAULT_MAX_WARMING_UP_INTERVAL: std::time::Duration = std::time::Duration::from_secs(30);
+
+    // Very tight heartbeat interval to speed up tests
+    const DEFAULT_HEARTBEAT_INTERVAL: std::time::Duration = std::time::Duration::from_millis(100);
 }
 
 impl Default for NeonStorageControllerConf {
@@ -183,6 +189,7 @@ impl Default for NeonStorageControllerConf {
             database_url: None,
             split_threshold: None,
             max_secondary_lag_bytes: None,
+            heartbeat_interval: Self::DEFAULT_HEARTBEAT_INTERVAL,
         }
     }
 }
diff --git a/control_plane/src/storage_controller.rs b/control_plane/src/storage_controller.rs
index 27d8e2de0c..c715d6b789 100644
--- a/control_plane/src/storage_controller.rs
+++ b/control_plane/src/storage_controller.rs
@@ -437,6 +437,8 @@ impl StorageController {
             &humantime::Duration::from(self.config.max_offline).to_string(),
             "--max-warming-up-interval",
             &humantime::Duration::from(self.config.max_warming_up).to_string(),
+            "--heartbeat-interval",
+            &humantime::Duration::from(self.config.heartbeat_interval).to_string(),
             "--address-for-peers",
             &address_for_peers.to_string(),
         ]
diff --git a/storage_controller/src/main.rs b/storage_controller/src/main.rs
index e3f29b84e7..00e90f4467 100644
--- a/storage_controller/src/main.rs
+++ b/storage_controller/src/main.rs
@@ -11,8 +11,8 @@ use storage_controller::metrics::preinitialize_metrics;
 use storage_controller::persistence::Persistence;
 use storage_controller::service::chaos_injector::ChaosInjector;
 use storage_controller::service::{
-    Config, Service, MAX_OFFLINE_INTERVAL_DEFAULT, MAX_WARMING_UP_INTERVAL_DEFAULT,
-    RECONCILER_CONCURRENCY_DEFAULT,
+    Config, Service, HEARTBEAT_INTERVAL_DEFAULT, MAX_OFFLINE_INTERVAL_DEFAULT,
+    MAX_WARMING_UP_INTERVAL_DEFAULT, RECONCILER_CONCURRENCY_DEFAULT,
 };
 use tokio::signal::unix::SignalKind;
 use tokio_util::sync::CancellationToken;
@@ -104,6 +104,10 @@ struct Cli {
     // a pageserver
     #[arg(long)]
     max_secondary_lag_bytes: Option<u64>,
+
+    // Period with which to send heartbeats to registered nodes
+    #[arg(long)]
+    heartbeat_interval: Option<humantime::Duration>,
 }
 
 enum StrictMode {
@@ -285,6 +289,10 @@ async fn async_main() -> anyhow::Result<()> {
         split_threshold: args.split_threshold,
         neon_local_repo_dir: args.neon_local_repo_dir,
         max_secondary_lag_bytes: args.max_secondary_lag_bytes,
+        heartbeat_interval: args
+            .heartbeat_interval
+            .map(humantime::Duration::into)
+            .unwrap_or(HEARTBEAT_INTERVAL_DEFAULT),
         address_for_peers: args.address_for_peers,
         start_as_candidate: args.start_as_candidate,
         http_service_port: args.listen.port() as i32,
diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs
index 95821827e2..49253cb4e0 100644
--- a/storage_controller/src/service.rs
+++ b/storage_controller/src/service.rs
@@ -121,6 +121,9 @@ pub const MAX_OFFLINE_INTERVAL_DEFAULT: Duration = Duration::from_secs(30);
 /// being handled on the pageserver side.
 pub const MAX_WARMING_UP_INTERVAL_DEFAULT: Duration = Duration::from_secs(300);
 
+/// How often to send heartbeats to registered nodes?
+pub const HEARTBEAT_INTERVAL_DEFAULT: Duration = Duration::from_secs(5);
+
 #[derive(Clone, strum_macros::Display)]
 enum TenantOperations {
     Create,
@@ -326,6 +329,8 @@ pub struct Config {
     // upgraded to primary.
     pub max_secondary_lag_bytes: Option<u64>,
 
+    pub heartbeat_interval: Duration,
+
     pub address_for_peers: Option<Uri>,
 
     pub start_as_candidate: bool,
@@ -909,9 +914,7 @@ impl Service {
     async fn spawn_heartbeat_driver(&self) {
         self.startup_complete.clone().wait().await;
 
-        const HEARTBEAT_INTERVAL: Duration = Duration::from_secs(5);
-
-        let mut interval = tokio::time::interval(HEARTBEAT_INTERVAL);
+        let mut interval = tokio::time::interval(self.config.heartbeat_interval);
         while !self.cancel.is_cancelled() {
             tokio::select! {
               _ = interval.tick() => { }
diff --git a/test_runner/regress/test_tenants.py b/test_runner/regress/test_tenants.py
index 0ebf714de0..b63ff7f6bd 100644
--- a/test_runner/regress/test_tenants.py
+++ b/test_runner/regress/test_tenants.py
@@ -372,8 +372,10 @@ def test_create_churn_during_restart(neon_env_builder: NeonEnvBuilder):
     tenant_id: TenantId = env.initial_tenant
     timeline_id = env.initial_timeline
 
-    # Multiple creation requests which race will generate this error
+    # Multiple creation requests which race will generate this error on the pageserver
+    # and storage controller respectively
     env.pageserver.allowed_errors.append(".*Conflict: Tenant is already being modified.*")
+    env.storage_controller.allowed_errors.append(".*Conflict: Tenant is already being modified.*")
 
     # Tenant creation requests which arrive out of order will generate complaints about
     # generation nubmers out of order.

From 7a1397cf376cc4169385f6f19c371179396ada5f Mon Sep 17 00:00:00 2001
From: Joonas Koivunen <joonas@neon.tech>
Date: Wed, 4 Sep 2024 13:10:05 +0300
Subject: [PATCH 085/159] storcon: boilerplate to upsert safekeeper records on
 deploy (#8879)

We currently do not record safekeepers in the storage controller
database. We want to migrate timelines across safekeepers eventually, so
start recording the safekeepers on deploy.

Cc: #8698
---
 .../2024-08-23-102952_safekeepers/down.sql    |  2 +
 .../2024-08-23-102952_safekeepers/up.sql      | 15 ++++
 storage_controller/src/http.rs                | 57 ++++++++++++
 storage_controller/src/persistence.rs         | 86 +++++++++++++++++++
 storage_controller/src/schema.rs              | 14 +++
 storage_controller/src/service.rs             | 14 +++
 test_runner/fixtures/neon_fixtures.py         | 23 +++++
 .../regress/test_storage_controller.py        | 68 ++++++++++++++-
 8 files changed, 278 insertions(+), 1 deletion(-)
 create mode 100644 storage_controller/migrations/2024-08-23-102952_safekeepers/down.sql
 create mode 100644 storage_controller/migrations/2024-08-23-102952_safekeepers/up.sql

diff --git a/storage_controller/migrations/2024-08-23-102952_safekeepers/down.sql b/storage_controller/migrations/2024-08-23-102952_safekeepers/down.sql
new file mode 100644
index 0000000000..9dfc750586
--- /dev/null
+++ b/storage_controller/migrations/2024-08-23-102952_safekeepers/down.sql
@@ -0,0 +1,2 @@
+-- This file should undo anything in `up.sql`
+DROP TABLE safekeepers;
diff --git a/storage_controller/migrations/2024-08-23-102952_safekeepers/up.sql b/storage_controller/migrations/2024-08-23-102952_safekeepers/up.sql
new file mode 100644
index 0000000000..c78716660f
--- /dev/null
+++ b/storage_controller/migrations/2024-08-23-102952_safekeepers/up.sql
@@ -0,0 +1,15 @@
+-- started out as a copy of cplane schema, removed the unnecessary columns.
+CREATE TABLE safekeepers (
+	-- the surrogate identifier defined by control plane database sequence
+	id BIGINT PRIMARY KEY,
+	region_id TEXT NOT NULL,
+	version BIGINT NOT NULL,
+	-- the natural id on whatever cloud platform, not needed in storage controller
+	-- instance_id TEXT UNIQUE NOT NULL,
+	host TEXT NOT NULL,
+	port INTEGER NOT NULL,
+	active BOOLEAN NOT NULL DEFAULT false,
+	-- projects_count INTEGER NOT NULL DEFAULT 0,
+	http_port INTEGER NOT NULL,
+	availability_zone_id TEXT NOT NULL
+);
diff --git a/storage_controller/src/http.rs b/storage_controller/src/http.rs
index d3eb081be4..0fa4f4fd0e 100644
--- a/storage_controller/src/http.rs
+++ b/storage_controller/src/http.rs
@@ -2,6 +2,7 @@ use crate::metrics::{
     HttpRequestLatencyLabelGroup, HttpRequestStatusLabelGroup, PageserverRequestLabelGroup,
     METRICS_REGISTRY,
 };
+use crate::persistence::SafekeeperPersistence;
 use crate::reconciler::ReconcileError;
 use crate::service::{LeadershipStatus, Service, STARTUP_RECONCILE_TIMEOUT};
 use anyhow::Context;
@@ -767,6 +768,55 @@ impl From<ReconcileError> for ApiError {
     }
 }
 
+/// Return the safekeeper record by instance id, or 404.
+///
+/// Not used by anything except manual testing.
+async fn handle_get_safekeeper(req: Request<Body>) -> Result<Response<Body>, ApiError> {
+    check_permissions(&req, Scope::Admin)?;
+
+    let id = parse_request_param::<i64>(&req, "id")?;
+
+    let state = get_state(&req);
+
+    let res = state.service.get_safekeeper(id).await;
+
+    match res {
+        Ok(b) => json_response(StatusCode::OK, b),
+        Err(crate::persistence::DatabaseError::Query(diesel::result::Error::NotFound)) => {
+            Err(ApiError::NotFound("unknown instance_id".into()))
+        }
+        Err(other) => Err(other.into()),
+    }
+}
+
+/// Used as part of deployment scripts.
+///
+/// Assumes information is only relayed to storage controller after first selecting an unique id on
+/// control plane database, which means we have an id field in the request and payload.
+async fn handle_upsert_safekeeper(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
+    check_permissions(&req, Scope::Admin)?;
+
+    let body = json_request::<SafekeeperPersistence>(&mut req).await?;
+    let id = parse_request_param::<i64>(&req, "id")?;
+
+    if id != body.id {
+        // it should be repeated
+        return Err(ApiError::BadRequest(anyhow::anyhow!(
+            "id mismatch: url={id:?}, body={:?}",
+            body.id
+        )));
+    }
+
+    let state = get_state(&req);
+
+    state.service.upsert_safekeeper(body).await?;
+
+    Ok(Response::builder()
+        .status(StatusCode::NO_CONTENT)
+        .body(Body::empty())
+        .unwrap())
+}
+
 /// Common wrapper for request handlers that call into Service and will operate on tenants: they must only
 /// be allowed to run if Service has finished its initial reconciliation.
 async fn tenant_service_handler<R, H>(
@@ -1127,6 +1177,13 @@ pub fn make_router(
         .put("/control/v1/step_down", |r| {
             named_request_span(r, handle_step_down, RequestName("control_v1_step_down"))
         })
+        .get("/control/v1/safekeeper/:id", |r| {
+            named_request_span(r, handle_get_safekeeper, RequestName("v1_safekeeper"))
+        })
+        .post("/control/v1/safekeeper/:id", |r| {
+            // id is in the body
+            named_request_span(r, handle_upsert_safekeeper, RequestName("v1_safekeeper"))
+        })
         // Tenant operations
         // The ^/v1/ endpoints act as a "Virtual Pageserver", enabling shard-naive clients to call into
         // this service to manage tenants that actually consist of many tenant shards, as if they are a single entity.
diff --git a/storage_controller/src/persistence.rs b/storage_controller/src/persistence.rs
index 6e1c2016ff..d03eb87242 100644
--- a/storage_controller/src/persistence.rs
+++ b/storage_controller/src/persistence.rs
@@ -938,6 +938,48 @@ impl Persistence {
 
         Ok(())
     }
+
+    pub(crate) async fn safekeeper_get(
+        &self,
+        id: i64,
+    ) -> Result<SafekeeperPersistence, DatabaseError> {
+        use crate::schema::safekeepers::dsl::{id as id_column, safekeepers};
+        self.with_conn(move |conn| -> DatabaseResult<SafekeeperPersistence> {
+            Ok(safekeepers
+                .filter(id_column.eq(&id))
+                .select(SafekeeperPersistence::as_select())
+                .get_result(conn)?)
+        })
+        .await
+    }
+
+    pub(crate) async fn safekeeper_upsert(
+        &self,
+        record: SafekeeperPersistence,
+    ) -> Result<(), DatabaseError> {
+        use crate::schema::safekeepers::dsl::*;
+
+        self.with_conn(move |conn| -> DatabaseResult<()> {
+            let bind = record.as_insert_or_update();
+
+            let inserted_updated = diesel::insert_into(safekeepers)
+                .values(&bind)
+                .on_conflict(id)
+                .do_update()
+                .set(&bind)
+                .execute(conn)?;
+
+            if inserted_updated != 1 {
+                return Err(DatabaseError::Logical(format!(
+                    "unexpected number of rows ({})",
+                    inserted_updated
+                )));
+            }
+
+            Ok(())
+        })
+        .await
+    }
 }
 
 /// Parts of [`crate::tenant_shard::TenantShard`] that are stored durably
@@ -1073,3 +1115,47 @@ pub(crate) struct ControllerPersistence {
     pub(crate) address: String,
     pub(crate) started_at: chrono::DateTime<chrono::Utc>,
 }
+
+#[derive(Serialize, Deserialize, Queryable, Selectable, Eq, PartialEq, Debug, Clone)]
+#[diesel(table_name = crate::schema::safekeepers)]
+pub(crate) struct SafekeeperPersistence {
+    pub(crate) id: i64,
+    pub(crate) region_id: String,
+    /// 1 is special, it means just created (not currently posted to storcon).
+    /// Zero or negative is not really expected.
+    /// Otherwise the number from `release-$(number_of_commits_on_branch)` tag.
+    pub(crate) version: i64,
+    pub(crate) host: String,
+    pub(crate) port: i32,
+    pub(crate) active: bool,
+    pub(crate) http_port: i32,
+    pub(crate) availability_zone_id: String,
+}
+
+impl SafekeeperPersistence {
+    fn as_insert_or_update(&self) -> InsertUpdateSafekeeper<'_> {
+        InsertUpdateSafekeeper {
+            id: self.id,
+            region_id: &self.region_id,
+            version: self.version,
+            host: &self.host,
+            port: self.port,
+            active: self.active,
+            http_port: self.http_port,
+            availability_zone_id: &self.availability_zone_id,
+        }
+    }
+}
+
+#[derive(Insertable, AsChangeset)]
+#[diesel(table_name = crate::schema::safekeepers)]
+struct InsertUpdateSafekeeper<'a> {
+    id: i64,
+    region_id: &'a str,
+    version: i64,
+    host: &'a str,
+    port: i32,
+    active: bool,
+    http_port: i32,
+    availability_zone_id: &'a str,
+}
diff --git a/storage_controller/src/schema.rs b/storage_controller/src/schema.rs
index 1e8379500c..e0f515daea 100644
--- a/storage_controller/src/schema.rs
+++ b/storage_controller/src/schema.rs
@@ -45,3 +45,17 @@ diesel::table! {
 }
 
 diesel::allow_tables_to_appear_in_same_query!(controllers, metadata_health, nodes, tenant_shards,);
+
+diesel::table! {
+    safekeepers {
+        id -> Int8,
+        region_id -> Text,
+        version -> Int8,
+        instance_id -> Text,
+        host -> Text,
+        port -> Int4,
+        active -> Bool,
+        http_port -> Int4,
+        availability_zone_id -> Text,
+    }
+}
diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs
index 49253cb4e0..4ccc5c951c 100644
--- a/storage_controller/src/service.rs
+++ b/storage_controller/src/service.rs
@@ -6476,4 +6476,18 @@ impl Service {
 
         global_observed
     }
+
+    pub(crate) async fn get_safekeeper(
+        &self,
+        id: i64,
+    ) -> Result<crate::persistence::SafekeeperPersistence, DatabaseError> {
+        self.persistence.safekeeper_get(id).await
+    }
+
+    pub(crate) async fn upsert_safekeeper(
+        &self,
+        record: crate::persistence::SafekeeperPersistence,
+    ) -> Result<(), DatabaseError> {
+        self.persistence.safekeeper_upsert(record).await
+    }
 }
diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index 8c99408cfb..890538b86a 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -2845,6 +2845,29 @@ class NeonStorageController(MetricsGetter, LogUtils):
 
         raise AssertionError("unreachable")
 
+    def on_safekeeper_deploy(self, id: int, body: dict[str, Any]):
+        self.request(
+            "POST",
+            f"{self.api}/control/v1/safekeeper/{id}",
+            headers=self.headers(TokenScope.ADMIN),
+            json=body,
+        )
+
+    def get_safekeeper(self, id: int) -> Optional[dict[str, Any]]:
+        try:
+            response = self.request(
+                "GET",
+                f"{self.api}/control/v1/safekeeper/{id}",
+                headers=self.headers(TokenScope.ADMIN),
+            )
+            json = response.json()
+            assert isinstance(json, dict)
+            return json
+        except StorageControllerApiException as e:
+            if e.status_code == 404:
+                return None
+            raise e
+
     def __enter__(self) -> "NeonStorageController":
         return self
 
diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py
index 03eb7628be..13f5ec1b4f 100644
--- a/test_runner/regress/test_storage_controller.py
+++ b/test_runner/regress/test_storage_controller.py
@@ -31,7 +31,7 @@ from fixtures.pageserver.utils import (
     remote_storage_delete_key,
     timeline_delete_wait_completed,
 )
-from fixtures.pg_version import PgVersion
+from fixtures.pg_version import PgVersion, run_only_on_default_postgres
 from fixtures.port_distributor import PortDistributor
 from fixtures.remote_storage import RemoteStorageKind, s3_storage
 from fixtures.storage_controller_proxy import StorageControllerProxy
@@ -2330,3 +2330,69 @@ def test_storage_controller_timeline_crud_race(neon_env_builder: NeonEnvBuilder)
             connect=0,  # Disable retries: we want to see the 503
         )
     ).timeline_create(PgVersion.NOT_SET, tenant_id, create_timeline_id)
+
+
+@run_only_on_default_postgres("this is like a 'unit test' against storcon db")
+def test_safekeeper_deployment_time_update(neon_env_builder: NeonEnvBuilder):
+    env = neon_env_builder.init_configs()
+    env.start()
+
+    fake_id = 5
+
+    target = env.storage_controller
+
+    assert target.get_safekeeper(fake_id) is None
+
+    body = {
+        "active": True,
+        "id": fake_id,
+        "created_at": "2023-10-25T09:11:25Z",
+        "updated_at": "2024-08-28T11:32:43Z",
+        "region_id": "aws-us-east-2",
+        "host": "safekeeper-333.us-east-2.aws.neon.build",
+        "port": 6401,
+        "http_port": 7676,
+        "version": 5957,
+        "availability_zone_id": "us-east-2b",
+    }
+
+    target.on_safekeeper_deploy(fake_id, body)
+
+    inserted = target.get_safekeeper(fake_id)
+    assert inserted is not None
+    assert eq_safekeeper_records(body, inserted)
+
+    # error out if pk is changed (unexpected)
+    with pytest.raises(StorageControllerApiException) as exc:
+        different_pk = dict(body)
+        different_pk["id"] = 4
+        assert different_pk["id"] != body["id"]
+        target.on_safekeeper_deploy(fake_id, different_pk)
+    assert exc.value.status_code == 400
+
+    inserted_again = target.get_safekeeper(fake_id)
+    assert inserted_again is not None
+    assert eq_safekeeper_records(inserted, inserted_again)
+
+    # the most common case, version goes up:
+    assert isinstance(body["version"], int)
+    body["version"] += 1
+    target.on_safekeeper_deploy(fake_id, body)
+    inserted_now = target.get_safekeeper(fake_id)
+    assert inserted_now is not None
+
+    assert eq_safekeeper_records(body, inserted_now)
+
+
+def eq_safekeeper_records(a: dict[str, Any], b: dict[str, Any]) -> bool:
+    compared = [dict(a), dict(b)]
+
+    masked_keys = ["created_at", "updated_at"]
+
+    for d in compared:
+        # keep deleting these in case we are comparing the body as it will be uploaded by real scripts
+        for key in masked_keys:
+            if key in d:
+                del d[key]
+
+    return compared[0] == compared[1]

From a046717a2409b5291ad341c1f4d26cb1df1a55bd Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki@neon.tech>
Date: Wed, 4 Sep 2024 14:41:51 +0300
Subject: [PATCH 086/159] Fix submodule refs to point to the correct
 REL_X_STABLE_neon branches (#8910)

Commit cfa45ff5ee (PR #8860) updated the vendor/postgres submodules, but
didn't use the same commit SHAs that were pushed as the corresponding
REL_*_STABLE_neon branches in the postgres repository. The contents were
the same, but the REL_*_STABLE_neon branches pointed to squashed
versions of the commits, whereas the SHAs used in the submodules
referred to the pre-squash revisions.

Note: The vendor/postgres-v14 submodule still doesn't match with the tip
of REL_14_STABLE_neon branch, because there has been one more commit on
that branch since then. That's another confusion which we should fix,
but let's do that separately. This commit doesn't change the code that
gets built in any way, only changes the submodule references to point to
the correct SHAs in the REL_*_STABLE_neon branch histories, rather than
some detached commits.
---
 vendor/postgres-v14   | 2 +-
 vendor/postgres-v15   | 2 +-
 vendor/postgres-v16   | 2 +-
 vendor/revisions.json | 6 +++---
 4 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/vendor/postgres-v14 b/vendor/postgres-v14
index 48388a5b59..7602e907ab 160000
--- a/vendor/postgres-v14
+++ b/vendor/postgres-v14
@@ -1 +1 @@
-Subproject commit 48388a5b597c81c09e28c016650a7156b48717a1
+Subproject commit 7602e907ab30f16188bebfd66b8f297c2889d339
diff --git a/vendor/postgres-v15 b/vendor/postgres-v15
index 8aa1ded772..49d5e576a5 160000
--- a/vendor/postgres-v15
+++ b/vendor/postgres-v15
@@ -1 +1 @@
-Subproject commit 8aa1ded7726d416ac8e02600aad387a353478fc7
+Subproject commit 49d5e576a56e4cc59cd6a6a0791b2324b9fa675e
diff --git a/vendor/postgres-v16 b/vendor/postgres-v16
index 95132feffe..6e9a4ff624 160000
--- a/vendor/postgres-v16
+++ b/vendor/postgres-v16
@@ -1 +1 @@
-Subproject commit 95132feffe277ce84309d93a42e9aadfd2cb0437
+Subproject commit 6e9a4ff6249ac02b8175054b7b3f7dfb198be48b
diff --git a/vendor/revisions.json b/vendor/revisions.json
index 319e648488..751b9e8679 100644
--- a/vendor/revisions.json
+++ b/vendor/revisions.json
@@ -1,14 +1,14 @@
 {
   "v16": [
     "16.4",
-    "95132feffe277ce84309d93a42e9aadfd2cb0437"
+    "6e9a4ff6249ac02b8175054b7b3f7dfb198be48b"
   ],
   "v15": [
     "15.8",
-    "8aa1ded7726d416ac8e02600aad387a353478fc7"
+    "49d5e576a56e4cc59cd6a6a0791b2324b9fa675e"
   ],
   "v14": [
     "14.13",
-    "48388a5b597c81c09e28c016650a7156b48717a1"
+    "7602e907ab30f16188bebfd66b8f297c2889d339"
   ]
 }

From 3f43823a9b333140ccf21a55ff1316c351bacd58 Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Wed, 4 Sep 2024 13:41:10 +0100
Subject: [PATCH 087/159] build(deps): bump cryptography from 42.0.4 to 43.0.1
 (#8908)

---
 poetry.lock | 63 ++++++++++++++++++++++++-----------------------------
 1 file changed, 29 insertions(+), 34 deletions(-)

diff --git a/poetry.lock b/poetry.lock
index b8ef08b02d..48943a73e9 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -985,43 +985,38 @@ files = [
 
 [[package]]
 name = "cryptography"
-version = "42.0.4"
+version = "43.0.1"
 description = "cryptography is a package which provides cryptographic recipes and primitives to Python developers."
 optional = false
 python-versions = ">=3.7"
 files = [
-    {file = "cryptography-42.0.4-cp37-abi3-macosx_10_12_universal2.whl", hash = "sha256:ffc73996c4fca3d2b6c1c8c12bfd3ad00def8621da24f547626bf06441400449"},
-    {file = "cryptography-42.0.4-cp37-abi3-macosx_10_12_x86_64.whl", hash = "sha256:db4b65b02f59035037fde0998974d84244a64c3265bdef32a827ab9b63d61b18"},
-    {file = "cryptography-42.0.4-cp37-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:dad9c385ba8ee025bb0d856714f71d7840020fe176ae0229de618f14dae7a6e2"},
-    {file = "cryptography-42.0.4-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:69b22ab6506a3fe483d67d1ed878e1602bdd5912a134e6202c1ec672233241c1"},
-    {file = "cryptography-42.0.4-cp37-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:e09469a2cec88fb7b078e16d4adec594414397e8879a4341c6ace96013463d5b"},
-    {file = "cryptography-42.0.4-cp37-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:3e970a2119507d0b104f0a8e281521ad28fc26f2820687b3436b8c9a5fcf20d1"},
-    {file = "cryptography-42.0.4-cp37-abi3-musllinux_1_1_aarch64.whl", hash = "sha256:e53dc41cda40b248ebc40b83b31516487f7db95ab8ceac1f042626bc43a2f992"},
-    {file = "cryptography-42.0.4-cp37-abi3-musllinux_1_1_x86_64.whl", hash = "sha256:c3a5cbc620e1e17009f30dd34cb0d85c987afd21c41a74352d1719be33380885"},
-    {file = "cryptography-42.0.4-cp37-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:6bfadd884e7280df24d26f2186e4e07556a05d37393b0f220a840b083dc6a824"},
-    {file = "cryptography-42.0.4-cp37-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:01911714117642a3f1792c7f376db572aadadbafcd8d75bb527166009c9f1d1b"},
-    {file = "cryptography-42.0.4-cp37-abi3-win32.whl", hash = "sha256:fb0cef872d8193e487fc6bdb08559c3aa41b659a7d9be48b2e10747f47863925"},
-    {file = "cryptography-42.0.4-cp37-abi3-win_amd64.whl", hash = "sha256:c1f25b252d2c87088abc8bbc4f1ecbf7c919e05508a7e8628e6875c40bc70923"},
-    {file = "cryptography-42.0.4-cp39-abi3-macosx_10_12_universal2.whl", hash = "sha256:15a1fb843c48b4a604663fa30af60818cd28f895572386e5f9b8a665874c26e7"},
-    {file = "cryptography-42.0.4-cp39-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a1327f280c824ff7885bdeef8578f74690e9079267c1c8bd7dc5cc5aa065ae52"},
-    {file = "cryptography-42.0.4-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6ffb03d419edcab93b4b19c22ee80c007fb2d708429cecebf1dd3258956a563a"},
-    {file = "cryptography-42.0.4-cp39-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:1df6fcbf60560d2113b5ed90f072dc0b108d64750d4cbd46a21ec882c7aefce9"},
-    {file = "cryptography-42.0.4-cp39-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:44a64043f743485925d3bcac548d05df0f9bb445c5fcca6681889c7c3ab12764"},
-    {file = "cryptography-42.0.4-cp39-abi3-musllinux_1_1_aarch64.whl", hash = "sha256:3c6048f217533d89f2f8f4f0fe3044bf0b2090453b7b73d0b77db47b80af8dff"},
-    {file = "cryptography-42.0.4-cp39-abi3-musllinux_1_1_x86_64.whl", hash = "sha256:6d0fbe73728c44ca3a241eff9aefe6496ab2656d6e7a4ea2459865f2e8613257"},
-    {file = "cryptography-42.0.4-cp39-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:887623fe0d70f48ab3f5e4dbf234986b1329a64c066d719432d0698522749929"},
-    {file = "cryptography-42.0.4-cp39-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:ce8613beaffc7c14f091497346ef117c1798c202b01153a8cc7b8e2ebaaf41c0"},
-    {file = "cryptography-42.0.4-cp39-abi3-win32.whl", hash = "sha256:810bcf151caefc03e51a3d61e53335cd5c7316c0a105cc695f0959f2c638b129"},
-    {file = "cryptography-42.0.4-cp39-abi3-win_amd64.whl", hash = "sha256:a0298bdc6e98ca21382afe914c642620370ce0470a01e1bef6dd9b5354c36854"},
-    {file = "cryptography-42.0.4-pp310-pypy310_pp73-macosx_10_12_x86_64.whl", hash = "sha256:5f8907fcf57392cd917892ae83708761c6ff3c37a8e835d7246ff0ad251d9298"},
-    {file = "cryptography-42.0.4-pp310-pypy310_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:12d341bd42cdb7d4937b0cabbdf2a94f949413ac4504904d0cdbdce4a22cbf88"},
-    {file = "cryptography-42.0.4-pp310-pypy310_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:1cdcdbd117681c88d717437ada72bdd5be9de117f96e3f4d50dab3f59fd9ab20"},
-    {file = "cryptography-42.0.4-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:0e89f7b84f421c56e7ff69f11c441ebda73b8a8e6488d322ef71746224c20fce"},
-    {file = "cryptography-42.0.4-pp39-pypy39_pp73-macosx_10_12_x86_64.whl", hash = "sha256:f1e85a178384bf19e36779d91ff35c7617c885da487d689b05c1366f9933ad74"},
-    {file = "cryptography-42.0.4-pp39-pypy39_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:d2a27aca5597c8a71abbe10209184e1a8e91c1fd470b5070a2ea60cafec35bcd"},
-    {file = "cryptography-42.0.4-pp39-pypy39_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:4e36685cb634af55e0677d435d425043967ac2f3790ec652b2b88ad03b85c27b"},
-    {file = "cryptography-42.0.4-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:f47be41843200f7faec0683ad751e5ef11b9a56a220d57f300376cd8aba81660"},
-    {file = "cryptography-42.0.4.tar.gz", hash = "sha256:831a4b37accef30cccd34fcb916a5d7b5be3cbbe27268a02832c3e450aea39cb"},
+    {file = "cryptography-43.0.1-cp37-abi3-macosx_10_9_universal2.whl", hash = "sha256:8385d98f6a3bf8bb2d65a73e17ed87a3ba84f6991c155691c51112075f9ffc5d"},
+    {file = "cryptography-43.0.1-cp37-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:27e613d7077ac613e399270253259d9d53872aaf657471473ebfc9a52935c062"},
+    {file = "cryptography-43.0.1-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:68aaecc4178e90719e95298515979814bda0cbada1256a4485414860bd7ab962"},
+    {file = "cryptography-43.0.1-cp37-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:de41fd81a41e53267cb020bb3a7212861da53a7d39f863585d13ea11049cf277"},
+    {file = "cryptography-43.0.1-cp37-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:f98bf604c82c416bc829e490c700ca1553eafdf2912a91e23a79d97d9801372a"},
+    {file = "cryptography-43.0.1-cp37-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:61ec41068b7b74268fa86e3e9e12b9f0c21fcf65434571dbb13d954bceb08042"},
+    {file = "cryptography-43.0.1-cp37-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:014f58110f53237ace6a408b5beb6c427b64e084eb451ef25a28308270086494"},
+    {file = "cryptography-43.0.1-cp37-abi3-win32.whl", hash = "sha256:2bd51274dcd59f09dd952afb696bf9c61a7a49dfc764c04dd33ef7a6b502a1e2"},
+    {file = "cryptography-43.0.1-cp37-abi3-win_amd64.whl", hash = "sha256:666ae11966643886c2987b3b721899d250855718d6d9ce41b521252a17985f4d"},
+    {file = "cryptography-43.0.1-cp39-abi3-macosx_10_9_universal2.whl", hash = "sha256:ac119bb76b9faa00f48128b7f5679e1d8d437365c5d26f1c2c3f0da4ce1b553d"},
+    {file = "cryptography-43.0.1-cp39-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1bbcce1a551e262dfbafb6e6252f1ae36a248e615ca44ba302df077a846a8806"},
+    {file = "cryptography-43.0.1-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:58d4e9129985185a06d849aa6df265bdd5a74ca6e1b736a77959b498e0505b85"},
+    {file = "cryptography-43.0.1-cp39-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:d03a475165f3134f773d1388aeb19c2d25ba88b6a9733c5c590b9ff7bbfa2e0c"},
+    {file = "cryptography-43.0.1-cp39-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:511f4273808ab590912a93ddb4e3914dfd8a388fed883361b02dea3791f292e1"},
+    {file = "cryptography-43.0.1-cp39-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:80eda8b3e173f0f247f711eef62be51b599b5d425c429b5d4ca6a05e9e856baa"},
+    {file = "cryptography-43.0.1-cp39-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:38926c50cff6f533f8a2dae3d7f19541432610d114a70808f0926d5aaa7121e4"},
+    {file = "cryptography-43.0.1-cp39-abi3-win32.whl", hash = "sha256:a575913fb06e05e6b4b814d7f7468c2c660e8bb16d8d5a1faf9b33ccc569dd47"},
+    {file = "cryptography-43.0.1-cp39-abi3-win_amd64.whl", hash = "sha256:d75601ad10b059ec832e78823b348bfa1a59f6b8d545db3a24fd44362a1564cb"},
+    {file = "cryptography-43.0.1-pp310-pypy310_pp73-macosx_10_9_x86_64.whl", hash = "sha256:ea25acb556320250756e53f9e20a4177515f012c9eaea17eb7587a8c4d8ae034"},
+    {file = "cryptography-43.0.1-pp310-pypy310_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:c1332724be35d23a854994ff0b66530119500b6053d0bd3363265f7e5e77288d"},
+    {file = "cryptography-43.0.1-pp310-pypy310_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:fba1007b3ef89946dbbb515aeeb41e30203b004f0b4b00e5e16078b518563289"},
+    {file = "cryptography-43.0.1-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:5b43d1ea6b378b54a1dc99dd8a2b5be47658fe9a7ce0a58ff0b55f4b43ef2b84"},
+    {file = "cryptography-43.0.1-pp39-pypy39_pp73-macosx_10_9_x86_64.whl", hash = "sha256:88cce104c36870d70c49c7c8fd22885875d950d9ee6ab54df2745f83ba0dc365"},
+    {file = "cryptography-43.0.1-pp39-pypy39_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:9d3cdb25fa98afdd3d0892d132b8d7139e2c087da1712041f6b762e4f807cc96"},
+    {file = "cryptography-43.0.1-pp39-pypy39_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:e710bf40870f4db63c3d7d929aa9e09e4e7ee219e703f949ec4073b4294f6172"},
+    {file = "cryptography-43.0.1-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:7c05650fe8023c5ed0d46793d4b7d7e6cd9c04e68eabe5b0aeea836e37bdcec2"},
+    {file = "cryptography-43.0.1.tar.gz", hash = "sha256:203e92a75716d8cfb491dc47c79e17d0d9207ccffcbcb35f598fbe463ae3444d"},
 ]
 
 [package.dependencies]
@@ -1034,7 +1029,7 @@ nox = ["nox"]
 pep8test = ["check-sdist", "click", "mypy", "ruff"]
 sdist = ["build"]
 ssh = ["bcrypt (>=3.1.5)"]
-test = ["certifi", "pretend", "pytest (>=6.2.0)", "pytest-benchmark", "pytest-cov", "pytest-xdist"]
+test = ["certifi", "cryptography-vectors (==43.0.1)", "pretend", "pytest (>=6.2.0)", "pytest-benchmark", "pytest-cov", "pytest-xdist"]
 test-randomorder = ["pytest-randomly"]
 
 [[package]]

From 1a9b54f1d99fb373eddc7f3ff57174031d34c7b6 Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Wed, 4 Sep 2024 15:00:40 +0100
Subject: [PATCH 088/159] storage controller: read from database in validate
 API (#8784)

## Problem

The initial implementation of the validate API treats the in-memory
generations as authoritative.
- This is true when only one storage controller is running, but if a
rogue controller was running that hadn't been shut down properly, and
some pageserver requests were routed to that bad controller, it could
incorrectly return valid=true for stale generations.
- The generation in the main in-memory map gets out of date while a live
migration is in flight, and if the origin location for the migration
tries to do some deletions even though it is in AttachedStale (for
example because it had already started compaction), these might be
wrongly validated + executed.

## Summary of changes

- Continue to do the in-memory check: if this returns valid=false it is
sufficient to reject requests.
- When valid=true, do an additional read from the database to confirm
the generation is fresh.
- Revise behavior for validation on missing shards: this used to always
return valid=true as a convenience for deletions and shard splits, so
that pageservers weren't prevented from completing any enqueued
deletions for these shards after they're gone. However, this becomes
unsafe when we consider split brain scenarios. We could reinstate this
in future if we wanted to store some tombstones for deleted shards.
- Update test_scrubber_physical_gc to cope with the behavioral change:
they must now explicitly flush the deletion queue before splits, to
avoid tripping up on deletions that are enqueued at the time of the
split (these tests assert "scrubber deletes nothing", which check fails
if the split leaves behind some remote objects that are legitimately
GC'able)
- Add `test_storage_controller_validate_during_migration`, which uses
failpoints to create a situation where incorrect generation validation
during a live migration could result in a corruption

The rate of validate calls for tenants is pretty low: it happens as a
consequence deletions from GC and compaction, which are both
concurrency-limited on the pageserver side.
---
 storage_controller/src/http.rs                |   2 +-
 storage_controller/src/persistence.rs         |  70 ++++++++++-
 storage_controller/src/reconciler.rs          |   3 +
 storage_controller/src/service.rs             |  91 ++++++++++----
 .../regress/test_storage_controller.py        | 116 ++++++++++++++++++
 test_runner/regress/test_storage_scrubber.py  |  11 ++
 6 files changed, 261 insertions(+), 32 deletions(-)

diff --git a/storage_controller/src/http.rs b/storage_controller/src/http.rs
index 0fa4f4fd0e..32882c201a 100644
--- a/storage_controller/src/http.rs
+++ b/storage_controller/src/http.rs
@@ -102,7 +102,7 @@ async fn handle_validate(mut req: Request<Body>) -> Result<Response<Body>, ApiEr
 
     let validate_req = json_request::<ValidateRequest>(&mut req).await?;
     let state = get_state(&req);
-    json_response(StatusCode::OK, state.service.validate(validate_req))
+    json_response(StatusCode::OK, state.service.validate(validate_req).await?)
 }
 
 /// Call into this before attaching a tenant to a pageserver, to acquire a generation number
diff --git a/storage_controller/src/persistence.rs b/storage_controller/src/persistence.rs
index d03eb87242..e801289752 100644
--- a/storage_controller/src/persistence.rs
+++ b/storage_controller/src/persistence.rs
@@ -8,6 +8,7 @@ use self::split_state::SplitState;
 use diesel::pg::PgConnection;
 use diesel::prelude::*;
 use diesel::Connection;
+use itertools::Itertools;
 use pageserver_api::controller_api::MetadataHealthRecord;
 use pageserver_api::controller_api::ShardSchedulingPolicy;
 use pageserver_api::controller_api::{NodeSchedulingPolicy, PlacementPolicy};
@@ -91,7 +92,8 @@ pub(crate) enum DatabaseOperation {
     Detach,
     ReAttach,
     IncrementGeneration,
-    PeekGenerations,
+    TenantGenerations,
+    ShardGenerations,
     ListTenantShards,
     InsertTenantShards,
     UpdateTenantShard,
@@ -544,13 +546,13 @@ impl Persistence {
     /// If the tenant doesn't exist, an empty vector is returned.
     ///
     /// Output is sorted by shard number
-    pub(crate) async fn peek_generations(
+    pub(crate) async fn tenant_generations(
         &self,
         filter_tenant_id: TenantId,
     ) -> Result<Vec<ShardGenerationState>, DatabaseError> {
         use crate::schema::tenant_shards::dsl::*;
         let rows = self
-            .with_measured_conn(DatabaseOperation::PeekGenerations, move |conn| {
+            .with_measured_conn(DatabaseOperation::TenantGenerations, move |conn| {
                 let result = tenant_shards
                     .filter(tenant_id.eq(filter_tenant_id.to_string()))
                     .select(TenantShardPersistence::as_select())
@@ -572,6 +574,64 @@ impl Persistence {
             .collect())
     }
 
+    /// Read the generation number of specific tenant shards
+    ///
+    /// Output is unsorted.  Output may not include values for all inputs, if they are missing in the database.
+    pub(crate) async fn shard_generations(
+        &self,
+        mut tenant_shard_ids: impl Iterator<Item = &TenantShardId>,
+    ) -> Result<Vec<(TenantShardId, Option<Generation>)>, DatabaseError> {
+        let mut rows = Vec::with_capacity(tenant_shard_ids.size_hint().0);
+
+        // We will chunk our input to avoid composing arbitrarily long `IN` clauses.  Typically we are
+        // called with a single digit number of IDs, but in principle we could be called with tens
+        // of thousands (all the shards on one pageserver) from the generation validation API.
+        loop {
+            // A modest hardcoded chunk size to handle typical cases in a single query but never generate particularly
+            // large query strings.
+            let chunk_ids = tenant_shard_ids.by_ref().take(32);
+
+            // Compose a comma separated list of tuples for matching on (tenant_id, shard_number, shard_count)
+            let in_clause = chunk_ids
+                .map(|tsid| {
+                    format!(
+                        "('{}', {}, {})",
+                        tsid.tenant_id, tsid.shard_number.0, tsid.shard_count.0
+                    )
+                })
+                .join(",");
+
+            // We are done when our iterator gives us nothing to filter on
+            if in_clause.is_empty() {
+                break;
+            }
+
+            let chunk_rows = self
+                .with_measured_conn(DatabaseOperation::ShardGenerations, move |conn| {
+                    // diesel doesn't support multi-column IN queries, so we compose raw SQL.  No escaping is required because
+                    // the inputs are strongly typed and cannot carry any user-supplied raw string content.
+                    let result : Vec<TenantShardPersistence> = diesel::sql_query(
+                        format!("SELECT * from tenant_shards where (tenant_id, shard_number, shard_count) in ({in_clause});").as_str()
+                    ).load(conn)?;
+
+                    Ok(result)
+                })
+                .await?;
+            rows.extend(chunk_rows.into_iter())
+        }
+
+        Ok(rows
+            .into_iter()
+            .map(|tsp| {
+                (
+                    tsp.get_tenant_shard_id()
+                        .expect("Bad tenant ID in database"),
+                    tsp.generation.map(|g| Generation::new(g as u32)),
+                )
+            })
+            .collect())
+    }
+
     #[allow(non_local_definitions)]
     /// For use when updating a persistent property of a tenant, such as its config or placement_policy.
     ///
@@ -983,7 +1043,9 @@ impl Persistence {
 }
 
 /// Parts of [`crate::tenant_shard::TenantShard`] that are stored durably
-#[derive(Queryable, Selectable, Insertable, Serialize, Deserialize, Clone, Eq, PartialEq)]
+#[derive(
+    QueryableByName, Queryable, Selectable, Insertable, Serialize, Deserialize, Clone, Eq, PartialEq,
+)]
 #[diesel(table_name = crate::schema::tenant_shards)]
 pub(crate) struct TenantShardPersistence {
     #[serde(default)]
diff --git a/storage_controller/src/reconciler.rs b/storage_controller/src/reconciler.rs
index 102a3124d2..83b7b2b4f2 100644
--- a/storage_controller/src/reconciler.rs
+++ b/storage_controller/src/reconciler.rs
@@ -17,6 +17,7 @@ use utils::failpoint_support;
 use utils::generation::Generation;
 use utils::id::{NodeId, TimelineId};
 use utils::lsn::Lsn;
+use utils::pausable_failpoint;
 use utils::sync::gate::GateGuard;
 
 use crate::compute_hook::{ComputeHook, NotifyError};
@@ -593,6 +594,8 @@ impl Reconciler {
             notify_attempts += 1;
         }
 
+        pausable_failpoint!("reconciler-live-migrate-post-notify");
+
         // Downgrade the origin to secondary.  If the tenant's policy is PlacementPolicy::Attached(0), then
         // this location will be deleted in the general case reconciliation that runs after this.
         let origin_secondary_conf = build_location_config(
diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs
index 4ccc5c951c..90334d10a7 100644
--- a/storage_controller/src/service.rs
+++ b/storage_controller/src/service.rs
@@ -1854,37 +1854,74 @@ impl Service {
         Ok(response)
     }
 
-    pub(crate) fn validate(&self, validate_req: ValidateRequest) -> ValidateResponse {
-        let locked = self.inner.read().unwrap();
+    pub(crate) async fn validate(
+        &self,
+        validate_req: ValidateRequest,
+    ) -> Result<ValidateResponse, DatabaseError> {
+        // Fast in-memory check: we may reject validation on anything that doesn't match our
+        // in-memory generation for a shard
+        let in_memory_result = {
+            let mut in_memory_result = Vec::new();
+            let locked = self.inner.read().unwrap();
+            for req_tenant in validate_req.tenants {
+                if let Some(tenant_shard) = locked.tenants.get(&req_tenant.id) {
+                    let valid = tenant_shard.generation == Some(Generation::new(req_tenant.gen));
+                    tracing::info!(
+                        "handle_validate: {}(gen {}): valid={valid} (latest {:?})",
+                        req_tenant.id,
+                        req_tenant.gen,
+                        tenant_shard.generation
+                    );
+
+                    in_memory_result.push((req_tenant.id, Generation::new(req_tenant.gen), valid));
+                } else {
+                    // This is legal: for example during a shard split the pageserver may still
+                    // have deletions in its queue from the old pre-split shard, or after deletion
+                    // of a tenant that was busy with compaction/gc while being deleted.
+                    tracing::info!(
+                        "Refusing deletion validation for missing shard {}",
+                        req_tenant.id
+                    );
+                }
+            }
+
+            in_memory_result
+        };
+
+        // Database calls to confirm validity for anything that passed the in-memory check.  We must do this
+        // in case of controller split-brain, where some other controller process might have incremented the generation.
+        let db_generations = self
+            .persistence
+            .shard_generations(in_memory_result.iter().filter_map(|i| {
+                if i.2 {
+                    Some(&i.0)
+                } else {
+                    None
+                }
+            }))
+            .await?;
+        let db_generations = db_generations.into_iter().collect::<HashMap<_, _>>();
 
         let mut response = ValidateResponse {
             tenants: Vec::new(),
         };
-
-        for req_tenant in validate_req.tenants {
-            if let Some(tenant_shard) = locked.tenants.get(&req_tenant.id) {
-                let valid = tenant_shard.generation == Some(Generation::new(req_tenant.gen));
-                tracing::info!(
-                    "handle_validate: {}(gen {}): valid={valid} (latest {:?})",
-                    req_tenant.id,
-                    req_tenant.gen,
-                    tenant_shard.generation
-                );
-                response.tenants.push(ValidateResponseTenant {
-                    id: req_tenant.id,
-                    valid,
-                });
+        for (tenant_shard_id, validate_generation, valid) in in_memory_result.into_iter() {
+            let valid = if valid {
+                let db_generation = db_generations.get(&tenant_shard_id);
+                db_generation == Some(&Some(validate_generation))
             } else {
-                // After tenant deletion, we may approve any validation.  This avoids
-                // spurious warnings on the pageserver if it has pending LSN updates
-                // at the point a deletion happens.
-                response.tenants.push(ValidateResponseTenant {
-                    id: req_tenant.id,
-                    valid: true,
-                });
-            }
+                // If in-memory state says it's invalid, trust that.  It's always safe to fail a validation, at worst
+                // this prevents a pageserver from cleaning up an object in S3.
+                false
+            };
+
+            response.tenants.push(ValidateResponseTenant {
+                id: tenant_shard_id,
+                valid,
+            })
         }
-        response
+
+        Ok(response)
     }
 
     pub(crate) async fn tenant_create(
@@ -3179,7 +3216,7 @@ impl Service {
             // run concurrently with reconciliations, and it is not guaranteed that the node we find here
             // will still be the latest when we're done: we will check generations again at the end of
             // this function to handle that.
-            let generations = self.persistence.peek_generations(tenant_id).await?;
+            let generations = self.persistence.tenant_generations(tenant_id).await?;
 
             if generations
                 .iter()
@@ -3236,7 +3273,7 @@ impl Service {
         // Post-check: are all the generations of all the shards the same as they were initially?  This proves that
         // our remote operation executed on the latest generation and is therefore persistent.
         {
-            let latest_generations = self.persistence.peek_generations(tenant_id).await?;
+            let latest_generations = self.persistence.tenant_generations(tenant_id).await?;
             if latest_generations
                 .into_iter()
                 .map(
diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py
index 13f5ec1b4f..8da42294b0 100644
--- a/test_runner/regress/test_storage_controller.py
+++ b/test_runner/regress/test_storage_controller.py
@@ -2332,6 +2332,122 @@ def test_storage_controller_timeline_crud_race(neon_env_builder: NeonEnvBuilder)
     ).timeline_create(PgVersion.NOT_SET, tenant_id, create_timeline_id)
 
 
+def test_storage_controller_validate_during_migration(neon_env_builder: NeonEnvBuilder):
+    """
+    A correctness edge case: while we are live migrating and a shard's generation is
+    visible to the Reconciler but not to the central Service, the generation validation
+    API should still prevent stale generations from doing deletions.
+    """
+    neon_env_builder.num_pageservers = 2
+    neon_env_builder.enable_pageserver_remote_storage(s3_storage())
+    env = neon_env_builder.init_configs()
+    env.start()
+
+    TENANT_CONF = {
+        # small checkpointing and compaction targets to ensure we generate many upload operations
+        "checkpoint_distance": 128 * 1024,
+        "compaction_threshold": 1,
+        "compaction_target_size": 128 * 1024,
+        # disable background compaction and GC. We invoke it manually when we want it to happen.
+        "gc_period": "0s",
+        "compaction_period": "0s",
+    }
+
+    tenant_id = env.initial_tenant
+    timeline_id = env.initial_timeline
+    env.neon_cli.create_tenant(tenant_id, timeline_id)
+    env.storage_controller.pageserver_api().set_tenant_config(tenant_id, TENANT_CONF)
+
+    # Write enough data that a compaction would do some work (deleting some L0s)
+    workload = Workload(env, tenant_id, timeline_id)
+    workload.init()
+    workload.write_rows(64)
+    for _i in range(0, 2):
+        workload.churn_rows(64, upload=False)
+
+    # Upload but don't compact
+    origin_pageserver = env.get_tenant_pageserver(tenant_id)
+    dest_ps_id = [p.id for p in env.pageservers if p.id != origin_pageserver.id][0]
+    origin_pageserver.http_client().timeline_checkpoint(
+        tenant_id, timeline_id, wait_until_uploaded=True, compact=False
+    )
+
+    # Start a compaction that will pause on a failpoint.
+    compaction_failpoint = "before-upload-index-pausable"
+    origin_pageserver.http_client().configure_failpoints((compaction_failpoint, "pause"))
+
+    # This failpoint can also cause migration code to time out trying to politely flush
+    # during migrations
+    origin_pageserver.allowed_errors.append(".*Timed out waiting for flush to remote storage.*")
+
+    try:
+        with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor:
+            compact_fut = executor.submit(
+                origin_pageserver.http_client().timeline_compact,
+                tenant_id,
+                timeline_id,
+                wait_until_uploaded=True,
+            )
+
+            # Let the compaction start and then get stuck uploading an index: when we live migrate, the new generation's
+            # index will be initialized from the pre-compaction index, referencing layers that the compaction will try to delete
+            def has_hit_compaction_failpoint():
+                assert origin_pageserver.log_contains(f"at failpoint {compaction_failpoint}")
+
+            wait_until(10, 1, has_hit_compaction_failpoint)
+
+            # While the compaction is running, start a live migration which will pause long enough for the compaction to sleep,
+            # after incrementing generation and attaching the new location
+            migration_failpoint = "reconciler-live-migrate-post-notify"
+            env.storage_controller.configure_failpoints((migration_failpoint, "pause"))
+            migrate_fut = executor.submit(
+                env.storage_controller.tenant_shard_migrate,
+                TenantShardId(tenant_id, 0, 0),
+                dest_ps_id,
+            )
+
+            def has_hit_migration_failpoint():
+                assert env.storage_controller.log_contains(f"at failpoint {migration_failpoint}")
+
+            # Long wait because the migration will have to time out during transition to AttachedStale
+            # before it reaches this point.  The timeout is because the AttachedStale transition includes
+            # a flush of remote storage, and if the compaction already enqueued an index upload this cannot
+            # make progress.
+            wait_until(60, 1, has_hit_migration_failpoint)
+
+            # Origin pageserver has succeeded with compaction before the migration completed. It has done all the writes it wanted to do in its own (stale) generation
+            origin_pageserver.http_client().configure_failpoints((compaction_failpoint, "off"))
+            compact_fut.result()
+            origin_pageserver.http_client().deletion_queue_flush(execute=True)
+
+            # Eventually migration completes
+            env.storage_controller.configure_failpoints((migration_failpoint, "off"))
+            migrate_fut.result()
+    except:
+        # Always disable 'pause' failpoints, even on failure, to avoid hanging in shutdown
+        env.storage_controller.configure_failpoints((migration_failpoint, "off"))
+        origin_pageserver.http_client().configure_failpoints((compaction_failpoint, "off"))
+        raise
+
+    # Ensure the destination of the migration writes an index, so that if it has corrupt state that is
+    # visible to the scrubber.
+    workload.write_rows(1, upload=False)
+    env.get_pageserver(dest_ps_id).http_client().timeline_checkpoint(
+        tenant_id, timeline_id, wait_until_uploaded=True, compact=False
+    )
+
+    # The destination of the live migration would now have a corrupt index (referencing deleted L0s) if
+    # the controller had not properly applied validation rules.
+    healthy, _summary = env.storage_scrubber.scan_metadata()
+    try:
+        log.info(f"scrubbed, healthy={healthy}")
+        assert healthy
+    except:
+        # On failures, we want to report them FAIL during the test, not as ERROR during teardown
+        neon_env_builder.enable_scrub_on_exit = False
+        raise
+
+
 @run_only_on_default_postgres("this is like a 'unit test' against storcon db")
 def test_safekeeper_deployment_time_update(neon_env_builder: NeonEnvBuilder):
     env = neon_env_builder.init_configs()
diff --git a/test_runner/regress/test_storage_scrubber.py b/test_runner/regress/test_storage_scrubber.py
index 292a9a1010..848e214c5e 100644
--- a/test_runner/regress/test_storage_scrubber.py
+++ b/test_runner/regress/test_storage_scrubber.py
@@ -217,6 +217,13 @@ def test_scrubber_physical_gc_ancestors(
     workload.init()
     workload.write_rows(100)
 
+    # Issue a deletion queue flush so that the parent shard can't leave behind layers
+    # that will look like unexpected garbage to the scrubber
+    for pre_split_shard in env.storage_controller.locate(tenant_id):
+        env.get_pageserver(pre_split_shard["node_id"]).http_client().deletion_queue_flush(
+            execute=True
+        )
+
     new_shard_count = 4
     assert shard_count is None or new_shard_count > shard_count
     shards = env.storage_controller.tenant_shard_split(tenant_id, shard_count=new_shard_count)
@@ -321,6 +328,10 @@ def test_scrubber_physical_gc_timeline_deletion(neon_env_builder: NeonEnvBuilder
     workload.write_rows(100, upload=False)
     workload.stop()
 
+    # Issue a deletion queue flush so that the parent shard can't leave behind layers
+    # that will look like unexpected garbage to the scrubber
+    env.get_tenant_pageserver(tenant_id).http_client().deletion_queue_flush(execute=True)
+
     new_shard_count = 4
     shards = env.storage_controller.tenant_shard_split(tenant_id, shard_count=new_shard_count)
     for shard in shards:

From 0205ce184967f4510b6034bf2051a495bf464b44 Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki@neon.tech>
Date: Wed, 4 Sep 2024 17:41:51 +0300
Subject: [PATCH 089/159] Update submodule reference for vendor/postgres-v14
 (#8913)

There was a confusion on the REL_14_STABLE_neon branch. PR
https://github.com/neondatabase/postgres/pull/471 was merged ot the
branch, but the corresponding PRs on the other REL_15_STABLE_neon and
REL_16_STABLE_neon branches were not merged. Also, the submodule
reference in the neon repository was never updated, so even though the
REL_14_STABLE_neon branch contained the commit, it was never used.

That PR https://github.com/neondatabase/postgres/pull/471 was a few
bricks shy of a load (no tests, some differences between the different
branches), so to get us to a good state, revert that change from the
REL_14_STABLE_neon branch. This PR in the neon repository updates the
submodule reference past two commites on the REL_14_STABLE_neon branch:
first the commit from PR
https://github.com/neondatabase/postgres/pull/471, and immediately after
that the revert of the same commit. This brings us back to square one,
but now the submodule reference matches the tip of the
REL_14_STABLE_neon branch again.
---
 vendor/postgres-v14   | 2 +-
 vendor/revisions.json | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/vendor/postgres-v14 b/vendor/postgres-v14
index 7602e907ab..a317b9b5b9 160000
--- a/vendor/postgres-v14
+++ b/vendor/postgres-v14
@@ -1 +1 @@
-Subproject commit 7602e907ab30f16188bebfd66b8f297c2889d339
+Subproject commit a317b9b5b96978b49e78986697f3dd80d06f99a7
diff --git a/vendor/revisions.json b/vendor/revisions.json
index 751b9e8679..e52576e61f 100644
--- a/vendor/revisions.json
+++ b/vendor/revisions.json
@@ -9,6 +9,6 @@
   ],
   "v14": [
     "14.13",
-    "7602e907ab30f16188bebfd66b8f297c2889d339"
+    "a317b9b5b96978b49e78986697f3dd80d06f99a7"
   ]
 }

From 99fa1c36004d710c65a47ffefaf66b4b5c6b4ce1 Mon Sep 17 00:00:00 2001
From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com>
Date: Thu, 5 Sep 2024 04:45:04 +0800
Subject: [PATCH 090/159] fix(pageserver): more information on aux v1 warnings
 (#8906)

Part of https://github.com/neondatabase/neon/issues/8623

## Summary of changes

It seems that we have tenants with aux policy set to v1 but don't have
any aux files in the storage. It is still safe to force migrate them
without notifying the customers. This patch adds more details to the
warning to identify the cases where we have to reach out to the users
before retiring aux v1.

---------

Signed-off-by: Alex Chi Z <chi@neon.tech>
---
 pageserver/src/pgdatadir_mapping.rs | 10 +++++++---
 pageserver/src/tenant/timeline.rs   |  2 +-
 2 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs
index c26abca1f7..d28a214265 100644
--- a/pageserver/src/pgdatadir_mapping.rs
+++ b/pageserver/src/pgdatadir_mapping.rs
@@ -729,8 +729,12 @@ impl Timeline {
         let current_policy = self.last_aux_file_policy.load();
         match current_policy {
             Some(AuxFilePolicy::V1) => {
-                warn!("this timeline is using deprecated aux file policy V1 (policy=V1)");
-                self.list_aux_files_v1(lsn, ctx).await
+                let res = self.list_aux_files_v1(lsn, ctx).await?;
+                let empty_str = if res.is_empty() { ", empty" } else { "" };
+                warn!(
+                    "this timeline is using deprecated aux file policy V1 (policy=v1{empty_str})"
+                );
+                Ok(res)
             }
             None => {
                 let res = self.list_aux_files_v1(lsn, ctx).await?;
@@ -1657,7 +1661,7 @@ impl<'a> DatadirModification<'a> {
                 if aux_files_key_v1.is_empty() {
                     None
                 } else {
-                    warn!("this timeline is using deprecated aux file policy V1");
+                    warn!("this timeline is using deprecated aux file policy V1 (detected existing v1 files)");
                     self.tline.do_switch_aux_policy(AuxFilePolicy::V1)?;
                     Some(AuxFilePolicy::V1)
                 }
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index 6eadf9a564..3b8f19a6c0 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -2243,7 +2243,7 @@ impl Timeline {
             };
 
             if aux_file_policy == Some(AuxFilePolicy::V1) {
-                warn!("this timeline is using deprecated aux file policy V1");
+                warn!("this timeline is using deprecated aux file policy V1 (when loading the timeline)");
             }
 
             result.repartition_threshold =

From 708322ce3c0d55bcee5ee9e3632ecfb8c37415f5 Mon Sep 17 00:00:00 2001
From: Vlad Lazar <vlad@neon.tech>
Date: Thu, 5 Sep 2024 09:56:26 +0100
Subject: [PATCH 091/159] storcon: handle fills including high tput tenants
 more gracefully (#8865)

## Problem
A tenant may ingest a lot of data between being drained for node restart
and being moved back
in the fill phase. This is expensive and causes the fill to stall.

## Summary of changes
We make a tactical change to reduce secondary warm-up time for
migrations in fills.
---
 storage_controller/src/service.rs | 18 ++++++++++++------
 1 file changed, 12 insertions(+), 6 deletions(-)

diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs
index 90334d10a7..ca416095bb 100644
--- a/storage_controller/src/service.rs
+++ b/storage_controller/src/service.rs
@@ -6297,9 +6297,13 @@ impl Service {
         node_id: NodeId,
         cancel: CancellationToken,
     ) -> Result<(), OperationError> {
-        // TODO(vlad): Currently this operates on the assumption that all
-        // secondaries are warm. This is not always true (e.g. we just migrated the
-        // tenant). Take that into consideration by checking the secondary status.
+        const SECONDARY_WARMUP_TIMEOUT: Duration = Duration::from_secs(20);
+        const SECONDARY_DOWNLOAD_REQUEST_TIMEOUT: Duration = Duration::from_secs(5);
+        let reconciler_config = ReconcilerConfigBuilder::new()
+            .secondary_warmup_timeout(SECONDARY_WARMUP_TIMEOUT)
+            .secondary_download_request_timeout(SECONDARY_DOWNLOAD_REQUEST_TIMEOUT)
+            .build();
+
         let mut tids_to_promote = self.fill_node_plan(node_id);
         let mut waiters = Vec::new();
 
@@ -6367,9 +6371,11 @@ impl Service {
                                         node_id
                                     );
 
-                                    if let Some(waiter) =
-                                        self.maybe_reconcile_shard(tenant_shard, nodes)
-                                    {
+                                    if let Some(waiter) = self.maybe_configured_reconcile_shard(
+                                        tenant_shard,
+                                        nodes,
+                                        reconciler_config,
+                                    ) {
                                         waiters.push(waiter);
                                     }
                                 }

From 6dfbf49128c4392464d6832ccc2e6bdc390b0b37 Mon Sep 17 00:00:00 2001
From: Folke Behrens <folke@neon.tech>
Date: Thu, 5 Sep 2024 13:34:27 +0200
Subject: [PATCH 092/159] proxy: don't let one timeout eat entire retry budget
 (#8924)

This reduces the per-request timeout to 10sec while keeping the total
retry duration at 1min.

Relates: neondatabase/cloud#15944
---
 proxy/src/http.rs          | 9 ++++++---
 proxy/src/usage_metrics.rs | 8 ++++++--
 2 files changed, 12 insertions(+), 5 deletions(-)

diff --git a/proxy/src/http.rs b/proxy/src/http.rs
index fee634f67f..c77d95f47d 100644
--- a/proxy/src/http.rs
+++ b/proxy/src/http.rs
@@ -35,14 +35,17 @@ pub fn new_client() -> ClientWithMiddleware {
         .build()
 }
 
-pub(crate) fn new_client_with_timeout(default_timout: Duration) -> ClientWithMiddleware {
+pub(crate) fn new_client_with_timeout(
+    request_timeout: Duration,
+    total_retry_duration: Duration,
+) -> ClientWithMiddleware {
     let timeout_client = reqwest::ClientBuilder::new()
-        .timeout(default_timout)
+        .timeout(request_timeout)
         .build()
         .expect("Failed to create http client with timeout");
 
     let retry_policy =
-        ExponentialBackoff::builder().build_with_total_retry_duration(default_timout);
+        ExponentialBackoff::builder().build_with_total_retry_duration(total_retry_duration);
 
     reqwest_middleware::ClientBuilder::new(timeout_client)
         .with(reqwest_tracing::TracingMiddleware::default())
diff --git a/proxy/src/usage_metrics.rs b/proxy/src/usage_metrics.rs
index aa8c7ba319..fd8599bcb3 100644
--- a/proxy/src/usage_metrics.rs
+++ b/proxy/src/usage_metrics.rs
@@ -33,7 +33,8 @@ use uuid::{NoContext, Timestamp};
 
 const PROXY_IO_BYTES_PER_CLIENT: &str = "proxy_io_bytes_per_client";
 
-const DEFAULT_HTTP_REPORTING_TIMEOUT: Duration = Duration::from_secs(60);
+const HTTP_REPORTING_REQUEST_TIMEOUT: Duration = Duration::from_secs(10);
+const HTTP_REPORTING_RETRY_DURATION: Duration = Duration::from_secs(60);
 
 /// Key that uniquely identifies the object, this metric describes.
 /// Currently, endpoint_id is enough, but this may change later,
@@ -223,7 +224,10 @@ pub async fn task_main(config: &MetricCollectionConfig) -> anyhow::Result<Infall
         info!("metrics collector has shut down");
     }
 
-    let http_client = http::new_client_with_timeout(DEFAULT_HTTP_REPORTING_TIMEOUT);
+    let http_client = http::new_client_with_timeout(
+        HTTP_REPORTING_REQUEST_TIMEOUT,
+        HTTP_REPORTING_RETRY_DURATION,
+    );
     let hostname = hostname::get()?.as_os_str().to_string_lossy().into_owned();
 
     let mut prev = Utc::now();

From 850421ec06dae634b762af0d4a38194eba103884 Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Thu, 5 Sep 2024 14:59:49 +0200
Subject: [PATCH 093/159] refactor(pageserver): rely on serde derive for toml
 deserialization (#7656)

This PR simplifies the pageserver configuration parsing as follows:

* introduce the `pageserver_api::config::ConfigToml` type
* implement `Default` for `ConfigToml`
* use serde derive to do the brain-dead leg-work of processing the toml
document
  * use `serde(default)` to fill in default values
* in `pageserver` crate:
* use `toml_edit` to deserialize the pageserver.toml string into a
`ConfigToml`
  * `PageServerConfig::parse_and_validate` then
    * consumes the `ConfigToml`
    * destructures it exhaustively into its constituent fields
    * constructs the `PageServerConfig`

The rules are:

* in `ConfigToml`, use `deny_unknown_fields` everywhere
* static default values go in `pageserver_api`
* if there cannot be a static default value (e.g. which default IO
engine to use, because it depends on the runtime), make the field in
`ConfigToml` an `Option`
* if runtime-augmentation of a value is needed, do that in
`parse_and_validate`
* a good example is `virtual_file_io_engine` or `l0_flush`, both of
which need to execute code to determine the effective value in
`PageServerConf`

The benefits:

* massive amount of brain-dead repetitive code can be deleted
* "unused variable" compile-time errors when removing a config value,
due to the exhaustive destructuring in `parse_and_validate`
* compile-time errors guide you when adding a new config field

Drawbacks:

* serde derive is sometimes a bit too magical
* `deny_unknown_fields` is easy to miss

Future Work / Benefits:
* make `neon_local` use `pageserver_api` to construct `ConfigToml` and
write it to `pageserver.toml`
* This provides more type safety / coompile-time errors than the current
approach.

### Refs

Fixes #3682

### Future Work

* `remote_storage` deser doesn't reject unknown fields
https://github.com/neondatabase/neon/issues/8915
* clean up `libs/pageserver_api/src/config.rs` further
  * break up into multiple files, at least for tenant config
* move `models` as appropriate / refine distinction between config and
API models / be explicit about when it's the same
  * use `pub(crate)` visibility on `mod defaults` to detect stale values
---
 Cargo.lock                                    |   13 +
 Cargo.toml                                    |    1 +
 libs/pageserver_api/Cargo.toml                |   10 +
 libs/pageserver_api/src/config.rs             |  527 +++++-
 libs/pageserver_api/src/models.rs             |   71 +-
 libs/remote_storage/src/config.rs             |   25 +
 libs/utils/src/logging.rs                     |   12 +-
 pageserver/Cargo.toml                         |    3 +-
 pageserver/benches/bench_ingest.rs            |    4 +-
 pageserver/ctl/src/layer_map_analyzer.rs      |    3 +-
 pageserver/ctl/src/layers.rs                  |    3 +-
 pageserver/ctl/src/main.rs                    |    3 +-
 pageserver/src/bin/pageserver.rs              |   31 +-
 pageserver/src/config.rs                      | 1539 +++--------------
 pageserver/src/disk_usage_eviction_task.rs    |   48 +-
 pageserver/src/http/routes.rs                 |    4 +-
 pageserver/src/l0_flush.rs                    |   14 +-
 pageserver/src/statvfs.rs                     |   28 +-
 pageserver/src/tenant/config.rs               |  196 +--
 .../src/tenant/storage_layer/delta_layer.rs   |    3 +-
 .../src/tenant/storage_layer/image_layer.rs   |    4 +-
 .../tenant/storage_layer/inmemory_layer.rs    |    2 +-
 pageserver/src/tenant/tasks.rs                |    9 +-
 pageserver/src/tenant/timeline.rs             |    2 +-
 pageserver/src/tenant/timeline/compaction.rs  |   42 +-
 pageserver/src/tenant/vectored_blob_io.rs     |    4 -
 pageserver/src/virtual_file.rs                |    2 +-
 pageserver/src/virtual_file/io_engine.rs      |   11 +-
 test_runner/fixtures/neon_fixtures.py         |   22 +-
 .../regress/test_pageserver_generations.py    |   15 +-
 test_runner/regress/test_timeline_size.py     |    6 +-
 31 files changed, 1001 insertions(+), 1656 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index 5af3ef3804..91917d5351 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -2727,6 +2727,12 @@ dependencies = [
  "hashbrown 0.14.5",
 ]
 
+[[package]]
+name = "indoc"
+version = "2.0.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b248f5224d1d606005e02c97f5aa4e88eeb230488bcc03bc9ca4d7991399f2b5"
+
 [[package]]
 name = "infer"
 version = "0.2.3"
@@ -3701,6 +3707,7 @@ dependencies = [
  "humantime",
  "humantime-serde",
  "hyper 0.14.26",
+ "indoc",
  "itertools 0.10.5",
  "md5",
  "metrics",
@@ -3766,6 +3773,7 @@ dependencies = [
  "bincode",
  "byteorder",
  "bytes",
+ "camino",
  "chrono",
  "const_format",
  "enum-map",
@@ -3773,11 +3781,16 @@ dependencies = [
  "humantime",
  "humantime-serde",
  "itertools 0.10.5",
+ "nix 0.27.1",
+ "postgres_backend",
  "postgres_ffi",
  "rand 0.8.5",
+ "remote_storage",
+ "reqwest 0.12.4",
  "serde",
  "serde_json",
  "serde_with",
+ "storage_broker",
  "strum",
  "strum_macros",
  "thiserror",
diff --git a/Cargo.toml b/Cargo.toml
index fa949f9757..4fea3e8d80 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -103,6 +103,7 @@ humantime-serde = "1.1.1"
 hyper = "0.14"
 tokio-tungstenite = "0.20.0"
 indexmap = "2"
+indoc = "2"
 inotify = "0.10.2"
 ipnet = "2.9.0"
 itertools = "0.10"
diff --git a/libs/pageserver_api/Cargo.toml b/libs/pageserver_api/Cargo.toml
index cb28359ac3..8710904cec 100644
--- a/libs/pageserver_api/Cargo.toml
+++ b/libs/pageserver_api/Cargo.toml
@@ -4,6 +4,10 @@ version = "0.1.0"
 edition.workspace = true
 license.workspace = true
 
+[features]
+# See pageserver/Cargo.toml
+testing = ["dep:nix"]
+
 [dependencies]
 serde.workspace = true
 serde_with.workspace = true
@@ -23,6 +27,12 @@ thiserror.workspace = true
 humantime-serde.workspace = true
 chrono = { workspace = true, features = ["serde"] }
 itertools.workspace = true
+storage_broker.workspace = true
+camino = {workspace = true, features = ["serde1"]}
+remote_storage.workspace = true
+postgres_backend.workspace = true
+nix = {workspace = true, optional = true}
+reqwest.workspace = true
 
 [dev-dependencies]
 bincode.workspace = true
diff --git a/libs/pageserver_api/src/config.rs b/libs/pageserver_api/src/config.rs
index d996a62349..b2662c562a 100644
--- a/libs/pageserver_api/src/config.rs
+++ b/libs/pageserver_api/src/config.rs
@@ -1,15 +1,28 @@
-use std::collections::HashMap;
-
-use const_format::formatcp;
+use camino::Utf8PathBuf;
 
 #[cfg(test)]
 mod tests;
 
+use const_format::formatcp;
 pub const DEFAULT_PG_LISTEN_PORT: u16 = 64000;
 pub const DEFAULT_PG_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_PG_LISTEN_PORT}");
 pub const DEFAULT_HTTP_LISTEN_PORT: u16 = 9898;
 pub const DEFAULT_HTTP_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_HTTP_LISTEN_PORT}");
 
+use postgres_backend::AuthType;
+use remote_storage::RemoteStorageConfig;
+use serde_with::serde_as;
+use std::{
+    collections::HashMap,
+    num::{NonZeroU64, NonZeroUsize},
+    str::FromStr,
+    time::Duration,
+};
+use utils::logging::LogFormat;
+
+use crate::models::ImageCompressionAlgorithm;
+use crate::models::LsnLease;
+
 // Certain metadata (e.g. externally-addressable name, AZ) is delivered
 // as a separate structure.  This information is not neeed by the pageserver
 // itself, it is only used for registering the pageserver with the control
@@ -29,3 +42,511 @@ pub struct NodeMetadata {
     #[serde(flatten)]
     pub other: HashMap<String, serde_json::Value>,
 }
+
+/// `pageserver.toml`
+///
+/// We use serde derive with `#[serde(default)]` to generate a deserializer
+/// that fills in the default values for each config field.
+///
+/// If there cannot be a static default value because we need to make runtime
+/// checks to determine the default, make it an `Option` (which defaults to None).
+/// The runtime check should be done in the consuming crate, i.e., `pageserver`.
+#[serde_as]
+#[derive(Clone, Debug, serde::Deserialize, serde::Serialize)]
+#[serde(default, deny_unknown_fields)]
+pub struct ConfigToml {
+    // types mapped 1:1 into the runtime PageServerConfig type
+    pub listen_pg_addr: String,
+    pub listen_http_addr: String,
+    pub availability_zone: Option<String>,
+    #[serde(with = "humantime_serde")]
+    pub wait_lsn_timeout: Duration,
+    #[serde(with = "humantime_serde")]
+    pub wal_redo_timeout: Duration,
+    pub superuser: String,
+    pub page_cache_size: usize,
+    pub max_file_descriptors: usize,
+    pub pg_distrib_dir: Option<Utf8PathBuf>,
+    #[serde_as(as = "serde_with::DisplayFromStr")]
+    pub http_auth_type: AuthType,
+    #[serde_as(as = "serde_with::DisplayFromStr")]
+    pub pg_auth_type: AuthType,
+    pub auth_validation_public_key_path: Option<Utf8PathBuf>,
+    pub remote_storage: Option<RemoteStorageConfig>,
+    pub tenant_config: TenantConfigToml,
+    #[serde_as(as = "serde_with::DisplayFromStr")]
+    pub broker_endpoint: storage_broker::Uri,
+    #[serde(with = "humantime_serde")]
+    pub broker_keepalive_interval: Duration,
+    #[serde_as(as = "serde_with::DisplayFromStr")]
+    pub log_format: LogFormat,
+    pub concurrent_tenant_warmup: NonZeroUsize,
+    pub concurrent_tenant_size_logical_size_queries: NonZeroUsize,
+    #[serde(with = "humantime_serde")]
+    pub metric_collection_interval: Duration,
+    pub metric_collection_endpoint: Option<reqwest::Url>,
+    pub metric_collection_bucket: Option<RemoteStorageConfig>,
+    #[serde(with = "humantime_serde")]
+    pub synthetic_size_calculation_interval: Duration,
+    pub disk_usage_based_eviction: Option<DiskUsageEvictionTaskConfig>,
+    pub test_remote_failures: u64,
+    pub ondemand_download_behavior_treat_error_as_warn: bool,
+    #[serde(with = "humantime_serde")]
+    pub background_task_maximum_delay: Duration,
+    pub control_plane_api: Option<reqwest::Url>,
+    pub control_plane_api_token: Option<String>,
+    pub control_plane_emergency_mode: bool,
+    pub heatmap_upload_concurrency: usize,
+    pub secondary_download_concurrency: usize,
+    pub virtual_file_io_engine: Option<crate::models::virtual_file::IoEngineKind>,
+    pub ingest_batch_size: u64,
+    pub max_vectored_read_bytes: MaxVectoredReadBytes,
+    pub image_compression: ImageCompressionAlgorithm,
+    pub ephemeral_bytes_per_memory_kb: usize,
+    pub l0_flush: Option<crate::models::L0FlushConfig>,
+    pub compact_level0_phase1_value_access: CompactL0Phase1ValueAccess,
+    pub virtual_file_direct_io: crate::models::virtual_file::DirectIoMode,
+    pub io_buffer_alignment: usize,
+}
+
+#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
+#[serde(deny_unknown_fields)]
+pub struct DiskUsageEvictionTaskConfig {
+    pub max_usage_pct: utils::serde_percent::Percent,
+    pub min_avail_bytes: u64,
+    #[serde(with = "humantime_serde")]
+    pub period: Duration,
+    #[cfg(feature = "testing")]
+    pub mock_statvfs: Option<statvfs::mock::Behavior>,
+    /// Select sorting for evicted layers
+    #[serde(default)]
+    pub eviction_order: EvictionOrder,
+}
+
+pub mod statvfs {
+    pub mod mock {
+        #[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
+        #[serde(tag = "type")]
+        pub enum Behavior {
+            Success {
+                blocksize: u64,
+                total_blocks: u64,
+                name_filter: Option<utils::serde_regex::Regex>,
+            },
+            #[cfg(feature = "testing")]
+            Failure { mocked_error: MockedError },
+        }
+
+        #[cfg(feature = "testing")]
+        #[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
+        #[allow(clippy::upper_case_acronyms)]
+        pub enum MockedError {
+            EIO,
+        }
+
+        #[cfg(feature = "testing")]
+        impl From<MockedError> for nix::Error {
+            fn from(e: MockedError) -> Self {
+                match e {
+                    MockedError::EIO => nix::Error::EIO,
+                }
+            }
+        }
+    }
+}
+
+#[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
+#[serde(tag = "type", content = "args")]
+pub enum EvictionOrder {
+    RelativeAccessed {
+        highest_layer_count_loses_first: bool,
+    },
+}
+
+impl Default for EvictionOrder {
+    fn default() -> Self {
+        Self::RelativeAccessed {
+            highest_layer_count_loses_first: true,
+        }
+    }
+}
+
+#[derive(
+    Eq,
+    PartialEq,
+    Debug,
+    Copy,
+    Clone,
+    strum_macros::EnumString,
+    strum_macros::Display,
+    serde_with::DeserializeFromStr,
+    serde_with::SerializeDisplay,
+)]
+#[strum(serialize_all = "kebab-case")]
+pub enum GetVectoredImpl {
+    Sequential,
+    Vectored,
+}
+
+#[derive(
+    Eq,
+    PartialEq,
+    Debug,
+    Copy,
+    Clone,
+    strum_macros::EnumString,
+    strum_macros::Display,
+    serde_with::DeserializeFromStr,
+    serde_with::SerializeDisplay,
+)]
+#[strum(serialize_all = "kebab-case")]
+pub enum GetImpl {
+    Legacy,
+    Vectored,
+}
+
+#[derive(Copy, Clone, Debug, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
+#[serde(transparent)]
+pub struct MaxVectoredReadBytes(pub NonZeroUsize);
+
+#[derive(Debug, PartialEq, Eq, Clone, serde::Deserialize, serde::Serialize)]
+#[serde(tag = "mode", rename_all = "kebab-case", deny_unknown_fields)]
+pub enum CompactL0Phase1ValueAccess {
+    /// The old way.
+    PageCachedBlobIo,
+    /// The new way.
+    StreamingKmerge {
+        /// If set, we run both the old way and the new way, validate that
+        /// they are identical (=> [`CompactL0BypassPageCacheValidation`]),
+        /// and if the validation fails,
+        /// - in tests: fail them with a panic or
+        /// - in prod, log a rate-limited warning and use the old way's results.
+        ///
+        /// If not set, we only run the new way and trust its results.
+        validate: Option<CompactL0BypassPageCacheValidation>,
+    },
+}
+
+/// See [`CompactL0Phase1ValueAccess::StreamingKmerge`].
+#[derive(Debug, PartialEq, Eq, Clone, serde::Deserialize, serde::Serialize)]
+#[serde(rename_all = "kebab-case")]
+pub enum CompactL0BypassPageCacheValidation {
+    /// Validate that the series of (key, lsn) pairs are the same.
+    KeyLsn,
+    /// Validate that the entire output of old and new way is identical.
+    KeyLsnValue,
+}
+
+impl Default for CompactL0Phase1ValueAccess {
+    fn default() -> Self {
+        CompactL0Phase1ValueAccess::StreamingKmerge {
+            // TODO(https://github.com/neondatabase/neon/issues/8184): change to None once confident
+            validate: Some(CompactL0BypassPageCacheValidation::KeyLsnValue),
+        }
+    }
+}
+
+/// A tenant's calcuated configuration, which is the result of merging a
+/// tenant's TenantConfOpt with the global TenantConf from PageServerConf.
+///
+/// For storing and transmitting individual tenant's configuration, see
+/// TenantConfOpt.
+#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
+#[serde(deny_unknown_fields, default)]
+pub struct TenantConfigToml {
+    // Flush out an inmemory layer, if it's holding WAL older than this
+    // This puts a backstop on how much WAL needs to be re-digested if the
+    // page server crashes.
+    // This parameter actually determines L0 layer file size.
+    pub checkpoint_distance: u64,
+    // Inmemory layer is also flushed at least once in checkpoint_timeout to
+    // eventually upload WAL after activity is stopped.
+    #[serde(with = "humantime_serde")]
+    pub checkpoint_timeout: Duration,
+    // Target file size, when creating image and delta layers.
+    // This parameter determines L1 layer file size.
+    pub compaction_target_size: u64,
+    // How often to check if there's compaction work to be done.
+    // Duration::ZERO means automatic compaction is disabled.
+    #[serde(with = "humantime_serde")]
+    pub compaction_period: Duration,
+    // Level0 delta layer threshold for compaction.
+    pub compaction_threshold: usize,
+    pub compaction_algorithm: crate::models::CompactionAlgorithmSettings,
+    // Determines how much history is retained, to allow
+    // branching and read replicas at an older point in time.
+    // The unit is #of bytes of WAL.
+    // Page versions older than this are garbage collected away.
+    pub gc_horizon: u64,
+    // Interval at which garbage collection is triggered.
+    // Duration::ZERO means automatic GC is disabled
+    #[serde(with = "humantime_serde")]
+    pub gc_period: Duration,
+    // Delta layer churn threshold to create L1 image layers.
+    pub image_creation_threshold: usize,
+    // Determines how much history is retained, to allow
+    // branching and read replicas at an older point in time.
+    // The unit is time.
+    // Page versions older than this are garbage collected away.
+    #[serde(with = "humantime_serde")]
+    pub pitr_interval: Duration,
+    /// Maximum amount of time to wait while opening a connection to receive wal, before erroring.
+    #[serde(with = "humantime_serde")]
+    pub walreceiver_connect_timeout: Duration,
+    /// Considers safekeepers stalled after no WAL updates were received longer than this threshold.
+    /// A stalled safekeeper will be changed to a newer one when it appears.
+    #[serde(with = "humantime_serde")]
+    pub lagging_wal_timeout: Duration,
+    /// Considers safekeepers lagging when their WAL is behind another safekeeper for more than this threshold.
+    /// A lagging safekeeper will be changed after `lagging_wal_timeout` time elapses since the last WAL update,
+    /// to avoid eager reconnects.
+    pub max_lsn_wal_lag: NonZeroU64,
+    pub eviction_policy: crate::models::EvictionPolicy,
+    pub min_resident_size_override: Option<u64>,
+    // See the corresponding metric's help string.
+    #[serde(with = "humantime_serde")]
+    pub evictions_low_residence_duration_metric_threshold: Duration,
+
+    /// If non-zero, the period between uploads of a heatmap from attached tenants.  This
+    /// may be disabled if a Tenant will not have secondary locations: only secondary
+    /// locations will use the heatmap uploaded by attached locations.
+    #[serde(with = "humantime_serde")]
+    pub heatmap_period: Duration,
+
+    /// If true then SLRU segments are dowloaded on demand, if false SLRU segments are included in basebackup
+    pub lazy_slru_download: bool,
+
+    pub timeline_get_throttle: crate::models::ThrottleConfig,
+
+    // How much WAL must be ingested before checking again whether a new image layer is required.
+    // Expresed in multiples of checkpoint distance.
+    pub image_layer_creation_check_threshold: u8,
+
+    /// Switch to a new aux file policy. Switching this flag requires the user has not written any aux file into
+    /// the storage before, and this flag cannot be switched back. Otherwise there will be data corruptions.
+    /// There is a `last_aux_file_policy` flag which gets persisted in `index_part.json` once the first aux
+    /// file is written.
+    pub switch_aux_file_policy: crate::models::AuxFilePolicy,
+
+    /// The length for an explicit LSN lease request.
+    /// Layers needed to reconstruct pages at LSN will not be GC-ed during this interval.
+    #[serde(with = "humantime_serde")]
+    pub lsn_lease_length: Duration,
+
+    /// The length for an implicit LSN lease granted as part of `get_lsn_by_timestamp` request.
+    /// Layers needed to reconstruct pages at LSN will not be GC-ed during this interval.
+    #[serde(with = "humantime_serde")]
+    pub lsn_lease_length_for_ts: Duration,
+}
+
+pub mod defaults {
+    use crate::models::ImageCompressionAlgorithm;
+
+    pub use storage_broker::DEFAULT_ENDPOINT as BROKER_DEFAULT_ENDPOINT;
+
+    pub const DEFAULT_WAIT_LSN_TIMEOUT: &str = "300 s";
+    pub const DEFAULT_WAL_REDO_TIMEOUT: &str = "60 s";
+
+    pub const DEFAULT_SUPERUSER: &str = "cloud_admin";
+
+    pub const DEFAULT_PAGE_CACHE_SIZE: usize = 8192;
+    pub const DEFAULT_MAX_FILE_DESCRIPTORS: usize = 100;
+
+    pub const DEFAULT_LOG_FORMAT: &str = "plain";
+
+    pub const DEFAULT_CONCURRENT_TENANT_WARMUP: usize = 8;
+
+    pub const DEFAULT_CONCURRENT_TENANT_SIZE_LOGICAL_SIZE_QUERIES: usize = 1;
+
+    pub const DEFAULT_METRIC_COLLECTION_INTERVAL: &str = "10 min";
+    pub const DEFAULT_METRIC_COLLECTION_ENDPOINT: Option<reqwest::Url> = None;
+    pub const DEFAULT_SYNTHETIC_SIZE_CALCULATION_INTERVAL: &str = "10 min";
+    pub const DEFAULT_BACKGROUND_TASK_MAXIMUM_DELAY: &str = "10s";
+
+    pub const DEFAULT_HEATMAP_UPLOAD_CONCURRENCY: usize = 8;
+    pub const DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY: usize = 1;
+
+    pub const DEFAULT_INGEST_BATCH_SIZE: u64 = 100;
+
+    pub const DEFAULT_MAX_VECTORED_READ_BYTES: usize = 128 * 1024; // 128 KiB
+
+    pub const DEFAULT_IMAGE_COMPRESSION: ImageCompressionAlgorithm =
+        ImageCompressionAlgorithm::Zstd { level: Some(1) };
+
+    pub const DEFAULT_VALIDATE_VECTORED_GET: bool = false;
+
+    pub const DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB: usize = 0;
+
+    pub const DEFAULT_IO_BUFFER_ALIGNMENT: usize = 512;
+}
+
+impl Default for ConfigToml {
+    fn default() -> Self {
+        use defaults::*;
+
+        Self {
+            listen_pg_addr: (DEFAULT_PG_LISTEN_ADDR.to_string()),
+            listen_http_addr: (DEFAULT_HTTP_LISTEN_ADDR.to_string()),
+            availability_zone: (None),
+            wait_lsn_timeout: (humantime::parse_duration(DEFAULT_WAIT_LSN_TIMEOUT)
+                .expect("cannot parse default wait lsn timeout")),
+            wal_redo_timeout: (humantime::parse_duration(DEFAULT_WAL_REDO_TIMEOUT)
+                .expect("cannot parse default wal redo timeout")),
+            superuser: (DEFAULT_SUPERUSER.to_string()),
+            page_cache_size: (DEFAULT_PAGE_CACHE_SIZE),
+            max_file_descriptors: (DEFAULT_MAX_FILE_DESCRIPTORS),
+            pg_distrib_dir: None, // Utf8PathBuf::from("./pg_install"), // TODO: formely, this was std::env::current_dir()
+            http_auth_type: (AuthType::Trust),
+            pg_auth_type: (AuthType::Trust),
+            auth_validation_public_key_path: (None),
+            remote_storage: None,
+            broker_endpoint: (storage_broker::DEFAULT_ENDPOINT
+                .parse()
+                .expect("failed to parse default broker endpoint")),
+            broker_keepalive_interval: (humantime::parse_duration(
+                storage_broker::DEFAULT_KEEPALIVE_INTERVAL,
+            )
+            .expect("cannot parse default keepalive interval")),
+            log_format: (LogFormat::from_str(DEFAULT_LOG_FORMAT).unwrap()),
+
+            concurrent_tenant_warmup: (NonZeroUsize::new(DEFAULT_CONCURRENT_TENANT_WARMUP)
+                .expect("Invalid default constant")),
+            concurrent_tenant_size_logical_size_queries: NonZeroUsize::new(1).unwrap(),
+            metric_collection_interval: (humantime::parse_duration(
+                DEFAULT_METRIC_COLLECTION_INTERVAL,
+            )
+            .expect("cannot parse default metric collection interval")),
+            synthetic_size_calculation_interval: (humantime::parse_duration(
+                DEFAULT_SYNTHETIC_SIZE_CALCULATION_INTERVAL,
+            )
+            .expect("cannot parse default synthetic size calculation interval")),
+            metric_collection_endpoint: (DEFAULT_METRIC_COLLECTION_ENDPOINT),
+
+            metric_collection_bucket: (None),
+
+            disk_usage_based_eviction: (None),
+
+            test_remote_failures: (0),
+
+            ondemand_download_behavior_treat_error_as_warn: (false),
+
+            background_task_maximum_delay: (humantime::parse_duration(
+                DEFAULT_BACKGROUND_TASK_MAXIMUM_DELAY,
+            )
+            .unwrap()),
+
+            control_plane_api: (None),
+            control_plane_api_token: (None),
+            control_plane_emergency_mode: (false),
+
+            heatmap_upload_concurrency: (DEFAULT_HEATMAP_UPLOAD_CONCURRENCY),
+            secondary_download_concurrency: (DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY),
+
+            ingest_batch_size: (DEFAULT_INGEST_BATCH_SIZE),
+
+            virtual_file_io_engine: None,
+
+            max_vectored_read_bytes: (MaxVectoredReadBytes(
+                NonZeroUsize::new(DEFAULT_MAX_VECTORED_READ_BYTES).unwrap(),
+            )),
+            image_compression: (DEFAULT_IMAGE_COMPRESSION),
+            ephemeral_bytes_per_memory_kb: (DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB),
+            l0_flush: None,
+            compact_level0_phase1_value_access: CompactL0Phase1ValueAccess::default(),
+            virtual_file_direct_io: crate::models::virtual_file::DirectIoMode::default(),
+
+            io_buffer_alignment: DEFAULT_IO_BUFFER_ALIGNMENT,
+
+            tenant_config: TenantConfigToml::default(),
+        }
+    }
+}
+
+pub mod tenant_conf_defaults {
+
+    // FIXME: This current value is very low. I would imagine something like 1 GB or 10 GB
+    // would be more appropriate. But a low value forces the code to be exercised more,
+    // which is good for now to trigger bugs.
+    // This parameter actually determines L0 layer file size.
+    pub const DEFAULT_CHECKPOINT_DISTANCE: u64 = 256 * 1024 * 1024;
+    pub const DEFAULT_CHECKPOINT_TIMEOUT: &str = "10 m";
+
+    // FIXME the below configs are only used by legacy algorithm. The new algorithm
+    // has different parameters.
+
+    // Target file size, when creating image and delta layers.
+    // This parameter determines L1 layer file size.
+    pub const DEFAULT_COMPACTION_TARGET_SIZE: u64 = 128 * 1024 * 1024;
+
+    pub const DEFAULT_COMPACTION_PERIOD: &str = "20 s";
+    pub const DEFAULT_COMPACTION_THRESHOLD: usize = 10;
+    pub const DEFAULT_COMPACTION_ALGORITHM: crate::models::CompactionAlgorithm =
+        crate::models::CompactionAlgorithm::Legacy;
+
+    pub const DEFAULT_GC_HORIZON: u64 = 64 * 1024 * 1024;
+
+    // Large DEFAULT_GC_PERIOD is fine as long as PITR_INTERVAL is larger.
+    // If there's a need to decrease this value, first make sure that GC
+    // doesn't hold a layer map write lock for non-trivial operations.
+    // Relevant: https://github.com/neondatabase/neon/issues/3394
+    pub const DEFAULT_GC_PERIOD: &str = "1 hr";
+    pub const DEFAULT_IMAGE_CREATION_THRESHOLD: usize = 3;
+    pub const DEFAULT_PITR_INTERVAL: &str = "7 days";
+    pub const DEFAULT_WALRECEIVER_CONNECT_TIMEOUT: &str = "10 seconds";
+    pub const DEFAULT_WALRECEIVER_LAGGING_WAL_TIMEOUT: &str = "10 seconds";
+    // The default limit on WAL lag should be set to avoid causing disconnects under high throughput
+    // scenarios: since the broker stats are updated ~1/s, a value of 1GiB should be sufficient for
+    // throughputs up to 1GiB/s per timeline.
+    pub const DEFAULT_MAX_WALRECEIVER_LSN_WAL_LAG: u64 = 1024 * 1024 * 1024;
+    pub const DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD: &str = "24 hour";
+    // By default ingest enough WAL for two new L0 layers before checking if new image
+    // image layers should be created.
+    pub const DEFAULT_IMAGE_LAYER_CREATION_CHECK_THRESHOLD: u8 = 2;
+
+    pub const DEFAULT_INGEST_BATCH_SIZE: u64 = 100;
+}
+
+impl Default for TenantConfigToml {
+    fn default() -> Self {
+        use tenant_conf_defaults::*;
+        Self {
+            checkpoint_distance: DEFAULT_CHECKPOINT_DISTANCE,
+            checkpoint_timeout: humantime::parse_duration(DEFAULT_CHECKPOINT_TIMEOUT)
+                .expect("cannot parse default checkpoint timeout"),
+            compaction_target_size: DEFAULT_COMPACTION_TARGET_SIZE,
+            compaction_period: humantime::parse_duration(DEFAULT_COMPACTION_PERIOD)
+                .expect("cannot parse default compaction period"),
+            compaction_threshold: DEFAULT_COMPACTION_THRESHOLD,
+            compaction_algorithm: crate::models::CompactionAlgorithmSettings {
+                kind: DEFAULT_COMPACTION_ALGORITHM,
+            },
+            gc_horizon: DEFAULT_GC_HORIZON,
+            gc_period: humantime::parse_duration(DEFAULT_GC_PERIOD)
+                .expect("cannot parse default gc period"),
+            image_creation_threshold: DEFAULT_IMAGE_CREATION_THRESHOLD,
+            pitr_interval: humantime::parse_duration(DEFAULT_PITR_INTERVAL)
+                .expect("cannot parse default PITR interval"),
+            walreceiver_connect_timeout: humantime::parse_duration(
+                DEFAULT_WALRECEIVER_CONNECT_TIMEOUT,
+            )
+            .expect("cannot parse default walreceiver connect timeout"),
+            lagging_wal_timeout: humantime::parse_duration(DEFAULT_WALRECEIVER_LAGGING_WAL_TIMEOUT)
+                .expect("cannot parse default walreceiver lagging wal timeout"),
+            max_lsn_wal_lag: NonZeroU64::new(DEFAULT_MAX_WALRECEIVER_LSN_WAL_LAG)
+                .expect("cannot parse default max walreceiver Lsn wal lag"),
+            eviction_policy: crate::models::EvictionPolicy::NoEviction,
+            min_resident_size_override: None,
+            evictions_low_residence_duration_metric_threshold: humantime::parse_duration(
+                DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD,
+            )
+            .expect("cannot parse default evictions_low_residence_duration_metric_threshold"),
+            heatmap_period: Duration::ZERO,
+            lazy_slru_download: false,
+            timeline_get_throttle: crate::models::ThrottleConfig::disabled(),
+            image_layer_creation_check_threshold: DEFAULT_IMAGE_LAYER_CREATION_CHECK_THRESHOLD,
+            switch_aux_file_policy: crate::models::AuxFilePolicy::default_tenant_config(),
+            lsn_lease_length: LsnLease::DEFAULT_LENGTH,
+            lsn_lease_length_for_ts: LsnLease::DEFAULT_LENGTH_FOR_TS,
+        }
+    }
+}
diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs
index 87e8f8305a..d13d04eb1b 100644
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -6,6 +6,7 @@ pub use utilization::PageserverUtilization;
 
 use std::{
     collections::HashMap,
+    fmt::Display,
     io::{BufRead, Read},
     num::{NonZeroU32, NonZeroU64, NonZeroUsize},
     str::FromStr,
@@ -435,7 +436,9 @@ pub enum CompactionAlgorithm {
     Tiered,
 }
 
-#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
+#[derive(
+    Debug, Clone, Copy, PartialEq, Eq, serde_with::DeserializeFromStr, serde_with::SerializeDisplay,
+)]
 pub enum ImageCompressionAlgorithm {
     // Disabled for writes, support decompressing during read path
     Disabled,
@@ -470,11 +473,33 @@ impl FromStr for ImageCompressionAlgorithm {
     }
 }
 
+impl Display for ImageCompressionAlgorithm {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        match self {
+            ImageCompressionAlgorithm::Disabled => write!(f, "disabled"),
+            ImageCompressionAlgorithm::Zstd { level } => {
+                if let Some(level) = level {
+                    write!(f, "zstd({})", level)
+                } else {
+                    write!(f, "zstd")
+                }
+            }
+        }
+    }
+}
+
 #[derive(Eq, PartialEq, Debug, Clone, Serialize, Deserialize)]
 pub struct CompactionAlgorithmSettings {
     pub kind: CompactionAlgorithm,
 }
 
+#[derive(Debug, PartialEq, Eq, Clone, serde::Deserialize, serde::Serialize)]
+#[serde(tag = "mode", rename_all = "kebab-case", deny_unknown_fields)]
+pub enum L0FlushConfig {
+    #[serde(rename_all = "snake_case")]
+    Direct { max_concurrency: NonZeroUsize },
+}
+
 #[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
 pub struct EvictionPolicyLayerAccessThreshold {
     #[serde(with = "humantime_serde")]
@@ -1656,21 +1681,33 @@ mod tests {
     #[test]
     fn test_image_compression_algorithm_parsing() {
         use ImageCompressionAlgorithm::*;
-        assert_eq!(
-            ImageCompressionAlgorithm::from_str("disabled").unwrap(),
-            Disabled
-        );
-        assert_eq!(
-            ImageCompressionAlgorithm::from_str("zstd").unwrap(),
-            Zstd { level: None }
-        );
-        assert_eq!(
-            ImageCompressionAlgorithm::from_str("zstd(18)").unwrap(),
-            Zstd { level: Some(18) }
-        );
-        assert_eq!(
-            ImageCompressionAlgorithm::from_str("zstd(-3)").unwrap(),
-            Zstd { level: Some(-3) }
-        );
+        let cases = [
+            ("disabled", Disabled),
+            ("zstd", Zstd { level: None }),
+            ("zstd(18)", Zstd { level: Some(18) }),
+            ("zstd(-3)", Zstd { level: Some(-3) }),
+        ];
+
+        for (display, expected) in cases {
+            assert_eq!(
+                ImageCompressionAlgorithm::from_str(display).unwrap(),
+                expected,
+                "parsing works"
+            );
+            assert_eq!(format!("{expected}"), display, "Display FromStr roundtrip");
+
+            let ser = serde_json::to_string(&expected).expect("serialization");
+            assert_eq!(
+                serde_json::from_str::<ImageCompressionAlgorithm>(&ser).unwrap(),
+                expected,
+                "serde roundtrip"
+            );
+
+            assert_eq!(
+                serde_json::Value::String(display.to_string()),
+                serde_json::to_value(expected).unwrap(),
+                "Display is the serde serialization"
+            );
+        }
     }
 }
diff --git a/libs/remote_storage/src/config.rs b/libs/remote_storage/src/config.rs
index fa3f2cba58..f819a1572a 100644
--- a/libs/remote_storage/src/config.rs
+++ b/libs/remote_storage/src/config.rs
@@ -235,6 +235,31 @@ timeout = '5s'";
         );
     }
 
+    #[test]
+    fn test_storage_class_serde_roundtrip() {
+        let classes = [
+            None,
+            Some(StorageClass::Standard),
+            Some(StorageClass::IntelligentTiering),
+        ];
+        for class in classes {
+            #[derive(Serialize, Deserialize)]
+            struct Wrapper {
+                #[serde(
+                    deserialize_with = "deserialize_storage_class",
+                    serialize_with = "serialize_storage_class"
+                )]
+                class: Option<StorageClass>,
+            }
+            let wrapped = Wrapper {
+                class: class.clone(),
+            };
+            let serialized = serde_json::to_string(&wrapped).unwrap();
+            let deserialized: Wrapper = serde_json::from_str(&serialized).unwrap();
+            assert_eq!(class, deserialized.class);
+        }
+    }
+
     #[test]
     fn test_azure_parsing() {
         let toml = "\
diff --git a/libs/utils/src/logging.rs b/libs/utils/src/logging.rs
index f7b73dc984..71af43a4da 100644
--- a/libs/utils/src/logging.rs
+++ b/libs/utils/src/logging.rs
@@ -5,7 +5,9 @@ use metrics::{IntCounter, IntCounterVec};
 use once_cell::sync::Lazy;
 use strum_macros::{EnumString, EnumVariantNames};
 
-#[derive(EnumString, EnumVariantNames, Eq, PartialEq, Debug, Clone, Copy)]
+#[derive(
+    EnumString, strum_macros::Display, EnumVariantNames, Eq, PartialEq, Debug, Clone, Copy,
+)]
 #[strum(serialize_all = "snake_case")]
 pub enum LogFormat {
     Plain,
@@ -274,6 +276,14 @@ impl From<String> for SecretString {
     }
 }
 
+impl FromStr for SecretString {
+    type Err = std::convert::Infallible;
+
+    fn from_str(s: &str) -> Result<Self, Self::Err> {
+        Ok(Self(s.to_string()))
+    }
+}
+
 impl std::fmt::Debug for SecretString {
     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
         write!(f, "[SECRET]")
diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml
index 9c02ce3fbc..24373afca3 100644
--- a/pageserver/Cargo.toml
+++ b/pageserver/Cargo.toml
@@ -8,7 +8,7 @@ license.workspace = true
 default = []
 # Enables test-only APIs, incuding failpoints. In particular, enables the `fail_point!` macro,
 # which adds some runtime cost to run tests on outage conditions
-testing = ["fail/failpoints"]
+testing = ["fail/failpoints", "pageserver_api/testing" ]
 
 [dependencies]
 anyhow.workspace = true
@@ -101,6 +101,7 @@ procfs.workspace = true
 criterion.workspace = true
 hex-literal.workspace = true
 tokio = { workspace = true, features = ["process", "sync", "fs", "rt", "io-util", "time", "test-util"] }
+indoc.workspace = true
 
 [[bench]]
 name = "bench_layer_map"
diff --git a/pageserver/benches/bench_ingest.rs b/pageserver/benches/bench_ingest.rs
index 1be4391d81..72cbb6beab 100644
--- a/pageserver/benches/bench_ingest.rs
+++ b/pageserver/benches/bench_ingest.rs
@@ -4,7 +4,7 @@ use bytes::Bytes;
 use camino::Utf8PathBuf;
 use criterion::{criterion_group, criterion_main, Criterion};
 use pageserver::{
-    config::{defaults::DEFAULT_IO_BUFFER_ALIGNMENT, PageServerConf},
+    config::PageServerConf,
     context::{DownloadBehavior, RequestContext},
     l0_flush::{L0FlushConfig, L0FlushGlobalState},
     page_cache,
@@ -167,7 +167,7 @@ fn criterion_benchmark(c: &mut Criterion) {
     virtual_file::init(
         16384,
         virtual_file::io_engine_for_bench(),
-        DEFAULT_IO_BUFFER_ALIGNMENT,
+        pageserver_api::config::defaults::DEFAULT_IO_BUFFER_ALIGNMENT,
     );
     page_cache::init(conf.page_cache_size);
 
diff --git a/pageserver/ctl/src/layer_map_analyzer.rs b/pageserver/ctl/src/layer_map_analyzer.rs
index 8092c203c3..a07107753e 100644
--- a/pageserver/ctl/src/layer_map_analyzer.rs
+++ b/pageserver/ctl/src/layer_map_analyzer.rs
@@ -4,7 +4,6 @@
 
 use anyhow::Result;
 use camino::{Utf8Path, Utf8PathBuf};
-use pageserver::config::defaults::DEFAULT_IO_BUFFER_ALIGNMENT;
 use pageserver::context::{DownloadBehavior, RequestContext};
 use pageserver::task_mgr::TaskKind;
 use pageserver::tenant::{TENANTS_SEGMENT_NAME, TIMELINES_SEGMENT_NAME};
@@ -148,7 +147,7 @@ pub(crate) async fn main(cmd: &AnalyzeLayerMapCmd) -> Result<()> {
     pageserver::virtual_file::init(
         10,
         virtual_file::api::IoEngineKind::StdFs,
-        DEFAULT_IO_BUFFER_ALIGNMENT,
+        pageserver_api::config::defaults::DEFAULT_IO_BUFFER_ALIGNMENT,
     );
     pageserver::page_cache::init(100);
 
diff --git a/pageserver/ctl/src/layers.rs b/pageserver/ctl/src/layers.rs
index e0f978eaa2..dd753398e2 100644
--- a/pageserver/ctl/src/layers.rs
+++ b/pageserver/ctl/src/layers.rs
@@ -3,7 +3,6 @@ use std::path::{Path, PathBuf};
 use anyhow::Result;
 use camino::{Utf8Path, Utf8PathBuf};
 use clap::Subcommand;
-use pageserver::config::defaults::DEFAULT_IO_BUFFER_ALIGNMENT;
 use pageserver::context::{DownloadBehavior, RequestContext};
 use pageserver::task_mgr::TaskKind;
 use pageserver::tenant::block_io::BlockCursor;
@@ -194,7 +193,7 @@ pub(crate) async fn main(cmd: &LayerCmd) -> Result<()> {
             pageserver::virtual_file::init(
                 10,
                 virtual_file::api::IoEngineKind::StdFs,
-                DEFAULT_IO_BUFFER_ALIGNMENT,
+                pageserver_api::config::defaults::DEFAULT_IO_BUFFER_ALIGNMENT,
             );
             pageserver::page_cache::init(100);
 
diff --git a/pageserver/ctl/src/main.rs b/pageserver/ctl/src/main.rs
index 7a6c7675bb..3b66b0c4aa 100644
--- a/pageserver/ctl/src/main.rs
+++ b/pageserver/ctl/src/main.rs
@@ -20,14 +20,13 @@ use clap::{Parser, Subcommand};
 use index_part::IndexPartCmd;
 use layers::LayerCmd;
 use pageserver::{
-    config::defaults::DEFAULT_IO_BUFFER_ALIGNMENT,
     context::{DownloadBehavior, RequestContext},
     page_cache,
     task_mgr::TaskKind,
     tenant::{dump_layerfile_from_path, metadata::TimelineMetadata},
     virtual_file,
 };
-use pageserver_api::shard::TenantShardId;
+use pageserver_api::{config::defaults::DEFAULT_IO_BUFFER_ALIGNMENT, shard::TenantShardId};
 use postgres_ffi::ControlFileData;
 use remote_storage::{RemotePath, RemoteStorageConfig};
 use tokio_util::sync::CancellationToken;
diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs
index 850bd87b95..2c60e8d7d1 100644
--- a/pageserver/src/bin/pageserver.rs
+++ b/pageserver/src/bin/pageserver.rs
@@ -5,6 +5,7 @@
 use std::env;
 use std::env::{var, VarError};
 use std::io::Read;
+use std::str::FromStr;
 use std::sync::Arc;
 use std::time::Duration;
 
@@ -223,27 +224,15 @@ fn initialize_config(
         }
     };
 
-    let config: toml_edit::Document = match std::fs::File::open(cfg_file_path) {
-        Ok(mut f) => {
-            let md = f.metadata().context("stat config file")?;
-            if md.is_file() {
-                let mut s = String::new();
-                f.read_to_string(&mut s).context("read config file")?;
-                s.parse().context("parse config file toml")?
-            } else {
-                anyhow::bail!("directory entry exists but is not a file: {cfg_file_path}");
-            }
-        }
-        Err(e) => {
-            anyhow::bail!("open pageserver config: {e}: {cfg_file_path}");
-        }
-    };
-
-    debug!("Using pageserver toml: {config}");
-
-    // Construct the runtime representation
-    let conf = PageServerConf::parse_and_validate(identity.id, &config, workdir)
-        .context("Failed to parse pageserver configuration")?;
+    let config_file_contents =
+        std::fs::read_to_string(cfg_file_path).context("read config file from filesystem")?;
+    let config_toml = serde_path_to_error::deserialize(
+        toml_edit::de::Deserializer::from_str(&config_file_contents)
+            .context("build toml deserializer")?,
+    )
+    .context("deserialize config toml")?;
+    let conf = PageServerConf::parse_and_validate(identity.id, config_toml, workdir)
+        .context("runtime-validation of config toml")?;
 
     Ok(Box::leak(Box::new(conf)))
 }
diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs
index 9e4530ba3c..c159b66905 100644
--- a/pageserver/src/config.rs
+++ b/pageserver/src/config.rs
@@ -4,11 +4,13 @@
 //! file, or on the command line.
 //! See also `settings.md` for better description on every parameter.
 
-use anyhow::{anyhow, bail, ensure, Context, Result};
-use pageserver_api::{models::ImageCompressionAlgorithm, shard::TenantShardId};
+use anyhow::{bail, ensure, Context};
+use pageserver_api::models::ImageCompressionAlgorithm;
+use pageserver_api::{
+    config::{DiskUsageEvictionTaskConfig, MaxVectoredReadBytes},
+    shard::TenantShardId,
+};
 use remote_storage::{RemotePath, RemoteStorageConfig};
-use serde::de::IntoDeserializer;
-use serde::{self, Deserialize};
 use std::env;
 use storage_broker::Uri;
 use utils::crashsafe::path_with_suffix_extension;
@@ -17,10 +19,8 @@ use utils::logging::SecretString;
 use once_cell::sync::OnceCell;
 use reqwest::Url;
 use std::num::NonZeroUsize;
-use std::str::FromStr;
 use std::sync::Arc;
 use std::time::Duration;
-use toml_edit::{Document, Item};
 
 use camino::{Utf8Path, Utf8PathBuf};
 use postgres_backend::AuthType;
@@ -29,139 +29,27 @@ use utils::{
     logging::LogFormat,
 };
 
-use crate::l0_flush::L0FlushConfig;
-use crate::tenant::config::TenantConfOpt;
 use crate::tenant::storage_layer::inmemory_layer::IndexEntry;
-use crate::tenant::timeline::compaction::CompactL0Phase1ValueAccess;
-use crate::tenant::vectored_blob_io::MaxVectoredReadBytes;
 use crate::tenant::{TENANTS_SEGMENT_NAME, TIMELINES_SEGMENT_NAME};
-use crate::{disk_usage_eviction_task::DiskUsageEvictionTaskConfig, virtual_file::io_engine};
-use crate::{tenant::config::TenantConf, virtual_file};
+use crate::virtual_file;
+use crate::virtual_file::io_engine;
 use crate::{TENANT_HEATMAP_BASENAME, TENANT_LOCATION_CONFIG_NAME, TIMELINE_DELETE_MARK_SUFFIX};
 
-use self::defaults::DEFAULT_CONCURRENT_TENANT_WARMUP;
-
-use self::defaults::DEFAULT_VIRTUAL_FILE_IO_ENGINE;
-
-pub mod defaults {
-    use crate::tenant::config::defaults::*;
-    use const_format::formatcp;
-
-    pub use pageserver_api::config::{
-        DEFAULT_HTTP_LISTEN_ADDR, DEFAULT_HTTP_LISTEN_PORT, DEFAULT_PG_LISTEN_ADDR,
-        DEFAULT_PG_LISTEN_PORT,
-    };
-    pub use storage_broker::DEFAULT_ENDPOINT as BROKER_DEFAULT_ENDPOINT;
-
-    pub const DEFAULT_WAIT_LSN_TIMEOUT: &str = "300 s";
-    pub const DEFAULT_WAL_REDO_TIMEOUT: &str = "60 s";
-
-    pub const DEFAULT_SUPERUSER: &str = "cloud_admin";
-
-    pub const DEFAULT_PAGE_CACHE_SIZE: usize = 8192;
-    pub const DEFAULT_MAX_FILE_DESCRIPTORS: usize = 100;
-
-    pub const DEFAULT_LOG_FORMAT: &str = "plain";
-
-    pub const DEFAULT_CONCURRENT_TENANT_WARMUP: usize = 8;
-
-    pub const DEFAULT_CONCURRENT_TENANT_SIZE_LOGICAL_SIZE_QUERIES: usize =
-        super::ConfigurableSemaphore::DEFAULT_INITIAL.get();
-
-    pub const DEFAULT_METRIC_COLLECTION_INTERVAL: &str = "10 min";
-    pub const DEFAULT_METRIC_COLLECTION_ENDPOINT: Option<reqwest::Url> = None;
-    pub const DEFAULT_SYNTHETIC_SIZE_CALCULATION_INTERVAL: &str = "10 min";
-    pub const DEFAULT_BACKGROUND_TASK_MAXIMUM_DELAY: &str = "10s";
-
-    pub const DEFAULT_HEATMAP_UPLOAD_CONCURRENCY: usize = 8;
-    pub const DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY: usize = 1;
-
-    pub const DEFAULT_INGEST_BATCH_SIZE: u64 = 100;
-
-    #[cfg(target_os = "linux")]
-    pub const DEFAULT_VIRTUAL_FILE_IO_ENGINE: &str = "tokio-epoll-uring";
-
-    #[cfg(not(target_os = "linux"))]
-    pub const DEFAULT_VIRTUAL_FILE_IO_ENGINE: &str = "std-fs";
-
-    pub const DEFAULT_GET_VECTORED_IMPL: &str = "vectored";
-
-    pub const DEFAULT_GET_IMPL: &str = "vectored";
-
-    pub const DEFAULT_MAX_VECTORED_READ_BYTES: usize = 128 * 1024; // 128 KiB
-
-    pub const DEFAULT_IMAGE_COMPRESSION: &str = "zstd(1)";
-
-    pub const DEFAULT_VALIDATE_VECTORED_GET: bool = false;
-
-    pub const DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB: usize = 0;
-
-    pub const DEFAULT_IO_BUFFER_ALIGNMENT: usize = 512;
-
-    ///
-    /// Default built-in configuration file.
-    ///
-    pub const DEFAULT_CONFIG_FILE: &str = formatcp!(
-        r#"
-# Initial configuration file created by 'pageserver --init'
-#listen_pg_addr = '{DEFAULT_PG_LISTEN_ADDR}'
-#listen_http_addr = '{DEFAULT_HTTP_LISTEN_ADDR}'
-
-#wait_lsn_timeout = '{DEFAULT_WAIT_LSN_TIMEOUT}'
-#wal_redo_timeout = '{DEFAULT_WAL_REDO_TIMEOUT}'
-
-#page_cache_size = {DEFAULT_PAGE_CACHE_SIZE}
-#max_file_descriptors = {DEFAULT_MAX_FILE_DESCRIPTORS}
-
-# initial superuser role name to use when creating a new tenant
-#initial_superuser_name = '{DEFAULT_SUPERUSER}'
-
-#broker_endpoint = '{BROKER_DEFAULT_ENDPOINT}'
-
-#log_format = '{DEFAULT_LOG_FORMAT}'
-
-#concurrent_tenant_size_logical_size_queries = '{DEFAULT_CONCURRENT_TENANT_SIZE_LOGICAL_SIZE_QUERIES}'
-#concurrent_tenant_warmup = '{DEFAULT_CONCURRENT_TENANT_WARMUP}'
-
-#metric_collection_interval = '{DEFAULT_METRIC_COLLECTION_INTERVAL}'
-#synthetic_size_calculation_interval = '{DEFAULT_SYNTHETIC_SIZE_CALCULATION_INTERVAL}'
-
-#disk_usage_based_eviction = {{ max_usage_pct = .., min_avail_bytes = .., period = "10s"}}
-
-#background_task_maximum_delay = '{DEFAULT_BACKGROUND_TASK_MAXIMUM_DELAY}'
-
-#ingest_batch_size = {DEFAULT_INGEST_BATCH_SIZE}
-
-#virtual_file_io_engine = '{DEFAULT_VIRTUAL_FILE_IO_ENGINE}'
-
-#max_vectored_read_bytes = '{DEFAULT_MAX_VECTORED_READ_BYTES}'
-
-[tenant_config]
-#checkpoint_distance = {DEFAULT_CHECKPOINT_DISTANCE} # in bytes
-#checkpoint_timeout = {DEFAULT_CHECKPOINT_TIMEOUT}
-#compaction_target_size = {DEFAULT_COMPACTION_TARGET_SIZE} # in bytes
-#compaction_period = '{DEFAULT_COMPACTION_PERIOD}'
-#compaction_threshold = {DEFAULT_COMPACTION_THRESHOLD}
-
-#gc_period = '{DEFAULT_GC_PERIOD}'
-#gc_horizon = {DEFAULT_GC_HORIZON}
-#image_creation_threshold = {DEFAULT_IMAGE_CREATION_THRESHOLD}
-#pitr_interval = '{DEFAULT_PITR_INTERVAL}'
-
-#min_resident_size_override = .. # in bytes
-#evictions_low_residence_duration_metric_threshold = '{DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD}'
-
-#heatmap_upload_concurrency = {DEFAULT_HEATMAP_UPLOAD_CONCURRENCY}
-#secondary_download_concurrency = {DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY}
-
-#ephemeral_bytes_per_memory_kb = {DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB}
-
-#[remote_storage]
-
-"#
-    );
-}
-
+/// Global state of pageserver.
+///
+/// It's mostly immutable configuration, but some semaphores and the
+/// like crept in over time and the name stuck.
+///
+/// Instantiated by deserializing `pageserver.toml` into  [`pageserver_api::config::ConfigToml`]
+/// and passing that to [`PageServerConf::parse_and_validate`].
+///
+/// # Adding a New Field
+///
+/// 1. Add the field to `pageserver_api::config::ConfigToml`.
+/// 2. Fix compiler errors (exhaustive destructuring will guide you).
+///
+/// For fields that require additional validation or filling in of defaults at runtime,
+/// check for examples in the [`PageServerConf::parse_and_validate`] method.
 #[derive(Debug, Clone, PartialEq, Eq)]
 pub struct PageServerConf {
     // Identifier of that particular pageserver so e g safekeepers
@@ -207,7 +95,7 @@ pub struct PageServerConf {
 
     pub remote_storage_config: Option<RemoteStorageConfig>,
 
-    pub default_tenant_conf: TenantConf,
+    pub default_tenant_conf: crate::tenant::config::TenantConf,
 
     /// Storage broker endpoints to connect to.
     pub broker_endpoint: Uri,
@@ -284,11 +172,11 @@ pub struct PageServerConf {
     /// Setting this to zero disables limits on total ephemeral layer size.
     pub ephemeral_bytes_per_memory_kb: usize,
 
-    pub l0_flush: L0FlushConfig,
+    pub l0_flush: crate::l0_flush::L0FlushConfig,
 
     /// This flag is temporary and will be removed after gradual rollout.
     /// See <https://github.com/neondatabase/neon/issues/8184>.
-    pub compact_level0_phase1_value_access: CompactL0Phase1ValueAccess,
+    pub compact_level0_phase1_value_access: pageserver_api::config::CompactL0Phase1ValueAccess,
 
     /// Direct IO settings
     pub virtual_file_direct_io: virtual_file::DirectIoMode,
@@ -304,472 +192,6 @@ pub struct PageServerConf {
 /// startup code to the connection code through a dozen layers.
 pub static SAFEKEEPER_AUTH_TOKEN: OnceCell<Arc<String>> = OnceCell::new();
 
-// use dedicated enum for builder to better indicate the intention
-// and avoid possible confusion with nested options
-#[derive(Clone, Default)]
-pub enum BuilderValue<T> {
-    Set(T),
-    #[default]
-    NotSet,
-}
-
-impl<T: Clone> BuilderValue<T> {
-    pub fn ok_or(&self, field_name: &'static str, default: BuilderValue<T>) -> anyhow::Result<T> {
-        match self {
-            Self::Set(v) => Ok(v.clone()),
-            Self::NotSet => match default {
-                BuilderValue::Set(v) => Ok(v.clone()),
-                BuilderValue::NotSet => {
-                    anyhow::bail!("missing config value {field_name:?}")
-                }
-            },
-        }
-    }
-}
-
-// needed to simplify config construction
-#[derive(Default)]
-struct PageServerConfigBuilder {
-    listen_pg_addr: BuilderValue<String>,
-
-    listen_http_addr: BuilderValue<String>,
-
-    availability_zone: BuilderValue<Option<String>>,
-
-    wait_lsn_timeout: BuilderValue<Duration>,
-    wal_redo_timeout: BuilderValue<Duration>,
-
-    superuser: BuilderValue<String>,
-
-    page_cache_size: BuilderValue<usize>,
-    max_file_descriptors: BuilderValue<usize>,
-
-    workdir: BuilderValue<Utf8PathBuf>,
-
-    pg_distrib_dir: BuilderValue<Utf8PathBuf>,
-
-    http_auth_type: BuilderValue<AuthType>,
-    pg_auth_type: BuilderValue<AuthType>,
-
-    //
-    auth_validation_public_key_path: BuilderValue<Option<Utf8PathBuf>>,
-    remote_storage_config: BuilderValue<Option<RemoteStorageConfig>>,
-
-    broker_endpoint: BuilderValue<Uri>,
-    broker_keepalive_interval: BuilderValue<Duration>,
-
-    log_format: BuilderValue<LogFormat>,
-
-    concurrent_tenant_warmup: BuilderValue<NonZeroUsize>,
-    concurrent_tenant_size_logical_size_queries: BuilderValue<NonZeroUsize>,
-
-    metric_collection_interval: BuilderValue<Duration>,
-    metric_collection_endpoint: BuilderValue<Option<Url>>,
-    synthetic_size_calculation_interval: BuilderValue<Duration>,
-    metric_collection_bucket: BuilderValue<Option<RemoteStorageConfig>>,
-
-    disk_usage_based_eviction: BuilderValue<Option<DiskUsageEvictionTaskConfig>>,
-
-    test_remote_failures: BuilderValue<u64>,
-
-    ondemand_download_behavior_treat_error_as_warn: BuilderValue<bool>,
-
-    background_task_maximum_delay: BuilderValue<Duration>,
-
-    control_plane_api: BuilderValue<Option<Url>>,
-    control_plane_api_token: BuilderValue<Option<SecretString>>,
-    control_plane_emergency_mode: BuilderValue<bool>,
-
-    heatmap_upload_concurrency: BuilderValue<usize>,
-    secondary_download_concurrency: BuilderValue<usize>,
-
-    ingest_batch_size: BuilderValue<u64>,
-
-    virtual_file_io_engine: BuilderValue<virtual_file::IoEngineKind>,
-
-    max_vectored_read_bytes: BuilderValue<MaxVectoredReadBytes>,
-
-    image_compression: BuilderValue<ImageCompressionAlgorithm>,
-
-    ephemeral_bytes_per_memory_kb: BuilderValue<usize>,
-
-    l0_flush: BuilderValue<L0FlushConfig>,
-
-    compact_level0_phase1_value_access: BuilderValue<CompactL0Phase1ValueAccess>,
-
-    virtual_file_direct_io: BuilderValue<virtual_file::DirectIoMode>,
-
-    io_buffer_alignment: BuilderValue<usize>,
-}
-
-impl PageServerConfigBuilder {
-    fn new() -> Self {
-        Self::default()
-    }
-
-    #[inline(always)]
-    fn default_values() -> Self {
-        use self::BuilderValue::*;
-        use defaults::*;
-        Self {
-            listen_pg_addr: Set(DEFAULT_PG_LISTEN_ADDR.to_string()),
-            listen_http_addr: Set(DEFAULT_HTTP_LISTEN_ADDR.to_string()),
-            availability_zone: Set(None),
-            wait_lsn_timeout: Set(humantime::parse_duration(DEFAULT_WAIT_LSN_TIMEOUT)
-                .expect("cannot parse default wait lsn timeout")),
-            wal_redo_timeout: Set(humantime::parse_duration(DEFAULT_WAL_REDO_TIMEOUT)
-                .expect("cannot parse default wal redo timeout")),
-            superuser: Set(DEFAULT_SUPERUSER.to_string()),
-            page_cache_size: Set(DEFAULT_PAGE_CACHE_SIZE),
-            max_file_descriptors: Set(DEFAULT_MAX_FILE_DESCRIPTORS),
-            workdir: Set(Utf8PathBuf::new()),
-            pg_distrib_dir: Set(Utf8PathBuf::from_path_buf(
-                env::current_dir().expect("cannot access current directory"),
-            )
-            .expect("non-Unicode path")
-            .join("pg_install")),
-            http_auth_type: Set(AuthType::Trust),
-            pg_auth_type: Set(AuthType::Trust),
-            auth_validation_public_key_path: Set(None),
-            remote_storage_config: Set(None),
-            broker_endpoint: Set(storage_broker::DEFAULT_ENDPOINT
-                .parse()
-                .expect("failed to parse default broker endpoint")),
-            broker_keepalive_interval: Set(humantime::parse_duration(
-                storage_broker::DEFAULT_KEEPALIVE_INTERVAL,
-            )
-            .expect("cannot parse default keepalive interval")),
-            log_format: Set(LogFormat::from_str(DEFAULT_LOG_FORMAT).unwrap()),
-
-            concurrent_tenant_warmup: Set(NonZeroUsize::new(DEFAULT_CONCURRENT_TENANT_WARMUP)
-                .expect("Invalid default constant")),
-            concurrent_tenant_size_logical_size_queries: Set(
-                ConfigurableSemaphore::DEFAULT_INITIAL,
-            ),
-            metric_collection_interval: Set(humantime::parse_duration(
-                DEFAULT_METRIC_COLLECTION_INTERVAL,
-            )
-            .expect("cannot parse default metric collection interval")),
-            synthetic_size_calculation_interval: Set(humantime::parse_duration(
-                DEFAULT_SYNTHETIC_SIZE_CALCULATION_INTERVAL,
-            )
-            .expect("cannot parse default synthetic size calculation interval")),
-            metric_collection_endpoint: Set(DEFAULT_METRIC_COLLECTION_ENDPOINT),
-
-            metric_collection_bucket: Set(None),
-
-            disk_usage_based_eviction: Set(None),
-
-            test_remote_failures: Set(0),
-
-            ondemand_download_behavior_treat_error_as_warn: Set(false),
-
-            background_task_maximum_delay: Set(humantime::parse_duration(
-                DEFAULT_BACKGROUND_TASK_MAXIMUM_DELAY,
-            )
-            .unwrap()),
-
-            control_plane_api: Set(None),
-            control_plane_api_token: Set(None),
-            control_plane_emergency_mode: Set(false),
-
-            heatmap_upload_concurrency: Set(DEFAULT_HEATMAP_UPLOAD_CONCURRENCY),
-            secondary_download_concurrency: Set(DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY),
-
-            ingest_batch_size: Set(DEFAULT_INGEST_BATCH_SIZE),
-
-            virtual_file_io_engine: Set(DEFAULT_VIRTUAL_FILE_IO_ENGINE.parse().unwrap()),
-
-            max_vectored_read_bytes: Set(MaxVectoredReadBytes(
-                NonZeroUsize::new(DEFAULT_MAX_VECTORED_READ_BYTES).unwrap(),
-            )),
-            image_compression: Set(DEFAULT_IMAGE_COMPRESSION.parse().unwrap()),
-            ephemeral_bytes_per_memory_kb: Set(DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB),
-            l0_flush: Set(L0FlushConfig::default()),
-            compact_level0_phase1_value_access: Set(CompactL0Phase1ValueAccess::default()),
-            virtual_file_direct_io: Set(virtual_file::DirectIoMode::default()),
-            io_buffer_alignment: Set(DEFAULT_IO_BUFFER_ALIGNMENT),
-        }
-    }
-}
-
-impl PageServerConfigBuilder {
-    pub fn listen_pg_addr(&mut self, listen_pg_addr: String) {
-        self.listen_pg_addr = BuilderValue::Set(listen_pg_addr)
-    }
-
-    pub fn listen_http_addr(&mut self, listen_http_addr: String) {
-        self.listen_http_addr = BuilderValue::Set(listen_http_addr)
-    }
-
-    pub fn availability_zone(&mut self, availability_zone: Option<String>) {
-        self.availability_zone = BuilderValue::Set(availability_zone)
-    }
-
-    pub fn wait_lsn_timeout(&mut self, wait_lsn_timeout: Duration) {
-        self.wait_lsn_timeout = BuilderValue::Set(wait_lsn_timeout)
-    }
-
-    pub fn wal_redo_timeout(&mut self, wal_redo_timeout: Duration) {
-        self.wal_redo_timeout = BuilderValue::Set(wal_redo_timeout)
-    }
-
-    pub fn superuser(&mut self, superuser: String) {
-        self.superuser = BuilderValue::Set(superuser)
-    }
-
-    pub fn page_cache_size(&mut self, page_cache_size: usize) {
-        self.page_cache_size = BuilderValue::Set(page_cache_size)
-    }
-
-    pub fn max_file_descriptors(&mut self, max_file_descriptors: usize) {
-        self.max_file_descriptors = BuilderValue::Set(max_file_descriptors)
-    }
-
-    pub fn workdir(&mut self, workdir: Utf8PathBuf) {
-        self.workdir = BuilderValue::Set(workdir)
-    }
-
-    pub fn pg_distrib_dir(&mut self, pg_distrib_dir: Utf8PathBuf) {
-        self.pg_distrib_dir = BuilderValue::Set(pg_distrib_dir)
-    }
-
-    pub fn http_auth_type(&mut self, auth_type: AuthType) {
-        self.http_auth_type = BuilderValue::Set(auth_type)
-    }
-
-    pub fn pg_auth_type(&mut self, auth_type: AuthType) {
-        self.pg_auth_type = BuilderValue::Set(auth_type)
-    }
-
-    pub fn auth_validation_public_key_path(
-        &mut self,
-        auth_validation_public_key_path: Option<Utf8PathBuf>,
-    ) {
-        self.auth_validation_public_key_path = BuilderValue::Set(auth_validation_public_key_path)
-    }
-
-    pub fn remote_storage_config(&mut self, remote_storage_config: Option<RemoteStorageConfig>) {
-        self.remote_storage_config = BuilderValue::Set(remote_storage_config)
-    }
-
-    pub fn broker_endpoint(&mut self, broker_endpoint: Uri) {
-        self.broker_endpoint = BuilderValue::Set(broker_endpoint)
-    }
-
-    pub fn broker_keepalive_interval(&mut self, broker_keepalive_interval: Duration) {
-        self.broker_keepalive_interval = BuilderValue::Set(broker_keepalive_interval)
-    }
-
-    pub fn log_format(&mut self, log_format: LogFormat) {
-        self.log_format = BuilderValue::Set(log_format)
-    }
-
-    pub fn concurrent_tenant_warmup(&mut self, u: NonZeroUsize) {
-        self.concurrent_tenant_warmup = BuilderValue::Set(u);
-    }
-
-    pub fn concurrent_tenant_size_logical_size_queries(&mut self, u: NonZeroUsize) {
-        self.concurrent_tenant_size_logical_size_queries = BuilderValue::Set(u);
-    }
-
-    pub fn metric_collection_interval(&mut self, metric_collection_interval: Duration) {
-        self.metric_collection_interval = BuilderValue::Set(metric_collection_interval)
-    }
-
-    pub fn metric_collection_endpoint(&mut self, metric_collection_endpoint: Option<Url>) {
-        self.metric_collection_endpoint = BuilderValue::Set(metric_collection_endpoint)
-    }
-
-    pub fn metric_collection_bucket(
-        &mut self,
-        metric_collection_bucket: Option<RemoteStorageConfig>,
-    ) {
-        self.metric_collection_bucket = BuilderValue::Set(metric_collection_bucket)
-    }
-
-    pub fn synthetic_size_calculation_interval(
-        &mut self,
-        synthetic_size_calculation_interval: Duration,
-    ) {
-        self.synthetic_size_calculation_interval =
-            BuilderValue::Set(synthetic_size_calculation_interval)
-    }
-
-    pub fn test_remote_failures(&mut self, fail_first: u64) {
-        self.test_remote_failures = BuilderValue::Set(fail_first);
-    }
-
-    pub fn disk_usage_based_eviction(&mut self, value: Option<DiskUsageEvictionTaskConfig>) {
-        self.disk_usage_based_eviction = BuilderValue::Set(value);
-    }
-
-    pub fn ondemand_download_behavior_treat_error_as_warn(
-        &mut self,
-        ondemand_download_behavior_treat_error_as_warn: bool,
-    ) {
-        self.ondemand_download_behavior_treat_error_as_warn =
-            BuilderValue::Set(ondemand_download_behavior_treat_error_as_warn);
-    }
-
-    pub fn background_task_maximum_delay(&mut self, delay: Duration) {
-        self.background_task_maximum_delay = BuilderValue::Set(delay);
-    }
-
-    pub fn control_plane_api(&mut self, api: Option<Url>) {
-        self.control_plane_api = BuilderValue::Set(api)
-    }
-
-    pub fn control_plane_api_token(&mut self, token: Option<SecretString>) {
-        self.control_plane_api_token = BuilderValue::Set(token)
-    }
-
-    pub fn control_plane_emergency_mode(&mut self, enabled: bool) {
-        self.control_plane_emergency_mode = BuilderValue::Set(enabled)
-    }
-
-    pub fn heatmap_upload_concurrency(&mut self, value: usize) {
-        self.heatmap_upload_concurrency = BuilderValue::Set(value)
-    }
-
-    pub fn secondary_download_concurrency(&mut self, value: usize) {
-        self.secondary_download_concurrency = BuilderValue::Set(value)
-    }
-
-    pub fn ingest_batch_size(&mut self, ingest_batch_size: u64) {
-        self.ingest_batch_size = BuilderValue::Set(ingest_batch_size)
-    }
-
-    pub fn virtual_file_io_engine(&mut self, value: virtual_file::IoEngineKind) {
-        self.virtual_file_io_engine = BuilderValue::Set(value);
-    }
-
-    pub fn get_max_vectored_read_bytes(&mut self, value: MaxVectoredReadBytes) {
-        self.max_vectored_read_bytes = BuilderValue::Set(value);
-    }
-
-    pub fn get_image_compression(&mut self, value: ImageCompressionAlgorithm) {
-        self.image_compression = BuilderValue::Set(value);
-    }
-
-    pub fn get_ephemeral_bytes_per_memory_kb(&mut self, value: usize) {
-        self.ephemeral_bytes_per_memory_kb = BuilderValue::Set(value);
-    }
-
-    pub fn l0_flush(&mut self, value: L0FlushConfig) {
-        self.l0_flush = BuilderValue::Set(value);
-    }
-
-    pub fn compact_level0_phase1_value_access(&mut self, value: CompactL0Phase1ValueAccess) {
-        self.compact_level0_phase1_value_access = BuilderValue::Set(value);
-    }
-
-    pub fn virtual_file_direct_io(&mut self, value: virtual_file::DirectIoMode) {
-        self.virtual_file_direct_io = BuilderValue::Set(value);
-    }
-
-    pub fn io_buffer_alignment(&mut self, value: usize) {
-        self.io_buffer_alignment = BuilderValue::Set(value);
-    }
-
-    pub fn build(self, id: NodeId) -> anyhow::Result<PageServerConf> {
-        let default = Self::default_values();
-
-        macro_rules! conf {
-            (USING DEFAULT { $($field:ident,)* } CUSTOM LOGIC { $($custom_field:ident : $custom_value:expr,)* } ) => {
-                PageServerConf {
-                    $(
-                        $field: self.$field.ok_or(stringify!($field), default.$field)?,
-                    )*
-                    $(
-                        $custom_field: $custom_value,
-                    )*
-                }
-            };
-        }
-
-        Ok(conf!(
-            USING DEFAULT
-            {
-                listen_pg_addr,
-                listen_http_addr,
-                availability_zone,
-                wait_lsn_timeout,
-                wal_redo_timeout,
-                superuser,
-                page_cache_size,
-                max_file_descriptors,
-                workdir,
-                pg_distrib_dir,
-                http_auth_type,
-                pg_auth_type,
-                auth_validation_public_key_path,
-                remote_storage_config,
-                broker_endpoint,
-                broker_keepalive_interval,
-                log_format,
-                metric_collection_interval,
-                metric_collection_endpoint,
-                metric_collection_bucket,
-                synthetic_size_calculation_interval,
-                disk_usage_based_eviction,
-                test_remote_failures,
-                ondemand_download_behavior_treat_error_as_warn,
-                background_task_maximum_delay,
-                control_plane_api,
-                control_plane_api_token,
-                control_plane_emergency_mode,
-                heatmap_upload_concurrency,
-                secondary_download_concurrency,
-                ingest_batch_size,
-                max_vectored_read_bytes,
-                image_compression,
-                ephemeral_bytes_per_memory_kb,
-                l0_flush,
-                compact_level0_phase1_value_access,
-                virtual_file_direct_io,
-                io_buffer_alignment,
-            }
-            CUSTOM LOGIC
-            {
-                id: id,
-                // TenantConf is handled separately
-                default_tenant_conf: TenantConf::default(),
-                concurrent_tenant_warmup: ConfigurableSemaphore::new({
-                    self
-                        .concurrent_tenant_warmup
-                        .ok_or("concurrent_tenant_warmpup",
-                               default.concurrent_tenant_warmup)?
-                }),
-                concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore::new(
-                    self
-                        .concurrent_tenant_size_logical_size_queries
-                        .ok_or("concurrent_tenant_size_logical_size_queries",
-                               default.concurrent_tenant_size_logical_size_queries.clone())?
-                ),
-                eviction_task_immitated_concurrent_logical_size_queries: ConfigurableSemaphore::new(
-                    // re-use `concurrent_tenant_size_logical_size_queries`
-                    self
-                        .concurrent_tenant_size_logical_size_queries
-                        .ok_or("eviction_task_immitated_concurrent_logical_size_queries",
-                               default.concurrent_tenant_size_logical_size_queries.clone())?,
-                ),
-                virtual_file_io_engine: match self.virtual_file_io_engine {
-                    BuilderValue::Set(v) => v,
-                    BuilderValue::NotSet => match crate::virtual_file::io_engine_feature_test().context("auto-detect virtual_file_io_engine")? {
-                        io_engine::FeatureTestResult::PlatformPreferred(v) => v, // make no noise
-                        io_engine::FeatureTestResult::Worse { engine, remark } => {
-                            // TODO: bubble this up to the caller so we can tracing::warn! it.
-                            eprintln!("auto-detected IO engine is not platform-preferred: engine={engine:?} remark={remark:?}");
-                            engine
-                        }
-                    },
-                },
-            }
-        ))
-    }
-}
-
 impl PageServerConf {
     //
     // Repository paths, relative to workdir.
@@ -878,134 +300,135 @@ impl PageServerConf {
     ///
     /// This leaves any options not present in the file in the built-in defaults.
     pub fn parse_and_validate(
-        node_id: NodeId,
-        toml: &Document,
+        id: NodeId,
+        config_toml: pageserver_api::config::ConfigToml,
         workdir: &Utf8Path,
     ) -> anyhow::Result<Self> {
-        let mut builder = PageServerConfigBuilder::new();
-        builder.workdir(workdir.to_owned());
+        let pageserver_api::config::ConfigToml {
+            listen_pg_addr,
+            listen_http_addr,
+            availability_zone,
+            wait_lsn_timeout,
+            wal_redo_timeout,
+            superuser,
+            page_cache_size,
+            max_file_descriptors,
+            pg_distrib_dir,
+            http_auth_type,
+            pg_auth_type,
+            auth_validation_public_key_path,
+            remote_storage,
+            broker_endpoint,
+            broker_keepalive_interval,
+            log_format,
+            metric_collection_interval,
+            metric_collection_endpoint,
+            metric_collection_bucket,
+            synthetic_size_calculation_interval,
+            disk_usage_based_eviction,
+            test_remote_failures,
+            ondemand_download_behavior_treat_error_as_warn,
+            background_task_maximum_delay,
+            control_plane_api,
+            control_plane_api_token,
+            control_plane_emergency_mode,
+            heatmap_upload_concurrency,
+            secondary_download_concurrency,
+            ingest_batch_size,
+            max_vectored_read_bytes,
+            image_compression,
+            ephemeral_bytes_per_memory_kb,
+            compact_level0_phase1_value_access,
+            l0_flush,
+            virtual_file_direct_io,
+            concurrent_tenant_warmup,
+            concurrent_tenant_size_logical_size_queries,
+            virtual_file_io_engine,
+            io_buffer_alignment,
+            tenant_config,
+        } = config_toml;
 
-        let mut t_conf = TenantConfOpt::default();
+        let mut conf = PageServerConf {
+            // ------------------------------------------------------------
+            // fields that are already fully validated by the ConfigToml Deserialize impl
+            // ------------------------------------------------------------
+            listen_pg_addr,
+            listen_http_addr,
+            availability_zone,
+            wait_lsn_timeout,
+            wal_redo_timeout,
+            superuser,
+            page_cache_size,
+            max_file_descriptors,
+            http_auth_type,
+            pg_auth_type,
+            auth_validation_public_key_path,
+            remote_storage_config: remote_storage,
+            broker_endpoint,
+            broker_keepalive_interval,
+            log_format,
+            metric_collection_interval,
+            metric_collection_endpoint,
+            metric_collection_bucket,
+            synthetic_size_calculation_interval,
+            disk_usage_based_eviction,
+            test_remote_failures,
+            ondemand_download_behavior_treat_error_as_warn,
+            background_task_maximum_delay,
+            control_plane_api,
+            control_plane_emergency_mode,
+            heatmap_upload_concurrency,
+            secondary_download_concurrency,
+            ingest_batch_size,
+            max_vectored_read_bytes,
+            image_compression,
+            ephemeral_bytes_per_memory_kb,
+            compact_level0_phase1_value_access,
+            virtual_file_direct_io,
+            io_buffer_alignment,
 
-        for (key, item) in toml.iter() {
-            match key {
-                "listen_pg_addr" => builder.listen_pg_addr(parse_toml_string(key, item)?),
-                "listen_http_addr" => builder.listen_http_addr(parse_toml_string(key, item)?),
-                "availability_zone" => builder.availability_zone(Some(parse_toml_string(key, item)?)),
-                "wait_lsn_timeout" => builder.wait_lsn_timeout(parse_toml_duration(key, item)?),
-                "wal_redo_timeout" => builder.wal_redo_timeout(parse_toml_duration(key, item)?),
-                "initial_superuser_name" => builder.superuser(parse_toml_string(key, item)?),
-                "page_cache_size" => builder.page_cache_size(parse_toml_u64(key, item)? as usize),
-                "max_file_descriptors" => {
-                    builder.max_file_descriptors(parse_toml_u64(key, item)? as usize)
-                }
-                "pg_distrib_dir" => {
-                    builder.pg_distrib_dir(Utf8PathBuf::from(parse_toml_string(key, item)?))
-                }
-                "auth_validation_public_key_path" => builder.auth_validation_public_key_path(Some(
-                    Utf8PathBuf::from(parse_toml_string(key, item)?),
-                )),
-                "http_auth_type" => builder.http_auth_type(parse_toml_from_str(key, item)?),
-                "pg_auth_type" => builder.pg_auth_type(parse_toml_from_str(key, item)?),
-                "remote_storage" => {
-                    builder.remote_storage_config(Some(RemoteStorageConfig::from_toml(item).context("remote_storage")?))
-                }
-                "tenant_config" => {
-                    t_conf = TenantConfOpt::try_from(item.to_owned()).context(format!("failed to parse: '{key}'"))?;
-                }
-                "broker_endpoint" => builder.broker_endpoint(parse_toml_string(key, item)?.parse().context("failed to parse broker endpoint")?),
-                "broker_keepalive_interval" => builder.broker_keepalive_interval(parse_toml_duration(key, item)?),
-                "log_format" => builder.log_format(
-                    LogFormat::from_config(&parse_toml_string(key, item)?)?
-                ),
-                "concurrent_tenant_warmup" => builder.concurrent_tenant_warmup({
-                    let input = parse_toml_string(key, item)?;
-                    let permits = input.parse::<usize>().context("expected a number of initial permits, not {s:?}")?;
-                    NonZeroUsize::new(permits).context("initial semaphore permits out of range: 0, use other configuration to disable a feature")?
-                }),
-                "concurrent_tenant_size_logical_size_queries" => builder.concurrent_tenant_size_logical_size_queries({
-                    let input = parse_toml_string(key, item)?;
-                    let permits = input.parse::<usize>().context("expected a number of initial permits, not {s:?}")?;
-                    NonZeroUsize::new(permits).context("initial semaphore permits out of range: 0, use other configuration to disable a feature")?
-                }),
-                "metric_collection_interval" => builder.metric_collection_interval(parse_toml_duration(key, item)?),
-                "metric_collection_endpoint" => {
-                    let endpoint = parse_toml_string(key, item)?.parse().context("failed to parse metric_collection_endpoint")?;
-                    builder.metric_collection_endpoint(Some(endpoint));
-                },
-                "metric_collection_bucket" => {
-                    builder.metric_collection_bucket(Some(RemoteStorageConfig::from_toml(item)?))
-                }
-                "synthetic_size_calculation_interval" =>
-                    builder.synthetic_size_calculation_interval(parse_toml_duration(key, item)?),
-                "test_remote_failures" => builder.test_remote_failures(parse_toml_u64(key, item)?),
-                "disk_usage_based_eviction" => {
-                    tracing::info!("disk_usage_based_eviction: {:#?}", &item);
-                    builder.disk_usage_based_eviction(
-                        deserialize_from_item("disk_usage_based_eviction", item)
-                            .context("parse disk_usage_based_eviction")?
-                    )
-                },
-                "ondemand_download_behavior_treat_error_as_warn" => builder.ondemand_download_behavior_treat_error_as_warn(parse_toml_bool(key, item)?),
-                "background_task_maximum_delay" => builder.background_task_maximum_delay(parse_toml_duration(key, item)?),
-                "control_plane_api" => {
-                    let parsed = parse_toml_string(key, item)?;
-                    if parsed.is_empty() {
-                        builder.control_plane_api(None)
-                    } else {
-                        builder.control_plane_api(Some(parsed.parse().context("failed to parse control plane URL")?))
+            // ------------------------------------------------------------
+            // fields that require additional validation or custom handling
+            // ------------------------------------------------------------
+            workdir: workdir.to_owned(),
+            pg_distrib_dir: pg_distrib_dir.unwrap_or_else(|| {
+                std::env::current_dir()
+                    .expect("current_dir() failed")
+                    .try_into()
+                    .expect("current_dir() is not a valid Utf8Path")
+            }),
+            control_plane_api_token: control_plane_api_token.map(SecretString::from),
+            id,
+            default_tenant_conf: tenant_config,
+            concurrent_tenant_warmup: ConfigurableSemaphore::new(concurrent_tenant_warmup),
+            concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore::new(
+                concurrent_tenant_size_logical_size_queries,
+            ),
+            eviction_task_immitated_concurrent_logical_size_queries: ConfigurableSemaphore::new(
+                // re-use `concurrent_tenant_size_logical_size_queries`
+                concurrent_tenant_size_logical_size_queries,
+            ),
+            virtual_file_io_engine: match virtual_file_io_engine {
+                Some(v) => v,
+                None => match crate::virtual_file::io_engine_feature_test()
+                    .context("auto-detect virtual_file_io_engine")?
+                {
+                    io_engine::FeatureTestResult::PlatformPreferred(v) => v, // make no noise
+                    io_engine::FeatureTestResult::Worse { engine, remark } => {
+                        // TODO: bubble this up to the caller so we can tracing::warn! it.
+                        eprintln!("auto-detected IO engine is not platform-preferred: engine={engine:?} remark={remark:?}");
+                        engine
                     }
                 },
-                "control_plane_api_token" => {
-                    let parsed = parse_toml_string(key, item)?;
-                    if parsed.is_empty() {
-                        builder.control_plane_api_token(None)
-                    } else {
-                        builder.control_plane_api_token(Some(parsed.into()))
-                    }
-                },
-                "control_plane_emergency_mode" => {
-                    builder.control_plane_emergency_mode(parse_toml_bool(key, item)?)
-                },
-                "heatmap_upload_concurrency" => {
-                    builder.heatmap_upload_concurrency(parse_toml_u64(key, item)? as usize)
-                },
-                "secondary_download_concurrency" => {
-                    builder.secondary_download_concurrency(parse_toml_u64(key, item)? as usize)
-                },
-                "ingest_batch_size" => builder.ingest_batch_size(parse_toml_u64(key, item)?),
-                "virtual_file_io_engine" => {
-                    builder.virtual_file_io_engine(parse_toml_from_str("virtual_file_io_engine", item)?)
-                }
-                "max_vectored_read_bytes" => {
-                    let bytes = parse_toml_u64("max_vectored_read_bytes", item)? as usize;
-                    builder.get_max_vectored_read_bytes(
-                        MaxVectoredReadBytes(
-                            NonZeroUsize::new(bytes).expect("Max byte size of vectored read must be greater than 0")))
-                }
-                "image_compression" => {
-                    builder.get_image_compression(parse_toml_from_str("image_compression", item)?)
-                }
-                "ephemeral_bytes_per_memory_kb" => {
-                    builder.get_ephemeral_bytes_per_memory_kb(parse_toml_u64("ephemeral_bytes_per_memory_kb", item)? as usize)
-                }
-                "l0_flush" => {
-                    builder.l0_flush(utils::toml_edit_ext::deserialize_item(item).context("l0_flush")?)
-                }
-                "compact_level0_phase1_value_access" => {
-                    builder.compact_level0_phase1_value_access(utils::toml_edit_ext::deserialize_item(item).context("compact_level0_phase1_value_access")?)
-                }
-                "virtual_file_direct_io" => {
-                    builder.virtual_file_direct_io(utils::toml_edit_ext::deserialize_item(item).context("virtual_file_direct_io")?)
-                }
-                "io_buffer_alignment" => {
-                    builder.io_buffer_alignment(parse_toml_u64("io_buffer_alignment", item)? as usize)
-                }
-                _ => bail!("unrecognized pageserver option '{key}'"),
-            }
-        }
+            },
+            l0_flush: l0_flush
+                .map(crate::l0_flush::L0FlushConfig::from)
+                .unwrap_or_default(),
+        };
 
-        let mut conf = builder.build(node_id).context("invalid config")?;
+        // ------------------------------------------------------------
+        // custom validation code that covers more than one field in isolation
+        // ------------------------------------------------------------
 
         if conf.http_auth_type == AuthType::NeonJWT || conf.pg_auth_type == AuthType::NeonJWT {
             let auth_validation_public_key_path = conf
@@ -1019,10 +442,8 @@ impl PageServerConf {
             );
         }
 
-        conf.default_tenant_conf = t_conf.merge(TenantConf::default());
-
         IndexEntry::validate_checkpoint_distance(conf.default_tenant_conf.checkpoint_distance)
-            .map_err(|msg| anyhow::anyhow!("{msg}"))
+            .map_err(anyhow::Error::msg)
             .with_context(|| {
                 format!(
                     "effective checkpoint distance is unsupported: {}",
@@ -1042,130 +463,25 @@ impl PageServerConf {
     pub fn dummy_conf(repo_dir: Utf8PathBuf) -> Self {
         let pg_distrib_dir = Utf8PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("../pg_install");
 
-        PageServerConf {
-            id: NodeId(0),
+        let config_toml = pageserver_api::config::ConfigToml {
             wait_lsn_timeout: Duration::from_secs(60),
             wal_redo_timeout: Duration::from_secs(60),
-            page_cache_size: defaults::DEFAULT_PAGE_CACHE_SIZE,
-            max_file_descriptors: defaults::DEFAULT_MAX_FILE_DESCRIPTORS,
-            listen_pg_addr: defaults::DEFAULT_PG_LISTEN_ADDR.to_string(),
-            listen_http_addr: defaults::DEFAULT_HTTP_LISTEN_ADDR.to_string(),
-            availability_zone: None,
-            superuser: "cloud_admin".to_string(),
-            workdir: repo_dir,
-            pg_distrib_dir,
-            http_auth_type: AuthType::Trust,
-            pg_auth_type: AuthType::Trust,
-            auth_validation_public_key_path: None,
-            remote_storage_config: None,
-            default_tenant_conf: TenantConf::default(),
-            broker_endpoint: storage_broker::DEFAULT_ENDPOINT.parse().unwrap(),
-            broker_keepalive_interval: Duration::from_secs(5000),
-            log_format: LogFormat::from_str(defaults::DEFAULT_LOG_FORMAT).unwrap(),
-            concurrent_tenant_warmup: ConfigurableSemaphore::new(
-                NonZeroUsize::new(DEFAULT_CONCURRENT_TENANT_WARMUP)
-                    .expect("Invalid default constant"),
-            ),
-            concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore::default(),
-            eviction_task_immitated_concurrent_logical_size_queries: ConfigurableSemaphore::default(
-            ),
+            pg_distrib_dir: Some(pg_distrib_dir),
             metric_collection_interval: Duration::from_secs(60),
-            metric_collection_endpoint: defaults::DEFAULT_METRIC_COLLECTION_ENDPOINT,
-            metric_collection_bucket: None,
             synthetic_size_calculation_interval: Duration::from_secs(60),
-            disk_usage_based_eviction: None,
-            test_remote_failures: 0,
-            ondemand_download_behavior_treat_error_as_warn: false,
             background_task_maximum_delay: Duration::ZERO,
-            control_plane_api: None,
-            control_plane_api_token: None,
-            control_plane_emergency_mode: false,
-            heatmap_upload_concurrency: defaults::DEFAULT_HEATMAP_UPLOAD_CONCURRENCY,
-            secondary_download_concurrency: defaults::DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY,
-            ingest_batch_size: defaults::DEFAULT_INGEST_BATCH_SIZE,
-            virtual_file_io_engine: DEFAULT_VIRTUAL_FILE_IO_ENGINE.parse().unwrap(),
-            max_vectored_read_bytes: MaxVectoredReadBytes(
-                NonZeroUsize::new(defaults::DEFAULT_MAX_VECTORED_READ_BYTES)
-                    .expect("Invalid default constant"),
-            ),
-            image_compression: defaults::DEFAULT_IMAGE_COMPRESSION.parse().unwrap(),
-            ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB,
-            l0_flush: L0FlushConfig::default(),
-            compact_level0_phase1_value_access: CompactL0Phase1ValueAccess::default(),
-            virtual_file_direct_io: virtual_file::DirectIoMode::default(),
-            io_buffer_alignment: defaults::DEFAULT_IO_BUFFER_ALIGNMENT,
-        }
+            ..Default::default()
+        };
+        PageServerConf::parse_and_validate(NodeId(0), config_toml, &repo_dir).unwrap()
     }
 }
 
-#[derive(Deserialize)]
+#[derive(serde::Deserialize, serde::Serialize)]
 #[serde(deny_unknown_fields)]
 pub struct PageserverIdentity {
     pub id: NodeId,
 }
 
-// Helper functions to parse a toml Item
-
-fn parse_toml_string(name: &str, item: &Item) -> Result<String> {
-    let s = item
-        .as_str()
-        .with_context(|| format!("configure option {name} is not a string"))?;
-    Ok(s.to_string())
-}
-
-fn parse_toml_u64(name: &str, item: &Item) -> Result<u64> {
-    // A toml integer is signed, so it cannot represent the full range of an u64. That's OK
-    // for our use, though.
-    let i: i64 = item
-        .as_integer()
-        .with_context(|| format!("configure option {name} is not an integer"))?;
-    if i < 0 {
-        bail!("configure option {name} cannot be negative");
-    }
-    Ok(i as u64)
-}
-
-fn parse_toml_bool(name: &str, item: &Item) -> Result<bool> {
-    item.as_bool()
-        .with_context(|| format!("configure option {name} is not a bool"))
-}
-
-fn parse_toml_duration(name: &str, item: &Item) -> Result<Duration> {
-    let s = item
-        .as_str()
-        .with_context(|| format!("configure option {name} is not a string"))?;
-
-    Ok(humantime::parse_duration(s)?)
-}
-
-fn parse_toml_from_str<T>(name: &str, item: &Item) -> anyhow::Result<T>
-where
-    T: FromStr,
-    <T as FromStr>::Err: std::fmt::Display,
-{
-    let v = item
-        .as_str()
-        .with_context(|| format!("configure option {name} is not a string"))?;
-    T::from_str(v).map_err(|e| {
-        anyhow!(
-            "Failed to parse string as {parse_type} for configure option {name}: {e}",
-            parse_type = stringify!(T)
-        )
-    })
-}
-
-fn deserialize_from_item<T>(name: &str, item: &Item) -> anyhow::Result<T>
-where
-    T: serde::de::DeserializeOwned,
-{
-    // ValueDeserializer::new is not public, so use the ValueDeserializer's documented way
-    let deserializer = match item.clone().into_value() {
-        Ok(value) => value.into_deserializer(),
-        Err(item) => anyhow::bail!("toml_edit::Item '{item}' is not a toml_edit::Value"),
-    };
-    T::deserialize(deserializer).with_context(|| format!("deserializing item for node {name}"))
-}
-
 /// Configurable semaphore permits setting.
 ///
 /// Does not allow semaphore permits to be zero, because at runtime initially zero permits and empty
@@ -1227,469 +543,108 @@ impl ConfigurableSemaphore {
 
 #[cfg(test)]
 mod tests {
-    use std::{fs, num::NonZeroU32};
 
-    use camino_tempfile::{tempdir, Utf8TempDir};
-    use pageserver_api::models::EvictionPolicy;
-    use remote_storage::{RemoteStorageKind, S3Config};
-    use utils::serde_percent::Percent;
+    use camino::Utf8PathBuf;
+    use utils::id::NodeId;
 
-    use super::*;
-    use crate::DEFAULT_PG_VERSION;
-
-    const ALL_BASE_VALUES_TOML: &str = r#"
-# Initial configuration file created by 'pageserver --init'
-
-listen_pg_addr = '127.0.0.1:64000'
-listen_http_addr = '127.0.0.1:9898'
-
-wait_lsn_timeout = '111 s'
-wal_redo_timeout = '111 s'
-
-page_cache_size = 444
-max_file_descriptors = 333
-
-# initial superuser role name to use when creating a new tenant
-initial_superuser_name = 'zzzz'
-
-metric_collection_interval = '222 s'
-metric_collection_endpoint = 'http://localhost:80/metrics'
-synthetic_size_calculation_interval = '333 s'
-
-log_format = 'json'
-background_task_maximum_delay = '334 s'
-
-"#;
+    use super::PageServerConf;
 
     #[test]
-    fn parse_defaults() -> anyhow::Result<()> {
-        let tempdir = tempdir()?;
-        let (workdir, pg_distrib_dir) = prepare_fs(&tempdir)?;
-        let broker_endpoint = storage_broker::DEFAULT_ENDPOINT;
-        // we have to create dummy values to overcome the validation errors
-        let config_string =
-            format!("pg_distrib_dir='{pg_distrib_dir}'\nbroker_endpoint = '{broker_endpoint}'",);
-        let toml = config_string.parse()?;
-
-        let parsed_config = PageServerConf::parse_and_validate(NodeId(10), &toml, &workdir)
-            .unwrap_or_else(|e| panic!("Failed to parse config '{config_string}', reason: {e:?}"));
-
-        assert_eq!(
-            parsed_config,
-            PageServerConf {
-                id: NodeId(10),
-                listen_pg_addr: defaults::DEFAULT_PG_LISTEN_ADDR.to_string(),
-                listen_http_addr: defaults::DEFAULT_HTTP_LISTEN_ADDR.to_string(),
-                availability_zone: None,
-                wait_lsn_timeout: humantime::parse_duration(defaults::DEFAULT_WAIT_LSN_TIMEOUT)?,
-                wal_redo_timeout: humantime::parse_duration(defaults::DEFAULT_WAL_REDO_TIMEOUT)?,
-                superuser: defaults::DEFAULT_SUPERUSER.to_string(),
-                page_cache_size: defaults::DEFAULT_PAGE_CACHE_SIZE,
-                max_file_descriptors: defaults::DEFAULT_MAX_FILE_DESCRIPTORS,
-                workdir,
-                pg_distrib_dir,
-                http_auth_type: AuthType::Trust,
-                pg_auth_type: AuthType::Trust,
-                auth_validation_public_key_path: None,
-                remote_storage_config: None,
-                default_tenant_conf: TenantConf::default(),
-                broker_endpoint: storage_broker::DEFAULT_ENDPOINT.parse().unwrap(),
-                broker_keepalive_interval: humantime::parse_duration(
-                    storage_broker::DEFAULT_KEEPALIVE_INTERVAL
-                )?,
-                log_format: LogFormat::from_str(defaults::DEFAULT_LOG_FORMAT).unwrap(),
-                concurrent_tenant_warmup: ConfigurableSemaphore::new(
-                    NonZeroUsize::new(DEFAULT_CONCURRENT_TENANT_WARMUP).unwrap()
-                ),
-                concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore::default(),
-                eviction_task_immitated_concurrent_logical_size_queries:
-                    ConfigurableSemaphore::default(),
-                metric_collection_interval: humantime::parse_duration(
-                    defaults::DEFAULT_METRIC_COLLECTION_INTERVAL
-                )?,
-                metric_collection_endpoint: defaults::DEFAULT_METRIC_COLLECTION_ENDPOINT,
-                metric_collection_bucket: None,
-                synthetic_size_calculation_interval: humantime::parse_duration(
-                    defaults::DEFAULT_SYNTHETIC_SIZE_CALCULATION_INTERVAL
-                )?,
-                disk_usage_based_eviction: None,
-                test_remote_failures: 0,
-                ondemand_download_behavior_treat_error_as_warn: false,
-                background_task_maximum_delay: humantime::parse_duration(
-                    defaults::DEFAULT_BACKGROUND_TASK_MAXIMUM_DELAY
-                )?,
-                control_plane_api: None,
-                control_plane_api_token: None,
-                control_plane_emergency_mode: false,
-                heatmap_upload_concurrency: defaults::DEFAULT_HEATMAP_UPLOAD_CONCURRENCY,
-                secondary_download_concurrency: defaults::DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY,
-                ingest_batch_size: defaults::DEFAULT_INGEST_BATCH_SIZE,
-                virtual_file_io_engine: DEFAULT_VIRTUAL_FILE_IO_ENGINE.parse().unwrap(),
-                max_vectored_read_bytes: MaxVectoredReadBytes(
-                    NonZeroUsize::new(defaults::DEFAULT_MAX_VECTORED_READ_BYTES)
-                        .expect("Invalid default constant")
-                ),
-                image_compression: defaults::DEFAULT_IMAGE_COMPRESSION.parse().unwrap(),
-                ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB,
-                l0_flush: L0FlushConfig::default(),
-                compact_level0_phase1_value_access: CompactL0Phase1ValueAccess::default(),
-                virtual_file_direct_io: virtual_file::DirectIoMode::default(),
-                io_buffer_alignment: defaults::DEFAULT_IO_BUFFER_ALIGNMENT,
-            },
-            "Correct defaults should be used when no config values are provided"
-        );
-
-        Ok(())
-    }
-
-    #[test]
-    fn parse_basic_config() -> anyhow::Result<()> {
-        let tempdir = tempdir()?;
-        let (workdir, pg_distrib_dir) = prepare_fs(&tempdir)?;
-        let broker_endpoint = storage_broker::DEFAULT_ENDPOINT;
-
-        let config_string = format!(
-            "{ALL_BASE_VALUES_TOML}pg_distrib_dir='{pg_distrib_dir}'\nbroker_endpoint = '{broker_endpoint}'",
-        );
-        let toml = config_string.parse()?;
-
-        let parsed_config = PageServerConf::parse_and_validate(NodeId(10), &toml, &workdir)
-            .unwrap_or_else(|e| panic!("Failed to parse config '{config_string}', reason: {e:?}"));
-
-        assert_eq!(
-            parsed_config,
-            PageServerConf {
-                id: NodeId(10),
-                listen_pg_addr: "127.0.0.1:64000".to_string(),
-                listen_http_addr: "127.0.0.1:9898".to_string(),
-                availability_zone: None,
-                wait_lsn_timeout: Duration::from_secs(111),
-                wal_redo_timeout: Duration::from_secs(111),
-                superuser: "zzzz".to_string(),
-                page_cache_size: 444,
-                max_file_descriptors: 333,
-                workdir,
-                pg_distrib_dir,
-                http_auth_type: AuthType::Trust,
-                pg_auth_type: AuthType::Trust,
-                auth_validation_public_key_path: None,
-                remote_storage_config: None,
-                default_tenant_conf: TenantConf::default(),
-                broker_endpoint: storage_broker::DEFAULT_ENDPOINT.parse().unwrap(),
-                broker_keepalive_interval: Duration::from_secs(5),
-                log_format: LogFormat::Json,
-                concurrent_tenant_warmup: ConfigurableSemaphore::new(
-                    NonZeroUsize::new(DEFAULT_CONCURRENT_TENANT_WARMUP).unwrap()
-                ),
-                concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore::default(),
-                eviction_task_immitated_concurrent_logical_size_queries:
-                    ConfigurableSemaphore::default(),
-                metric_collection_interval: Duration::from_secs(222),
-                metric_collection_endpoint: Some(Url::parse("http://localhost:80/metrics")?),
-                metric_collection_bucket: None,
-                synthetic_size_calculation_interval: Duration::from_secs(333),
-                disk_usage_based_eviction: None,
-                test_remote_failures: 0,
-                ondemand_download_behavior_treat_error_as_warn: false,
-                background_task_maximum_delay: Duration::from_secs(334),
-                control_plane_api: None,
-                control_plane_api_token: None,
-                control_plane_emergency_mode: false,
-                heatmap_upload_concurrency: defaults::DEFAULT_HEATMAP_UPLOAD_CONCURRENCY,
-                secondary_download_concurrency: defaults::DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY,
-                ingest_batch_size: 100,
-                virtual_file_io_engine: DEFAULT_VIRTUAL_FILE_IO_ENGINE.parse().unwrap(),
-                max_vectored_read_bytes: MaxVectoredReadBytes(
-                    NonZeroUsize::new(defaults::DEFAULT_MAX_VECTORED_READ_BYTES)
-                        .expect("Invalid default constant")
-                ),
-                image_compression: defaults::DEFAULT_IMAGE_COMPRESSION.parse().unwrap(),
-                ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB,
-                l0_flush: L0FlushConfig::default(),
-                compact_level0_phase1_value_access: CompactL0Phase1ValueAccess::default(),
-                virtual_file_direct_io: virtual_file::DirectIoMode::default(),
-                io_buffer_alignment: defaults::DEFAULT_IO_BUFFER_ALIGNMENT,
-            },
-            "Should be able to parse all basic config values correctly"
-        );
-
-        Ok(())
-    }
-
-    #[test]
-    fn parse_remote_fs_storage_config() -> anyhow::Result<()> {
-        let tempdir = tempdir()?;
-        let (workdir, pg_distrib_dir) = prepare_fs(&tempdir)?;
-        let broker_endpoint = "http://127.0.0.1:7777";
-
-        let local_storage_path = tempdir.path().join("local_remote_storage");
-
-        let identical_toml_declarations = &[
-            format!(
-                r#"[remote_storage]
-local_path = '{local_storage_path}'"#,
-            ),
-            format!("remote_storage={{local_path='{local_storage_path}'}}"),
-        ];
-
-        for remote_storage_config_str in identical_toml_declarations {
-            let config_string = format!(
-                r#"{ALL_BASE_VALUES_TOML}
-pg_distrib_dir='{pg_distrib_dir}'
-broker_endpoint = '{broker_endpoint}'
-
-{remote_storage_config_str}"#,
-            );
-
-            let toml = config_string.parse()?;
-
-            let parsed_remote_storage_config =
-                PageServerConf::parse_and_validate(NodeId(10), &toml, &workdir)
-                    .unwrap_or_else(|e| {
-                        panic!("Failed to parse config '{config_string}', reason: {e:?}")
-                    })
-                    .remote_storage_config
-                    .expect("Should have remote storage config for the local FS");
-
-            assert_eq!(
-                parsed_remote_storage_config,
-                RemoteStorageConfig {
-                    storage: RemoteStorageKind::LocalFs { local_path: local_storage_path.clone() },
-                    timeout: RemoteStorageConfig::DEFAULT_TIMEOUT,
-                },
-                "Remote storage config should correctly parse the local FS config and fill other storage defaults"
-            );
-        }
-        Ok(())
-    }
-
-    #[test]
-    fn parse_remote_s3_storage_config() -> anyhow::Result<()> {
-        let tempdir = tempdir()?;
-        let (workdir, pg_distrib_dir) = prepare_fs(&tempdir)?;
-
-        let bucket_name = "some-sample-bucket".to_string();
-        let bucket_region = "eu-north-1".to_string();
-        let prefix_in_bucket = "test_prefix".to_string();
-        let endpoint = "http://localhost:5000".to_string();
-        let max_concurrent_syncs = NonZeroUsize::new(111).unwrap();
-        let max_sync_errors = NonZeroU32::new(222).unwrap();
-        let s3_concurrency_limit = NonZeroUsize::new(333).unwrap();
-        let broker_endpoint = "http://127.0.0.1:7777";
-
-        let identical_toml_declarations = &[
-            format!(
-                r#"[remote_storage]
-max_concurrent_syncs = {max_concurrent_syncs}
-max_sync_errors = {max_sync_errors}
-bucket_name = '{bucket_name}'
-bucket_region = '{bucket_region}'
-prefix_in_bucket = '{prefix_in_bucket}'
-endpoint = '{endpoint}'
-concurrency_limit = {s3_concurrency_limit}"#
-            ),
-            format!(
-                "remote_storage={{max_concurrent_syncs={max_concurrent_syncs}, max_sync_errors={max_sync_errors}, bucket_name='{bucket_name}',\
-                bucket_region='{bucket_region}', prefix_in_bucket='{prefix_in_bucket}', endpoint='{endpoint}', concurrency_limit={s3_concurrency_limit}}}",
-            ),
-        ];
-
-        for remote_storage_config_str in identical_toml_declarations {
-            let config_string = format!(
-                r#"{ALL_BASE_VALUES_TOML}
-pg_distrib_dir='{pg_distrib_dir}'
-broker_endpoint = '{broker_endpoint}'
-
-{remote_storage_config_str}"#,
-            );
-
-            let toml = config_string.parse()?;
-
-            let parsed_remote_storage_config =
-                PageServerConf::parse_and_validate(NodeId(10), &toml, &workdir)
-                    .unwrap_or_else(|e| {
-                        panic!("Failed to parse config '{config_string}', reason: {e:?}")
-                    })
-                    .remote_storage_config
-                    .expect("Should have remote storage config for S3");
-
-            assert_eq!(
-                parsed_remote_storage_config,
-                RemoteStorageConfig {
-                    storage: RemoteStorageKind::AwsS3(S3Config {
-                        bucket_name: bucket_name.clone(),
-                        bucket_region: bucket_region.clone(),
-                        prefix_in_bucket: Some(prefix_in_bucket.clone()),
-                        endpoint: Some(endpoint.clone()),
-                        concurrency_limit: s3_concurrency_limit,
-                        max_keys_per_list_response: None,
-                        upload_storage_class: None,
-                    }),
-                    timeout: RemoteStorageConfig::DEFAULT_TIMEOUT,
-                },
-                "Remote storage config should correctly parse the S3 config"
-            );
-        }
-        Ok(())
-    }
-
-    #[test]
-    fn parse_incorrect_tenant_config() -> anyhow::Result<()> {
-        let config_string = r#"
-            [tenant_config]
-            checkpoint_distance = -1 # supposed to be an u64
-        "#
-        .to_string();
-
-        let toml: Document = config_string.parse()?;
-        let item = toml.get("tenant_config").unwrap();
-        let error = TenantConfOpt::try_from(item.to_owned()).unwrap_err();
-
-        let expected_error_str = "checkpoint_distance: invalid value: integer `-1`, expected u64";
-        assert_eq!(error.to_string(), expected_error_str);
-
-        Ok(())
-    }
-
-    #[test]
-    fn parse_override_tenant_config() -> anyhow::Result<()> {
-        let config_string = r#"tenant_config={ min_resident_size_override =  400 }"#.to_string();
-
-        let toml: Document = config_string.parse()?;
-        let item = toml.get("tenant_config").unwrap();
-        let conf = TenantConfOpt::try_from(item.to_owned()).unwrap();
-
-        assert_eq!(conf.min_resident_size_override, Some(400));
-
-        Ok(())
-    }
-
-    #[test]
-    fn eviction_pageserver_config_parse() -> anyhow::Result<()> {
-        let tempdir = tempdir()?;
-        let (workdir, pg_distrib_dir) = prepare_fs(&tempdir)?;
-
-        let pageserver_conf_toml = format!(
-            r#"pg_distrib_dir = "{pg_distrib_dir}"
-metric_collection_endpoint = "http://sample.url"
-metric_collection_interval = "10min"
-
-[disk_usage_based_eviction]
-max_usage_pct = 80
-min_avail_bytes = 0
-period = "10s"
-
-[tenant_config]
-evictions_low_residence_duration_metric_threshold = "20m"
-
-[tenant_config.eviction_policy]
-kind = "LayerAccessThreshold"
-period = "20m"
-threshold = "20m"
-"#,
-        );
-        let toml: Document = pageserver_conf_toml.parse()?;
-        let conf = PageServerConf::parse_and_validate(NodeId(333), &toml, &workdir)?;
-
-        assert_eq!(conf.pg_distrib_dir, pg_distrib_dir);
-        assert_eq!(
-            conf.metric_collection_endpoint,
-            Some("http://sample.url".parse().unwrap())
-        );
-        assert_eq!(
-            conf.metric_collection_interval,
-            Duration::from_secs(10 * 60)
-        );
-        assert_eq!(
-            conf.default_tenant_conf
-                .evictions_low_residence_duration_metric_threshold,
-            Duration::from_secs(20 * 60)
-        );
-
-        // Assert that the node id provided by the indentity file (threaded
-        // through the call to [`PageServerConf::parse_and_validate`] is
-        // used.
-        assert_eq!(conf.id, NodeId(333));
-        assert_eq!(
-            conf.disk_usage_based_eviction,
-            Some(DiskUsageEvictionTaskConfig {
-                max_usage_pct: Percent::new(80).unwrap(),
-                min_avail_bytes: 0,
-                period: Duration::from_secs(10),
-                #[cfg(feature = "testing")]
-                mock_statvfs: None,
-                eviction_order: Default::default(),
-            })
-        );
-
-        match &conf.default_tenant_conf.eviction_policy {
-            EvictionPolicy::LayerAccessThreshold(eviction_threshold) => {
-                assert_eq!(eviction_threshold.period, Duration::from_secs(20 * 60));
-                assert_eq!(eviction_threshold.threshold, Duration::from_secs(20 * 60));
-            }
-            other => unreachable!("Unexpected eviction policy tenant settings: {other:?}"),
-        }
-
-        Ok(())
-    }
-
-    #[test]
-    fn parse_imitation_only_pageserver_config() {
-        let tempdir = tempdir().unwrap();
-        let (workdir, pg_distrib_dir) = prepare_fs(&tempdir).unwrap();
-
-        let pageserver_conf_toml = format!(
-            r#"pg_distrib_dir = "{pg_distrib_dir}"
-metric_collection_endpoint = "http://sample.url"
-metric_collection_interval = "10min"
-
-[tenant_config]
-evictions_low_residence_duration_metric_threshold = "20m"
-
-[tenant_config.eviction_policy]
-kind = "OnlyImitiate"
-period = "20m"
-threshold = "20m"
-"#,
-        );
-        let toml: Document = pageserver_conf_toml.parse().unwrap();
-        let conf = PageServerConf::parse_and_validate(NodeId(222), &toml, &workdir).unwrap();
-
-        match &conf.default_tenant_conf.eviction_policy {
-            EvictionPolicy::OnlyImitiate(t) => {
-                assert_eq!(t.period, Duration::from_secs(20 * 60));
-                assert_eq!(t.threshold, Duration::from_secs(20 * 60));
-            }
-            other => unreachable!("Unexpected eviction policy tenant settings: {other:?}"),
-        }
-    }
-
-    #[test]
-    fn empty_remote_storage_is_error() {
-        let tempdir = tempdir().unwrap();
-        let (workdir, _) = prepare_fs(&tempdir).unwrap();
+    fn test_empty_config_toml_is_valid() {
+        // we use Default impl of everything in this situation
         let input = r#"
-remote_storage = {}
         "#;
-        let doc = toml_edit::Document::from_str(input).unwrap();
-        let err = PageServerConf::parse_and_validate(NodeId(222), &doc, &workdir)
-            .expect_err("empty remote_storage field should fail, don't specify it if you want no remote_storage");
-        assert!(format!("{err}").contains("remote_storage"), "{err}");
+        let config_toml = toml_edit::de::from_str::<pageserver_api::config::ConfigToml>(input)
+            .expect("empty config is valid");
+        let workdir = Utf8PathBuf::from("/nonexistent");
+        PageServerConf::parse_and_validate(NodeId(0), config_toml, &workdir)
+            .expect("parse_and_validate");
     }
 
-    fn prepare_fs(tempdir: &Utf8TempDir) -> anyhow::Result<(Utf8PathBuf, Utf8PathBuf)> {
-        let tempdir_path = tempdir.path();
+    /// If there's a typo in the pageserver config, we'd rather catch that typo
+    /// and fail pageserver startup than silently ignoring the typo, leaving whoever
+    /// made it in the believe that their config change is effective.
+    ///
+    /// The default in serde is to allow unknown fields, so, we rely
+    /// on developer+review discipline to add `deny_unknown_fields` when adding
+    /// new structs to the config, and these tests here as a regression test.
+    ///
+    /// The alternative to all of this would be to allow unknown fields in the config.
+    /// To catch them, we could have a config check tool or mgmt API endpoint that
+    /// compares the effective config with the TOML on disk and makes sure that
+    /// the on-disk TOML is a strict subset of the effective config.
+    mod unknown_fields_handling {
+        macro_rules! test {
+            ($short_name:ident, $input:expr) => {
+                #[test]
+                fn $short_name() {
+                    let input = $input;
+                    let err = toml_edit::de::from_str::<pageserver_api::config::ConfigToml>(&input)
+                        .expect_err("some_invalid_field is an invalid field");
+                    dbg!(&err);
+                    assert!(err.to_string().contains("some_invalid_field"));
+                }
+            };
+        }
+        use indoc::indoc;
 
-        let workdir = tempdir_path.join("workdir");
-        fs::create_dir_all(&workdir)?;
+        test!(
+            toplevel,
+            indoc! {r#"
+                some_invalid_field = 23
+            "#}
+        );
 
-        let pg_distrib_dir = tempdir_path.join("pg_distrib");
-        let pg_distrib_dir_versioned = pg_distrib_dir.join(format!("v{DEFAULT_PG_VERSION}"));
-        fs::create_dir_all(&pg_distrib_dir_versioned)?;
-        let postgres_bin_dir = pg_distrib_dir_versioned.join("bin");
-        fs::create_dir_all(&postgres_bin_dir)?;
-        fs::write(postgres_bin_dir.join("postgres"), "I'm postgres, trust me")?;
+        test!(
+            toplevel_nested,
+            indoc! {r#"
+                [some_invalid_field]
+                foo = 23
+            "#}
+        );
 
-        Ok((workdir, pg_distrib_dir))
+        test!(
+            disk_usage_based_eviction,
+            indoc! {r#"
+                [disk_usage_based_eviction]
+                some_invalid_field = 23
+            "#}
+        );
+
+        test!(
+            tenant_config,
+            indoc! {r#"
+                [tenant_config]
+                some_invalid_field = 23
+            "#}
+        );
+
+        test!(
+            l0_flush,
+            indoc! {r#"
+                [l0_flush]
+                mode = "direct"
+                some_invalid_field = 23
+            "#}
+        );
+
+        // TODO: fix this => https://github.com/neondatabase/neon/issues/8915
+        // test!(
+        //     remote_storage_config,
+        //     indoc! {r#"
+        //         [remote_storage_config]
+        //         local_path = "/nonexistent"
+        //         some_invalid_field = 23
+        //     "#}
+        // );
+
+        test!(
+            compact_level0_phase1_value_access,
+            indoc! {r#"
+                [compact_level0_phase1_value_access]
+                mode = "streaming-kmerge"
+                some_invalid_field = 23
+            "#}
+        );
     }
 }
diff --git a/pageserver/src/disk_usage_eviction_task.rs b/pageserver/src/disk_usage_eviction_task.rs
index 5e4a49bc56..a58fa2c0b1 100644
--- a/pageserver/src/disk_usage_eviction_task.rs
+++ b/pageserver/src/disk_usage_eviction_task.rs
@@ -41,19 +41,15 @@
 // - The `#[allow(dead_code)]` above various structs are to suppress warnings about only the Debug impl
 //   reading these fields. We use the Debug impl for semi-structured logging, though.
 
-use std::{
-    sync::Arc,
-    time::{Duration, SystemTime},
-};
+use std::{sync::Arc, time::SystemTime};
 
 use anyhow::Context;
-use pageserver_api::shard::TenantShardId;
+use pageserver_api::{config::DiskUsageEvictionTaskConfig, shard::TenantShardId};
 use remote_storage::GenericRemoteStorage;
-use serde::{Deserialize, Serialize};
+use serde::Serialize;
 use tokio::time::Instant;
 use tokio_util::sync::CancellationToken;
 use tracing::{debug, error, info, instrument, warn, Instrument};
-use utils::serde_percent::Percent;
 use utils::{completion, id::TimelineId};
 
 use crate::{
@@ -69,23 +65,9 @@ use crate::{
     CancellableTask, DiskUsageEvictionTask,
 };
 
-#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
-pub struct DiskUsageEvictionTaskConfig {
-    pub max_usage_pct: Percent,
-    pub min_avail_bytes: u64,
-    #[serde(with = "humantime_serde")]
-    pub period: Duration,
-    #[cfg(feature = "testing")]
-    pub mock_statvfs: Option<crate::statvfs::mock::Behavior>,
-    /// Select sorting for evicted layers
-    #[serde(default)]
-    pub eviction_order: EvictionOrder,
-}
-
 /// Selects the sort order for eviction candidates *after* per tenant `min_resident_size`
 /// partitioning.
-#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
-#[serde(tag = "type", content = "args")]
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
 pub enum EvictionOrder {
     /// Order the layers to be evicted by how recently they have been accessed relatively within
     /// the set of resident layers of a tenant.
@@ -96,23 +78,22 @@ pub enum EvictionOrder {
         /// we read tenants is deterministic. If we find the need to use this as `false`, we need
         /// to ensure nondeterminism by adding in a random number to break the
         /// `relative_last_activity==0.0` ties.
-        #[serde(default = "default_highest_layer_count_loses_first")]
         highest_layer_count_loses_first: bool,
     },
 }
 
-impl Default for EvictionOrder {
-    fn default() -> Self {
-        Self::RelativeAccessed {
-            highest_layer_count_loses_first: true,
+impl From<pageserver_api::config::EvictionOrder> for EvictionOrder {
+    fn from(value: pageserver_api::config::EvictionOrder) -> Self {
+        match value {
+            pageserver_api::config::EvictionOrder::RelativeAccessed {
+                highest_layer_count_loses_first,
+            } => Self::RelativeAccessed {
+                highest_layer_count_loses_first,
+            },
         }
     }
 }
 
-fn default_highest_layer_count_loses_first() -> bool {
-    true
-}
-
 impl EvictionOrder {
     fn sort(&self, candidates: &mut [(EvictionPartition, EvictionCandidate)]) {
         use EvictionOrder::*;
@@ -295,7 +276,7 @@ async fn disk_usage_eviction_task_iteration(
         storage,
         usage_pre,
         tenant_manager,
-        task_config.eviction_order,
+        task_config.eviction_order.into(),
         cancel,
     )
     .await;
@@ -1257,7 +1238,6 @@ mod filesystem_level_usage {
 
     #[test]
     fn max_usage_pct_pressure() {
-        use super::EvictionOrder;
         use super::Usage as _;
         use std::time::Duration;
         use utils::serde_percent::Percent;
@@ -1269,7 +1249,7 @@ mod filesystem_level_usage {
                 period: Duration::MAX,
                 #[cfg(feature = "testing")]
                 mock_statvfs: None,
-                eviction_order: EvictionOrder::default(),
+                eviction_order: pageserver_api::config::EvictionOrder::default(),
             },
             total_bytes: 100_000,
             avail_bytes: 0,
diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs
index 90ae6c5557..d645f3b7b6 100644
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -2076,7 +2076,7 @@ async fn disk_usage_eviction_run(
         evict_bytes: u64,
 
         #[serde(default)]
-        eviction_order: crate::disk_usage_eviction_task::EvictionOrder,
+        eviction_order: pageserver_api::config::EvictionOrder,
     }
 
     #[derive(Debug, Clone, Copy, serde::Serialize)]
@@ -2112,7 +2112,7 @@ async fn disk_usage_eviction_run(
         &state.remote_storage,
         usage,
         &state.tenant_manager,
-        config.eviction_order,
+        config.eviction_order.into(),
         &cancel,
     )
     .await;
diff --git a/pageserver/src/l0_flush.rs b/pageserver/src/l0_flush.rs
index 313a7961a6..491c9fb96c 100644
--- a/pageserver/src/l0_flush.rs
+++ b/pageserver/src/l0_flush.rs
@@ -1,9 +1,7 @@
 use std::{num::NonZeroUsize, sync::Arc};
 
-#[derive(Debug, PartialEq, Eq, Clone, serde::Deserialize)]
-#[serde(tag = "mode", rename_all = "kebab-case", deny_unknown_fields)]
+#[derive(Debug, PartialEq, Eq, Clone)]
 pub enum L0FlushConfig {
-    #[serde(rename_all = "snake_case")]
     Direct { max_concurrency: NonZeroUsize },
 }
 
@@ -16,6 +14,16 @@ impl Default for L0FlushConfig {
     }
 }
 
+impl From<pageserver_api::models::L0FlushConfig> for L0FlushConfig {
+    fn from(config: pageserver_api::models::L0FlushConfig) -> Self {
+        match config {
+            pageserver_api::models::L0FlushConfig::Direct { max_concurrency } => {
+                Self::Direct { max_concurrency }
+            }
+        }
+    }
+}
+
 #[derive(Clone)]
 pub struct L0FlushGlobalState(Arc<Inner>);
 
diff --git a/pageserver/src/statvfs.rs b/pageserver/src/statvfs.rs
index ede1791afa..5a6f6e5176 100644
--- a/pageserver/src/statvfs.rs
+++ b/pageserver/src/statvfs.rs
@@ -60,32 +60,7 @@ pub mod mock {
     use regex::Regex;
     use tracing::log::info;
 
-    #[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
-    #[serde(tag = "type")]
-    pub enum Behavior {
-        Success {
-            blocksize: u64,
-            total_blocks: u64,
-            name_filter: Option<utils::serde_regex::Regex>,
-        },
-        Failure {
-            mocked_error: MockedError,
-        },
-    }
-
-    #[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
-    #[allow(clippy::upper_case_acronyms)]
-    pub enum MockedError {
-        EIO,
-    }
-
-    impl From<MockedError> for nix::Error {
-        fn from(e: MockedError) -> Self {
-            match e {
-                MockedError::EIO => nix::Error::EIO,
-            }
-        }
-    }
+    pub use pageserver_api::config::statvfs::mock::Behavior;
 
     pub fn get(tenants_dir: &Utf8Path, behavior: &Behavior) -> nix::Result<Statvfs> {
         info!("running mocked statvfs");
@@ -116,6 +91,7 @@ pub mod mock {
                     block_size: *blocksize,
                 })
             }
+            #[cfg(feature = "testing")]
             Behavior::Failure { mocked_error } => Err((*mocked_error).into()),
         }
     }
diff --git a/pageserver/src/tenant/config.rs b/pageserver/src/tenant/config.rs
index 48ff17db94..7e0344666b 100644
--- a/pageserver/src/tenant/config.rs
+++ b/pageserver/src/tenant/config.rs
@@ -9,11 +9,10 @@
 //! may lead to a data loss.
 //!
 use anyhow::bail;
+pub(crate) use pageserver_api::config::TenantConfigToml as TenantConf;
 use pageserver_api::models::AuxFilePolicy;
-use pageserver_api::models::CompactionAlgorithm;
 use pageserver_api::models::CompactionAlgorithmSettings;
 use pageserver_api::models::EvictionPolicy;
-use pageserver_api::models::LsnLease;
 use pageserver_api::models::{self, ThrottleConfig};
 use pageserver_api::shard::{ShardCount, ShardIdentity, ShardNumber, ShardStripeSize};
 use serde::de::IntoDeserializer;
@@ -23,50 +22,6 @@ use std::num::NonZeroU64;
 use std::time::Duration;
 use utils::generation::Generation;
 
-pub mod defaults {
-
-    // FIXME: This current value is very low. I would imagine something like 1 GB or 10 GB
-    // would be more appropriate. But a low value forces the code to be exercised more,
-    // which is good for now to trigger bugs.
-    // This parameter actually determines L0 layer file size.
-    pub const DEFAULT_CHECKPOINT_DISTANCE: u64 = 256 * 1024 * 1024;
-    pub const DEFAULT_CHECKPOINT_TIMEOUT: &str = "10 m";
-
-    // FIXME the below configs are only used by legacy algorithm. The new algorithm
-    // has different parameters.
-
-    // Target file size, when creating image and delta layers.
-    // This parameter determines L1 layer file size.
-    pub const DEFAULT_COMPACTION_TARGET_SIZE: u64 = 128 * 1024 * 1024;
-
-    pub const DEFAULT_COMPACTION_PERIOD: &str = "20 s";
-    pub const DEFAULT_COMPACTION_THRESHOLD: usize = 10;
-    pub const DEFAULT_COMPACTION_ALGORITHM: super::CompactionAlgorithm =
-        super::CompactionAlgorithm::Legacy;
-
-    pub const DEFAULT_GC_HORIZON: u64 = 64 * 1024 * 1024;
-
-    // Large DEFAULT_GC_PERIOD is fine as long as PITR_INTERVAL is larger.
-    // If there's a need to decrease this value, first make sure that GC
-    // doesn't hold a layer map write lock for non-trivial operations.
-    // Relevant: https://github.com/neondatabase/neon/issues/3394
-    pub const DEFAULT_GC_PERIOD: &str = "1 hr";
-    pub const DEFAULT_IMAGE_CREATION_THRESHOLD: usize = 3;
-    pub const DEFAULT_PITR_INTERVAL: &str = "7 days";
-    pub const DEFAULT_WALRECEIVER_CONNECT_TIMEOUT: &str = "10 seconds";
-    pub const DEFAULT_WALRECEIVER_LAGGING_WAL_TIMEOUT: &str = "10 seconds";
-    // The default limit on WAL lag should be set to avoid causing disconnects under high throughput
-    // scenarios: since the broker stats are updated ~1/s, a value of 1GiB should be sufficient for
-    // throughputs up to 1GiB/s per timeline.
-    pub const DEFAULT_MAX_WALRECEIVER_LSN_WAL_LAG: u64 = 1024 * 1024 * 1024;
-    pub const DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD: &str = "24 hour";
-    // By default ingest enough WAL for two new L0 layers before checking if new image
-    // image layers should be created.
-    pub const DEFAULT_IMAGE_LAYER_CREATION_CHECK_THRESHOLD: u8 = 2;
-
-    pub const DEFAULT_INGEST_BATCH_SIZE: u64 = 100;
-}
-
 #[derive(Debug, Copy, Clone, Serialize, Deserialize, PartialEq, Eq)]
 pub(crate) enum AttachmentMode {
     /// Our generation is current as far as we know, and as far as we know we are the only attached
@@ -281,96 +236,20 @@ impl LocationConf {
     }
 }
 
-/// A tenant's calcuated configuration, which is the result of merging a
-/// tenant's TenantConfOpt with the global TenantConf from PageServerConf.
-///
-/// For storing and transmitting individual tenant's configuration, see
-/// TenantConfOpt.
-#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
-pub struct TenantConf {
-    // Flush out an inmemory layer, if it's holding WAL older than this
-    // This puts a backstop on how much WAL needs to be re-digested if the
-    // page server crashes.
-    // This parameter actually determines L0 layer file size.
-    pub checkpoint_distance: u64,
-    // Inmemory layer is also flushed at least once in checkpoint_timeout to
-    // eventually upload WAL after activity is stopped.
-    #[serde(with = "humantime_serde")]
-    pub checkpoint_timeout: Duration,
-    // Target file size, when creating image and delta layers.
-    // This parameter determines L1 layer file size.
-    pub compaction_target_size: u64,
-    // How often to check if there's compaction work to be done.
-    // Duration::ZERO means automatic compaction is disabled.
-    #[serde(with = "humantime_serde")]
-    pub compaction_period: Duration,
-    // Level0 delta layer threshold for compaction.
-    pub compaction_threshold: usize,
-    pub compaction_algorithm: CompactionAlgorithmSettings,
-    // Determines how much history is retained, to allow
-    // branching and read replicas at an older point in time.
-    // The unit is #of bytes of WAL.
-    // Page versions older than this are garbage collected away.
-    pub gc_horizon: u64,
-    // Interval at which garbage collection is triggered.
-    // Duration::ZERO means automatic GC is disabled
-    #[serde(with = "humantime_serde")]
-    pub gc_period: Duration,
-    // Delta layer churn threshold to create L1 image layers.
-    pub image_creation_threshold: usize,
-    // Determines how much history is retained, to allow
-    // branching and read replicas at an older point in time.
-    // The unit is time.
-    // Page versions older than this are garbage collected away.
-    #[serde(with = "humantime_serde")]
-    pub pitr_interval: Duration,
-    /// Maximum amount of time to wait while opening a connection to receive wal, before erroring.
-    #[serde(with = "humantime_serde")]
-    pub walreceiver_connect_timeout: Duration,
-    /// Considers safekeepers stalled after no WAL updates were received longer than this threshold.
-    /// A stalled safekeeper will be changed to a newer one when it appears.
-    #[serde(with = "humantime_serde")]
-    pub lagging_wal_timeout: Duration,
-    /// Considers safekeepers lagging when their WAL is behind another safekeeper for more than this threshold.
-    /// A lagging safekeeper will be changed after `lagging_wal_timeout` time elapses since the last WAL update,
-    /// to avoid eager reconnects.
-    pub max_lsn_wal_lag: NonZeroU64,
-    pub eviction_policy: EvictionPolicy,
-    pub min_resident_size_override: Option<u64>,
-    // See the corresponding metric's help string.
-    #[serde(with = "humantime_serde")]
-    pub evictions_low_residence_duration_metric_threshold: Duration,
-
-    /// If non-zero, the period between uploads of a heatmap from attached tenants.  This
-    /// may be disabled if a Tenant will not have secondary locations: only secondary
-    /// locations will use the heatmap uploaded by attached locations.
-    #[serde(with = "humantime_serde")]
-    pub heatmap_period: Duration,
-
-    /// If true then SLRU segments are dowloaded on demand, if false SLRU segments are included in basebackup
-    pub lazy_slru_download: bool,
-
-    pub timeline_get_throttle: pageserver_api::models::ThrottleConfig,
-
-    // How much WAL must be ingested before checking again whether a new image layer is required.
-    // Expresed in multiples of checkpoint distance.
-    pub image_layer_creation_check_threshold: u8,
-
-    /// Switch to a new aux file policy. Switching this flag requires the user has not written any aux file into
-    /// the storage before, and this flag cannot be switched back. Otherwise there will be data corruptions.
-    /// There is a `last_aux_file_policy` flag which gets persisted in `index_part.json` once the first aux
-    /// file is written.
-    pub switch_aux_file_policy: AuxFilePolicy,
-
-    /// The length for an explicit LSN lease request.
-    /// Layers needed to reconstruct pages at LSN will not be GC-ed during this interval.
-    #[serde(with = "humantime_serde")]
-    pub lsn_lease_length: Duration,
-
-    /// The length for an implicit LSN lease granted as part of `get_lsn_by_timestamp` request.
-    /// Layers needed to reconstruct pages at LSN will not be GC-ed during this interval.
-    #[serde(with = "humantime_serde")]
-    pub lsn_lease_length_for_ts: Duration,
+impl Default for LocationConf {
+    // TODO: this should be removed once tenant loading can guarantee that we are never
+    // loading from a directory without a configuration.
+    // => tech debt since https://github.com/neondatabase/neon/issues/1555
+    fn default() -> Self {
+        Self {
+            mode: LocationMode::Attached(AttachedLocationConfig {
+                generation: Generation::none(),
+                attach_mode: AttachmentMode::Single,
+            }),
+            tenant_conf: TenantConfOpt::default(),
+            shard: ShardIdentity::unsharded(),
+        }
+    }
 }
 
 /// Same as TenantConf, but this struct preserves the information about
@@ -545,51 +424,6 @@ impl TenantConfOpt {
     }
 }
 
-impl Default for TenantConf {
-    fn default() -> Self {
-        use defaults::*;
-        Self {
-            checkpoint_distance: DEFAULT_CHECKPOINT_DISTANCE,
-            checkpoint_timeout: humantime::parse_duration(DEFAULT_CHECKPOINT_TIMEOUT)
-                .expect("cannot parse default checkpoint timeout"),
-            compaction_target_size: DEFAULT_COMPACTION_TARGET_SIZE,
-            compaction_period: humantime::parse_duration(DEFAULT_COMPACTION_PERIOD)
-                .expect("cannot parse default compaction period"),
-            compaction_threshold: DEFAULT_COMPACTION_THRESHOLD,
-            compaction_algorithm: CompactionAlgorithmSettings {
-                kind: DEFAULT_COMPACTION_ALGORITHM,
-            },
-            gc_horizon: DEFAULT_GC_HORIZON,
-            gc_period: humantime::parse_duration(DEFAULT_GC_PERIOD)
-                .expect("cannot parse default gc period"),
-            image_creation_threshold: DEFAULT_IMAGE_CREATION_THRESHOLD,
-            pitr_interval: humantime::parse_duration(DEFAULT_PITR_INTERVAL)
-                .expect("cannot parse default PITR interval"),
-            walreceiver_connect_timeout: humantime::parse_duration(
-                DEFAULT_WALRECEIVER_CONNECT_TIMEOUT,
-            )
-            .expect("cannot parse default walreceiver connect timeout"),
-            lagging_wal_timeout: humantime::parse_duration(DEFAULT_WALRECEIVER_LAGGING_WAL_TIMEOUT)
-                .expect("cannot parse default walreceiver lagging wal timeout"),
-            max_lsn_wal_lag: NonZeroU64::new(DEFAULT_MAX_WALRECEIVER_LSN_WAL_LAG)
-                .expect("cannot parse default max walreceiver Lsn wal lag"),
-            eviction_policy: EvictionPolicy::NoEviction,
-            min_resident_size_override: None,
-            evictions_low_residence_duration_metric_threshold: humantime::parse_duration(
-                DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD,
-            )
-            .expect("cannot parse default evictions_low_residence_duration_metric_threshold"),
-            heatmap_period: Duration::ZERO,
-            lazy_slru_download: false,
-            timeline_get_throttle: crate::tenant::throttle::Config::disabled(),
-            image_layer_creation_check_threshold: DEFAULT_IMAGE_LAYER_CREATION_CHECK_THRESHOLD,
-            switch_aux_file_policy: AuxFilePolicy::default_tenant_config(),
-            lsn_lease_length: LsnLease::DEFAULT_LENGTH,
-            lsn_lease_length_for_ts: LsnLease::DEFAULT_LENGTH_FOR_TS,
-        }
-    }
-}
-
 impl TryFrom<&'_ models::TenantConfig> for TenantConfOpt {
     type Error = anyhow::Error;
 
diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs
index b8e9a98149..6a2cd94232 100644
--- a/pageserver/src/tenant/storage_layer/delta_layer.rs
+++ b/pageserver/src/tenant/storage_layer/delta_layer.rs
@@ -39,7 +39,7 @@ use crate::tenant::disk_btree::{
 use crate::tenant::storage_layer::layer::S3_UPLOAD_LIMIT;
 use crate::tenant::timeline::GetVectoredError;
 use crate::tenant::vectored_blob_io::{
-    BlobFlag, MaxVectoredReadBytes, StreamingVectoredReadPlanner, VectoredBlobReader, VectoredRead,
+    BlobFlag, StreamingVectoredReadPlanner, VectoredBlobReader, VectoredRead,
     VectoredReadCoalesceMode, VectoredReadPlanner,
 };
 use crate::tenant::PageReconstructError;
@@ -52,6 +52,7 @@ use bytes::BytesMut;
 use camino::{Utf8Path, Utf8PathBuf};
 use futures::StreamExt;
 use itertools::Itertools;
+use pageserver_api::config::MaxVectoredReadBytes;
 use pageserver_api::keyspace::KeySpace;
 use pageserver_api::models::ImageCompressionAlgorithm;
 use pageserver_api::shard::TenantShardId;
diff --git a/pageserver/src/tenant/storage_layer/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs
index 4a095c564d..77ce1ae670 100644
--- a/pageserver/src/tenant/storage_layer/image_layer.rs
+++ b/pageserver/src/tenant/storage_layer/image_layer.rs
@@ -34,8 +34,7 @@ use crate::tenant::disk_btree::{
 };
 use crate::tenant::timeline::GetVectoredError;
 use crate::tenant::vectored_blob_io::{
-    BlobFlag, MaxVectoredReadBytes, StreamingVectoredReadPlanner, VectoredBlobReader, VectoredRead,
-    VectoredReadPlanner,
+    BlobFlag, StreamingVectoredReadPlanner, VectoredBlobReader, VectoredRead, VectoredReadPlanner,
 };
 use crate::tenant::{PageReconstructError, Timeline};
 use crate::virtual_file::owned_buffers_io::io_buf_ext::IoBufExt;
@@ -46,6 +45,7 @@ use bytes::{Bytes, BytesMut};
 use camino::{Utf8Path, Utf8PathBuf};
 use hex;
 use itertools::Itertools;
+use pageserver_api::config::MaxVectoredReadBytes;
 use pageserver_api::keyspace::KeySpace;
 use pageserver_api::shard::{ShardIdentity, TenantShardId};
 use rand::{distributions::Alphanumeric, Rng};
diff --git a/pageserver/src/tenant/storage_layer/inmemory_layer.rs b/pageserver/src/tenant/storage_layer/inmemory_layer.rs
index 2c19e5b19f..e487bee1f2 100644
--- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs
+++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs
@@ -215,7 +215,7 @@ impl IndexEntry {
 
     const _ASSERT_DEFAULT_CHECKPOINT_DISTANCE_IS_VALID: () = {
         let res = Self::validate_checkpoint_distance(
-            crate::tenant::config::defaults::DEFAULT_CHECKPOINT_DISTANCE,
+            pageserver_api::config::tenant_conf_defaults::DEFAULT_CHECKPOINT_DISTANCE,
         );
         if res.is_err() {
             panic!("default checkpoint distance is valid")
diff --git a/pageserver/src/tenant/tasks.rs b/pageserver/src/tenant/tasks.rs
index f5680ced90..478e9bb4f0 100644
--- a/pageserver/src/tenant/tasks.rs
+++ b/pageserver/src/tenant/tasks.rs
@@ -10,7 +10,6 @@ use crate::context::{DownloadBehavior, RequestContext};
 use crate::metrics::TENANT_TASK_EVENTS;
 use crate::task_mgr;
 use crate::task_mgr::{TaskKind, BACKGROUND_RUNTIME};
-use crate::tenant::config::defaults::DEFAULT_COMPACTION_PERIOD;
 use crate::tenant::throttle::Stats;
 use crate::tenant::timeline::CompactionError;
 use crate::tenant::{Tenant, TenantState};
@@ -456,9 +455,11 @@ async fn ingest_housekeeping_loop(tenant: Arc<Tenant>, cancel: CancellationToken
 
             // If compaction period is set to zero (to disable it), then we will use a reasonable default
             let period = if period == Duration::ZERO {
-                humantime::Duration::from_str(DEFAULT_COMPACTION_PERIOD)
-                    .unwrap()
-                    .into()
+                humantime::Duration::from_str(
+                    pageserver_api::config::tenant_conf_defaults::DEFAULT_COMPACTION_PERIOD,
+                )
+                .unwrap()
+                .into()
             } else {
                 period
             };
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index 3b8f19a6c0..262dccac7d 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -66,7 +66,6 @@ use std::{
 use crate::{
     aux_file::AuxFileSizeEstimator,
     tenant::{
-        config::defaults::DEFAULT_PITR_INTERVAL,
         layer_map::{LayerMap, SearchResult},
         metadata::TimelineMetadata,
         storage_layer::{inmemory_layer::IndexEntry, PersistentLayerDesc},
@@ -102,6 +101,7 @@ use crate::{
     pgdatadir_mapping::{AuxFilesDirectory, DirectoryKind},
     virtual_file::{MaybeFatalIo, VirtualFile},
 };
+use pageserver_api::config::tenant_conf_defaults::DEFAULT_PITR_INTERVAL;
 
 use crate::config::PageServerConf;
 use crate::keyspace::{KeyPartitioning, KeySpace};
diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs
index aad75ac59c..6b9c8386f7 100644
--- a/pageserver/src/tenant/timeline/compaction.rs
+++ b/pageserver/src/tenant/timeline/compaction.rs
@@ -19,6 +19,7 @@ use bytes::Bytes;
 use enumset::EnumSet;
 use fail::fail_point;
 use itertools::Itertools;
+use pageserver_api::config::{CompactL0BypassPageCacheValidation, CompactL0Phase1ValueAccess};
 use pageserver_api::key::KEY_SIZE;
 use pageserver_api::keyspace::ShardedRange;
 use pageserver_api::shard::{ShardCount, ShardIdentity, TenantShardId};
@@ -29,7 +30,6 @@ use utils::id::TimelineId;
 
 use crate::context::{AccessStatsBehavior, RequestContext, RequestContextBuilder};
 use crate::page_cache;
-use crate::tenant::config::defaults::{DEFAULT_CHECKPOINT_DISTANCE, DEFAULT_COMPACTION_THRESHOLD};
 use crate::tenant::remote_timeline_client::WaitCompletionError;
 use crate::tenant::storage_layer::merge_iterator::MergeIterator;
 use crate::tenant::storage_layer::split_writer::{
@@ -43,6 +43,9 @@ use crate::tenant::timeline::{drop_rlock, DeltaLayerWriter, ImageLayerWriter};
 use crate::tenant::timeline::{Layer, ResidentLayer};
 use crate::tenant::DeltaLayer;
 use crate::virtual_file::{MaybeFatalIo, VirtualFile};
+use pageserver_api::config::tenant_conf_defaults::{
+    DEFAULT_CHECKPOINT_DISTANCE, DEFAULT_COMPACTION_THRESHOLD,
+};
 
 use crate::keyspace::KeySpace;
 use crate::repository::{Key, Value};
@@ -1433,43 +1436,6 @@ impl TryFrom<CompactLevel0Phase1StatsBuilder> for CompactLevel0Phase1Stats {
     }
 }
 
-#[derive(Debug, PartialEq, Eq, Clone, serde::Deserialize, serde::Serialize)]
-#[serde(tag = "mode", rename_all = "kebab-case", deny_unknown_fields)]
-pub enum CompactL0Phase1ValueAccess {
-    /// The old way.
-    PageCachedBlobIo,
-    /// The new way.
-    StreamingKmerge {
-        /// If set, we run both the old way and the new way, validate that
-        /// they are identical (=> [`CompactL0BypassPageCacheValidation`]),
-        /// and if the validation fails,
-        /// - in tests: fail them with a panic or
-        /// - in prod, log a rate-limited warning and use the old way's results.
-        ///
-        /// If not set, we only run the new way and trust its results.
-        validate: Option<CompactL0BypassPageCacheValidation>,
-    },
-}
-
-/// See [`CompactL0Phase1ValueAccess::StreamingKmerge`].
-#[derive(Debug, PartialEq, Eq, Clone, serde::Deserialize, serde::Serialize)]
-#[serde(rename_all = "kebab-case")]
-pub enum CompactL0BypassPageCacheValidation {
-    /// Validate that the series of (key, lsn) pairs are the same.
-    KeyLsn,
-    /// Validate that the entire output of old and new way is identical.
-    KeyLsnValue,
-}
-
-impl Default for CompactL0Phase1ValueAccess {
-    fn default() -> Self {
-        CompactL0Phase1ValueAccess::StreamingKmerge {
-            // TODO(https://github.com/neondatabase/neon/issues/8184): change to None once confident
-            validate: Some(CompactL0BypassPageCacheValidation::KeyLsnValue),
-        }
-    }
-}
-
 impl Timeline {
     /// Entry point for new tiered compaction algorithm.
     ///
diff --git a/pageserver/src/tenant/vectored_blob_io.rs b/pageserver/src/tenant/vectored_blob_io.rs
index 146bcf0e35..4d51dc442d 100644
--- a/pageserver/src/tenant/vectored_blob_io.rs
+++ b/pageserver/src/tenant/vectored_blob_io.rs
@@ -16,7 +16,6 @@
 //! Note that the vectored blob api does *not* go through the page cache.
 
 use std::collections::BTreeMap;
-use std::num::NonZeroUsize;
 
 use bytes::BytesMut;
 use pageserver_api::key::Key;
@@ -29,9 +28,6 @@ use crate::context::RequestContext;
 use crate::tenant::blob_io::{BYTE_UNCOMPRESSED, BYTE_ZSTD, LEN_COMPRESSION_BIT_MASK};
 use crate::virtual_file::{self, VirtualFile};
 
-#[derive(Copy, Clone, Debug, PartialEq, Eq)]
-pub struct MaxVectoredReadBytes(pub NonZeroUsize);
-
 /// Metadata bundled with the start and end offset of a blob.
 #[derive(Copy, Clone, Debug)]
 pub struct BlobMeta {
diff --git a/pageserver/src/virtual_file.rs b/pageserver/src/virtual_file.rs
index 97d966e2da..ed6ff86c10 100644
--- a/pageserver/src/virtual_file.rs
+++ b/pageserver/src/virtual_file.rs
@@ -10,7 +10,6 @@
 //! This is similar to PostgreSQL's virtual file descriptor facility in
 //! src/backend/storage/file/fd.c
 //!
-use crate::config::defaults::DEFAULT_IO_BUFFER_ALIGNMENT;
 use crate::context::RequestContext;
 use crate::metrics::{StorageIoOperation, STORAGE_IO_SIZE, STORAGE_IO_TIME_METRIC};
 
@@ -19,6 +18,7 @@ use crate::tenant::TENANTS_SEGMENT_NAME;
 use camino::{Utf8Path, Utf8PathBuf};
 use once_cell::sync::OnceCell;
 use owned_buffers_io::io_buf_ext::FullSlice;
+use pageserver_api::config::defaults::DEFAULT_IO_BUFFER_ALIGNMENT;
 use pageserver_api::shard::TenantShardId;
 use std::fs::File;
 use std::io::{Error, ErrorKind, Seek, SeekFrom};
diff --git a/pageserver/src/virtual_file/io_engine.rs b/pageserver/src/virtual_file/io_engine.rs
index faef1ba9ff..ccde90ee1a 100644
--- a/pageserver/src/virtual_file/io_engine.rs
+++ b/pageserver/src/virtual_file/io_engine.rs
@@ -84,9 +84,14 @@ pub(crate) fn get() -> IoEngine {
                         }
                     },
                     Err(std::env::VarError::NotPresent) => {
-                        crate::config::defaults::DEFAULT_VIRTUAL_FILE_IO_ENGINE
-                            .parse()
-                            .unwrap()
+                        #[cfg(target_os = "linux")]
+                        {
+                            IoEngineKind::TokioEpollUring
+                        }
+                        #[cfg(not(target_os = "linux"))]
+                        {
+                            IoEngineKind::StdFs
+                        }
                     }
                     Err(std::env::VarError::NotUnicode(_)) => {
                         panic!("env var {env_var_name} is not unicode");
diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index 890538b86a..2df45a7e0e 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -24,7 +24,20 @@ from functools import cached_property, partial
 from itertools import chain, product
 from pathlib import Path
 from types import TracebackType
-from typing import Any, Callable, Dict, Iterable, Iterator, List, Optional, Tuple, Type, Union, cast
+from typing import (
+    Any,
+    Callable,
+    Dict,
+    Iterable,
+    Iterator,
+    List,
+    Optional,
+    Tuple,
+    Type,
+    TypeVar,
+    Union,
+    cast,
+)
 from urllib.parse import quote, urlparse
 
 import asyncpg
@@ -90,6 +103,8 @@ from fixtures.utils import AuxFileStore as AuxFileStore  # reexport
 
 from .neon_api import NeonAPI, NeonApiEndpoint
 
+T = TypeVar("T")
+
 """
 This file contains pytest fixtures. A fixture is a test resource that can be
 summoned by placing its name in the test's arguments.
@@ -2986,16 +3001,17 @@ class NeonPageserver(PgProtocol, LogUtils):
     def config_toml_path(self) -> Path:
         return self.workdir / "pageserver.toml"
 
-    def edit_config_toml(self, edit_fn: Callable[[Dict[str, Any]], None]):
+    def edit_config_toml(self, edit_fn: Callable[[Dict[str, Any]], T]) -> T:
         """
         Edit the pageserver's config toml file in place.
         """
         path = self.config_toml_path
         with open(path, "r") as f:
             config = toml.load(f)
-        edit_fn(config)
+        res = edit_fn(config)
         with open(path, "w") as f:
             toml.dump(config, f)
+        return res
 
     def patch_config_toml_nonrecursive(self, patch: Dict[str, Any]) -> Dict[str, Any]:
         """
diff --git a/test_runner/regress/test_pageserver_generations.py b/test_runner/regress/test_pageserver_generations.py
index 73af7950f1..ebf58d2bd1 100644
--- a/test_runner/regress/test_pageserver_generations.py
+++ b/test_runner/regress/test_pageserver_generations.py
@@ -142,11 +142,10 @@ def test_generations_upgrade(neon_env_builder: NeonEnvBuilder):
     # We will start a pageserver with no control_plane_api set, so it won't be able to self-register
     env.storage_controller.node_register(env.pageserver)
 
-    replaced_config = env.pageserver.patch_config_toml_nonrecursive(
-        {
-            "control_plane_api": "",
-        }
-    )
+    def remove_control_plane_api_field(config):
+        return config.pop("control_plane_api")
+
+    control_plane_api = env.pageserver.edit_config_toml(remove_control_plane_api_field)
     env.pageserver.start()
     env.storage_controller.node_configure(env.pageserver.id, {"availability": "Active"})
 
@@ -179,7 +178,11 @@ def test_generations_upgrade(neon_env_builder: NeonEnvBuilder):
 
     env.pageserver.stop()
     # Starting without the override that disabled control_plane_api
-    env.pageserver.patch_config_toml_nonrecursive(replaced_config)
+    env.pageserver.patch_config_toml_nonrecursive(
+        {
+            "control_plane_api": control_plane_api,
+        }
+    )
     env.pageserver.start()
 
     generate_uploads_and_deletions(env, pageserver=env.pageserver, init=False)
diff --git a/test_runner/regress/test_timeline_size.py b/test_runner/regress/test_timeline_size.py
index 642b9e449b..9bf5f8680b 100644
--- a/test_runner/regress/test_timeline_size.py
+++ b/test_runner/regress/test_timeline_size.py
@@ -733,7 +733,7 @@ def test_ondemand_activation(neon_env_builder: NeonEnvBuilder):
 
     # We will run with the limit set to 1, so that once we have one tenant stuck
     # in a pausable failpoint, the rest are prevented from proceeding through warmup.
-    neon_env_builder.pageserver_config_override = "concurrent_tenant_warmup = '1'"
+    neon_env_builder.pageserver_config_override = "concurrent_tenant_warmup = 1"
 
     env = neon_env_builder.init_start()
     pageserver_http = env.pageserver.http_client()
@@ -984,7 +984,7 @@ def test_timeline_logical_size_task_priority(neon_env_builder: NeonEnvBuilder):
 
 
 def test_eager_attach_does_not_queue_up(neon_env_builder: NeonEnvBuilder):
-    neon_env_builder.pageserver_config_override = "concurrent_tenant_warmup = '1'"
+    neon_env_builder.pageserver_config_override = "concurrent_tenant_warmup = 1"
 
     env = neon_env_builder.init_start()
 
@@ -1062,7 +1062,7 @@ def test_eager_attach_does_not_queue_up(neon_env_builder: NeonEnvBuilder):
 @pytest.mark.parametrize("activation_method", ["endpoint", "branch", "delete"])
 def test_lazy_attach_activation(neon_env_builder: NeonEnvBuilder, activation_method: str):
     # env.initial_tenant will take up this permit when attaching with lazy because of a failpoint activated after restart
-    neon_env_builder.pageserver_config_override = "concurrent_tenant_warmup = '1'"
+    neon_env_builder.pageserver_config_override = "concurrent_tenant_warmup = 1"
 
     env = neon_env_builder.init_start()
 

From efe03d5a1ccce8e0f53e733d61fd0e3d0dd904f8 Mon Sep 17 00:00:00 2001
From: Joonas Koivunen <joonas@neon.tech>
Date: Thu, 5 Sep 2024 16:29:48 +0300
Subject: [PATCH 094/159] build: sync between benchies (#8919)

Sometimes, the benchmarks fail to start up pageserver in 10s without any
obvious reason. Benchmarks run sequentially on otherwise idle runners.
Try running `sync(2)` after each bench to force a cleaner slate.

Implement this via:
- SYNC_AFTER_EACH_TEST environment variable enabled autouse fixture
- autouse fixture seems to be outermost fixture, so it works as expected
- set SYNC_AFTER_EACH_TEST=true for benchmarks in build_and_test
workflow

Evidence:
https://neon-github-public-dev.s3.amazonaws.com/reports/main/10678984691/index.html#suites/5008d72a1ba3c0d618a030a938fc035c/1210266507534c0f/

---------

Co-authored-by: Alexander Bayandin <alexander@neon.tech>
---
 .github/workflows/build_and_test.yml     |  1 +
 test_runner/fixtures/compare_fixtures.py | 26 ++++++++++++++++++++++++
 2 files changed, 27 insertions(+)

diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml
index 53d33b420f..ee5fd1b0c6 100644
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -286,6 +286,7 @@ jobs:
           PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
           TEST_RESULT_CONNSTR: "${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}"
           PAGESERVER_VIRTUAL_FILE_IO_ENGINE: tokio-epoll-uring
+          SYNC_AFTER_EACH_TEST: true
       # XXX: no coverage data handling here, since benchmarks are run on release builds,
       # while coverage is currently collected for the debug ones
 
diff --git a/test_runner/fixtures/compare_fixtures.py b/test_runner/fixtures/compare_fixtures.py
index 98a9dd7184..7c4a8db36f 100644
--- a/test_runner/fixtures/compare_fixtures.py
+++ b/test_runner/fixtures/compare_fixtures.py
@@ -1,3 +1,5 @@
+import os
+import time
 from abc import ABC, abstractmethod
 from contextlib import _GeneratorContextManager, contextmanager
 
@@ -8,6 +10,7 @@ import pytest
 from _pytest.fixtures import FixtureRequest
 
 from fixtures.benchmark_fixture import MetricReport, NeonBenchmarker
+from fixtures.log_helper import log
 from fixtures.neon_fixtures import (
     NeonEnv,
     PgBin,
@@ -333,3 +336,26 @@ def neon_with_baseline(request: FixtureRequest) -> PgCompare:
     fixture = request.getfixturevalue(request.param)
     assert isinstance(fixture, PgCompare), f"test error: fixture {fixture} is not PgCompare"
     return fixture
+
+
+@pytest.fixture(scope="function", autouse=True)
+def sync_after_each_test():
+    # The fixture calls `sync(2)` after each test if `SYNC_AFTER_EACH_TEST` env var is `true`
+    #
+    # In CI, `SYNC_AFTER_EACH_TEST` is set to `true` only for benchmarks (`test_runner/performance`)
+    # that are run on self-hosted runners because some of these tests are pretty write-heavy
+    # and create issues to start the processes within 10s
+    key = "SYNC_AFTER_EACH_TEST"
+    enabled = os.environ.get(key) == "true"
+
+    yield
+
+    if not enabled:
+        # regress test, or running locally
+        return
+
+    start = time.time()
+    # we only run benches on unices, the method might not exist on windows
+    os.sync()
+    elapsed = time.time() - start
+    log.info(f"called sync after test {elapsed=}")

From ebddda5b7f85587998df00dbf7dc88679459b494 Mon Sep 17 00:00:00 2001
From: vladov <vladov3000@gmail.com>
Date: Thu, 5 Sep 2024 08:06:57 -0700
Subject: [PATCH 095/159] Fix precedence issue causing yielding loop to never
 yield. (#8922)

There is a bug in `yielding_loop` that causes it to never yield.

## Summary of changes

Fixed the bug. `i + 1 % interval == 0` will always evaluate to `i + 1 ==
0` which is false
([Playground](https://play.rust-lang.org/?version=stable&mode=debug&edition=2021&gist=68e6ca393a02113cb7720115c2842e75)).
This function is called in 2 places
[here](https://github.com/neondatabase/neon/blob/99fa1c36004d710c65a47ffefaf66b4b5c6b4ce1/pageserver/src/tenant/secondary/scheduler.rs#L389)
and
[here](https://github.com/neondatabase/neon/blob/99fa1c36004d710c65a47ffefaf66b4b5c6b4ce1/pageserver/src/tenant/secondary/heatmap_uploader.rs#L152)
with `interval == 1000` in both cases.

This may change the performance of the system since now we are yielding
to tokio. Also, this may expose undefined behavior since it is now
possible for tasks to be moved between threads/whatever tokio does to
tasks. However, this was the intention of the author of the code.
---
 libs/utils/src/yielding_loop.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/libs/utils/src/yielding_loop.rs b/libs/utils/src/yielding_loop.rs
index 963279eb4c..41c4cee45d 100644
--- a/libs/utils/src/yielding_loop.rs
+++ b/libs/utils/src/yielding_loop.rs
@@ -23,7 +23,7 @@ where
     for (i, item) in iter.enumerate() {
         visitor(item);
 
-        if i + 1 % interval == 0 {
+        if (i + 1) % interval == 0 {
             tokio::task::yield_now().await;
             if cancel.is_cancelled() {
                 return Err(YieldingLoopError::Cancelled);

From fd12dd942f61a0a22016fa219f4b3a87c81dc0b0 Mon Sep 17 00:00:00 2001
From: Stefan Radig <stefan@neon.tech>
Date: Thu, 5 Sep 2024 17:48:51 +0200
Subject: [PATCH 096/159] Add installation instructions for m4 on mac (#8929)

## Problem
Building on MacOS failed due to missing m4. Although a window was
popping up claiming to install m4, this was not helping.

## Summary of changes
Add instructions to install m4 using brew and link it (thanks to Folke
for helping).
---
 README.md | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/README.md b/README.md
index 735edef0fc..b54956f773 100644
--- a/README.md
+++ b/README.md
@@ -64,6 +64,12 @@ brew install protobuf openssl flex bison icu4c pkg-config
 echo 'export PATH="$(brew --prefix openssl)/bin:$PATH"' >> ~/.zshrc
 ```
 
+If you get errors about missing `m4` you may have to install it manually:
+```
+brew install m4
+brew link --force m4
+```
+
 2. [Install Rust](https://www.rust-lang.org/tools/install)
 ```
 # recommended approach from https://www.rust-lang.org/tools/install

From 04f99a87bfee4da41df2bd5724e73b3646c2bf3e Mon Sep 17 00:00:00 2001
From: Vlad Lazar <vlad@neon.tech>
Date: Thu, 5 Sep 2024 19:14:21 +0100
Subject: [PATCH 097/159] storcon: make pageserver AZ id mandatory (#8856)

## Problem
https://github.com/neondatabase/neon/pull/8852 introduced a new nullable
column for the `nodes` table: `availability_zone_id`

## Summary of changes
* Make neon local and the test suite always provide an az id
* Make the az id field in the ps registration request mandatory
* Migrate the column to non-nullable and adjust in memory state
accordingly
* Remove the code that was used to populate the az id for pre-existing
nodes
---
 Dockerfile                                    |  1 +
 control_plane/storcon_cli/src/main.rs         |  2 +-
 libs/pageserver_api/src/controller_api.rs     |  2 +-
 pageserver/src/control_plane_client.rs        | 24 +++++++---
 .../down.sql                                  |  1 +
 .../up.sql                                    |  1 +
 storage_controller/src/node.rs                | 23 +++-------
 storage_controller/src/persistence.rs         | 28 +-----------
 storage_controller/src/scheduler.rs           |  2 +-
 storage_controller/src/schema.rs              |  2 +-
 storage_controller/src/service.rs             | 44 ++-----------------
 test_runner/fixtures/neon_fixtures.py         |  3 ++
 12 files changed, 41 insertions(+), 92 deletions(-)
 create mode 100644 storage_controller/migrations/2024-08-28-150530_pageserver_az_not_null/down.sql
 create mode 100644 storage_controller/migrations/2024-08-28-150530_pageserver_az_not_null/up.sql

diff --git a/Dockerfile b/Dockerfile
index d3d12330c6..1efedfa9bc 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -87,6 +87,7 @@ RUN mkdir -p /data/.neon/ && \
        "pg_distrib_dir='/usr/local/'\n" \
        "listen_pg_addr='0.0.0.0:6400'\n" \
        "listen_http_addr='0.0.0.0:9898'\n" \
+       "availability_zone='local'\n" \
   > /data/.neon/pageserver.toml && \
   chown -R neon:neon /data/.neon
 
diff --git a/control_plane/storcon_cli/src/main.rs b/control_plane/storcon_cli/src/main.rs
index 5cce6cf3ae..2a81a3d825 100644
--- a/control_plane/storcon_cli/src/main.rs
+++ b/control_plane/storcon_cli/src/main.rs
@@ -336,7 +336,7 @@ async fn main() -> anyhow::Result<()> {
                         listen_pg_port,
                         listen_http_addr,
                         listen_http_port,
-                        availability_zone_id: Some(availability_zone_id),
+                        availability_zone_id,
                     }),
                 )
                 .await?;
diff --git a/libs/pageserver_api/src/controller_api.rs b/libs/pageserver_api/src/controller_api.rs
index 345abd69b6..6fb5a9a139 100644
--- a/libs/pageserver_api/src/controller_api.rs
+++ b/libs/pageserver_api/src/controller_api.rs
@@ -57,7 +57,7 @@ pub struct NodeRegisterRequest {
     pub listen_http_addr: String,
     pub listen_http_port: u16,
 
-    pub availability_zone_id: Option<String>,
+    pub availability_zone_id: String,
 }
 
 #[derive(Serialize, Deserialize)]
diff --git a/pageserver/src/control_plane_client.rs b/pageserver/src/control_plane_client.rs
index 56a536c387..f6d1c35a8c 100644
--- a/pageserver/src/control_plane_client.rs
+++ b/pageserver/src/control_plane_client.rs
@@ -141,10 +141,24 @@ impl ControlPlaneGenerationsApi for ControlPlaneClient {
                         m.other
                     );
 
-                    let az_id = m
-                        .other
-                        .get("availability_zone_id")
-                        .and_then(|jv| jv.as_str().map(|str| str.to_owned()));
+                    let az_id = {
+                        let az_id_from_metadata = m
+                            .other
+                            .get("availability_zone_id")
+                            .and_then(|jv| jv.as_str().map(|str| str.to_owned()));
+
+                        match az_id_from_metadata {
+                            Some(az_id) => Some(az_id),
+                            None => {
+                                tracing::warn!("metadata.json does not contain an 'availability_zone_id' field");
+                                conf.availability_zone.clone()
+                            }
+                        }
+                    };
+
+                    if az_id.is_none() {
+                        panic!("Availablity zone id could not be inferred from metadata.json or pageserver config");
+                    }
 
                     Some(NodeRegisterRequest {
                         node_id: conf.id,
@@ -152,7 +166,7 @@ impl ControlPlaneGenerationsApi for ControlPlaneClient {
                         listen_pg_port: m.postgres_port,
                         listen_http_addr: m.http_host,
                         listen_http_port: m.http_port,
-                        availability_zone_id: az_id,
+                        availability_zone_id: az_id.expect("Checked above"),
                     })
                 }
                 Err(e) => {
diff --git a/storage_controller/migrations/2024-08-28-150530_pageserver_az_not_null/down.sql b/storage_controller/migrations/2024-08-28-150530_pageserver_az_not_null/down.sql
new file mode 100644
index 0000000000..4fcb928533
--- /dev/null
+++ b/storage_controller/migrations/2024-08-28-150530_pageserver_az_not_null/down.sql
@@ -0,0 +1 @@
+ALTER TABLE nodes ALTER availability_zone_id DROP NOT NULL;
diff --git a/storage_controller/migrations/2024-08-28-150530_pageserver_az_not_null/up.sql b/storage_controller/migrations/2024-08-28-150530_pageserver_az_not_null/up.sql
new file mode 100644
index 0000000000..c5b4534087
--- /dev/null
+++ b/storage_controller/migrations/2024-08-28-150530_pageserver_az_not_null/up.sql
@@ -0,0 +1 @@
+ALTER TABLE nodes ALTER availability_zone_id SET NOT NULL;
diff --git a/storage_controller/src/node.rs b/storage_controller/src/node.rs
index 73cecc491d..cb9ce10d23 100644
--- a/storage_controller/src/node.rs
+++ b/storage_controller/src/node.rs
@@ -36,7 +36,7 @@ pub(crate) struct Node {
     listen_pg_addr: String,
     listen_pg_port: u16,
 
-    availability_zone_id: Option<String>,
+    availability_zone_id: String,
 
     // This cancellation token means "stop any RPCs in flight to this node, and don't start
     // any more". It is not related to process shutdown.
@@ -63,8 +63,9 @@ impl Node {
         self.id
     }
 
-    pub(crate) fn get_availability_zone_id(&self) -> Option<&str> {
-        self.availability_zone_id.as_deref()
+    #[allow(unused)]
+    pub(crate) fn get_availability_zone_id(&self) -> &str {
+        self.availability_zone_id.as_str()
     }
 
     pub(crate) fn get_scheduling(&self) -> NodeSchedulingPolicy {
@@ -78,22 +79,12 @@ impl Node {
     /// Does this registration request match `self`?  This is used when deciding whether a registration
     /// request should be allowed to update an existing record with the same node ID.
     pub(crate) fn registration_match(&self, register_req: &NodeRegisterRequest) -> bool {
-        let az_ids_match = {
-            match (
-                self.availability_zone_id.as_deref(),
-                register_req.availability_zone_id.as_deref(),
-            ) {
-                (Some(current_az), Some(register_req_az)) => current_az == register_req_az,
-                _ => true,
-            }
-        };
-
-        az_ids_match
-            && self.id == register_req.node_id
+        self.id == register_req.node_id
             && self.listen_http_addr == register_req.listen_http_addr
             && self.listen_http_port == register_req.listen_http_port
             && self.listen_pg_addr == register_req.listen_pg_addr
             && self.listen_pg_port == register_req.listen_pg_port
+            && self.availability_zone_id == register_req.availability_zone_id
     }
 
     /// For a shard located on this node, populate a response object
@@ -190,7 +181,7 @@ impl Node {
         listen_http_port: u16,
         listen_pg_addr: String,
         listen_pg_port: u16,
-        availability_zone_id: Option<String>,
+        availability_zone_id: String,
     ) -> Self {
         Self {
             id,
diff --git a/storage_controller/src/persistence.rs b/storage_controller/src/persistence.rs
index e801289752..6df05ebd13 100644
--- a/storage_controller/src/persistence.rs
+++ b/storage_controller/src/persistence.rs
@@ -105,7 +105,6 @@ pub(crate) enum DatabaseOperation {
     ListMetadataHealthOutdated,
     GetLeader,
     UpdateLeader,
-    SetNodeAzId,
 }
 
 #[must_use]
@@ -325,31 +324,6 @@ impl Persistence {
         }
     }
 
-    pub(crate) async fn set_node_availability_zone_id(
-        &self,
-        input_node_id: NodeId,
-        input_az_id: String,
-    ) -> DatabaseResult<()> {
-        use crate::schema::nodes::dsl::*;
-        let updated = self
-            .with_measured_conn(DatabaseOperation::SetNodeAzId, move |conn| {
-                let updated = diesel::update(nodes)
-                    .filter(node_id.eq(input_node_id.0 as i64))
-                    .set((availability_zone_id.eq(input_az_id.clone()),))
-                    .execute(conn)?;
-                Ok(updated)
-            })
-            .await?;
-
-        if updated != 1 {
-            Err(DatabaseError::Logical(format!(
-                "Node {node_id:?} not found for setting az id",
-            )))
-        } else {
-            Ok(())
-        }
-    }
-
     /// At startup, load the high level state for shards, such as their config + policy.  This will
     /// be enriched at runtime with state discovered on pageservers.
     pub(crate) async fn list_tenant_shards(&self) -> DatabaseResult<Vec<TenantShardPersistence>> {
@@ -1110,7 +1084,7 @@ pub(crate) struct NodePersistence {
     pub(crate) listen_http_port: i32,
     pub(crate) listen_pg_addr: String,
     pub(crate) listen_pg_port: i32,
-    pub(crate) availability_zone_id: Option<String>,
+    pub(crate) availability_zone_id: String,
 }
 
 /// Tenant metadata health status that are stored durably.
diff --git a/storage_controller/src/scheduler.rs b/storage_controller/src/scheduler.rs
index ef4da6861c..deb5f27226 100644
--- a/storage_controller/src/scheduler.rs
+++ b/storage_controller/src/scheduler.rs
@@ -528,7 +528,7 @@ pub(crate) mod test_utils {
                         80 + i as u16,
                         format!("pghost-{i}"),
                         5432 + i as u16,
-                        None,
+                        "test-az".to_string(),
                     );
                     node.set_availability(NodeAvailability::Active(test_utilization::simple(0, 0)));
                     assert!(node.is_available());
diff --git a/storage_controller/src/schema.rs b/storage_controller/src/schema.rs
index e0f515daea..93ab774b5f 100644
--- a/storage_controller/src/schema.rs
+++ b/storage_controller/src/schema.rs
@@ -25,7 +25,7 @@ diesel::table! {
         listen_http_port -> Int4,
         listen_pg_addr -> Varchar,
         listen_pg_port -> Int4,
-        availability_zone_id -> Nullable<Varchar>,
+        availability_zone_id -> Varchar,
     }
 }
 
diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs
index ca416095bb..2911cd5ac4 100644
--- a/storage_controller/src/service.rs
+++ b/storage_controller/src/service.rs
@@ -1264,7 +1264,7 @@ impl Service {
                     123,
                     "".to_string(),
                     123,
-                    None,
+                    "test_az".to_string(),
                 );
 
                 scheduler.node_upsert(&node);
@@ -4825,15 +4825,8 @@ impl Service {
         )
         .await;
 
-        if register_req.availability_zone_id.is_none() {
-            tracing::warn!(
-                "Node {} registering without specific availability zone id",
-                register_req.node_id
-            );
-        }
-
         enum RegistrationStatus {
-            Matched(Node),
+            Matched,
             Mismatched,
             New,
         }
@@ -4842,7 +4835,7 @@ impl Service {
             let locked = self.inner.read().unwrap();
             if let Some(node) = locked.nodes.get(&register_req.node_id) {
                 if node.registration_match(&register_req) {
-                    RegistrationStatus::Matched(node.clone())
+                    RegistrationStatus::Matched
                 } else {
                     RegistrationStatus::Mismatched
                 }
@@ -4852,41 +4845,12 @@ impl Service {
         };
 
         match registration_status {
-            RegistrationStatus::Matched(node) => {
+            RegistrationStatus::Matched => {
                 tracing::info!(
                     "Node {} re-registered with matching address",
                     register_req.node_id
                 );
 
-                if node.get_availability_zone_id().is_none() {
-                    if let Some(az_id) = register_req.availability_zone_id.clone() {
-                        tracing::info!("Extracting availability zone id from registration request for node {}: {}",
-                                       register_req.node_id, az_id);
-
-                        // Persist to the database and update in memory state. See comment below
-                        // on ordering.
-                        self.persistence
-                            .set_node_availability_zone_id(register_req.node_id, az_id)
-                            .await?;
-                        let node_with_az = Node::new(
-                            register_req.node_id,
-                            register_req.listen_http_addr,
-                            register_req.listen_http_port,
-                            register_req.listen_pg_addr,
-                            register_req.listen_pg_port,
-                            register_req.availability_zone_id,
-                        );
-
-                        let mut locked = self.inner.write().unwrap();
-                        let mut new_nodes = (*locked.nodes).clone();
-
-                        locked.scheduler.node_upsert(&node_with_az);
-                        new_nodes.insert(register_req.node_id, node_with_az);
-
-                        locked.nodes = Arc::new(new_nodes);
-                    }
-                }
-
                 return Ok(());
             }
             RegistrationStatus::Mismatched => {
diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index 2df45a7e0e..0c692ceb69 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -758,6 +758,9 @@ class NeonEnvBuilder:
         patch_script = ""
         for ps in self.env.pageservers:
             patch_script += f"UPDATE nodes SET listen_http_port={ps.service_port.http}, listen_pg_port={ps.service_port.pg}  WHERE node_id = '{ps.id}';"
+            # This is a temporary to get the backward compat test happy
+            # since the compat snapshot was generated with an older version of neon local
+            patch_script += f"UPDATE nodes SET availability_zone_id='{ps.az_id}'  WHERE node_id = '{ps.id}' AND availability_zone_id IS NULL;"
         patch_script_path.write_text(patch_script)
 
         # Update the config with info about tenants and timelines

From cf11c8ab6aa234b59354425116da98d58fa1826d Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Fri, 6 Sep 2024 10:52:29 +0200
Subject: [PATCH 098/159] update svg_fmt to 0.4.3 (#8930)

Audited

```
diff -r -u ~/.cargo/registry/src/index.crates.io-6f17d22bba15001f/svg_fmt-0.4.{2,3}
```

fixes https://github.com/neondatabase/neon/issues/7763
---
 Cargo.lock | 5 +++--
 Cargo.toml | 3 +--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index 91917d5351..3f2787f15b 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -6094,8 +6094,9 @@ checksum = "81cdd64d312baedb58e21336b31bc043b77e01cc99033ce76ef539f78e965ebc"
 
 [[package]]
 name = "svg_fmt"
-version = "0.4.2"
-source = "git+https://github.com/nical/rust_debug?rev=28a7d96eecff2f28e75b1ea09f2d499a60d0e3b4#28a7d96eecff2f28e75b1ea09f2d499a60d0e3b4"
+version = "0.4.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "20e16a0f46cf5fd675563ef54f26e83e20f2366bcf027bcb3cc3ed2b98aaf2ca"
 
 [[package]]
 name = "syn"
diff --git a/Cargo.toml b/Cargo.toml
index 4fea3e8d80..2415337110 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -161,8 +161,7 @@ socket2 = "0.5"
 strum = "0.24"
 strum_macros = "0.24"
 "subtle"  = "2.5.0"
-# Our PR https://github.com/nical/rust_debug/pull/4 has been merged but no new version released yet
-svg_fmt = { git = "https://github.com/nical/rust_debug", rev = "28a7d96eecff2f28e75b1ea09f2d499a60d0e3b4" }
+svg_fmt = "0.4.3"
 sync_wrapper = "0.1.2"
 tar = "0.4"
 task-local-extensions = "0.1.4"

From 06e840b884c242550e2a5ad0e72bfa762bce1709 Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Fri, 6 Sep 2024 10:58:48 +0200
Subject: [PATCH 099/159] compact_level0_phase1: ignore access mode config,
 always do streaming-kmerge without validation (#8934)

refs https://github.com/neondatabase/neon/issues/8184

PR https://github.com/neondatabase/infra/pull/1905 enabled
streaming-kmerge without validation everywhere.

It rolls into prod sooner or in the same release as this PR.
---
 libs/pageserver_api/src/config.rs            |  43 +-----
 pageserver/src/bin/pageserver.rs             |   1 -
 pageserver/src/config.rs                     |  26 ++--
 pageserver/src/tenant/timeline/compaction.rs | 139 +------------------
 4 files changed, 22 insertions(+), 187 deletions(-)

diff --git a/libs/pageserver_api/src/config.rs b/libs/pageserver_api/src/config.rs
index b2662c562a..1194ee93ef 100644
--- a/libs/pageserver_api/src/config.rs
+++ b/libs/pageserver_api/src/config.rs
@@ -104,7 +104,9 @@ pub struct ConfigToml {
     pub image_compression: ImageCompressionAlgorithm,
     pub ephemeral_bytes_per_memory_kb: usize,
     pub l0_flush: Option<crate::models::L0FlushConfig>,
-    pub compact_level0_phase1_value_access: CompactL0Phase1ValueAccess,
+    #[serde(skip_serializing)]
+    // TODO(https://github.com/neondatabase/neon/issues/8184): remove after this field is removed from all pageserver.toml's
+    pub compact_level0_phase1_value_access: serde::de::IgnoredAny,
     pub virtual_file_direct_io: crate::models::virtual_file::DirectIoMode,
     pub io_buffer_alignment: usize,
 }
@@ -209,43 +211,6 @@ pub enum GetImpl {
 #[serde(transparent)]
 pub struct MaxVectoredReadBytes(pub NonZeroUsize);
 
-#[derive(Debug, PartialEq, Eq, Clone, serde::Deserialize, serde::Serialize)]
-#[serde(tag = "mode", rename_all = "kebab-case", deny_unknown_fields)]
-pub enum CompactL0Phase1ValueAccess {
-    /// The old way.
-    PageCachedBlobIo,
-    /// The new way.
-    StreamingKmerge {
-        /// If set, we run both the old way and the new way, validate that
-        /// they are identical (=> [`CompactL0BypassPageCacheValidation`]),
-        /// and if the validation fails,
-        /// - in tests: fail them with a panic or
-        /// - in prod, log a rate-limited warning and use the old way's results.
-        ///
-        /// If not set, we only run the new way and trust its results.
-        validate: Option<CompactL0BypassPageCacheValidation>,
-    },
-}
-
-/// See [`CompactL0Phase1ValueAccess::StreamingKmerge`].
-#[derive(Debug, PartialEq, Eq, Clone, serde::Deserialize, serde::Serialize)]
-#[serde(rename_all = "kebab-case")]
-pub enum CompactL0BypassPageCacheValidation {
-    /// Validate that the series of (key, lsn) pairs are the same.
-    KeyLsn,
-    /// Validate that the entire output of old and new way is identical.
-    KeyLsnValue,
-}
-
-impl Default for CompactL0Phase1ValueAccess {
-    fn default() -> Self {
-        CompactL0Phase1ValueAccess::StreamingKmerge {
-            // TODO(https://github.com/neondatabase/neon/issues/8184): change to None once confident
-            validate: Some(CompactL0BypassPageCacheValidation::KeyLsnValue),
-        }
-    }
-}
-
 /// A tenant's calcuated configuration, which is the result of merging a
 /// tenant's TenantConfOpt with the global TenantConf from PageServerConf.
 ///
@@ -452,7 +417,7 @@ impl Default for ConfigToml {
             image_compression: (DEFAULT_IMAGE_COMPRESSION),
             ephemeral_bytes_per_memory_kb: (DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB),
             l0_flush: None,
-            compact_level0_phase1_value_access: CompactL0Phase1ValueAccess::default(),
+            compact_level0_phase1_value_access: Default::default(),
             virtual_file_direct_io: crate::models::virtual_file::DirectIoMode::default(),
 
             io_buffer_alignment: DEFAULT_IO_BUFFER_ALIGNMENT,
diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs
index 2c60e8d7d1..59194ab4bd 100644
--- a/pageserver/src/bin/pageserver.rs
+++ b/pageserver/src/bin/pageserver.rs
@@ -125,7 +125,6 @@ fn main() -> anyhow::Result<()> {
     // after setting up logging, log the effective IO engine choice and read path implementations
     info!(?conf.virtual_file_io_engine, "starting with virtual_file IO engine");
     info!(?conf.virtual_file_direct_io, "starting with virtual_file Direct IO settings");
-    info!(?conf.compact_level0_phase1_value_access, "starting with setting for compact_level0_phase1_value_access");
     info!(?conf.io_buffer_alignment, "starting with setting for IO buffer alignment");
 
     // The tenants directory contains all the pageserver local disk state.
diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs
index c159b66905..4e68e276d3 100644
--- a/pageserver/src/config.rs
+++ b/pageserver/src/config.rs
@@ -174,10 +174,6 @@ pub struct PageServerConf {
 
     pub l0_flush: crate::l0_flush::L0FlushConfig,
 
-    /// This flag is temporary and will be removed after gradual rollout.
-    /// See <https://github.com/neondatabase/neon/issues/8184>.
-    pub compact_level0_phase1_value_access: pageserver_api::config::CompactL0Phase1ValueAccess,
-
     /// Direct IO settings
     pub virtual_file_direct_io: virtual_file::DirectIoMode,
 
@@ -338,7 +334,7 @@ impl PageServerConf {
             max_vectored_read_bytes,
             image_compression,
             ephemeral_bytes_per_memory_kb,
-            compact_level0_phase1_value_access,
+            compact_level0_phase1_value_access: _,
             l0_flush,
             virtual_file_direct_io,
             concurrent_tenant_warmup,
@@ -383,7 +379,6 @@ impl PageServerConf {
             max_vectored_read_bytes,
             image_compression,
             ephemeral_bytes_per_memory_kb,
-            compact_level0_phase1_value_access,
             virtual_file_direct_io,
             io_buffer_alignment,
 
@@ -561,6 +556,16 @@ mod tests {
             .expect("parse_and_validate");
     }
 
+    #[test]
+    fn test_compactl0_phase1_access_mode_is_ignored_silently() {
+        let input = indoc::indoc! {r#"
+            [compact_level0_phase1_value_access]
+            mode = "streaming-kmerge"
+            validate = "key-lsn-value"
+        "#};
+        toml_edit::de::from_str::<pageserver_api::config::ConfigToml>(input).unwrap();
+    }
+
     /// If there's a typo in the pageserver config, we'd rather catch that typo
     /// and fail pageserver startup than silently ignoring the typo, leaving whoever
     /// made it in the believe that their config change is effective.
@@ -637,14 +642,5 @@ mod tests {
         //         some_invalid_field = 23
         //     "#}
         // );
-
-        test!(
-            compact_level0_phase1_value_access,
-            indoc! {r#"
-                [compact_level0_phase1_value_access]
-                mode = "streaming-kmerge"
-                some_invalid_field = 23
-            "#}
-        );
     }
 }
diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs
index 6b9c8386f7..a87b502cd6 100644
--- a/pageserver/src/tenant/timeline/compaction.rs
+++ b/pageserver/src/tenant/timeline/compaction.rs
@@ -19,7 +19,6 @@ use bytes::Bytes;
 use enumset::EnumSet;
 use fail::fail_point;
 use itertools::Itertools;
-use pageserver_api::config::{CompactL0BypassPageCacheValidation, CompactL0Phase1ValueAccess};
 use pageserver_api::key::KEY_SIZE;
 use pageserver_api::keyspace::ShardedRange;
 use pageserver_api::shard::{ShardCount, ShardIdentity, TenantShardId};
@@ -912,137 +911,13 @@ impl Timeline {
         // we're compacting, in key, LSN order.
         // If there's both a Value::Image and Value::WalRecord for the same (key,lsn),
         // then the Value::Image is ordered before Value::WalRecord.
-        //
-        // TODO(https://github.com/neondatabase/neon/issues/8184): remove the page cached blob_io
-        // option and validation code once we've reached confidence.
-        enum AllValuesIter<'a> {
-            PageCachedBlobIo {
-                all_keys_iter: VecIter<'a>,
-            },
-            StreamingKmergeBypassingPageCache {
-                merge_iter: MergeIterator<'a>,
-            },
-            ValidatingStreamingKmergeBypassingPageCache {
-                mode: CompactL0BypassPageCacheValidation,
-                merge_iter: MergeIterator<'a>,
-                all_keys_iter: VecIter<'a>,
-            },
-        }
-        type VecIter<'a> = std::slice::Iter<'a, DeltaEntry<'a>>; // TODO: distinguished lifetimes
-        impl AllValuesIter<'_> {
-            async fn next_all_keys_iter(
-                iter: &mut VecIter<'_>,
-                ctx: &RequestContext,
-            ) -> anyhow::Result<Option<(Key, Lsn, Value)>> {
-                let Some(DeltaEntry {
-                    key,
-                    lsn,
-                    val: value_ref,
-                    ..
-                }) = iter.next()
-                else {
-                    return Ok(None);
-                };
-                let value = value_ref.load(ctx).await?;
-                Ok(Some((*key, *lsn, value)))
-            }
-            async fn next(
-                &mut self,
-                ctx: &RequestContext,
-            ) -> anyhow::Result<Option<(Key, Lsn, Value)>> {
-                match self {
-                    AllValuesIter::PageCachedBlobIo { all_keys_iter: iter } => {
-                      Self::next_all_keys_iter(iter, ctx).await
-                    }
-                    AllValuesIter::StreamingKmergeBypassingPageCache { merge_iter } => merge_iter.next().await,
-                    AllValuesIter::ValidatingStreamingKmergeBypassingPageCache { mode, merge_iter, all_keys_iter } => async {
-                        // advance both iterators
-                        let all_keys_iter_item = Self::next_all_keys_iter(all_keys_iter, ctx).await;
-                        let merge_iter_item = merge_iter.next().await;
-                        // compare results & log warnings as needed
-                        macro_rules! rate_limited_warn {
-                            ($($arg:tt)*) => {{
-                                if cfg!(debug_assertions) || cfg!(feature = "testing") {
-                                    warn!($($arg)*);
-                                    panic!("CompactL0BypassPageCacheValidation failure, check logs");
-                                }
-                                use once_cell::sync::Lazy;
-                                use utils::rate_limit::RateLimit;
-                                use std::sync::Mutex;
-                                use std::time::Duration;
-                                static LOGGED: Lazy<Mutex<RateLimit>> =
-                                    Lazy::new(|| Mutex::new(RateLimit::new(Duration::from_secs(10))));
-                                let mut rate_limit = LOGGED.lock().unwrap();
-                                rate_limit.call(|| {
-                                    warn!($($arg)*);
-                                });
-                            }}
-                        }
-                        match (&all_keys_iter_item, &merge_iter_item) {
-                            (Err(_), Err(_)) => {
-                                // don't bother asserting equivality of the errors
-                            }
-                            (Err(all_keys), Ok(merge)) => {
-                                rate_limited_warn!(?merge, "all_keys_iter returned an error where merge did not: {all_keys:?}");
-                            },
-                            (Ok(all_keys), Err(merge)) => {
-                                rate_limited_warn!(?all_keys, "merge returned an error where all_keys_iter did not: {merge:?}");
-                            },
-                            (Ok(None), Ok(None)) => { }
-                            (Ok(Some(all_keys)), Ok(None)) => {
-                                rate_limited_warn!(?all_keys, "merge returned None where all_keys_iter returned Some");
-                            }
-                            (Ok(None), Ok(Some(merge))) => {
-                                rate_limited_warn!(?merge, "all_keys_iter returned None where merge returned Some");
-                            }
-                            (Ok(Some((all_keys_key, all_keys_lsn, all_keys_value))), Ok(Some((merge_key, merge_lsn, merge_value)))) => {
-                                match mode {
-                                    // TODO: in this mode, we still load the value from disk for both iterators, even though we only need the all_keys_iter one
-                                    CompactL0BypassPageCacheValidation::KeyLsn => {
-                                        let all_keys = (all_keys_key, all_keys_lsn);
-                                        let merge = (merge_key, merge_lsn);
-                                        if all_keys != merge {
-                                            rate_limited_warn!(?all_keys, ?merge, "merge returned a different (Key,LSN) than all_keys_iter");
-                                        }
-                                    }
-                                    CompactL0BypassPageCacheValidation::KeyLsnValue => {
-                                        let all_keys = (all_keys_key, all_keys_lsn, all_keys_value);
-                                        let merge = (merge_key, merge_lsn, merge_value);
-                                        if all_keys != merge {
-                                            rate_limited_warn!(?all_keys, ?merge, "merge returned a different (Key,LSN,Value) than all_keys_iter");
-                                        }
-                                    }
-                                }
-                            }
-                        }
-                        // in case of mismatch, trust the legacy all_keys_iter_item
-                        all_keys_iter_item
-                    }.instrument(info_span!("next")).await
-                }
-            }
-        }
-        let mut all_values_iter = match &self.conf.compact_level0_phase1_value_access {
-            CompactL0Phase1ValueAccess::PageCachedBlobIo => AllValuesIter::PageCachedBlobIo {
-                all_keys_iter: all_keys.iter(),
-            },
-            CompactL0Phase1ValueAccess::StreamingKmerge { validate } => {
-                let merge_iter = {
-                    let mut deltas = Vec::with_capacity(deltas_to_compact.len());
-                    for l in deltas_to_compact.iter() {
-                        let l = l.get_as_delta(ctx).await.map_err(CompactionError::Other)?;
-                        deltas.push(l);
-                    }
-                    MergeIterator::create(&deltas, &[], ctx)
-                };
-                match validate {
-                    None => AllValuesIter::StreamingKmergeBypassingPageCache { merge_iter },
-                    Some(validate) => AllValuesIter::ValidatingStreamingKmergeBypassingPageCache {
-                        mode: validate.clone(),
-                        merge_iter,
-                        all_keys_iter: all_keys.iter(),
-                    },
-                }
+        let mut all_values_iter = {
+            let mut deltas = Vec::with_capacity(deltas_to_compact.len());
+            for l in deltas_to_compact.iter() {
+                let l = l.get_as_delta(ctx).await.map_err(CompactionError::Other)?;
+                deltas.push(l);
             }
+            MergeIterator::create(&deltas, &[], ctx)
         };
 
         // This iterator walks through all keys and is needed to calculate size used by each key
@@ -1119,7 +994,7 @@ impl Timeline {
         let mut keys = 0;
 
         while let Some((key, lsn, value)) = all_values_iter
-            .next(ctx)
+            .next()
             .await
             .map_err(CompactionError::Other)?
         {

From a1323231bc65539f55eb1bfd341fb65d06d0ed22 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Arpad=20M=C3=BCller?= <arpad-m@users.noreply.github.com>
Date: Fri, 6 Sep 2024 12:40:19 +0200
Subject: [PATCH 100/159] Update Rust to 1.81.0 (#8939)

We keep the practice of keeping the compiler up to date, pointing to the
latest release. This is done by many other projects in the Rust
ecosystem as well.

[Release notes](https://github.com/rust-lang/rust/blob/master/RELEASES.md#version-1810-2024-09-05).

Prior update was in #8667 and #8518
---
 Cargo.lock                                    | 30 +++++--------------
 Cargo.toml                                    |  2 +-
 Dockerfile.build-tools                        |  2 +-
 libs/postgres_ffi/build.rs                    |  2 +-
 libs/walproposer/build.rs                     | 21 ++++++++++---
 libs/walproposer/src/api_bindings.rs          | 10 +++----
 .../tenant/remote_timeline_client/download.rs |  3 +-
 proxy/src/console/provider/neon.rs            |  5 +---
 rust-toolchain.toml                           |  2 +-
 safekeeper/src/send_wal.rs                    |  5 ++--
 workspace_hack/Cargo.toml                     |  6 ++--
 11 files changed, 42 insertions(+), 46 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index 3f2787f15b..634af67198 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -915,25 +915,22 @@ dependencies = [
 
 [[package]]
 name = "bindgen"
-version = "0.65.1"
+version = "0.70.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "cfdf7b466f9a4903edc73f95d6d2bcd5baf8ae620638762244d3f60143643cc5"
+checksum = "f49d8fed880d473ea71efb9bf597651e77201bdd4893efe54c9e5d65ae04ce6f"
 dependencies = [
- "bitflags 1.3.2",
+ "bitflags 2.4.1",
  "cexpr",
  "clang-sys",
- "lazy_static",
- "lazycell",
+ "itertools 0.12.1",
  "log",
- "peeking_take_while",
- "prettyplease 0.2.6",
+ "prettyplease 0.2.17",
  "proc-macro2",
  "quote",
  "regex",
  "rustc-hash",
  "shlex",
  "syn 2.0.52",
- "which",
 ]
 
 [[package]]
@@ -2949,12 +2946,6 @@ dependencies = [
  "spin 0.5.2",
 ]
 
-[[package]]
-name = "lazycell"
-version = "1.3.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "830d08ce1d1d941e6b30645f1a0eb5643013d835ce3779a5fc208261dbe10f55"
-
 [[package]]
 name = "libc"
 version = "0.2.150"
@@ -3977,12 +3968,6 @@ dependencies = [
  "sha2",
 ]
 
-[[package]]
-name = "peeking_take_while"
-version = "0.1.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "19b17cddbe7ec3f8bc800887bab5e717348c95ea2ca0b1bf0837fb964dc67099"
-
 [[package]]
 name = "pem"
 version = "3.0.3"
@@ -4280,9 +4265,9 @@ dependencies = [
 
 [[package]]
 name = "prettyplease"
-version = "0.2.6"
+version = "0.2.17"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "3b69d39aab54d069e7f2fe8cb970493e7834601ca2d8c65fd7bbd183578080d1"
+checksum = "8d3928fb5db768cb86f891ff014f0144589297e3c6a1aba6ed7cecfdace270c7"
 dependencies = [
  "proc-macro2",
  "syn 2.0.52",
@@ -7628,6 +7613,7 @@ dependencies = [
  "hyper 0.14.26",
  "indexmap 1.9.3",
  "itertools 0.10.5",
+ "itertools 0.12.1",
  "lazy_static",
  "libc",
  "log",
diff --git a/Cargo.toml b/Cargo.toml
index 2415337110..5045ee0d4d 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -64,7 +64,7 @@ aws-types = "1.2.0"
 axum = { version = "0.6.20", features = ["ws"] }
 base64 = "0.13.0"
 bincode = "1.3"
-bindgen = "0.65"
+bindgen = "0.70"
 bit_field = "0.10.2"
 bstr = "1.0"
 byteorder = "1.4"
diff --git a/Dockerfile.build-tools b/Dockerfile.build-tools
index d6beb61369..a9cbed85fb 100644
--- a/Dockerfile.build-tools
+++ b/Dockerfile.build-tools
@@ -192,7 +192,7 @@ WORKDIR /home/nonroot
 
 # Rust
 # Please keep the version of llvm (installed above) in sync with rust llvm (`rustc --version --verbose | grep LLVM`)
-ENV RUSTC_VERSION=1.80.1
+ENV RUSTC_VERSION=1.81.0
 ENV RUSTUP_HOME="/home/nonroot/.rustup"
 ENV PATH="/home/nonroot/.cargo/bin:${PATH}"
 ARG RUSTFILT_VERSION=0.2.1
diff --git a/libs/postgres_ffi/build.rs b/libs/postgres_ffi/build.rs
index 370d9e9a6f..d3e3ce648f 100644
--- a/libs/postgres_ffi/build.rs
+++ b/libs/postgres_ffi/build.rs
@@ -14,7 +14,7 @@ impl ParseCallbacks for PostgresFfiCallbacks {
     fn include_file(&self, filename: &str) {
         // This does the equivalent of passing bindgen::CargoCallbacks
         // to the builder .parse_callbacks() method.
-        let cargo_callbacks = bindgen::CargoCallbacks;
+        let cargo_callbacks = bindgen::CargoCallbacks::new();
         cargo_callbacks.include_file(filename)
     }
 
diff --git a/libs/walproposer/build.rs b/libs/walproposer/build.rs
index 7bb077062b..28547f52bf 100644
--- a/libs/walproposer/build.rs
+++ b/libs/walproposer/build.rs
@@ -4,7 +4,6 @@
 use std::{env, path::PathBuf, process::Command};
 
 use anyhow::{anyhow, Context};
-use bindgen::CargoCallbacks;
 
 fn main() -> anyhow::Result<()> {
     // Tell cargo to invalidate the built crate whenever the wrapper changes
@@ -64,16 +63,25 @@ fn main() -> anyhow::Result<()> {
             .map_err(|s| anyhow!("Bad postgres server path {s:?}"))?
     };
 
+    let unwind_abi_functions = [
+        "log_internal",
+        "recovery_download",
+        "start_streaming",
+        "finish_sync_safekeepers",
+        "wait_event_set",
+        "WalProposerStart",
+    ];
+
     // The bindgen::Builder is the main entry point
     // to bindgen, and lets you build up options for
     // the resulting bindings.
-    let bindings = bindgen::Builder::default()
+    let mut builder = bindgen::Builder::default()
         // The input header we would like to generate
         // bindings for.
         .header("bindgen_deps.h")
         // Tell cargo to invalidate the built crate whenever any of the
         // included header files changed.
-        .parse_callbacks(Box::new(CargoCallbacks))
+        .parse_callbacks(Box::new(bindgen::CargoCallbacks::new()))
         .allowlist_type("WalProposer")
         .allowlist_type("WalProposerConfig")
         .allowlist_type("walproposer_api")
@@ -105,7 +113,12 @@ fn main() -> anyhow::Result<()> {
         .allowlist_var("WL_SOCKET_MASK")
         .clang_arg("-DWALPROPOSER_LIB")
         .clang_arg(format!("-I{pgxn_neon}"))
-        .clang_arg(format!("-I{inc_server_path}"))
+        .clang_arg(format!("-I{inc_server_path}"));
+
+    for name in unwind_abi_functions {
+        builder = builder.override_abi(bindgen::Abi::CUnwind, name);
+    }
+    let bindings = builder
         // Finish the builder and generate the bindings.
         .generate()
         // Unwrap the Result and panic on failure.
diff --git a/libs/walproposer/src/api_bindings.rs b/libs/walproposer/src/api_bindings.rs
index bbc3663402..2fbea3fe45 100644
--- a/libs/walproposer/src/api_bindings.rs
+++ b/libs/walproposer/src/api_bindings.rs
@@ -33,7 +33,7 @@ extern "C" fn get_shmem_state(wp: *mut WalProposer) -> *mut WalproposerShmemStat
     }
 }
 
-extern "C" fn start_streaming(wp: *mut WalProposer, startpos: XLogRecPtr) {
+extern "C-unwind" fn start_streaming(wp: *mut WalProposer, startpos: XLogRecPtr) {
     unsafe {
         let callback_data = (*(*wp).config).callback_data;
         let api = callback_data as *mut Box<dyn ApiImpl>;
@@ -187,7 +187,7 @@ extern "C" fn conn_blocking_write(
     }
 }
 
-extern "C" fn recovery_download(wp: *mut WalProposer, sk: *mut Safekeeper) -> bool {
+extern "C-unwind" fn recovery_download(wp: *mut WalProposer, sk: *mut Safekeeper) -> bool {
     unsafe {
         let callback_data = (*(*(*sk).wp).config).callback_data;
         let api = callback_data as *mut Box<dyn ApiImpl>;
@@ -272,7 +272,7 @@ extern "C" fn rm_safekeeper_event_set(sk: *mut Safekeeper) {
     }
 }
 
-extern "C" fn wait_event_set(
+extern "C-unwind" fn wait_event_set(
     wp: *mut WalProposer,
     timeout: ::std::os::raw::c_long,
     event_sk: *mut *mut Safekeeper,
@@ -324,7 +324,7 @@ extern "C" fn get_redo_start_lsn(wp: *mut WalProposer) -> XLogRecPtr {
     }
 }
 
-extern "C" fn finish_sync_safekeepers(wp: *mut WalProposer, lsn: XLogRecPtr) {
+extern "C-unwind" fn finish_sync_safekeepers(wp: *mut WalProposer, lsn: XLogRecPtr) {
     unsafe {
         let callback_data = (*(*wp).config).callback_data;
         let api = callback_data as *mut Box<dyn ApiImpl>;
@@ -340,7 +340,7 @@ extern "C" fn process_safekeeper_feedback(wp: *mut WalProposer, sk: *mut Safekee
     }
 }
 
-extern "C" fn log_internal(
+extern "C-unwind" fn log_internal(
     wp: *mut WalProposer,
     level: ::std::os::raw::c_int,
     line: *const ::std::os::raw::c_char,
diff --git a/pageserver/src/tenant/remote_timeline_client/download.rs b/pageserver/src/tenant/remote_timeline_client/download.rs
index d9725ad756..9fbe2f0da5 100644
--- a/pageserver/src/tenant/remote_timeline_client/download.rs
+++ b/pageserver/src/tenant/remote_timeline_client/download.rs
@@ -548,7 +548,7 @@ pub(crate) async fn download_initdb_tar_zst(
         cancel,
     )
     .await
-    .map_err(|e| {
+    .inspect_err(|_e| {
         // Do a best-effort attempt at deleting the temporary file upon encountering an error.
         // We don't have async here nor do we want to pile on any extra errors.
         if let Err(e) = std::fs::remove_file(&temp_path) {
@@ -556,7 +556,6 @@ pub(crate) async fn download_initdb_tar_zst(
                 warn!("error deleting temporary file {temp_path}: {e}");
             }
         }
-        e
     })?;
 
     Ok((temp_path, file))
diff --git a/proxy/src/console/provider/neon.rs b/proxy/src/console/provider/neon.rs
index 33eda72e65..b004bf4ecf 100644
--- a/proxy/src/console/provider/neon.rs
+++ b/proxy/src/console/provider/neon.rs
@@ -38,10 +38,7 @@ impl Api {
         locks: &'static ApiLocks<EndpointCacheKey>,
         wake_compute_endpoint_rate_limiter: Arc<WakeComputeRateLimiter>,
     ) -> Self {
-        let jwt = match std::env::var("NEON_PROXY_TO_CONTROLPLANE_TOKEN") {
-            Ok(v) => v,
-            Err(_) => String::new(),
-        };
+        let jwt = std::env::var("NEON_PROXY_TO_CONTROLPLANE_TOKEN").unwrap_or_default();
         Self {
             endpoint,
             caches,
diff --git a/rust-toolchain.toml b/rust-toolchain.toml
index 368b8d300a..e78c4d6790 100644
--- a/rust-toolchain.toml
+++ b/rust-toolchain.toml
@@ -1,5 +1,5 @@
 [toolchain]
-channel = "1.80.1"
+channel = "1.81.0"
 profile = "default"
 # The default profile includes rustc, rust-std, cargo, rust-docs, rustfmt and clippy.
 # https://rust-lang.github.io/rustup/concepts/profiles.html
diff --git a/safekeeper/src/send_wal.rs b/safekeeper/src/send_wal.rs
index 90b1604adb..6d677f405a 100644
--- a/safekeeper/src/send_wal.rs
+++ b/safekeeper/src/send_wal.rs
@@ -758,9 +758,8 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> ReplyReader<IO> {
                 // pq_sendint32(&reply_message, xmin);
                 // pq_sendint32(&reply_message, xmin_epoch);
                 // So it is two big endian 32-bit words in low endian order!
-                hs_feedback.xmin = (hs_feedback.xmin >> 32) | (hs_feedback.xmin << 32);
-                hs_feedback.catalog_xmin =
-                    (hs_feedback.catalog_xmin >> 32) | (hs_feedback.catalog_xmin << 32);
+                hs_feedback.xmin = hs_feedback.xmin.rotate_left(32);
+                hs_feedback.catalog_xmin = hs_feedback.catalog_xmin.rotate_left(32);
                 self.ws_guard
                     .walsenders
                     .record_hs_feedback(self.ws_guard.id, &hs_feedback);
diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml
index 20693ad63d..3d2fa8c214 100644
--- a/workspace_hack/Cargo.toml
+++ b/workspace_hack/Cargo.toml
@@ -47,7 +47,8 @@ hex = { version = "0.4", features = ["serde"] }
 hmac = { version = "0.12", default-features = false, features = ["reset"] }
 hyper = { version = "0.14", features = ["full"] }
 indexmap = { version = "1", default-features = false, features = ["std"] }
-itertools = { version = "0.10" }
+itertools-5ef9efb8ec2df382 = { package = "itertools", version = "0.12", default-features = false, features = ["use_std"] }
+itertools-93f6ce9d446188ac = { package = "itertools", version = "0.10" }
 lazy_static = { version = "1", default-features = false, features = ["spin_no_std"] }
 libc = { version = "0.2", features = ["extra_traits", "use_std"] }
 log = { version = "0.4", default-features = false, features = ["std"] }
@@ -101,7 +102,8 @@ either = { version = "1" }
 getrandom = { version = "0.2", default-features = false, features = ["std"] }
 hashbrown = { version = "0.14", features = ["raw"] }
 indexmap = { version = "1", default-features = false, features = ["std"] }
-itertools = { version = "0.10" }
+itertools-5ef9efb8ec2df382 = { package = "itertools", version = "0.12", default-features = false, features = ["use_std"] }
+itertools-93f6ce9d446188ac = { package = "itertools", version = "0.10" }
 lazy_static = { version = "1", default-features = false, features = ["spin_no_std"] }
 libc = { version = "0.2", features = ["extra_traits", "use_std"] }
 log = { version = "0.4", default-features = false, features = ["std"] }

From e86fef05ddbc276170ec29d035d86d03e3ad4ec2 Mon Sep 17 00:00:00 2001
From: Vlad Lazar <vlad@neon.tech>
Date: Fri, 6 Sep 2024 13:11:17 +0100
Subject: [PATCH 101/159] storcon: track preferred AZ for each tenant shard
 (#8937)

## Problem
We want to do AZ aware scheduling, but don't have enough metadata.

## Summary of changes
Introduce a `preferred_az_id` concept for each managed tenant shard.

In a future PR, the scheduler will use this as a soft preference.
The idea is to try and keep the shard attachments within the same AZ.
Under the assumption that the compute was placed in the correct AZ,
this reduces the chances of cross AZ trafic from between compute and PS.

In terms of code changes we:
1. Add a new nullable `preferred_az_id` column to the `tenant_shards`
table. Also include an in-memory counterpart.
2. Populate the preferred az on tenant creation and shard splits.
3. Add an endpoint which allows to bulk-set preferred AZs.

(3) gives us the migration path. I'll write a script which queries the
cplane db in the region and sets the preferred az of all shards with an
active compute to the AZ of said compute. For shards without an active compute,
I'll use the AZ of the currently attached pageserver
since this is what cplane uses now to schedule computes.
---
 libs/pageserver_api/src/controller_api.rs     |  15 +-
 .../down.sql                                  |   1 +
 .../up.sql                                    |   1 +
 storage_controller/src/http.rs                |  21 +-
 storage_controller/src/persistence.rs         |  33 ++
 storage_controller/src/schema.rs              |   1 +
 storage_controller/src/service.rs             | 327 +++++++++++++-----
 storage_controller/src/tenant_shard.rs        |  15 +
 test_runner/fixtures/neon_fixtures.py         |  13 +-
 .../regress/test_storage_controller.py        |  52 +++
 10 files changed, 384 insertions(+), 95 deletions(-)
 create mode 100644 storage_controller/migrations/2024-09-05-104500_tenant_shard_preferred_az/down.sql
 create mode 100644 storage_controller/migrations/2024-09-05-104500_tenant_shard_preferred_az/up.sql

diff --git a/libs/pageserver_api/src/controller_api.rs b/libs/pageserver_api/src/controller_api.rs
index 6fb5a9a139..94104af002 100644
--- a/libs/pageserver_api/src/controller_api.rs
+++ b/libs/pageserver_api/src/controller_api.rs
@@ -1,4 +1,4 @@
-use std::collections::HashSet;
+use std::collections::{HashMap, HashSet};
 use std::str::FromStr;
 use std::time::{Duration, Instant};
 
@@ -74,6 +74,17 @@ pub struct TenantPolicyRequest {
     pub scheduling: Option<ShardSchedulingPolicy>,
 }
 
+#[derive(Serialize, Deserialize)]
+pub struct ShardsPreferredAzsRequest {
+    #[serde(flatten)]
+    pub preferred_az_ids: HashMap<TenantShardId, String>,
+}
+
+#[derive(Serialize, Deserialize)]
+pub struct ShardsPreferredAzsResponse {
+    pub updated: Vec<TenantShardId>,
+}
+
 #[derive(Serialize, Deserialize, Debug)]
 pub struct TenantLocateResponseShard {
     pub shard_id: TenantShardId,
@@ -132,6 +143,8 @@ pub struct TenantDescribeResponseShard {
     pub is_splitting: bool,
 
     pub scheduling_policy: ShardSchedulingPolicy,
+
+    pub preferred_az_id: Option<String>,
 }
 
 /// Explicitly migrating a particular shard is a low level operation
diff --git a/storage_controller/migrations/2024-09-05-104500_tenant_shard_preferred_az/down.sql b/storage_controller/migrations/2024-09-05-104500_tenant_shard_preferred_az/down.sql
new file mode 100644
index 0000000000..127972a2e4
--- /dev/null
+++ b/storage_controller/migrations/2024-09-05-104500_tenant_shard_preferred_az/down.sql
@@ -0,0 +1 @@
+ALTER TABLE tenant_shards DROP preferred_az_id;
diff --git a/storage_controller/migrations/2024-09-05-104500_tenant_shard_preferred_az/up.sql b/storage_controller/migrations/2024-09-05-104500_tenant_shard_preferred_az/up.sql
new file mode 100644
index 0000000000..641a54feb2
--- /dev/null
+++ b/storage_controller/migrations/2024-09-05-104500_tenant_shard_preferred_az/up.sql
@@ -0,0 +1 @@
+ALTER TABLE tenant_shards ADD preferred_az_id VARCHAR;
diff --git a/storage_controller/src/http.rs b/storage_controller/src/http.rs
index 32882c201a..5d4d0460be 100644
--- a/storage_controller/src/http.rs
+++ b/storage_controller/src/http.rs
@@ -14,7 +14,7 @@ use metrics::{BuildInfo, NeonMetrics};
 use pageserver_api::controller_api::{
     MetadataHealthListOutdatedRequest, MetadataHealthListOutdatedResponse,
     MetadataHealthListUnhealthyResponse, MetadataHealthUpdateRequest, MetadataHealthUpdateResponse,
-    TenantCreateRequest,
+    ShardsPreferredAzsRequest, TenantCreateRequest,
 };
 use pageserver_api::models::{
     TenantConfigRequest, TenantLocationConfigRequest, TenantShardSplitRequest,
@@ -688,6 +688,18 @@ async fn handle_tenant_update_policy(mut req: Request<Body>) -> Result<Response<
     )
 }
 
+async fn handle_update_preferred_azs(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
+    check_permissions(&req, Scope::Admin)?;
+
+    let azs_req = json_request::<ShardsPreferredAzsRequest>(&mut req).await?;
+    let state = get_state(&req);
+
+    json_response(
+        StatusCode::OK,
+        state.service.update_shards_preferred_azs(azs_req).await?,
+    )
+}
+
 async fn handle_step_down(req: Request<Body>) -> Result<Response<Body>, ApiError> {
     check_permissions(&req, Scope::Admin)?;
 
@@ -1174,6 +1186,13 @@ pub fn make_router(
                 RequestName("control_v1_tenant_policy"),
             )
         })
+        .put("/control/v1/preferred_azs", |r| {
+            named_request_span(
+                r,
+                handle_update_preferred_azs,
+                RequestName("control_v1_preferred_azs"),
+            )
+        })
         .put("/control/v1/step_down", |r| {
             named_request_span(r, handle_step_down, RequestName("control_v1_step_down"))
         })
diff --git a/storage_controller/src/persistence.rs b/storage_controller/src/persistence.rs
index 6df05ebd13..1dc1040d96 100644
--- a/storage_controller/src/persistence.rs
+++ b/storage_controller/src/persistence.rs
@@ -105,6 +105,7 @@ pub(crate) enum DatabaseOperation {
     ListMetadataHealthOutdated,
     GetLeader,
     UpdateLeader,
+    SetPreferredAzs,
 }
 
 #[must_use]
@@ -664,6 +665,33 @@ impl Persistence {
         Ok(())
     }
 
+    pub(crate) async fn set_tenant_shard_preferred_azs(
+        &self,
+        preferred_azs: Vec<(TenantShardId, String)>,
+    ) -> DatabaseResult<Vec<(TenantShardId, String)>> {
+        use crate::schema::tenant_shards::dsl::*;
+
+        self.with_measured_conn(DatabaseOperation::SetPreferredAzs, move |conn| {
+            let mut shards_updated = Vec::default();
+
+            for (tenant_shard_id, preferred_az) in preferred_azs.iter() {
+                let updated = diesel::update(tenant_shards)
+                    .filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string()))
+                    .filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32))
+                    .filter(shard_count.eq(tenant_shard_id.shard_count.literal() as i32))
+                    .set(preferred_az_id.eq(preferred_az))
+                    .execute(conn)?;
+
+                if updated == 1 {
+                    shards_updated.push((*tenant_shard_id, preferred_az.clone()));
+                }
+            }
+
+            Ok(shards_updated)
+        })
+        .await
+    }
+
     pub(crate) async fn detach(&self, tenant_shard_id: TenantShardId) -> anyhow::Result<()> {
         use crate::schema::tenant_shards::dsl::*;
         self.with_measured_conn(DatabaseOperation::Detach, move |conn| {
@@ -1050,6 +1078,11 @@ pub(crate) struct TenantShardPersistence {
     pub(crate) config: String,
     #[serde(default)]
     pub(crate) scheduling_policy: String,
+
+    // Hint that we should attempt to schedule this tenant shard the given
+    // availability zone in order to minimise the chances of cross-AZ communication
+    // with compute.
+    pub(crate) preferred_az_id: Option<String>,
 }
 
 impl TenantShardPersistence {
diff --git a/storage_controller/src/schema.rs b/storage_controller/src/schema.rs
index 93ab774b5f..1717a9369d 100644
--- a/storage_controller/src/schema.rs
+++ b/storage_controller/src/schema.rs
@@ -41,6 +41,7 @@ diesel::table! {
         splitting -> Int2,
         config -> Text,
         scheduling_policy -> Varchar,
+        preferred_az_id -> Nullable<Varchar>,
     }
 }
 
diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs
index 2911cd5ac4..324f864291 100644
--- a/storage_controller/src/service.rs
+++ b/storage_controller/src/service.rs
@@ -25,7 +25,7 @@ use crate::{
         ShardGenerationState, TenantFilter,
     },
     reconciler::{ReconcileError, ReconcileUnits, ReconcilerConfig, ReconcilerConfigBuilder},
-    scheduler::{MaySchedule, ScheduleContext, ScheduleMode},
+    scheduler::{MaySchedule, ScheduleContext, ScheduleError, ScheduleMode},
     tenant_shard::{
         MigrateAttachment, ReconcileNeeded, ReconcilerStatus, ScheduleOptimization,
         ScheduleOptimizationAction,
@@ -41,10 +41,11 @@ use itertools::Itertools;
 use pageserver_api::{
     controller_api::{
         MetadataHealthRecord, MetadataHealthUpdateRequest, NodeAvailability, NodeRegisterRequest,
-        NodeSchedulingPolicy, PlacementPolicy, ShardSchedulingPolicy, TenantCreateRequest,
-        TenantCreateResponse, TenantCreateResponseShard, TenantDescribeResponse,
-        TenantDescribeResponseShard, TenantLocateResponse, TenantPolicyRequest,
-        TenantShardMigrateRequest, TenantShardMigrateResponse,
+        NodeSchedulingPolicy, PlacementPolicy, ShardSchedulingPolicy, ShardsPreferredAzsRequest,
+        ShardsPreferredAzsResponse, TenantCreateRequest, TenantCreateResponse,
+        TenantCreateResponseShard, TenantDescribeResponse, TenantDescribeResponseShard,
+        TenantLocateResponse, TenantPolicyRequest, TenantShardMigrateRequest,
+        TenantShardMigrateResponse,
     },
     models::{
         SecondaryProgress, TenantConfigRequest, TimelineArchivalConfigRequest,
@@ -353,6 +354,12 @@ impl From<DatabaseError> for ApiError {
     }
 }
 
+enum InitialShardScheduleOutcome {
+    Scheduled(TenantCreateResponseShard),
+    NotScheduled,
+    ShardScheduleError(ScheduleError),
+}
+
 pub struct Service {
     inner: Arc<std::sync::RwLock<ServiceState>>,
     config: Config,
@@ -1452,6 +1459,7 @@ impl Service {
                 splitting: SplitState::default(),
                 scheduling_policy: serde_json::to_string(&ShardSchedulingPolicy::default())
                     .unwrap(),
+                preferred_az_id: None,
             };
 
             match self.persistence.insert_tenant_shards(vec![tsp]).await {
@@ -2023,6 +2031,7 @@ impl Service {
                 splitting: SplitState::default(),
                 scheduling_policy: serde_json::to_string(&ShardSchedulingPolicy::default())
                     .unwrap(),
+                preferred_az_id: None,
             })
             .collect();
 
@@ -2046,99 +2055,87 @@ impl Service {
         };
 
         let mut schedule_context = ScheduleContext::default();
+        let mut schedule_error = None;
+        let mut response_shards = Vec::new();
+        for tenant_shard_id in create_ids {
+            tracing::info!("Creating shard {tenant_shard_id}...");
 
-        let (waiters, response_shards) = {
+            let outcome = self
+                .do_initial_shard_scheduling(
+                    tenant_shard_id,
+                    initial_generation,
+                    &create_req.shard_parameters,
+                    create_req.config.clone(),
+                    placement_policy.clone(),
+                    &mut schedule_context,
+                )
+                .await;
+
+            match outcome {
+                InitialShardScheduleOutcome::Scheduled(resp) => response_shards.push(resp),
+                InitialShardScheduleOutcome::NotScheduled => {}
+                InitialShardScheduleOutcome::ShardScheduleError(err) => {
+                    schedule_error = Some(err);
+                }
+            }
+        }
+
+        let preferred_azs = {
+            let locked = self.inner.read().unwrap();
+            response_shards
+                .iter()
+                .filter_map(|resp| {
+                    let az_id = locked
+                        .nodes
+                        .get(&resp.node_id)
+                        .map(|n| n.get_availability_zone_id().to_string())?;
+
+                    Some((resp.shard_id, az_id))
+                })
+                .collect::<Vec<_>>()
+        };
+
+        // Note that we persist the preferred AZ for the new shards separately.
+        // In theory, we could "peek" the scheduler to determine where the shard will
+        // land, but the subsequent "real" call into the scheduler might select a different
+        // node. Hence, we do this awkward update to keep things consistent.
+        let updated = self
+            .persistence
+            .set_tenant_shard_preferred_azs(preferred_azs)
+            .await
+            .map_err(|err| {
+                ApiError::InternalServerError(anyhow::anyhow!(
+                    "Failed to persist preferred az ids: {err}"
+                ))
+            })?;
+
+        {
             let mut locked = self.inner.write().unwrap();
-            let (nodes, tenants, scheduler) = locked.parts_mut();
-
-            let mut response_shards = Vec::new();
-            let mut schcedule_error = None;
-
-            for tenant_shard_id in create_ids {
-                tracing::info!("Creating shard {tenant_shard_id}...");
-
-                use std::collections::btree_map::Entry;
-                match tenants.entry(tenant_shard_id) {
-                    Entry::Occupied(mut entry) => {
-                        tracing::info!(
-                            "Tenant shard {tenant_shard_id} already exists while creating"
-                        );
-
-                        // TODO: schedule() should take an anti-affinity expression that pushes
-                        // attached and secondary locations (independently) away frorm those
-                        // pageservers also holding a shard for this tenant.
-
-                        entry
-                            .get_mut()
-                            .schedule(scheduler, &mut schedule_context)
-                            .map_err(|e| {
-                                ApiError::Conflict(format!(
-                                    "Failed to schedule shard {tenant_shard_id}: {e}"
-                                ))
-                            })?;
-
-                        if let Some(node_id) = entry.get().intent.get_attached() {
-                            let generation = entry
-                                .get()
-                                .generation
-                                .expect("Generation is set when in attached mode");
-                            response_shards.push(TenantCreateResponseShard {
-                                shard_id: tenant_shard_id,
-                                node_id: *node_id,
-                                generation: generation.into().unwrap(),
-                            });
-                        }
-
-                        continue;
-                    }
-                    Entry::Vacant(entry) => {
-                        let state = entry.insert(TenantShard::new(
-                            tenant_shard_id,
-                            ShardIdentity::from_params(
-                                tenant_shard_id.shard_number,
-                                &create_req.shard_parameters,
-                            ),
-                            placement_policy.clone(),
-                        ));
-
-                        state.generation = initial_generation;
-                        state.config = create_req.config.clone();
-                        if let Err(e) = state.schedule(scheduler, &mut schedule_context) {
-                            schcedule_error = Some(e);
-                        }
-
-                        // Only include shards in result if we are attaching: the purpose
-                        // of the response is to tell the caller where the shards are attached.
-                        if let Some(node_id) = state.intent.get_attached() {
-                            let generation = state
-                                .generation
-                                .expect("Generation is set when in attached mode");
-                            response_shards.push(TenantCreateResponseShard {
-                                shard_id: tenant_shard_id,
-                                node_id: *node_id,
-                                generation: generation.into().unwrap(),
-                            });
-                        }
-                    }
-                };
+            for (tid, az_id) in updated {
+                if let Some(shard) = locked.tenants.get_mut(&tid) {
+                    shard.set_preferred_az(az_id);
+                }
             }
+        }
 
-            // If we failed to schedule shards, then they are still created in the controller,
-            // but we return an error to the requester to avoid a silent failure when someone
-            // tries to e.g. create a tenant whose placement policy requires more nodes than
-            // are present in the system.  We do this here rather than in the above loop, to
-            // avoid situations where we only create a subset of shards in the tenant.
-            if let Some(e) = schcedule_error {
-                return Err(ApiError::Conflict(format!(
-                    "Failed to schedule shard(s): {e}"
-                )));
-            }
+        // If we failed to schedule shards, then they are still created in the controller,
+        // but we return an error to the requester to avoid a silent failure when someone
+        // tries to e.g. create a tenant whose placement policy requires more nodes than
+        // are present in the system.  We do this here rather than in the above loop, to
+        // avoid situations where we only create a subset of shards in the tenant.
+        if let Some(e) = schedule_error {
+            return Err(ApiError::Conflict(format!(
+                "Failed to schedule shard(s): {e}"
+            )));
+        }
 
-            let waiters = tenants
+        let waiters = {
+            let mut locked = self.inner.write().unwrap();
+            let (nodes, tenants, _scheduler) = locked.parts_mut();
+            tenants
                 .range_mut(TenantShardId::tenant_range(tenant_id))
                 .filter_map(|(_shard_id, shard)| self.maybe_reconcile_shard(shard, nodes))
-                .collect::<Vec<_>>();
-            (waiters, response_shards)
+                .collect::<Vec<_>>()
         };
 
         Ok((
@@ -2149,6 +2146,78 @@ impl Service {
         ))
     }
 
+    /// Helper for tenant creation that does the scheduling for an individual shard. Covers both the
+    /// case of a new tenant and a pre-existing one.
+    async fn do_initial_shard_scheduling(
+        &self,
+        tenant_shard_id: TenantShardId,
+        initial_generation: Option<Generation>,
+        shard_params: &ShardParameters,
+        config: TenantConfig,
+        placement_policy: PlacementPolicy,
+        schedule_context: &mut ScheduleContext,
+    ) -> InitialShardScheduleOutcome {
+        let mut locked = self.inner.write().unwrap();
+        let (_nodes, tenants, scheduler) = locked.parts_mut();
+
+        use std::collections::btree_map::Entry;
+        match tenants.entry(tenant_shard_id) {
+            Entry::Occupied(mut entry) => {
+                tracing::info!("Tenant shard {tenant_shard_id} already exists while creating");
+
+                // TODO: schedule() should take an anti-affinity expression that pushes
+                // attached and secondary locations (independently) away frorm those
+                // pageservers also holding a shard for this tenant.
+
+                if let Err(err) = entry.get_mut().schedule(scheduler, schedule_context) {
+                    return InitialShardScheduleOutcome::ShardScheduleError(err);
+                }
+
+                if let Some(node_id) = entry.get().intent.get_attached() {
+                    let generation = entry
+                        .get()
+                        .generation
+                        .expect("Generation is set when in attached mode");
+                    InitialShardScheduleOutcome::Scheduled(TenantCreateResponseShard {
+                        shard_id: tenant_shard_id,
+                        node_id: *node_id,
+                        generation: generation.into().unwrap(),
+                    })
+                } else {
+                    InitialShardScheduleOutcome::NotScheduled
+                }
+            }
+            Entry::Vacant(entry) => {
+                let state = entry.insert(TenantShard::new(
+                    tenant_shard_id,
+                    ShardIdentity::from_params(tenant_shard_id.shard_number, shard_params),
+                    placement_policy,
+                ));
+
+                state.generation = initial_generation;
+                state.config = config;
+                if let Err(e) = state.schedule(scheduler, schedule_context) {
+                    return InitialShardScheduleOutcome::ShardScheduleError(e);
+                }
+
+                // Only include shards in result if we are attaching: the purpose
+                // of the response is to tell the caller where the shards are attached.
+                if let Some(node_id) = state.intent.get_attached() {
+                    let generation = state
+                        .generation
+                        .expect("Generation is set when in attached mode");
+                    InitialShardScheduleOutcome::Scheduled(TenantCreateResponseShard {
+                        shard_id: tenant_shard_id,
+                        node_id: *node_id,
+                        generation: generation.into().unwrap(),
+                    })
+                } else {
+                    InitialShardScheduleOutcome::NotScheduled
+                }
+            }
+        }
+    }
+
     /// Helper for functions that reconcile a number of shards, and would like to do a timeout-bounded
     /// wait for reconciliation to complete before responding.
     async fn await_waiters(
@@ -3511,6 +3580,7 @@ impl Service {
                 is_pending_compute_notification: shard.pending_compute_notification,
                 is_splitting: matches!(shard.splitting, SplitState::Splitting),
                 scheduling_policy: *shard.get_scheduling_policy(),
+                preferred_az_id: shard.preferred_az().map(ToString::to_string),
             })
         }
 
@@ -4214,9 +4284,10 @@ impl Service {
                     config: serde_json::to_string(&config).unwrap(),
                     splitting: SplitState::Splitting,
 
-                    // Scheduling policies do not carry through to children
+                    // Scheduling policies and preferred AZ do not carry through to children
                     scheduling_policy: serde_json::to_string(&ShardSchedulingPolicy::default())
                         .unwrap(),
+                    preferred_az_id: None,
                 });
             }
 
@@ -4336,6 +4407,47 @@ impl Service {
         let (response, child_locations, waiters) =
             self.tenant_shard_split_commit_inmem(tenant_id, new_shard_count, new_stripe_size);
 
+        // Now that we have scheduled the child shards, attempt to set their preferred AZ
+        // to that of the pageserver they've been attached on.
+        let preferred_azs = {
+            let locked = self.inner.read().unwrap();
+            child_locations
+                .iter()
+                .filter_map(|(tid, node_id, _stripe_size)| {
+                    let az_id = locked
+                        .nodes
+                        .get(node_id)
+                        .map(|n| n.get_availability_zone_id().to_string())?;
+
+                    Some((*tid, az_id))
+                })
+                .collect::<Vec<_>>()
+        };
+
+        let updated = self
+            .persistence
+            .set_tenant_shard_preferred_azs(preferred_azs)
+            .await
+            .map_err(|err| {
+                ApiError::InternalServerError(anyhow::anyhow!(
+                    "Failed to persist preferred az ids: {err}"
+                ))
+            });
+
+        match updated {
+            Ok(updated) => {
+                let mut locked = self.inner.write().unwrap();
+                for (tid, az_id) in updated {
+                    if let Some(shard) = locked.tenants.get_mut(&tid) {
+                        shard.set_preferred_az(az_id);
+                    }
+                }
+            }
+            Err(err) => {
+                tracing::warn!("Failed to persist preferred AZs after split: {err}");
+            }
+        }
+
         // Send compute notifications for all the new shards
         let mut failed_notifications = Vec::new();
         for (child_id, child_ps, stripe_size) in child_locations {
@@ -6497,4 +6609,35 @@ impl Service {
     ) -> Result<(), DatabaseError> {
         self.persistence.safekeeper_upsert(record).await
     }
+
+    pub(crate) async fn update_shards_preferred_azs(
+        &self,
+        req: ShardsPreferredAzsRequest,
+    ) -> Result<ShardsPreferredAzsResponse, ApiError> {
+        let preferred_azs = req.preferred_az_ids.into_iter().collect::<Vec<_>>();
+        let updated = self
+            .persistence
+            .set_tenant_shard_preferred_azs(preferred_azs)
+            .await
+            .map_err(|err| {
+                ApiError::InternalServerError(anyhow::anyhow!(
+                    "Failed to persist preferred AZs: {err}"
+                ))
+            })?;
+
+        let mut updated_in_mem_and_db = Vec::default();
+
+        let mut locked = self.inner.write().unwrap();
+        for (tid, az_id) in updated {
+            let shard = locked.tenants.get_mut(&tid);
+            if let Some(shard) = shard {
+                shard.set_preferred_az(az_id);
+                updated_in_mem_and_db.push(tid);
+            }
+        }
+
+        Ok(ShardsPreferredAzsResponse {
+            updated: updated_in_mem_and_db,
+        })
+    }
 }
diff --git a/storage_controller/src/tenant_shard.rs b/storage_controller/src/tenant_shard.rs
index 30723a3b36..cdb0633e2b 100644
--- a/storage_controller/src/tenant_shard.rs
+++ b/storage_controller/src/tenant_shard.rs
@@ -140,6 +140,10 @@ pub(crate) struct TenantShard {
     // Support/debug tool: if something is going wrong or flapping with scheduling, this may
     // be set to a non-active state to avoid making changes while the issue is fixed.
     scheduling_policy: ShardSchedulingPolicy,
+
+    // We should attempt to schedule this shard in the provided AZ to
+    // decrease chances of cross-AZ compute.
+    preferred_az_id: Option<String>,
 }
 
 #[derive(Default, Clone, Debug, Serialize)]
@@ -463,6 +467,7 @@ impl TenantShard {
             last_error: Arc::default(),
             pending_compute_notification: false,
             scheduling_policy: ShardSchedulingPolicy::default(),
+            preferred_az_id: None,
         }
     }
 
@@ -1297,6 +1302,7 @@ impl TenantShard {
             pending_compute_notification: false,
             delayed_reconcile: false,
             scheduling_policy: serde_json::from_str(&tsp.scheduling_policy).unwrap(),
+            preferred_az_id: tsp.preferred_az_id,
         })
     }
 
@@ -1312,8 +1318,17 @@ impl TenantShard {
             config: serde_json::to_string(&self.config).unwrap(),
             splitting: SplitState::default(),
             scheduling_policy: serde_json::to_string(&self.scheduling_policy).unwrap(),
+            preferred_az_id: self.preferred_az_id.clone(),
         }
     }
+
+    pub(crate) fn preferred_az(&self) -> Option<&str> {
+        self.preferred_az_id.as_deref()
+    }
+
+    pub(crate) fn set_preferred_az(&mut self, preferred_az_id: String) {
+        self.preferred_az_id = Some(preferred_az_id);
+    }
 }
 
 #[cfg(test)]
diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index 0c692ceb69..18fbbde637 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -2560,7 +2560,7 @@ class NeonStorageController(MetricsGetter, LogUtils):
 
     def tenant_describe(self, tenant_id: TenantId):
         """
-        :return: list of {"shard_id": "", "node_id": int, "listen_pg_addr": str, "listen_pg_port": int, "listen_http_addr: str, "listen_http_port: int}
+        :return: list of {"shard_id": "", "node_id": int, "listen_pg_addr": str, "listen_pg_port": int, "listen_http_addr: str, "listen_http_port: int, preferred_az_id: str}
         """
         response = self.request(
             "GET",
@@ -2886,6 +2886,17 @@ class NeonStorageController(MetricsGetter, LogUtils):
                 return None
             raise e
 
+    def set_preferred_azs(self, preferred_azs: dict[TenantShardId, str]) -> list[TenantShardId]:
+        response = self.request(
+            "PUT",
+            f"{self.api}/control/v1/preferred_azs",
+            headers=self.headers(TokenScope.ADMIN),
+            json={str(tid): az for tid, az in preferred_azs.items()},
+        )
+
+        response.raise_for_status()
+        return [TenantShardId.parse(tid) for tid in response.json()["updated"]]
+
     def __enter__(self) -> "NeonStorageController":
         return self
 
diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py
index 8da42294b0..92cd74eba5 100644
--- a/test_runner/regress/test_storage_controller.py
+++ b/test_runner/regress/test_storage_controller.py
@@ -2512,3 +2512,55 @@ def eq_safekeeper_records(a: dict[str, Any], b: dict[str, Any]) -> bool:
                 del d[key]
 
     return compared[0] == compared[1]
+
+
+@run_only_on_default_postgres("this is like a 'unit test' against storcon db")
+def test_shard_preferred_azs(neon_env_builder: NeonEnvBuilder):
+    def assign_az(ps_cfg):
+        az = f"az-{ps_cfg['id']}"
+        ps_cfg["availability_zone"] = az
+
+    neon_env_builder.pageserver_config_override = assign_az
+    neon_env_builder.num_pageservers = 2
+    env = neon_env_builder.init_configs()
+    env.start()
+
+    tids = [TenantId.generate() for _ in range(0, 3)]
+    for tid in tids:
+        env.storage_controller.tenant_create(tid)
+
+        shards = env.storage_controller.tenant_describe(tid)["shards"]
+        assert len(shards) == 1
+        attached_to = shards[0]["node_attached"]
+        expected_az = env.get_pageserver(attached_to).az_id
+
+        assert shards[0]["preferred_az_id"] == expected_az
+
+    updated = env.storage_controller.set_preferred_azs(
+        {TenantShardId(tid, 0, 0): "foo" for tid in tids}
+    )
+
+    assert set(updated) == set([TenantShardId(tid, 0, 0) for tid in tids])
+
+    for tid in tids:
+        shards = env.storage_controller.tenant_describe(tid)["shards"]
+        assert len(shards) == 1
+        assert shards[0]["preferred_az_id"] == "foo"
+
+    # Generate a layer to avoid shard split handling on ps from tripping
+    # up on debug assert.
+    timeline_id = TimelineId.generate()
+    env.neon_cli.create_timeline("bar", tids[0], timeline_id)
+
+    workload = Workload(env, tids[0], timeline_id, branch_name="bar")
+    workload.init()
+    workload.write_rows(256)
+    workload.validate()
+
+    env.storage_controller.tenant_shard_split(tids[0], shard_count=2)
+    shards = env.storage_controller.tenant_describe(tids[0])["shards"]
+    assert len(shards) == 2
+    for shard in shards:
+        attached_to = shard["node_attached"]
+        expected_az = env.get_pageserver(attached_to).az_id
+        assert shard["preferred_az_id"] == expected_az

From cbcd4058edb7a2c2bb3bfe1a6fc1ffb0d820b870 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Arpad=20M=C3=BCller?= <arpad-m@users.noreply.github.com>
Date: Fri, 6 Sep 2024 14:33:52 +0200
Subject: [PATCH 102/159] Fix 1.82 clippy lint too_long_first_doc_paragraph
 (#8941)

Addresses the 1.82 beta clippy lint `too_long_first_doc_paragraph` by
adding newlines to the first sentence if it is short enough, and making
a short first sentence if there is the need.
---
 compute_tools/src/pg_helpers.rs                    |  7 ++++---
 libs/metrics/src/lib.rs                            |  1 +
 libs/pageserver_api/src/controller_api.rs          |  2 ++
 libs/pageserver_api/src/models.rs                  | 10 +++++++---
 libs/postgres_backend/src/lib.rs                   |  6 ++++--
 libs/postgres_connection/src/lib.rs                |  1 +
 libs/remote_storage/src/lib.rs                     |  6 +++++-
 libs/tenant_size_model/src/lib.rs                  |  7 ++++---
 libs/utils/src/circuit_breaker.rs                  |  6 ++++--
 libs/utils/src/id.rs                               |  6 ++++--
 libs/utils/src/lock_file.rs                        |  4 +++-
 libs/utils/src/pageserver_feedback.rs              |  1 +
 libs/utils/src/poison.rs                           |  2 ++
 libs/utils/src/shard.rs                            |  9 +++++----
 libs/utils/src/simple_rcu.rs                       |  7 +++----
 libs/utils/src/sync/heavier_once_cell.rs           |  4 +++-
 libs/utils/src/vec_map.rs                          |  1 +
 libs/utils/src/yielding_loop.rs                    |  7 ++++---
 pageserver/src/config.rs                           |  2 ++
 pageserver/src/context.rs                          | 10 ++++++----
 pageserver/src/pgdatadir_mapping.rs                |  8 +++++---
 pageserver/src/tenant.rs                           |  9 +++++----
 pageserver/src/tenant/metadata.rs                  |  9 +++++----
 pageserver/src/tenant/mgr.rs                       | 12 +++++++-----
 pageserver/src/tenant/remote_timeline_client.rs    |  2 ++
 .../src/tenant/remote_timeline_client/index.rs     |  1 +
 pageserver/src/tenant/storage_layer.rs             |  9 +++++----
 pageserver/src/tenant/storage_layer/delta_layer.rs |  9 +++++----
 pageserver/src/tenant/storage_layer/image_layer.rs |  8 +++++---
 pageserver/src/tenant/storage_layer/layer_desc.rs  |  6 ++++--
 pageserver/src/tenant/storage_layer/layer_name.rs  |  5 +++--
 .../src/tenant/storage_layer/merge_iterator.rs     |  8 +++++---
 .../src/tenant/storage_layer/split_writer.rs       | 14 ++++++++------
 pageserver/src/tenant/vectored_blob_io.rs          |  6 ++++--
 pageserver/src/virtual_file.rs                     |  5 +++--
 pageserver/src/walredo.rs                          | 11 +++++------
 proxy/src/stream.rs                                |  1 +
 safekeeper/src/pull_timeline.rs                    |  1 +
 safekeeper/src/receive_wal.rs                      |  6 ++++--
 safekeeper/src/state.rs                            |  8 +++++---
 safekeeper/src/timeline.rs                         |  1 +
 safekeeper/src/timeline_eviction.rs                |  8 +++++---
 safekeeper/src/timeline_guard.rs                   |  4 +++-
 safekeeper/src/timeline_manager.rs                 |  1 +
 safekeeper/src/timelines_set.rs                    |  3 ++-
 safekeeper/src/wal_backup_partial.rs               |  6 ++++--
 safekeeper/src/wal_service.rs                      |  1 +
 storage_controller/src/service.rs                  |  4 +++-
 storage_scrubber/src/garbage.rs                    |  7 ++++---
 storage_scrubber/src/metadata_stream.rs            |  4 +++-
 storage_scrubber/src/pageserver_physical_gc.rs     |  7 ++++---
 51 files changed, 180 insertions(+), 103 deletions(-)

diff --git a/compute_tools/src/pg_helpers.rs b/compute_tools/src/pg_helpers.rs
index 863fa9468f..b2dc265864 100644
--- a/compute_tools/src/pg_helpers.rs
+++ b/compute_tools/src/pg_helpers.rs
@@ -22,9 +22,10 @@ use compute_api::spec::{Database, GenericOption, GenericOptions, PgIdent, Role};
 
 const POSTGRES_WAIT_TIMEOUT: Duration = Duration::from_millis(60 * 1000); // milliseconds
 
-/// Escape a string for including it in a SQL literal. Wrapping the result
-/// with `E'{}'` or `'{}'` is not required, as it returns a ready-to-use
-/// SQL string literal, e.g. `'db'''` or `E'db\\'`.
+/// Escape a string for including it in a SQL literal.
+///
+/// Wrapping the result with `E'{}'` or `'{}'` is not required,
+/// as it returns a ready-to-use SQL string literal, e.g. `'db'''` or `E'db\\'`.
 /// See <https://github.com/postgres/postgres/blob/da98d005cdbcd45af563d0c4ac86d0e9772cd15f/src/backend/utils/adt/quote.c#L47>
 /// for the original implementation.
 pub fn escape_literal(s: &str) -> String {
diff --git a/libs/metrics/src/lib.rs b/libs/metrics/src/lib.rs
index df000cd0fb..cd4526c089 100644
--- a/libs/metrics/src/lib.rs
+++ b/libs/metrics/src/lib.rs
@@ -68,6 +68,7 @@ macro_rules! register_uint_gauge {
 static INTERNAL_REGISTRY: Lazy<Registry> = Lazy::new(Registry::new);
 
 /// Register a collector in the internal registry. MUST be called before the first call to `gather()`.
+///
 /// Otherwise, we can have a deadlock in the `gather()` call, trying to register a new collector
 /// while holding the lock.
 pub fn register_internal(c: Box<dyn Collector>) -> prometheus::Result<()> {
diff --git a/libs/pageserver_api/src/controller_api.rs b/libs/pageserver_api/src/controller_api.rs
index 94104af002..5c8dcbf571 100644
--- a/libs/pageserver_api/src/controller_api.rs
+++ b/libs/pageserver_api/src/controller_api.rs
@@ -147,6 +147,8 @@ pub struct TenantDescribeResponseShard {
     pub preferred_az_id: Option<String>,
 }
 
+/// Migration request for a given tenant shard to a given node.
+///
 /// Explicitly migrating a particular shard is a low level operation
 /// TODO: higher level "Reschedule tenant" operation where the request
 /// specifies some constraints, e.g. asking it to get off particular node(s)
diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs
index d13d04eb1b..ffe79c8350 100644
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -305,8 +305,10 @@ pub struct TenantConfig {
     pub lsn_lease_length_for_ts: Option<String>,
 }
 
-/// The policy for the aux file storage. It can be switched through `switch_aux_file_policy`
-/// tenant config. When the first aux file written, the policy will be persisted in the
+/// The policy for the aux file storage.
+///
+/// It can be switched through `switch_aux_file_policy` tenant config.
+/// When the first aux file written, the policy will be persisted in the
 /// `index_part.json` file and has a limited migration path.
 ///
 /// Currently, we only allow the following migration path:
@@ -896,7 +898,9 @@ pub struct WalRedoManagerStatus {
     pub process: Option<WalRedoManagerProcessStatus>,
 }
 
-/// The progress of a secondary tenant is mostly useful when doing a long running download: e.g. initiating
+/// The progress of a secondary tenant.
+///
+/// It is mostly useful when doing a long running download: e.g. initiating
 /// a download job, timing out while waiting for it to run, and then inspecting this status to understand
 /// what's happening.
 #[derive(Default, Debug, Serialize, Deserialize, Clone)]
diff --git a/libs/postgres_backend/src/lib.rs b/libs/postgres_backend/src/lib.rs
index 7c7c6535b3..600f1d728c 100644
--- a/libs/postgres_backend/src/lib.rs
+++ b/libs/postgres_backend/src/lib.rs
@@ -69,8 +69,10 @@ impl QueryError {
 }
 
 /// Returns true if the given error is a normal consequence of a network issue,
-/// or the client closing the connection. These errors can happen during normal
-/// operations, and don't indicate a bug in our code.
+/// or the client closing the connection.
+///
+/// These errors can happen during normal operations,
+/// and don't indicate a bug in our code.
 pub fn is_expected_io_error(e: &io::Error) -> bool {
     use io::ErrorKind::*;
     matches!(
diff --git a/libs/postgres_connection/src/lib.rs b/libs/postgres_connection/src/lib.rs
index 9f57f3d507..ddf9f7b610 100644
--- a/libs/postgres_connection/src/lib.rs
+++ b/libs/postgres_connection/src/lib.rs
@@ -7,6 +7,7 @@ use std::fmt;
 use url::Host;
 
 /// Parses a string of format either `host:port` or `host` into a corresponding pair.
+///
 /// The `host` part should be a correct `url::Host`, while `port` (if present) should be
 /// a valid decimal u16 of digits only.
 pub fn parse_host_port<S: AsRef<str>>(host_port: S) -> Result<(Host, Option<u16>), anyhow::Error> {
diff --git a/libs/remote_storage/src/lib.rs b/libs/remote_storage/src/lib.rs
index cc1d3e0ae4..b5b69c9faf 100644
--- a/libs/remote_storage/src/lib.rs
+++ b/libs/remote_storage/src/lib.rs
@@ -45,6 +45,8 @@ pub use azure_core::Etag;
 
 pub use error::{DownloadError, TimeTravelError, TimeoutOrCancel};
 
+/// Default concurrency limit for S3 operations
+///
 /// Currently, sync happens with AWS S3, that has two limits on requests per second:
 /// ~200 RPS for IAM services
 /// <https://docs.aws.amazon.com/AmazonRDS/latest/AuroraUserGuide/UsingWithRDS.IAMDBAuth.html>
@@ -300,7 +302,9 @@ pub trait RemoteStorage: Send + Sync + 'static {
     ) -> Result<(), TimeTravelError>;
 }
 
-/// DownloadStream is sensitive to the timeout and cancellation used with the original
+/// Data part of an ongoing [`Download`].
+///
+/// `DownloadStream` is sensitive to the timeout and cancellation used with the original
 /// [`RemoteStorage::download`] request. The type yields `std::io::Result<Bytes>` to be compatible
 /// with `tokio::io::copy_buf`.
 // This has 'static because safekeepers do not use cancellation tokens (yet)
diff --git a/libs/tenant_size_model/src/lib.rs b/libs/tenant_size_model/src/lib.rs
index a3e12cf0e3..974a498404 100644
--- a/libs/tenant_size_model/src/lib.rs
+++ b/libs/tenant_size_model/src/lib.rs
@@ -5,9 +5,10 @@
 mod calculation;
 pub mod svg;
 
-/// StorageModel is the input to the synthetic size calculation. It represents
-/// a tree of timelines, with just the information that's needed for the
-/// calculation. This doesn't track timeline names or where each timeline
+/// StorageModel is the input to the synthetic size calculation.
+///
+/// It represents a tree of timelines, with just the information that's needed
+/// for the calculation. This doesn't track timeline names or where each timeline
 /// begins and ends, for example. Instead, it consists of "points of interest"
 /// on the timelines. A point of interest could be the timeline start or end point,
 /// the oldest point on a timeline that needs to be retained because of PITR
diff --git a/libs/utils/src/circuit_breaker.rs b/libs/utils/src/circuit_breaker.rs
index 720ea39d4f..e1ddfd8650 100644
--- a/libs/utils/src/circuit_breaker.rs
+++ b/libs/utils/src/circuit_breaker.rs
@@ -5,8 +5,10 @@ use std::{
 
 use metrics::IntCounter;
 
-/// Circuit breakers are for operations that are expensive and fallible: if they fail repeatedly,
-/// we will stop attempting them for some period of time, to avoid denial-of-service from retries, and
+/// Circuit breakers are for operations that are expensive and fallible.
+///
+/// If a circuit breaker fails repeatedly, we will stop attempting it for some
+/// period of time, to avoid denial-of-service from retries, and
 /// to mitigate the log spam from repeated failures.
 pub struct CircuitBreaker {
     /// An identifier that enables us to log useful errors when a circuit is broken
diff --git a/libs/utils/src/id.rs b/libs/utils/src/id.rs
index db468e3054..2cda899b15 100644
--- a/libs/utils/src/id.rs
+++ b/libs/utils/src/id.rs
@@ -249,8 +249,10 @@ macro_rules! id_newtype {
     };
 }
 
-/// Neon timeline IDs are different from PostgreSQL timeline
-/// IDs. They serve a similar purpose though: they differentiate
+/// Neon timeline ID.
+///
+/// They are different from PostgreSQL timeline
+/// IDs, but serve a similar purpose: they differentiate
 /// between different "histories" of the same cluster.  However,
 /// PostgreSQL timeline IDs are a bit cumbersome, because they are only
 /// 32-bits wide, and they must be in ascending order in any given
diff --git a/libs/utils/src/lock_file.rs b/libs/utils/src/lock_file.rs
index 59c66ca757..3a2ed3e830 100644
--- a/libs/utils/src/lock_file.rs
+++ b/libs/utils/src/lock_file.rs
@@ -100,7 +100,9 @@ pub enum LockFileRead {
 }
 
 /// Open & try to lock the lock file at the given `path`, returning a [handle][`LockFileRead`] to
-/// inspect its content. It is not an `Err(...)` if the file does not exist or is already locked.
+/// inspect its content.
+///
+/// It is not an `Err(...)` if the file does not exist or is already locked.
 /// Check the [`LockFileRead`] variants for details.
 pub fn read_and_hold_lock_file(path: &Utf8Path) -> anyhow::Result<LockFileRead> {
     let res = fs::OpenOptions::new().read(true).open(path);
diff --git a/libs/utils/src/pageserver_feedback.rs b/libs/utils/src/pageserver_feedback.rs
index 3ddfa44f41..dede65e699 100644
--- a/libs/utils/src/pageserver_feedback.rs
+++ b/libs/utils/src/pageserver_feedback.rs
@@ -8,6 +8,7 @@ use tracing::{trace, warn};
 use crate::lsn::Lsn;
 
 /// Feedback pageserver sends to safekeeper and safekeeper resends to compute.
+///
 /// Serialized in custom flexible key/value format. In replication protocol, it
 /// is marked with NEON_STATUS_UPDATE_TAG_BYTE to differentiate from postgres
 /// Standby status update / Hot standby feedback messages.
diff --git a/libs/utils/src/poison.rs b/libs/utils/src/poison.rs
index 27378c69fc..c3e2fba20c 100644
--- a/libs/utils/src/poison.rs
+++ b/libs/utils/src/poison.rs
@@ -65,6 +65,8 @@ impl<T> Poison<T> {
     }
 }
 
+/// Armed pointer to a [`Poison`].
+///
 /// Use [`Self::data`] and [`Self::data_mut`] to access the wrapped state.
 /// Once modifications are done, use [`Self::disarm`].
 /// If [`Guard`] gets dropped instead of calling [`Self::disarm`], the state is poisoned
diff --git a/libs/utils/src/shard.rs b/libs/utils/src/shard.rs
index f6b430657e..d146010b41 100644
--- a/libs/utils/src/shard.rs
+++ b/libs/utils/src/shard.rs
@@ -13,10 +13,11 @@ pub struct ShardNumber(pub u8);
 #[derive(Ord, PartialOrd, Eq, PartialEq, Clone, Copy, Serialize, Deserialize, Debug, Hash)]
 pub struct ShardCount(pub u8);
 
-/// Combination of ShardNumber and ShardCount.  For use within the context of a particular tenant,
-/// when we need to know which shard we're dealing with, but do not need to know the full
-/// ShardIdentity (because we won't be doing any page->shard mapping), and do not need to know
-/// the fully qualified TenantShardId.
+/// Combination of ShardNumber and ShardCount.
+///
+/// For use within the context of a particular tenant, when we need to know which shard we're
+/// dealing with, but do not need to know the full ShardIdentity (because we won't be doing
+/// any page->shard mapping), and do not need to know the fully qualified TenantShardId.
 #[derive(Eq, PartialEq, PartialOrd, Ord, Clone, Copy, Hash)]
 pub struct ShardIndex {
     pub shard_number: ShardNumber,
diff --git a/libs/utils/src/simple_rcu.rs b/libs/utils/src/simple_rcu.rs
index ecc5353be3..01750b2aef 100644
--- a/libs/utils/src/simple_rcu.rs
+++ b/libs/utils/src/simple_rcu.rs
@@ -49,12 +49,11 @@ use std::sync::{RwLock, RwLockWriteGuard};
 
 use tokio::sync::watch;
 
-///
 /// Rcu allows multiple readers to read and hold onto a value without blocking
-/// (for very long).  Storing to the Rcu updates the value, making new readers
-/// immediately see the new value, but it also waits for all current readers to
-/// finish.
+/// (for very long).
 ///
+/// Storing to the Rcu updates the value, making new readers immediately see
+/// the new value, but it also waits for all current readers to finish.
 pub struct Rcu<V> {
     inner: RwLock<RcuInner<V>>,
 }
diff --git a/libs/utils/src/sync/heavier_once_cell.rs b/libs/utils/src/sync/heavier_once_cell.rs
index 1abd3d9861..dc711fb028 100644
--- a/libs/utils/src/sync/heavier_once_cell.rs
+++ b/libs/utils/src/sync/heavier_once_cell.rs
@@ -5,7 +5,9 @@ use std::sync::{
 use tokio::sync::Semaphore;
 
 /// Custom design like [`tokio::sync::OnceCell`] but using [`OwnedSemaphorePermit`] instead of
-/// `SemaphorePermit`, allowing use of `take` which does not require holding an outer mutex guard
+/// `SemaphorePermit`.
+///
+/// Allows use of `take` which does not require holding an outer mutex guard
 /// for the duration of initialization.
 ///
 /// Has no unsafe, builds upon [`tokio::sync::Semaphore`] and [`std::sync::Mutex`].
diff --git a/libs/utils/src/vec_map.rs b/libs/utils/src/vec_map.rs
index 18b2af14f1..5f0028bacd 100644
--- a/libs/utils/src/vec_map.rs
+++ b/libs/utils/src/vec_map.rs
@@ -7,6 +7,7 @@ pub enum VecMapOrdering {
 }
 
 /// Ordered map datastructure implemented in a Vec.
+///
 /// Append only - can only add keys that are larger than the
 /// current max key.
 /// Ordering can be adjusted using [`VecMapOrdering`]
diff --git a/libs/utils/src/yielding_loop.rs b/libs/utils/src/yielding_loop.rs
index 41c4cee45d..68274f0631 100644
--- a/libs/utils/src/yielding_loop.rs
+++ b/libs/utils/src/yielding_loop.rs
@@ -6,9 +6,10 @@ pub enum YieldingLoopError {
     Cancelled,
 }
 
-/// Helper for long synchronous loops, e.g. over all tenants in the system.  Periodically
-/// yields to avoid blocking the executor, and after resuming checks the provided
-/// cancellation token to drop out promptly on shutdown.
+/// Helper for long synchronous loops, e.g. over all tenants in the system.
+///
+/// Periodically yields to avoid blocking the executor, and after resuming
+/// checks the provided cancellation token to drop out promptly on shutdown.
 #[inline(always)]
 pub async fn yielding_loop<I, T, F>(
     interval: usize,
diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs
index 4e68e276d3..29a98855d3 100644
--- a/pageserver/src/config.rs
+++ b/pageserver/src/config.rs
@@ -180,6 +180,8 @@ pub struct PageServerConf {
     pub io_buffer_alignment: usize,
 }
 
+/// Token for authentication to safekeepers
+///
 /// We do not want to store this in a PageServerConf because the latter may be logged
 /// and/or serialized at a whim, while the token is secret. Currently this token is the
 /// same for accessing all tenants/timelines, but may become per-tenant/per-timeline in
diff --git a/pageserver/src/context.rs b/pageserver/src/context.rs
index 012cb8d96f..7afcf52cf2 100644
--- a/pageserver/src/context.rs
+++ b/pageserver/src/context.rs
@@ -1,7 +1,9 @@
-//! This module defines `RequestContext`, a structure that we use throughout
-//! the pageserver to propagate high-level context from places
-//! that _originate_ activity down to the shared code paths at the
-//! heart of the pageserver. It's inspired by Golang's `context.Context`.
+//! Defines [`RequestContext`].
+//!
+//! It is a structure that we use throughout the pageserver to propagate
+//! high-level context from places that _originate_ activity down to the
+//! shared code paths at the heart of the pageserver. It's inspired by
+//! Golang's `context.Context`.
 //!
 //! For example, in `Timeline::get(page_nr, lsn)` we need to answer the following questions:
 //! 1. What high-level activity ([`TaskKind`]) needs this page?
diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs
index d28a214265..808d4b666e 100644
--- a/pageserver/src/pgdatadir_mapping.rs
+++ b/pageserver/src/pgdatadir_mapping.rs
@@ -1021,9 +1021,10 @@ impl Timeline {
 }
 
 /// DatadirModification represents an operation to ingest an atomic set of
-/// updates to the repository. It is created by the 'begin_record'
-/// function. It is called for each WAL record, so that all the modifications
-/// by a one WAL record appear atomic.
+/// updates to the repository.
+///
+/// It is created by the 'begin_record' function. It is called for each WAL
+/// record, so that all the modifications by a one WAL record appear atomic.
 pub struct DatadirModification<'a> {
     /// The timeline this modification applies to. You can access this to
     /// read the state, but note that any pending updates are *not* reflected
@@ -2048,6 +2049,7 @@ impl<'a> DatadirModification<'a> {
 
 /// This struct facilitates accessing either a committed key from the timeline at a
 /// specific LSN, or the latest uncommitted key from a pending modification.
+///
 /// During WAL ingestion, the records from multiple LSNs may be batched in the same
 /// modification before being flushed to the timeline. Hence, the routines in WalIngest
 /// need to look up the keys in the modification first before looking them up in the
diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index fb30857ddf..fd2520a42e 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -1,8 +1,9 @@
+//! Timeline repository implementation that keeps old data in layer files, and
+//! the recent changes in ephemeral files.
 //!
-//! Timeline repository implementation that keeps old data in files on disk, and
-//! the recent changes in memory. See tenant/*_layer.rs files.
-//! The functions here are responsible for locating the correct layer for the
-//! get/put call, walking back the timeline branching history as needed.
+//! See tenant/*_layer.rs files. The functions here are responsible for locating
+//! the correct layer for the get/put call, walking back the timeline branching
+//! history as needed.
 //!
 //! The files are stored in the .neon/tenants/<tenant_id>/timelines/<timeline_id>
 //! directory. See docs/pageserver-storage.md for how the files are managed.
diff --git a/pageserver/src/tenant/metadata.rs b/pageserver/src/tenant/metadata.rs
index 190316df42..24440d4b35 100644
--- a/pageserver/src/tenant/metadata.rs
+++ b/pageserver/src/tenant/metadata.rs
@@ -1,7 +1,8 @@
-//! Describes the legacy now hopefully no longer modified per-timeline metadata stored in
-//! `index_part.json` managed by [`remote_timeline_client`]. For many tenants and their timelines,
-//! this struct and it's original serialization format is still needed because they were written a
-//! long time ago.
+//! Describes the legacy now hopefully no longer modified per-timeline metadata.
+//!
+//! It is stored in `index_part.json` managed by [`remote_timeline_client`]. For many tenants and
+//! their timelines, this struct and its original serialization format is still needed because
+//! they were written a long time ago.
 //!
 //! Instead of changing and adding versioning to this, just change [`IndexPart`] with soft json
 //! versioning.
diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs
index 4e6ea0c8f9..2104f41531 100644
--- a/pageserver/src/tenant/mgr.rs
+++ b/pageserver/src/tenant/mgr.rs
@@ -282,9 +282,10 @@ impl BackgroundPurges {
 static TENANTS: Lazy<std::sync::RwLock<TenantsMap>> =
     Lazy::new(|| std::sync::RwLock::new(TenantsMap::Initializing));
 
-/// The TenantManager is responsible for storing and mutating the collection of all tenants
-/// that this pageserver process has state for.  Every Tenant and SecondaryTenant instance
-/// lives inside the TenantManager.
+/// Responsible for storing and mutating the collection of all tenants
+/// that this pageserver has state for.
+///
+/// Every Tenant and SecondaryTenant instance lives inside the TenantManager.
 ///
 /// The most important role of the TenantManager is to prevent conflicts: e.g. trying to attach
 /// the same tenant twice concurrently, or trying to configure the same tenant into secondary
@@ -2346,8 +2347,9 @@ pub enum TenantMapError {
     ShuttingDown,
 }
 
-/// Guards a particular tenant_id's content in the TenantsMap.  While this
-/// structure exists, the TenantsMap will contain a [`TenantSlot::InProgress`]
+/// Guards a particular tenant_id's content in the TenantsMap.
+///
+/// While this structure exists, the TenantsMap will contain a [`TenantSlot::InProgress`]
 /// for this tenant, which acts as a marker for any operations targeting
 /// this tenant to retry later, or wait for the InProgress state to end.
 ///
diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs
index 71b766e4c7..1f9ae40af5 100644
--- a/pageserver/src/tenant/remote_timeline_client.rs
+++ b/pageserver/src/tenant/remote_timeline_client.rs
@@ -2184,6 +2184,8 @@ pub fn remote_timeline_path(
     remote_timelines_path(tenant_shard_id).join(Utf8Path::new(&timeline_id.to_string()))
 }
 
+/// Obtains the path of the given Layer in the remote
+///
 /// Note that the shard component of a remote layer path is _not_ always the same
 /// as in the TenantShardId of the caller: tenants may reference layers from a different
 /// ShardIndex.  Use the ShardIndex from the layer's metadata.
diff --git a/pageserver/src/tenant/remote_timeline_client/index.rs b/pageserver/src/tenant/remote_timeline_client/index.rs
index 757fb9d032..c51ff54919 100644
--- a/pageserver/src/tenant/remote_timeline_client/index.rs
+++ b/pageserver/src/tenant/remote_timeline_client/index.rs
@@ -1,4 +1,5 @@
 //! In-memory index to track the tenant files on the remote storage.
+//!
 //! Able to restore itself from the storage index parts, that are located in every timeline's remote directory and contain all data about
 //! remote timeline layers and its metadata.
 
diff --git a/pageserver/src/tenant/storage_layer.rs b/pageserver/src/tenant/storage_layer.rs
index a1202ad507..dac6b2f893 100644
--- a/pageserver/src/tenant/storage_layer.rs
+++ b/pageserver/src/tenant/storage_layer.rs
@@ -434,10 +434,11 @@ impl ReadableLayer {
     }
 }
 
-/// Layers contain a hint indicating whether they are likely to be used for reads.  This is a hint rather
-/// than an authoritative value, so that we do not have to update it synchronously when changing the visibility
-/// of layers (for example when creating a branch that makes some previously covered layers visible).  It should
-/// be used for cache management but not for correctness-critical checks.
+/// Layers contain a hint indicating whether they are likely to be used for reads.
+///
+/// This is a hint rather than an authoritative value, so that we do not have to update it synchronously
+/// when changing the visibility of layers (for example when creating a branch that makes some previously
+/// covered layers visible).  It should be used for cache management but not for correctness-critical checks.
 #[derive(Debug, Clone, PartialEq, Eq)]
 pub enum LayerVisibilityHint {
     /// A Visible layer might be read while serving a read, because there is not an image layer between it
diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs
index 6a2cd94232..34f1b15138 100644
--- a/pageserver/src/tenant/storage_layer/delta_layer.rs
+++ b/pageserver/src/tenant/storage_layer/delta_layer.rs
@@ -136,10 +136,11 @@ impl Summary {
 // Flag indicating that this version initialize the page
 const WILL_INIT: u64 = 1;
 
-/// Struct representing reference to BLOB in layers. Reference contains BLOB
-/// offset, and for WAL records it also contains `will_init` flag. The flag
-/// helps to determine the range of records that needs to be applied, without
-/// reading/deserializing records themselves.
+/// Struct representing reference to BLOB in layers.
+///
+/// Reference contains BLOB offset, and for WAL records it also contains
+/// `will_init` flag. The flag helps to determine the range of records
+/// that needs to be applied, without reading/deserializing records themselves.
 #[derive(Debug, Serialize, Deserialize, Copy, Clone)]
 pub struct BlobRef(pub u64);
 
diff --git a/pageserver/src/tenant/storage_layer/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs
index 77ce1ae670..875e223c9c 100644
--- a/pageserver/src/tenant/storage_layer/image_layer.rs
+++ b/pageserver/src/tenant/storage_layer/image_layer.rs
@@ -1,7 +1,9 @@
 //! An ImageLayer represents an image or a snapshot of a key-range at
-//! one particular LSN. It contains an image of all key-value pairs
-//! in its key-range. Any key that falls into the image layer's range
-//! but does not exist in the layer, does not exist.
+//! one particular LSN.
+//!
+//! It contains an image of all key-value pairs in its key-range. Any key
+//! that falls into the image layer's range but does not exist in the layer,
+//! does not exist.
 //!
 //! An image layer is stored in a file on disk. The file is stored in
 //! timelines/<timeline_id> directory.  Currently, there are no
diff --git a/pageserver/src/tenant/storage_layer/layer_desc.rs b/pageserver/src/tenant/storage_layer/layer_desc.rs
index cbd18e650f..e90ff3c4b2 100644
--- a/pageserver/src/tenant/storage_layer/layer_desc.rs
+++ b/pageserver/src/tenant/storage_layer/layer_desc.rs
@@ -12,8 +12,10 @@ use serde::{Deserialize, Serialize};
 #[cfg(test)]
 use utils::id::TenantId;
 
-/// A unique identifier of a persistent layer. This is different from `LayerDescriptor`, which is only used in the
-/// benchmarks. This struct contains all necessary information to find the image / delta layer. It also provides
+/// A unique identifier of a persistent layer.
+///
+/// This is different from `LayerDescriptor`, which is only used in the benchmarks.
+/// This struct contains all necessary information to find the image / delta layer. It also provides
 /// a unified way to generate layer information like file name.
 #[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize, Hash)]
 pub struct PersistentLayerDesc {
diff --git a/pageserver/src/tenant/storage_layer/layer_name.rs b/pageserver/src/tenant/storage_layer/layer_name.rs
index 47ae556279..ffe7ca5f3e 100644
--- a/pageserver/src/tenant/storage_layer/layer_name.rs
+++ b/pageserver/src/tenant/storage_layer/layer_name.rs
@@ -217,8 +217,9 @@ impl fmt::Display for ImageLayerName {
     }
 }
 
-/// LayerName is the logical identity of a layer within a LayerMap at a moment in time.  The
-/// LayerName is not a unique filename, as the same LayerName may have multiple physical incarnations
+/// LayerName is the logical identity of a layer within a LayerMap at a moment in time.
+///
+/// The LayerName is not a unique filename, as the same LayerName may have multiple physical incarnations
 /// over time (e.g. across shard splits or compression). The physical filenames of layers in local
 /// storage and object names in remote storage consist of the LayerName plus some extra qualifiers
 /// that uniquely identify the physical incarnation of a layer (see [crate::tenant::remote_timeline_client::remote_layer_path])
diff --git a/pageserver/src/tenant/storage_layer/merge_iterator.rs b/pageserver/src/tenant/storage_layer/merge_iterator.rs
index d2c341e5ce..0831fd9530 100644
--- a/pageserver/src/tenant/storage_layer/merge_iterator.rs
+++ b/pageserver/src/tenant/storage_layer/merge_iterator.rs
@@ -226,9 +226,11 @@ impl<'a> IteratorWrapper<'a> {
     }
 }
 
-/// A merge iterator over delta/image layer iterators. When duplicated records are
-/// found, the iterator will not perform any deduplication, and the caller should handle
-/// these situation. By saying duplicated records, there are many possibilities:
+/// A merge iterator over delta/image layer iterators.
+///
+/// When duplicated records are found, the iterator will not perform any
+/// deduplication, and the caller should handle these situation. By saying
+/// duplicated records, there are many possibilities:
 ///
 /// * Two same delta at the same LSN.
 /// * Two same image at the same LSN.
diff --git a/pageserver/src/tenant/storage_layer/split_writer.rs b/pageserver/src/tenant/storage_layer/split_writer.rs
index e8deb0a1e5..7c1ac863bf 100644
--- a/pageserver/src/tenant/storage_layer/split_writer.rs
+++ b/pageserver/src/tenant/storage_layer/split_writer.rs
@@ -34,9 +34,10 @@ impl SplitWriterResult {
     }
 }
 
-/// An image writer that takes images and produces multiple image layers. The interface does not
-/// guarantee atomicity (i.e., if the image layer generation fails, there might be leftover files
-/// to be cleaned up)
+/// An image writer that takes images and produces multiple image layers.
+///
+/// The interface does not guarantee atomicity (i.e., if the image layer generation
+/// fails, there might be leftover files to be cleaned up)
 #[must_use]
 pub struct SplitImageLayerWriter {
     inner: ImageLayerWriter,
@@ -193,9 +194,10 @@ impl SplitImageLayerWriter {
     }
 }
 
-/// A delta writer that takes key-lsn-values and produces multiple delta layers. The interface does not
-/// guarantee atomicity (i.e., if the delta layer generation fails, there might be leftover files
-/// to be cleaned up).
+/// A delta writer that takes key-lsn-values and produces multiple delta layers.
+///
+/// The interface does not guarantee atomicity (i.e., if the delta layer generation fails,
+/// there might be leftover files to be cleaned up).
 ///
 /// Note that if updates of a single key exceed the target size limit, all of the updates will be batched
 /// into a single file. This behavior might change in the future. For reference, the legacy compaction algorithm
diff --git a/pageserver/src/tenant/vectored_blob_io.rs b/pageserver/src/tenant/vectored_blob_io.rs
index 4d51dc442d..553edf6d8b 100644
--- a/pageserver/src/tenant/vectored_blob_io.rs
+++ b/pageserver/src/tenant/vectored_blob_io.rs
@@ -593,8 +593,10 @@ impl<'a> VectoredBlobReader<'a> {
     }
 }
 
-/// Read planner used in [`crate::tenant::storage_layer::image_layer::ImageLayerIterator`]. It provides a streaming API for
-/// getting read blobs. It returns a batch when `handle` gets called and when the current key would just exceed the read_size and
+/// Read planner used in [`crate::tenant::storage_layer::image_layer::ImageLayerIterator`].
+///
+/// It provides a streaming API for getting read blobs. It returns a batch when
+/// `handle` gets called and when the current key would just exceed the read_size and
 /// max_cnt constraints.
 pub struct StreamingVectoredReadPlanner {
     read_builder: Option<VectoredReadBuilder>,
diff --git a/pageserver/src/virtual_file.rs b/pageserver/src/virtual_file.rs
index ed6ff86c10..57856eea80 100644
--- a/pageserver/src/virtual_file.rs
+++ b/pageserver/src/virtual_file.rs
@@ -1,6 +1,7 @@
-//!
 //! VirtualFile is like a normal File, but it's not bound directly to
-//! a file descriptor. Instead, the file is opened when it's read from,
+//! a file descriptor.
+//!
+//! Instead, the file is opened when it's read from,
 //! and if too many files are open globally in the system, least-recently
 //! used ones are closed.
 //!
diff --git a/pageserver/src/walredo.rs b/pageserver/src/walredo.rs
index 82585f9ed8..a36955fa21 100644
--- a/pageserver/src/walredo.rs
+++ b/pageserver/src/walredo.rs
@@ -43,13 +43,12 @@ use utils::lsn::Lsn;
 use utils::sync::gate::GateError;
 use utils::sync::heavier_once_cell;
 
+/// The real implementation that uses a Postgres process to
+/// perform WAL replay.
 ///
-/// This is the real implementation that uses a Postgres process to
-/// perform WAL replay. Only one thread can use the process at a time,
-/// that is controlled by the Mutex. In the future, we might want to
-/// launch a pool of processes to allow concurrent replay of multiple
-/// records.
-///
+/// Only one thread can use the process at a time, that is controlled by the
+/// Mutex. In the future, we might want to launch a pool of processes to allow
+/// concurrent replay of multiple records.
 pub struct PostgresRedoManager {
     tenant_shard_id: TenantShardId,
     conf: &'static PageServerConf,
diff --git a/proxy/src/stream.rs b/proxy/src/stream.rs
index 332dc27787..c14dd18afe 100644
--- a/proxy/src/stream.rs
+++ b/proxy/src/stream.rs
@@ -14,6 +14,7 @@ use tokio::io::{AsyncRead, AsyncWrite, ReadBuf};
 use tokio_rustls::server::TlsStream;
 
 /// Stream wrapper which implements libpq's protocol.
+///
 /// NOTE: This object deliberately doesn't implement [`AsyncRead`]
 /// or [`AsyncWrite`] to prevent subtle errors (e.g. trying
 /// to pass random malformed bytes through the connection).
diff --git a/safekeeper/src/pull_timeline.rs b/safekeeper/src/pull_timeline.rs
index 600a6bd8f0..64585f5edc 100644
--- a/safekeeper/src/pull_timeline.rs
+++ b/safekeeper/src/pull_timeline.rs
@@ -484,6 +484,7 @@ pub async fn validate_temp_timeline(
 }
 
 /// Move timeline from a temp directory to the main storage, and load it to the global map.
+///
 /// This operation is done under a lock to prevent bugs if several concurrent requests are
 /// trying to load the same timeline. Note that it doesn't guard against creating the
 /// timeline with the same ttid, but no one should be doing this anyway.
diff --git a/safekeeper/src/receive_wal.rs b/safekeeper/src/receive_wal.rs
index ab8c76dc17..e35f806e90 100644
--- a/safekeeper/src/receive_wal.rs
+++ b/safekeeper/src/receive_wal.rs
@@ -448,8 +448,10 @@ async fn network_write<IO: AsyncRead + AsyncWrite + Unpin>(
 const KEEPALIVE_INTERVAL: Duration = Duration::from_secs(1);
 
 /// Encapsulates a task which takes messages from msg_rx, processes and pushes
-/// replies to reply_tx; reading from socket and writing to disk in parallel is
-/// beneficial for performance, this struct provides writing to disk part.
+/// replies to reply_tx.
+///
+/// Reading from socket and writing to disk in parallel is beneficial for
+/// performance, this struct provides the writing to disk part.
 pub struct WalAcceptor {
     tli: WalResidentTimeline,
     msg_rx: Receiver<ProposerAcceptorMessage>,
diff --git a/safekeeper/src/state.rs b/safekeeper/src/state.rs
index dca6414082..97eeae3638 100644
--- a/safekeeper/src/state.rs
+++ b/safekeeper/src/state.rs
@@ -147,9 +147,11 @@ pub struct TimelineMemState {
     pub proposer_uuid: PgUuid,
 }
 
-/// Safekeeper persistent state plus in memory layer, to avoid frequent fsyncs
-/// when we update fields like commit_lsn which don't need immediate
-/// persistence. Provides transactional like API to atomically update the state.
+/// Safekeeper persistent state plus in memory layer.
+///
+/// Allows us to avoid frequent fsyncs when we update fields like commit_lsn
+/// which don't need immediate persistence. Provides transactional like API
+/// to atomically update the state.
 ///
 /// Implements Deref into *persistent* part.
 pub struct TimelineState<CTRL: control_file::Storage> {
diff --git a/safekeeper/src/timeline.rs b/safekeeper/src/timeline.rs
index 95ee925e1a..6fd5de0ad6 100644
--- a/safekeeper/src/timeline.rs
+++ b/safekeeper/src/timeline.rs
@@ -169,6 +169,7 @@ impl<'a> Drop for WriteGuardSharedState<'a> {
 }
 
 /// This structure is stored in shared state and represents the state of the timeline.
+///
 /// Usually it holds SafeKeeper, but it also supports offloaded timeline state. In this
 /// case, SafeKeeper is not available (because WAL is not present on disk) and all
 /// operations can be done only with control file.
diff --git a/safekeeper/src/timeline_eviction.rs b/safekeeper/src/timeline_eviction.rs
index 5d0567575c..5aa4921a92 100644
--- a/safekeeper/src/timeline_eviction.rs
+++ b/safekeeper/src/timeline_eviction.rs
@@ -1,6 +1,8 @@
-//! Code related to evicting WAL files to remote storage. The actual upload is done by the
-//! partial WAL backup code. This file has code to delete and re-download WAL files,
-//! cross-validate with partial WAL backup if local file is still present.
+//! Code related to evicting WAL files to remote storage.
+//!
+//! The actual upload is done by the partial WAL backup code. This file has
+//! code to delete and re-download WAL files, cross-validate with partial WAL
+//! backup if local file is still present.
 
 use anyhow::Context;
 use camino::Utf8PathBuf;
diff --git a/safekeeper/src/timeline_guard.rs b/safekeeper/src/timeline_guard.rs
index dbdf46412d..1ddac573d2 100644
--- a/safekeeper/src/timeline_guard.rs
+++ b/safekeeper/src/timeline_guard.rs
@@ -1,4 +1,6 @@
-//! Timeline residence guard is needed to ensure that WAL segments are present on disk,
+//! Timeline residence guard
+//!
+//! It is needed to ensure that WAL segments are present on disk,
 //! as long as the code is holding the guard. This file implements guard logic, to issue
 //! and drop guards, and to notify the manager when the guard is dropped.
 
diff --git a/safekeeper/src/timeline_manager.rs b/safekeeper/src/timeline_manager.rs
index f997f48454..6be75479db 100644
--- a/safekeeper/src/timeline_manager.rs
+++ b/safekeeper/src/timeline_manager.rs
@@ -1,4 +1,5 @@
 //! The timeline manager task is responsible for managing the timeline's background tasks.
+//!
 //! It is spawned alongside each timeline and exits when the timeline is deleted.
 //! It watches for changes in the timeline state and decides when to spawn or kill background tasks.
 //! It also can manage some reactive state, like should the timeline be active for broker pushes or not.
diff --git a/safekeeper/src/timelines_set.rs b/safekeeper/src/timelines_set.rs
index d6eea79f82..096e348295 100644
--- a/safekeeper/src/timelines_set.rs
+++ b/safekeeper/src/timelines_set.rs
@@ -60,7 +60,8 @@ impl TimelinesSet {
     }
 }
 
-/// Guard is used to add or remove timeline from the set.
+/// Guard is used to add or remove timelines from the set.
+///
 /// If the timeline present in set, it will be removed from it on drop.
 /// Note: do not use more than one guard for the same timeline, it caches the presence state.
 /// It is designed to be used in the manager task only.
diff --git a/safekeeper/src/wal_backup_partial.rs b/safekeeper/src/wal_backup_partial.rs
index 4050a82fff..bddfca50e4 100644
--- a/safekeeper/src/wal_backup_partial.rs
+++ b/safekeeper/src/wal_backup_partial.rs
@@ -1,6 +1,8 @@
 //! Safekeeper timeline has a background task which is subscribed to `commit_lsn`
-//! and `flush_lsn` updates. After the partial segment was updated (`flush_lsn`
-//! was changed), the segment will be uploaded to S3 in about 15 minutes.
+//! and `flush_lsn` updates.
+//!
+//! After the partial segment was updated (`flush_lsn` was changed), the segment
+//! will be uploaded to S3 within the configured `partial_backup_timeout`.
 //!
 //! The filename format for partial segments is
 //! `Segment_Term_Flush_Commit_skNN.partial`, where:
diff --git a/safekeeper/src/wal_service.rs b/safekeeper/src/wal_service.rs
index 16f7748eb4..1ab54d4cce 100644
--- a/safekeeper/src/wal_service.rs
+++ b/safekeeper/src/wal_service.rs
@@ -17,6 +17,7 @@ use crate::SafeKeeperConf;
 use postgres_backend::{AuthType, PostgresBackend};
 
 /// Accept incoming TCP connections and spawn them into a background thread.
+///
 /// allowed_auth_scope is either SafekeeperData (wide JWT tokens giving access
 /// to any tenant are allowed) or Tenant (only tokens giving access to specific
 /// tenant are allowed). Doesn't matter if auth is disabled in conf.
diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs
index 324f864291..e7eae647df 100644
--- a/storage_controller/src/service.rs
+++ b/storage_controller/src/service.rs
@@ -117,7 +117,9 @@ pub(crate) const STARTUP_RECONCILE_TIMEOUT: Duration = Duration::from_secs(30);
 pub const MAX_OFFLINE_INTERVAL_DEFAULT: Duration = Duration::from_secs(30);
 
 /// How long a node may be unresponsive to heartbeats during start up before we declare it
-/// offline. This is much more lenient than [`MAX_OFFLINE_INTERVAL_DEFAULT`] since the pageserver's
+/// offline.
+///
+/// This is much more lenient than [`MAX_OFFLINE_INTERVAL_DEFAULT`] since the pageserver's
 /// handling of the re-attach response may take a long time and blocks heartbeats from
 /// being handled on the pageserver side.
 pub const MAX_WARMING_UP_INTERVAL_DEFAULT: Duration = Duration::from_secs(300);
diff --git a/storage_scrubber/src/garbage.rs b/storage_scrubber/src/garbage.rs
index 3e22960f8d..d53611ed6e 100644
--- a/storage_scrubber/src/garbage.rs
+++ b/storage_scrubber/src/garbage.rs
@@ -1,6 +1,7 @@
-//! Functionality for finding and purging garbage, as in "garbage collection".  Garbage means
-//! S3 objects which are either not referenced by any metadata, or are referenced by a
-//! control plane tenant/timeline in a deleted state.
+//! Functionality for finding and purging garbage, as in "garbage collection".
+//!
+//! Garbage means S3 objects which are either not referenced by any metadata,
+//! or are referenced by a control plane tenant/timeline in a deleted state.
 
 use std::{
     collections::{HashMap, HashSet},
diff --git a/storage_scrubber/src/metadata_stream.rs b/storage_scrubber/src/metadata_stream.rs
index 10d77937f1..f896cff2d5 100644
--- a/storage_scrubber/src/metadata_stream.rs
+++ b/storage_scrubber/src/metadata_stream.rs
@@ -74,7 +74,9 @@ pub async fn stream_tenant_shards<'a>(
 }
 
 /// Given a `TenantShardId`, output a stream of the timelines within that tenant, discovered
-/// using a listing. The listing is done before the stream is built, so that this
+/// using a listing.
+///
+/// The listing is done before the stream is built, so that this
 /// function can be used to generate concurrency on a stream using buffer_unordered.
 pub async fn stream_tenant_timelines<'a>(
     remote_client: &'a GenericRemoteStorage,
diff --git a/storage_scrubber/src/pageserver_physical_gc.rs b/storage_scrubber/src/pageserver_physical_gc.rs
index 88681e38c2..c96d9cad3b 100644
--- a/storage_scrubber/src/pageserver_physical_gc.rs
+++ b/storage_scrubber/src/pageserver_physical_gc.rs
@@ -440,9 +440,10 @@ async fn gc_ancestor(
     Ok(())
 }
 
-/// Physical garbage collection: removing unused S3 objects.  This is distinct from the garbage collection
-/// done inside the pageserver, which operates at a higher level (keys, layers).  This type of garbage collection
-/// is about removing:
+/// Physical garbage collection: removing unused S3 objects.
+///
+/// This is distinct from the garbage collection done inside the pageserver, which operates at a higher level
+/// (keys, layers).  This type of garbage collection is about removing:
 /// - Objects that were uploaded but never referenced in the remote index (e.g. because of a shutdown between
 ///   uploading a layer and uploading an index)
 /// - Index objects from historic generations

From e287f36a058221b7c804b4b0f440933962eb3deb Mon Sep 17 00:00:00 2001
From: Arseny Sher <sher-ars@yandex.ru>
Date: Fri, 6 Sep 2024 15:23:57 +0300
Subject: [PATCH 103/159] safekeeper: fix endpoint restart immediately after
 xlog switch.

Check that truncation point is not from the future by comparing it with
write_record_lsn, not write_lsn, and explain that xlog switch changes
their normal order.

ref https://github.com/neondatabase/neon/issues/8911
---
 safekeeper/src/safekeeper.rs             |  3 ++-
 safekeeper/src/wal_storage.rs            | 23 ++++++++++++++++++-----
 test_runner/regress/test_wal_acceptor.py | 18 ++++++++++++++++++
 3 files changed, 38 insertions(+), 6 deletions(-)

diff --git a/safekeeper/src/safekeeper.rs b/safekeeper/src/safekeeper.rs
index dbe0034de2..b3e006ab05 100644
--- a/safekeeper/src/safekeeper.rs
+++ b/safekeeper/src/safekeeper.rs
@@ -938,8 +938,9 @@ where
         }
 
         trace!(
-            "processed AppendRequest of len {}, end_lsn={:?}, commit_lsn={:?}, truncate_lsn={:?}, flushed={:?}",
+            "processed AppendRequest of len {}, begin_lsn={}, end_lsn={:?}, commit_lsn={:?}, truncate_lsn={:?}, flushed={:?}",
             msg.wal_data.len(),
+            msg.h.begin_lsn,
             msg.h.end_lsn,
             msg.h.commit_lsn,
             msg.h.truncate_lsn,
diff --git a/safekeeper/src/wal_storage.rs b/safekeeper/src/wal_storage.rs
index 89c2e98a94..c477fe5c7b 100644
--- a/safekeeper/src/wal_storage.rs
+++ b/safekeeper/src/wal_storage.rs
@@ -98,7 +98,19 @@ pub struct PhysicalStorage {
     /// Also can be ahead of record_lsn, if happen to be in the middle of a WAL record.
     write_lsn: Lsn,
 
-    /// The LSN of the last WAL record written to disk. Still can be not fully flushed.
+    /// The LSN of the last WAL record written to disk. Still can be not fully
+    /// flushed.
+    ///
+    /// Note: Normally it (and flush_record_lsn) is <= write_lsn, but after xlog
+    /// switch ingest the reverse is true because we don't bump write_lsn up to
+    /// the next segment: WAL stream from the compute doesn't have the gap and
+    /// for simplicity / as a sanity check we disallow any non-sequential
+    /// writes, so write zeros as is.
+    ///
+    /// Similar effect is in theory possible due to LSN alignment: if record
+    /// ends at *2, decoder will report end lsn as *8 even though we haven't
+    /// written these zeros yet. In practice compute likely never sends
+    /// non-aligned chunks of data.
     write_record_lsn: Lsn,
 
     /// The LSN of the last WAL record flushed to disk.
@@ -440,11 +452,12 @@ impl Storage for PhysicalStorage {
             .with_label_values(&["truncate_wal"])
             .start_timer();
 
-        // Streaming must not create a hole, so truncate cannot be called on non-written lsn
-        if self.write_lsn != Lsn(0) && end_pos > self.write_lsn {
+        // Streaming must not create a hole, so truncate cannot be called on
+        // non-written lsn.
+        if self.write_record_lsn != Lsn(0) && end_pos > self.write_record_lsn {
             bail!(
-                "truncate_wal called on non-written WAL, write_lsn={}, end_pos={}",
-                self.write_lsn,
+                "truncate_wal called on non-written WAL, write_record_lsn={}, end_pos={}",
+                self.write_record_lsn,
                 end_pos
             );
         }
diff --git a/test_runner/regress/test_wal_acceptor.py b/test_runner/regress/test_wal_acceptor.py
index 3785651aed..5672e836ee 100644
--- a/test_runner/regress/test_wal_acceptor.py
+++ b/test_runner/regress/test_wal_acceptor.py
@@ -1057,6 +1057,24 @@ def test_restart_endpoint(neon_env_builder: NeonEnvBuilder):
         endpoint.start()
 
 
+# Try restarting endpoint immediately after xlog switch.
+# https://github.com/neondatabase/neon/issues/8911
+def test_restart_endpoint_after_switch_wal(neon_env_builder: NeonEnvBuilder):
+    env = neon_env_builder.init_start()
+
+    endpoint = env.endpoints.create_start("main")
+
+    endpoint.safe_psql("create table t (i int)")
+
+    endpoint.safe_psql("SELECT pg_switch_wal()")
+
+    # we want immediate shutdown to have endpoint restart on xlog switch record,
+    # so prevent shutdown checkpoint.
+    endpoint.stop(mode="immediate")
+    endpoint = env.endpoints.create_start("main")
+    endpoint.safe_psql("SELECT 'works'")
+
+
 # Context manager which logs passed time on exit.
 class DurationLogger:
     def __init__(self, desc):

From af6f63617e7421fca62ad2bf7ebfe2f0de66a793 Mon Sep 17 00:00:00 2001
From: Folke Behrens <folke@neon.tech>
Date: Fri, 6 Sep 2024 17:13:30 +0200
Subject: [PATCH 104/159] proxy: clean up code and lints for 1.81 and 1.82
 (#8945)

---
 proxy/src/cache/timed_lru.rs          |  2 +-
 proxy/src/lib.rs                      | 25 +++++++++++++++----------
 proxy/src/scram/exchange.rs           |  2 ++
 proxy/src/serverless/sql_over_http.rs |  9 +++------
 4 files changed, 21 insertions(+), 17 deletions(-)

diff --git a/proxy/src/cache/timed_lru.rs b/proxy/src/cache/timed_lru.rs
index 8bb482f7c6..5b08d74696 100644
--- a/proxy/src/cache/timed_lru.rs
+++ b/proxy/src/cache/timed_lru.rs
@@ -16,7 +16,7 @@ use tracing::debug;
 // On the other hand, `hashlink` has good download stats and appears to be maintained.
 use hashlink::{linked_hash_map::RawEntryMut, LruCache};
 
-use super::{common::Cached, *};
+use super::{common::Cached, timed_lru, Cache};
 
 /// An implementation of timed LRU cache with fixed capacity.
 /// Key properties:
diff --git a/proxy/src/lib.rs b/proxy/src/lib.rs
index 8d7e586b3d..923d6ae288 100644
--- a/proxy/src/lib.rs
+++ b/proxy/src/lib.rs
@@ -44,16 +44,14 @@
     clippy::items_after_statements,
 )]
 // List of temporarily allowed lints.
-// TODO: Switch to except() once stable with 1.81.
 // TODO: fix code and reduce list or move to permanent list above.
-#![allow(
+#![expect(
     clippy::cargo_common_metadata,
     clippy::cast_possible_truncation,
     clippy::cast_possible_wrap,
     clippy::cast_precision_loss,
     clippy::cast_sign_loss,
     clippy::doc_markdown,
-    clippy::implicit_hasher,
     clippy::inline_always,
     clippy::match_same_arms,
     clippy::match_wild_err_arm,
@@ -61,21 +59,28 @@
     clippy::missing_panics_doc,
     clippy::module_name_repetitions,
     clippy::needless_pass_by_value,
-    clippy::needless_raw_string_hashes,
     clippy::redundant_closure_for_method_calls,
-    clippy::return_self_not_must_use,
     clippy::similar_names,
     clippy::single_match_else,
     clippy::struct_excessive_bools,
     clippy::struct_field_names,
     clippy::too_many_lines,
-    clippy::unreadable_literal,
-    clippy::unused_async,
-    clippy::unused_self,
-    clippy::wildcard_imports
+    clippy::unused_self
+)]
+#![cfg_attr(
+    any(test, feature = "testing"),
+    allow(
+        clippy::needless_raw_string_hashes,
+        clippy::unreadable_literal,
+        clippy::unused_async,
+    )
 )]
 // List of temporarily allowed lints to unblock beta/nightly.
-#![allow(unknown_lints, clippy::manual_inspect)]
+#![allow(
+    unknown_lints,
+    // TODO: 1.82: Add `use<T>` where necessary and remove from this list.
+    impl_trait_overcaptures,
+)]
 
 use std::{convert::Infallible, future::Future};
 
diff --git a/proxy/src/scram/exchange.rs b/proxy/src/scram/exchange.rs
index 786cbcaa19..afb5604666 100644
--- a/proxy/src/scram/exchange.rs
+++ b/proxy/src/scram/exchange.rs
@@ -217,6 +217,7 @@ impl sasl::Mechanism for Exchange<'_> {
                         self.state = ExchangeState::SaltSent(sent);
                         Ok(Step::Continue(self, msg))
                     }
+                    #[allow(unreachable_patterns)] // TODO: 1.82: simply drop this match
                     Step::Success(x, _) => match x {},
                     Step::Failure(msg) => Ok(Step::Failure(msg)),
                 }
@@ -224,6 +225,7 @@ impl sasl::Mechanism for Exchange<'_> {
             ExchangeState::SaltSent(sent) => {
                 match sent.transition(self.secret, &self.tls_server_end_point, input)? {
                     Step::Success(keys, msg) => Ok(Step::Success(keys, msg)),
+                    #[allow(unreachable_patterns)] // TODO: 1.82: simply drop this match
                     Step::Continue(x, _) => match x {},
                     Step::Failure(msg) => Ok(Step::Failure(msg)),
                 }
diff --git a/proxy/src/serverless/sql_over_http.rs b/proxy/src/serverless/sql_over_http.rs
index 5b36f5e91d..2188edc8c5 100644
--- a/proxy/src/serverless/sql_over_http.rs
+++ b/proxy/src/serverless/sql_over_http.rs
@@ -745,22 +745,20 @@ impl BatchQueryData {
             builder = builder.deferrable(true);
         }
 
-        let transaction = builder.start().await.map_err(|e| {
+        let transaction = builder.start().await.inspect_err(|_| {
             // if we cannot start a transaction, we should return immediately
             // and not return to the pool. connection is clearly broken
             discard.discard();
-            e
         })?;
 
         let json_output =
             match query_batch(cancel.child_token(), &transaction, self, parsed_headers).await {
                 Ok(json_output) => {
                     info!("commit");
-                    let status = transaction.commit().await.map_err(|e| {
+                    let status = transaction.commit().await.inspect_err(|_| {
                         // if we cannot commit - for now don't return connection to pool
                         // TODO: get a query status from the error
                         discard.discard();
-                        e
                     })?;
                     discard.check_idle(status);
                     json_output
@@ -776,11 +774,10 @@ impl BatchQueryData {
                 }
                 Err(err) => {
                     info!("rollback");
-                    let status = transaction.rollback().await.map_err(|e| {
+                    let status = transaction.rollback().await.inspect_err(|_| {
                         // if we cannot rollback - for now don't return connection to pool
                         // TODO: get a query status from the error
                         discard.discard();
-                        e
                     })?;
                     discard.check_idle(status);
                     return Err(err);

From 11cf16e3f363ce027e53b1834a77858d50daee0d Mon Sep 17 00:00:00 2001
From: Arseny Sher <sher-ars@yandex.ru>
Date: Mon, 19 Aug 2024 14:42:07 +0300
Subject: [PATCH 105/159] safekeeper: add term_bump endpoint.

When walproposer observes now higher term it restarts instead of
crashing whole compute with PANIC; this avoids compute crash after
term_bump call. After successfull election we're still checking
last_log_term of the highest given vote to ensure basebackup is good,
and PANIC otherwise.

It will be used for migration per
035-safekeeper-dynamic-membership-change.md
and
https://github.com/neondatabase/docs/pull/21

ref https://github.com/neondatabase/neon/issues/8700
---
 libs/safekeeper_api/src/models.rs        | 13 +++++++++
 pgxn/neon/walproposer.c                  | 24 ++++++++++-----
 safekeeper/src/auth.rs                   |  3 ++
 safekeeper/src/http/routes.rs            | 28 +++++++++++++++++-
 safekeeper/src/state.rs                  | 26 +++++++++++++++--
 safekeeper/src/timeline.rs               | 10 +++++++
 test_runner/fixtures/safekeeper/http.py  | 29 +++++++++++++++++++
 test_runner/regress/test_wal_acceptor.py | 37 ++++++++++++++++++++++++
 8 files changed, 159 insertions(+), 11 deletions(-)

diff --git a/libs/safekeeper_api/src/models.rs b/libs/safekeeper_api/src/models.rs
index 2fbc333075..28666d197a 100644
--- a/libs/safekeeper_api/src/models.rs
+++ b/libs/safekeeper_api/src/models.rs
@@ -60,3 +60,16 @@ pub struct TimelineCopyRequest {
     pub target_timeline_id: TimelineId,
     pub until_lsn: Lsn,
 }
+
+#[derive(Debug, Clone, Deserialize, Serialize)]
+pub struct TimelineTermBumpRequest {
+    /// bump to
+    pub term: Option<u64>,
+}
+
+#[derive(Debug, Clone, Deserialize, Serialize)]
+pub struct TimelineTermBumpResponse {
+    // before the request
+    pub previous_term: u64,
+    pub current_term: u64,
+}
diff --git a/pgxn/neon/walproposer.c b/pgxn/neon/walproposer.c
index c53257923a..c1914421ec 100644
--- a/pgxn/neon/walproposer.c
+++ b/pgxn/neon/walproposer.c
@@ -1038,9 +1038,12 @@ DetermineEpochStartLsn(WalProposer *wp)
 		if (SkipXLogPageHeader(wp, wp->propEpochStartLsn) != wp->api.get_redo_start_lsn(wp))
 		{
 			/*
-			 * However, allow to proceed if previously elected leader was me;
-			 * plain restart of walproposer not intervened by concurrent
-			 * compute (who could generate WAL) is ok.
+			 * However, allow to proceed if last_log_term on the node which gave
+			 * the highest vote (i.e. point where we are going to start writing)
+			 * actually had been won by me; plain restart of walproposer not
+			 * intervened by concurrent compute which wrote WAL is ok.
+			 *
+			 * This avoids compute crash after manual term_bump.
 			 */
 			if (!((dth->n_entries >= 1) && (dth->entries[dth->n_entries - 1].term ==
 											pg_atomic_read_u64(&walprop_shared->mineLastElectedTerm))))
@@ -1442,12 +1445,17 @@ RecvAppendResponses(Safekeeper *sk)
 		if (sk->appendResponse.term > wp->propTerm)
 		{
 			/*
-			 * Another compute with higher term is running. Panic to restart
-			 * PG as we likely need to retake basebackup. However, don't dump
-			 * core as this is kinda expected scenario.
+			 *
+			 * Term has changed to higher one, probably another compute is
+			 * running. If this is the case we could PANIC as well because
+			 * likely it inserted some data and our basebackup is unsuitable
+			 * anymore. However, we also bump term manually (term_bump endpoint)
+			 * on safekeepers for migration purposes, in this case we do want
+			 * compute to stay alive. So restart walproposer with FATAL instead
+			 * of panicking; if basebackup is spoiled next election will notice
+			 * this.
 			 */
-			disable_core_dump();
-			wp_log(PANIC, "WAL acceptor %s:%s with term " INT64_FORMAT " rejected our request, our term " INT64_FORMAT ", meaning another compute is running at the same time, and it conflicts with us",
+			wp_log(FATAL, "WAL acceptor %s:%s with term " INT64_FORMAT " rejected our request, our term " INT64_FORMAT ", meaning another compute is running at the same time, and it conflicts with us",
 				   sk->host, sk->port,
 				   sk->appendResponse.term, wp->propTerm);
 		}
diff --git a/safekeeper/src/auth.rs b/safekeeper/src/auth.rs
index b8bc3f3e06..c5c9393c00 100644
--- a/safekeeper/src/auth.rs
+++ b/safekeeper/src/auth.rs
@@ -1,6 +1,9 @@
 use utils::auth::{AuthError, Claims, Scope};
 use utils::id::TenantId;
 
+/// If tenant_id is provided, allow if token (claims) is for this tenant or
+/// whole safekeeper scope (SafekeeperData). Else, allow only if token is
+/// SafekeeperData.
 pub fn check_permission(claims: &Claims, tenant_id: Option<TenantId>) -> Result<(), AuthError> {
     match (&claims.scope, tenant_id) {
         (Scope::Tenant, None) => Err(AuthError(
diff --git a/safekeeper/src/http/routes.rs b/safekeeper/src/http/routes.rs
index 9b7424a818..e482edea55 100644
--- a/safekeeper/src/http/routes.rs
+++ b/safekeeper/src/http/routes.rs
@@ -18,8 +18,8 @@ use utils::http::endpoint::{prometheus_metrics_handler, request_span, ChannelWri
 use utils::http::request::parse_query_param;
 
 use postgres_ffi::WAL_SEGMENT_SIZE;
-use safekeeper_api::models::TimelineCreateRequest;
 use safekeeper_api::models::{SkTimelineInfo, TimelineCopyRequest};
+use safekeeper_api::models::{TimelineCreateRequest, TimelineTermBumpRequest};
 use utils::{
     auth::SwappableJwtAuth,
     http::{
@@ -408,6 +408,28 @@ async fn timeline_backup_partial_reset(request: Request<Body>) -> Result<Respons
     json_response(StatusCode::OK, response)
 }
 
+/// Make term at least as high as one in request. If one in request is None,
+/// increment current one.
+async fn timeline_term_bump_handler(
+    mut request: Request<Body>,
+) -> Result<Response<Body>, ApiError> {
+    let ttid = TenantTimelineId::new(
+        parse_request_param(&request, "tenant_id")?,
+        parse_request_param(&request, "timeline_id")?,
+    );
+    check_permission(&request, Some(ttid.tenant_id))?;
+
+    let request_data: TimelineTermBumpRequest = json_request(&mut request).await?;
+
+    let tli = GlobalTimelines::get(ttid).map_err(ApiError::from)?;
+    let response = tli
+        .term_bump(request_data.term)
+        .await
+        .map_err(ApiError::InternalServerError)?;
+
+    json_response(StatusCode::OK, response)
+}
+
 /// Used only in tests to hand craft required data.
 async fn record_safekeeper_info(mut request: Request<Body>) -> Result<Response<Body>, ApiError> {
     let ttid = TenantTimelineId::new(
@@ -630,6 +652,10 @@ pub fn make_router(conf: SafeKeeperConf) -> RouterBuilder<hyper::Body, ApiError>
             "/v1/tenant/:tenant_id/timeline/:timeline_id/backup_partial_reset",
             |r| request_span(r, timeline_backup_partial_reset),
         )
+        .post(
+            "/v1/tenant/:tenant_id/timeline/:timeline_id/term_bump",
+            |r| request_span(r, timeline_term_bump_handler),
+        )
         .post("/v1/record_safekeeper_info/:tenant_id/:timeline_id", |r| {
             request_span(r, record_safekeeper_info)
         })
diff --git a/safekeeper/src/state.rs b/safekeeper/src/state.rs
index 97eeae3638..8ae749ded5 100644
--- a/safekeeper/src/state.rs
+++ b/safekeeper/src/state.rs
@@ -1,9 +1,10 @@
 //! Defines per timeline data stored persistently (SafeKeeperPersistentState)
 //! and its wrapper with in memory layer (SafekeeperState).
 
-use std::ops::Deref;
+use std::{cmp::max, ops::Deref};
 
 use anyhow::Result;
+use safekeeper_api::models::TimelineTermBumpResponse;
 use serde::{Deserialize, Serialize};
 use utils::{
     id::{NodeId, TenantId, TenantTimelineId, TimelineId},
@@ -12,7 +13,7 @@ use utils::{
 
 use crate::{
     control_file,
-    safekeeper::{AcceptorState, PersistedPeerInfo, PgUuid, ServerInfo, TermHistory},
+    safekeeper::{AcceptorState, PersistedPeerInfo, PgUuid, ServerInfo, Term, TermHistory},
     wal_backup_partial::{self},
 };
 
@@ -211,6 +212,27 @@ where
         let s = self.start_change();
         self.finish_change(&s).await
     }
+
+    /// Make term at least as `to`. If `to` is None, increment current one. This
+    /// is not in safekeeper.rs because we want to be able to do it even if
+    /// timeline is offloaded.
+    pub async fn term_bump(&mut self, to: Option<Term>) -> Result<TimelineTermBumpResponse> {
+        let before = self.acceptor_state.term;
+        let mut state = self.start_change();
+        let new = match to {
+            Some(to) => max(state.acceptor_state.term, to),
+            None => state.acceptor_state.term + 1,
+        };
+        if new > state.acceptor_state.term {
+            state.acceptor_state.term = new;
+            self.finish_change(&state).await?;
+        }
+        let after = self.acceptor_state.term;
+        Ok(TimelineTermBumpResponse {
+            previous_term: before,
+            current_term: after,
+        })
+    }
 }
 
 impl<CTRL> Deref for TimelineState<CTRL>
diff --git a/safekeeper/src/timeline.rs b/safekeeper/src/timeline.rs
index 6fd5de0ad6..fb98534768 100644
--- a/safekeeper/src/timeline.rs
+++ b/safekeeper/src/timeline.rs
@@ -4,6 +4,7 @@
 use anyhow::{anyhow, bail, Result};
 use camino::Utf8PathBuf;
 use remote_storage::RemotePath;
+use safekeeper_api::models::TimelineTermBumpResponse;
 use serde::{Deserialize, Serialize};
 use tokio::fs::{self};
 use tokio_util::sync::CancellationToken;
@@ -215,6 +216,10 @@ impl StateSK {
             .get_last_log_term(self.flush_lsn())
     }
 
+    pub async fn term_bump(&mut self, to: Option<Term>) -> Result<TimelineTermBumpResponse> {
+        self.state_mut().term_bump(to).await
+    }
+
     /// Close open WAL files to release FDs.
     fn close_wal_store(&mut self) {
         if let StateSK::Loaded(sk) = self {
@@ -854,6 +859,11 @@ impl Timeline {
         Ok(res)
     }
 
+    pub async fn term_bump(self: &Arc<Self>, to: Option<Term>) -> Result<TimelineTermBumpResponse> {
+        let mut state = self.write_shared_state().await;
+        state.sk.term_bump(to).await
+    }
+
     /// Get the timeline guard for reading/writing WAL files.
     /// If WAL files are not present on disk (evicted), they will be automatically
     /// downloaded from remote storage. This is done in the manager task, which is
diff --git a/test_runner/fixtures/safekeeper/http.py b/test_runner/fixtures/safekeeper/http.py
index 9bf03554e7..96c84d1616 100644
--- a/test_runner/fixtures/safekeeper/http.py
+++ b/test_runner/fixtures/safekeeper/http.py
@@ -50,6 +50,19 @@ class SafekeeperMetrics(Metrics):
         ).value
 
 
+@dataclass
+class TermBumpResponse:
+    previous_term: int
+    current_term: int
+
+    @classmethod
+    def from_json(cls, d: Dict[str, Any]) -> "TermBumpResponse":
+        return TermBumpResponse(
+            previous_term=d["previous_term"],
+            current_term=d["current_term"],
+        )
+
+
 class SafekeeperHttpClient(requests.Session, MetricsGetter):
     HTTPError = requests.HTTPError
 
@@ -252,6 +265,22 @@ class SafekeeperHttpClient(requests.Session, MetricsGetter):
         res.raise_for_status()
         return res.json()
 
+    def term_bump(
+        self,
+        tenant_id: TenantId,
+        timeline_id: TimelineId,
+        term: Optional[int],
+    ) -> TermBumpResponse:
+        body = {}
+        if term is not None:
+            body["term"] = term
+        res = self.post(
+            f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/term_bump",
+            json=body,
+        )
+        res.raise_for_status()
+        return TermBumpResponse.from_json(res.json())
+
     def record_safekeeper_info(self, tenant_id: TenantId, timeline_id: TimelineId, body):
         res = self.post(
             f"http://localhost:{self.port}/v1/record_safekeeper_info/{tenant_id}/{timeline_id}",
diff --git a/test_runner/regress/test_wal_acceptor.py b/test_runner/regress/test_wal_acceptor.py
index 5672e836ee..50fac441c0 100644
--- a/test_runner/regress/test_wal_acceptor.py
+++ b/test_runner/regress/test_wal_acceptor.py
@@ -2194,6 +2194,43 @@ def test_patch_control_file(neon_env_builder: NeonEnvBuilder):
     assert res["timelines"][0]["control_file"]["timeline_start_lsn"] == "0/1"
 
 
+def test_term_bump(neon_env_builder: NeonEnvBuilder):
+    neon_env_builder.num_safekeepers = 1
+    env = neon_env_builder.init_start()
+
+    tenant_id = env.initial_tenant
+    timeline_id = env.initial_timeline
+
+    endpoint = env.endpoints.create_start("main")
+    # initialize safekeeper
+    endpoint.safe_psql("create table t(key int, value text)")
+
+    http_cli = env.safekeepers[0].http_client()
+
+    # check that bump up to specific term works
+    curr_term = http_cli.timeline_status(tenant_id, timeline_id).term
+    bump_to = curr_term + 3
+    res = http_cli.term_bump(tenant_id, timeline_id, bump_to)
+    log.info(f"bump to {bump_to} res: {res}")
+    assert res.current_term >= bump_to
+
+    # check that bump to none increments current term
+    res = http_cli.term_bump(tenant_id, timeline_id, None)
+    log.info(f"bump to None res: {res}")
+    assert res.current_term > bump_to
+    assert res.current_term > res.previous_term
+
+    # check that bumping doesn't work downward
+    res = http_cli.term_bump(tenant_id, timeline_id, 2)
+    log.info(f"bump to 2 res: {res}")
+    assert res.current_term > bump_to
+    assert res.current_term == res.previous_term
+
+    # check that this doesn't kill endpoint because last WAL flush was his and
+    # thus its basebackup is still good
+    endpoint.safe_psql("insert into t values (1, 'payload')")
+
+
 # Test disables periodic pushes from safekeeper to the broker and checks that
 # pageserver can still discover safekeepers with discovery requests.
 def test_broker_discovery(neon_env_builder: NeonEnvBuilder):

From 8eab7009c11ebc03f09b2f3916e642664a4b9f88 Mon Sep 17 00:00:00 2001
From: Arseny Sher <sher-ars@yandex.ru>
Date: Fri, 6 Sep 2024 16:54:45 +0300
Subject: [PATCH 106/159] safekeeper: do pid file lock before id init

---
 safekeeper/src/bin/safekeeper.rs | 19 +++++++++----------
 1 file changed, 9 insertions(+), 10 deletions(-)

diff --git a/safekeeper/src/bin/safekeeper.rs b/safekeeper/src/bin/safekeeper.rs
index 41c2d3fe08..644d5e6eaf 100644
--- a/safekeeper/src/bin/safekeeper.rs
+++ b/safekeeper/src/bin/safekeeper.rs
@@ -261,6 +261,15 @@ async fn main() -> anyhow::Result<()> {
     // Change into the data directory.
     std::env::set_current_dir(&workdir)?;
 
+    // Prevent running multiple safekeepers on the same directory
+    let lock_file_path = workdir.join(PID_FILE_NAME);
+    let lock_file =
+        pid_file::claim_for_current_process(&lock_file_path).context("claim pid file")?;
+    info!("claimed pid file at {lock_file_path:?}");
+    // ensure that the lock file is held even if the main thread of the process is panics
+    // we need to release the lock file only when the current process is gone
+    std::mem::forget(lock_file);
+
     // Set or read our ID.
     let id = set_id(&workdir, args.id.map(NodeId))?;
     if args.init {
@@ -364,16 +373,6 @@ async fn main() -> anyhow::Result<()> {
 type JoinTaskRes = Result<anyhow::Result<()>, JoinError>;
 
 async fn start_safekeeper(conf: SafeKeeperConf) -> Result<()> {
-    // Prevent running multiple safekeepers on the same directory
-    let lock_file_path = conf.workdir.join(PID_FILE_NAME);
-    let lock_file =
-        pid_file::claim_for_current_process(&lock_file_path).context("claim pid file")?;
-    info!("claimed pid file at {lock_file_path:?}");
-
-    // ensure that the lock file is held even if the main thread of the process is panics
-    // we need to release the lock file only when the current process is gone
-    std::mem::forget(lock_file);
-
     info!("starting safekeeper WAL service on {}", conf.listen_pg_addr);
     let pg_listener = tcp_listener::bind(conf.listen_pg_addr.clone()).map_err(|e| {
         error!("failed to bind to address {}: {}", conf.listen_pg_addr, e);

From c1a51416dbfadbf05cc352168ebc6fc4a83c6f59 Mon Sep 17 00:00:00 2001
From: Arseny Sher <sher-ars@yandex.ru>
Date: Fri, 6 Sep 2024 17:40:21 +0300
Subject: [PATCH 107/159] safekeeper: fsync filesystem on start.

We can't really rely on files contents after boot without fsync'ing
them.
---
 libs/utils/src/crashsafe.rs      | 21 +++++++++++++++++++++
 pageserver/src/bin/pageserver.rs | 19 ++-----------------
 safekeeper/src/bin/safekeeper.rs | 12 +++++++++++-
 safekeeper/src/wal_storage.rs    |  3 +--
 4 files changed, 35 insertions(+), 20 deletions(-)

diff --git a/libs/utils/src/crashsafe.rs b/libs/utils/src/crashsafe.rs
index 756b19138c..946fedf6e5 100644
--- a/libs/utils/src/crashsafe.rs
+++ b/libs/utils/src/crashsafe.rs
@@ -1,9 +1,11 @@
+use std::os::fd::AsRawFd;
 use std::{
     borrow::Cow,
     fs::{self, File},
     io::{self, Write},
 };
 
+use anyhow::Context;
 use camino::{Utf8Path, Utf8PathBuf};
 
 /// Similar to [`std::fs::create_dir`], except we fsync the
@@ -203,6 +205,25 @@ pub fn overwrite(
     Ok(())
 }
 
+/// Syncs the filesystem for the given file descriptor.
+pub fn syncfs(fd: impl AsRawFd) -> anyhow::Result<()> {
+    // Linux guarantees durability for syncfs.
+    // POSIX doesn't have syncfs, and further does not actually guarantee durability of sync().
+    #[cfg(target_os = "linux")]
+    {
+        nix::unistd::syncfs(fd.as_raw_fd()).context("syncfs")?;
+    }
+    #[cfg(target_os = "macos")]
+    {
+        // macOS is not a production platform for Neon, don't even bother.
+    }
+    #[cfg(not(any(target_os = "linux", target_os = "macos")))]
+    {
+        compile_error!("Unsupported OS");
+    }
+    Ok(())
+}
+
 #[cfg(test)]
 mod tests {
 
diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs
index 59194ab4bd..d15a0e47a4 100644
--- a/pageserver/src/bin/pageserver.rs
+++ b/pageserver/src/bin/pageserver.rs
@@ -37,6 +37,7 @@ use pageserver::{
     virtual_file,
 };
 use postgres_backend::AuthType;
+use utils::crashsafe::syncfs;
 use utils::failpoint_support;
 use utils::logging::TracingErrorLayerEnablement;
 use utils::{
@@ -155,23 +156,7 @@ fn main() -> anyhow::Result<()> {
         };
 
         let started = Instant::now();
-        // Linux guarantees durability for syncfs.
-        // POSIX doesn't have syncfs, and further does not actually guarantee durability of sync().
-        #[cfg(target_os = "linux")]
-        {
-            use std::os::fd::AsRawFd;
-            nix::unistd::syncfs(dirfd.as_raw_fd()).context("syncfs")?;
-        }
-        #[cfg(target_os = "macos")]
-        {
-            // macOS is not a production platform for Neon, don't even bother.
-            drop(dirfd);
-        }
-        #[cfg(not(any(target_os = "linux", target_os = "macos")))]
-        {
-            compile_error!("Unsupported OS");
-        }
-
+        syncfs(dirfd)?;
         let elapsed = started.elapsed();
         info!(
             elapsed_ms = elapsed.as_millis(),
diff --git a/safekeeper/src/bin/safekeeper.rs b/safekeeper/src/bin/safekeeper.rs
index 644d5e6eaf..5270934f5e 100644
--- a/safekeeper/src/bin/safekeeper.rs
+++ b/safekeeper/src/bin/safekeeper.rs
@@ -19,7 +19,7 @@ use std::fs::{self, File};
 use std::io::{ErrorKind, Write};
 use std::str::FromStr;
 use std::sync::Arc;
-use std::time::Duration;
+use std::time::{Duration, Instant};
 use storage_broker::Uri;
 
 use tracing::*;
@@ -373,6 +373,16 @@ async fn main() -> anyhow::Result<()> {
 type JoinTaskRes = Result<anyhow::Result<()>, JoinError>;
 
 async fn start_safekeeper(conf: SafeKeeperConf) -> Result<()> {
+    // fsync the datadir to make sure we have a consistent state on disk.
+    let dfd = File::open(&conf.workdir).context("open datadir for syncfs")?;
+    let started = Instant::now();
+    utils::crashsafe::syncfs(dfd)?;
+    let elapsed = started.elapsed();
+    info!(
+        elapsed_ms = elapsed.as_millis(),
+        "syncfs data directory done"
+    );
+
     info!("starting safekeeper WAL service on {}", conf.listen_pg_addr);
     let pg_listener = tcp_listener::bind(conf.listen_pg_addr.clone()).map_err(|e| {
         error!("failed to bind to address {}: {}", conf.listen_pg_addr, e);
diff --git a/safekeeper/src/wal_storage.rs b/safekeeper/src/wal_storage.rs
index c477fe5c7b..46c260901d 100644
--- a/safekeeper/src/wal_storage.rs
+++ b/safekeeper/src/wal_storage.rs
@@ -179,8 +179,7 @@ impl PhysicalStorage {
             )
         };
 
-        // TODO: do we really know that write_lsn is fully flushed to disk?
-        //      If not, maybe it's better to call fsync() here to be sure?
+        // note: this assumes we fsync'ed whole datadir on start.
         let flush_lsn = write_lsn;
 
         debug!(

From 30583cb6264653175d659d0fcb636a42e21c5877 Mon Sep 17 00:00:00 2001
From: Alexander Bayandin <alexander@neon.tech>
Date: Fri, 6 Sep 2024 17:42:35 +0100
Subject: [PATCH 108/159] CI(label-for-external-users): add retry logic for
 unexpected errors (#8938)

## Problem

One of the PRs opened by a `neondatabase` org member got labelled as
`external` because the `gh api` call failed in the wrong way:
```
Get "https://api.github.com/orgs/neondatabase/members/<username>": dial tcp 140.82.114.5:443: i/o timeout
is-member=false
```

## Summary of changes
- Check that the error message is expected before labelling PRs
- Retry `gh api` call for 10 times in case of unexpected error messages
- Add `workflow_dispatch` trigger
---
 .../workflows/label-for-external-users.yml    | 34 ++++++++++++++++---
 1 file changed, 29 insertions(+), 5 deletions(-)

diff --git a/.github/workflows/label-for-external-users.yml b/.github/workflows/label-for-external-users.yml
index 585d118dfb..b7cbc06a73 100644
--- a/.github/workflows/label-for-external-users.yml
+++ b/.github/workflows/label-for-external-users.yml
@@ -7,6 +7,11 @@ on:
   pull_request_target:
     types:
       - opened
+  workflow_dispatch:
+    inputs:
+      github-actor:
+        description: 'GitHub username. If empty, the username of the current user will be used'
+        required: false
 
 # No permission for GITHUB_TOKEN by default; the **minimal required** set of permissions should be granted in each job.
 permissions: {}
@@ -26,12 +31,31 @@ jobs:
       id: check-user
       env:
         GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }}
+        ACTOR: ${{ inputs.github-actor || github.actor }}
       run: |
-        if gh api -H "Accept: application/vnd.github+json" -H "X-GitHub-Api-Version: 2022-11-28" "/orgs/${GITHUB_REPOSITORY_OWNER}/members/${GITHUB_ACTOR}"; then
-          is_member=true
-        else
-          is_member=false
-        fi
+        expected_error="User does not exist or is not a member of the organization"
+        output_file=output.txt
+
+        for i in $(seq 1 10); do
+          if gh api "/orgs/${GITHUB_REPOSITORY_OWNER}/members/${ACTOR}" \
+              -H "Accept: application/vnd.github+json" \
+              -H "X-GitHub-Api-Version: 2022-11-28" > ${output_file}; then
+
+            is_member=true
+            break
+          elif grep -q "${expected_error}" ${output_file}; then
+            is_member=false
+            break
+          elif [ $i -eq 10 ]; then
+            title="Failed to get memmbership status for ${ACTOR}"
+            message="The latest GitHub API error message: '$(cat ${output_file})'"
+            echo "::error file=.github/workflows/label-for-external-users.yml,title=${title}::${message}"
+
+            exit 1
+          fi
+
+          sleep 1
+        done
 
         echo "is-member=${is_member}" | tee -a ${GITHUB_OUTPUT}
 

From ac5815b5940c412a281c6bbab34809689a738da7 Mon Sep 17 00:00:00 2001
From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com>
Date: Sat, 7 Sep 2024 02:14:21 +0800
Subject: [PATCH 109/159] feat(storage-controller): add node shards api (#8896)

For control-plane managed tenants, we have the page in the admin console
that lists all tenants on a specific pageserver. But for
storage-controller managed ones, we don't have that functionality for
now.

## Summary of changes

Adds an API that lists all shards on a given node (intention + observed)

---------

Signed-off-by: Alex Chi Z <chi@neon.tech>
---
 control_plane/storcon_cli/src/main.rs         | 45 +++++++++++++++--
 libs/pageserver_api/src/controller_api.rs     | 15 ++++++
 storage_controller/src/http.rs                | 18 +++++++
 storage_controller/src/service.rs             | 49 +++++++++++++++++--
 test_runner/fixtures/common_types.py          |  8 +++
 test_runner/fixtures/neon_fixtures.py         | 26 +++++++++-
 .../regress/test_storage_controller.py        |  6 +++
 7 files changed, 157 insertions(+), 10 deletions(-)

diff --git a/control_plane/storcon_cli/src/main.rs b/control_plane/storcon_cli/src/main.rs
index 2a81a3d825..651fcda8db 100644
--- a/control_plane/storcon_cli/src/main.rs
+++ b/control_plane/storcon_cli/src/main.rs
@@ -4,8 +4,8 @@ use std::{str::FromStr, time::Duration};
 use clap::{Parser, Subcommand};
 use pageserver_api::{
     controller_api::{
-        NodeAvailabilityWrapper, NodeDescribeResponse, ShardSchedulingPolicy, TenantCreateRequest,
-        TenantDescribeResponse, TenantPolicyRequest,
+        NodeAvailabilityWrapper, NodeDescribeResponse, NodeShardResponse, ShardSchedulingPolicy,
+        TenantCreateRequest, TenantDescribeResponse, TenantPolicyRequest,
     },
     models::{
         EvictionPolicy, EvictionPolicyLayerAccessThreshold, LocationConfigSecondary,
@@ -80,7 +80,10 @@ enum Command {
     /// List nodes known to the storage controller
     Nodes {},
     /// List tenants known to the storage controller
-    Tenants {},
+    Tenants {
+        /// If this field is set, it will list the tenants on a specific node
+        node_id: Option<NodeId>,
+    },
     /// Create a new tenant in the storage controller, and by extension on pageservers.
     TenantCreate {
         #[arg(long)]
@@ -403,7 +406,41 @@ async fn main() -> anyhow::Result<()> {
                 )
                 .await?;
         }
-        Command::Tenants {} => {
+        Command::Tenants {
+            node_id: Some(node_id),
+        } => {
+            let describe_response = storcon_client
+                .dispatch::<(), NodeShardResponse>(
+                    Method::GET,
+                    format!("control/v1/node/{node_id}/shards"),
+                    None,
+                )
+                .await?;
+            let shards = describe_response.shards;
+            let mut table = comfy_table::Table::new();
+            table.set_header([
+                "Shard",
+                "Intended Primary/Secondary",
+                "Observed Primary/Secondary",
+            ]);
+            for shard in shards {
+                table.add_row([
+                    format!("{}", shard.tenant_shard_id),
+                    match shard.is_intended_secondary {
+                        None => "".to_string(),
+                        Some(true) => "Secondary".to_string(),
+                        Some(false) => "Primary".to_string(),
+                    },
+                    match shard.is_observed_secondary {
+                        None => "".to_string(),
+                        Some(true) => "Secondary".to_string(),
+                        Some(false) => "Primary".to_string(),
+                    },
+                ]);
+            }
+            println!("{table}");
+        }
+        Command::Tenants { node_id: None } => {
             let mut resp = storcon_client
                 .dispatch::<(), Vec<TenantDescribeResponse>>(
                     Method::GET,
diff --git a/libs/pageserver_api/src/controller_api.rs b/libs/pageserver_api/src/controller_api.rs
index 5c8dcbf571..40b7dbbbc2 100644
--- a/libs/pageserver_api/src/controller_api.rs
+++ b/libs/pageserver_api/src/controller_api.rs
@@ -112,6 +112,21 @@ pub struct TenantDescribeResponse {
     pub config: TenantConfig,
 }
 
+#[derive(Serialize, Deserialize, Debug)]
+pub struct NodeShardResponse {
+    pub node_id: NodeId,
+    pub shards: Vec<NodeShard>,
+}
+
+#[derive(Serialize, Deserialize, Debug)]
+pub struct NodeShard {
+    pub tenant_shard_id: TenantShardId,
+    /// Whether the shard is observed secondary on a specific node. True = yes, False = no, None = not on this node.
+    pub is_observed_secondary: Option<bool>,
+    /// Whether the shard is intended to be a secondary on a specific node. True = yes, False = no, None = not on this node.
+    pub is_intended_secondary: Option<bool>,
+}
+
 #[derive(Serialize, Deserialize)]
 pub struct NodeDescribeResponse {
     pub id: NodeId,
diff --git a/storage_controller/src/http.rs b/storage_controller/src/http.rs
index 5d4d0460be..96bdd5039d 100644
--- a/storage_controller/src/http.rs
+++ b/storage_controller/src/http.rs
@@ -539,6 +539,17 @@ async fn handle_node_status(req: Request<Body>) -> Result<Response<Body>, ApiErr
     json_response(StatusCode::OK, node_status)
 }
 
+async fn handle_node_shards(req: Request<Body>) -> Result<Response<Body>, ApiError> {
+    check_permissions(&req, Scope::Admin)?;
+
+    let state = get_state(&req);
+    let node_id: NodeId = parse_request_param(&req, "node_id")?;
+
+    let node_status = state.service.get_node_shards(node_id).await?;
+
+    json_response(StatusCode::OK, node_status)
+}
+
 async fn handle_get_leader(req: Request<Body>) -> Result<Response<Body>, ApiError> {
     check_permissions(&req, Scope::Admin)?;
 
@@ -1109,6 +1120,13 @@ pub fn make_router(
         .get("/control/v1/node/:node_id", |r| {
             named_request_span(r, handle_node_status, RequestName("control_v1_node_status"))
         })
+        .get("/control/v1/node/:node_id/shards", |r| {
+            named_request_span(
+                r,
+                handle_node_shards,
+                RequestName("control_v1_node_describe"),
+            )
+        })
         .get("/control/v1/leader", |r| {
             named_request_span(r, handle_get_leader, RequestName("control_v1_get_leader"))
         })
diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs
index e7eae647df..44fdb474b4 100644
--- a/storage_controller/src/service.rs
+++ b/storage_controller/src/service.rs
@@ -41,11 +41,11 @@ use itertools::Itertools;
 use pageserver_api::{
     controller_api::{
         MetadataHealthRecord, MetadataHealthUpdateRequest, NodeAvailability, NodeRegisterRequest,
-        NodeSchedulingPolicy, PlacementPolicy, ShardSchedulingPolicy, ShardsPreferredAzsRequest,
-        ShardsPreferredAzsResponse, TenantCreateRequest, TenantCreateResponse,
-        TenantCreateResponseShard, TenantDescribeResponse, TenantDescribeResponseShard,
-        TenantLocateResponse, TenantPolicyRequest, TenantShardMigrateRequest,
-        TenantShardMigrateResponse,
+        NodeSchedulingPolicy, NodeShard, NodeShardResponse, PlacementPolicy, ShardSchedulingPolicy,
+        ShardsPreferredAzsRequest, ShardsPreferredAzsResponse, TenantCreateRequest,
+        TenantCreateResponse, TenantCreateResponseShard, TenantDescribeResponse,
+        TenantDescribeResponseShard, TenantLocateResponse, TenantPolicyRequest,
+        TenantShardMigrateRequest, TenantShardMigrateResponse,
     },
     models::{
         SecondaryProgress, TenantConfigRequest, TimelineArchivalConfigRequest,
@@ -4924,6 +4924,45 @@ impl Service {
             ))
     }
 
+    pub(crate) async fn get_node_shards(
+        &self,
+        node_id: NodeId,
+    ) -> Result<NodeShardResponse, ApiError> {
+        let locked = self.inner.read().unwrap();
+        let mut shards = Vec::new();
+        for (tid, tenant) in locked.tenants.iter() {
+            let is_intended_secondary = match (
+                tenant.intent.get_attached() == &Some(node_id),
+                tenant.intent.get_secondary().contains(&node_id),
+            ) {
+                (true, true) => {
+                    return Err(ApiError::InternalServerError(anyhow::anyhow!(
+                        "{} attached as primary+secondary on the same node",
+                        tid
+                    )))
+                }
+                (true, false) => Some(false),
+                (false, true) => Some(true),
+                (false, false) => None,
+            };
+            let is_observed_secondary = if let Some(ObservedStateLocation { conf: Some(conf) }) =
+                tenant.observed.locations.get(&node_id)
+            {
+                Some(conf.secondary_conf.is_some())
+            } else {
+                None
+            };
+            if is_intended_secondary.is_some() || is_observed_secondary.is_some() {
+                shards.push(NodeShard {
+                    tenant_shard_id: *tid,
+                    is_intended_secondary,
+                    is_observed_secondary,
+                });
+            }
+        }
+        Ok(NodeShardResponse { node_id, shards })
+    }
+
     pub(crate) async fn get_leader(&self) -> DatabaseResult<Option<ControllerPersistence>> {
         self.persistence.get_leader().await
     }
diff --git a/test_runner/fixtures/common_types.py b/test_runner/fixtures/common_types.py
index 8eda19d1e2..064a678c96 100644
--- a/test_runner/fixtures/common_types.py
+++ b/test_runner/fixtures/common_types.py
@@ -140,6 +140,14 @@ class TenantId(Id):
         return self.id.hex()
 
 
+class NodeId(Id):
+    def __repr__(self) -> str:
+        return f'`NodeId("{self.id.hex()}")'
+
+    def __str__(self) -> str:
+        return self.id.hex()
+
+
 class TimelineId(Id):
     def __repr__(self) -> str:
         return f'TimelineId("{self.id.hex()}")'
diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index 18fbbde637..5a600dd0a1 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -62,7 +62,7 @@ from urllib3.util.retry import Retry
 
 from fixtures import overlayfs
 from fixtures.broker import NeonBroker
-from fixtures.common_types import Lsn, TenantId, TenantShardId, TimelineId
+from fixtures.common_types import Lsn, NodeId, TenantId, TenantShardId, TimelineId
 from fixtures.endpoint.http import EndpointHttpClient
 from fixtures.log_helper import log
 from fixtures.metrics import Metrics, MetricsGetter, parse_metrics
@@ -2570,6 +2570,30 @@ class NeonStorageController(MetricsGetter, LogUtils):
         response.raise_for_status()
         return response.json()
 
+    def nodes(self):
+        """
+        :return: list of {"id": ""}
+        """
+        response = self.request(
+            "GET",
+            f"{self.api}/control/v1/node",
+            headers=self.headers(TokenScope.ADMIN),
+        )
+        response.raise_for_status()
+        return response.json()
+
+    def node_shards(self, node_id: NodeId):
+        """
+        :return: list of {"shard_id": "", "is_secondary": bool}
+        """
+        response = self.request(
+            "GET",
+            f"{self.api}/control/v1/node/{node_id}/shards",
+            headers=self.headers(TokenScope.ADMIN),
+        )
+        response.raise_for_status()
+        return response.json()
+
     def tenant_shard_split(
         self, tenant_id: TenantId, shard_count: int, shard_stripe_size: Optional[int] = None
     ) -> list[TenantShardId]:
diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py
index 92cd74eba5..eea05d7548 100644
--- a/test_runner/regress/test_storage_controller.py
+++ b/test_runner/regress/test_storage_controller.py
@@ -1552,6 +1552,12 @@ def test_tenant_import(neon_env_builder: NeonEnvBuilder, shard_count, remote_sto
     literal_shard_count = 1 if shard_count is None else shard_count
     assert len(describe["shards"]) == literal_shard_count
 
+    nodes = env.storage_controller.nodes()
+    assert len(nodes) == 2
+    describe1 = env.storage_controller.node_shards(nodes[0]["id"])
+    describe2 = env.storage_controller.node_shards(nodes[1]["id"])
+    assert len(describe1["shards"]) + len(describe2["shards"]) == literal_shard_count
+
     # Check the data is still there: this implicitly proves that we recovered generation numbers
     # properly, for the timeline which was written to after a generation bump.
     for timeline, branch, expect_rows in [

From fa3fc73c1b3366a3316456bcb8fdce1bed159200 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Arpad=20M=C3=BCller?= <arpad-m@users.noreply.github.com>
Date: Fri, 6 Sep 2024 21:05:18 +0200
Subject: [PATCH 110/159] Address 1.82 clippy lints (#8944)

Addresses the clippy lints of the beta 1.82 toolchain.

The `too_long_first_doc_paragraph` lint complained a lot and was
addressed separately: #8941
---
 Dockerfile.build-tools            |  2 +-
 libs/utils/src/logging.rs         |  2 +-
 rust-toolchain.toml               |  4 ++--
 scripts/coverage                  |  4 ++--
 storage_controller/src/service.rs | 10 +++++-----
 workspace_hack/Cargo.toml         |  2 ++
 6 files changed, 13 insertions(+), 11 deletions(-)

diff --git a/Dockerfile.build-tools b/Dockerfile.build-tools
index a9cbed85fb..c4209c7a12 100644
--- a/Dockerfile.build-tools
+++ b/Dockerfile.build-tools
@@ -207,7 +207,7 @@ RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux
     export PATH="$HOME/.cargo/bin:$PATH" && \
     . "$HOME/.cargo/env" && \
     cargo --version && rustup --version && \
-    rustup component add llvm-tools-preview rustfmt clippy && \
+    rustup component add llvm-tools rustfmt clippy && \
     cargo install rustfilt            --version ${RUSTFILT_VERSION} && \
     cargo install cargo-hakari        --version ${CARGO_HAKARI_VERSION} && \
     cargo install cargo-deny --locked --version ${CARGO_DENY_VERSION} && \
diff --git a/libs/utils/src/logging.rs b/libs/utils/src/logging.rs
index 71af43a4da..2ea0781667 100644
--- a/libs/utils/src/logging.rs
+++ b/libs/utils/src/logging.rs
@@ -190,7 +190,7 @@ impl Drop for TracingPanicHookGuard {
 }
 
 /// Named symbol for our panic hook, which logs the panic.
-fn tracing_panic_hook(info: &std::panic::PanicInfo) {
+fn tracing_panic_hook(info: &std::panic::PanicHookInfo) {
     // following rust 1.66.1 std implementation:
     // https://github.com/rust-lang/rust/blob/90743e7298aca107ddaa0c202a4d3604e29bfeb6/library/std/src/panicking.rs#L235-L288
     let location = info.location();
diff --git a/rust-toolchain.toml b/rust-toolchain.toml
index e78c4d6790..3c5d0b12a6 100644
--- a/rust-toolchain.toml
+++ b/rust-toolchain.toml
@@ -3,5 +3,5 @@ channel = "1.81.0"
 profile = "default"
 # The default profile includes rustc, rust-std, cargo, rust-docs, rustfmt and clippy.
 # https://rust-lang.github.io/rustup/concepts/profiles.html
-# but we also need `llvm-tools-preview` for coverage data merges on CI
-components = ["llvm-tools-preview", "rustfmt", "clippy"]
+# but we also need `llvm-tools` for coverage data merges on CI
+components = ["llvm-tools", "rustfmt", "clippy"]
diff --git a/scripts/coverage b/scripts/coverage
index 52a69c93b9..482dc58ff6 100755
--- a/scripts/coverage
+++ b/scripts/coverage
@@ -134,7 +134,7 @@ class LLVM:
             # Show a user-friendly warning
             raise Exception(' '.join([
                 f"It appears that you don't have `{name}` installed.",
-                "Please execute `rustup component add llvm-tools-preview`,",
+                "Please execute `rustup component add llvm-tools`,",
                 "or install it via your package manager of choice.",
                 "LLVM tools should be the same version as LLVM in `rustc --version --verbose`.",
             ]))
@@ -518,7 +518,7 @@ def main() -> None:
     example = f"""
 prerequisites:
     # alternatively, install a system package for `llvm-tools`
-    rustup component add llvm-tools-preview
+    rustup component add llvm-tools
 
 self-contained example:
     {app} run make
diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs
index 44fdb474b4..6365423e10 100644
--- a/storage_controller/src/service.rs
+++ b/storage_controller/src/service.rs
@@ -451,7 +451,7 @@ struct ShardSplitParams {
 // When preparing for a shard split, we may either choose to proceed with the split,
 // or find that the work is already done and return NoOp.
 enum ShardSplitAction {
-    Split(ShardSplitParams),
+    Split(Box<ShardSplitParams>),
     NoOp(TenantShardSplitResponse),
 }
 
@@ -4186,7 +4186,7 @@ impl Service {
         let policy = policy.unwrap();
         let config = config.unwrap();
 
-        Ok(ShardSplitAction::Split(ShardSplitParams {
+        Ok(ShardSplitAction::Split(Box::new(ShardSplitParams {
             old_shard_count,
             new_shard_count: ShardCount::new(split_req.new_shard_count),
             new_stripe_size: split_req.new_stripe_size,
@@ -4194,13 +4194,13 @@ impl Service {
             policy,
             config,
             shard_ident,
-        }))
+        })))
     }
 
     async fn do_tenant_shard_split(
         &self,
         tenant_id: TenantId,
-        params: ShardSplitParams,
+        params: Box<ShardSplitParams>,
     ) -> Result<(TenantShardSplitResponse, Vec<ReconcilerWaiter>), ApiError> {
         // FIXME: we have dropped self.inner lock, and not yet written anything to the database: another
         // request could occur here, deleting or mutating the tenant.  begin_shard_split checks that the
@@ -4216,7 +4216,7 @@ impl Service {
             policy,
             config,
             shard_ident,
-        } = params;
+        } = *params;
 
         // Drop any secondary locations: pageservers do not support splitting these, and in any case the
         // end-state for a split tenant will usually be to have secondary locations on different nodes.
diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml
index 3d2fa8c214..94f4c0f22f 100644
--- a/workspace_hack/Cargo.toml
+++ b/workspace_hack/Cargo.toml
@@ -8,6 +8,8 @@ version = "0.1.0"
 description = "workspace-hack package, managed by hakari"
 # You can choose to publish this crate: see https://docs.rs/cargo-hakari/latest/cargo_hakari/publishing.
 publish = false
+edition.workspace = true
+license.workspace = true
 # The parts of the file between the BEGIN HAKARI SECTION and END HAKARI SECTION comments
 # are managed by hakari.
 

From 3dbd34aa78258928344d4de80ddcdcf46b35dfbc Mon Sep 17 00:00:00 2001
From: Joonas Koivunen <joonas@neon.tech>
Date: Sat, 7 Sep 2024 00:42:55 +0300
Subject: [PATCH 111/159] feat(storcon): forward gc blocking and unblocking
 (#8956)

Currently using gc blocking and unblocking with storage controller
managed pageservers is painful. Implement the API on storage controller.

Fixes: #8893
---
 pageserver/client/src/lib.rs                  | 18 ++++
 pageserver/client/src/mgmt_api.rs             | 16 ++++
 storage_controller/src/http.rs                | 39 +++++++-
 storage_controller/src/pageserver_client.rs   | 23 ++++-
 storage_controller/src/service.rs             | 54 +++++++++++-
 .../regress/test_timeline_gc_blocking.py      | 88 +++++++++++++++----
 6 files changed, 220 insertions(+), 18 deletions(-)

diff --git a/pageserver/client/src/lib.rs b/pageserver/client/src/lib.rs
index 4a3f4dea47..cc8db37173 100644
--- a/pageserver/client/src/lib.rs
+++ b/pageserver/client/src/lib.rs
@@ -1,2 +1,20 @@
 pub mod mgmt_api;
 pub mod page_service;
+
+/// For timeline_block_unblock_gc, distinguish the two different operations. This could be a bool.
+// If file structure is per-kind not per-feature then where to put this?
+#[derive(Clone, Copy)]
+pub enum BlockUnblock {
+    Block,
+    Unblock,
+}
+
+impl std::fmt::Display for BlockUnblock {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        let s = match self {
+            BlockUnblock::Block => "block",
+            BlockUnblock::Unblock => "unblock",
+        };
+        f.write_str(s)
+    }
+}
diff --git a/pageserver/client/src/mgmt_api.rs b/pageserver/client/src/mgmt_api.rs
index 737cb00835..a68f45a6d9 100644
--- a/pageserver/client/src/mgmt_api.rs
+++ b/pageserver/client/src/mgmt_api.rs
@@ -12,6 +12,8 @@ use utils::{
 
 pub use reqwest::Body as ReqwestBody;
 
+use crate::BlockUnblock;
+
 pub mod util;
 
 #[derive(Debug, Clone)]
@@ -454,6 +456,20 @@ impl Client {
             .map_err(Error::ReceiveBody)
     }
 
+    pub async fn timeline_block_unblock_gc(
+        &self,
+        tenant_shard_id: TenantShardId,
+        timeline_id: TimelineId,
+        dir: BlockUnblock,
+    ) -> Result<()> {
+        let uri = format!(
+            "{}/v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/{dir}_gc",
+            self.mgmt_api_endpoint,
+        );
+
+        self.request(Method::POST, &uri, ()).await.map(|_| ())
+    }
+
     pub async fn tenant_reset(&self, tenant_shard_id: TenantShardId) -> Result<()> {
         let uri = format!(
             "{}/v1/tenant/{}/reset",
diff --git a/storage_controller/src/http.rs b/storage_controller/src/http.rs
index 96bdd5039d..a6638f5191 100644
--- a/storage_controller/src/http.rs
+++ b/storage_controller/src/http.rs
@@ -21,7 +21,7 @@ use pageserver_api::models::{
     TenantTimeTravelRequest, TimelineArchivalConfigRequest, TimelineCreateRequest,
 };
 use pageserver_api::shard::TenantShardId;
-use pageserver_client::mgmt_api;
+use pageserver_client::{mgmt_api, BlockUnblock};
 use std::sync::Arc;
 use std::time::{Duration, Instant};
 use tokio_util::sync::CancellationToken;
@@ -369,6 +369,23 @@ async fn handle_tenant_timeline_detach_ancestor(
     json_response(StatusCode::OK, res)
 }
 
+async fn handle_tenant_timeline_block_unblock_gc(
+    service: Arc<Service>,
+    req: Request<Body>,
+    dir: BlockUnblock,
+) -> Result<Response<Body>, ApiError> {
+    let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
+    check_permissions(&req, Scope::PageServerApi)?;
+
+    let timeline_id: TimelineId = parse_request_param(&req, "timeline_id")?;
+
+    service
+        .tenant_timeline_block_unblock_gc(tenant_id, timeline_id, dir)
+        .await?;
+
+    json_response(StatusCode::OK, ())
+}
+
 async fn handle_tenant_timeline_passthrough(
     service: Arc<Service>,
     req: Request<Body>,
@@ -1292,6 +1309,26 @@ pub fn make_router(
                 )
             },
         )
+        .post(
+            "/v1/tenant/:tenant_id/timeline/:timeline_id/block_gc",
+            |r| {
+                tenant_service_handler(
+                    r,
+                    |s, r| handle_tenant_timeline_block_unblock_gc(s, r, BlockUnblock::Block),
+                    RequestName("v1_tenant_timeline_block_unblock_gc"),
+                )
+            },
+        )
+        .post(
+            "/v1/tenant/:tenant_id/timeline/:timeline_id/unblock_gc",
+            |r| {
+                tenant_service_handler(
+                    r,
+                    |s, r| handle_tenant_timeline_block_unblock_gc(s, r, BlockUnblock::Unblock),
+                    RequestName("v1_tenant_timeline_block_unblock_gc"),
+                )
+            },
+        )
         // Tenant detail GET passthrough to shard zero:
         .get("/v1/tenant/:tenant_id", |r| {
             tenant_service_handler(
diff --git a/storage_controller/src/pageserver_client.rs b/storage_controller/src/pageserver_client.rs
index 20770ed703..961a1f78dd 100644
--- a/storage_controller/src/pageserver_client.rs
+++ b/storage_controller/src/pageserver_client.rs
@@ -7,7 +7,10 @@ use pageserver_api::{
     },
     shard::TenantShardId,
 };
-use pageserver_client::mgmt_api::{Client, Result};
+use pageserver_client::{
+    mgmt_api::{Client, Result},
+    BlockUnblock,
+};
 use reqwest::StatusCode;
 use utils::id::{NodeId, TenantId, TimelineId};
 
@@ -258,6 +261,24 @@ impl PageserverClient {
         )
     }
 
+    pub(crate) async fn timeline_block_unblock_gc(
+        &self,
+        tenant_shard_id: TenantShardId,
+        timeline_id: TimelineId,
+        dir: BlockUnblock,
+    ) -> Result<()> {
+        // measuring these makes no sense because we synchronize with the gc loop and remote
+        // storage on block_gc so there should be huge outliers
+        measured_request!(
+            "timeline_block_unblock_gc",
+            crate::metrics::Method::Post,
+            &self.node_id_label,
+            self.inner
+                .timeline_block_unblock_gc(tenant_shard_id, timeline_id, dir)
+                .await
+        )
+    }
+
     pub(crate) async fn get_utilization(&self) -> Result<PageserverUtilization> {
         measured_request!(
             "utilization",
diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs
index 6365423e10..be3efaf688 100644
--- a/storage_controller/src/service.rs
+++ b/storage_controller/src/service.rs
@@ -69,7 +69,7 @@ use pageserver_api::{
         ValidateResponse, ValidateResponseTenant,
     },
 };
-use pageserver_client::mgmt_api;
+use pageserver_client::{mgmt_api, BlockUnblock};
 use tokio::sync::mpsc::error::TrySendError;
 use tokio_util::sync::CancellationToken;
 use utils::{
@@ -142,6 +142,7 @@ enum TenantOperations {
     AttachHook,
     TimelineArchivalConfig,
     TimelineDetachAncestor,
+    TimelineGcBlockUnblock,
 }
 
 #[derive(Clone, strum_macros::Display)]
@@ -3197,6 +3198,57 @@ impl Service {
         }).await?
     }
 
+    pub(crate) async fn tenant_timeline_block_unblock_gc(
+        &self,
+        tenant_id: TenantId,
+        timeline_id: TimelineId,
+        dir: BlockUnblock,
+    ) -> Result<(), ApiError> {
+        let _tenant_lock = trace_shared_lock(
+            &self.tenant_op_locks,
+            tenant_id,
+            TenantOperations::TimelineGcBlockUnblock,
+        )
+        .await;
+
+        self.tenant_remote_mutation(tenant_id, move |targets| async move {
+            if targets.is_empty() {
+                return Err(ApiError::NotFound(
+                    anyhow::anyhow!("Tenant not found").into(),
+                ));
+            }
+
+            async fn do_one(
+                tenant_shard_id: TenantShardId,
+                timeline_id: TimelineId,
+                node: Node,
+                jwt: Option<String>,
+                dir: BlockUnblock,
+            ) -> Result<(), ApiError> {
+                let client = PageserverClient::new(node.get_id(), node.base_url(), jwt.as_deref());
+
+                client
+                    .timeline_block_unblock_gc(tenant_shard_id, timeline_id, dir)
+                    .await
+                    .map_err(|e| passthrough_api_error(&node, e))
+            }
+
+            // no shard needs to go first/last; the operation should be idempotent
+            self.tenant_for_shards(targets, |tenant_shard_id, node| {
+                futures::FutureExt::boxed(do_one(
+                    tenant_shard_id,
+                    timeline_id,
+                    node,
+                    self.config.jwt_token.clone(),
+                    dir,
+                ))
+            })
+            .await
+        })
+        .await??;
+        Ok(())
+    }
+
     /// Helper for concurrently calling a pageserver API on a number of shards, such as timeline creation.
     ///
     /// On success, the returned vector contains exactly the same number of elements as the input `locations`.
diff --git a/test_runner/regress/test_timeline_gc_blocking.py b/test_runner/regress/test_timeline_gc_blocking.py
index 24de894687..ddfe9b911f 100644
--- a/test_runner/regress/test_timeline_gc_blocking.py
+++ b/test_runner/regress/test_timeline_gc_blocking.py
@@ -1,17 +1,32 @@
 import time
+from concurrent.futures import ThreadPoolExecutor
+from dataclasses import dataclass
+from typing import List, Optional
 
+import pytest
+from fixtures.log_helper import log
 from fixtures.neon_fixtures import (
+    LogCursor,
     NeonEnvBuilder,
+    NeonPageserver,
 )
 from fixtures.pageserver.utils import wait_timeline_detail_404
 
 
-def test_gc_blocking_by_timeline(neon_env_builder: NeonEnvBuilder):
+@pytest.mark.parametrize("sharded", [True, False])
+def test_gc_blocking_by_timeline(neon_env_builder: NeonEnvBuilder, sharded: bool):
+    neon_env_builder.num_pageservers = 2 if sharded else 1
     env = neon_env_builder.init_start(
-        initial_tenant_conf={"gc_period": "1s", "lsn_lease_length": "0s"}
+        initial_tenant_conf={"gc_period": "1s", "lsn_lease_length": "0s"},
+        initial_tenant_shard_count=2 if sharded else None,
     )
-    ps = env.pageserver
-    http = ps.http_client()
+
+    if sharded:
+        http = env.storage_controller.pageserver_api()
+    else:
+        http = env.pageserver.http_client()
+
+    pss = ManyPageservers(list(map(lambda ps: ScrollableLog(ps, None), env.pageservers)))
 
     foo_branch = env.neon_cli.create_branch("foo", "main", env.initial_tenant)
 
@@ -22,9 +37,8 @@ def test_gc_blocking_by_timeline(neon_env_builder: NeonEnvBuilder):
     tenant_before = http.tenant_status(env.initial_tenant)
 
     wait_for_another_gc_round()
-    _, offset = ps.assert_log_contains(gc_active_line)
-
-    assert ps.log_contains(gc_skipped_line, offset) is None
+    pss.assert_log_contains(gc_active_line)
+    pss.assert_log_does_not_contain(gc_skipped_line)
 
     http.timeline_block_gc(env.initial_tenant, foo_branch)
 
@@ -34,34 +48,78 @@ def test_gc_blocking_by_timeline(neon_env_builder: NeonEnvBuilder):
     assert gc_blocking == "BlockingReasons { timelines: 1, reasons: EnumSet(Manual) }"
 
     wait_for_another_gc_round()
-    _, offset = ps.assert_log_contains(gc_skipped_line, offset)
+    pss.assert_log_contains(gc_skipped_line)
 
-    ps.restart()
-    ps.quiesce_tenants()
+    pss.restart()
+    pss.quiesce_tenants()
 
-    _, offset = env.pageserver.assert_log_contains(init_gc_skipped, offset)
+    pss.assert_log_contains(init_gc_skipped)
 
     wait_for_another_gc_round()
-    _, offset = ps.assert_log_contains(gc_skipped_line, offset)
+    pss.assert_log_contains(gc_skipped_line)
 
     # deletion unblocks gc
     http.timeline_delete(env.initial_tenant, foo_branch)
     wait_timeline_detail_404(http, env.initial_tenant, foo_branch, 10, 1.0)
 
     wait_for_another_gc_round()
-    _, offset = ps.assert_log_contains(gc_active_line, offset)
+    pss.assert_log_contains(gc_active_line)
 
     http.timeline_block_gc(env.initial_tenant, env.initial_timeline)
 
     wait_for_another_gc_round()
-    _, offset = ps.assert_log_contains(gc_skipped_line, offset)
+    pss.assert_log_contains(gc_skipped_line)
 
     # removing the manual block also unblocks gc
     http.timeline_unblock_gc(env.initial_tenant, env.initial_timeline)
 
     wait_for_another_gc_round()
-    _, offset = ps.assert_log_contains(gc_active_line, offset)
+    pss.assert_log_contains(gc_active_line)
 
 
 def wait_for_another_gc_round():
     time.sleep(2)
+
+
+@dataclass
+class ScrollableLog:
+    pageserver: NeonPageserver
+    offset: Optional[LogCursor]
+
+    def assert_log_contains(self, what: str):
+        msg, offset = self.pageserver.assert_log_contains(what, offset=self.offset)
+        old = self.offset
+        self.offset = offset
+        log.info(f"{old} -> {offset}: {msg}")
+
+    def assert_log_does_not_contain(self, what: str):
+        assert self.pageserver.log_contains(what) is None
+
+
+@dataclass(frozen=True)
+class ManyPageservers:
+    many: List[ScrollableLog]
+
+    def assert_log_contains(self, what: str):
+        for one in self.many:
+            one.assert_log_contains(what)
+
+    def assert_log_does_not_contain(self, what: str):
+        for one in self.many:
+            one.assert_log_does_not_contain(what)
+
+    def restart(self):
+        def do_restart(x: ScrollableLog):
+            x.pageserver.restart()
+
+        with ThreadPoolExecutor(max_workers=len(self.many)) as rt:
+            rt.map(do_restart, self.many)
+            rt.shutdown(wait=True)
+
+    def quiesce_tenants(self):
+        def do_quiesce(x: ScrollableLog):
+            x.pageserver.quiesce_tenants()
+
+        with ThreadPoolExecutor(max_workers=len(self.many)) as rt:
+            rt.map(do_quiesce, self.many)
+            rt.shutdown(wait=True)

From 16c200d6d9f0eaade2efe4ad0f649c3bbf23bf08 Mon Sep 17 00:00:00 2001
From: Cihan Demirci <128653800+fcdm@users.noreply.github.com>
Date: Sat, 7 Sep 2024 00:20:36 +0100
Subject: [PATCH 112/159] push images to prod ACR (#8940)

Used `vars` for new storing non-sensitive information, changed dev
secrets to vars as well but
didn't cleanup any secrets.

https://github.com/neondatabase/cloud/issues/16925

---------

Co-authored-by: Alexander Bayandin <alexander@neon.tech>
---
 .github/actionlint.yml               |  7 ++++
 .github/workflows/_push-to-acr.yml   | 56 ++++++++++++++++++++++++++++
 .github/workflows/build_and_test.yml | 53 +++++++++++++-------------
 3 files changed, 89 insertions(+), 27 deletions(-)
 create mode 100644 .github/workflows/_push-to-acr.yml

diff --git a/.github/actionlint.yml b/.github/actionlint.yml
index 4ad8a7b460..1b602883c5 100644
--- a/.github/actionlint.yml
+++ b/.github/actionlint.yml
@@ -7,6 +7,13 @@ self-hosted-runner:
     - small-arm64
     - us-east-2
 config-variables:
+  - AZURE_DEV_CLIENT_ID
+  - AZURE_DEV_REGISTRY_NAME
+  - AZURE_DEV_SUBSCRIPTION_ID
+  - AZURE_PROD_CLIENT_ID
+  - AZURE_PROD_REGISTRY_NAME
+  - AZURE_PROD_SUBSCRIPTION_ID
+  - AZURE_TENANT_ID
   - BENCHMARK_PROJECT_ID_PUB
   - BENCHMARK_PROJECT_ID_SUB
   - REMOTE_STORAGE_AZURE_CONTAINER
diff --git a/.github/workflows/_push-to-acr.yml b/.github/workflows/_push-to-acr.yml
new file mode 100644
index 0000000000..415b3d9cc6
--- /dev/null
+++ b/.github/workflows/_push-to-acr.yml
@@ -0,0 +1,56 @@
+name: Push images to ACR
+on:
+  workflow_call:
+    inputs:
+      client_id:
+        description: Client ID of Azure managed identity or Entra app
+        required: true
+        type: string
+      image_tag:
+        description: Tag for the container image
+        required: true
+        type: string
+      images:
+        description: Images to push
+        required: true
+        type: string
+      registry_name:
+        description: Name of the container registry
+        required: true
+        type: string
+      subscription_id:
+        description: Azure subscription ID
+        required: true
+        type: string
+      tenant_id:
+        description: Azure tenant ID
+        required: true
+        type: string
+
+jobs:
+  push-to-acr:
+    runs-on: ubuntu-22.04
+    permissions:
+      contents: read  # This is required for actions/checkout
+      id-token: write # This is required for Azure Login to work.
+
+    steps:
+      - name: Azure login
+        uses: azure/login@6c251865b4e6290e7b78be643ea2d005bc51f69a  # @v2.1.1
+        with:
+          client-id: ${{ inputs.client_id }}
+          subscription-id: ${{ inputs.subscription_id }}
+          tenant-id: ${{ inputs.tenant_id }}
+
+      - name: Login to ACR
+        run: |
+          az acr login --name=${{ inputs.registry_name }}
+
+      - name: Copy docker images to ACR ${{ inputs.registry_name }}
+        run: |
+          images='${{ inputs.images }}'
+          for image in ${images}; do
+            docker buildx imagetools create \
+              -t ${{ inputs.registry_name }}.azurecr.io/neondatabase/${image}:${{ inputs.image_tag }} \
+                                        neondatabase/${image}:${{ inputs.image_tag }}
+          done
diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml
index ee5fd1b0c6..4bb9e5cb66 100644
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -794,9 +794,6 @@ jobs:
           docker compose -f ./docker-compose/docker-compose.yml down
 
   promote-images:
-    permissions:
-      contents: read  # This is required for actions/checkout
-      id-token: write # This is required for Azure Login to work.
     needs: [ check-permissions, tag, test-images, vm-compute-node-image ]
     runs-on: ubuntu-22.04
 
@@ -823,28 +820,6 @@ jobs:
                                                neondatabase/vm-compute-node-${version}:${{ needs.tag.outputs.build-tag }}
           done
 
-      - name: Azure login
-        if: github.ref_name == 'main'
-        uses: azure/login@6c251865b4e6290e7b78be643ea2d005bc51f69a  # @v2.1.1
-        with:
-          client-id: ${{ secrets.AZURE_DEV_CLIENT_ID }}
-          tenant-id: ${{ secrets.AZURE_TENANT_ID }}
-          subscription-id: ${{ secrets.AZURE_DEV_SUBSCRIPTION_ID }}
-
-      - name: Login to ACR
-        if: github.ref_name == 'main'
-        run: |
-          az acr login --name=neoneastus2
-
-      - name: Copy docker images to ACR-dev
-        if: github.ref_name == 'main'
-        run: |
-          for image in neon compute-tools {vm-,}compute-node-{v14,v15,v16}; do
-            docker buildx imagetools create \
-              -t neoneastus2.azurecr.io/neondatabase/${image}:${{ needs.tag.outputs.build-tag }} \
-                                        neondatabase/${image}:${{ needs.tag.outputs.build-tag }}
-          done
-
       - name: Add latest tag to images
         if: github.ref_name == 'main'
         run: |
@@ -882,6 +857,30 @@ jobs:
                                                369495373322.dkr.ecr.eu-central-1.amazonaws.com/${image}:${{ needs.tag.outputs.build-tag }}
           done
 
+  push-to-acr-dev:
+    if: github.ref_name == 'main'
+    needs: [ tag, promote-images ]
+    uses: ./.github/workflows/_push-to-acr.yml
+    with:
+      client_id: ${{ vars.AZURE_DEV_CLIENT_ID }}
+      image_tag: ${{ needs.tag.outputs.build-tag }}
+      images: neon compute-tools vm-compute-node-v14 vm-compute-node-v15 vm-compute-node-v16 compute-node-v14 compute-node-v15 compute-node-v16
+      registry_name: ${{ vars.AZURE_DEV_REGISTRY_NAME }}
+      subscription_id: ${{ vars.AZURE_DEV_SUBSCRIPTION_ID }}
+      tenant_id: ${{ vars.AZURE_TENANT_ID }}
+
+  push-to-acr-prod:
+    if: github.ref_name == 'release'|| github.ref_name == 'release-proxy'
+    needs: [ tag, promote-images ]
+    uses: ./.github/workflows/_push-to-acr.yml
+    with:
+      client_id: ${{ vars.AZURE_PROD_CLIENT_ID }}
+      image_tag: ${{ needs.tag.outputs.build-tag }}
+      images: neon compute-tools vm-compute-node-v14 vm-compute-node-v15 vm-compute-node-v16 compute-node-v14 compute-node-v15 compute-node-v16
+      registry_name: ${{ vars.AZURE_PROD_REGISTRY_NAME }}
+      subscription_id: ${{ vars.AZURE_PROD_SUBSCRIPTION_ID }}
+      tenant_id: ${{ vars.AZURE_TENANT_ID }}
+
   trigger-custom-extensions-build-and-wait:
     needs: [ check-permissions, tag ]
     runs-on: ubuntu-22.04
@@ -957,8 +956,8 @@ jobs:
           exit 1
 
   deploy:
-    needs: [ check-permissions, promote-images, tag, build-and-test-locally, trigger-custom-extensions-build-and-wait ]
-    if: github.ref_name == 'main' || github.ref_name == 'release'|| github.ref_name == 'release-proxy'
+    needs: [ check-permissions, promote-images, tag, build-and-test-locally, trigger-custom-extensions-build-and-wait, push-to-acr-dev, push-to-acr-prod ]
+    if: (github.ref_name == 'main' || github.ref_name == 'release' || github.ref_name == 'release-proxy') && !failure() && !cancelled()
 
     runs-on: [ self-hosted, small ]
     container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:latest

From 7d7d1f354b127ae27ae2e76cb0b3e9a3c8f69d90 Mon Sep 17 00:00:00 2001
From: Alexander Bayandin <alexander@neon.tech>
Date: Sat, 7 Sep 2024 08:17:25 +0100
Subject: [PATCH 113/159] Fix rust warnings on macOS (#8955)

## Problem
```
error: unused import: `anyhow::Context`
 --> libs/utils/src/crashsafe.rs:8:5
  |
8 | use anyhow::Context;
  |     ^^^^^^^^^^^^^^^
  |
  = note: `-D unused-imports` implied by `-D warnings`
  = help: to override `-D warnings` add `#[allow(unused_imports)]`

error: unused variable: `fd`
   --> libs/utils/src/crashsafe.rs:209:15
    |
209 | pub fn syncfs(fd: impl AsRawFd) -> anyhow::Result<()> {
    |               ^^ help: if this is intentional, prefix it with an underscore: `_fd`
    |
    = note: `-D unused-variables` implied by `-D warnings`
    = help: to override `-D warnings` add `#[allow(unused_variables)]`
```

## Summary of changes
- Fix rust warnings on macOS
---
 libs/utils/src/crashsafe.rs | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/libs/utils/src/crashsafe.rs b/libs/utils/src/crashsafe.rs
index 946fedf6e5..b97c6c7a45 100644
--- a/libs/utils/src/crashsafe.rs
+++ b/libs/utils/src/crashsafe.rs
@@ -5,7 +5,6 @@ use std::{
     io::{self, Write},
 };
 
-use anyhow::Context;
 use camino::{Utf8Path, Utf8PathBuf};
 
 /// Similar to [`std::fs::create_dir`], except we fsync the
@@ -206,11 +205,13 @@ pub fn overwrite(
 }
 
 /// Syncs the filesystem for the given file descriptor.
+#[cfg_attr(target_os = "macos", allow(unused_variables))]
 pub fn syncfs(fd: impl AsRawFd) -> anyhow::Result<()> {
     // Linux guarantees durability for syncfs.
     // POSIX doesn't have syncfs, and further does not actually guarantee durability of sync().
     #[cfg(target_os = "linux")]
     {
+        use anyhow::Context;
         nix::unistd::syncfs(fd.as_raw_fd()).context("syncfs")?;
     }
     #[cfg(target_os = "macos")]

From 93ec7503e08f126e180579f02dcdb6e7a95724ba Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki@neon.tech>
Date: Sat, 7 Sep 2024 16:11:36 +0300
Subject: [PATCH 114/159] Lock the correct revision of rust-postgres crates
 (#8960)

We modified the crate in an incompatible way and upgraded to the new
version in PR #8076. However, it was reverted in #8654. The revert
reverted the Cargo.lock reference to it, but since Cargo.toml still
points to the (tip of the) 'neon' branch, every time you make any other
unrelated changes to Cargo.toml, it also tries to update the
rust-postgres crates to the tip of the 'neon' branch again, which
doesn't work.

To fix, lock the crates to the exact commit SHA that works.
---
 Cargo.lock |  8 ++++----
 Cargo.toml | 21 ++++++++++++++++-----
 2 files changed, 20 insertions(+), 9 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index 634af67198..cf3031c6d0 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -4121,7 +4121,7 @@ dependencies = [
 [[package]]
 name = "postgres"
 version = "0.19.4"
-source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#20031d7a9ee1addeae6e0968e3899ae6bf01cee2"
+source = "git+https://github.com/neondatabase/rust-postgres.git?rev=20031d7a9ee1addeae6e0968e3899ae6bf01cee2#20031d7a9ee1addeae6e0968e3899ae6bf01cee2"
 dependencies = [
  "bytes",
  "fallible-iterator",
@@ -4134,7 +4134,7 @@ dependencies = [
 [[package]]
 name = "postgres-protocol"
 version = "0.6.4"
-source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#20031d7a9ee1addeae6e0968e3899ae6bf01cee2"
+source = "git+https://github.com/neondatabase/rust-postgres.git?rev=20031d7a9ee1addeae6e0968e3899ae6bf01cee2#20031d7a9ee1addeae6e0968e3899ae6bf01cee2"
 dependencies = [
  "base64 0.20.0",
  "byteorder",
@@ -4153,7 +4153,7 @@ dependencies = [
 [[package]]
 name = "postgres-types"
 version = "0.2.4"
-source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#20031d7a9ee1addeae6e0968e3899ae6bf01cee2"
+source = "git+https://github.com/neondatabase/rust-postgres.git?rev=20031d7a9ee1addeae6e0968e3899ae6bf01cee2#20031d7a9ee1addeae6e0968e3899ae6bf01cee2"
 dependencies = [
  "bytes",
  "fallible-iterator",
@@ -6409,7 +6409,7 @@ dependencies = [
 [[package]]
 name = "tokio-postgres"
 version = "0.7.7"
-source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#20031d7a9ee1addeae6e0968e3899ae6bf01cee2"
+source = "git+https://github.com/neondatabase/rust-postgres.git?rev=20031d7a9ee1addeae6e0968e3899ae6bf01cee2#20031d7a9ee1addeae6e0968e3899ae6bf01cee2"
 dependencies = [
  "async-trait",
  "byteorder",
diff --git a/Cargo.toml b/Cargo.toml
index 5045ee0d4d..9203920971 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -201,10 +201,21 @@ env_logger = "0.10"
 log = "0.4"
 
 ## Libraries from neondatabase/ git forks, ideally with changes to be upstreamed
-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" }
-postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" }
-postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" }
-tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" }
+
+# We want to use the 'neon' branch for these, but there's currently one
+# incompatible change on the branch. See:
+#
+# - PR #8076 which contained changes that depended on the new changes in
+#   the rust-postgres crate, and
+# - PR #8654 which reverted those changes and made the code in proxy incompatible
+#   with the tip of the 'neon' branch again.
+#
+# When those proxy changes are re-applied (see PR #8747), we can switch using
+# the tip of the 'neon' branch again.
+postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev = "20031d7a9ee1addeae6e0968e3899ae6bf01cee2" }
+postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev = "20031d7a9ee1addeae6e0968e3899ae6bf01cee2" }
+postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", rev = "20031d7a9ee1addeae6e0968e3899ae6bf01cee2" }
+tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev = "20031d7a9ee1addeae6e0968e3899ae6bf01cee2" }
 
 ## Local libraries
 compute_api = { version = "0.1", path = "./libs/compute_api/" }
@@ -241,7 +252,7 @@ tonic-build = "0.9"
 [patch.crates-io]
 
 # Needed to get `tokio-postgres-rustls` to depend on our fork.
-tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" }
+tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev = "20031d7a9ee1addeae6e0968e3899ae6bf01cee2" }
 
 # bug fixes for UUID
 parquet = { git = "https://github.com/apache/arrow-rs", branch = "master" }

From 89c5e80b3ff55f0f316aebca0bba497eba7fbec8 Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki@neon.tech>
Date: Sun, 8 Sep 2024 21:47:23 +0300
Subject: [PATCH 115/159] Update toml and toml_edit crates (#8963)

Eliminates a few duplicate versions from the dependency tree.
---
 Cargo.lock                        | 57 +++++++------------------------
 Cargo.toml                        |  4 +--
 control_plane/src/pageserver.rs   | 10 +++---
 libs/remote_storage/src/config.rs |  2 +-
 libs/utils/src/toml_edit_ext.rs   |  2 +-
 pageserver/ctl/src/main.rs        |  2 +-
 pageserver/src/tenant/config.rs   |  3 +-
 workspace_hack/Cargo.toml         |  2 ++
 8 files changed, 26 insertions(+), 56 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index cf3031c6d0..30c9f7e080 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1246,7 +1246,7 @@ dependencies = [
  "tokio-postgres",
  "tokio-stream",
  "tokio-util",
- "toml_edit 0.19.10",
+ "toml_edit",
  "tracing",
  "tracing-opentelemetry",
  "tracing-subscriber",
@@ -1360,8 +1360,8 @@ dependencies = [
  "tokio",
  "tokio-postgres",
  "tokio-util",
- "toml 0.7.4",
- "toml_edit 0.19.10",
+ "toml",
+ "toml_edit",
  "tracing",
  "url",
  "utils",
@@ -3144,7 +3144,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "fd01039851e82f8799046eabbb354056283fb265c8ec0996af940f4e85a380ff"
 dependencies = [
  "serde",
- "toml 0.8.14",
+ "toml",
 ]
 
 [[package]]
@@ -3660,7 +3660,7 @@ dependencies = [
  "thiserror",
  "tokio",
  "tokio-util",
- "toml_edit 0.19.10",
+ "toml_edit",
  "utils",
  "workspace_hack",
 ]
@@ -3747,7 +3747,7 @@ dependencies = [
  "tokio-stream",
  "tokio-tar",
  "tokio-util",
- "toml_edit 0.19.10",
+ "toml_edit",
  "tracing",
  "twox-hash",
  "url",
@@ -4812,7 +4812,7 @@ dependencies = [
  "tokio",
  "tokio-stream",
  "tokio-util",
- "toml_edit 0.19.10",
+ "toml_edit",
  "tracing",
  "utils",
 ]
@@ -5322,7 +5322,7 @@ dependencies = [
  "tokio-stream",
  "tokio-tar",
  "tokio-util",
- "toml_edit 0.19.10",
+ "toml_edit",
  "tracing",
  "tracing-subscriber",
  "url",
@@ -6520,18 +6520,6 @@ dependencies = [
  "tracing",
 ]
 
-[[package]]
-name = "toml"
-version = "0.7.4"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d6135d499e69981f9ff0ef2167955a5333c35e36f6937d382974566b3d5b94ec"
-dependencies = [
- "serde",
- "serde_spanned",
- "toml_datetime",
- "toml_edit 0.19.10",
-]
-
 [[package]]
 name = "toml"
 version = "0.8.14"
@@ -6541,7 +6529,7 @@ dependencies = [
  "serde",
  "serde_spanned",
  "toml_datetime",
- "toml_edit 0.22.14",
+ "toml_edit",
 ]
 
 [[package]]
@@ -6553,19 +6541,6 @@ dependencies = [
  "serde",
 ]
 
-[[package]]
-name = "toml_edit"
-version = "0.19.10"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "2380d56e8670370eee6566b0bfd4265f65b3f432e8c6d85623f728d4fa31f739"
-dependencies = [
- "indexmap 1.9.3",
- "serde",
- "serde_spanned",
- "toml_datetime",
- "winnow 0.4.6",
-]
-
 [[package]]
 name = "toml_edit"
 version = "0.22.14"
@@ -6576,7 +6551,7 @@ dependencies = [
  "serde",
  "serde_spanned",
  "toml_datetime",
- "winnow 0.6.13",
+ "winnow",
 ]
 
 [[package]]
@@ -6989,7 +6964,7 @@ dependencies = [
  "tokio-stream",
  "tokio-tar",
  "tokio-util",
- "toml_edit 0.19.10",
+ "toml_edit",
  "tracing",
  "tracing-error",
  "tracing-subscriber",
@@ -7535,15 +7510,6 @@ version = "0.52.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "32b752e52a2da0ddfbdbcc6fceadfeede4c939ed16d13e648833a61dfb611ed8"
 
-[[package]]
-name = "winnow"
-version = "0.4.6"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "61de7bac303dc551fe038e2b3cef0f571087a47571ea6e79a87692ac99b99699"
-dependencies = [
- "memchr",
-]
-
 [[package]]
 name = "winnow"
 version = "0.6.13"
@@ -7651,6 +7617,7 @@ dependencies = [
  "tokio",
  "tokio-rustls 0.24.0",
  "tokio-util",
+ "toml_edit",
  "tonic",
  "tower",
  "tracing",
diff --git a/Cargo.toml b/Cargo.toml
index 9203920971..107cd6cd44 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -177,8 +177,8 @@ tokio-rustls = "0.25"
 tokio-stream = "0.1"
 tokio-tar = "0.3"
 tokio-util = { version = "0.7.10", features = ["io", "rt"] }
-toml = "0.7"
-toml_edit = "0.19"
+toml = "0.8"
+toml_edit = "0.22"
 tonic = {version = "0.9", features = ["tls", "tls-roots"]}
 tower-service = "0.3.2"
 tracing = "0.1"
diff --git a/control_plane/src/pageserver.rs b/control_plane/src/pageserver.rs
index 31777eb7a5..33ca70af96 100644
--- a/control_plane/src/pageserver.rs
+++ b/control_plane/src/pageserver.rs
@@ -75,14 +75,14 @@ impl PageServerNode {
         }
     }
 
-    fn pageserver_make_identity_toml(&self, node_id: NodeId) -> toml_edit::Document {
-        toml_edit::Document::from_str(&format!("id={node_id}")).unwrap()
+    fn pageserver_make_identity_toml(&self, node_id: NodeId) -> toml_edit::DocumentMut {
+        toml_edit::DocumentMut::from_str(&format!("id={node_id}")).unwrap()
     }
 
     fn pageserver_init_make_toml(
         &self,
         conf: NeonLocalInitPageserverConf,
-    ) -> anyhow::Result<toml_edit::Document> {
+    ) -> anyhow::Result<toml_edit::DocumentMut> {
         assert_eq!(&PageServerConf::from(&conf), &self.conf, "during neon_local init, we derive the runtime state of ps conf (self.conf) from the --config flag fully");
 
         // TODO(christian): instead of what we do here, create a pageserver_api::config::ConfigToml (PR #7656)
@@ -137,9 +137,9 @@ impl PageServerNode {
 
         // Turn `overrides` into a toml document.
         // TODO: above code is legacy code, it should be refactored to use toml_edit directly.
-        let mut config_toml = toml_edit::Document::new();
+        let mut config_toml = toml_edit::DocumentMut::new();
         for fragment_str in overrides {
-            let fragment = toml_edit::Document::from_str(&fragment_str)
+            let fragment = toml_edit::DocumentMut::from_str(&fragment_str)
                 .expect("all fragments in `overrides` are valid toml documents, this function controls that");
             for (key, item) in fragment.iter() {
                 config_toml.insert(key, item.clone());
diff --git a/libs/remote_storage/src/config.rs b/libs/remote_storage/src/config.rs
index f819a1572a..d0e92411da 100644
--- a/libs/remote_storage/src/config.rs
+++ b/libs/remote_storage/src/config.rs
@@ -185,7 +185,7 @@ mod tests {
     use super::*;
 
     fn parse(input: &str) -> anyhow::Result<RemoteStorageConfig> {
-        let toml = input.parse::<toml_edit::Document>().unwrap();
+        let toml = input.parse::<toml_edit::DocumentMut>().unwrap();
         RemoteStorageConfig::from_toml(toml.as_item())
     }
 
diff --git a/libs/utils/src/toml_edit_ext.rs b/libs/utils/src/toml_edit_ext.rs
index ab5f7bdd95..1359e27b77 100644
--- a/libs/utils/src/toml_edit_ext.rs
+++ b/libs/utils/src/toml_edit_ext.rs
@@ -10,7 +10,7 @@ pub fn deserialize_item<T>(item: &toml_edit::Item) -> Result<T, Error>
 where
     T: serde::de::DeserializeOwned,
 {
-    let document: toml_edit::Document = match item {
+    let document: toml_edit::DocumentMut = match item {
         toml_edit::Item::Table(toml) => toml.clone().into(),
         toml_edit::Item::Value(toml_edit::Value::InlineTable(toml)) => {
             toml.clone().into_table().into()
diff --git a/pageserver/ctl/src/main.rs b/pageserver/ctl/src/main.rs
index 3b66b0c4aa..cf001ef0d5 100644
--- a/pageserver/ctl/src/main.rs
+++ b/pageserver/ctl/src/main.rs
@@ -174,7 +174,7 @@ async fn main() -> anyhow::Result<()> {
                 println!("specified prefix '{}' failed validation", cmd.prefix);
                 return Ok(());
             };
-            let toml_document = toml_edit::Document::from_str(&cmd.config_toml_str)?;
+            let toml_document = toml_edit::DocumentMut::from_str(&cmd.config_toml_str)?;
             let toml_item = toml_document
                 .get("remote_storage")
                 .expect("need remote_storage");
diff --git a/pageserver/src/tenant/config.rs b/pageserver/src/tenant/config.rs
index 7e0344666b..547b43a399 100644
--- a/pageserver/src/tenant/config.rs
+++ b/pageserver/src/tenant/config.rs
@@ -452,7 +452,8 @@ impl TryFrom<toml_edit::Item> for TenantConfOpt {
                     .map_err(|e| anyhow::anyhow!("{}: {}", e.path(), e.inner().message()));
             }
             toml_edit::Item::Table(table) => {
-                let deserializer = toml_edit::de::Deserializer::new(table.into());
+                let deserializer =
+                    toml_edit::de::Deserializer::from(toml_edit::DocumentMut::from(table));
                 return serde_path_to_error::deserialize(deserializer)
                     .map_err(|e| anyhow::anyhow!("{}: {}", e.path(), e.inner().message()));
             }
diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml
index 94f4c0f22f..411ca81032 100644
--- a/workspace_hack/Cargo.toml
+++ b/workspace_hack/Cargo.toml
@@ -83,6 +83,7 @@ time = { version = "0.3", features = ["macros", "serde-well-known"] }
 tokio = { version = "1", features = ["fs", "io-std", "io-util", "macros", "net", "process", "rt-multi-thread", "signal", "test-util"] }
 tokio-rustls = { version = "0.24" }
 tokio-util = { version = "0.7", features = ["codec", "compat", "io", "rt"] }
+toml_edit = { version = "0.22", features = ["serde"] }
 tonic = { version = "0.9", features = ["tls-roots"] }
 tower = { version = "0.4", default-features = false, features = ["balance", "buffer", "limit", "log", "timeout", "util"] }
 tracing = { version = "0.1", features = ["log"] }
@@ -126,6 +127,7 @@ serde = { version = "1", features = ["alloc", "derive"] }
 syn-dff4ba8e3ae991db = { package = "syn", version = "1", features = ["extra-traits", "full", "visit"] }
 syn-f595c2ba2a3f28df = { package = "syn", version = "2", features = ["extra-traits", "fold", "full", "visit", "visit-mut"] }
 time-macros = { version = "0.2", default-features = false, features = ["formatting", "parsing", "serde"] }
+toml_edit = { version = "0.22", features = ["serde"] }
 zstd = { version = "0.13" }
 zstd-safe = { version = "7", default-features = false, features = ["arrays", "legacy", "std", "zdict_builder"] }
 zstd-sys = { version = "2", default-features = false, features = ["legacy", "std", "zdict_builder"] }

From 2d885ac07ae0207ab886fd4dda84701ae33893f1 Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki@neon.tech>
Date: Sun, 8 Sep 2024 21:47:57 +0300
Subject: [PATCH 116/159] Update strum (#8962)

I wanted to use some features from the newer version. The PR that needed
the new version is not ready yet (and might never be), but seems nice to
stay up in any case.
---
 Cargo.lock                        | 40 ++++++++++---------------------
 Cargo.toml                        |  6 ++---
 libs/pageserver_api/src/models.rs |  2 +-
 libs/utils/src/logging.rs         |  6 ++---
 pageserver/src/metrics.rs         |  4 ++--
 5 files changed, 21 insertions(+), 37 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index 30c9f7e080..4fb3ac7223 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1189,9 +1189,9 @@ dependencies = [
 
 [[package]]
 name = "comfy-table"
-version = "6.1.4"
+version = "7.1.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "6e7b787b0dc42e8111badfdbe4c3059158ccb2db8780352fa1b01e8ccf45cc4d"
+checksum = "b34115915337defe99b2aff5c2ce6771e5fbc4079f4b506301f5cf394c8452f7"
 dependencies = [
  "crossterm",
  "strum",
@@ -1485,25 +1485,22 @@ checksum = "248e3bacc7dc6baa3b21e405ee045c3047101a49145e7e9eca583ab4c2ca5345"
 
 [[package]]
 name = "crossterm"
-version = "0.25.0"
+version = "0.27.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e64e6c0fbe2c17357405f7c758c1ef960fce08bdfb2c03d88d2a18d7e09c4b67"
+checksum = "f476fe445d41c9e991fd07515a6f463074b782242ccf4a5b7b1d1012e70824df"
 dependencies = [
- "bitflags 1.3.2",
+ "bitflags 2.4.1",
  "crossterm_winapi",
  "libc",
- "mio",
  "parking_lot 0.12.1",
- "signal-hook",
- "signal-hook-mio",
  "winapi",
 ]
 
 [[package]]
 name = "crossterm_winapi"
-version = "0.9.0"
+version = "0.9.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "2ae1b35a484aa10e07fe0638d02301c5ad24de82d310ccbd2f3693da5f09bf1c"
+checksum = "acdd7c62a3665c7f6830a51635d9ac9b23ed385797f70a83bb8bafe9c572ab2b"
 dependencies = [
  "winapi",
 ]
@@ -5731,17 +5728,6 @@ dependencies = [
  "signal-hook-registry",
 ]
 
-[[package]]
-name = "signal-hook-mio"
-version = "0.2.3"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "29ad2e15f37ec9a6cc544097b78a1ec90001e9f71b81338ca39f430adaca99af"
-dependencies = [
- "libc",
- "mio",
- "signal-hook",
-]
-
 [[package]]
 name = "signal-hook-registry"
 version = "1.4.1"
@@ -6054,21 +6040,21 @@ checksum = "73473c0e59e6d5812c5dfe2a064a6444949f089e20eec9a2e5506596494e4623"
 
 [[package]]
 name = "strum"
-version = "0.24.1"
+version = "0.26.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "063e6045c0e62079840579a7e47a355ae92f60eb74daaf156fb1e84ba164e63f"
+checksum = "8fec0f0aef304996cf250b31b5a10dee7980c85da9d759361292b8bca5a18f06"
 
 [[package]]
 name = "strum_macros"
-version = "0.24.3"
+version = "0.26.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1e385be0d24f186b4ce2f9982191e7101bb737312ad61c1f2f984f34bcf85d59"
+checksum = "4c6bee85a5a24955dc440386795aa378cd9cf82acd5f764469152d2270e581be"
 dependencies = [
- "heck 0.4.1",
+ "heck 0.5.0",
  "proc-macro2",
  "quote",
  "rustversion",
- "syn 1.0.109",
+ "syn 2.0.52",
 ]
 
 [[package]]
diff --git a/Cargo.toml b/Cargo.toml
index 107cd6cd44..40e399619d 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -73,7 +73,7 @@ camino = "1.1.6"
 cfg-if = "1.0.0"
 chrono = { version = "0.4", default-features = false, features = ["clock"] }
 clap = { version = "4.0", features = ["derive"] }
-comfy-table = "6.1"
+comfy-table = "7.1"
 const_format = "0.2"
 crc32c = "0.6"
 crossbeam-deque = "0.8.5"
@@ -158,8 +158,8 @@ signal-hook = "0.3"
 smallvec = "1.11"
 smol_str = { version = "0.2.0", features = ["serde"] }
 socket2 = "0.5"
-strum = "0.24"
-strum_macros = "0.24"
+strum = "0.26"
+strum_macros = "0.26"
 "subtle"  = "2.5.0"
 svg_fmt = "0.4.3"
 sync_wrapper = "0.1.2"
diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs
index ffe79c8350..45e84baa1f 100644
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -62,7 +62,7 @@ use bytes::{Buf, BufMut, Bytes, BytesMut};
     serde::Serialize,
     serde::Deserialize,
     strum_macros::Display,
-    strum_macros::EnumVariantNames,
+    strum_macros::VariantNames,
     strum_macros::AsRefStr,
     strum_macros::IntoStaticStr,
 )]
diff --git a/libs/utils/src/logging.rs b/libs/utils/src/logging.rs
index 2ea0781667..e205d60d74 100644
--- a/libs/utils/src/logging.rs
+++ b/libs/utils/src/logging.rs
@@ -3,11 +3,9 @@ use std::str::FromStr;
 use anyhow::Context;
 use metrics::{IntCounter, IntCounterVec};
 use once_cell::sync::Lazy;
-use strum_macros::{EnumString, EnumVariantNames};
+use strum_macros::{EnumString, VariantNames};
 
-#[derive(
-    EnumString, strum_macros::Display, EnumVariantNames, Eq, PartialEq, Debug, Clone, Copy,
-)]
+#[derive(EnumString, strum_macros::Display, VariantNames, Eq, PartialEq, Debug, Clone, Copy)]
 #[strum(serialize_all = "snake_case")]
 pub enum LogFormat {
     Plain,
diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs
index c4011d593c..9197505876 100644
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -9,7 +9,7 @@ use metrics::{
 use once_cell::sync::Lazy;
 use pageserver_api::shard::TenantShardId;
 use strum::{EnumCount, VariantNames};
-use strum_macros::{EnumVariantNames, IntoStaticStr};
+use strum_macros::{IntoStaticStr, VariantNames};
 use tracing::warn;
 use utils::id::TimelineId;
 
@@ -27,7 +27,7 @@ const CRITICAL_OP_BUCKETS: &[f64] = &[
 ];
 
 // Metrics collected on operations on the storage repository.
-#[derive(Debug, EnumVariantNames, IntoStaticStr)]
+#[derive(Debug, VariantNames, IntoStaticStr)]
 #[strum(serialize_all = "kebab_case")]
 pub(crate) enum StorageTimeOperation {
     #[strum(serialize = "layer flush")]

From c8f67eed8f0e3ed182ebe85753389ae5b1c161ea Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki@neon.tech>
Date: Mon, 9 Sep 2024 10:34:56 +0300
Subject: [PATCH 117/159] Remove TEST_SHARED_FIXTURES (#8965)

I wish it worked, but it's been broken for a long time, so let's admit
defeat and remove it.

The idea of sharing the same pageserver and safekeeper environment
between tests is still sound, and it could save a lot of time in our
CI. We should perhaps put some time into doing that, but we're better
off starting from scratch than trying to make TEST_SHARED_FIXTURES
work in its current form.
---
 test_runner/README.md                 | 12 ++---
 test_runner/fixtures/neon_fixtures.py | 68 +++------------------------
 2 files changed, 10 insertions(+), 70 deletions(-)

diff --git a/test_runner/README.md b/test_runner/README.md
index 73aa29d4bb..647b930b26 100644
--- a/test_runner/README.md
+++ b/test_runner/README.md
@@ -18,8 +18,7 @@ Prerequisites:
 
 Regression tests are in the 'regress' directory. They can be run in
 parallel to minimize total runtime. Most regression test sets up their
-environment with its own pageservers and safekeepers (but see
-`TEST_SHARED_FIXTURES`).
+environment with its own pageservers and safekeepers.
 
 'pg_clients' contains tests for connecting with various client
 libraries. Each client test uses a Dockerfile that pulls an image that
@@ -74,7 +73,6 @@ This is used to construct full path to the postgres binaries.
 Format is 2-digit major version nubmer, i.e. `DEFAULT_PG_VERSION=16`
 `TEST_OUTPUT`: Set the directory where test state and test output files
 should go.
-`TEST_SHARED_FIXTURES`: Try to re-use a single pageserver for all the tests.
 `RUST_LOG`: logging configuration to pass into Neon CLI
 
 Useful parameters and commands:
@@ -259,11 +257,9 @@ compute Postgres nodes. The connections between them can be configured to use JW
 authentication tokens, and some other configuration options can be tweaked too.
 
 The easiest way to get access to a Neon Environment is by using the `neon_simple_env`
-fixture. The 'simple' env may be shared across multiple tests, so don't shut down the nodes
-or make other destructive changes in that environment. Also don't assume that
-there are no tenants or branches or data in the cluster. For convenience, there is a
-branch called `empty`, though. The convention is to create a test-specific branch of
-that and load any test data there, instead of the 'main' branch.
+fixture. For convenience, there is a branch called `empty` in environments created with
+'neon_simple_env'. The convention is to create a test-specific branch of that and load any
+test data there, instead of the 'main' branch.
 
 For more complicated cases, you can build a custom Neon Environment, with the `neon_env`
 fixture:
diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index 5a600dd0a1..3047dcc4f7 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -221,33 +221,6 @@ def neon_api(neon_api_key: str, neon_api_base_url: str) -> NeonAPI:
     return NeonAPI(neon_api_key, neon_api_base_url)
 
 
-def shareable_scope(fixture_name: str, config: Config) -> Literal["session", "function"]:
-    """Return either session of function scope, depending on TEST_SHARED_FIXTURES envvar.
-
-    This function can be used as a scope like this:
-    @pytest.fixture(scope=shareable_scope)
-    def myfixture(...)
-       ...
-    """
-    scope: Literal["session", "function"]
-
-    if os.environ.get("TEST_SHARED_FIXTURES") is None:
-        # Create the environment in the per-test output directory
-        scope = "function"
-    elif (
-        os.environ.get("BUILD_TYPE") is not None
-        and os.environ.get("DEFAULT_PG_VERSION") is not None
-    ):
-        scope = "session"
-    else:
-        pytest.fail(
-            "Shared environment(TEST_SHARED_FIXTURES) requires BUILD_TYPE and DEFAULT_PG_VERSION to be set",
-            pytrace=False,
-        )
-
-    return scope
-
-
 @pytest.fixture(scope="session")
 def worker_port_num():
     return (32768 - BASE_PORT) // int(os.environ.get("PYTEST_XDIST_WORKER_COUNT", "1"))
@@ -1431,8 +1404,8 @@ class NeonEnv:
         return "ep-" + str(self.endpoint_counter)
 
 
-@pytest.fixture(scope=shareable_scope)
-def _shared_simple_env(
+@pytest.fixture(scope="function")
+def neon_simple_env(
     request: FixtureRequest,
     pytestconfig: Config,
     port_distributor: PortDistributor,
@@ -1450,19 +1423,13 @@ def _shared_simple_env(
     pageserver_io_buffer_alignment: Optional[int],
 ) -> Iterator[NeonEnv]:
     """
-    # Internal fixture backing the `neon_simple_env` fixture. If TEST_SHARED_FIXTURES
-     is set, this is shared by all tests using `neon_simple_env`.
+    Simple Neon environment, with no authentication and no safekeepers.
 
     This fixture will use RemoteStorageKind.LOCAL_FS with pageserver.
     """
 
-    if os.environ.get("TEST_SHARED_FIXTURES") is None:
-        # Create the environment in the per-test output directory
-        repo_dir = get_test_repo_dir(request, top_output_dir)
-    else:
-        # We're running shared fixtures. Share a single directory.
-        repo_dir = top_output_dir / "shared_repo"
-        shutil.rmtree(repo_dir, ignore_errors=True)
+    # Create the environment in the per-test output directory
+    repo_dir = get_test_repo_dir(request, top_output_dir)
 
     with NeonEnvBuilder(
         top_output_dir=top_output_dir,
@@ -1489,22 +1456,6 @@ def _shared_simple_env(
 
         yield env
 
-
-@pytest.fixture(scope="function")
-def neon_simple_env(_shared_simple_env: NeonEnv) -> Iterator[NeonEnv]:
-    """
-    Simple Neon environment, with no authentication and no safekeepers.
-
-    If TEST_SHARED_FIXTURES environment variable is set, we reuse the same
-    environment for all tests that use 'neon_simple_env', keeping the
-    page server and safekeepers running. Any compute nodes are stopped after
-    each the test, however.
-    """
-    yield _shared_simple_env
-
-    _shared_simple_env.endpoints.stop_all()
-
-
 @pytest.fixture(scope="function")
 def neon_env_builder(
     pytestconfig: Config,
@@ -4898,14 +4849,7 @@ SMALL_DB_FILE_NAME_REGEX: re.Pattern = re.compile(  # type: ignore[type-arg]
 
 
 # This is autouse, so the test output directory always gets created, even
-# if a test doesn't put anything there. It also solves a problem with the
-# neon_simple_env fixture: if TEST_SHARED_FIXTURES is not set, it
-# creates the repo in the test output directory. But it cannot depend on
-# 'test_output_dir' fixture, because when TEST_SHARED_FIXTURES is not set,
-# it has 'session' scope and cannot access fixtures with 'function'
-# scope. So it uses the get_test_output_dir() function to get the path, and
-# this fixture ensures that the directory exists.  That works because
-# 'autouse' fixtures are run before other fixtures.
+# if a test doesn't put anything there.
 #
 # NB: we request the overlay dir fixture so the fixture does its cleanups
 @pytest.fixture(scope="function", autouse=True)

From 723c0971e818848696984fd66c562c9d0cbff948 Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki@neon.tech>
Date: Mon, 9 Sep 2024 10:35:12 +0300
Subject: [PATCH 118/159] Don't create 'empty' branch in neon_simple_env
 (#8965)

Now that we've given up hope on sharing the neon_simple_env between
tests, there's no reason to not use the 'main' branch directly.
---
 test_runner/README.md                         |  5 ++--
 test_runner/fixtures/neon_fixtures.py         |  5 +---
 .../performance/test_logical_replication.py   |  4 +--
 test_runner/regress/test_basebackup_error.py  |  3 +--
 test_runner/regress/test_clog_truncate.py     |  5 ++--
 test_runner/regress/test_compute_catalog.py   |  3 +--
 test_runner/regress/test_config.py            |  3 +--
 test_runner/regress/test_createdropdb.py      | 17 ++++---------
 test_runner/regress/test_createuser.py        |  5 ++--
 test_runner/regress/test_ddl_forwarding.py    |  3 +--
 .../regress/test_explain_with_lfc_stats.py    |  6 ++---
 test_runner/regress/test_lfc_resize.py        |  3 +--
 .../test_lfc_working_set_approximation.py     |  6 ++---
 test_runner/regress/test_local_file_cache.py  |  7 ++----
 .../regress/test_logical_replication.py       | 11 +++-----
 test_runner/regress/test_migrations.py        |  3 +--
 test_runner/regress/test_multixact.py         |  7 +++---
 test_runner/regress/test_neon_superuser.py    |  2 +-
 test_runner/regress/test_parallel_copy.py     |  3 +--
 .../regress/test_pg_query_cancellation.py     |  4 +--
 test_runner/regress/test_pg_waldump.py        |  4 +--
 test_runner/regress/test_read_validation.py   | 12 ++-------
 test_runner/regress/test_readonly_node.py     | 25 ++++++++-----------
 test_runner/regress/test_subxacts.py          |  3 +--
 test_runner/regress/test_timeline_delete.py   |  7 ++++--
 test_runner/regress/test_timeline_size.py     |  4 +--
 test_runner/regress/test_twophase.py          |  7 ++----
 test_runner/regress/test_unlogged.py          |  5 ++--
 test_runner/regress/test_vm_bits.py           |  5 ++--
 test_runner/test_broken.py                    |  3 +--
 30 files changed, 65 insertions(+), 115 deletions(-)

diff --git a/test_runner/README.md b/test_runner/README.md
index 647b930b26..d754e60d17 100644
--- a/test_runner/README.md
+++ b/test_runner/README.md
@@ -257,9 +257,8 @@ compute Postgres nodes. The connections between them can be configured to use JW
 authentication tokens, and some other configuration options can be tweaked too.
 
 The easiest way to get access to a Neon Environment is by using the `neon_simple_env`
-fixture. For convenience, there is a branch called `empty` in environments created with
-'neon_simple_env'. The convention is to create a test-specific branch of that and load any
-test data there, instead of the 'main' branch.
+fixture. For convenience, there is a branch called `main` in environments created with
+'neon_simple_env', ready to be used in the test.
 
 For more complicated cases, you can build a custom Neon Environment, with the `neon_env`
 fixture:
diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index 3047dcc4f7..60887b9aed 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -57,7 +57,6 @@ from _pytest.fixtures import FixtureRequest
 from psycopg2.extensions import connection as PgConnection
 from psycopg2.extensions import cursor as PgCursor
 from psycopg2.extensions import make_dsn, parse_dsn
-from typing_extensions import Literal
 from urllib3.util.retry import Retry
 
 from fixtures import overlayfs
@@ -1451,11 +1450,9 @@ def neon_simple_env(
     ) as builder:
         env = builder.init_start()
 
-        # For convenience in tests, create a branch from the freshly-initialized cluster.
-        env.neon_cli.create_branch("empty", ancestor_branch_name=DEFAULT_BRANCH_NAME)
-
         yield env
 
+
 @pytest.fixture(scope="function")
 def neon_env_builder(
     pytestconfig: Config,
diff --git a/test_runner/performance/test_logical_replication.py b/test_runner/performance/test_logical_replication.py
index 077f73ac06..29a0380524 100644
--- a/test_runner/performance/test_logical_replication.py
+++ b/test_runner/performance/test_logical_replication.py
@@ -22,10 +22,8 @@ if TYPE_CHECKING:
 def test_logical_replication(neon_simple_env: NeonEnv, pg_bin: PgBin, vanilla_pg):
     env = neon_simple_env
 
-    env.neon_cli.create_branch("test_logical_replication", "empty")
-    endpoint = env.endpoints.create_start("test_logical_replication")
+    endpoint = env.endpoints.create_start("main")
 
-    log.info("postgres is running on 'test_logical_replication' branch")
     pg_bin.run_capture(["pgbench", "-i", "-s10", endpoint.connstr()])
 
     endpoint.safe_psql("create publication pub1 for table pgbench_accounts, pgbench_history")
diff --git a/test_runner/regress/test_basebackup_error.py b/test_runner/regress/test_basebackup_error.py
index 170b494884..13c080ea0e 100644
--- a/test_runner/regress/test_basebackup_error.py
+++ b/test_runner/regress/test_basebackup_error.py
@@ -8,11 +8,10 @@ from fixtures.neon_fixtures import NeonEnv
 #
 def test_basebackup_error(neon_simple_env: NeonEnv):
     env = neon_simple_env
-    env.neon_cli.create_branch("test_basebackup_error", "empty")
     pageserver_http = env.pageserver.http_client()
 
     # Introduce failpoint
     pageserver_http.configure_failpoints(("basebackup-before-control-file", "return"))
 
     with pytest.raises(Exception, match="basebackup-before-control-file"):
-        env.endpoints.create_start("test_basebackup_error")
+        env.endpoints.create_start("main")
diff --git a/test_runner/regress/test_clog_truncate.py b/test_runner/regress/test_clog_truncate.py
index 26e6e336b9..6e4880841a 100644
--- a/test_runner/regress/test_clog_truncate.py
+++ b/test_runner/regress/test_clog_truncate.py
@@ -11,7 +11,6 @@ from fixtures.utils import query_scalar
 #
 def test_clog_truncate(neon_simple_env: NeonEnv):
     env = neon_simple_env
-    env.neon_cli.create_branch("test_clog_truncate", "empty")
 
     # set aggressive autovacuum to make sure that truncation will happen
     config = [
@@ -24,7 +23,7 @@ def test_clog_truncate(neon_simple_env: NeonEnv):
         "autovacuum_freeze_max_age=100000",
     ]
 
-    endpoint = env.endpoints.create_start("test_clog_truncate", config_lines=config)
+    endpoint = env.endpoints.create_start("main", config_lines=config)
 
     # Install extension containing function needed for test
     endpoint.safe_psql("CREATE EXTENSION neon_test_utils")
@@ -58,7 +57,7 @@ def test_clog_truncate(neon_simple_env: NeonEnv):
     # create new branch after clog truncation and start a compute node on it
     log.info(f"create branch at lsn_after_truncation {lsn_after_truncation}")
     env.neon_cli.create_branch(
-        "test_clog_truncate_new", "test_clog_truncate", ancestor_start_lsn=lsn_after_truncation
+        "test_clog_truncate_new", "main", ancestor_start_lsn=lsn_after_truncation
     )
     endpoint2 = env.endpoints.create_start("test_clog_truncate_new")
 
diff --git a/test_runner/regress/test_compute_catalog.py b/test_runner/regress/test_compute_catalog.py
index dd36190fcd..8b8c970357 100644
--- a/test_runner/regress/test_compute_catalog.py
+++ b/test_runner/regress/test_compute_catalog.py
@@ -4,9 +4,8 @@ from fixtures.neon_fixtures import NeonEnv
 
 def test_compute_catalog(neon_simple_env: NeonEnv):
     env = neon_simple_env
-    env.neon_cli.create_branch("test_config", "empty")
 
-    endpoint = env.endpoints.create_start("test_config", config_lines=["log_min_messages=debug1"])
+    endpoint = env.endpoints.create_start("main", config_lines=["log_min_messages=debug1"])
     client = endpoint.http_client()
 
     objects = client.dbs_and_roles()
diff --git a/test_runner/regress/test_config.py b/test_runner/regress/test_config.py
index 2ef28eb94b..d8ef0b8dbd 100644
--- a/test_runner/regress/test_config.py
+++ b/test_runner/regress/test_config.py
@@ -9,10 +9,9 @@ from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder
 #
 def test_config(neon_simple_env: NeonEnv):
     env = neon_simple_env
-    env.neon_cli.create_branch("test_config", "empty")
 
     # change config
-    endpoint = env.endpoints.create_start("test_config", config_lines=["log_min_messages=debug1"])
+    endpoint = env.endpoints.create_start("main", config_lines=["log_min_messages=debug1"])
 
     with closing(endpoint.connect()) as conn:
         with conn.cursor() as cur:
diff --git a/test_runner/regress/test_createdropdb.py b/test_runner/regress/test_createdropdb.py
index f741a9fc87..af643f45d7 100644
--- a/test_runner/regress/test_createdropdb.py
+++ b/test_runner/regress/test_createdropdb.py
@@ -17,9 +17,7 @@ def test_createdb(neon_simple_env: NeonEnv, strategy: str):
     if env.pg_version == PgVersion.V14 and strategy == "wal_log":
         pytest.skip("wal_log strategy not supported on PostgreSQL 14")
 
-    env.neon_cli.create_branch("test_createdb", "empty")
-
-    endpoint = env.endpoints.create_start("test_createdb")
+    endpoint = env.endpoints.create_start("main")
 
     with endpoint.cursor() as cur:
         # Cause a 'relmapper' change in the original branch
@@ -33,7 +31,7 @@ def test_createdb(neon_simple_env: NeonEnv, strategy: str):
         lsn = query_scalar(cur, "SELECT pg_current_wal_insert_lsn()")
 
     # Create a branch
-    env.neon_cli.create_branch("test_createdb2", "test_createdb", ancestor_start_lsn=lsn)
+    env.neon_cli.create_branch("test_createdb2", "main", ancestor_start_lsn=lsn)
     endpoint2 = env.endpoints.create_start("test_createdb2")
 
     # Test that you can connect to the new database on both branches
@@ -62,8 +60,7 @@ def test_createdb(neon_simple_env: NeonEnv, strategy: str):
 #
 def test_dropdb(neon_simple_env: NeonEnv, test_output_dir):
     env = neon_simple_env
-    env.neon_cli.create_branch("test_dropdb", "empty")
-    endpoint = env.endpoints.create_start("test_dropdb")
+    endpoint = env.endpoints.create_start("main")
 
     with endpoint.cursor() as cur:
         cur.execute("CREATE DATABASE foodb")
@@ -80,14 +77,10 @@ def test_dropdb(neon_simple_env: NeonEnv, test_output_dir):
         lsn_after_drop = query_scalar(cur, "SELECT pg_current_wal_insert_lsn()")
 
     # Create two branches before and after database drop.
-    env.neon_cli.create_branch(
-        "test_before_dropdb", "test_dropdb", ancestor_start_lsn=lsn_before_drop
-    )
+    env.neon_cli.create_branch("test_before_dropdb", "main", ancestor_start_lsn=lsn_before_drop)
     endpoint_before = env.endpoints.create_start("test_before_dropdb")
 
-    env.neon_cli.create_branch(
-        "test_after_dropdb", "test_dropdb", ancestor_start_lsn=lsn_after_drop
-    )
+    env.neon_cli.create_branch("test_after_dropdb", "main", ancestor_start_lsn=lsn_after_drop)
     endpoint_after = env.endpoints.create_start("test_after_dropdb")
 
     # Test that database exists on the branch before drop
diff --git a/test_runner/regress/test_createuser.py b/test_runner/regress/test_createuser.py
index 17d9824f52..d6f138e126 100644
--- a/test_runner/regress/test_createuser.py
+++ b/test_runner/regress/test_createuser.py
@@ -7,8 +7,7 @@ from fixtures.utils import query_scalar
 #
 def test_createuser(neon_simple_env: NeonEnv):
     env = neon_simple_env
-    env.neon_cli.create_branch("test_createuser", "empty")
-    endpoint = env.endpoints.create_start("test_createuser")
+    endpoint = env.endpoints.create_start("main")
 
     with endpoint.cursor() as cur:
         # Cause a 'relmapper' change in the original branch
@@ -19,7 +18,7 @@ def test_createuser(neon_simple_env: NeonEnv):
         lsn = query_scalar(cur, "SELECT pg_current_wal_insert_lsn()")
 
     # Create a branch
-    env.neon_cli.create_branch("test_createuser2", "test_createuser", ancestor_start_lsn=lsn)
+    env.neon_cli.create_branch("test_createuser2", "main", ancestor_start_lsn=lsn)
     endpoint2 = env.endpoints.create_start("test_createuser2")
 
     # Test that you can connect to new branch as a new user
diff --git a/test_runner/regress/test_ddl_forwarding.py b/test_runner/regress/test_ddl_forwarding.py
index 50da673d87..65f310c27a 100644
--- a/test_runner/regress/test_ddl_forwarding.py
+++ b/test_runner/regress/test_ddl_forwarding.py
@@ -290,9 +290,8 @@ def assert_db_connlimit(endpoint: Any, db_name: str, connlimit: int, msg: str):
 # Here we test the latter. The first one is tested in test_ddl_forwarding
 def test_ddl_forwarding_invalid_db(neon_simple_env: NeonEnv):
     env = neon_simple_env
-    env.neon_cli.create_branch("test_ddl_forwarding_invalid_db", "empty")
     endpoint = env.endpoints.create_start(
-        "test_ddl_forwarding_invalid_db",
+        "main",
         # Some non-existent url
         config_lines=["neon.console_url=http://localhost:9999/unknown/api/v0/roles_and_databases"],
     )
diff --git a/test_runner/regress/test_explain_with_lfc_stats.py b/test_runner/regress/test_explain_with_lfc_stats.py
index 5231dedcda..0217c9ac7b 100644
--- a/test_runner/regress/test_explain_with_lfc_stats.py
+++ b/test_runner/regress/test_explain_with_lfc_stats.py
@@ -10,11 +10,9 @@ def test_explain_with_lfc_stats(neon_simple_env: NeonEnv):
     cache_dir = Path(env.repo_dir) / "file_cache"
     cache_dir.mkdir(exist_ok=True)
 
-    branchname = "test_explain_with_lfc_stats"
-    env.neon_cli.create_branch(branchname, "empty")
-    log.info(f"Creating endopint with 1MB shared_buffers and 64 MB LFC for branch {branchname}")
+    log.info("Creating endpoint with 1MB shared_buffers and 64 MB LFC")
     endpoint = env.endpoints.create_start(
-        branchname,
+        "main",
         config_lines=[
             "shared_buffers='1MB'",
             f"neon.file_cache_path='{cache_dir}/file.cache'",
diff --git a/test_runner/regress/test_lfc_resize.py b/test_runner/regress/test_lfc_resize.py
index 1b2c7f808f..cb0b30d9c6 100644
--- a/test_runner/regress/test_lfc_resize.py
+++ b/test_runner/regress/test_lfc_resize.py
@@ -16,9 +16,8 @@ from fixtures.neon_fixtures import NeonEnv, PgBin
 @pytest.mark.timeout(600)
 def test_lfc_resize(neon_simple_env: NeonEnv, pg_bin: PgBin):
     env = neon_simple_env
-    env.neon_cli.create_branch("test_lfc_resize", "empty")
     endpoint = env.endpoints.create_start(
-        "test_lfc_resize",
+        "main",
         config_lines=[
             "neon.file_cache_path='file.cache'",
             "neon.max_file_cache_size=512MB",
diff --git a/test_runner/regress/test_lfc_working_set_approximation.py b/test_runner/regress/test_lfc_working_set_approximation.py
index 4c53e4e2fd..4a3a949d1a 100644
--- a/test_runner/regress/test_lfc_working_set_approximation.py
+++ b/test_runner/regress/test_lfc_working_set_approximation.py
@@ -12,11 +12,9 @@ def test_lfc_working_set_approximation(neon_simple_env: NeonEnv):
     cache_dir = Path(env.repo_dir) / "file_cache"
     cache_dir.mkdir(exist_ok=True)
 
-    branchname = "test_approximate_working_set_size"
-    env.neon_cli.create_branch(branchname, "empty")
-    log.info(f"Creating endopint with 1MB shared_buffers and 64 MB LFC for branch {branchname}")
+    log.info("Creating endpoint with 1MB shared_buffers and 64 MB LFC")
     endpoint = env.endpoints.create_start(
-        branchname,
+        "main",
         config_lines=[
             "shared_buffers='1MB'",
             f"neon.file_cache_path='{cache_dir}/file.cache'",
diff --git a/test_runner/regress/test_local_file_cache.py b/test_runner/regress/test_local_file_cache.py
index 3c404c3b23..9c38200937 100644
--- a/test_runner/regress/test_local_file_cache.py
+++ b/test_runner/regress/test_local_file_cache.py
@@ -5,7 +5,7 @@ import threading
 import time
 from typing import List
 
-from fixtures.neon_fixtures import DEFAULT_BRANCH_NAME, NeonEnvBuilder
+from fixtures.neon_fixtures import NeonEnvBuilder
 from fixtures.utils import query_scalar
 
 
@@ -15,11 +15,8 @@ def test_local_file_cache_unlink(neon_env_builder: NeonEnvBuilder):
     cache_dir = os.path.join(env.repo_dir, "file_cache")
     os.mkdir(cache_dir)
 
-    env.neon_cli.create_branch("empty", ancestor_branch_name=DEFAULT_BRANCH_NAME)
-    env.neon_cli.create_branch("test_local_file_cache_unlink", "empty")
-
     endpoint = env.endpoints.create_start(
-        "test_local_file_cache_unlink",
+        "main",
         config_lines=[
             "shared_buffers='1MB'",
             f"neon.file_cache_path='{cache_dir}/file.cache'",
diff --git a/test_runner/regress/test_logical_replication.py b/test_runner/regress/test_logical_replication.py
index f83a833dda..15a3719e0b 100644
--- a/test_runner/regress/test_logical_replication.py
+++ b/test_runner/regress/test_logical_replication.py
@@ -36,10 +36,8 @@ def test_logical_replication(neon_simple_env: NeonEnv, vanilla_pg):
     env = neon_simple_env
 
     tenant_id = env.initial_tenant
-    timeline_id = env.neon_cli.create_branch("test_logical_replication", "empty")
-    endpoint = env.endpoints.create_start(
-        "test_logical_replication", config_lines=["log_statement=all"]
-    )
+    timeline_id = env.initial_timeline
+    endpoint = env.endpoints.create_start("main", config_lines=["log_statement=all"])
 
     pg_conn = endpoint.connect()
     cur = pg_conn.cursor()
@@ -185,10 +183,9 @@ def test_obsolete_slot_drop(neon_simple_env: NeonEnv, vanilla_pg):
 
     env = neon_simple_env
 
-    env.neon_cli.create_branch("test_logical_replication", "empty")
     # set low neon.logical_replication_max_snap_files
     endpoint = env.endpoints.create_start(
-        "test_logical_replication",
+        "main",
         config_lines=["log_statement=all", "neon.logical_replication_max_snap_files=1"],
     )
 
@@ -472,7 +469,7 @@ def test_slots_and_branching(neon_simple_env: NeonEnv):
 def test_replication_shutdown(neon_simple_env: NeonEnv):
     # Ensure Postgres can exit without stuck when a replication job is active + neon extension installed
     env = neon_simple_env
-    env.neon_cli.create_branch("test_replication_shutdown_publisher", "empty")
+    env.neon_cli.create_branch("test_replication_shutdown_publisher", "main")
     pub = env.endpoints.create("test_replication_shutdown_publisher")
 
     env.neon_cli.create_branch("test_replication_shutdown_subscriber")
diff --git a/test_runner/regress/test_migrations.py b/test_runner/regress/test_migrations.py
index bdc5ca907e..e88e56d030 100644
--- a/test_runner/regress/test_migrations.py
+++ b/test_runner/regress/test_migrations.py
@@ -9,9 +9,8 @@ if TYPE_CHECKING:
 
 def test_migrations(neon_simple_env: NeonEnv):
     env = neon_simple_env
-    env.neon_cli.create_branch("test_migrations", "empty")
 
-    endpoint = env.endpoints.create("test_migrations")
+    endpoint = env.endpoints.create("main")
     endpoint.respec(skip_pg_catalog_updates=False)
     endpoint.start()
 
diff --git a/test_runner/regress/test_multixact.py b/test_runner/regress/test_multixact.py
index 88f7a5db59..8a00f8835f 100644
--- a/test_runner/regress/test_multixact.py
+++ b/test_runner/regress/test_multixact.py
@@ -14,8 +14,7 @@ from fixtures.utils import query_scalar
 #
 def test_multixact(neon_simple_env: NeonEnv, test_output_dir):
     env = neon_simple_env
-    env.neon_cli.create_branch("test_multixact", "empty")
-    endpoint = env.endpoints.create_start("test_multixact")
+    endpoint = env.endpoints.create_start("main")
 
     cur = endpoint.connect().cursor()
     cur.execute(
@@ -73,7 +72,9 @@ def test_multixact(neon_simple_env: NeonEnv, test_output_dir):
     assert int(next_multixact_id) > int(next_multixact_id_old)
 
     # Branch at this point
-    env.neon_cli.create_branch("test_multixact_new", "test_multixact", ancestor_start_lsn=lsn)
+    env.neon_cli.create_branch(
+        "test_multixact_new", ancestor_branch_name="main", ancestor_start_lsn=lsn
+    )
     endpoint_new = env.endpoints.create_start("test_multixact_new")
 
     next_multixact_id_new = endpoint_new.safe_psql(
diff --git a/test_runner/regress/test_neon_superuser.py b/test_runner/regress/test_neon_superuser.py
index fd31df84da..7825ec772c 100644
--- a/test_runner/regress/test_neon_superuser.py
+++ b/test_runner/regress/test_neon_superuser.py
@@ -6,7 +6,7 @@ from fixtures.utils import wait_until
 
 def test_neon_superuser(neon_simple_env: NeonEnv, pg_version: PgVersion):
     env = neon_simple_env
-    env.neon_cli.create_branch("test_neon_superuser_publisher", "empty")
+    env.neon_cli.create_branch("test_neon_superuser_publisher", "main")
     pub = env.endpoints.create("test_neon_superuser_publisher")
 
     env.neon_cli.create_branch("test_neon_superuser_subscriber")
diff --git a/test_runner/regress/test_parallel_copy.py b/test_runner/regress/test_parallel_copy.py
index b33e387a66..a5037e8694 100644
--- a/test_runner/regress/test_parallel_copy.py
+++ b/test_runner/regress/test_parallel_copy.py
@@ -41,8 +41,7 @@ async def parallel_load_same_table(endpoint: Endpoint, n_parallel: int):
 # Load data into one table with COPY TO from 5 parallel connections
 def test_parallel_copy(neon_simple_env: NeonEnv, n_parallel=5):
     env = neon_simple_env
-    env.neon_cli.create_branch("test_parallel_copy", "empty")
-    endpoint = env.endpoints.create_start("test_parallel_copy")
+    endpoint = env.endpoints.create_start("main")
 
     # Create test table
     conn = endpoint.connect()
diff --git a/test_runner/regress/test_pg_query_cancellation.py b/test_runner/regress/test_pg_query_cancellation.py
index bad2e5865e..c6b4eff516 100644
--- a/test_runner/regress/test_pg_query_cancellation.py
+++ b/test_runner/regress/test_pg_query_cancellation.py
@@ -42,11 +42,9 @@ def test_cancellations(neon_simple_env: NeonEnv):
     ps_http = ps.http_client()
     ps_http.is_testing_enabled_or_skip()
 
-    env.neon_cli.create_branch("test_config", "empty")
-
     # We don't want to have any racy behaviour with autovacuum IOs
     ep = env.endpoints.create_start(
-        "test_config",
+        "main",
         config_lines=[
             "autovacuum = off",
             "shared_buffers = 128MB",
diff --git a/test_runner/regress/test_pg_waldump.py b/test_runner/regress/test_pg_waldump.py
index 8e80efd9ba..1990d69b6a 100644
--- a/test_runner/regress/test_pg_waldump.py
+++ b/test_runner/regress/test_pg_waldump.py
@@ -22,8 +22,8 @@ def check_wal_segment(pg_waldump_path: str, segment_path: str, test_output_dir):
 def test_pg_waldump(neon_simple_env: NeonEnv, test_output_dir, pg_bin: PgBin):
     env = neon_simple_env
     tenant_id = env.initial_tenant
-    timeline_id = env.neon_cli.create_branch("test_pg_waldump", "empty")
-    endpoint = env.endpoints.create_start("test_pg_waldump")
+    timeline_id = env.initial_timeline
+    endpoint = env.endpoints.create_start("main")
 
     cur = endpoint.connect().cursor()
     cur.execute(
diff --git a/test_runner/regress/test_read_validation.py b/test_runner/regress/test_read_validation.py
index 1ac881553f..78798c5abf 100644
--- a/test_runner/regress/test_read_validation.py
+++ b/test_runner/regress/test_read_validation.py
@@ -15,12 +15,8 @@ extensions = ["pageinspect", "neon_test_utils", "pg_buffercache"]
 #
 def test_read_validation(neon_simple_env: NeonEnv):
     env = neon_simple_env
-    env.neon_cli.create_branch("test_read_validation", "empty")
-
-    endpoint = env.endpoints.create_start(
-        "test_read_validation",
-    )
 
+    endpoint = env.endpoints.create_start("main")
     with closing(endpoint.connect()) as con:
         with con.cursor() as c:
             for e in extensions:
@@ -131,13 +127,9 @@ def test_read_validation(neon_simple_env: NeonEnv):
 
 def test_read_validation_neg(neon_simple_env: NeonEnv):
     env = neon_simple_env
-    env.neon_cli.create_branch("test_read_validation_neg", "empty")
-
     env.pageserver.allowed_errors.append(".*invalid LSN\\(0\\) in request.*")
 
-    endpoint = env.endpoints.create_start(
-        "test_read_validation_neg",
-    )
+    endpoint = env.endpoints.create_start("main")
 
     with closing(endpoint.connect()) as con:
         with con.cursor() as c:
diff --git a/test_runner/regress/test_readonly_node.py b/test_runner/regress/test_readonly_node.py
index 368f60127e..347fc3a04d 100644
--- a/test_runner/regress/test_readonly_node.py
+++ b/test_runner/regress/test_readonly_node.py
@@ -22,8 +22,7 @@ from fixtures.utils import query_scalar
 #
 def test_readonly_node(neon_simple_env: NeonEnv):
     env = neon_simple_env
-    env.neon_cli.create_branch("test_readonly_node", "empty")
-    endpoint_main = env.endpoints.create_start("test_readonly_node")
+    endpoint_main = env.endpoints.create_start("main")
 
     env.pageserver.allowed_errors.extend(
         [
@@ -74,12 +73,12 @@ def test_readonly_node(neon_simple_env: NeonEnv):
 
     # Create first read-only node at the point where only 100 rows were inserted
     endpoint_hundred = env.endpoints.create_start(
-        branch_name="test_readonly_node", endpoint_id="ep-readonly_node_hundred", lsn=lsn_a
+        branch_name="main", endpoint_id="ep-readonly_node_hundred", lsn=lsn_a
     )
 
     # And another at the point where 200100 rows were inserted
     endpoint_more = env.endpoints.create_start(
-        branch_name="test_readonly_node", endpoint_id="ep-readonly_node_more", lsn=lsn_b
+        branch_name="main", endpoint_id="ep-readonly_node_more", lsn=lsn_b
     )
 
     # On the 'hundred' node, we should see only 100 rows
@@ -100,7 +99,7 @@ def test_readonly_node(neon_simple_env: NeonEnv):
 
     # Check creating a node at segment boundary
     endpoint = env.endpoints.create_start(
-        branch_name="test_readonly_node",
+        branch_name="main",
         endpoint_id="ep-branch_segment_boundary",
         lsn=Lsn("0/3000000"),
     )
@@ -112,7 +111,7 @@ def test_readonly_node(neon_simple_env: NeonEnv):
     with pytest.raises(Exception, match="invalid basebackup lsn"):
         # compute node startup with invalid LSN should fail
         env.endpoints.create_start(
-            branch_name="test_readonly_node",
+            branch_name="main",
             endpoint_id="ep-readonly_node_preinitdb",
             lsn=Lsn("0/42"),
         )
@@ -218,14 +217,10 @@ def test_readonly_node_gc(neon_env_builder: NeonEnvBuilder):
 # Similar test, but with more data, and we force checkpoints
 def test_timetravel(neon_simple_env: NeonEnv):
     env = neon_simple_env
-    pageserver_http_client = env.pageserver.http_client()
-    env.neon_cli.create_branch("test_timetravel", "empty")
-    endpoint = env.endpoints.create_start("test_timetravel")
-
+    tenant_id = env.initial_tenant
+    timeline_id = env.initial_timeline
     client = env.pageserver.http_client()
-
-    tenant_id = endpoint.safe_psql("show neon.tenant_id")[0][0]
-    timeline_id = endpoint.safe_psql("show neon.timeline_id")[0][0]
+    endpoint = env.endpoints.create_start("main")
 
     lsns = []
 
@@ -249,7 +244,7 @@ def test_timetravel(neon_simple_env: NeonEnv):
         wait_for_last_record_lsn(client, tenant_id, timeline_id, current_lsn)
 
         # run checkpoint manually to force a new layer file
-        pageserver_http_client.timeline_checkpoint(tenant_id, timeline_id)
+        client.timeline_checkpoint(tenant_id, timeline_id)
 
     ##### Restart pageserver
     env.endpoints.stop_all()
@@ -258,7 +253,7 @@ def test_timetravel(neon_simple_env: NeonEnv):
 
     for i, lsn in lsns:
         endpoint_old = env.endpoints.create_start(
-            branch_name="test_timetravel", endpoint_id=f"ep-old_lsn_{i}", lsn=lsn
+            branch_name="main", endpoint_id=f"ep-old_lsn_{i}", lsn=lsn
         )
         with endpoint_old.cursor() as cur:
             assert query_scalar(cur, f"select count(*) from testtab where iteration={i}") == 100000
diff --git a/test_runner/regress/test_subxacts.py b/test_runner/regress/test_subxacts.py
index 10cb00c780..82075bd723 100644
--- a/test_runner/regress/test_subxacts.py
+++ b/test_runner/regress/test_subxacts.py
@@ -9,8 +9,7 @@ from fixtures.neon_fixtures import NeonEnv, check_restored_datadir_content
 # CLOG.
 def test_subxacts(neon_simple_env: NeonEnv, test_output_dir):
     env = neon_simple_env
-    env.neon_cli.create_branch("test_subxacts", "empty")
-    endpoint = env.endpoints.create_start("test_subxacts")
+    endpoint = env.endpoints.create_start("main")
 
     pg_conn = endpoint.connect()
     cur = pg_conn.cursor()
diff --git a/test_runner/regress/test_timeline_delete.py b/test_runner/regress/test_timeline_delete.py
index 328131cd08..711fcd5016 100644
--- a/test_runner/regress/test_timeline_delete.py
+++ b/test_runner/regress/test_timeline_delete.py
@@ -68,10 +68,13 @@ def test_timeline_delete(neon_simple_env: NeonEnv):
 
     # construct pair of branches to validate that pageserver prohibits
     # deletion of ancestor timelines when they have child branches
-    parent_timeline_id = env.neon_cli.create_branch("test_ancestor_branch_delete_parent", "empty")
+    parent_timeline_id = env.neon_cli.create_branch(
+        new_branch_name="test_ancestor_branch_delete_parent", ancestor_branch_name="main"
+    )
 
     leaf_timeline_id = env.neon_cli.create_branch(
-        "test_ancestor_branch_delete_branch1", "test_ancestor_branch_delete_parent"
+        new_branch_name="test_ancestor_branch_delete_branch1",
+        ancestor_branch_name="test_ancestor_branch_delete_parent",
     )
 
     timeline_path = env.pageserver.timeline_dir(env.initial_tenant, parent_timeline_id)
diff --git a/test_runner/regress/test_timeline_size.py b/test_runner/regress/test_timeline_size.py
index 9bf5f8680b..f2265dd3d9 100644
--- a/test_runner/regress/test_timeline_size.py
+++ b/test_runner/regress/test_timeline_size.py
@@ -36,7 +36,7 @@ from fixtures.utils import get_timeline_dir_size, wait_until
 
 def test_timeline_size(neon_simple_env: NeonEnv):
     env = neon_simple_env
-    new_timeline_id = env.neon_cli.create_branch("test_timeline_size", "empty")
+    new_timeline_id = env.neon_cli.create_branch("test_timeline_size", "main")
 
     client = env.pageserver.http_client()
     client.timeline_wait_logical_size(env.initial_tenant, new_timeline_id)
@@ -68,7 +68,7 @@ def test_timeline_size(neon_simple_env: NeonEnv):
 
 def test_timeline_size_createdropdb(neon_simple_env: NeonEnv):
     env = neon_simple_env
-    new_timeline_id = env.neon_cli.create_branch("test_timeline_size_createdropdb", "empty")
+    new_timeline_id = env.neon_cli.create_branch("test_timeline_size_createdropdb", "main")
 
     client = env.pageserver.http_client()
     client.timeline_wait_logical_size(env.initial_tenant, new_timeline_id)
diff --git a/test_runner/regress/test_twophase.py b/test_runner/regress/test_twophase.py
index dd76689008..ea900b07b8 100644
--- a/test_runner/regress/test_twophase.py
+++ b/test_runner/regress/test_twophase.py
@@ -9,10 +9,7 @@ from fixtures.neon_fixtures import NeonEnv, fork_at_current_lsn
 #
 def test_twophase(neon_simple_env: NeonEnv):
     env = neon_simple_env
-    env.neon_cli.create_branch("test_twophase", "empty")
-    endpoint = env.endpoints.create_start(
-        "test_twophase", config_lines=["max_prepared_transactions=5"]
-    )
+    endpoint = env.endpoints.create_start("main", config_lines=["max_prepared_transactions=5"])
 
     conn = endpoint.connect()
     cur = conn.cursor()
@@ -56,7 +53,7 @@ def test_twophase(neon_simple_env: NeonEnv):
     assert len(twophase_files) == 2
 
     # Create a branch with the transaction in prepared state
-    fork_at_current_lsn(env, endpoint, "test_twophase_prepared", "test_twophase")
+    fork_at_current_lsn(env, endpoint, "test_twophase_prepared", "main")
 
     # Start compute on the new branch
     endpoint2 = env.endpoints.create_start(
diff --git a/test_runner/regress/test_unlogged.py b/test_runner/regress/test_unlogged.py
index 137d28b9fa..deba29536c 100644
--- a/test_runner/regress/test_unlogged.py
+++ b/test_runner/regress/test_unlogged.py
@@ -9,8 +9,7 @@ from fixtures.pg_version import PgVersion
 #
 def test_unlogged(neon_simple_env: NeonEnv):
     env = neon_simple_env
-    env.neon_cli.create_branch("test_unlogged", "empty")
-    endpoint = env.endpoints.create_start("test_unlogged")
+    endpoint = env.endpoints.create_start("main")
 
     conn = endpoint.connect()
     cur = conn.cursor()
@@ -22,7 +21,7 @@ def test_unlogged(neon_simple_env: NeonEnv):
     cur.execute("INSERT INTO iut (id) values (42);")
 
     # create another compute to fetch inital empty contents from pageserver
-    fork_at_current_lsn(env, endpoint, "test_unlogged_basebackup", "test_unlogged")
+    fork_at_current_lsn(env, endpoint, "test_unlogged_basebackup", "main")
     endpoint2 = env.endpoints.create_start("test_unlogged_basebackup")
 
     conn2 = endpoint2.connect()
diff --git a/test_runner/regress/test_vm_bits.py b/test_runner/regress/test_vm_bits.py
index 7272979c4a..3075211ada 100644
--- a/test_runner/regress/test_vm_bits.py
+++ b/test_runner/regress/test_vm_bits.py
@@ -13,8 +13,7 @@ from fixtures.utils import query_scalar
 def test_vm_bit_clear(neon_simple_env: NeonEnv):
     env = neon_simple_env
 
-    env.neon_cli.create_branch("test_vm_bit_clear", "empty")
-    endpoint = env.endpoints.create_start("test_vm_bit_clear")
+    endpoint = env.endpoints.create_start("main")
 
     pg_conn = endpoint.connect()
     cur = pg_conn.cursor()
@@ -58,7 +57,7 @@ def test_vm_bit_clear(neon_simple_env: NeonEnv):
     cur.execute("UPDATE vmtest_cold_update2 SET id = 5000, filler=repeat('x', 200) WHERE id = 1")
 
     # Branch at this point, to test that later
-    fork_at_current_lsn(env, endpoint, "test_vm_bit_clear_new", "test_vm_bit_clear")
+    fork_at_current_lsn(env, endpoint, "test_vm_bit_clear_new", "main")
 
     # Clear the buffer cache, to force the VM page to be re-fetched from
     # the page server
diff --git a/test_runner/test_broken.py b/test_runner/test_broken.py
index 7e8aef5a5f..d710b53528 100644
--- a/test_runner/test_broken.py
+++ b/test_runner/test_broken.py
@@ -23,8 +23,7 @@ run_broken = pytest.mark.skipif(
 def test_broken(neon_simple_env: NeonEnv, pg_bin):
     env = neon_simple_env
 
-    env.neon_cli.create_branch("test_broken", "empty")
-    env.endpoints.create_start("test_broken")
+    env.endpoints.create_start("main")
     log.info("postgres is running")
 
     log.info("THIS NEXT COMMAND WILL FAIL:")

From e158df4e86318fa3fd5ee9516f3e7ac91dd14283 Mon Sep 17 00:00:00 2001
From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com>
Date: Tue, 10 Sep 2024 05:03:27 +0800
Subject: [PATCH 119/159] feat(pageserver): split delta writer automatically
 determines key range (#8850)

close https://github.com/neondatabase/neon/issues/8838

## Summary of changes

This patch modifies the split delta layer writer to avoid taking
start_key and end_key when creating/finishing the layer writer. The
start_key for the delta layers will be the first key provided to the
layer writer, and the end_key would be the `last_key.next()`. This
simplifies the delta layer writer API.

On that, the layer key hack is removed. Image layers now use the full
key range, and delta layers use the first/last key provided by the user.

---------

Signed-off-by: Alex Chi Z <chi@neon.tech>
---
 libs/pageserver_api/src/key.rs                |   9 -
 pageserver/src/tenant.rs                      |   6 +-
 .../src/tenant/storage_layer/split_writer.rs  | 158 ++++++++++++------
 pageserver/src/tenant/timeline/compaction.rs  |   7 +-
 4 files changed, 109 insertions(+), 71 deletions(-)

diff --git a/libs/pageserver_api/src/key.rs b/libs/pageserver_api/src/key.rs
index 77d744e4da..8929ccb41d 100644
--- a/libs/pageserver_api/src/key.rs
+++ b/libs/pageserver_api/src/key.rs
@@ -263,15 +263,6 @@ impl Key {
         field5: u8::MAX,
         field6: u32::MAX,
     };
-    /// A key slightly smaller than [`Key::MAX`] for use in layer key ranges to avoid them to be confused with L0 layers
-    pub const NON_L0_MAX: Key = Key {
-        field1: u8::MAX,
-        field2: u32::MAX,
-        field3: u32::MAX,
-        field4: u32::MAX,
-        field5: u8::MAX,
-        field6: u32::MAX - 1,
-    };
 
     pub fn from_hex(s: &str) -> Result<Self> {
         if s.len() != 36 {
diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index fd2520a42e..c6f0e48101 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -7091,13 +7091,13 @@ mod tests {
             vec![
                 // Image layer at GC horizon
                 PersistentLayerKey {
-                    key_range: Key::MIN..Key::NON_L0_MAX,
+                    key_range: Key::MIN..Key::MAX,
                     lsn_range: Lsn(0x30)..Lsn(0x31),
                     is_delta: false
                 },
-                // The delta layer covers the full range (with the layer key hack to avoid being recognized as L0)
+                // The delta layer below the horizon
                 PersistentLayerKey {
-                    key_range: Key::MIN..Key::NON_L0_MAX,
+                    key_range: get_key(3)..get_key(4),
                     lsn_range: Lsn(0x30)..Lsn(0x48),
                     is_delta: true
                 },
diff --git a/pageserver/src/tenant/storage_layer/split_writer.rs b/pageserver/src/tenant/storage_layer/split_writer.rs
index 7c1ac863bf..40a6a77a50 100644
--- a/pageserver/src/tenant/storage_layer/split_writer.rs
+++ b/pageserver/src/tenant/storage_layer/split_writer.rs
@@ -188,7 +188,7 @@ impl SplitImageLayerWriter {
             .await
     }
 
-    /// When split writer fails, the caller should call this function and handle partially generated layers.
+    /// This function will be deprecated with #8841.
     pub(crate) fn take(self) -> anyhow::Result<(Vec<SplitWriterResult>, ImageLayerWriter)> {
         Ok((self.generated_layers, self.inner))
     }
@@ -204,7 +204,7 @@ impl SplitImageLayerWriter {
 /// will split them into multiple files based on size.
 #[must_use]
 pub struct SplitDeltaLayerWriter {
-    inner: DeltaLayerWriter,
+    inner: Option<(Key, DeltaLayerWriter)>,
     target_layer_size: u64,
     generated_layers: Vec<SplitWriterResult>,
     conf: &'static PageServerConf,
@@ -212,7 +212,6 @@ pub struct SplitDeltaLayerWriter {
     tenant_shard_id: TenantShardId,
     lsn_range: Range<Lsn>,
     last_key_written: Key,
-    start_key: Key,
 }
 
 impl SplitDeltaLayerWriter {
@@ -220,29 +219,18 @@ impl SplitDeltaLayerWriter {
         conf: &'static PageServerConf,
         timeline_id: TimelineId,
         tenant_shard_id: TenantShardId,
-        start_key: Key,
         lsn_range: Range<Lsn>,
         target_layer_size: u64,
-        ctx: &RequestContext,
     ) -> anyhow::Result<Self> {
         Ok(Self {
             target_layer_size,
-            inner: DeltaLayerWriter::new(
-                conf,
-                timeline_id,
-                tenant_shard_id,
-                start_key,
-                lsn_range.clone(),
-                ctx,
-            )
-            .await?,
+            inner: None,
             generated_layers: Vec::new(),
             conf,
             timeline_id,
             tenant_shard_id,
             lsn_range,
             last_key_written: Key::MIN,
-            start_key,
         })
     }
 
@@ -265,9 +253,26 @@ impl SplitDeltaLayerWriter {
         //
         // Also, keep all updates of a single key in a single file. TODO: split them using the legacy compaction
         // strategy. https://github.com/neondatabase/neon/issues/8837
+
+        if self.inner.is_none() {
+            self.inner = Some((
+                key,
+                DeltaLayerWriter::new(
+                    self.conf,
+                    self.timeline_id,
+                    self.tenant_shard_id,
+                    key,
+                    self.lsn_range.clone(),
+                    ctx,
+                )
+                .await?,
+            ));
+        }
+        let (_, inner) = self.inner.as_mut().unwrap();
+
         let addition_size_estimation = KEY_SIZE as u64 + 8 /* LSN u64 size */ + 80 /* value size estimation */;
-        if self.inner.num_keys() >= 1
-            && self.inner.estimated_size() + addition_size_estimation >= self.target_layer_size
+        if inner.num_keys() >= 1
+            && inner.estimated_size() + addition_size_estimation >= self.target_layer_size
         {
             if key != self.last_key_written {
                 let next_delta_writer = DeltaLayerWriter::new(
@@ -279,13 +284,13 @@ impl SplitDeltaLayerWriter {
                     ctx,
                 )
                 .await?;
-                let prev_delta_writer = std::mem::replace(&mut self.inner, next_delta_writer);
+                let (start_key, prev_delta_writer) =
+                    std::mem::replace(&mut self.inner, Some((key, next_delta_writer))).unwrap();
                 let layer_key = PersistentLayerKey {
-                    key_range: self.start_key..key,
+                    key_range: start_key..key,
                     lsn_range: self.lsn_range.clone(),
                     is_delta: true,
                 };
-                self.start_key = key;
                 if discard(&layer_key).await {
                     drop(prev_delta_writer);
                     self.generated_layers
@@ -296,17 +301,18 @@ impl SplitDeltaLayerWriter {
                     self.generated_layers
                         .push(SplitWriterResult::Produced(delta_layer));
                 }
-            } else if self.inner.estimated_size() >= S3_UPLOAD_LIMIT {
+            } else if inner.estimated_size() >= S3_UPLOAD_LIMIT {
                 // We have to produce a very large file b/c a key is updated too often.
                 anyhow::bail!(
                     "a single key is updated too often: key={}, estimated_size={}, and the layer file cannot be produced",
                     key,
-                    self.inner.estimated_size()
+                    inner.estimated_size()
                 );
             }
         }
         self.last_key_written = key;
-        self.inner.put_value(key, lsn, val, ctx).await
+        let (_, inner) = self.inner.as_mut().unwrap();
+        inner.put_value(key, lsn, val, ctx).await
     }
 
     pub async fn put_value(
@@ -325,7 +331,6 @@ impl SplitDeltaLayerWriter {
         self,
         tline: &Arc<Timeline>,
         ctx: &RequestContext,
-        end_key: Key,
         discard: D,
     ) -> anyhow::Result<Vec<SplitWriterResult>>
     where
@@ -337,11 +342,15 @@ impl SplitDeltaLayerWriter {
             inner,
             ..
         } = self;
+        let Some((start_key, inner)) = inner else {
+            return Ok(generated_layers);
+        };
         if inner.num_keys() == 0 {
             return Ok(generated_layers);
         }
+        let end_key = self.last_key_written.next();
         let layer_key = PersistentLayerKey {
-            key_range: self.start_key..end_key,
+            key_range: start_key..end_key,
             lsn_range: self.lsn_range.clone(),
             is_delta: true,
         };
@@ -360,15 +369,14 @@ impl SplitDeltaLayerWriter {
         self,
         tline: &Arc<Timeline>,
         ctx: &RequestContext,
-        end_key: Key,
     ) -> anyhow::Result<Vec<SplitWriterResult>> {
-        self.finish_with_discard_fn(tline, ctx, end_key, |_| async { false })
+        self.finish_with_discard_fn(tline, ctx, |_| async { false })
             .await
     }
 
-    /// When split writer fails, the caller should call this function and handle partially generated layers.
-    pub(crate) fn take(self) -> anyhow::Result<(Vec<SplitWriterResult>, DeltaLayerWriter)> {
-        Ok((self.generated_layers, self.inner))
+    /// This function will be deprecated with #8841.
+    pub(crate) fn take(self) -> anyhow::Result<(Vec<SplitWriterResult>, Option<DeltaLayerWriter>)> {
+        Ok((self.generated_layers, self.inner.map(|x| x.1)))
     }
 }
 
@@ -432,10 +440,8 @@ mod tests {
             tenant.conf,
             tline.timeline_id,
             tenant.tenant_shard_id,
-            get_key(0),
             Lsn(0x18)..Lsn(0x20),
             4 * 1024 * 1024,
-            &ctx,
         )
         .await
         .unwrap();
@@ -460,11 +466,22 @@ mod tests {
             )
             .await
             .unwrap();
-        let layers = delta_writer
-            .finish(&tline, &ctx, get_key(10))
-            .await
-            .unwrap();
+        let layers = delta_writer.finish(&tline, &ctx).await.unwrap();
         assert_eq!(layers.len(), 1);
+        assert_eq!(
+            layers
+                .into_iter()
+                .next()
+                .unwrap()
+                .into_resident_layer()
+                .layer_desc()
+                .key(),
+            PersistentLayerKey {
+                key_range: get_key(0)..get_key(1),
+                lsn_range: Lsn(0x18)..Lsn(0x20),
+                is_delta: true
+            }
+        );
     }
 
     #[tokio::test]
@@ -501,10 +518,8 @@ mod tests {
             tenant.conf,
             tline.timeline_id,
             tenant.tenant_shard_id,
-            get_key(0),
             Lsn(0x18)..Lsn(0x20),
             4 * 1024 * 1024,
-            &ctx,
         )
         .await
         .unwrap();
@@ -533,10 +548,7 @@ mod tests {
             .finish(&tline, &ctx, get_key(N as u32))
             .await
             .unwrap();
-        let delta_layers = delta_writer
-            .finish(&tline, &ctx, get_key(N as u32))
-            .await
-            .unwrap();
+        let delta_layers = delta_writer.finish(&tline, &ctx).await.unwrap();
         if discard {
             for layer in image_layers {
                 layer.into_discarded_layer();
@@ -555,6 +567,14 @@ mod tests {
                 .collect_vec();
             assert_eq!(image_layers.len(), N / 512 + 1);
             assert_eq!(delta_layers.len(), N / 512 + 1);
+            assert_eq!(
+                delta_layers.first().unwrap().layer_desc().key_range.start,
+                get_key(0)
+            );
+            assert_eq!(
+                delta_layers.last().unwrap().layer_desc().key_range.end,
+                get_key(N as u32)
+            );
             for idx in 0..image_layers.len() {
                 assert_ne!(image_layers[idx].layer_desc().key_range.start, Key::MIN);
                 assert_ne!(image_layers[idx].layer_desc().key_range.end, Key::MAX);
@@ -602,10 +622,8 @@ mod tests {
             tenant.conf,
             tline.timeline_id,
             tenant.tenant_shard_id,
-            get_key(0),
             Lsn(0x18)..Lsn(0x20),
             4 * 1024,
-            &ctx,
         )
         .await
         .unwrap();
@@ -644,11 +662,35 @@ mod tests {
             )
             .await
             .unwrap();
-        let layers = delta_writer
-            .finish(&tline, &ctx, get_key(10))
-            .await
-            .unwrap();
+        let layers = delta_writer.finish(&tline, &ctx).await.unwrap();
         assert_eq!(layers.len(), 2);
+        let mut layers_iter = layers.into_iter();
+        assert_eq!(
+            layers_iter
+                .next()
+                .unwrap()
+                .into_resident_layer()
+                .layer_desc()
+                .key(),
+            PersistentLayerKey {
+                key_range: get_key(0)..get_key(1),
+                lsn_range: Lsn(0x18)..Lsn(0x20),
+                is_delta: true
+            }
+        );
+        assert_eq!(
+            layers_iter
+                .next()
+                .unwrap()
+                .into_resident_layer()
+                .layer_desc()
+                .key(),
+            PersistentLayerKey {
+                key_range: get_key(1)..get_key(2),
+                lsn_range: Lsn(0x18)..Lsn(0x20),
+                is_delta: true
+            }
+        );
     }
 
     #[tokio::test]
@@ -668,10 +710,8 @@ mod tests {
             tenant.conf,
             tline.timeline_id,
             tenant.tenant_shard_id,
-            get_key(0),
             Lsn(0x10)..Lsn(N as u64 * 16 + 0x10),
             4 * 1024 * 1024,
-            &ctx,
         )
         .await
         .unwrap();
@@ -689,10 +729,20 @@ mod tests {
                 .await
                 .unwrap();
         }
-        let delta_layers = delta_writer
-            .finish(&tline, &ctx, get_key(N as u32))
-            .await
-            .unwrap();
+        let delta_layers = delta_writer.finish(&tline, &ctx).await.unwrap();
         assert_eq!(delta_layers.len(), 1);
+        let delta_layer = delta_layers
+            .into_iter()
+            .next()
+            .unwrap()
+            .into_resident_layer();
+        assert_eq!(
+            delta_layer.layer_desc().key(),
+            PersistentLayerKey {
+                key_range: get_key(0)..get_key(1),
+                lsn_range: Lsn(0x10)..Lsn(N as u64 * 16 + 0x10),
+                is_delta: true
+            }
+        );
     }
 }
diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs
index a87b502cd6..0b5c520ba7 100644
--- a/pageserver/src/tenant/timeline/compaction.rs
+++ b/pageserver/src/tenant/timeline/compaction.rs
@@ -1809,7 +1809,6 @@ impl Timeline {
             .unwrap();
         // We don't want any of the produced layers to cover the full key range (i.e., MIN..MAX) b/c it will then be recognized
         // as an L0 layer.
-        let hack_end_key = Key::NON_L0_MAX;
         let mut delta_layers = Vec::new();
         let mut image_layers = Vec::new();
         let mut downloaded_layers = Vec::new();
@@ -1855,10 +1854,8 @@ impl Timeline {
             self.conf,
             self.timeline_id,
             self.tenant_shard_id,
-            Key::MIN,
             lowest_retain_lsn..end_lsn,
             self.get_compaction_target_size(),
-            ctx,
         )
         .await?;
 
@@ -1965,7 +1962,7 @@ impl Timeline {
         let produced_image_layers = if let Some(writer) = image_layer_writer {
             if !dry_run {
                 writer
-                    .finish_with_discard_fn(self, ctx, hack_end_key, discard)
+                    .finish_with_discard_fn(self, ctx, Key::MAX, discard)
                     .await?
             } else {
                 let (layers, _) = writer.take()?;
@@ -1978,7 +1975,7 @@ impl Timeline {
 
         let produced_delta_layers = if !dry_run {
             delta_layer_writer
-                .finish_with_discard_fn(self, ctx, hack_end_key, discard)
+                .finish_with_discard_fn(self, ctx, discard)
                 .await?
         } else {
             let (layers, _) = delta_layer_writer.take()?;

From 982b376ea2e42d45f70c625ec91ac513f9f3a661 Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki@neon.tech>
Date: Tue, 10 Sep 2024 00:04:00 +0300
Subject: [PATCH 120/159] Update parquet crate to a released version (#8961)

PR #7782 set the dependency in Cargo.toml to 'master', and locked the
version to commit that contained a specific fix, because we needed the
fix before it was included in a versioned release. The fix was later
included in parquet crate version 52.0.0, so we can now switch back to
using a released version. The latest release is 53.0.0, switch straight
to that.

---------

Co-authored-by: Conrad Ludgate <conradludgate@gmail.com>
---
 Cargo.lock                   | 10 +++++---
 Cargo.toml                   |  8 ++----
 proxy/src/context/parquet.rs | 48 ++++++++++++++++++------------------
 workspace_hack/Cargo.toml    |  4 +--
 4 files changed, 34 insertions(+), 36 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index 4fb3ac7223..3ca6acbc3e 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -3907,8 +3907,9 @@ dependencies = [
 
 [[package]]
 name = "parquet"
-version = "51.0.0"
-source = "git+https://github.com/apache/arrow-rs?branch=master#2534976a564be3d2d56312dc88fb1b6ed4cef829"
+version = "53.0.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f0fbf928021131daaa57d334ca8e3904fe9ae22f73c56244fc7db9b04eedc3d8"
 dependencies = [
  "ahash",
  "bytes",
@@ -3927,8 +3928,9 @@ dependencies = [
 
 [[package]]
 name = "parquet_derive"
-version = "51.0.0"
-source = "git+https://github.com/apache/arrow-rs?branch=master#2534976a564be3d2d56312dc88fb1b6ed4cef829"
+version = "53.0.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "86e9fcfae007533a06b580429a3f7e07cb833ec8aa37c041c16563e7918f057e"
 dependencies = [
  "parquet",
  "proc-macro2",
diff --git a/Cargo.toml b/Cargo.toml
index 40e399619d..fd1d4e016c 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -123,8 +123,8 @@ opentelemetry = "0.20.0"
 opentelemetry-otlp = { version = "0.13.0", default-features=false, features = ["http-proto", "trace", "http", "reqwest-client"] }
 opentelemetry-semantic-conventions = "0.12.0"
 parking_lot = "0.12"
-parquet = { version = "51.0.0", default-features = false, features = ["zstd"] }
-parquet_derive = "51.0.0"
+parquet = { version = "53", default-features = false, features = ["zstd"] }
+parquet_derive = "53"
 pbkdf2 = { version = "0.12.1", features = ["simple", "std"] }
 pin-project-lite = "0.2"
 procfs = "0.16"
@@ -254,10 +254,6 @@ tonic-build = "0.9"
 # Needed to get `tokio-postgres-rustls` to depend on our fork.
 tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev = "20031d7a9ee1addeae6e0968e3899ae6bf01cee2" }
 
-# bug fixes for UUID
-parquet = { git = "https://github.com/apache/arrow-rs", branch = "master" }
-parquet_derive = { git = "https://github.com/apache/arrow-rs", branch = "master" }
-
 ################# Binary contents sections
 
 [profile.release]
diff --git a/proxy/src/context/parquet.rs b/proxy/src/context/parquet.rs
index c6f83fd069..fafea2a08f 100644
--- a/proxy/src/context/parquet.rs
+++ b/proxy/src/context/parquet.rs
@@ -598,15 +598,15 @@ mod tests {
         assert_eq!(
             file_stats,
             [
-                (1315874, 3, 6000),
-                (1315867, 3, 6000),
-                (1315927, 3, 6000),
-                (1315884, 3, 6000),
-                (1316014, 3, 6000),
-                (1315856, 3, 6000),
-                (1315648, 3, 6000),
-                (1315884, 3, 6000),
-                (438913, 1, 2000)
+                (1312632, 3, 6000),
+                (1312621, 3, 6000),
+                (1312680, 3, 6000),
+                (1312637, 3, 6000),
+                (1312773, 3, 6000),
+                (1312610, 3, 6000),
+                (1312404, 3, 6000),
+                (1312639, 3, 6000),
+                (437848, 1, 2000)
             ]
         );
 
@@ -638,11 +638,11 @@ mod tests {
         assert_eq!(
             file_stats,
             [
-                (1208861, 5, 10000),
-                (1208592, 5, 10000),
-                (1208885, 5, 10000),
-                (1208873, 5, 10000),
-                (1209128, 5, 10000)
+                (1203465, 5, 10000),
+                (1203189, 5, 10000),
+                (1203490, 5, 10000),
+                (1203475, 5, 10000),
+                (1203729, 5, 10000)
             ]
         );
 
@@ -667,15 +667,15 @@ mod tests {
         assert_eq!(
             file_stats,
             [
-                (1315874, 3, 6000),
-                (1315867, 3, 6000),
-                (1315927, 3, 6000),
-                (1315884, 3, 6000),
-                (1316014, 3, 6000),
-                (1315856, 3, 6000),
-                (1315648, 3, 6000),
-                (1315884, 3, 6000),
-                (438913, 1, 2000)
+                (1312632, 3, 6000),
+                (1312621, 3, 6000),
+                (1312680, 3, 6000),
+                (1312637, 3, 6000),
+                (1312773, 3, 6000),
+                (1312610, 3, 6000),
+                (1312404, 3, 6000),
+                (1312639, 3, 6000),
+                (437848, 1, 2000)
             ]
         );
 
@@ -712,7 +712,7 @@ mod tests {
         // files are smaller than the size threshold, but they took too long to fill so were flushed early
         assert_eq!(
             file_stats,
-            [(659836, 2, 3001), (659550, 2, 3000), (659346, 2, 2999)]
+            [(657696, 2, 3001), (657410, 2, 3000), (657206, 2, 2999)]
         );
 
         tmpdir.close().unwrap();
diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml
index 411ca81032..140c43639e 100644
--- a/workspace_hack/Cargo.toml
+++ b/workspace_hack/Cargo.toml
@@ -60,7 +60,7 @@ num-bigint = { version = "0.4" }
 num-integer = { version = "0.1", features = ["i128"] }
 num-traits = { version = "0.2", features = ["i128", "libm"] }
 once_cell = { version = "1" }
-parquet = { git = "https://github.com/apache/arrow-rs", branch = "master", default-features = false, features = ["zstd"] }
+parquet = { version = "53", default-features = false, features = ["zstd"] }
 prost = { version = "0.11" }
 rand = { version = "0.8", features = ["small_rng"] }
 regex = { version = "1" }
@@ -116,7 +116,7 @@ num-bigint = { version = "0.4" }
 num-integer = { version = "0.1", features = ["i128"] }
 num-traits = { version = "0.2", features = ["i128", "libm"] }
 once_cell = { version = "1" }
-parquet = { git = "https://github.com/apache/arrow-rs", branch = "master", default-features = false, features = ["zstd"] }
+parquet = { version = "53", default-features = false, features = ["zstd"] }
 proc-macro2 = { version = "1" }
 prost = { version = "0.11" }
 quote = { version = "1" }

From 842be0ba74c4c6e4245c29c3fffae4401d282c4a Mon Sep 17 00:00:00 2001
From: Matthias van de Meent <matthias@neon.tech>
Date: Tue, 10 Sep 2024 00:01:52 +0200
Subject: [PATCH 121/159] Specialize WalIngest on PostgreSQL version (#8904)

The current code assumes that most of this functionality is
version-independent, which is only true up to v16 - PostgreSQL 17 has a
new field in CheckPoint that we need to keep track of.

This basically removes the file-level dependency on v14, and replaces it
with switches that load the correct version dependencies where required.
---
 libs/postgres_ffi/build.rs   |   1 +
 libs/postgres_ffi/src/lib.rs | 104 ++++++++++++++++
 pageserver/src/walingest.rs  | 235 +++++++++++++++++++++--------------
 3 files changed, 249 insertions(+), 91 deletions(-)

diff --git a/libs/postgres_ffi/build.rs b/libs/postgres_ffi/build.rs
index d3e3ce648f..a346390f3d 100644
--- a/libs/postgres_ffi/build.rs
+++ b/libs/postgres_ffi/build.rs
@@ -121,6 +121,7 @@ fn main() -> anyhow::Result<()> {
             .allowlist_type("XLogPageHeaderData")
             .allowlist_type("XLogLongPageHeaderData")
             .allowlist_var("XLOG_PAGE_MAGIC")
+            .allowlist_var("PG_MAJORVERSION_NUM")
             .allowlist_var("PG_CONTROL_FILE_SIZE")
             .allowlist_var("PG_CONTROLFILEDATA_OFFSETOF_CRC")
             .allowlist_type("PageHeaderData")
diff --git a/libs/postgres_ffi/src/lib.rs b/libs/postgres_ffi/src/lib.rs
index 9acb105e9b..f18e0c603b 100644
--- a/libs/postgres_ffi/src/lib.rs
+++ b/libs/postgres_ffi/src/lib.rs
@@ -44,6 +44,9 @@ macro_rules! postgres_ffi {
             // Re-export some symbols from bindings
             pub use bindings::DBState_DB_SHUTDOWNED;
             pub use bindings::{CheckPoint, ControlFileData, XLogRecord};
+
+            pub const ZERO_CHECKPOINT: bytes::Bytes =
+                bytes::Bytes::from_static(&[0u8; xlog_utils::SIZEOF_CHECKPOINT]);
         }
     };
 }
@@ -106,6 +109,107 @@ macro_rules! dispatch_pgversion {
     };
 }
 
+#[macro_export]
+macro_rules! enum_pgversion_dispatch {
+    ($name:expr, $typ:ident, $bind:ident, $code:block) => {
+        enum_pgversion_dispatch!(
+            name = $name,
+            bind = $bind,
+            typ = $typ,
+            code = $code,
+            pgversions = [
+                V14 : v14,
+                V15 : v15,
+                V16 : v16,
+            ]
+        )
+    };
+    (name = $name:expr,
+     bind = $bind:ident,
+     typ = $typ:ident,
+     code = $code:block,
+     pgversions = [$($variant:ident : $md:ident),+ $(,)?]) => {
+        match $name {
+            $(
+            self::$typ::$variant($bind) => {
+                use $crate::$md as pgv;
+                $code
+            }
+            ),+,
+        }
+    };
+}
+
+#[macro_export]
+macro_rules! enum_pgversion {
+    {$name:ident, pgv :: $t:ident} => {
+        enum_pgversion!{
+            name = $name,
+            typ = $t,
+            pgversions = [
+                V14 : v14,
+                V15 : v15,
+                V16 : v16,
+            ]
+        }
+    };
+    {$name:ident, pgv :: $p:ident :: $t:ident} => {
+        enum_pgversion!{
+            name = $name,
+            path = $p,
+            typ = $t,
+            pgversions = [
+                V14 : v14,
+                V15 : v15,
+                V16 : v16,
+            ]
+        }
+    };
+    {name = $name:ident,
+     typ = $t:ident,
+     pgversions = [$($variant:ident : $md:ident),+ $(,)?]} => {
+        pub enum $name {
+            $($variant ( $crate::$md::$t )),+
+        }
+        impl self::$name {
+            pub fn pg_version(&self) -> u32 {
+                enum_pgversion_dispatch!(self, $name, _ign, {
+                    pgv::bindings::PG_MAJORVERSION_NUM
+                })
+            }
+        }
+        $(
+        impl Into<self::$name> for $crate::$md::$t {
+            fn into(self) -> self::$name {
+                self::$name::$variant (self)
+            }
+        }
+        )+
+    };
+    {name = $name:ident,
+     path = $p:ident,
+     typ = $t:ident,
+     pgversions = [$($variant:ident : $md:ident),+ $(,)?]} => {
+        pub enum $name {
+            $($variant ($crate::$md::$p::$t)),+
+        }
+        impl $name {
+            pub fn pg_version(&self) -> u32 {
+                enum_pgversion_dispatch!(self, $name, _ign, {
+                    pgv::bindings::PG_MAJORVERSION_NUM
+                })
+            }
+        }
+        $(
+        impl Into<$name> for $crate::$md::$p::$t {
+            fn into(self) -> $name {
+                $name::$variant (self)
+            }
+        }
+        )+
+    };
+}
+
 pub mod pg_constants;
 pub mod relfile_utils;
 
diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs
index 2d3841881b..39bc9e385f 100644
--- a/pageserver/src/walingest.rs
+++ b/pageserver/src/walingest.rs
@@ -25,9 +25,7 @@ use std::time::Duration;
 use std::time::SystemTime;
 
 use pageserver_api::shard::ShardIdentity;
-use postgres_ffi::v14::nonrelfile_utils::clogpage_precedes;
-use postgres_ffi::v14::nonrelfile_utils::slru_may_delete_clogsegment;
-use postgres_ffi::TimestampTz;
+use postgres_ffi::{dispatch_pgversion, enum_pgversion, enum_pgversion_dispatch, TimestampTz};
 use postgres_ffi::{fsm_logical_to_physical, page_is_new, page_set_lsn};
 
 use anyhow::{bail, Context, Result};
@@ -48,16 +46,31 @@ use pageserver_api::key::rel_block_to_key;
 use pageserver_api::reltag::{BlockNumber, RelTag, SlruKind};
 use postgres_ffi::pg_constants;
 use postgres_ffi::relfile_utils::{FSM_FORKNUM, INIT_FORKNUM, MAIN_FORKNUM, VISIBILITYMAP_FORKNUM};
-use postgres_ffi::v14::nonrelfile_utils::mx_offset_to_member_segment;
-use postgres_ffi::v14::xlog_utils::*;
-use postgres_ffi::v14::CheckPoint;
 use postgres_ffi::TransactionId;
 use postgres_ffi::BLCKSZ;
+use utils::bin_ser::SerializeError;
 use utils::lsn::Lsn;
 
+enum_pgversion! {CheckPoint, pgv::CheckPoint}
+
+impl CheckPoint {
+    fn encode(&self) -> Result<Bytes, SerializeError> {
+        enum_pgversion_dispatch!(self, CheckPoint, cp, { cp.encode() })
+    }
+
+    fn update_next_xid(&mut self, xid: u32) -> bool {
+        enum_pgversion_dispatch!(self, CheckPoint, cp, { cp.update_next_xid(xid) })
+    }
+
+    pub fn update_next_multixid(&mut self, multi_xid: u32, multi_offset: u32) -> bool {
+        enum_pgversion_dispatch!(self, CheckPoint, cp, {
+            cp.update_next_multixid(multi_xid, multi_offset)
+        })
+    }
+}
+
 pub struct WalIngest {
     shard: ShardIdentity,
-    pg_version: u32,
     checkpoint: CheckPoint,
     checkpoint_modified: bool,
     warn_ingest_lag: WarnIngestLag,
@@ -78,12 +91,16 @@ impl WalIngest {
         // Fetch the latest checkpoint into memory, so that we can compare with it
         // quickly in `ingest_record` and update it when it changes.
         let checkpoint_bytes = timeline.get_checkpoint(startpoint, ctx).await?;
-        let checkpoint = CheckPoint::decode(&checkpoint_bytes)?;
-        trace!("CheckPoint.nextXid = {}", checkpoint.nextXid.value);
+        let pgversion = timeline.pg_version;
+
+        let checkpoint = dispatch_pgversion!(pgversion, {
+            let checkpoint = pgv::CheckPoint::decode(&checkpoint_bytes)?;
+            trace!("CheckPoint.nextXid = {}", checkpoint.nextXid.value);
+            <pgv::CheckPoint as Into<CheckPoint>>::into(checkpoint)
+        });
 
         Ok(WalIngest {
             shard: *timeline.get_shard_identity(),
-            pg_version: timeline.pg_version,
             checkpoint,
             checkpoint_modified: false,
             warn_ingest_lag: WarnIngestLag {
@@ -117,7 +134,7 @@ impl WalIngest {
 
         modification.set_lsn(lsn)?;
 
-        if decoded.is_dbase_create_copy(self.pg_version) {
+        if decoded.is_dbase_create_copy(pg_version) {
             // Records of this type should always be preceded by a commit(), as they
             // rely on reading data pages back from the Timeline.
             assert!(!modification.has_dirty_data_pages());
@@ -337,70 +354,67 @@ impl WalIngest {
             pg_constants::RM_XLOG_ID => {
                 let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK;
 
-                if info == pg_constants::XLOG_NEXTOID {
-                    let next_oid = buf.get_u32_le();
-                    if self.checkpoint.nextOid != next_oid {
-                        self.checkpoint.nextOid = next_oid;
+                enum_pgversion_dispatch!(&mut self.checkpoint, CheckPoint, cp, {
+                    if info == pg_constants::XLOG_NEXTOID {
+                        let next_oid = buf.get_u32_le();
+                        if cp.nextOid != next_oid {
+                            cp.nextOid = next_oid;
+                            self.checkpoint_modified = true;
+                        }
+                    } else if info == pg_constants::XLOG_CHECKPOINT_ONLINE
+                        || info == pg_constants::XLOG_CHECKPOINT_SHUTDOWN
+                    {
+                        let mut checkpoint_bytes = [0u8; pgv::xlog_utils::SIZEOF_CHECKPOINT];
+                        buf.copy_to_slice(&mut checkpoint_bytes);
+                        let xlog_checkpoint = pgv::CheckPoint::decode(&checkpoint_bytes)?;
+                        trace!(
+                            "xlog_checkpoint.oldestXid={}, checkpoint.oldestXid={}",
+                            xlog_checkpoint.oldestXid,
+                            cp.oldestXid
+                        );
+                        if (cp.oldestXid.wrapping_sub(xlog_checkpoint.oldestXid) as i32) < 0 {
+                            cp.oldestXid = xlog_checkpoint.oldestXid;
+                        }
+                        trace!(
+                            "xlog_checkpoint.oldestActiveXid={}, checkpoint.oldestActiveXid={}",
+                            xlog_checkpoint.oldestActiveXid,
+                            cp.oldestActiveXid
+                        );
+
+                        // A shutdown checkpoint has `oldestActiveXid == InvalidTransactionid`,
+                        // because at shutdown, all in-progress transactions will implicitly
+                        // end. Postgres startup code knows that, and allows hot standby to start
+                        // immediately from a shutdown checkpoint.
+                        //
+                        // In Neon, Postgres hot standby startup always behaves as if starting from
+                        // an online checkpoint. It needs a valid `oldestActiveXid` value, so
+                        // instead of overwriting self.checkpoint.oldestActiveXid with
+                        // InvalidTransactionid from the checkpoint WAL record, update it to a
+                        // proper value, knowing that there are no in-progress transactions at this
+                        // point, except for prepared transactions.
+                        //
+                        // See also the neon code changes in the InitWalRecovery() function.
+                        if xlog_checkpoint.oldestActiveXid == pg_constants::INVALID_TRANSACTION_ID
+                            && info == pg_constants::XLOG_CHECKPOINT_SHUTDOWN
+                        {
+                            let mut oldest_active_xid = cp.nextXid.value as u32;
+                            for xid in modification.tline.list_twophase_files(lsn, ctx).await? {
+                                if (xid.wrapping_sub(oldest_active_xid) as i32) < 0 {
+                                    oldest_active_xid = xid;
+                                }
+                            }
+                            cp.oldestActiveXid = oldest_active_xid;
+                        } else {
+                            cp.oldestActiveXid = xlog_checkpoint.oldestActiveXid;
+                        }
+
+                        // Write a new checkpoint key-value pair on every checkpoint record, even
+                        // if nothing really changed. Not strictly required, but it seems nice to
+                        // have some trace of the checkpoint records in the layer files at the same
+                        // LSNs.
                         self.checkpoint_modified = true;
                     }
-                } else if info == pg_constants::XLOG_CHECKPOINT_ONLINE
-                    || info == pg_constants::XLOG_CHECKPOINT_SHUTDOWN
-                {
-                    let mut checkpoint_bytes = [0u8; SIZEOF_CHECKPOINT];
-                    buf.copy_to_slice(&mut checkpoint_bytes);
-                    let xlog_checkpoint = CheckPoint::decode(&checkpoint_bytes)?;
-                    trace!(
-                        "xlog_checkpoint.oldestXid={}, checkpoint.oldestXid={}",
-                        xlog_checkpoint.oldestXid,
-                        self.checkpoint.oldestXid
-                    );
-                    if (self
-                        .checkpoint
-                        .oldestXid
-                        .wrapping_sub(xlog_checkpoint.oldestXid) as i32)
-                        < 0
-                    {
-                        self.checkpoint.oldestXid = xlog_checkpoint.oldestXid;
-                    }
-                    trace!(
-                        "xlog_checkpoint.oldestActiveXid={}, checkpoint.oldestActiveXid={}",
-                        xlog_checkpoint.oldestActiveXid,
-                        self.checkpoint.oldestActiveXid
-                    );
-
-                    // A shutdown checkpoint has `oldestActiveXid == InvalidTransactionid`,
-                    // because at shutdown, all in-progress transactions will implicitly
-                    // end. Postgres startup code knows that, and allows hot standby to start
-                    // immediately from a shutdown checkpoint.
-                    //
-                    // In Neon, Postgres hot standby startup always behaves as if starting from
-                    // an online checkpoint. It needs a valid `oldestActiveXid` value, so
-                    // instead of overwriting self.checkpoint.oldestActiveXid with
-                    // InvalidTransactionid from the checkpoint WAL record, update it to a
-                    // proper value, knowing that there are no in-progress transactions at this
-                    // point, except for prepared transactions.
-                    //
-                    // See also the neon code changes in the InitWalRecovery() function.
-                    if xlog_checkpoint.oldestActiveXid == pg_constants::INVALID_TRANSACTION_ID
-                        && info == pg_constants::XLOG_CHECKPOINT_SHUTDOWN
-                    {
-                        let mut oldest_active_xid = self.checkpoint.nextXid.value as u32;
-                        for xid in modification.tline.list_twophase_files(lsn, ctx).await? {
-                            if (xid.wrapping_sub(oldest_active_xid) as i32) < 0 {
-                                oldest_active_xid = xid;
-                            }
-                        }
-                        self.checkpoint.oldestActiveXid = oldest_active_xid;
-                    } else {
-                        self.checkpoint.oldestActiveXid = xlog_checkpoint.oldestActiveXid;
-                    }
-
-                    // Write a new checkpoint key-value pair on every checkpoint record, even
-                    // if nothing really changed. Not strictly required, but it seems nice to
-                    // have some trace of the checkpoint records in the layer files at the same
-                    // LSNs.
-                    self.checkpoint_modified = true;
-                }
+                });
             }
             pg_constants::RM_LOGICALMSG_ID => {
                 let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK;
@@ -424,7 +438,11 @@ impl WalIngest {
                 let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK;
                 if info == pg_constants::XLOG_RUNNING_XACTS {
                     let xlrec = crate::walrecord::XlRunningXacts::decode(&mut buf);
-                    self.checkpoint.oldestActiveXid = xlrec.oldest_running_xid;
+
+                    enum_pgversion_dispatch!(&mut self.checkpoint, CheckPoint, cp, {
+                        cp.oldestActiveXid = xlrec.oldest_running_xid;
+                    });
+
                     self.checkpoint_modified = true;
                 }
             }
@@ -539,7 +557,7 @@ impl WalIngest {
             && blk.has_image
             && decoded.xl_rmid == pg_constants::RM_XLOG_ID
             && (decoded.xl_info == pg_constants::XLOG_FPI
-                || decoded.xl_info == pg_constants::XLOG_FPI_FOR_HINT)
+            || decoded.xl_info == pg_constants::XLOG_FPI_FOR_HINT)
             // compression of WAL is not yet supported: fall back to storing the original WAL record
             && !postgres_ffi::bkpimage_is_compressed(blk.bimg_info, modification.tline.pg_version)
             // do not materialize null pages because them most likely be soon replaced with real data
@@ -1242,12 +1260,17 @@ impl WalIngest {
     fn warn_on_ingest_lag(
         &mut self,
         conf: &crate::config::PageServerConf,
-        wal_timestmap: TimestampTz,
+        wal_timestamp: TimestampTz,
     ) {
         debug_assert_current_span_has_tenant_and_timeline_id();
         let now = SystemTime::now();
         let rate_limits = &mut self.warn_ingest_lag;
-        match try_from_pg_timestamp(wal_timestmap) {
+
+        let ts = enum_pgversion_dispatch!(&self.checkpoint, CheckPoint, _cp, {
+            pgv::xlog_utils::try_from_pg_timestamp(wal_timestamp)
+        });
+
+        match ts {
             Ok(ts) => {
                 match now.duration_since(ts) {
                     Ok(lag) => {
@@ -1257,7 +1280,7 @@ impl WalIngest {
                                 warn!(%rate_limit_stats, %lag, "ingesting record with timestamp lagging more than wait_lsn_timeout");
                             })
                         }
-                    },
+                    }
                     Err(e) => {
                         let delta_t = e.duration();
                         // determined by prod victoriametrics query: 1000 * (timestamp(node_time_seconds{neon_service="pageserver"}) - node_time_seconds)
@@ -1271,7 +1294,6 @@ impl WalIngest {
                         }
                     }
                 };
-
             }
             Err(error) => {
                 rate_limits.timestamp_invalid_msg_ratelimit.call2(|rate_limit_stats| {
@@ -1379,14 +1401,17 @@ impl WalIngest {
         // truncated, but a checkpoint record with the updated values isn't written until
         // later. In Neon, a server can start at any LSN, not just on a checkpoint record,
         // so we keep the oldestXid and oldestXidDB up-to-date.
-        self.checkpoint.oldestXid = xlrec.oldest_xid;
-        self.checkpoint.oldestXidDB = xlrec.oldest_xid_db;
+        enum_pgversion_dispatch!(&mut self.checkpoint, CheckPoint, cp, {
+            cp.oldestXid = xlrec.oldest_xid;
+            cp.oldestXidDB = xlrec.oldest_xid_db;
+        });
         self.checkpoint_modified = true;
 
         // TODO Treat AdvanceOldestClogXid() or write a comment why we don't need it
 
         let latest_page_number =
-            self.checkpoint.nextXid.value as u32 / pg_constants::CLOG_XACTS_PER_PAGE;
+            enum_pgversion_dispatch!(self.checkpoint, CheckPoint, cp, { cp.nextXid.value }) as u32
+                / pg_constants::CLOG_XACTS_PER_PAGE;
 
         // Now delete all segments containing pages between xlrec.pageno
         // and latest_page_number.
@@ -1394,7 +1419,9 @@ impl WalIngest {
         // First, make an important safety check:
         // the current endpoint page must not be eligible for removal.
         // See SimpleLruTruncate() in slru.c
-        if clogpage_precedes(latest_page_number, xlrec.pageno) {
+        if dispatch_pgversion!(modification.tline.pg_version, {
+            pgv::nonrelfile_utils::clogpage_precedes(latest_page_number, xlrec.pageno)
+        }) {
             info!("could not truncate directory pg_xact apparent wraparound");
             return Ok(());
         }
@@ -1411,7 +1438,12 @@ impl WalIngest {
             .await?
         {
             let segpage = segno * pg_constants::SLRU_PAGES_PER_SEGMENT;
-            if slru_may_delete_clogsegment(segpage, xlrec.pageno) {
+
+            let may_delete = dispatch_pgversion!(modification.tline.pg_version, {
+                pgv::nonrelfile_utils::slru_may_delete_clogsegment(segpage, xlrec.pageno)
+            });
+
+            if may_delete {
                 modification
                     .drop_slru_segment(SlruKind::Clog, segno, ctx)
                     .await?;
@@ -1530,14 +1562,23 @@ impl WalIngest {
         xlrec: &XlMultiXactTruncate,
         ctx: &RequestContext,
     ) -> Result<()> {
-        self.checkpoint.oldestMulti = xlrec.end_trunc_off;
-        self.checkpoint.oldestMultiDB = xlrec.oldest_multi_db;
+        let (maxsegment, startsegment, endsegment) =
+            enum_pgversion_dispatch!(&mut self.checkpoint, CheckPoint, cp, {
+                cp.oldestMulti = xlrec.end_trunc_off;
+                cp.oldestMultiDB = xlrec.oldest_multi_db;
+                let maxsegment: i32 = pgv::nonrelfile_utils::mx_offset_to_member_segment(
+                    pg_constants::MAX_MULTIXACT_OFFSET,
+                );
+                let startsegment: i32 =
+                    pgv::nonrelfile_utils::mx_offset_to_member_segment(xlrec.start_trunc_memb);
+                let endsegment: i32 =
+                    pgv::nonrelfile_utils::mx_offset_to_member_segment(xlrec.end_trunc_memb);
+                (maxsegment, startsegment, endsegment)
+            });
+
         self.checkpoint_modified = true;
 
         // PerformMembersTruncation
-        let maxsegment: i32 = mx_offset_to_member_segment(pg_constants::MAX_MULTIXACT_OFFSET);
-        let startsegment: i32 = mx_offset_to_member_segment(xlrec.start_trunc_memb);
-        let endsegment: i32 = mx_offset_to_member_segment(xlrec.end_trunc_memb);
         let mut segment: i32 = startsegment;
 
         // Delete all the segments except the last one. The last segment can still
@@ -1811,11 +1852,23 @@ mod tests {
         // TODO
     }
 
-    static ZERO_CHECKPOINT: Bytes = Bytes::from_static(&[0u8; SIZEOF_CHECKPOINT]);
+    #[tokio::test]
+    async fn test_zeroed_checkpoint_decodes_correctly() -> Result<()> {
+        for i in 14..=16 {
+            dispatch_pgversion!(i, {
+                pgv::CheckPoint::decode(&pgv::ZERO_CHECKPOINT)?;
+            });
+        }
+
+        Ok(())
+    }
 
     async fn init_walingest_test(tline: &Timeline, ctx: &RequestContext) -> Result<WalIngest> {
         let mut m = tline.begin_modification(Lsn(0x10));
-        m.put_checkpoint(ZERO_CHECKPOINT.clone())?;
+        m.put_checkpoint(dispatch_pgversion!(
+            tline.pg_version,
+            pgv::ZERO_CHECKPOINT.clone()
+        ))?;
         m.put_relmap_file(0, 111, Bytes::from(""), ctx).await?; // dummy relmapper file
         m.commit(ctx).await?;
         let walingest = WalIngest::new(tline, Lsn(0x10), ctx).await?;

From 97582178cb576f8b68acc53535adf8918d7dbd94 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Arpad=20M=C3=BCller?= <arpad-m@users.noreply.github.com>
Date: Tue, 10 Sep 2024 02:40:00 +0200
Subject: [PATCH 122/159] Remove async_trait from the Handler trait (#8958)

Newest attempt to remove `async_trait` from the Handler trait.

Earlier attempts were in #7301 and #8296 .
---
 libs/postgres_backend/src/lib.rs             |  5 +-
 libs/postgres_backend/tests/simple_select.rs |  1 -
 pageserver/src/page_service.rs               |  1 -
 proxy/src/console/mgmt.rs                    |  2 +-
 safekeeper/src/handler.rs                    | 80 ++++++++++----------
 5 files changed, 44 insertions(+), 45 deletions(-)

diff --git a/libs/postgres_backend/src/lib.rs b/libs/postgres_backend/src/lib.rs
index 600f1d728c..8ea4b93fb1 100644
--- a/libs/postgres_backend/src/lib.rs
+++ b/libs/postgres_backend/src/lib.rs
@@ -81,17 +81,16 @@ pub fn is_expected_io_error(e: &io::Error) -> bool {
     )
 }
 
-#[async_trait::async_trait]
 pub trait Handler<IO> {
     /// Handle single query.
     /// postgres_backend will issue ReadyForQuery after calling this (this
     /// might be not what we want after CopyData streaming, but currently we don't
     /// care). It will also flush out the output buffer.
-    async fn process_query(
+    fn process_query(
         &mut self,
         pgb: &mut PostgresBackend<IO>,
         query_string: &str,
-    ) -> Result<(), QueryError>;
+    ) -> impl Future<Output = Result<(), QueryError>>;
 
     /// Called on startup packet receival, allows to process params.
     ///
diff --git a/libs/postgres_backend/tests/simple_select.rs b/libs/postgres_backend/tests/simple_select.rs
index 7ec85f0dbe..900083ea7f 100644
--- a/libs/postgres_backend/tests/simple_select.rs
+++ b/libs/postgres_backend/tests/simple_select.rs
@@ -23,7 +23,6 @@ async fn make_tcp_pair() -> (TcpStream, TcpStream) {
 
 struct TestHandler {}
 
-#[async_trait::async_trait]
 impl<IO: AsyncRead + AsyncWrite + Unpin + Send> Handler<IO> for TestHandler {
     // return single col 'hey' for any query
     async fn process_query(
diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs
index 39c6a6fb74..9261b7481d 100644
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
@@ -1199,7 +1199,6 @@ impl PageServerHandler {
     }
 }
 
-#[async_trait::async_trait]
 impl<IO> postgres_backend::Handler<IO> for PageServerHandler
 where
     IO: AsyncRead + AsyncWrite + Send + Sync + Unpin,
diff --git a/proxy/src/console/mgmt.rs b/proxy/src/console/mgmt.rs
index 2ed4f5f206..ee5f83ee76 100644
--- a/proxy/src/console/mgmt.rs
+++ b/proxy/src/console/mgmt.rs
@@ -78,7 +78,7 @@ pub(crate) type ComputeReady = DatabaseInfo;
 
 // TODO: replace with an http-based protocol.
 struct MgmtHandler;
-#[async_trait::async_trait]
+
 impl postgres_backend::Handler<tokio::net::TcpStream> for MgmtHandler {
     async fn process_query(
         &mut self,
diff --git a/safekeeper/src/handler.rs b/safekeeper/src/handler.rs
index 2c519433ef..3f00b69cde 100644
--- a/safekeeper/src/handler.rs
+++ b/safekeeper/src/handler.rs
@@ -2,6 +2,7 @@
 //! protocol commands.
 
 use anyhow::Context;
+use std::future::Future;
 use std::str::{self, FromStr};
 use std::sync::Arc;
 use tokio::io::{AsyncRead, AsyncWrite};
@@ -95,7 +96,6 @@ fn cmd_to_string(cmd: &SafekeeperPostgresCommand) -> &str {
     }
 }
 
-#[async_trait::async_trait]
 impl<IO: AsyncRead + AsyncWrite + Unpin + Send> postgres_backend::Handler<IO>
     for SafekeeperPostgresHandler
 {
@@ -197,49 +197,51 @@ impl<IO: AsyncRead + AsyncWrite + Unpin + Send> postgres_backend::Handler<IO>
         Ok(())
     }
 
-    async fn process_query(
+    fn process_query(
         &mut self,
         pgb: &mut PostgresBackend<IO>,
         query_string: &str,
-    ) -> Result<(), QueryError> {
-        if query_string
-            .to_ascii_lowercase()
-            .starts_with("set datestyle to ")
-        {
-            // important for debug because psycopg2 executes "SET datestyle TO 'ISO'" on connect
-            pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
-            return Ok(());
-        }
-
-        let cmd = parse_cmd(query_string)?;
-        let cmd_str = cmd_to_string(&cmd);
-
-        let _guard = PG_QUERIES_GAUGE.with_label_values(&[cmd_str]).guard();
-
-        info!("got query {:?}", query_string);
-
-        let tenant_id = self.tenant_id.context("tenantid is required")?;
-        let timeline_id = self.timeline_id.context("timelineid is required")?;
-        self.check_permission(Some(tenant_id))?;
-        self.ttid = TenantTimelineId::new(tenant_id, timeline_id);
-
-        match cmd {
-            SafekeeperPostgresCommand::StartWalPush => {
-                self.handle_start_wal_push(pgb)
-                    .instrument(info_span!("WAL receiver"))
-                    .await
+    ) -> impl Future<Output = Result<(), QueryError>> {
+        Box::pin(async move {
+            if query_string
+                .to_ascii_lowercase()
+                .starts_with("set datestyle to ")
+            {
+                // important for debug because psycopg2 executes "SET datestyle TO 'ISO'" on connect
+                pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
+                return Ok(());
             }
-            SafekeeperPostgresCommand::StartReplication { start_lsn, term } => {
-                self.handle_start_replication(pgb, start_lsn, term)
-                    .instrument(info_span!("WAL sender"))
-                    .await
+
+            let cmd = parse_cmd(query_string)?;
+            let cmd_str = cmd_to_string(&cmd);
+
+            let _guard = PG_QUERIES_GAUGE.with_label_values(&[cmd_str]).guard();
+
+            info!("got query {:?}", query_string);
+
+            let tenant_id = self.tenant_id.context("tenantid is required")?;
+            let timeline_id = self.timeline_id.context("timelineid is required")?;
+            self.check_permission(Some(tenant_id))?;
+            self.ttid = TenantTimelineId::new(tenant_id, timeline_id);
+
+            match cmd {
+                SafekeeperPostgresCommand::StartWalPush => {
+                    self.handle_start_wal_push(pgb)
+                        .instrument(info_span!("WAL receiver"))
+                        .await
+                }
+                SafekeeperPostgresCommand::StartReplication { start_lsn, term } => {
+                    self.handle_start_replication(pgb, start_lsn, term)
+                        .instrument(info_span!("WAL sender"))
+                        .await
+                }
+                SafekeeperPostgresCommand::IdentifySystem => self.handle_identify_system(pgb).await,
+                SafekeeperPostgresCommand::TimelineStatus => self.handle_timeline_status(pgb).await,
+                SafekeeperPostgresCommand::JSONCtrl { ref cmd } => {
+                    handle_json_ctrl(self, pgb, cmd).await
+                }
             }
-            SafekeeperPostgresCommand::IdentifySystem => self.handle_identify_system(pgb).await,
-            SafekeeperPostgresCommand::TimelineStatus => self.handle_timeline_status(pgb).await,
-            SafekeeperPostgresCommand::JSONCtrl { ref cmd } => {
-                handle_json_ctrl(self, pgb, cmd).await
-            }
-        }
+        })
     }
 }
 

From 26b5fcdc5077e5f4051f27c2e2d8f82ac5038acb Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Tue, 10 Sep 2024 12:54:25 +0100
Subject: [PATCH 123/159] reinstate write-path key check (#8973)

## Problem

In https://github.com/neondatabase/neon/pull/8621, validation of keys
during ingest was removed because the places where we actually store
keys are now past the point where we have already converted them to
CompactKey (i128) representation.

## Summary of changes

Reinstate validation at an earlier stage in ingest. This doesn't cover
literally every place we write a key, but it covers most cases where
we're trusting postgres to give us a valid key (i.e. one that doesn't
try and use a custom spacenode).
---
 pageserver/src/pgdatadir_mapping.rs | 49 ++++++++++++++++++++++++-----
 pageserver/src/walingest.rs         |  8 ++---
 2 files changed, 46 insertions(+), 11 deletions(-)

diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs
index 808d4b666e..6dd8851b13 100644
--- a/pageserver/src/pgdatadir_mapping.rs
+++ b/pageserver/src/pgdatadir_mapping.rs
@@ -1205,6 +1205,13 @@ impl<'a> DatadirModification<'a> {
         img: Bytes,
     ) -> anyhow::Result<()> {
         anyhow::ensure!(rel.relnode != 0, RelationError::InvalidRelnode);
+        let key = rel_block_to_key(rel, blknum);
+        if !key.is_valid_key_on_write_path() {
+            anyhow::bail!(
+                "the request contains data not supported by pageserver at {}",
+                key
+            );
+        }
         self.put(rel_block_to_key(rel, blknum), Value::Image(img));
         Ok(())
     }
@@ -1216,14 +1223,34 @@ impl<'a> DatadirModification<'a> {
         blknum: BlockNumber,
         img: Bytes,
     ) -> anyhow::Result<()> {
-        self.put(slru_block_to_key(kind, segno, blknum), Value::Image(img));
+        let key = slru_block_to_key(kind, segno, blknum);
+        if !key.is_valid_key_on_write_path() {
+            anyhow::bail!(
+                "the request contains data not supported by pageserver at {}",
+                key
+            );
+        }
+        self.put(key, Value::Image(img));
         Ok(())
     }
 
-    pub(crate) fn put_rel_page_image_zero(&mut self, rel: RelTag, blknum: BlockNumber) {
-        self.pending_zero_data_pages
-            .insert(rel_block_to_key(rel, blknum).to_compact());
+    pub(crate) fn put_rel_page_image_zero(
+        &mut self,
+        rel: RelTag,
+        blknum: BlockNumber,
+    ) -> anyhow::Result<()> {
+        anyhow::ensure!(rel.relnode != 0, RelationError::InvalidRelnode);
+        let key = rel_block_to_key(rel, blknum);
+        if !key.is_valid_key_on_write_path() {
+            anyhow::bail!(
+                "the request contains data not supported by pageserver: {} @ {}",
+                key,
+                self.lsn
+            );
+        }
+        self.pending_zero_data_pages.insert(key.to_compact());
         self.pending_bytes += ZERO_PAGE.len();
+        Ok(())
     }
 
     pub(crate) fn put_slru_page_image_zero(
@@ -1231,10 +1258,18 @@ impl<'a> DatadirModification<'a> {
         kind: SlruKind,
         segno: u32,
         blknum: BlockNumber,
-    ) {
-        self.pending_zero_data_pages
-            .insert(slru_block_to_key(kind, segno, blknum).to_compact());
+    ) -> anyhow::Result<()> {
+        let key = slru_block_to_key(kind, segno, blknum);
+        if !key.is_valid_key_on_write_path() {
+            anyhow::bail!(
+                "the request contains data not supported by pageserver: {} @ {}",
+                key,
+                self.lsn
+            );
+        }
+        self.pending_zero_data_pages.insert(key.to_compact());
         self.pending_bytes += ZERO_PAGE.len();
+        Ok(())
     }
 
     /// Call this at the end of each WAL record.
diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs
index 39bc9e385f..6e15ad81c3 100644
--- a/pageserver/src/walingest.rs
+++ b/pageserver/src/walingest.rs
@@ -1222,7 +1222,7 @@ impl WalIngest {
             if rec.blkno % pg_constants::SLOTS_PER_FSM_PAGE != 0 {
                 // Tail of last remaining FSM page has to be zeroed.
                 // We are not precise here and instead of digging in FSM bitmap format just clear the whole page.
-                modification.put_rel_page_image_zero(rel, fsm_physical_page_no);
+                modification.put_rel_page_image_zero(rel, fsm_physical_page_no)?;
                 fsm_physical_page_no += 1;
             }
             let nblocks = get_relsize(modification, rel, ctx).await?;
@@ -1244,7 +1244,7 @@ impl WalIngest {
             if rec.blkno % pg_constants::VM_HEAPBLOCKS_PER_PAGE != 0 {
                 // Tail of last remaining vm page has to be zeroed.
                 // We are not precise here and instead of digging in VM bitmap format just clear the whole page.
-                modification.put_rel_page_image_zero(rel, vm_page_no);
+                modification.put_rel_page_image_zero(rel, vm_page_no)?;
                 vm_page_no += 1;
             }
             let nblocks = get_relsize(modification, rel, ctx).await?;
@@ -1737,7 +1737,7 @@ impl WalIngest {
                     continue;
                 }
 
-                modification.put_rel_page_image_zero(rel, gap_blknum);
+                modification.put_rel_page_image_zero(rel, gap_blknum)?;
             }
         }
         Ok(())
@@ -1803,7 +1803,7 @@ impl WalIngest {
 
             // fill the gap with zeros
             for gap_blknum in old_nblocks..blknum {
-                modification.put_slru_page_image_zero(kind, segno, gap_blknum);
+                modification.put_slru_page_image_zero(kind, segno, gap_blknum)?;
             }
         }
         Ok(())

From bae793ffcd90470b26380053fe931a91545798a5 Mon Sep 17 00:00:00 2001
From: Folke Behrens <folke@neon.tech>
Date: Tue, 10 Sep 2024 15:36:08 +0200
Subject: [PATCH 124/159] proxy: Handle all let underscore instances (#8898)

* Most can be simply replaced
* One instance renamed to _rtchk (return-type check)
---
 proxy/src/cache/endpoints.rs                     |  2 +-
 proxy/src/console/messages.rs                    | 16 ++++++++--------
 proxy/src/context.rs                             | 10 +++++++---
 proxy/src/context/parquet.rs                     |  2 +-
 proxy/src/lib.rs                                 |  2 +-
 proxy/src/proxy/tests.rs                         |  2 +-
 .../connection_with_credentials_provider.rs      |  7 +++++--
 proxy/src/stream.rs                              | 15 +++++++++------
 proxy/src/url.rs                                 |  2 +-
 9 files changed, 34 insertions(+), 24 deletions(-)

diff --git a/proxy/src/cache/endpoints.rs b/proxy/src/cache/endpoints.rs
index f4762232d8..27121ce89e 100644
--- a/proxy/src/cache/endpoints.rs
+++ b/proxy/src/cache/endpoints.rs
@@ -242,6 +242,6 @@ mod tests {
     #[test]
     fn test() {
         let s = "{\"branch_created\":null,\"endpoint_created\":{\"endpoint_id\":\"ep-rapid-thunder-w0qqw2q9\"},\"project_created\":null,\"type\":\"endpoint_created\"}";
-        let _: ControlPlaneEventKey = serde_json::from_str(s).unwrap();
+        serde_json::from_str::<ControlPlaneEventKey>(s).unwrap();
     }
 }
diff --git a/proxy/src/console/messages.rs b/proxy/src/console/messages.rs
index a48c7316f6..9b66333cd4 100644
--- a/proxy/src/console/messages.rs
+++ b/proxy/src/console/messages.rs
@@ -395,7 +395,7 @@ mod tests {
                 }
             }
         });
-        let _: KickSession<'_> = serde_json::from_str(&json.to_string())?;
+        serde_json::from_str::<KickSession<'_>>(&json.to_string())?;
 
         Ok(())
     }
@@ -403,7 +403,7 @@ mod tests {
     #[test]
     fn parse_db_info() -> anyhow::Result<()> {
         // with password
-        let _: DatabaseInfo = serde_json::from_value(json!({
+        serde_json::from_value::<DatabaseInfo>(json!({
             "host": "localhost",
             "port": 5432,
             "dbname": "postgres",
@@ -413,7 +413,7 @@ mod tests {
         }))?;
 
         // without password
-        let _: DatabaseInfo = serde_json::from_value(json!({
+        serde_json::from_value::<DatabaseInfo>(json!({
             "host": "localhost",
             "port": 5432,
             "dbname": "postgres",
@@ -422,7 +422,7 @@ mod tests {
         }))?;
 
         // new field (forward compatibility)
-        let _: DatabaseInfo = serde_json::from_value(json!({
+        serde_json::from_value::<DatabaseInfo>(json!({
             "host": "localhost",
             "port": 5432,
             "dbname": "postgres",
@@ -441,7 +441,7 @@ mod tests {
             "address": "0.0.0.0",
             "aux": dummy_aux(),
         });
-        let _: WakeCompute = serde_json::from_str(&json.to_string())?;
+        serde_json::from_str::<WakeCompute>(&json.to_string())?;
         Ok(())
     }
 
@@ -451,18 +451,18 @@ mod tests {
         let json = json!({
             "role_secret": "secret",
         });
-        let _: GetRoleSecret = serde_json::from_str(&json.to_string())?;
+        serde_json::from_str::<GetRoleSecret>(&json.to_string())?;
         let json = json!({
             "role_secret": "secret",
             "allowed_ips": ["8.8.8.8"],
         });
-        let _: GetRoleSecret = serde_json::from_str(&json.to_string())?;
+        serde_json::from_str::<GetRoleSecret>(&json.to_string())?;
         let json = json!({
             "role_secret": "secret",
             "allowed_ips": ["8.8.8.8"],
             "project_id": "project",
         });
-        let _: GetRoleSecret = serde_json::from_str(&json.to_string())?;
+        serde_json::from_str::<GetRoleSecret>(&json.to_string())?;
 
         Ok(())
     }
diff --git a/proxy/src/context.rs b/proxy/src/context.rs
index 72e1fa1cee..c013218ad9 100644
--- a/proxy/src/context.rs
+++ b/proxy/src/context.rs
@@ -6,7 +6,7 @@ use pq_proto::StartupMessageParams;
 use smol_str::SmolStr;
 use std::net::IpAddr;
 use tokio::sync::mpsc;
-use tracing::{field::display, info, info_span, Span};
+use tracing::{debug, field::display, info, info_span, Span};
 use try_lock::TryLock;
 use uuid::Uuid;
 
@@ -362,7 +362,9 @@ impl RequestMonitoringInner {
                 });
         }
         if let Some(tx) = self.sender.take() {
-            let _: Result<(), _> = tx.send(RequestData::from(&*self));
+            tx.send(RequestData::from(&*self))
+                .inspect_err(|e| debug!("tx send failed: {e}"))
+                .ok();
         }
     }
 
@@ -371,7 +373,9 @@ impl RequestMonitoringInner {
         // Here we log the length of the session.
         self.disconnect_timestamp = Some(Utc::now());
         if let Some(tx) = self.disconnect_sender.take() {
-            let _: Result<(), _> = tx.send(RequestData::from(&*self));
+            tx.send(RequestData::from(&*self))
+                .inspect_err(|e| debug!("tx send failed: {e}"))
+                .ok();
         }
     }
 }
diff --git a/proxy/src/context/parquet.rs b/proxy/src/context/parquet.rs
index fafea2a08f..9f6f83022e 100644
--- a/proxy/src/context/parquet.rs
+++ b/proxy/src/context/parquet.rs
@@ -290,7 +290,7 @@ async fn worker_inner(
     }
 
     if !w.flushed_row_groups().is_empty() {
-        let _: Writer<BytesMut> = upload_parquet(w, len, &storage).await?;
+        let _rtchk: Writer<BytesMut> = upload_parquet(w, len, &storage).await?;
     }
 
     Ok(())
diff --git a/proxy/src/lib.rs b/proxy/src/lib.rs
index 923d6ae288..0070839aa8 100644
--- a/proxy/src/lib.rs
+++ b/proxy/src/lib.rs
@@ -3,7 +3,7 @@
 #![deny(
     deprecated,
     future_incompatible,
-    // TODO: consider let_underscore
+    let_underscore,
     nonstandard_style,
     rust_2024_compatibility
 )]
diff --git a/proxy/src/proxy/tests.rs b/proxy/src/proxy/tests.rs
index 4264dbae0f..752d982726 100644
--- a/proxy/src/proxy/tests.rs
+++ b/proxy/src/proxy/tests.rs
@@ -268,7 +268,7 @@ async fn keepalive_is_inherited() -> anyhow::Result<()> {
         anyhow::Ok(keepalive)
     });
 
-    let _ = TcpStream::connect(("127.0.0.1", port)).await?;
+    TcpStream::connect(("127.0.0.1", port)).await?;
     assert!(t.await??, "keepalive should be inherited");
 
     Ok(())
diff --git a/proxy/src/redis/connection_with_credentials_provider.rs b/proxy/src/redis/connection_with_credentials_provider.rs
index 7d222e2dec..2de66b58b1 100644
--- a/proxy/src/redis/connection_with_credentials_provider.rs
+++ b/proxy/src/redis/connection_with_credentials_provider.rs
@@ -6,7 +6,7 @@ use redis::{
     ConnectionInfo, IntoConnectionInfo, RedisConnectionInfo, RedisResult,
 };
 use tokio::task::JoinHandle;
-use tracing::{error, info};
+use tracing::{debug, error, info};
 
 use super::elasticache::CredentialsProvider;
 
@@ -109,7 +109,10 @@ impl ConnectionWithCredentialsProvider {
             let credentials_provider = credentials_provider.clone();
             let con2 = con.clone();
             let f = tokio::spawn(async move {
-                let _ = Self::keep_connection(con2, credentials_provider).await;
+                Self::keep_connection(con2, credentials_provider)
+                    .await
+                    .inspect_err(|e| debug!("keep_connection failed: {e}"))
+                    .ok();
             });
             self.refresh_token_task = Some(f);
         }
diff --git a/proxy/src/stream.rs b/proxy/src/stream.rs
index c14dd18afe..e2fc73235e 100644
--- a/proxy/src/stream.rs
+++ b/proxy/src/stream.rs
@@ -12,6 +12,7 @@ use std::{io, task};
 use thiserror::Error;
 use tokio::io::{AsyncRead, AsyncWrite, ReadBuf};
 use tokio_rustls::server::TlsStream;
+use tracing::debug;
 
 /// Stream wrapper which implements libpq's protocol.
 ///
@@ -138,9 +139,10 @@ impl<S: AsyncWrite + Unpin> PqStream<S> {
         );
 
         // already error case, ignore client IO error
-        let _: Result<_, std::io::Error> = self
-            .write_message(&BeMessage::ErrorResponse(msg, None))
-            .await;
+        self.write_message(&BeMessage::ErrorResponse(msg, None))
+            .await
+            .inspect_err(|e| debug!("write_message failed: {e}"))
+            .ok();
 
         Err(ReportedError {
             source: anyhow::anyhow!(msg),
@@ -164,9 +166,10 @@ impl<S: AsyncWrite + Unpin> PqStream<S> {
         );
 
         // already error case, ignore client IO error
-        let _: Result<_, std::io::Error> = self
-            .write_message(&BeMessage::ErrorResponse(&msg, None))
-            .await;
+        self.write_message(&BeMessage::ErrorResponse(&msg, None))
+            .await
+            .inspect_err(|e| debug!("write_message failed: {e}"))
+            .ok();
 
         Err(ReportedError {
             source: anyhow::anyhow!(error),
diff --git a/proxy/src/url.rs b/proxy/src/url.rs
index 28ac7efdfc..270cd7c24d 100644
--- a/proxy/src/url.rs
+++ b/proxy/src/url.rs
@@ -57,7 +57,7 @@ mod tests {
     fn bad_url() {
         let url = "test:foobar";
         url.parse::<url::Url>().expect("unexpected parsing failure");
-        let _ = url.parse::<ApiUrl>().expect_err("should not parse");
+        url.parse::<ApiUrl>().expect_err("should not parse");
     }
 
     #[test]

From cb060548fb2115ca6a57a95c6c947c45fc2095a6 Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Wed, 11 Sep 2024 18:45:34 +0100
Subject: [PATCH 125/159] libs: tweak PageserverUtilization::is_overloaded
 (#8946)

## Problem

Having run in production for a while, we see that nodes are generally
safely oversubscribed by about a factor of 2.

## Summary of changes

Tweak the is_overloaded method to check for utililzation over 200%
rather than over 100%
---
 libs/pageserver_api/src/models/utilization.rs | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

diff --git a/libs/pageserver_api/src/models/utilization.rs b/libs/pageserver_api/src/models/utilization.rs
index 844a0cda5d..641aa51989 100644
--- a/libs/pageserver_api/src/models/utilization.rs
+++ b/libs/pageserver_api/src/models/utilization.rs
@@ -89,8 +89,19 @@ impl PageserverUtilization {
 
     /// If a node is currently hosting more work than it can comfortably handle.  This does not indicate that
     /// it will fail, but it is a strong signal that more work should not be added unless there is no alternative.
+    ///
+    /// When a node is overloaded, we may override soft affinity preferences and do things like scheduling
+    /// into a node in a less desirable AZ, if all the nodes in the preferred AZ are overloaded.
     pub fn is_overloaded(score: RawScore) -> bool {
-        score >= Self::UTILIZATION_FULL
+        // Why the factor of two?  This is unscientific but reflects behavior of real systems:
+        // - In terms of shard counts, a node's preferred max count is a soft limit intended to keep
+        //   startup and housekeeping jobs nice and responsive.  We can go to double this limit if needed
+        //   until some more nodes are deployed.
+        // - In terms of disk space, the node's utilization heuristic assumes every tenant needs to
+        //   hold its biggest timeline fully on disk, which is tends to be an over estimate when
+        //   some tenants are very idle and have dropped layers from disk.  In practice going up to
+        //   double is generally better than giving up and scheduling in a sub-optimal AZ.
+        score >= 2 * Self::UTILIZATION_FULL
     }
 
     pub fn adjust_shard_count_max(&mut self, shard_count: u32) {

From 43846b72fa488b96d37bbc40d691bbf4e4f8fdd3 Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki@neon.tech>
Date: Sun, 8 Sep 2024 21:40:30 +0300
Subject: [PATCH 126/159] Remove unused "neon_local init --pg-version" arg

It has been unused since commit 8712e1899e, when it stopped creating
the initial timeline.
---
 control_plane/src/bin/neon_local.rs | 1 -
 1 file changed, 1 deletion(-)

diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs
index 1d66532d49..af6545f8d2 100644
--- a/control_plane/src/bin/neon_local.rs
+++ b/control_plane/src/bin/neon_local.rs
@@ -1570,7 +1570,6 @@ fn cli() -> Command {
                         .value_parser(value_parser!(PathBuf))
                         .value_name("config")
                 )
-                .arg(pg_version_arg.clone())
                 .arg(force_arg)
         )
         .subcommand(

From aeca15008c15b211d74536439ff701e533a412ef Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki@neon.tech>
Date: Wed, 11 Sep 2024 10:55:41 +0300
Subject: [PATCH 127/159] Remove obsolete and misleading comment

The tenant ID was not actually generated here but in NeonEnvBuilder.
And the "neon_local init" command hasn't been able to generate the
initial tenant since 8712e1899e anyway.
---
 test_runner/fixtures/neon_fixtures.py | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index 60887b9aed..22472559f4 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -1073,9 +1073,6 @@ class NeonEnv:
         self.pg_distrib_dir = config.pg_distrib_dir
         self.endpoint_counter = 0
         self.storage_controller_config = config.storage_controller_config
-
-        # generate initial tenant ID here instead of letting 'neon init' generate it,
-        # so that we don't need to dig it out of the config file afterwards.
         self.initial_tenant = config.initial_tenant
         self.initial_timeline = config.initial_timeline
 

From 0a363c3dce2fbd76f6483f02c2273e3f7b205b3e Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki@neon.tech>
Date: Wed, 11 Sep 2024 12:38:31 +0300
Subject: [PATCH 128/159] Add --timeline-id option to "neon_local timeline
 branch" command

Makes it consistent with the "timeline create" and "timeline import"
commands, which allowed you to pass the timeline id as argument. This
also makes it unnecessary to parse the timeline ID from the output in
the python function that calls it.
---
 control_plane/src/bin/neon_local.rs   |  4 ++-
 test_runner/fixtures/neon_fixtures.py | 43 ++++++++++-----------------
 2 files changed, 18 insertions(+), 29 deletions(-)

diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs
index af6545f8d2..144cd647c9 100644
--- a/control_plane/src/bin/neon_local.rs
+++ b/control_plane/src/bin/neon_local.rs
@@ -640,6 +640,8 @@ async fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::Local
         }
         Some(("branch", branch_match)) => {
             let tenant_id = get_tenant_id(branch_match, env)?;
+            let new_timeline_id =
+                parse_timeline_id(branch_match)?.unwrap_or(TimelineId::generate());
             let new_branch_name = branch_match
                 .get_one::<String>("branch-name")
                 .ok_or_else(|| anyhow!("No branch name provided"))?;
@@ -658,7 +660,6 @@ async fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::Local
                 .map(|lsn_str| Lsn::from_str(lsn_str))
                 .transpose()
                 .context("Failed to parse ancestor start Lsn from the request")?;
-            let new_timeline_id = TimelineId::generate();
             let storage_controller = StorageController::from_env(env);
             let create_req = TimelineCreateRequest {
                 new_timeline_id,
@@ -1582,6 +1583,7 @@ fn cli() -> Command {
             .subcommand(Command::new("branch")
                 .about("Create a new timeline, using another timeline as a base, copying its data")
                 .arg(tenant_id_arg.clone())
+                .arg(timeline_id_arg.clone())
                 .arg(branch_name_arg.clone())
                 .arg(Arg::new("ancestor-branch-name").long("ancestor-branch-name")
                     .help("Use last Lsn of another timeline (and its data) as base when creating the new timeline. The timeline gets resolved by its branch name.").required(false))
diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index 22472559f4..1c33d14154 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -1518,14 +1518,6 @@ class PageserverPort:
     http: int
 
 
-CREATE_TIMELINE_ID_EXTRACTOR: re.Pattern = re.compile(  # type: ignore[type-arg]
-    r"^Created timeline '(?P<timeline_id>[^']+)'", re.MULTILINE
-)
-TIMELINE_DATA_EXTRACTOR: re.Pattern = re.compile(  # type: ignore[type-arg]
-    r"\s?(?P<branch_name>[^\s]+)\s\[(?P<timeline_id>[^\]]+)\]", re.MULTILINE
-)
-
-
 class AbstractNeonCli(abc.ABC):
     """
     A typed wrapper around an arbitrary Neon CLI tool.
@@ -1754,6 +1746,9 @@ class NeonCli(AbstractNeonCli):
         tenant_id: Optional[TenantId] = None,
         timeline_id: Optional[TimelineId] = None,
     ) -> TimelineId:
+        if timeline_id is None:
+            timeline_id = TimelineId.generate()
+
         cmd = [
             "timeline",
             "create",
@@ -1761,23 +1756,16 @@ class NeonCli(AbstractNeonCli):
             new_branch_name,
             "--tenant-id",
             str(tenant_id or self.env.initial_tenant),
+            "--timeline-id",
+            str(timeline_id),
             "--pg-version",
             self.env.pg_version,
         ]
 
-        if timeline_id is not None:
-            cmd.extend(["--timeline-id", str(timeline_id)])
-
         res = self.raw_cli(cmd)
         res.check_returncode()
 
-        matches = CREATE_TIMELINE_ID_EXTRACTOR.search(res.stdout)
-
-        created_timeline_id = None
-        if matches is not None:
-            created_timeline_id = matches.group("timeline_id")
-
-        return TimelineId(str(created_timeline_id))
+        return timeline_id
 
     def create_branch(
         self,
@@ -1785,12 +1773,17 @@ class NeonCli(AbstractNeonCli):
         ancestor_branch_name: Optional[str] = None,
         tenant_id: Optional[TenantId] = None,
         ancestor_start_lsn: Optional[Lsn] = None,
+        new_timeline_id: Optional[TimelineId] = None,
     ) -> TimelineId:
+        if new_timeline_id is None:
+            new_timeline_id = TimelineId.generate()
         cmd = [
             "timeline",
             "branch",
             "--branch-name",
             new_branch_name,
+            "--timeline-id",
+            str(new_timeline_id),
             "--tenant-id",
             str(tenant_id or self.env.initial_tenant),
         ]
@@ -1802,16 +1795,7 @@ class NeonCli(AbstractNeonCli):
         res = self.raw_cli(cmd)
         res.check_returncode()
 
-        matches = CREATE_TIMELINE_ID_EXTRACTOR.search(res.stdout)
-
-        created_timeline_id = None
-        if matches is not None:
-            created_timeline_id = matches.group("timeline_id")
-
-        if created_timeline_id is None:
-            raise Exception("could not find timeline id after `neon timeline create` invocation")
-        else:
-            return TimelineId(str(created_timeline_id))
+        return TimelineId(str(new_timeline_id))
 
     def list_timelines(self, tenant_id: Optional[TenantId] = None) -> List[Tuple[str, TimelineId]]:
         """
@@ -1820,6 +1804,9 @@ class NeonCli(AbstractNeonCli):
 
         # main [b49f7954224a0ad25cc0013ea107b54b]
         # ┣━ @0/16B5A50: test_cli_branch_list_main [20f98c79111b9015d84452258b7d5540]
+        TIMELINE_DATA_EXTRACTOR: re.Pattern = re.compile(  # type: ignore[type-arg]
+            r"\s?(?P<branch_name>[^\s]+)\s\[(?P<timeline_id>[^\]]+)\]", re.MULTILINE
+        )
         res = self.raw_cli(
             ["timeline", "list", "--tenant-id", str(tenant_id or self.env.initial_tenant)]
         )

From 8dc069037b003e63c77683670be4e965384e794b Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki@neon.tech>
Date: Wed, 11 Sep 2024 15:01:34 +0300
Subject: [PATCH 129/159] Remove NeonEnvBuilder.start() function

It feels wrong to me to start() from the builder object. Surely the
thing you start is the environment itself, not its configuration.
---
 test_runner/fixtures/neon_fixtures.py                    | 6 +-----
 test_runner/performance/test_storage_controller_scale.py | 2 +-
 test_runner/regress/test_compatibility.py                | 4 ++--
 test_runner/regress/test_sharding.py                     | 6 +++---
 4 files changed, 7 insertions(+), 11 deletions(-)

diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index 1c33d14154..ee62372871 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -553,10 +553,6 @@ class NeonEnvBuilder:
         self.env = NeonEnv(self)
         return self.env
 
-    def start(self):
-        assert self.env is not None, "environment is not already initialized, call init() first"
-        self.env.start()
-
     def init_start(
         self,
         initial_tenant_conf: Optional[Dict[str, Any]] = None,
@@ -572,7 +568,7 @@ class NeonEnvBuilder:
         Configuring pageserver with remote storage is now the default. There will be a warning if pageserver is created without one.
         """
         env = self.init_configs(default_remote_storage_if_missing=default_remote_storage_if_missing)
-        self.start()
+        env.start()
 
         # Prepare the default branch to start the postgres on later.
         # Pageserver itself does not create tenants and timelines, until started first and asked via HTTP API.
diff --git a/test_runner/performance/test_storage_controller_scale.py b/test_runner/performance/test_storage_controller_scale.py
index 297aedfbed..a186bbaceb 100644
--- a/test_runner/performance/test_storage_controller_scale.py
+++ b/test_runner/performance/test_storage_controller_scale.py
@@ -84,7 +84,7 @@ def test_storage_controller_many_tenants(
     compute_reconfigure_listener.register_on_notify(lambda body: time.sleep(0.01))
 
     env = neon_env_builder.init_configs()
-    neon_env_builder.start()
+    env.start()
 
     # We will intentionally stress reconciler concurrrency, which triggers a warning when lots
     # of shards are hitting the delayed path.
diff --git a/test_runner/regress/test_compatibility.py b/test_runner/regress/test_compatibility.py
index 467e5b1734..b559be5f18 100644
--- a/test_runner/regress/test_compatibility.py
+++ b/test_runner/regress/test_compatibility.py
@@ -178,7 +178,7 @@ def test_backward_compatibility(
         neon_env_builder.num_safekeepers = 3
         env = neon_env_builder.from_repo_dir(compatibility_snapshot_dir / "repo")
         env.pageserver.allowed_errors.append(ingest_lag_log_line)
-        neon_env_builder.start()
+        env.start()
 
         check_neon_works(
             env,
@@ -265,7 +265,7 @@ def test_forward_compatibility(
         # does not include logs from previous runs
         assert not env.pageserver.log_contains("git-env:" + prev_pageserver_version)
 
-        neon_env_builder.start()
+        env.start()
 
         # ensure the specified pageserver is running
         assert env.pageserver.log_contains("git-env:" + prev_pageserver_version)
diff --git a/test_runner/regress/test_sharding.py b/test_runner/regress/test_sharding.py
index bfd82242e9..4a84dca399 100644
--- a/test_runner/regress/test_sharding.py
+++ b/test_runner/regress/test_sharding.py
@@ -374,7 +374,7 @@ def test_sharding_split_smoke(
     non_default_tenant_config = {"gc_horizon": 77 * 1024 * 1024}
 
     env = neon_env_builder.init_configs(True)
-    neon_env_builder.start()
+    env.start()
     tenant_id = TenantId.generate()
     timeline_id = TimelineId.generate()
     env.neon_cli.create_tenant(
@@ -1436,7 +1436,7 @@ def test_sharding_unlogged_relation(neon_env_builder: NeonEnvBuilder):
 
     neon_env_builder.num_pageservers = 2
     env = neon_env_builder.init_configs()
-    neon_env_builder.start()
+    env.start()
 
     tenant_id = TenantId.generate()
     timeline_id = TimelineId.generate()
@@ -1475,7 +1475,7 @@ def test_top_tenants(neon_env_builder: NeonEnvBuilder):
     """
 
     env = neon_env_builder.init_configs()
-    neon_env_builder.start()
+    env.start()
 
     tenants = []
     n_tenants = 8

From 9e3ead3689b012d344afbac4fcbf000372bb9969 Mon Sep 17 00:00:00 2001
From: Tristan Partin <tristan@neon.tech>
Date: Wed, 11 Sep 2024 18:43:42 +0100
Subject: [PATCH 130/159] Collect the last of on-demand WAL download in
 CreateReplicationSlot reverts

Signed-off-by: Tristan Partin <tristan@neon.tech>
---
 vendor/postgres-v15   | 2 +-
 vendor/postgres-v16   | 2 +-
 vendor/revisions.json | 4 ++--
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/vendor/postgres-v15 b/vendor/postgres-v15
index 49d5e576a5..6f6d77fb59 160000
--- a/vendor/postgres-v15
+++ b/vendor/postgres-v15
@@ -1 +1 @@
-Subproject commit 49d5e576a56e4cc59cd6a6a0791b2324b9fa675e
+Subproject commit 6f6d77fb5960602fcd3fd130aca9f99ecb1619c9
diff --git a/vendor/postgres-v16 b/vendor/postgres-v16
index 6e9a4ff624..0baa7346df 160000
--- a/vendor/postgres-v16
+++ b/vendor/postgres-v16
@@ -1 +1 @@
-Subproject commit 6e9a4ff6249ac02b8175054b7b3f7dfb198be48b
+Subproject commit 0baa7346dfd42d61912eeca554c9bb0a190f0a1e
diff --git a/vendor/revisions.json b/vendor/revisions.json
index e52576e61f..3a65a507f3 100644
--- a/vendor/revisions.json
+++ b/vendor/revisions.json
@@ -1,11 +1,11 @@
 {
   "v16": [
     "16.4",
-    "6e9a4ff6249ac02b8175054b7b3f7dfb198be48b"
+    "0baa7346dfd42d61912eeca554c9bb0a190f0a1e"
   ],
   "v15": [
     "15.8",
-    "49d5e576a56e4cc59cd6a6a0791b2324b9fa675e"
+    "6f6d77fb5960602fcd3fd130aca9f99ecb1619c9"
   ],
   "v14": [
     "14.13",

From fcab61bdcd9e30f2e2f6ce5be59e34bb98068f2f Mon Sep 17 00:00:00 2001
From: Stefan Radig <stefan@neon.tech>
Date: Thu, 12 Sep 2024 15:55:12 +0100
Subject: [PATCH 131/159] Prototype implementation for private access poc
 (#8976)

## Problem
For the Private Access POC we want users to be able to disable access
from the public proxy. To limit the number of changes this can be done
by configuring an IP allowlist [ "255.255.255.255" ]. For the Private
Access proxy a new commandline flag allows to disable IP allowlist
completely.

See
https://www.notion.so/neondatabase/Neon-Private-Access-POC-Proposal-8f707754e1ab4190ad5709da7832f020?d=887495c15e884aa4973f973a8a0a582a#7ac6ec249b524a74adbeddc4b84b8f5f
for details about the POC.,

## Summary of changes
- Adding the commandline flag is_private_access_proxy=true will disable
IP allowlist
---
 proxy/src/auth/backend.rs       |  5 ++++-
 proxy/src/auth/credentials.rs   | 13 +++++++++++++
 proxy/src/bin/local_proxy.rs    |  1 +
 proxy/src/bin/proxy.rs          |  5 +++++
 proxy/src/config.rs             |  1 +
 proxy/src/serverless/backend.rs |  4 +++-
 6 files changed, 27 insertions(+), 2 deletions(-)

diff --git a/proxy/src/auth/backend.rs b/proxy/src/auth/backend.rs
index 1d28c6df31..5561c9c56d 100644
--- a/proxy/src/auth/backend.rs
+++ b/proxy/src/auth/backend.rs
@@ -311,7 +311,9 @@ async fn auth_quirks(
     let (allowed_ips, maybe_secret) = api.get_allowed_ips_and_secret(ctx, &info).await?;
 
     // check allowed list
-    if !check_peer_addr_is_in_list(&ctx.peer_addr(), &allowed_ips) {
+    if config.ip_allowlist_check_enabled
+        && !check_peer_addr_is_in_list(&ctx.peer_addr(), &allowed_ips)
+    {
         return Err(auth::AuthError::ip_address_not_allowed(ctx.peer_addr()));
     }
 
@@ -603,6 +605,7 @@ mod tests {
         rate_limiter_enabled: true,
         rate_limiter: AuthRateLimiter::new(&RateBucketInfo::DEFAULT_AUTH_SET),
         rate_limit_ip_subnet: 64,
+        ip_allowlist_check_enabled: true,
     });
 
     async fn read_message(r: &mut (impl AsyncRead + Unpin), b: &mut BytesMut) -> PgMessage {
diff --git a/proxy/src/auth/credentials.rs b/proxy/src/auth/credentials.rs
index 0e91ae570a..cba8601d14 100644
--- a/proxy/src/auth/credentials.rs
+++ b/proxy/src/auth/credentials.rs
@@ -538,4 +538,17 @@ mod tests {
         ));
         Ok(())
     }
+
+    #[test]
+    fn test_connection_blocker() {
+        fn check(v: serde_json::Value) -> bool {
+            let peer_addr = IpAddr::from([127, 0, 0, 1]);
+            let ip_list: Vec<IpPattern> = serde_json::from_value(v).unwrap();
+            check_peer_addr_is_in_list(&peer_addr, &ip_list)
+        }
+
+        assert!(check(json!([])));
+        assert!(check(json!(["127.0.0.1"])));
+        assert!(!check(json!(["255.255.255.255"])));
+    }
 }
diff --git a/proxy/src/bin/local_proxy.rs b/proxy/src/bin/local_proxy.rs
index 08effeff99..6eba71df1b 100644
--- a/proxy/src/bin/local_proxy.rs
+++ b/proxy/src/bin/local_proxy.rs
@@ -224,6 +224,7 @@ fn build_config(args: &LocalProxyCliArgs) -> anyhow::Result<&'static ProxyConfig
             rate_limiter_enabled: false,
             rate_limiter: BucketRateLimiter::new(vec![]),
             rate_limit_ip_subnet: 64,
+            ip_allowlist_check_enabled: true,
         },
         require_client_ip: false,
         handshake_timeout: Duration::from_secs(10),
diff --git a/proxy/src/bin/proxy.rs b/proxy/src/bin/proxy.rs
index 7706a1f7cd..ca9aeb04d8 100644
--- a/proxy/src/bin/proxy.rs
+++ b/proxy/src/bin/proxy.rs
@@ -224,6 +224,10 @@ struct ProxyCliArgs {
     /// Whether to retry the wake_compute request
     #[clap(long, default_value = config::RetryConfig::WAKE_COMPUTE_DEFAULT_VALUES)]
     wake_compute_retry: String,
+
+    /// Configure if this is a private access proxy for the POC: In that case the proxy will ignore the IP allowlist
+    #[clap(long, default_value_t = false, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)]
+    is_private_access_proxy: bool,
 }
 
 #[derive(clap::Args, Clone, Copy, Debug)]
@@ -682,6 +686,7 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> {
         rate_limiter_enabled: args.auth_rate_limit_enabled,
         rate_limiter: AuthRateLimiter::new(args.auth_rate_limit.clone()),
         rate_limit_ip_subnet: args.auth_rate_limit_ip_subnet,
+        ip_allowlist_check_enabled: !args.is_private_access_proxy,
     };
 
     let config = Box::leak(Box::new(ProxyConfig {
diff --git a/proxy/src/config.rs b/proxy/src/config.rs
index d7fc6eee22..1cda6d200c 100644
--- a/proxy/src/config.rs
+++ b/proxy/src/config.rs
@@ -64,6 +64,7 @@ pub struct AuthenticationConfig {
     pub rate_limiter_enabled: bool,
     pub rate_limiter: AuthRateLimiter,
     pub rate_limit_ip_subnet: u8,
+    pub ip_allowlist_check_enabled: bool,
 }
 
 impl TlsConfig {
diff --git a/proxy/src/serverless/backend.rs b/proxy/src/serverless/backend.rs
index f24e0478be..d163878528 100644
--- a/proxy/src/serverless/backend.rs
+++ b/proxy/src/serverless/backend.rs
@@ -50,7 +50,9 @@ impl PoolingBackend {
             .as_ref()
             .map(|()| user_info.clone());
         let (allowed_ips, maybe_secret) = backend.get_allowed_ips_and_secret(ctx).await?;
-        if !check_peer_addr_is_in_list(&ctx.peer_addr(), &allowed_ips) {
+        if config.ip_allowlist_check_enabled
+            && !check_peer_addr_is_in_list(&ctx.peer_addr(), &allowed_ips)
+        {
             return Err(AuthError::ip_address_not_allowed(ctx.peer_addr()));
         }
         if !self

From 78938d1b591b33d23495a0edb8b123cc5cac6a27 Mon Sep 17 00:00:00 2001
From: Matthias van de Meent <matthias@neon.tech>
Date: Thu, 12 Sep 2024 23:18:41 +0100
Subject: [PATCH 132/159] [compute/postgres] feature: PostgreSQL 17 (#8573)

This adds preliminary PG17 support to Neon, based on RC1 / 2024-09-04
https://github.com/postgres/postgres/commit/07b828e9d4aa916f1763774787440d914eea69c4

NOTICE: The data produced by the included version of the PostgreSQL fork
may not be compatible with the future full release of PostgreSQL 17 due to
expected or unexpected future changes in magic numbers and internals.
DO NOT EXPECT DATA IN V17-TENANTS TO BE COMPATIBLE WITH THE 17.0
RELEASE!

Co-authored-by: Anastasia Lubennikova <anastasia@neon.tech>
Co-authored-by: Alexander Bayandin <alexander@neon.tech>
Co-authored-by: Konstantin Knizhnik <knizhnik@neon.tech>
Co-authored-by: Heikki Linnakangas <heikki@neon.tech>
---
 .github/workflows/_build-and-test-locally.yml |   19 +-
 .github/workflows/build_and_test.yml          |   27 +-
 .github/workflows/neon_extra_builds.yml       |   15 +
 .gitmodules                                   |    4 +
 Dockerfile                                    |   12 +-
 Dockerfile.compute-node                       |  289 +++-
 Makefile                                      |   56 +-
 compute_tools/src/compute.rs                  |   31 +-
 compute_tools/src/extension_server.rs         |    1 +
 control_plane/src/local_env.rs                |    2 +-
 control_plane/src/storage_controller.rs       |   40 +-
 libs/pageserver_api/src/key.rs                |   39 +-
 libs/postgres_ffi/build.rs                    |    2 +-
 libs/postgres_ffi/src/lib.rs                  |    5 +
 libs/postgres_ffi/src/pg_constants.rs         |   39 +-
 libs/postgres_ffi/src/pg_constants_v14.rs     |   27 +
 libs/postgres_ffi/src/pg_constants_v15.rs     |    2 +
 libs/postgres_ffi/src/pg_constants_v16.rs     |    2 +
 libs/postgres_ffi/src/pg_constants_v17.rs     |   55 +
 libs/postgres_ffi/wal_craft/src/lib.rs        |    2 +-
 libs/walproposer/build.rs                     |    9 +-
 pageserver/ctl/src/layer_map_analyzer.rs      |   16 +-
 pageserver/src/basebackup.rs                  |   16 +-
 pageserver/src/config.rs                      |    2 +-
 pageserver/src/import_datadir.rs              |    6 +-
 pageserver/src/pgdatadir_mapping.rs           |  111 +-
 pageserver/src/walingest.rs                   |  191 ++-
 pageserver/src/walrecord.rs                   |   73 +-
 pgxn/neon/bitmap.h                            |   12 +
 pgxn/neon/file_cache.c                        |  504 ++++--
 pgxn/neon/libpagestore.c                      |    4 +
 pgxn/neon/neon_pgversioncompat.h              |   14 +-
 pgxn/neon/pagestore_client.h                  |   54 +-
 pgxn/neon/pagestore_smgr.c                    | 1360 ++++++++++++-----
 pgxn/neon/walproposer_pg.c                    |   39 +-
 pgxn/neon_rmgr/neon_rmgr_decode.c             |  399 ++++-
 pgxn/neon_walredo/inmem_smgr.c                |   79 +-
 pgxn/neon_walredo/inmem_smgr.h                |    2 +-
 pgxn/neon_walredo/walredoproc.c               |   14 +-
 test_runner/fixtures/common_types.py          |    2 +-
 test_runner/fixtures/neon_fixtures.py         |    9 +-
 test_runner/fixtures/pg_version.py            |    1 +
 .../5670669815/v17/ext_index.json             |    7 +
 test_runner/regress/test_compatibility.py     |    8 +-
 .../regress/test_download_extensions.py       |    2 +
 test_runner/regress/test_postgres_version.py  |   17 +-
 .../regress/test_timeline_detach_ancestor.py  |    3 +
 test_runner/regress/test_twophase.py          |   70 +-
 vendor/postgres-v17                           |    1 +
 49 files changed, 2907 insertions(+), 787 deletions(-)
 create mode 100644 libs/postgres_ffi/src/pg_constants_v17.rs
 create mode 100644 pgxn/neon/bitmap.h
 create mode 100644 test_runner/regress/data/extension_test/5670669815/v17/ext_index.json
 create mode 160000 vendor/postgres-v17

diff --git a/.github/workflows/_build-and-test-locally.yml b/.github/workflows/_build-and-test-locally.yml
index e18e6a1201..67152b6991 100644
--- a/.github/workflows/_build-and-test-locally.yml
+++ b/.github/workflows/_build-and-test-locally.yml
@@ -62,7 +62,7 @@ jobs:
           #
           git config --global --add safe.directory ${{ github.workspace }}
           git config --global --add safe.directory ${GITHUB_WORKSPACE}
-          for r in 14 15 16; do
+          for r in 14 15 16 17; do
             git config --global --add safe.directory "${{ github.workspace }}/vendor/postgres-v$r"
             git config --global --add safe.directory "${GITHUB_WORKSPACE}/vendor/postgres-v$r"
           done
@@ -83,6 +83,10 @@ jobs:
         id: pg_v16_rev
         run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v16) >> $GITHUB_OUTPUT
 
+      - name: Set pg 17 revision for caching
+        id: pg_v17_rev
+        run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v17) >> $GITHUB_OUTPUT
+
       # Set some environment variables used by all the steps.
       #
       # CARGO_FLAGS is extra options to pass to "cargo build", "cargo test" etc.
@@ -136,6 +140,13 @@ jobs:
           path: pg_install/v16
           key: v1-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-pg-${{ steps.pg_v16_rev.outputs.pg_rev }}-${{ hashFiles('Makefile', 'Dockerfile.build-tools') }}
 
+      - name: Cache postgres v17 build
+        id: cache_pg_17
+        uses: actions/cache@v4
+        with:
+          path: pg_install/v17
+          key: v1-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-pg-${{ steps.pg_v17_rev.outputs.pg_rev }}-${{ hashFiles('Makefile', 'Dockerfile.build-tools') }}
+
       - name: Build postgres v14
         if: steps.cache_pg_14.outputs.cache-hit != 'true'
         run: mold -run make postgres-v14 -j$(nproc)
@@ -148,6 +159,10 @@ jobs:
         if: steps.cache_pg_16.outputs.cache-hit != 'true'
         run: mold -run make postgres-v16 -j$(nproc)
 
+      - name: Build postgres v17
+        if: steps.cache_pg_17.outputs.cache-hit != 'true'
+        run: mold -run make postgres-v17 -j$(nproc)
+
       - name: Build neon extensions
         run: mold -run make neon-pg-ext -j$(nproc)
 
@@ -210,7 +225,7 @@ jobs:
         run: |
           PQ_LIB_DIR=$(pwd)/pg_install/v16/lib
           export PQ_LIB_DIR
-          LD_LIBRARY_PATH=$(pwd)/pg_install/v16/lib
+          LD_LIBRARY_PATH=$(pwd)/pg_install/v17/lib
           export LD_LIBRARY_PATH
 
           #nextest does not yet support running doctests
diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml
index 4bb9e5cb66..7c06fd9ab8 100644
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -211,7 +211,7 @@ jobs:
       build-tag: ${{ needs.tag.outputs.build-tag }}
       build-type: ${{ matrix.build-type }}
       # Run tests on all Postgres versions in release builds and only on the latest version in debug builds
-      pg-versions: ${{ matrix.build-type == 'release' && '["v14", "v15", "v16"]' || '["v16"]' }}
+      pg-versions: ${{ matrix.build-type == 'release' && '["v14", "v15", "v16", "v17"]' || '["v17"]' }}
     secrets: inherit
 
   # Keep `benchmarks` job outside of `build-and-test-locally` workflow to make job failures non-blocking
@@ -548,7 +548,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        version: [ v14, v15, v16 ]
+        version: [ v14, v15, v16, v17 ]
         arch: [ x64, arm64 ]
 
     runs-on: ${{ fromJson(format('["self-hosted", "{0}"]', matrix.arch == 'arm64' && 'large-arm64' || 'large')) }}
@@ -627,7 +627,7 @@ jobs:
 
       - name: Build compute-tools image
         # compute-tools are Postgres independent, so build it only once
-        if: matrix.version == 'v16'
+        if: matrix.version == 'v17'
         uses: docker/build-push-action@v6
         with:
           target: compute-tools-image
@@ -649,7 +649,7 @@ jobs:
 
     strategy:
       matrix:
-        version: [ v14, v15, v16 ]
+        version: [ v14, v15, v16, v17 ]
 
     steps:
       - uses: docker/login-action@v3
@@ -671,7 +671,7 @@ jobs:
                                              neondatabase/neon-test-extensions-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}-arm64
 
       - name: Create multi-arch compute-tools image
-        if: matrix.version == 'v16'
+        if: matrix.version == 'v17'
         run: |
           docker buildx imagetools create -t neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }} \
                                              neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }}-x64 \
@@ -689,7 +689,7 @@ jobs:
                                                                                 neondatabase/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}
 
       - name: Push multi-arch compute-tools image to ECR
-        if: matrix.version == 'v16'
+        if: matrix.version == 'v17'
         run: |
           docker buildx imagetools create -t 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{ needs.tag.outputs.build-tag }} \
                                                                                 neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }}
@@ -700,7 +700,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        version: [ v14, v15, v16 ]
+        version: [ v14, v15, v16, v17 ]
     env:
       VM_BUILDER_VERSION: v0.29.3
 
@@ -798,7 +798,7 @@ jobs:
     runs-on: ubuntu-22.04
 
     env:
-      VERSIONS: v14 v15 v16
+      VERSIONS: v14 v15 v16 v17
 
     steps:
       - uses: docker/login-action@v3
@@ -839,7 +839,7 @@ jobs:
             done
           done
           docker buildx imagetools create -t neondatabase/neon-test-extensions-v16:latest \
-                                             neondatabase/neon-test-extensions-v16:${{ needs.tag.outputs.build-tag }}
+                                              neondatabase/neon-test-extensions-v16:${{ needs.tag.outputs.build-tag }}
 
       - name: Login to prod ECR
         uses: docker/login-action@v3
@@ -852,7 +852,7 @@ jobs:
       - name: Copy all images to prod ECR
         if: github.ref_name == 'release'|| github.ref_name == 'release-proxy'
         run: |
-          for image in neon compute-tools {vm-,}compute-node-{v14,v15,v16}; do
+          for image in neon compute-tools {vm-,}compute-node-{v14,v15,v16,v17}; do
             docker buildx imagetools create -t 093970136003.dkr.ecr.eu-central-1.amazonaws.com/${image}:${{ needs.tag.outputs.build-tag }} \
                                                369495373322.dkr.ecr.eu-central-1.amazonaws.com/${image}:${{ needs.tag.outputs.build-tag }}
           done
@@ -864,7 +864,7 @@ jobs:
     with:
       client_id: ${{ vars.AZURE_DEV_CLIENT_ID }}
       image_tag: ${{ needs.tag.outputs.build-tag }}
-      images: neon compute-tools vm-compute-node-v14 vm-compute-node-v15 vm-compute-node-v16 compute-node-v14 compute-node-v15 compute-node-v16
+      images: neon compute-tools vm-compute-node-v14 vm-compute-node-v15 vm-compute-node-v16 vm-compute-node-v17 compute-node-v14 compute-node-v15 compute-node-v16 compute-node-v17
       registry_name: ${{ vars.AZURE_DEV_REGISTRY_NAME }}
       subscription_id: ${{ vars.AZURE_DEV_SUBSCRIPTION_ID }}
       tenant_id: ${{ vars.AZURE_TENANT_ID }}
@@ -876,7 +876,7 @@ jobs:
     with:
       client_id: ${{ vars.AZURE_PROD_CLIENT_ID }}
       image_tag: ${{ needs.tag.outputs.build-tag }}
-      images: neon compute-tools vm-compute-node-v14 vm-compute-node-v15 vm-compute-node-v16 compute-node-v14 compute-node-v15 compute-node-v16
+      images: neon compute-tools vm-compute-node-v14 vm-compute-node-v15 vm-compute-node-v16 vm-compute-node-v17 compute-node-v14 compute-node-v15 compute-node-v16 compute-node-v17
       registry_name: ${{ vars.AZURE_PROD_REGISTRY_NAME }}
       subscription_id: ${{ vars.AZURE_PROD_SUBSCRIPTION_ID }}
       tenant_id: ${{ vars.AZURE_TENANT_ID }}
@@ -971,7 +971,7 @@ jobs:
           #
           git config --global --add safe.directory ${{ github.workspace }}
           git config --global --add safe.directory ${GITHUB_WORKSPACE}
-          for r in 14 15 16; do
+          for r in 14 15 16 17; do
             git config --global --add safe.directory "${{ github.workspace }}/vendor/postgres-v$r"
             git config --global --add safe.directory "${GITHUB_WORKSPACE}/vendor/postgres-v$r"
           done
@@ -1117,6 +1117,7 @@ jobs:
 
               files_to_promote+=("s3://${BUCKET}/${s3_key}")
 
+              # TODO Add v17
               for pg_version in v14 v15 v16; do
                 # We run less tests for debug builds, so we don't need to promote them
                 if [ "${build_type}" == "debug" ] && { [ "${arch}" == "ARM64" ] || [ "${pg_version}" != "v16" ] ; }; then
diff --git a/.github/workflows/neon_extra_builds.yml b/.github/workflows/neon_extra_builds.yml
index 7fecdbde8c..41c9f5dee5 100644
--- a/.github/workflows/neon_extra_builds.yml
+++ b/.github/workflows/neon_extra_builds.yml
@@ -72,6 +72,10 @@ jobs:
         id: pg_v16_rev
         run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v16) >> $GITHUB_OUTPUT
 
+      - name: Set pg 17 revision for caching
+        id: pg_v17_rev
+        run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v17) >> $GITHUB_OUTPUT
+
       - name: Cache postgres v14 build
         id: cache_pg_14
         uses: actions/cache@v4
@@ -93,6 +97,13 @@ jobs:
           path: pg_install/v16
           key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v16_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
 
+      - name: Cache postgres v17 build
+        id: cache_pg_17
+        uses: actions/cache@v4
+        with:
+          path: pg_install/v17
+          key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v17_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
+
       - name: Set extra env for macOS
         run: |
           echo 'LDFLAGS=-L/usr/local/opt/openssl@3/lib' >> $GITHUB_ENV
@@ -120,6 +131,10 @@ jobs:
         if: steps.cache_pg_16.outputs.cache-hit != 'true'
         run: make postgres-v16 -j$(sysctl -n hw.ncpu)
 
+      - name: Build postgres v17
+        if: steps.cache_pg_17.outputs.cache-hit != 'true'
+        run: make postgres-v17 -j$(sysctl -n hw.ncpu)
+
       - name: Build neon extensions
         run: make neon-pg-ext -j$(sysctl -n hw.ncpu)
 
diff --git a/.gitmodules b/.gitmodules
index 1d925674a1..d1330bf28c 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -10,3 +10,7 @@
 	path = vendor/postgres-v16
 	url = https://github.com/neondatabase/postgres.git
 	branch = REL_16_STABLE_neon
+[submodule "vendor/postgres-v17"]
+	path = vendor/postgres-v17
+	url = https://github.com/neondatabase/postgres.git
+	branch = REL_17_STABLE_neon
diff --git a/Dockerfile b/Dockerfile
index 1efedfa9bc..bdb76a4f4f 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -5,6 +5,8 @@
 ARG REPOSITORY=neondatabase
 ARG IMAGE=build-tools
 ARG TAG=pinned
+ARG DEFAULT_PG_VERSION=17
+ARG STABLE_PG_VERSION=16
 
 # Build Postgres
 FROM $REPOSITORY/$IMAGE:$TAG AS pg-build
@@ -13,6 +15,7 @@ WORKDIR /home/nonroot
 COPY --chown=nonroot vendor/postgres-v14 vendor/postgres-v14
 COPY --chown=nonroot vendor/postgres-v15 vendor/postgres-v15
 COPY --chown=nonroot vendor/postgres-v16 vendor/postgres-v16
+COPY --chown=nonroot vendor/postgres-v17 vendor/postgres-v17
 COPY --chown=nonroot pgxn pgxn
 COPY --chown=nonroot Makefile Makefile
 COPY --chown=nonroot scripts/ninstall.sh scripts/ninstall.sh
@@ -28,16 +31,19 @@ FROM $REPOSITORY/$IMAGE:$TAG AS build
 WORKDIR /home/nonroot
 ARG GIT_VERSION=local
 ARG BUILD_TAG
+ARG STABLE_PG_VERSION
 
 COPY --from=pg-build /home/nonroot/pg_install/v14/include/postgresql/server pg_install/v14/include/postgresql/server
 COPY --from=pg-build /home/nonroot/pg_install/v15/include/postgresql/server pg_install/v15/include/postgresql/server
 COPY --from=pg-build /home/nonroot/pg_install/v16/include/postgresql/server pg_install/v16/include/postgresql/server
+COPY --from=pg-build /home/nonroot/pg_install/v17/include/postgresql/server pg_install/v17/include/postgresql/server
 COPY --from=pg-build /home/nonroot/pg_install/v16/lib                       pg_install/v16/lib
+COPY --from=pg-build /home/nonroot/pg_install/v17/lib                       pg_install/v17/lib
 COPY --chown=nonroot . .
 
 ARG ADDITIONAL_RUSTFLAGS
 RUN set -e \
-    && PQ_LIB_DIR=$(pwd)/pg_install/v16/lib RUSTFLAGS="-Clinker=clang -Clink-arg=-fuse-ld=mold -Clink-arg=-Wl,--no-rosegment ${ADDITIONAL_RUSTFLAGS}" cargo build \
+    && PQ_LIB_DIR=$(pwd)/pg_install/v${STABLE_PG_VERSION}/lib RUSTFLAGS="-Clinker=clang -Clink-arg=-fuse-ld=mold -Clink-arg=-Wl,--no-rosegment ${ADDITIONAL_RUSTFLAGS}" cargo build \
       --bin pg_sni_router  \
       --bin pageserver  \
       --bin pagectl  \
@@ -52,6 +58,7 @@ RUN set -e \
 # Build final image
 #
 FROM debian:bullseye-slim
+ARG DEFAULT_PG_VERSION
 WORKDIR /data
 
 RUN set -e \
@@ -77,6 +84,7 @@ COPY --from=build --chown=neon:neon /home/nonroot/target/release/storage_scrubbe
 COPY --from=pg-build /home/nonroot/pg_install/v14 /usr/local/v14/
 COPY --from=pg-build /home/nonroot/pg_install/v15 /usr/local/v15/
 COPY --from=pg-build /home/nonroot/pg_install/v16 /usr/local/v16/
+COPY --from=pg-build /home/nonroot/pg_install/v17 /usr/local/v17/
 COPY --from=pg-build /home/nonroot/postgres_install.tar.gz /data/
 
 # By default, pageserver uses `.neon/` working directory in WORKDIR, so create one and fill it with the dummy config.
@@ -93,7 +101,7 @@ RUN mkdir -p /data/.neon/ && \
 
 # When running a binary that links with libpq, default to using our most recent postgres version.  Binaries
 # that want a particular postgres version will select it explicitly: this is just a default.
-ENV LD_LIBRARY_PATH=/usr/local/v16/lib
+ENV LD_LIBRARY_PATH=/usr/local/v${DEFAULT_PG_VERSION}/lib
 
 
 VOLUME ["/data"]
diff --git a/Dockerfile.compute-node b/Dockerfile.compute-node
index b6c89cd71f..fe902eb978 100644
--- a/Dockerfile.compute-node
+++ b/Dockerfile.compute-node
@@ -79,15 +79,23 @@ RUN cd postgres && \
 #
 #########################################################################################
 FROM build-deps AS postgis-build
+ARG PG_VERSION
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
-RUN apt update && \
+RUN case "${PG_VERSION}" in "v17") \
+    echo "v17 extensions are not supported yet. Quit" && exit 0;; \
+    esac && \
+    apt update && \
     apt install -y cmake gdal-bin libboost-dev libboost-thread-dev libboost-filesystem-dev \
     libboost-system-dev libboost-iostreams-dev libboost-program-options-dev libboost-timer-dev \
     libcgal-dev libgdal-dev libgmp-dev libmpfr-dev libopenscenegraph-dev libprotobuf-c-dev \
     protobuf-c-compiler xsltproc
 
 # SFCGAL > 1.3 requires CGAL > 5.2, Bullseye's libcgal-dev is 5.2
-RUN wget https://gitlab.com/Oslandia/SFCGAL/-/archive/v1.3.10/SFCGAL-v1.3.10.tar.gz -O SFCGAL.tar.gz && \
+RUN case "${PG_VERSION}" in "v17") \
+        mkdir -p /sfcgal && \
+    echo "v17 extensions are not supported yet. Quit" && exit 0;; \
+    esac && \
+    wget https://gitlab.com/Oslandia/SFCGAL/-/archive/v1.3.10/SFCGAL-v1.3.10.tar.gz -O SFCGAL.tar.gz && \
     echo "4e39b3b2adada6254a7bdba6d297bb28e1a9835a9f879b74f37e2dab70203232 SFCGAL.tar.gz" | sha256sum --check && \
     mkdir sfcgal-src && cd sfcgal-src && tar xzf ../SFCGAL.tar.gz --strip-components=1 -C . && \
     cmake -DCMAKE_BUILD_TYPE=Release . && make -j $(getconf _NPROCESSORS_ONLN) && \
@@ -96,7 +104,10 @@ RUN wget https://gitlab.com/Oslandia/SFCGAL/-/archive/v1.3.10/SFCGAL-v1.3.10.tar
 
 ENV PATH="/usr/local/pgsql/bin:$PATH"
 
-RUN wget https://download.osgeo.org/postgis/source/postgis-3.3.3.tar.gz -O postgis.tar.gz && \
+RUN case "${PG_VERSION}" in "v17") \
+    echo "v17 extensions are not supported yet. Quit" && exit 0;; \
+    esac && \
+    wget https://download.osgeo.org/postgis/source/postgis-3.3.3.tar.gz -O postgis.tar.gz && \
     echo "74eb356e3f85f14233791013360881b6748f78081cc688ff9d6f0f673a762d13 postgis.tar.gz" | sha256sum --check && \
     mkdir postgis-src && cd postgis-src && tar xzf ../postgis.tar.gz --strip-components=1 -C . && \
     find /usr/local/pgsql -type f | sed 's|^/usr/local/pgsql/||' > /before.txt &&\
@@ -122,7 +133,10 @@ RUN wget https://download.osgeo.org/postgis/source/postgis-3.3.3.tar.gz -O postg
     cp /usr/local/pgsql/share/extension/address_standardizer.control /extensions/postgis && \
     cp /usr/local/pgsql/share/extension/address_standardizer_data_us.control /extensions/postgis
 
-RUN wget https://github.com/pgRouting/pgrouting/archive/v3.4.2.tar.gz -O pgrouting.tar.gz && \
+RUN case "${PG_VERSION}" in "v17") \
+    echo "v17 extensions are not supported yet. Quit" && exit 0;; \
+    esac && \
+    wget https://github.com/pgRouting/pgrouting/archive/v3.4.2.tar.gz -O pgrouting.tar.gz && \
     echo "cac297c07d34460887c4f3b522b35c470138760fe358e351ad1db4edb6ee306e pgrouting.tar.gz" | sha256sum --check && \
     mkdir pgrouting-src && cd pgrouting-src && tar xzf ../pgrouting.tar.gz --strip-components=1 -C . && \
     mkdir build && cd build && \
@@ -142,12 +156,19 @@ RUN wget https://github.com/pgRouting/pgrouting/archive/v3.4.2.tar.gz -O pgrouti
 #
 #########################################################################################
 FROM build-deps AS plv8-build
+ARG PG_VERSION
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
 
-RUN apt update && \
+RUN case "${PG_VERSION}" in "v17") \
+    echo "v17 extensions are not supported yet. Quit" && exit 0;; \
+    esac && \
+    apt update && \
     apt install -y ninja-build python3-dev libncurses5 binutils clang
 
-RUN wget https://github.com/plv8/plv8/archive/refs/tags/v3.1.10.tar.gz -O plv8.tar.gz && \
+RUN case "${PG_VERSION}" in "v17") \
+    echo "v17 extensions are not supported yet. Quit" && exit 0;; \
+    esac && \
+    wget https://github.com/plv8/plv8/archive/refs/tags/v3.1.10.tar.gz -O plv8.tar.gz && \
     echo "7096c3290928561f0d4901b7a52794295dc47f6303102fae3f8e42dd575ad97d plv8.tar.gz" | sha256sum --check && \
     mkdir plv8-src && cd plv8-src && tar xzf ../plv8.tar.gz --strip-components=1 -C . && \
     # generate and copy upgrade scripts
@@ -172,9 +193,13 @@ RUN wget https://github.com/plv8/plv8/archive/refs/tags/v3.1.10.tar.gz -O plv8.t
 #
 #########################################################################################
 FROM build-deps AS h3-pg-build
+ARG PG_VERSION
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
 
-RUN case "$(uname -m)" in \
+RUN case "${PG_VERSION}" in "v17") \
+    echo "v17 extensions are not supported yet. Quit" && exit 0;; \
+    esac && \
+    case "$(uname -m)" in \
       "x86_64") \
         export CMAKE_CHECKSUM=739d372726cb23129d57a539ce1432453448816e345e1545f6127296926b6754 \
         ;; \
@@ -192,7 +217,11 @@ RUN case "$(uname -m)" in \
       && /tmp/cmake-install.sh --skip-license --prefix=/usr/local/ \
       && rm /tmp/cmake-install.sh
 
-RUN wget https://github.com/uber/h3/archive/refs/tags/v4.1.0.tar.gz -O h3.tar.gz && \
+RUN case "${PG_VERSION}" in "v17") \
+        mkdir -p /h3/usr/ && \
+    echo "v17 extensions are not supported yet. Quit" && exit 0;; \
+    esac && \
+    wget https://github.com/uber/h3/archive/refs/tags/v4.1.0.tar.gz -O h3.tar.gz && \
     echo "ec99f1f5974846bde64f4513cf8d2ea1b8d172d2218ab41803bf6a63532272bc h3.tar.gz" | sha256sum --check && \
     mkdir h3-src && cd h3-src && tar xzf ../h3.tar.gz --strip-components=1 -C . && \
     mkdir build && cd build && \
@@ -202,7 +231,10 @@ RUN wget https://github.com/uber/h3/archive/refs/tags/v4.1.0.tar.gz -O h3.tar.gz
     cp -R /h3/usr / && \
     rm -rf build
 
-RUN wget https://github.com/zachasme/h3-pg/archive/refs/tags/v4.1.3.tar.gz -O h3-pg.tar.gz && \
+RUN case "${PG_VERSION}" in "v17") \
+    echo "v17 extensions are not supported yet. Quit" && exit 0;; \
+    esac && \
+    wget https://github.com/zachasme/h3-pg/archive/refs/tags/v4.1.3.tar.gz -O h3-pg.tar.gz && \
     echo "5c17f09a820859ffe949f847bebf1be98511fb8f1bd86f94932512c00479e324 h3-pg.tar.gz" | sha256sum --check && \
     mkdir h3-pg-src && cd h3-pg-src && tar xzf ../h3-pg.tar.gz --strip-components=1 -C . && \
     export PATH="/usr/local/pgsql/bin:$PATH" && \
@@ -218,9 +250,13 @@ RUN wget https://github.com/zachasme/h3-pg/archive/refs/tags/v4.1.3.tar.gz -O h3
 #
 #########################################################################################
 FROM build-deps AS unit-pg-build
+ARG PG_VERSION
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
 
-RUN wget https://github.com/df7cb/postgresql-unit/archive/refs/tags/7.7.tar.gz -O postgresql-unit.tar.gz && \
+RUN case "${PG_VERSION}" in "v17") \
+    echo "v17 extensions are not supported yet. Quit" && exit 0;; \
+    esac && \
+    wget https://github.com/df7cb/postgresql-unit/archive/refs/tags/7.7.tar.gz -O postgresql-unit.tar.gz && \
     echo "411d05beeb97e5a4abf17572bfcfbb5a68d98d1018918feff995f6ee3bb03e79 postgresql-unit.tar.gz" | sha256sum --check && \
     mkdir postgresql-unit-src && cd postgresql-unit-src && tar xzf ../postgresql-unit.tar.gz --strip-components=1 -C . && \
     make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
@@ -239,6 +275,7 @@ RUN wget https://github.com/df7cb/postgresql-unit/archive/refs/tags/7.7.tar.gz -
 #
 #########################################################################################
 FROM build-deps AS vector-pg-build
+ARG PG_VERSION
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
 
 COPY patches/pgvector.patch /pgvector.patch
@@ -246,7 +283,10 @@ COPY patches/pgvector.patch /pgvector.patch
 # By default, pgvector Makefile uses `-march=native`. We don't want that,
 # because we build the images on different machines than where we run them.
 # Pass OPTFLAGS="" to remove it.
-RUN wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.7.2.tar.gz -O pgvector.tar.gz && \
+RUN case "${PG_VERSION}" in "v17") \
+    echo "v17 extensions are not supported yet. Quit" && exit 0;; \
+    esac && \
+    wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.7.2.tar.gz -O pgvector.tar.gz && \
     echo "617fba855c9bcb41a2a9bc78a78567fd2e147c72afd5bf9d37b31b9591632b30 pgvector.tar.gz" | sha256sum --check && \
     mkdir pgvector-src && cd pgvector-src && tar xzf ../pgvector.tar.gz --strip-components=1 -C . && \
     patch -p1 < /pgvector.patch && \
@@ -261,10 +301,14 @@ RUN wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.7.2.tar.gz -O
 #
 #########################################################################################
 FROM build-deps AS pgjwt-pg-build
+ARG PG_VERSION
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
 
 # 9742dab1b2f297ad3811120db7b21451bca2d3c9 made on 13/11/2021
-RUN wget https://github.com/michelp/pgjwt/archive/9742dab1b2f297ad3811120db7b21451bca2d3c9.tar.gz -O pgjwt.tar.gz && \
+RUN case "${PG_VERSION}" in "v17") \
+    echo "v17 extensions are not supported yet. Quit" && exit 0;; \
+    esac && \
+    wget https://github.com/michelp/pgjwt/archive/9742dab1b2f297ad3811120db7b21451bca2d3c9.tar.gz -O pgjwt.tar.gz && \
     echo "cfdefb15007286f67d3d45510f04a6a7a495004be5b3aecb12cda667e774203f pgjwt.tar.gz" | sha256sum --check && \
     mkdir pgjwt-src && cd pgjwt-src && tar xzf ../pgjwt.tar.gz --strip-components=1 -C . && \
     make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
@@ -277,9 +321,13 @@ RUN wget https://github.com/michelp/pgjwt/archive/9742dab1b2f297ad3811120db7b214
 #
 #########################################################################################
 FROM build-deps AS hypopg-pg-build
+ARG PG_VERSION
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
 
-RUN wget https://github.com/HypoPG/hypopg/archive/refs/tags/1.4.0.tar.gz -O hypopg.tar.gz && \
+RUN case "${PG_VERSION}" in "v17") \
+    echo "v17 extensions are not supported yet. Quit" && exit 0;; \
+    esac && \
+    wget https://github.com/HypoPG/hypopg/archive/refs/tags/1.4.0.tar.gz -O hypopg.tar.gz && \
     echo "0821011743083226fc9b813c1f2ef5897a91901b57b6bea85a78e466187c6819 hypopg.tar.gz" | sha256sum --check && \
     mkdir hypopg-src && cd hypopg-src && tar xzf ../hypopg.tar.gz --strip-components=1 -C . && \
     make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
@@ -293,9 +341,13 @@ RUN wget https://github.com/HypoPG/hypopg/archive/refs/tags/1.4.0.tar.gz -O hypo
 #
 #########################################################################################
 FROM build-deps AS pg-hashids-pg-build
+ARG PG_VERSION
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
 
-RUN wget https://github.com/iCyberon/pg_hashids/archive/refs/tags/v1.2.1.tar.gz -O pg_hashids.tar.gz && \
+RUN case "${PG_VERSION}" in "v17") \
+    echo "v17 extensions are not supported yet. Quit" && exit 0;; \
+    esac && \
+    wget https://github.com/iCyberon/pg_hashids/archive/refs/tags/v1.2.1.tar.gz -O pg_hashids.tar.gz && \
     echo "74576b992d9277c92196dd8d816baa2cc2d8046fe102f3dcd7f3c3febed6822a pg_hashids.tar.gz" | sha256sum --check && \
     mkdir pg_hashids-src && cd pg_hashids-src && tar xzf ../pg_hashids.tar.gz --strip-components=1 -C . && \
     make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \
@@ -309,11 +361,15 @@ RUN wget https://github.com/iCyberon/pg_hashids/archive/refs/tags/v1.2.1.tar.gz
 #
 #########################################################################################
 FROM build-deps AS rum-pg-build
+ARG PG_VERSION
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
 
 COPY patches/rum.patch /rum.patch
 
-RUN wget https://github.com/postgrespro/rum/archive/refs/tags/1.3.13.tar.gz -O rum.tar.gz && \
+RUN case "${PG_VERSION}" in "v17") \
+    echo "v17 extensions are not supported yet. Quit" && exit 0;; \
+    esac && \
+    wget https://github.com/postgrespro/rum/archive/refs/tags/1.3.13.tar.gz -O rum.tar.gz && \
     echo "6ab370532c965568df6210bd844ac6ba649f53055e48243525b0b7e5c4d69a7d rum.tar.gz" | sha256sum --check && \
     mkdir rum-src && cd rum-src && tar xzf ../rum.tar.gz --strip-components=1 -C . && \
     patch -p1 < /rum.patch && \
@@ -328,9 +384,13 @@ RUN wget https://github.com/postgrespro/rum/archive/refs/tags/1.3.13.tar.gz -O r
 #
 #########################################################################################
 FROM build-deps AS pgtap-pg-build
+ARG PG_VERSION
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
 
-RUN wget https://github.com/theory/pgtap/archive/refs/tags/v1.2.0.tar.gz -O pgtap.tar.gz && \
+RUN case "${PG_VERSION}" in "v17") \
+    echo "v17 extensions are not supported yet. Quit" && exit 0;; \
+    esac && \
+    wget https://github.com/theory/pgtap/archive/refs/tags/v1.2.0.tar.gz -O pgtap.tar.gz && \
     echo "9c7c3de67ea41638e14f06da5da57bac6f5bd03fea05c165a0ec862205a5c052 pgtap.tar.gz" | sha256sum --check && \
     mkdir pgtap-src && cd pgtap-src && tar xzf ../pgtap.tar.gz --strip-components=1 -C . && \
     make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
@@ -344,9 +404,13 @@ RUN wget https://github.com/theory/pgtap/archive/refs/tags/v1.2.0.tar.gz -O pgta
 #
 #########################################################################################
 FROM build-deps AS ip4r-pg-build
+ARG PG_VERSION
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
 
-RUN wget https://github.com/RhodiumToad/ip4r/archive/refs/tags/2.4.2.tar.gz -O ip4r.tar.gz && \
+RUN case "${PG_VERSION}" in "v17") \
+    echo "v17 extensions are not supported yet. Quit" && exit 0;; \
+    esac && \
+    wget https://github.com/RhodiumToad/ip4r/archive/refs/tags/2.4.2.tar.gz -O ip4r.tar.gz && \
     echo "0f7b1f159974f49a47842a8ab6751aecca1ed1142b6d5e38d81b064b2ead1b4b ip4r.tar.gz" | sha256sum --check && \
     mkdir ip4r-src && cd ip4r-src && tar xzf ../ip4r.tar.gz --strip-components=1 -C . && \
     make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
@@ -360,9 +424,13 @@ RUN wget https://github.com/RhodiumToad/ip4r/archive/refs/tags/2.4.2.tar.gz -O i
 #
 #########################################################################################
 FROM build-deps AS prefix-pg-build
+ARG PG_VERSION
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
 
-RUN wget https://github.com/dimitri/prefix/archive/refs/tags/v1.2.10.tar.gz -O prefix.tar.gz && \
+RUN case "${PG_VERSION}" in "v17") \
+    echo "v17 extensions are not supported yet. Quit" && exit 0;; \
+    esac && \
+    wget https://github.com/dimitri/prefix/archive/refs/tags/v1.2.10.tar.gz -O prefix.tar.gz && \
     echo "4342f251432a5f6fb05b8597139d3ccde8dcf87e8ca1498e7ee931ca057a8575 prefix.tar.gz" | sha256sum --check && \
     mkdir prefix-src && cd prefix-src && tar xzf ../prefix.tar.gz --strip-components=1 -C . && \
     make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
@@ -376,9 +444,13 @@ RUN wget https://github.com/dimitri/prefix/archive/refs/tags/v1.2.10.tar.gz -O p
 #
 #########################################################################################
 FROM build-deps AS hll-pg-build
+ARG PG_VERSION
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
 
-RUN wget https://github.com/citusdata/postgresql-hll/archive/refs/tags/v2.18.tar.gz -O hll.tar.gz && \
+RUN case "${PG_VERSION}" in "v17") \
+    echo "v17 extensions are not supported yet. Quit" && exit 0;; \
+    esac && \
+    wget https://github.com/citusdata/postgresql-hll/archive/refs/tags/v2.18.tar.gz -O hll.tar.gz && \
     echo "e2f55a6f4c4ab95ee4f1b4a2b73280258c5136b161fe9d059559556079694f0e hll.tar.gz" | sha256sum --check && \
     mkdir hll-src && cd hll-src && tar xzf ../hll.tar.gz --strip-components=1 -C . && \
     make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
@@ -392,9 +464,13 @@ RUN wget https://github.com/citusdata/postgresql-hll/archive/refs/tags/v2.18.tar
 #
 #########################################################################################
 FROM build-deps AS plpgsql-check-pg-build
+ARG PG_VERSION
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
 
-RUN wget https://github.com/okbob/plpgsql_check/archive/refs/tags/v2.5.3.tar.gz -O plpgsql_check.tar.gz && \
+RUN case "${PG_VERSION}" in "v17") \
+    echo "v17 extensions are not supported yet. Quit" && exit 0;; \
+    esac && \
+    wget https://github.com/okbob/plpgsql_check/archive/refs/tags/v2.5.3.tar.gz -O plpgsql_check.tar.gz && \
     echo "6631ec3e7fb3769eaaf56e3dfedb829aa761abf163d13dba354b4c218508e1c0 plpgsql_check.tar.gz" | sha256sum --check && \
     mkdir plpgsql_check-src && cd plpgsql_check-src && tar xzf ../plpgsql_check.tar.gz --strip-components=1 -C . && \
     make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \
@@ -413,7 +489,10 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
 ARG PG_VERSION
 ENV PATH="/usr/local/pgsql/bin:$PATH"
 
-RUN case "${PG_VERSION}" in \
+RUN case "${PG_VERSION}" in "v17") \
+    echo "v17 extensions are not supported yet. Quit" && exit 0;; \
+    esac && \
+    case "${PG_VERSION}" in \
       "v14" | "v15") \
         export TIMESCALEDB_VERSION=2.10.1 \
         export TIMESCALEDB_CHECKSUM=6fca72a6ed0f6d32d2b3523951ede73dc5f9b0077b38450a029a5f411fdb8c73 \
@@ -446,7 +525,10 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
 ARG PG_VERSION
 ENV PATH="/usr/local/pgsql/bin:$PATH"
 
-RUN case "${PG_VERSION}" in \
+RUN case "${PG_VERSION}" in "v17") \
+    echo "v17 extensions are not supported yet. Quit" && exit 0;; \
+    esac && \
+    case "${PG_VERSION}" in \
       "v14") \
         export PG_HINT_PLAN_VERSION=14_1_4_1 \
         export PG_HINT_PLAN_CHECKSUM=c3501becf70ead27f70626bce80ea401ceac6a77e2083ee5f3ff1f1444ec1ad1 \
@@ -459,6 +541,9 @@ RUN case "${PG_VERSION}" in \
         export PG_HINT_PLAN_VERSION=16_1_6_0 \
         export PG_HINT_PLAN_CHECKSUM=fc85a9212e7d2819d4ae4ac75817481101833c3cfa9f0fe1f980984e12347d00 \
         ;; \
+      "v17") \
+        echo "TODO: PG17 pg_hint_plan support" && exit 0 \
+        ;; \
       *) \
         echo "Export the valid PG_HINT_PLAN_VERSION variable" && exit 1 \
         ;; \
@@ -478,10 +563,14 @@ RUN case "${PG_VERSION}" in \
 #
 #########################################################################################
 FROM build-deps AS pg-cron-pg-build
+ARG PG_VERSION
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
 
 ENV PATH="/usr/local/pgsql/bin/:$PATH"
-RUN wget https://github.com/citusdata/pg_cron/archive/refs/tags/v1.6.0.tar.gz -O pg_cron.tar.gz && \
+RUN case "${PG_VERSION}" in "v17") \
+    echo "v17 extensions are not supported yet. Quit" && exit 0;; \
+    esac && \
+    wget https://github.com/citusdata/pg_cron/archive/refs/tags/v1.6.0.tar.gz -O pg_cron.tar.gz && \
     echo "383a627867d730222c272bfd25cd5e151c578d73f696d32910c7db8c665cc7db pg_cron.tar.gz" | sha256sum --check && \
     mkdir pg_cron-src && cd pg_cron-src && tar xzf ../pg_cron.tar.gz --strip-components=1 -C . && \
     make -j $(getconf _NPROCESSORS_ONLN) && \
@@ -495,9 +584,13 @@ RUN wget https://github.com/citusdata/pg_cron/archive/refs/tags/v1.6.0.tar.gz -O
 #
 #########################################################################################
 FROM build-deps AS rdkit-pg-build
+ARG PG_VERSION
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
 
-RUN apt-get update && \
+RUN case "${PG_VERSION}" in "v17") \
+    echo "v17 extensions are not supported yet. Quit" && exit 0;; \
+    esac && \
+    apt-get update && \
     apt-get install -y \
         cmake \
         libboost-iostreams1.74-dev \
@@ -507,7 +600,10 @@ RUN apt-get update && \
         libeigen3-dev
 
 ENV PATH="/usr/local/pgsql/bin/:/usr/local/pgsql/:$PATH"
-RUN wget https://github.com/rdkit/rdkit/archive/refs/tags/Release_2023_03_3.tar.gz -O rdkit.tar.gz && \
+RUN case "${PG_VERSION}" in "v17") \
+    echo "v17 extensions are not supported yet. Quit" && exit 0;; \
+    esac && \
+    wget https://github.com/rdkit/rdkit/archive/refs/tags/Release_2023_03_3.tar.gz -O rdkit.tar.gz && \
     echo "bdbf9a2e6988526bfeb8c56ce3cdfe2998d60ac289078e2215374288185e8c8d rdkit.tar.gz" | sha256sum --check && \
     mkdir rdkit-src && cd rdkit-src && tar xzf ../rdkit.tar.gz --strip-components=1 -C . && \
     cmake \
@@ -544,10 +640,14 @@ RUN wget https://github.com/rdkit/rdkit/archive/refs/tags/Release_2023_03_3.tar.
 #
 #########################################################################################
 FROM build-deps AS pg-uuidv7-pg-build
+ARG PG_VERSION
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
 
 ENV PATH="/usr/local/pgsql/bin/:$PATH"
-RUN wget https://github.com/fboulnois/pg_uuidv7/archive/refs/tags/v1.0.1.tar.gz -O pg_uuidv7.tar.gz && \
+RUN case "${PG_VERSION}" in "v17") \
+    echo "v17 extensions are not supported yet. Quit" && exit 0;; \
+    esac && \
+    wget https://github.com/fboulnois/pg_uuidv7/archive/refs/tags/v1.0.1.tar.gz -O pg_uuidv7.tar.gz && \
     echo "0d0759ab01b7fb23851ecffb0bce27822e1868a4a5819bfd276101c716637a7a pg_uuidv7.tar.gz" | sha256sum --check && \
     mkdir pg_uuidv7-src && cd pg_uuidv7-src && tar xzf ../pg_uuidv7.tar.gz --strip-components=1 -C . && \
     make -j $(getconf _NPROCESSORS_ONLN) && \
@@ -561,10 +661,14 @@ RUN wget https://github.com/fboulnois/pg_uuidv7/archive/refs/tags/v1.0.1.tar.gz
 #
 #########################################################################################
 FROM build-deps AS pg-roaringbitmap-pg-build
+ARG PG_VERSION
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
 
 ENV PATH="/usr/local/pgsql/bin/:$PATH"
-RUN wget https://github.com/ChenHuajun/pg_roaringbitmap/archive/refs/tags/v0.5.4.tar.gz -O pg_roaringbitmap.tar.gz && \
+RUN case "${PG_VERSION}" in "v17") \
+    echo "v17 extensions are not supported yet. Quit" && exit 0;; \
+    esac && \
+    wget https://github.com/ChenHuajun/pg_roaringbitmap/archive/refs/tags/v0.5.4.tar.gz -O pg_roaringbitmap.tar.gz && \
     echo "b75201efcb1c2d1b014ec4ae6a22769cc7a224e6e406a587f5784a37b6b5a2aa pg_roaringbitmap.tar.gz" | sha256sum --check && \
     mkdir pg_roaringbitmap-src && cd pg_roaringbitmap-src && tar xzf ../pg_roaringbitmap.tar.gz --strip-components=1 -C . && \
     make -j $(getconf _NPROCESSORS_ONLN) && \
@@ -578,10 +682,14 @@ RUN wget https://github.com/ChenHuajun/pg_roaringbitmap/archive/refs/tags/v0.5.4
 #
 #########################################################################################
 FROM build-deps AS pg-semver-pg-build
+ARG PG_VERSION
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
 
 ENV PATH="/usr/local/pgsql/bin/:$PATH"
-RUN wget https://github.com/theory/pg-semver/archive/refs/tags/v0.32.1.tar.gz -O pg_semver.tar.gz && \
+RUN case "${PG_VERSION}" in "v17") \
+    echo "v17 extensions are not supported yet. Quit" && exit 0;; \
+    esac && \
+    wget https://github.com/theory/pg-semver/archive/refs/tags/v0.32.1.tar.gz -O pg_semver.tar.gz && \
     echo "fbdaf7512026d62eec03fad8687c15ed509b6ba395bff140acd63d2e4fbe25d7 pg_semver.tar.gz" | sha256sum --check && \
     mkdir pg_semver-src && cd pg_semver-src && tar xzf ../pg_semver.tar.gz --strip-components=1 -C . && \
     make -j $(getconf _NPROCESSORS_ONLN) && \
@@ -599,7 +707,10 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
 
 ARG PG_VERSION
 ENV PATH="/usr/local/pgsql/bin/:$PATH"
-RUN case "${PG_VERSION}" in \
+RUN case "${PG_VERSION}" in "v17") \
+    echo "v17 extensions are not supported yet. Quit" && exit 0;; \
+    esac && \
+    case "${PG_VERSION}" in \
       "v14" | "v15") \
         export PG_EMBEDDING_VERSION=0.3.5 \
         export PG_EMBEDDING_CHECKSUM=0e95b27b8b6196e2cf0a0c9ec143fe2219b82e54c5bb4ee064e76398cbe69ae9 \
@@ -620,10 +731,14 @@ RUN case "${PG_VERSION}" in \
 #
 #########################################################################################
 FROM build-deps AS pg-anon-pg-build
+ARG PG_VERSION
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
 
 ENV PATH="/usr/local/pgsql/bin/:$PATH"
-RUN wget  https://github.com/neondatabase/postgresql_anonymizer/archive/refs/tags/neon_1.1.1.tar.gz -O pg_anon.tar.gz && \
+RUN case "${PG_VERSION}" in "v17") \
+    echo "v17 extensions are not supported yet. Quit" && exit 0;; \
+    esac && \
+    wget  https://github.com/neondatabase/postgresql_anonymizer/archive/refs/tags/neon_1.1.1.tar.gz -O pg_anon.tar.gz && \
     echo "321ea8d5c1648880aafde850a2c576e4a9e7b9933a34ce272efc839328999fa9  pg_anon.tar.gz" | sha256sum --check && \
     mkdir pg_anon-src && cd pg_anon-src && tar xzf ../pg_anon.tar.gz --strip-components=1 -C . && \
     find /usr/local/pgsql -type f | sed 's|^/usr/local/pgsql/||' > /before.txt &&\
@@ -641,6 +756,7 @@ RUN wget  https://github.com/neondatabase/postgresql_anonymizer/archive/refs/tag
 #
 #########################################################################################
 FROM build-deps AS rust-extensions-build
+ARG PG_VERSION
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
 
 RUN apt-get update && \
@@ -651,9 +767,11 @@ ENV HOME=/home/nonroot
 ENV PATH="/home/nonroot/.cargo/bin:/usr/local/pgsql/bin/:$PATH"
 USER nonroot
 WORKDIR /home/nonroot
-ARG PG_VERSION
 
-RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux-gnu/rustup-init && \
+RUN case "${PG_VERSION}" in "v17") \
+    echo "v17 extensions are not supported yet. Quit" && exit 0;; \
+    esac && \
+    curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux-gnu/rustup-init && \
     chmod +x rustup-init && \
     ./rustup-init -y --no-modify-path --profile minimal --default-toolchain stable && \
     rm rustup-init && \
@@ -672,7 +790,10 @@ USER root
 FROM rust-extensions-build AS pg-jsonschema-pg-build
 ARG PG_VERSION
 
-RUN wget https://github.com/supabase/pg_jsonschema/archive/refs/tags/v0.3.1.tar.gz -O pg_jsonschema.tar.gz && \
+RUN case "${PG_VERSION}" in "v17") \
+    echo "v17 extensions are not supported yet. Quit" && exit 0;; \
+    esac && \
+    wget https://github.com/supabase/pg_jsonschema/archive/refs/tags/v0.3.1.tar.gz -O pg_jsonschema.tar.gz && \
     echo "61df3db1ed83cf24f6aa39c826f8818bfa4f0bd33b587fd6b2b1747985642297 pg_jsonschema.tar.gz" | sha256sum --check && \
     mkdir pg_jsonschema-src && cd pg_jsonschema-src && tar xzf ../pg_jsonschema.tar.gz --strip-components=1 -C . && \
     # see commit 252b3685a27a0f4c31a0f91e983c6314838e89e8
@@ -694,7 +815,10 @@ RUN wget https://github.com/supabase/pg_jsonschema/archive/refs/tags/v0.3.1.tar.
 FROM rust-extensions-build AS pg-graphql-pg-build
 ARG PG_VERSION
 
-RUN wget https://github.com/supabase/pg_graphql/archive/refs/tags/v1.5.7.tar.gz -O pg_graphql.tar.gz && \
+RUN case "${PG_VERSION}" in "v17") \
+    echo "v17 extensions are not supported yet. Quit" && exit 0;; \
+    esac && \
+    wget https://github.com/supabase/pg_graphql/archive/refs/tags/v1.5.7.tar.gz -O pg_graphql.tar.gz && \
     echo "2b3e567a5b31019cb97ae0e33263c1bcc28580be5a444ac4c8ece5c4be2aea41 pg_graphql.tar.gz" | sha256sum --check && \
     mkdir pg_graphql-src && cd pg_graphql-src && tar xzf ../pg_graphql.tar.gz --strip-components=1 -C . && \
     sed -i 's/pgrx = "=0.11.3"/pgrx = { version = "0.11.3", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
@@ -714,7 +838,10 @@ FROM rust-extensions-build AS pg-tiktoken-pg-build
 ARG PG_VERSION
 
 # 26806147b17b60763039c6a6878884c41a262318 made on 26/09/2023
-RUN wget https://github.com/kelvich/pg_tiktoken/archive/26806147b17b60763039c6a6878884c41a262318.tar.gz -O pg_tiktoken.tar.gz && \
+RUN case "${PG_VERSION}" in "v17") \
+    echo "v17 extensions are not supported yet. Quit" && exit 0;; \
+    esac && \
+    wget https://github.com/kelvich/pg_tiktoken/archive/26806147b17b60763039c6a6878884c41a262318.tar.gz -O pg_tiktoken.tar.gz && \
     echo "e64e55aaa38c259512d3e27c572da22c4637418cf124caba904cd50944e5004e pg_tiktoken.tar.gz" | sha256sum --check && \
     mkdir pg_tiktoken-src && cd pg_tiktoken-src && tar xzf ../pg_tiktoken.tar.gz --strip-components=1 -C . && \
     # TODO update pgrx version in the pg_tiktoken repo and remove this line
@@ -733,7 +860,10 @@ RUN wget https://github.com/kelvich/pg_tiktoken/archive/26806147b17b60763039c6a6
 FROM rust-extensions-build AS pg-pgx-ulid-build
 ARG PG_VERSION
 
-RUN wget https://github.com/pksunkara/pgx_ulid/archive/refs/tags/v0.1.5.tar.gz -O pgx_ulid.tar.gz && \
+RUN case "${PG_VERSION}" in "v17") \
+    echo "v17 extensions are not supported yet. Quit" && exit 0;; \
+    esac && \
+    wget https://github.com/pksunkara/pgx_ulid/archive/refs/tags/v0.1.5.tar.gz -O pgx_ulid.tar.gz && \
     echo "9d1659a2da65af0133d5451c454de31b37364e3502087dadf579f790bc8bef17 pgx_ulid.tar.gz" | sha256sum --check && \
     mkdir pgx_ulid-src && cd pgx_ulid-src && tar xzf ../pgx_ulid.tar.gz --strip-components=1 -C . && \
     sed -i 's/pgrx       = "^0.11.2"/pgrx = { version = "=0.11.3", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
@@ -748,10 +878,14 @@ RUN wget https://github.com/pksunkara/pgx_ulid/archive/refs/tags/v0.1.5.tar.gz -
 #########################################################################################
 
 FROM build-deps AS wal2json-pg-build
+ARG PG_VERSION
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
 
 ENV PATH="/usr/local/pgsql/bin/:$PATH"
-RUN wget https://github.com/eulerto/wal2json/archive/refs/tags/wal2json_2_5.tar.gz && \
+RUN case "${PG_VERSION}" in "v17") \
+    echo "v17 extensions are not supported yet. Quit" && exit 0;; \
+    esac && \
+    wget https://github.com/eulerto/wal2json/archive/refs/tags/wal2json_2_5.tar.gz && \
     echo "b516653575541cf221b99cf3f8be9b6821f6dbcfc125675c85f35090f824f00e wal2json_2_5.tar.gz" | sha256sum --check && \
     mkdir wal2json-src && cd wal2json-src && tar xzf ../wal2json_2_5.tar.gz --strip-components=1 -C . && \
     make -j $(getconf _NPROCESSORS_ONLN) && \
@@ -764,10 +898,14 @@ RUN wget https://github.com/eulerto/wal2json/archive/refs/tags/wal2json_2_5.tar.
 #
 #########################################################################################
 FROM build-deps AS pg-ivm-build
+ARG PG_VERSION
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
 
 ENV PATH="/usr/local/pgsql/bin/:$PATH"
-RUN wget https://github.com/sraoss/pg_ivm/archive/refs/tags/v1.7.tar.gz -O pg_ivm.tar.gz && \
+RUN case "${PG_VERSION}" in "v17") \
+    echo "v17 extensions are not supported yet. Quit" && exit 0;; \
+    esac && \
+    wget https://github.com/sraoss/pg_ivm/archive/refs/tags/v1.7.tar.gz -O pg_ivm.tar.gz && \
     echo "ebfde04f99203c7be4b0e873f91104090e2e83e5429c32ac242d00f334224d5e pg_ivm.tar.gz" | sha256sum --check && \
     mkdir pg_ivm-src && cd pg_ivm-src && tar xzf ../pg_ivm.tar.gz --strip-components=1 -C . && \
     make -j $(getconf _NPROCESSORS_ONLN) && \
@@ -781,10 +919,14 @@ RUN wget https://github.com/sraoss/pg_ivm/archive/refs/tags/v1.7.tar.gz -O pg_iv
 #
 #########################################################################################
 FROM build-deps AS pg-partman-build
+ARG PG_VERSION
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
 
 ENV PATH="/usr/local/pgsql/bin/:$PATH"
-RUN wget https://github.com/pgpartman/pg_partman/archive/refs/tags/v5.0.1.tar.gz -O pg_partman.tar.gz && \
+RUN case "${PG_VERSION}" in "v17") \
+    echo "v17 extensions are not supported yet. Quit" && exit 0;; \
+    esac && \
+    wget https://github.com/pgpartman/pg_partman/archive/refs/tags/v5.0.1.tar.gz -O pg_partman.tar.gz && \
     echo "75b541733a9659a6c90dbd40fccb904a630a32880a6e3044d0c4c5f4c8a65525 pg_partman.tar.gz" | sha256sum --check && \
     mkdir pg_partman-src && cd pg_partman-src && tar xzf ../pg_partman.tar.gz --strip-components=1 -C . && \
     make -j $(getconf _NPROCESSORS_ONLN) && \
@@ -835,7 +977,10 @@ COPY --from=pg-ivm-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY --from=pg-partman-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY pgxn/ pgxn/
 
-RUN make -j $(getconf _NPROCESSORS_ONLN) \
+RUN case "${PG_VERSION}" in "v17") \
+    echo "v17 extensions are not supported yet. Quit" && exit 0;; \
+    esac && \
+    make -j $(getconf _NPROCESSORS_ONLN) \
         PG_CONFIG=/usr/local/pgsql/bin/pg_config \
         -C pgxn/neon \
         -s install && \
@@ -854,8 +999,8 @@ RUN make -j $(getconf _NPROCESSORS_ONLN) \
     case "${PG_VERSION}" in \
         "v14" | "v15") \
         ;; \
-        "v16") \
-            echo "Skipping HNSW for PostgreSQL 16" && exit 0 \
+        "v16" | "v17") \
+            echo "Skipping HNSW for PostgreSQL ${PG_VERSION}" && exit 0 \
         ;; \
         *) \
             echo "unexpected PostgreSQL version" && exit 1 \
@@ -878,7 +1023,10 @@ ENV BUILD_TAG=$BUILD_TAG
 USER nonroot
 # Copy entire project to get Cargo.* files with proper dependencies for the whole project
 COPY --chown=nonroot . .
-RUN cd compute_tools && mold -run cargo build --locked --profile release-line-debug-size-lto
+RUN case "${PG_VERSION}" in "v17") \
+    echo "v17 extensions are not supported yet. Quit" && exit 0;; \
+    esac && \
+    cd compute_tools && mold -run cargo build --locked --profile release-line-debug-size-lto
 
 #########################################################################################
 #
@@ -899,15 +1047,24 @@ FROM neon-pg-ext-build AS postgres-cleanup-layer
 COPY --from=neon-pg-ext-build /usr/local/pgsql /usr/local/pgsql
 
 # Remove binaries from /bin/ that we won't use (or would manually copy & install otherwise)
-RUN cd /usr/local/pgsql/bin && rm ecpg raster2pgsql shp2pgsql pgtopo_export pgtopo_import pgsql2shp
+RUN case "${PG_VERSION}" in "v17") \
+    echo "v17 extensions are not supported yet. Quit" && exit 0;; \
+    esac && \
+    cd /usr/local/pgsql/bin && rm ecpg raster2pgsql shp2pgsql pgtopo_export pgtopo_import pgsql2shp
 
 # Remove headers that we won't need anymore - we've completed installation of all extensions
-RUN rm -r /usr/local/pgsql/include
+RUN case "${PG_VERSION}" in "v17") \
+    echo "v17 extensions are not supported yet. Quit" && exit 0;; \
+    esac && \
+    rm -r /usr/local/pgsql/include
 
 # Remove static postgresql libraries - all compilation is finished, so we
 # can now remove these files - they must be included in other binaries by now
 # if they were to be used by other libraries.
-RUN rm /usr/local/pgsql/lib/lib*.a
+RUN case "${PG_VERSION}" in "v17") \
+    echo "v17 extensions are not supported yet. Quit" && exit 0;; \
+    esac && \
+    rm /usr/local/pgsql/lib/lib*.a
 
 
 #########################################################################################
@@ -918,7 +1075,10 @@ RUN rm /usr/local/pgsql/lib/lib*.a
 
 FROM neon-pg-ext-build AS neon-pg-ext-test
 ARG PG_VERSION
-RUN mkdir /ext-src
+RUN case "${PG_VERSION}" in "v17") \
+    echo "v17 extensions are not supported yet. Quit" && exit 0;; \
+    esac && \
+    mkdir /ext-src
 
 #COPY --from=postgis-build /postgis.tar.gz /ext-src/
 #COPY --from=postgis-build /sfcgal/* /usr
@@ -956,18 +1116,39 @@ COPY --from=pg-anon-pg-build /pg_anon.tar.gz /ext-src
 COPY patches/pg_anon.patch /ext-src
 COPY --from=pg-ivm-build /pg_ivm.tar.gz /ext-src
 COPY --from=pg-partman-build /pg_partman.tar.gz /ext-src
-RUN cd /ext-src/ && for f in *.tar.gz; \
+RUN case "${PG_VERSION}" in "v17") \
+    echo "v17 extensions are not supported yet. Quit" && exit 0;; \
+    esac && \
+    cd /ext-src/ && for f in *.tar.gz; \
     do echo $f; dname=$(echo $f | sed 's/\.tar.*//')-src; \
     rm -rf $dname; mkdir $dname; tar xzf $f --strip-components=1 -C $dname \
     || exit 1; rm -f $f; done
-RUN cd /ext-src/pgvector-src && patch -p1 <../pgvector.patch
-RUN cd /ext-src/rum-src && patch -p1 <../rum.patch
+RUN case "${PG_VERSION}" in "v17") \
+    echo "v17 extensions are not supported yet. Quit" && exit 0;; \
+    esac && \
+    cd /ext-src/rum-src && patch -p1 <../rum.patch
+RUN case "${PG_VERSION}" in "v17") \
+    echo "v17 extensions are not supported yet. Quit" && exit 0;; \
+    esac && \
+    cd /ext-src/pgvector-src && patch -p1 <../pgvector.patch
 # cmake is required for the h3 test
-RUN apt-get update && apt-get install -y cmake
-RUN cd /ext-src/pg_hint_plan-src && patch -p1 < /ext-src/pg_hint_plan.patch
+RUN case "${PG_VERSION}" in "v17") \
+    echo "v17 extensions are not supported yet. Quit" && exit 0;; \
+    esac && \
+    apt-get update && apt-get install -y cmake
+RUN case "${PG_VERSION}" in "v17") \
+    echo "v17 extensions are not supported yet. Quit" && exit 0;; \
+    esac && \
+    cd /ext-src/pg_hint_plan-src && patch -p1 < /ext-src/pg_hint_plan.patch
 COPY --chmod=755 docker-compose/run-tests.sh /run-tests.sh
-RUN patch -p1 </ext-src/pg_anon.patch
-RUN patch -p1 </ext-src/pg_cron.patch
+RUN case "${PG_VERSION}" in "v17") \
+    echo "v17 extensions are not supported yet. Quit" && exit 0;; \
+    esac && \
+    patch -p1 </ext-src/pg_anon.patch
+RUN case "${PG_VERSION}" in "v17") \
+    echo "v17 extensions are not supported yet. Quit" && exit 0;; \
+    esac && \
+    patch -p1 </ext-src/pg_cron.patch
 ENV PATH=/usr/local/pgsql/bin:$PATH
 ENV PGHOST=compute
 ENV PGPORT=55433
diff --git a/Makefile b/Makefile
index de298303e3..b9bb1c147d 100644
--- a/Makefile
+++ b/Makefile
@@ -119,6 +119,8 @@ $(POSTGRES_INSTALL_DIR)/build/%/config.status:
 # I'm not sure why it wouldn't work, but this is the only place (apart from
 # the "build-all-versions" entry points) where direct mention of PostgreSQL
 # versions is used.
+.PHONY: postgres-configure-v17
+postgres-configure-v17: $(POSTGRES_INSTALL_DIR)/build/v17/config.status
 .PHONY: postgres-configure-v16
 postgres-configure-v16: $(POSTGRES_INSTALL_DIR)/build/v16/config.status
 .PHONY: postgres-configure-v15
@@ -215,29 +217,31 @@ neon-pg-clean-ext-%:
 # they depend on openssl and other libraries that are not included in our
 # Rust build.
 .PHONY: walproposer-lib
-walproposer-lib: neon-pg-ext-v16
+walproposer-lib: neon-pg-ext-v17
 	+@echo "Compiling walproposer-lib"
 	mkdir -p $(POSTGRES_INSTALL_DIR)/build/walproposer-lib
-	$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/v16/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \
+	$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/v17/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \
 		-C $(POSTGRES_INSTALL_DIR)/build/walproposer-lib \
 		-f $(ROOT_PROJECT_DIR)/pgxn/neon/Makefile walproposer-lib
-	cp $(POSTGRES_INSTALL_DIR)/v16/lib/libpgport.a $(POSTGRES_INSTALL_DIR)/build/walproposer-lib
-	cp $(POSTGRES_INSTALL_DIR)/v16/lib/libpgcommon.a $(POSTGRES_INSTALL_DIR)/build/walproposer-lib
-ifeq ($(UNAME_S),Linux)
+	cp $(POSTGRES_INSTALL_DIR)/v17/lib/libpgport.a $(POSTGRES_INSTALL_DIR)/build/walproposer-lib
+	cp $(POSTGRES_INSTALL_DIR)/v17/lib/libpgcommon.a $(POSTGRES_INSTALL_DIR)/build/walproposer-lib
 	$(AR) d $(POSTGRES_INSTALL_DIR)/build/walproposer-lib/libpgport.a \
 		pg_strong_random.o
 	$(AR) d $(POSTGRES_INSTALL_DIR)/build/walproposer-lib/libpgcommon.a \
-		pg_crc32c.o \
-		hmac_openssl.o \
+		checksum_helper.o \
 		cryptohash_openssl.o \
-		scram-common.o \
+		hmac_openssl.o \
 		md5_common.o \
-		checksum_helper.o
+		parse_manifest.o \
+		scram-common.o
+ifeq ($(UNAME_S),Linux)
+	$(AR) d $(POSTGRES_INSTALL_DIR)/build/walproposer-lib/libpgcommon.a \
+		pg_crc32c.o
 endif
 
 .PHONY: walproposer-lib-clean
 walproposer-lib-clean:
-	$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/v16/bin/pg_config \
+	$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/v17/bin/pg_config \
 		-C $(POSTGRES_INSTALL_DIR)/build/walproposer-lib \
 		-f $(ROOT_PROJECT_DIR)/pgxn/neon/Makefile clean
 
@@ -245,38 +249,44 @@ walproposer-lib-clean:
 neon-pg-ext: \
 	neon-pg-ext-v14 \
 	neon-pg-ext-v15 \
-	neon-pg-ext-v16
+	neon-pg-ext-v16 \
+	neon-pg-ext-v17
 
 .PHONY: neon-pg-clean-ext
 neon-pg-clean-ext: \
 	neon-pg-clean-ext-v14 \
 	neon-pg-clean-ext-v15 \
-	neon-pg-clean-ext-v16
+	neon-pg-clean-ext-v16 \
+	neon-pg-clean-ext-v17
 
 # shorthand to build all Postgres versions
 .PHONY: postgres
 postgres: \
 	postgres-v14 \
 	postgres-v15 \
-	postgres-v16
+	postgres-v16 \
+	postgres-v17
 
 .PHONY: postgres-headers
 postgres-headers: \
 	postgres-headers-v14 \
 	postgres-headers-v15 \
-	postgres-headers-v16
+	postgres-headers-v16 \
+	postgres-headers-v17
 
 .PHONY: postgres-clean
 postgres-clean: \
 	postgres-clean-v14 \
 	postgres-clean-v15 \
-	postgres-clean-v16
+	postgres-clean-v16 \
+	postgres-clean-v17
 
 .PHONY: postgres-check
 postgres-check: \
 	postgres-check-v14 \
 	postgres-check-v15 \
-	postgres-check-v16
+	postgres-check-v16 \
+	postgres-check-v17
 
 # This doesn't remove the effects of 'configure'.
 .PHONY: clean
@@ -321,13 +331,13 @@ postgres-%-pgindent: postgres-%-pg-bsd-indent postgres-%-typedefs.list
 	rm -f pg*.BAK
 
 # Indent pxgn/neon.
-.PHONY: pgindent
-neon-pgindent: postgres-v16-pg-bsd-indent neon-pg-ext-v16
-	$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/v16/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \
-		FIND_TYPEDEF=$(ROOT_PROJECT_DIR)/vendor/postgres-v16/src/tools/find_typedef \
-		INDENT=$(POSTGRES_INSTALL_DIR)/build/v16/src/tools/pg_bsd_indent/pg_bsd_indent \
-		PGINDENT_SCRIPT=$(ROOT_PROJECT_DIR)/vendor/postgres-v16/src/tools/pgindent/pgindent \
-		-C $(POSTGRES_INSTALL_DIR)/build/neon-v16 \
+.PHONY: neon-pgindent
+neon-pgindent: postgres-v17-pg-bsd-indent neon-pg-ext-v17
+	$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/v17/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \
+		FIND_TYPEDEF=$(ROOT_PROJECT_DIR)/vendor/postgres-v17/src/tools/find_typedef \
+		INDENT=$(POSTGRES_INSTALL_DIR)/build/v17/src/tools/pg_bsd_indent/pg_bsd_indent \
+		PGINDENT_SCRIPT=$(ROOT_PROJECT_DIR)/vendor/postgres-v17/src/tools/pgindent/pgindent \
+		-C $(POSTGRES_INSTALL_DIR)/build/neon-v17 \
 		-f $(ROOT_PROJECT_DIR)/pgxn/neon/Makefile pgindent
 
 
diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs
index 5bd6897fe3..1f47bb58a3 100644
--- a/compute_tools/src/compute.rs
+++ b/compute_tools/src/compute.rs
@@ -1052,26 +1052,19 @@ impl ComputeNode {
         let pg_process = self.start_postgres(pspec.storage_auth_token.clone())?;
 
         let config_time = Utc::now();
-        if pspec.spec.mode == ComputeMode::Primary {
-            if !pspec.spec.skip_pg_catalog_updates {
-                let pgdata_path = Path::new(&self.pgdata);
-                // temporarily reset max_cluster_size in config
-                // to avoid the possibility of hitting the limit, while we are applying config:
-                // creating new extensions, roles, etc...
-                config::with_compute_ctl_tmp_override(
-                    pgdata_path,
-                    "neon.max_cluster_size=-1",
-                    || {
-                        self.pg_reload_conf()?;
-
-                        self.apply_config(&compute_state)?;
-
-                        Ok(())
-                    },
-                )?;
+        if pspec.spec.mode == ComputeMode::Primary && !pspec.spec.skip_pg_catalog_updates {
+            let pgdata_path = Path::new(&self.pgdata);
+            // temporarily reset max_cluster_size in config
+            // to avoid the possibility of hitting the limit, while we are applying config:
+            // creating new extensions, roles, etc...
+            config::with_compute_ctl_tmp_override(pgdata_path, "neon.max_cluster_size=-1", || {
                 self.pg_reload_conf()?;
-            }
-            self.post_apply_config()?;
+
+                self.apply_config(&compute_state)?;
+
+                Ok(())
+            })?;
+            self.pg_reload_conf()?;
         }
 
         let startup_end_time = Utc::now();
diff --git a/compute_tools/src/extension_server.rs b/compute_tools/src/extension_server.rs
index ef1db73982..6ef7e0837f 100644
--- a/compute_tools/src/extension_server.rs
+++ b/compute_tools/src/extension_server.rs
@@ -124,6 +124,7 @@ fn parse_pg_version(human_version: &str) -> &str {
             "14" => return "v14",
             "15" => return "v15",
             "16" => return "v16",
+            "17" => return "v17",
             _ => {}
         },
         _ => {}
diff --git a/control_plane/src/local_env.rs b/control_plane/src/local_env.rs
index 5dbc3bcbbc..d616154af6 100644
--- a/control_plane/src/local_env.rs
+++ b/control_plane/src/local_env.rs
@@ -342,7 +342,7 @@ impl LocalEnv {
 
         #[allow(clippy::manual_range_patterns)]
         match pg_version {
-            14 | 15 | 16 => Ok(path.join(format!("v{pg_version}"))),
+            14 | 15 | 16 | 17 => Ok(path.join(format!("v{pg_version}"))),
             _ => bail!("Unsupported postgres version: {}", pg_version),
         }
     }
diff --git a/control_plane/src/storage_controller.rs b/control_plane/src/storage_controller.rs
index c715d6b789..2b714fbfbf 100644
--- a/control_plane/src/storage_controller.rs
+++ b/control_plane/src/storage_controller.rs
@@ -28,6 +28,7 @@ use utils::{
     auth::{encode_from_key_file, Claims, Scope},
     id::{NodeId, TenantId},
 };
+use whoami::username;
 
 pub struct StorageController {
     env: LocalEnv,
@@ -183,7 +184,7 @@ impl StorageController {
     /// to other versions if that one isn't found.  Some automated tests create circumstances
     /// where only one version is available in pg_distrib_dir, such as `test_remote_extensions`.
     async fn get_pg_dir(&self, dir_name: &str) -> anyhow::Result<Utf8PathBuf> {
-        let prefer_versions = [STORAGE_CONTROLLER_POSTGRES_VERSION, 15, 14];
+        let prefer_versions = [STORAGE_CONTROLLER_POSTGRES_VERSION, 16, 15, 14];
 
         for v in prefer_versions {
             let path = Utf8PathBuf::from_path_buf(self.env.pg_dir(v, dir_name)?).unwrap();
@@ -211,7 +212,16 @@ impl StorageController {
     /// Readiness check for our postgres process
     async fn pg_isready(&self, pg_bin_dir: &Utf8Path, postgres_port: u16) -> anyhow::Result<bool> {
         let bin_path = pg_bin_dir.join("pg_isready");
-        let args = ["-h", "localhost", "-p", &format!("{}", postgres_port)];
+        let args = [
+            "-h",
+            "localhost",
+            "-U",
+            &username(),
+            "-d",
+            DB_NAME,
+            "-p",
+            &format!("{}", postgres_port),
+        ];
         let exitcode = Command::new(bin_path).args(args).spawn()?.wait().await?;
 
         Ok(exitcode.success())
@@ -225,7 +235,11 @@ impl StorageController {
     ///
     /// Returns the database url
     pub async fn setup_database(&self, postgres_port: u16) -> anyhow::Result<String> {
-        let database_url = format!("postgresql://localhost:{}/{DB_NAME}", postgres_port);
+        let database_url = format!(
+            "postgresql://{}@localhost:{}/{DB_NAME}",
+            &username(),
+            postgres_port
+        );
 
         let pg_bin_dir = self.get_pg_bin_dir().await?;
         let createdb_path = pg_bin_dir.join("createdb");
@@ -235,6 +249,10 @@ impl StorageController {
                 "localhost",
                 "-p",
                 &format!("{}", postgres_port),
+                "-U",
+                &username(),
+                "-O",
+                &username(),
                 DB_NAME,
             ])
             .output()
@@ -271,7 +289,7 @@ impl StorageController {
             // But tokio-postgres fork doesn't have this upstream commit:
             // https://github.com/sfackler/rust-postgres/commit/cb609be758f3fb5af537f04b584a2ee0cebd5e79
             // => we should rebase our fork => TODO https://github.com/neondatabase/neon/issues/8399
-            .user(&whoami::username())
+            .user(&username())
             .dbname(DB_NAME)
             .connect(tokio_postgres::NoTls)
             .await
@@ -328,6 +346,12 @@ impl StorageController {
             let pg_log_path = pg_data_path.join("postgres.log");
 
             if !tokio::fs::try_exists(&pg_data_path).await? {
+                let initdb_args = ["-D", pg_data_path.as_ref(), "--username", &username()];
+                tracing::info!(
+                    "Initializing storage controller database with args: {:?}",
+                    initdb_args
+                );
+
                 // Initialize empty database
                 let initdb_path = pg_bin_dir.join("initdb");
                 let mut child = Command::new(&initdb_path)
@@ -335,7 +359,7 @@ impl StorageController {
                         ("LD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()),
                         ("DYLD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()),
                     ])
-                    .args(["-D", pg_data_path.as_ref()])
+                    .args(initdb_args)
                     .spawn()
                     .expect("Failed to spawn initdb");
                 let status = child.wait().await?;
@@ -364,8 +388,14 @@ impl StorageController {
                 pg_data_path.as_ref(),
                 "-l",
                 pg_log_path.as_ref(),
+                "-U",
+                &username(),
                 "start",
             ];
+            tracing::info!(
+                "Starting storage controller database with args: {:?}",
+                db_start_args
+            );
 
             background_process::start_process(
                 "storage_controller_db",
diff --git a/libs/pageserver_api/src/key.rs b/libs/pageserver_api/src/key.rs
index 8929ccb41d..4a776709c9 100644
--- a/libs/pageserver_api/src/key.rs
+++ b/libs/pageserver_api/src/key.rs
@@ -1,8 +1,8 @@
 use anyhow::{bail, Result};
 use byteorder::{ByteOrder, BE};
 use postgres_ffi::relfile_utils::{FSM_FORKNUM, VISIBILITYMAP_FORKNUM};
+use postgres_ffi::Oid;
 use postgres_ffi::RepOriginId;
-use postgres_ffi::{Oid, TransactionId};
 use serde::{Deserialize, Serialize};
 use std::{fmt, ops::Range};
 
@@ -350,7 +350,17 @@ impl Key {
 // 02 00000000 00000000 00000000 00   00000000
 //
 // TwoPhaseFile:
-// 02 00000000 00000000 00000000 00   XID
+//
+// 02 00000000 00000000 00XXXXXX XX   XXXXXXXX
+//
+//                        \______XID_________/
+//
+// The 64-bit XID is stored a little awkwardly in field6, field5 and
+// field4. PostgreSQL v16 and below only stored a 32-bit XID, which
+// fit completely in field6, but starting with PostgreSQL v17, a full
+// 64-bit XID is used. Most pageserver code that accesses
+// TwoPhaseFiles now deals with 64-bit XIDs even on v16, the high bits
+// are just unused.
 //
 // ControlFile:
 // 03 00000000 00000000 00000000 00   00000000
@@ -582,35 +592,36 @@ pub const TWOPHASEDIR_KEY: Key = Key {
 };
 
 #[inline(always)]
-pub fn twophase_file_key(xid: TransactionId) -> Key {
+pub fn twophase_file_key(xid: u64) -> Key {
     Key {
         field1: 0x02,
         field2: 0,
         field3: 0,
-        field4: 0,
-        field5: 0,
-        field6: xid,
+        field4: ((xid & 0xFFFFFF0000000000) >> 40) as u32,
+        field5: ((xid & 0x000000FF00000000) >> 32) as u8,
+        field6: (xid & 0x00000000FFFFFFFF) as u32,
     }
 }
 
 #[inline(always)]
-pub fn twophase_key_range(xid: TransactionId) -> Range<Key> {
+pub fn twophase_key_range(xid: u64) -> Range<Key> {
+    // 64-bit XIDs really should not overflow
     let (next_xid, overflowed) = xid.overflowing_add(1);
 
     Key {
         field1: 0x02,
         field2: 0,
         field3: 0,
-        field4: 0,
-        field5: 0,
-        field6: xid,
+        field4: ((xid & 0xFFFFFF0000000000) >> 40) as u32,
+        field5: ((xid & 0x000000FF00000000) >> 32) as u8,
+        field6: (xid & 0x00000000FFFFFFFF) as u32,
     }..Key {
         field1: 0x02,
         field2: 0,
-        field3: 0,
-        field4: 0,
-        field5: u8::from(overflowed),
-        field6: next_xid,
+        field3: u32::from(overflowed),
+        field4: ((next_xid & 0xFFFFFF0000000000) >> 40) as u32,
+        field5: ((next_xid & 0x000000FF00000000) >> 32) as u8,
+        field6: (next_xid & 0x00000000FFFFFFFF) as u32,
     }
 }
 
diff --git a/libs/postgres_ffi/build.rs b/libs/postgres_ffi/build.rs
index a346390f3d..d3a85f2683 100644
--- a/libs/postgres_ffi/build.rs
+++ b/libs/postgres_ffi/build.rs
@@ -56,7 +56,7 @@ fn main() -> anyhow::Result<()> {
         PathBuf::from("pg_install")
     };
 
-    for pg_version in &["v14", "v15", "v16"] {
+    for pg_version in &["v14", "v15", "v16", "v17"] {
         let mut pg_install_dir_versioned = pg_install_dir.join(pg_version);
         if pg_install_dir_versioned.is_relative() {
             let cwd = env::current_dir().context("Failed to get current_dir")?;
diff --git a/libs/postgres_ffi/src/lib.rs b/libs/postgres_ffi/src/lib.rs
index f18e0c603b..0d46ed6aac 100644
--- a/libs/postgres_ffi/src/lib.rs
+++ b/libs/postgres_ffi/src/lib.rs
@@ -57,6 +57,7 @@ macro_rules! for_all_postgres_versions {
         $macro!(v14);
         $macro!(v15);
         $macro!(v16);
+        $macro!(v17);
     };
 }
 
@@ -91,6 +92,7 @@ macro_rules! dispatch_pgversion {
                 14 : v14,
                 15 : v15,
                 16 : v16,
+                17 : v17,
             ]
         )
     };
@@ -121,6 +123,7 @@ macro_rules! enum_pgversion_dispatch {
                 V14 : v14,
                 V15 : v15,
                 V16 : v16,
+                V17 : v17,
             ]
         )
     };
@@ -150,6 +153,7 @@ macro_rules! enum_pgversion {
                 V14 : v14,
                 V15 : v15,
                 V16 : v16,
+                V17 : v17,
             ]
         }
     };
@@ -162,6 +166,7 @@ macro_rules! enum_pgversion {
                 V14 : v14,
                 V15 : v15,
                 V16 : v16,
+                V17 : v17,
             ]
         }
     };
diff --git a/libs/postgres_ffi/src/pg_constants.rs b/libs/postgres_ffi/src/pg_constants.rs
index 6ce855c78e..61b49a634d 100644
--- a/libs/postgres_ffi/src/pg_constants.rs
+++ b/libs/postgres_ffi/src/pg_constants.rs
@@ -152,6 +152,9 @@ pub const XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED: u8 = (1 << 0) as u8;
 pub const XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED: u8 = (1 << 1) as u8;
 pub const XLH_DELETE_ALL_VISIBLE_CLEARED: u8 = (1 << 0) as u8;
 
+// From heapam_xlog.h
+pub const XLOG_HEAP2_REWRITE: u8 = 0x00;
+
 // From replication/message.h
 pub const XLOG_LOGICAL_MESSAGE: u8 = 0x00;
 
@@ -219,15 +222,20 @@ pub const INVALID_TRANSACTION_ID: u32 = 0;
 pub const FIRST_BOOTSTRAP_OBJECT_ID: u32 = 12000;
 pub const FIRST_NORMAL_OBJECT_ID: u32 = 16384;
 
+/* pg_control.h */
 pub const XLOG_CHECKPOINT_SHUTDOWN: u8 = 0x00;
 pub const XLOG_CHECKPOINT_ONLINE: u8 = 0x10;
-pub const XLP_FIRST_IS_CONTRECORD: u16 = 0x0001;
-pub const XLP_LONG_HEADER: u16 = 0x0002;
+pub const XLOG_PARAMETER_CHANGE: u8 = 0x60;
+pub const XLOG_END_OF_RECOVERY: u8 = 0x90;
 
 /* From xlog.h */
 pub const XLOG_REPLORIGIN_SET: u8 = 0x00;
 pub const XLOG_REPLORIGIN_DROP: u8 = 0x10;
 
+/* xlog_internal.h */
+pub const XLP_FIRST_IS_CONTRECORD: u16 = 0x0001;
+pub const XLP_LONG_HEADER: u16 = 0x0002;
+
 /* From replication/slot.h */
 pub const REPL_SLOT_ON_DISK_OFFSETOF_RESTART_LSN: usize = 4*4  /* offset of `slotdata` in ReplicationSlotOnDisk  */
    + 64 /* NameData */  + 4*4;
@@ -245,33 +253,6 @@ pub const VM_HEAPBLOCKS_PER_PAGE: u32 =
 /* From origin.c */
 pub const REPLICATION_STATE_MAGIC: u32 = 0x1257DADE;
 
-// List of subdirectories inside pgdata.
-// Copied from src/bin/initdb/initdb.c
-pub const PGDATA_SUBDIRS: [&str; 22] = [
-    "global",
-    "pg_wal/archive_status",
-    "pg_commit_ts",
-    "pg_dynshmem",
-    "pg_notify",
-    "pg_serial",
-    "pg_snapshots",
-    "pg_subtrans",
-    "pg_twophase",
-    "pg_multixact",
-    "pg_multixact/members",
-    "pg_multixact/offsets",
-    "base",
-    "base/1",
-    "pg_replslot",
-    "pg_tblspc",
-    "pg_stat",
-    "pg_stat_tmp",
-    "pg_xact",
-    "pg_logical",
-    "pg_logical/snapshots",
-    "pg_logical/mappings",
-];
-
 // Don't include postgresql.conf as it is inconvenient on node start:
 // we need postgresql.conf before basebackup to synchronize safekeepers
 // so no point in overwriting it during backup restore. Rest of the files
diff --git a/libs/postgres_ffi/src/pg_constants_v14.rs b/libs/postgres_ffi/src/pg_constants_v14.rs
index 32f8f51114..fe01a5df7c 100644
--- a/libs/postgres_ffi/src/pg_constants_v14.rs
+++ b/libs/postgres_ffi/src/pg_constants_v14.rs
@@ -5,6 +5,33 @@ pub const BKPIMAGE_IS_COMPRESSED: u8 = 0x02; /* page image is compressed */
 pub const BKPIMAGE_APPLY: u8 = 0x04; /* page image should be restored during replay */
 pub const SIZEOF_RELMAPFILE: usize = 512; /* sizeof(RelMapFile) in relmapper.c */
 
+// List of subdirectories inside pgdata.
+// Copied from src/bin/initdb/initdb.c
+pub const PGDATA_SUBDIRS: [&str; 22] = [
+    "global",
+    "pg_wal/archive_status",
+    "pg_commit_ts",
+    "pg_dynshmem",
+    "pg_notify",
+    "pg_serial",
+    "pg_snapshots",
+    "pg_subtrans",
+    "pg_twophase",
+    "pg_multixact",
+    "pg_multixact/members",
+    "pg_multixact/offsets",
+    "base",
+    "base/1",
+    "pg_replslot",
+    "pg_tblspc",
+    "pg_stat",
+    "pg_stat_tmp",
+    "pg_xact",
+    "pg_logical",
+    "pg_logical/snapshots",
+    "pg_logical/mappings",
+];
+
 pub fn bkpimg_is_compressed(bimg_info: u8) -> bool {
     (bimg_info & BKPIMAGE_IS_COMPRESSED) != 0
 }
diff --git a/libs/postgres_ffi/src/pg_constants_v15.rs b/libs/postgres_ffi/src/pg_constants_v15.rs
index 626a23c7ea..3cd1b7aec5 100644
--- a/libs/postgres_ffi/src/pg_constants_v15.rs
+++ b/libs/postgres_ffi/src/pg_constants_v15.rs
@@ -11,6 +11,8 @@ pub const BKPIMAGE_COMPRESS_ZSTD: u8 = 0x10; /* page image is compressed */
 
 pub const SIZEOF_RELMAPFILE: usize = 512; /* sizeof(RelMapFile) in relmapper.c */
 
+pub use super::super::v14::bindings::PGDATA_SUBDIRS;
+
 pub fn bkpimg_is_compressed(bimg_info: u8) -> bool {
     const ANY_COMPRESS_FLAG: u8 = BKPIMAGE_COMPRESS_PGLZ | BKPIMAGE_COMPRESS_LZ4 | BKPIMAGE_COMPRESS_ZSTD;
 
diff --git a/libs/postgres_ffi/src/pg_constants_v16.rs b/libs/postgres_ffi/src/pg_constants_v16.rs
index 587be71cb3..31bd5b68fd 100644
--- a/libs/postgres_ffi/src/pg_constants_v16.rs
+++ b/libs/postgres_ffi/src/pg_constants_v16.rs
@@ -11,6 +11,8 @@ pub const BKPIMAGE_COMPRESS_ZSTD: u8 = 0x10; /* page image is compressed */
 
 pub const SIZEOF_RELMAPFILE: usize = 524; /* sizeof(RelMapFile) in relmapper.c */
 
+pub use super::super::v14::bindings::PGDATA_SUBDIRS;
+
 pub fn bkpimg_is_compressed(bimg_info: u8) -> bool {
     const ANY_COMPRESS_FLAG: u8 = BKPIMAGE_COMPRESS_PGLZ | BKPIMAGE_COMPRESS_LZ4 | BKPIMAGE_COMPRESS_ZSTD;
 
diff --git a/libs/postgres_ffi/src/pg_constants_v17.rs b/libs/postgres_ffi/src/pg_constants_v17.rs
new file mode 100644
index 0000000000..2132938680
--- /dev/null
+++ b/libs/postgres_ffi/src/pg_constants_v17.rs
@@ -0,0 +1,55 @@
+pub const XACT_XINFO_HAS_DROPPED_STATS: u32 = 1u32 << 8;
+
+pub const XLOG_DBASE_CREATE_FILE_COPY: u8 = 0x00;
+pub const XLOG_DBASE_CREATE_WAL_LOG: u8 = 0x10;
+pub const XLOG_DBASE_DROP: u8 = 0x20;
+
+pub const BKPIMAGE_APPLY: u8 = 0x02; /* page image should be restored during replay */
+pub const BKPIMAGE_COMPRESS_PGLZ: u8 = 0x04; /* page image is compressed */
+pub const BKPIMAGE_COMPRESS_LZ4: u8 = 0x08; /* page image is compressed */
+pub const BKPIMAGE_COMPRESS_ZSTD: u8 = 0x10; /* page image is compressed */
+
+pub const SIZEOF_RELMAPFILE: usize = 524; /* sizeof(RelMapFile) in relmapper.c */
+
+// List of subdirectories inside pgdata.
+// Copied from src/bin/initdb/initdb.c
+pub const PGDATA_SUBDIRS: [&str; 23] = [
+    "global",
+    "pg_wal/archive_status",
+    "pg_wal/summaries",
+    "pg_commit_ts",
+    "pg_dynshmem",
+    "pg_notify",
+    "pg_serial",
+    "pg_snapshots",
+    "pg_subtrans",
+    "pg_twophase",
+    "pg_multixact",
+    "pg_multixact/members",
+    "pg_multixact/offsets",
+    "base",
+    "base/1",
+    "pg_replslot",
+    "pg_tblspc",
+    "pg_stat",
+    "pg_stat_tmp",
+    "pg_xact",
+    "pg_logical",
+    "pg_logical/snapshots",
+    "pg_logical/mappings",
+];
+
+pub fn bkpimg_is_compressed(bimg_info: u8) -> bool {
+    const ANY_COMPRESS_FLAG: u8 = BKPIMAGE_COMPRESS_PGLZ | BKPIMAGE_COMPRESS_LZ4 | BKPIMAGE_COMPRESS_ZSTD;
+
+    (bimg_info & ANY_COMPRESS_FLAG) != 0
+}
+
+
+pub const XLOG_HEAP2_PRUNE_ON_ACCESS: u8 = 0x10;
+pub const XLOG_HEAP2_PRUNE_VACUUM_SCAN: u8 = 0x20;
+pub const XLOG_HEAP2_PRUNE_VACUUM_CLEANUP: u8 = 0x30;
+
+
+pub const XLOG_OVERWRITE_CONTRECORD: u8 = 0xD0;
+pub const XLOG_CHECKPOINT_REDO: u8 = 0xE0;
diff --git a/libs/postgres_ffi/wal_craft/src/lib.rs b/libs/postgres_ffi/wal_craft/src/lib.rs
index 6052f04d11..949e3f4251 100644
--- a/libs/postgres_ffi/wal_craft/src/lib.rs
+++ b/libs/postgres_ffi/wal_craft/src/lib.rs
@@ -53,7 +53,7 @@ impl Conf {
 
         #[allow(clippy::manual_range_patterns)]
         match self.pg_version {
-            14 | 15 | 16 => Ok(path.join(format!("v{}", self.pg_version))),
+            14 | 15 | 16 | 17 => Ok(path.join(format!("v{}", self.pg_version))),
             _ => bail!("Unsupported postgres version: {}", self.pg_version),
         }
     }
diff --git a/libs/walproposer/build.rs b/libs/walproposer/build.rs
index 28547f52bf..3f549889b8 100644
--- a/libs/walproposer/build.rs
+++ b/libs/walproposer/build.rs
@@ -5,6 +5,8 @@ use std::{env, path::PathBuf, process::Command};
 
 use anyhow::{anyhow, Context};
 
+const WALPROPOSER_PG_VERSION: &str = "v17";
+
 fn main() -> anyhow::Result<()> {
     // Tell cargo to invalidate the built crate whenever the wrapper changes
     println!("cargo:rerun-if-changed=bindgen_deps.h");
@@ -36,7 +38,10 @@ fn main() -> anyhow::Result<()> {
     // Rebuild crate when libwalproposer.a changes
     println!("cargo:rerun-if-changed={walproposer_lib_search_str}/libwalproposer.a");
 
-    let pg_config_bin = pg_install_abs.join("v16").join("bin").join("pg_config");
+    let pg_config_bin = pg_install_abs
+        .join(WALPROPOSER_PG_VERSION)
+        .join("bin")
+        .join("pg_config");
     let inc_server_path: String = if pg_config_bin.exists() {
         let output = Command::new(pg_config_bin)
             .arg("--includedir-server")
@@ -53,7 +58,7 @@ fn main() -> anyhow::Result<()> {
             .into()
     } else {
         let server_path = pg_install_abs
-            .join("v16")
+            .join(WALPROPOSER_PG_VERSION)
             .join("include")
             .join("postgresql")
             .join("server")
diff --git a/pageserver/ctl/src/layer_map_analyzer.rs b/pageserver/ctl/src/layer_map_analyzer.rs
index a07107753e..adc090823d 100644
--- a/pageserver/ctl/src/layer_map_analyzer.rs
+++ b/pageserver/ctl/src/layer_map_analyzer.rs
@@ -79,16 +79,24 @@ pub(crate) fn parse_filename(name: &str) -> Option<LayerFile> {
         return None;
     }
     let keys: Vec<&str> = split[0].split('-').collect();
-    let mut lsns: Vec<&str> = split[1].split('-').collect();
-    let is_delta = if lsns.len() == 1 {
-        lsns.push(lsns[0]);
+    let lsn_and_opt_generation: Vec<&str> = split[1].split('v').collect();
+    let lsns: Vec<&str> = lsn_and_opt_generation[0].split('-').collect();
+    let the_lsns: [&str; 2];
+
+    /*
+     * Generations add a -vX-XXXXXX postfix, which causes issues when we try to
+     * parse 'vX' as an LSN.
+     */
+    let is_delta = if lsns.len() == 1 || lsns[1].is_empty() {
+        the_lsns = [lsns[0], lsns[0]];
         false
     } else {
+        the_lsns = [lsns[0], lsns[1]];
         true
     };
 
     let key_range = Key::from_hex(keys[0]).unwrap()..Key::from_hex(keys[1]).unwrap();
-    let lsn_range = Lsn::from_hex(lsns[0]).unwrap()..Lsn::from_hex(lsns[1]).unwrap();
+    let lsn_range = Lsn::from_hex(the_lsns[0]).unwrap()..Lsn::from_hex(the_lsns[1]).unwrap();
     let holes = Vec::new();
     Some(LayerFile {
         key_range,
diff --git a/pageserver/src/basebackup.rs b/pageserver/src/basebackup.rs
index 207f781e1b..a32d09f3b3 100644
--- a/pageserver/src/basebackup.rs
+++ b/pageserver/src/basebackup.rs
@@ -30,9 +30,8 @@ use pageserver_api::reltag::{RelTag, SlruKind};
 
 use postgres_ffi::dispatch_pgversion;
 use postgres_ffi::pg_constants::{DEFAULTTABLESPACE_OID, GLOBALTABLESPACE_OID};
-use postgres_ffi::pg_constants::{PGDATA_SPECIAL_FILES, PGDATA_SUBDIRS, PG_HBA};
+use postgres_ffi::pg_constants::{PGDATA_SPECIAL_FILES, PG_HBA};
 use postgres_ffi::relfile_utils::{INIT_FORKNUM, MAIN_FORKNUM};
-use postgres_ffi::TransactionId;
 use postgres_ffi::XLogFileName;
 use postgres_ffi::PG_TLI;
 use postgres_ffi::{BLCKSZ, RELSEG_SIZE, WAL_SEGMENT_SIZE};
@@ -255,8 +254,11 @@ where
 
         let lazy_slru_download = self.timeline.get_lazy_slru_download() && !self.full_backup;
 
+        let pgversion = self.timeline.pg_version;
+        let subdirs = dispatch_pgversion!(pgversion, &pgv::bindings::PGDATA_SUBDIRS[..]);
+
         // Create pgdata subdirs structure
-        for dir in PGDATA_SUBDIRS.iter() {
+        for dir in subdirs.iter() {
             let header = new_tar_header_dir(dir)?;
             self.ar
                 .append(&header, &mut io::empty())
@@ -606,7 +608,7 @@ where
     //
     // Extract twophase state files
     //
-    async fn add_twophase_file(&mut self, xid: TransactionId) -> Result<(), BasebackupError> {
+    async fn add_twophase_file(&mut self, xid: u64) -> Result<(), BasebackupError> {
         let img = self
             .timeline
             .get_twophase_file(xid, self.lsn, self.ctx)
@@ -617,7 +619,11 @@ where
         buf.extend_from_slice(&img[..]);
         let crc = crc32c::crc32c(&img[..]);
         buf.put_u32_le(crc);
-        let path = format!("pg_twophase/{:>08X}", xid);
+        let path = if self.timeline.pg_version < 17 {
+            format!("pg_twophase/{:>08X}", xid)
+        } else {
+            format!("pg_twophase/{:>016X}", xid)
+        };
         let header = new_tar_header(&path, buf.len() as u64)?;
         self.ar
             .append(&header, &buf[..])
diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs
index 29a98855d3..e9f197ec2d 100644
--- a/pageserver/src/config.rs
+++ b/pageserver/src/config.rs
@@ -281,7 +281,7 @@ impl PageServerConf {
 
         #[allow(clippy::manual_range_patterns)]
         match pg_version {
-            14 | 15 | 16 => Ok(path.join(format!("v{pg_version}"))),
+            14 | 15 | 16 | 17 => Ok(path.join(format!("v{pg_version}"))),
             _ => bail!("Unsupported postgres version: {}", pg_version),
         }
     }
diff --git a/pageserver/src/import_datadir.rs b/pageserver/src/import_datadir.rs
index 5a0894cd1b..ca87f1d080 100644
--- a/pageserver/src/import_datadir.rs
+++ b/pageserver/src/import_datadir.rs
@@ -580,9 +580,11 @@ async fn import_file(
         import_slru(modification, slru, file_path, reader, len, ctx).await?;
         debug!("imported multixact members slru");
     } else if file_path.starts_with("pg_twophase") {
-        let xid = u32::from_str_radix(file_name.as_ref(), 16)?;
-
         let bytes = read_all_bytes(reader).await?;
+
+        // In PostgreSQL v17, this is a 64-bit FullTransactionid. In previous versions,
+        // it's a 32-bit TransactionId, which fits in u64 anyway.
+        let xid = u64::from_str_radix(file_name.as_ref(), 16)?;
         modification
             .put_twophase_file(xid, Bytes::copy_from_slice(&bytes[..]), ctx)
             .await?;
diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs
index 6dd8851b13..5f8766ca2c 100644
--- a/pageserver/src/pgdatadir_mapping.rs
+++ b/pageserver/src/pgdatadir_mapping.rs
@@ -633,7 +633,7 @@ impl Timeline {
 
     pub(crate) async fn get_twophase_file(
         &self,
-        xid: TransactionId,
+        xid: u64,
         lsn: Lsn,
         ctx: &RequestContext,
     ) -> Result<Bytes, PageReconstructError> {
@@ -646,11 +646,19 @@ impl Timeline {
         &self,
         lsn: Lsn,
         ctx: &RequestContext,
-    ) -> Result<HashSet<TransactionId>, PageReconstructError> {
+    ) -> Result<HashSet<u64>, PageReconstructError> {
         // fetch directory entry
         let buf = self.get(TWOPHASEDIR_KEY, lsn, ctx).await?;
 
-        Ok(TwoPhaseDirectory::des(&buf)?.xids)
+        if self.pg_version >= 17 {
+            Ok(TwoPhaseDirectoryV17::des(&buf)?.xids)
+        } else {
+            Ok(TwoPhaseDirectory::des(&buf)?
+                .xids
+                .iter()
+                .map(|x| u64::from(*x))
+                .collect())
+        }
     }
 
     pub(crate) async fn get_control_file(
@@ -902,9 +910,13 @@ impl Timeline {
 
         // Then pg_twophase
         result.add_key(TWOPHASEDIR_KEY);
-        let buf = self.get(TWOPHASEDIR_KEY, lsn, ctx).await?;
-        let twophase_dir = TwoPhaseDirectory::des(&buf)?;
-        let mut xids: Vec<TransactionId> = twophase_dir.xids.iter().cloned().collect();
+
+        let mut xids: Vec<u64> = self
+            .list_twophase_files(lsn, ctx)
+            .await?
+            .iter()
+            .cloned()
+            .collect();
         xids.sort_unstable();
         for xid in xids {
             result.add_key(twophase_file_key(xid));
@@ -1127,9 +1139,15 @@ impl<'a> DatadirModification<'a> {
         // Create AuxFilesDirectory
         self.init_aux_dir()?;
 
-        let buf = TwoPhaseDirectory::ser(&TwoPhaseDirectory {
-            xids: HashSet::new(),
-        })?;
+        let buf = if self.tline.pg_version >= 17 {
+            TwoPhaseDirectoryV17::ser(&TwoPhaseDirectoryV17 {
+                xids: HashSet::new(),
+            })
+        } else {
+            TwoPhaseDirectory::ser(&TwoPhaseDirectory {
+                xids: HashSet::new(),
+            })
+        }?;
         self.pending_directory_entries
             .push((DirectoryKind::TwoPhase, 0));
         self.put(TWOPHASEDIR_KEY, Value::Image(buf.into()));
@@ -1321,22 +1339,31 @@ impl<'a> DatadirModification<'a> {
 
     pub async fn put_twophase_file(
         &mut self,
-        xid: TransactionId,
+        xid: u64,
         img: Bytes,
         ctx: &RequestContext,
     ) -> anyhow::Result<()> {
         // Add it to the directory entry
-        let buf = self.get(TWOPHASEDIR_KEY, ctx).await?;
-        let mut dir = TwoPhaseDirectory::des(&buf)?;
-        if !dir.xids.insert(xid) {
-            anyhow::bail!("twophase file for xid {} already exists", xid);
-        }
-        self.pending_directory_entries
-            .push((DirectoryKind::TwoPhase, dir.xids.len()));
-        self.put(
-            TWOPHASEDIR_KEY,
-            Value::Image(Bytes::from(TwoPhaseDirectory::ser(&dir)?)),
-        );
+        let dirbuf = self.get(TWOPHASEDIR_KEY, ctx).await?;
+        let newdirbuf = if self.tline.pg_version >= 17 {
+            let mut dir = TwoPhaseDirectoryV17::des(&dirbuf)?;
+            if !dir.xids.insert(xid) {
+                anyhow::bail!("twophase file for xid {} already exists", xid);
+            }
+            self.pending_directory_entries
+                .push((DirectoryKind::TwoPhase, dir.xids.len()));
+            Bytes::from(TwoPhaseDirectoryV17::ser(&dir)?)
+        } else {
+            let xid = xid as u32;
+            let mut dir = TwoPhaseDirectory::des(&dirbuf)?;
+            if !dir.xids.insert(xid) {
+                anyhow::bail!("twophase file for xid {} already exists", xid);
+            }
+            self.pending_directory_entries
+                .push((DirectoryKind::TwoPhase, dir.xids.len()));
+            Bytes::from(TwoPhaseDirectory::ser(&dir)?)
+        };
+        self.put(TWOPHASEDIR_KEY, Value::Image(newdirbuf));
 
         self.put(twophase_file_key(xid), Value::Image(img));
         Ok(())
@@ -1639,22 +1666,32 @@ impl<'a> DatadirModification<'a> {
     /// This method is used for marking truncated SLRU files
     pub async fn drop_twophase_file(
         &mut self,
-        xid: TransactionId,
+        xid: u64,
         ctx: &RequestContext,
     ) -> anyhow::Result<()> {
         // Remove it from the directory entry
         let buf = self.get(TWOPHASEDIR_KEY, ctx).await?;
-        let mut dir = TwoPhaseDirectory::des(&buf)?;
+        let newdirbuf = if self.tline.pg_version >= 17 {
+            let mut dir = TwoPhaseDirectoryV17::des(&buf)?;
 
-        if !dir.xids.remove(&xid) {
-            warn!("twophase file for xid {} does not exist", xid);
-        }
-        self.pending_directory_entries
-            .push((DirectoryKind::TwoPhase, dir.xids.len()));
-        self.put(
-            TWOPHASEDIR_KEY,
-            Value::Image(Bytes::from(TwoPhaseDirectory::ser(&dir)?)),
-        );
+            if !dir.xids.remove(&xid) {
+                warn!("twophase file for xid {} does not exist", xid);
+            }
+            self.pending_directory_entries
+                .push((DirectoryKind::TwoPhase, dir.xids.len()));
+            Bytes::from(TwoPhaseDirectoryV17::ser(&dir)?)
+        } else {
+            let xid: u32 = u32::try_from(xid)?;
+            let mut dir = TwoPhaseDirectory::des(&buf)?;
+
+            if !dir.xids.remove(&xid) {
+                warn!("twophase file for xid {} does not exist", xid);
+            }
+            self.pending_directory_entries
+                .push((DirectoryKind::TwoPhase, dir.xids.len()));
+            Bytes::from(TwoPhaseDirectory::ser(&dir)?)
+        };
+        self.put(TWOPHASEDIR_KEY, Value::Image(newdirbuf));
 
         // Delete it
         self.delete(twophase_key_range(xid));
@@ -2124,11 +2161,21 @@ struct DbDirectory {
     dbdirs: HashMap<(Oid, Oid), bool>,
 }
 
+// The format of TwoPhaseDirectory changed in PostgreSQL v17, because the filenames of
+// pg_twophase files was expanded from 32-bit XIDs to 64-bit XIDs.  Previously, the files
+// were named like "pg_twophase/000002E5", now they're like
+// "pg_twophsae/0000000A000002E4".
+
 #[derive(Debug, Serialize, Deserialize)]
 struct TwoPhaseDirectory {
     xids: HashSet<TransactionId>,
 }
 
+#[derive(Debug, Serialize, Deserialize)]
+struct TwoPhaseDirectoryV17 {
+    xids: HashSet<u64>,
+}
+
 #[derive(Debug, Serialize, Deserialize, Default)]
 struct RelDirectory {
     // Set of relations that exist. (relfilenode, forknum)
diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs
index 6e15ad81c3..229c01a681 100644
--- a/pageserver/src/walingest.rs
+++ b/pageserver/src/walingest.rs
@@ -237,6 +237,26 @@ impl WalIngest {
                                 .await?;
                         }
                     }
+                } else if pg_version == 17 {
+                    if info == postgres_ffi::v17::bindings::XLOG_DBASE_CREATE_WAL_LOG {
+                        debug!("XLOG_DBASE_CREATE_WAL_LOG: noop");
+                    } else if info == postgres_ffi::v17::bindings::XLOG_DBASE_CREATE_FILE_COPY {
+                        // The XLOG record was renamed between v14 and v15,
+                        // but the record format is the same.
+                        // So we can reuse XlCreateDatabase here.
+                        debug!("XLOG_DBASE_CREATE_FILE_COPY");
+                        let createdb = XlCreateDatabase::decode(&mut buf);
+                        self.ingest_xlog_dbase_create(modification, &createdb, ctx)
+                            .await?;
+                    } else if info == postgres_ffi::v17::bindings::XLOG_DBASE_DROP {
+                        let dropdb = XlDropDatabase::decode(&mut buf);
+                        for tablespace_id in dropdb.tablespace_ids {
+                            trace!("Drop db {}, {}", tablespace_id, dropdb.db_id);
+                            modification
+                                .drop_dbdir(tablespace_id, dropdb.db_id, ctx)
+                                .await?;
+                        }
+                    }
                 }
             }
             pg_constants::RM_TBLSPC_ID => {
@@ -246,7 +266,11 @@ impl WalIngest {
                 let info = decoded.xl_info & !pg_constants::XLR_INFO_MASK;
 
                 if info == pg_constants::CLOG_ZEROPAGE {
-                    let pageno = buf.get_u32_le();
+                    let pageno = if pg_version < 17 {
+                        buf.get_u32_le()
+                    } else {
+                        buf.get_u64_le() as u32
+                    };
                     let segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT;
                     let rpageno = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT;
                     self.put_slru_page_image(
@@ -260,7 +284,7 @@ impl WalIngest {
                     .await?;
                 } else {
                     assert!(info == pg_constants::CLOG_TRUNCATE);
-                    let xlrec = XlClogTruncate::decode(&mut buf);
+                    let xlrec = XlClogTruncate::decode(&mut buf, pg_version);
                     self.ingest_clog_truncate_record(modification, &xlrec, ctx)
                         .await?;
                 }
@@ -299,12 +323,21 @@ impl WalIngest {
                         parsed_xact.xid,
                         lsn,
                     );
-                    modification
-                        .drop_twophase_file(parsed_xact.xid, ctx)
-                        .await?;
+
+                    let xid: u64 = if pg_version >= 17 {
+                        self.adjust_to_full_transaction_id(parsed_xact.xid)?
+                    } else {
+                        parsed_xact.xid as u64
+                    };
+                    modification.drop_twophase_file(xid, ctx).await?;
                 } else if info == pg_constants::XLOG_XACT_PREPARE {
+                    let xid: u64 = if pg_version >= 17 {
+                        self.adjust_to_full_transaction_id(decoded.xl_xid)?
+                    } else {
+                        decoded.xl_xid as u64
+                    };
                     modification
-                        .put_twophase_file(decoded.xl_xid, Bytes::copy_from_slice(&buf[..]), ctx)
+                        .put_twophase_file(xid, Bytes::copy_from_slice(&buf[..]), ctx)
                         .await?;
                 }
             }
@@ -312,7 +345,11 @@ impl WalIngest {
                 let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK;
 
                 if info == pg_constants::XLOG_MULTIXACT_ZERO_OFF_PAGE {
-                    let pageno = buf.get_u32_le();
+                    let pageno = if pg_version < 17 {
+                        buf.get_u32_le()
+                    } else {
+                        buf.get_u64_le() as u32
+                    };
                     let segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT;
                     let rpageno = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT;
                     self.put_slru_page_image(
@@ -325,7 +362,11 @@ impl WalIngest {
                     )
                     .await?;
                 } else if info == pg_constants::XLOG_MULTIXACT_ZERO_MEM_PAGE {
-                    let pageno = buf.get_u32_le();
+                    let pageno = if pg_version < 17 {
+                        buf.get_u32_le()
+                    } else {
+                        buf.get_u64_le() as u32
+                    };
                     let segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT;
                     let rpageno = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT;
                     self.put_slru_page_image(
@@ -354,6 +395,20 @@ impl WalIngest {
             pg_constants::RM_XLOG_ID => {
                 let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK;
 
+                if info == pg_constants::XLOG_PARAMETER_CHANGE {
+                    if let CheckPoint::V17(cp) = &mut self.checkpoint {
+                        let rec = v17::XlParameterChange::decode(&mut buf);
+                        cp.wal_level = rec.wal_level;
+                        self.checkpoint_modified = true;
+                    }
+                } else if info == pg_constants::XLOG_END_OF_RECOVERY {
+                    if let CheckPoint::V17(cp) = &mut self.checkpoint {
+                        let rec = v17::XlEndOfRecovery::decode(&mut buf);
+                        cp.wal_level = rec.wal_level;
+                        self.checkpoint_modified = true;
+                    }
+                }
+
                 enum_pgversion_dispatch!(&mut self.checkpoint, CheckPoint, cp, {
                     if info == pg_constants::XLOG_NEXTOID {
                         let next_oid = buf.get_u32_le();
@@ -397,12 +452,24 @@ impl WalIngest {
                         if xlog_checkpoint.oldestActiveXid == pg_constants::INVALID_TRANSACTION_ID
                             && info == pg_constants::XLOG_CHECKPOINT_SHUTDOWN
                         {
-                            let mut oldest_active_xid = cp.nextXid.value as u32;
-                            for xid in modification.tline.list_twophase_files(lsn, ctx).await? {
-                                if (xid.wrapping_sub(oldest_active_xid) as i32) < 0 {
-                                    oldest_active_xid = xid;
+                            let oldest_active_xid = if pg_version >= 17 {
+                                let mut oldest_active_full_xid = cp.nextXid.value;
+                                for xid in modification.tline.list_twophase_files(lsn, ctx).await? {
+                                    if xid < oldest_active_full_xid {
+                                        oldest_active_full_xid = xid;
+                                    }
                                 }
-                            }
+                                oldest_active_full_xid as u32
+                            } else {
+                                let mut oldest_active_xid = cp.nextXid.value as u32;
+                                for xid in modification.tline.list_twophase_files(lsn, ctx).await? {
+                                    let narrow_xid = xid as u32;
+                                    if (narrow_xid.wrapping_sub(oldest_active_xid) as i32) < 0 {
+                                        oldest_active_xid = narrow_xid;
+                                    }
+                                }
+                                oldest_active_xid
+                            };
                             cp.oldestActiveXid = oldest_active_xid;
                         } else {
                             cp.oldestActiveXid = xlog_checkpoint.oldestActiveXid;
@@ -515,6 +582,25 @@ impl WalIngest {
         Ok(modification.len() > prev_len)
     }
 
+    /// This is the same as AdjustToFullTransactionId(xid) in PostgreSQL
+    fn adjust_to_full_transaction_id(&self, xid: TransactionId) -> Result<u64> {
+        let next_full_xid =
+            enum_pgversion_dispatch!(&self.checkpoint, CheckPoint, cp, { cp.nextXid.value });
+
+        let next_xid = (next_full_xid) as u32;
+        let mut epoch = (next_full_xid >> 32) as u32;
+
+        if xid > next_xid {
+            // Wraparound occurred, must be from a prev epoch.
+            if epoch == 0 {
+                bail!("apparent XID wraparound with prepared transaction XID {xid}, nextXid is {next_full_xid}");
+            }
+            epoch -= 1;
+        }
+
+        Ok((epoch as u64) << 32 | xid as u64)
+    }
+
     /// Do not store this block, but observe it for the purposes of updating our relation size state.
     async fn observe_decoded_block(
         &mut self,
@@ -815,6 +901,73 @@ impl WalIngest {
                     bail!("Unknown RMGR {} for Heap decoding", decoded.xl_rmid);
                 }
             }
+            17 => {
+                if decoded.xl_rmid == pg_constants::RM_HEAP_ID {
+                    let info = decoded.xl_info & pg_constants::XLOG_HEAP_OPMASK;
+
+                    if info == pg_constants::XLOG_HEAP_INSERT {
+                        let xlrec = v17::XlHeapInsert::decode(buf);
+                        assert_eq!(0, buf.remaining());
+                        if (xlrec.flags & pg_constants::XLH_INSERT_ALL_VISIBLE_CLEARED) != 0 {
+                            new_heap_blkno = Some(decoded.blocks[0].blkno);
+                        }
+                    } else if info == pg_constants::XLOG_HEAP_DELETE {
+                        let xlrec = v17::XlHeapDelete::decode(buf);
+                        if (xlrec.flags & pg_constants::XLH_DELETE_ALL_VISIBLE_CLEARED) != 0 {
+                            new_heap_blkno = Some(decoded.blocks[0].blkno);
+                        }
+                    } else if info == pg_constants::XLOG_HEAP_UPDATE
+                        || info == pg_constants::XLOG_HEAP_HOT_UPDATE
+                    {
+                        let xlrec = v17::XlHeapUpdate::decode(buf);
+                        // the size of tuple data is inferred from the size of the record.
+                        // we can't validate the remaining number of bytes without parsing
+                        // the tuple data.
+                        if (xlrec.flags & pg_constants::XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED) != 0 {
+                            old_heap_blkno = Some(decoded.blocks.last().unwrap().blkno);
+                        }
+                        if (xlrec.flags & pg_constants::XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED) != 0 {
+                            // PostgreSQL only uses XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED on a
+                            // non-HOT update where the new tuple goes to different page than
+                            // the old one. Otherwise, only XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED is
+                            // set.
+                            new_heap_blkno = Some(decoded.blocks[0].blkno);
+                        }
+                    } else if info == pg_constants::XLOG_HEAP_LOCK {
+                        let xlrec = v17::XlHeapLock::decode(buf);
+                        if (xlrec.flags & pg_constants::XLH_LOCK_ALL_FROZEN_CLEARED) != 0 {
+                            old_heap_blkno = Some(decoded.blocks[0].blkno);
+                            flags = pg_constants::VISIBILITYMAP_ALL_FROZEN;
+                        }
+                    }
+                } else if decoded.xl_rmid == pg_constants::RM_HEAP2_ID {
+                    let info = decoded.xl_info & pg_constants::XLOG_HEAP_OPMASK;
+                    if info == pg_constants::XLOG_HEAP2_MULTI_INSERT {
+                        let xlrec = v17::XlHeapMultiInsert::decode(buf);
+
+                        let offset_array_len =
+                            if decoded.xl_info & pg_constants::XLOG_HEAP_INIT_PAGE > 0 {
+                                // the offsets array is omitted if XLOG_HEAP_INIT_PAGE is set
+                                0
+                            } else {
+                                size_of::<u16>() * xlrec.ntuples as usize
+                            };
+                        assert_eq!(offset_array_len, buf.remaining());
+
+                        if (xlrec.flags & pg_constants::XLH_INSERT_ALL_VISIBLE_CLEARED) != 0 {
+                            new_heap_blkno = Some(decoded.blocks[0].blkno);
+                        }
+                    } else if info == pg_constants::XLOG_HEAP2_LOCK_UPDATED {
+                        let xlrec = v17::XlHeapLockUpdated::decode(buf);
+                        if (xlrec.flags & pg_constants::XLH_LOCK_ALL_FROZEN_CLEARED) != 0 {
+                            old_heap_blkno = Some(decoded.blocks[0].blkno);
+                            flags = pg_constants::VISIBILITYMAP_ALL_FROZEN;
+                        }
+                    }
+                } else {
+                    bail!("Unknown RMGR {} for Heap decoding", decoded.xl_rmid);
+                }
+            }
             _ => {}
         }
 
@@ -923,26 +1076,26 @@ impl WalIngest {
         assert_eq!(decoded.xl_rmid, pg_constants::RM_NEON_ID);
 
         match pg_version {
-            16 => {
+            16 | 17 => {
                 let info = decoded.xl_info & pg_constants::XLOG_HEAP_OPMASK;
 
                 match info {
                     pg_constants::XLOG_NEON_HEAP_INSERT => {
-                        let xlrec = v16::rm_neon::XlNeonHeapInsert::decode(buf);
+                        let xlrec = v17::rm_neon::XlNeonHeapInsert::decode(buf);
                         assert_eq!(0, buf.remaining());
                         if (xlrec.flags & pg_constants::XLH_INSERT_ALL_VISIBLE_CLEARED) != 0 {
                             new_heap_blkno = Some(decoded.blocks[0].blkno);
                         }
                     }
                     pg_constants::XLOG_NEON_HEAP_DELETE => {
-                        let xlrec = v16::rm_neon::XlNeonHeapDelete::decode(buf);
+                        let xlrec = v17::rm_neon::XlNeonHeapDelete::decode(buf);
                         if (xlrec.flags & pg_constants::XLH_DELETE_ALL_VISIBLE_CLEARED) != 0 {
                             new_heap_blkno = Some(decoded.blocks[0].blkno);
                         }
                     }
                     pg_constants::XLOG_NEON_HEAP_UPDATE
                     | pg_constants::XLOG_NEON_HEAP_HOT_UPDATE => {
-                        let xlrec = v16::rm_neon::XlNeonHeapUpdate::decode(buf);
+                        let xlrec = v17::rm_neon::XlNeonHeapUpdate::decode(buf);
                         // the size of tuple data is inferred from the size of the record.
                         // we can't validate the remaining number of bytes without parsing
                         // the tuple data.
@@ -958,7 +1111,7 @@ impl WalIngest {
                         }
                     }
                     pg_constants::XLOG_NEON_HEAP_MULTI_INSERT => {
-                        let xlrec = v16::rm_neon::XlNeonHeapMultiInsert::decode(buf);
+                        let xlrec = v17::rm_neon::XlNeonHeapMultiInsert::decode(buf);
 
                         let offset_array_len =
                             if decoded.xl_info & pg_constants::XLOG_HEAP_INIT_PAGE > 0 {
@@ -974,7 +1127,7 @@ impl WalIngest {
                         }
                     }
                     pg_constants::XLOG_NEON_HEAP_LOCK => {
-                        let xlrec = v16::rm_neon::XlNeonHeapLock::decode(buf);
+                        let xlrec = v17::rm_neon::XlNeonHeapLock::decode(buf);
                         if (xlrec.flags & pg_constants::XLH_LOCK_ALL_FROZEN_CLEARED) != 0 {
                             old_heap_blkno = Some(decoded.blocks[0].blkno);
                             flags = pg_constants::VISIBILITYMAP_ALL_FROZEN;
diff --git a/pageserver/src/walrecord.rs b/pageserver/src/walrecord.rs
index 0c4d575de8..dd199e2c55 100644
--- a/pageserver/src/walrecord.rs
+++ b/pageserver/src/walrecord.rs
@@ -174,6 +174,7 @@ impl DecodedWALRecord {
                 }
                 15 => info == postgres_ffi::v15::bindings::XLOG_DBASE_CREATE_FILE_COPY,
                 16 => info == postgres_ffi::v16::bindings::XLOG_DBASE_CREATE_FILE_COPY,
+                17 => info == postgres_ffi::v17::bindings::XLOG_DBASE_CREATE_FILE_COPY,
                 _ => {
                     panic!("Unsupported postgres version {pg_version}")
                 }
@@ -341,16 +342,47 @@ pub mod v14 {
             }
         }
     }
+
+    #[repr(C)]
+    #[derive(Debug)]
+    pub struct XlParameterChange {
+        pub max_connections: i32,
+        pub max_worker_processes: i32,
+        pub max_wal_senders: i32,
+        pub max_prepared_xacts: i32,
+        pub max_locks_per_xact: i32,
+        pub wal_level: i32,
+        pub wal_log_hints: bool,
+        pub track_commit_timestamp: bool,
+        pub _padding: [u8; 2],
+    }
+
+    impl XlParameterChange {
+        pub fn decode(buf: &mut Bytes) -> XlParameterChange {
+            XlParameterChange {
+                max_connections: buf.get_i32_le(),
+                max_worker_processes: buf.get_i32_le(),
+                max_wal_senders: buf.get_i32_le(),
+                max_prepared_xacts: buf.get_i32_le(),
+                max_locks_per_xact: buf.get_i32_le(),
+                wal_level: buf.get_i32_le(),
+                wal_log_hints: buf.get_u8() != 0,
+                track_commit_timestamp: buf.get_u8() != 0,
+                _padding: [buf.get_u8(), buf.get_u8()],
+            }
+        }
+    }
 }
 
 pub mod v15 {
     pub use super::v14::{
         XlHeapDelete, XlHeapInsert, XlHeapLock, XlHeapLockUpdated, XlHeapMultiInsert, XlHeapUpdate,
+        XlParameterChange,
     };
 }
 
 pub mod v16 {
-    pub use super::v14::{XlHeapInsert, XlHeapLockUpdated, XlHeapMultiInsert};
+    pub use super::v14::{XlHeapInsert, XlHeapLockUpdated, XlHeapMultiInsert, XlParameterChange};
     use bytes::{Buf, Bytes};
     use postgres_ffi::{OffsetNumber, TransactionId};
 
@@ -529,6 +561,37 @@ pub mod v16 {
     }
 }
 
+pub mod v17 {
+    pub use super::v14::XlHeapLockUpdated;
+    use bytes::{Buf, Bytes};
+    pub use postgres_ffi::{TimeLineID, TimestampTz};
+
+    pub use super::v16::rm_neon;
+    pub use super::v16::{
+        XlHeapDelete, XlHeapInsert, XlHeapLock, XlHeapMultiInsert, XlHeapUpdate, XlParameterChange,
+    };
+
+    #[repr(C)]
+    #[derive(Debug)]
+    pub struct XlEndOfRecovery {
+        pub end_time: TimestampTz,
+        pub this_time_line_id: TimeLineID,
+        pub prev_time_line_id: TimeLineID,
+        pub wal_level: i32,
+    }
+
+    impl XlEndOfRecovery {
+        pub fn decode(buf: &mut Bytes) -> XlEndOfRecovery {
+            XlEndOfRecovery {
+                end_time: buf.get_i64_le(),
+                this_time_line_id: buf.get_u32_le(),
+                prev_time_line_id: buf.get_u32_le(),
+                wal_level: buf.get_i32_le(),
+            }
+        }
+    }
+}
+
 #[repr(C)]
 #[derive(Debug)]
 pub struct XlSmgrCreate {
@@ -746,9 +809,13 @@ pub struct XlClogTruncate {
 }
 
 impl XlClogTruncate {
-    pub fn decode(buf: &mut Bytes) -> XlClogTruncate {
+    pub fn decode(buf: &mut Bytes, pg_version: u32) -> XlClogTruncate {
         XlClogTruncate {
-            pageno: buf.get_u32_le(),
+            pageno: if pg_version < 17 {
+                buf.get_u32_le()
+            } else {
+                buf.get_u64_le() as u32
+            },
             oldest_xid: buf.get_u32_le(),
             oldest_xid_db: buf.get_u32_le(),
         }
diff --git a/pgxn/neon/bitmap.h b/pgxn/neon/bitmap.h
new file mode 100644
index 0000000000..0a131816ef
--- /dev/null
+++ b/pgxn/neon/bitmap.h
@@ -0,0 +1,12 @@
+#ifndef NEON_BITMAP_H
+#define NEON_BITMAP_H
+
+/*
+ * Utilities for manipulating bits8* as bitmaps.
+ */
+
+#define BITMAP_ISSET(bm, bit) ((bm)[(bit) >> 3] & (1 << ((bit) & 7)))
+#define BITMAP_SET(bm, bit) (bm)[(bit) >> 3] |= (1 << ((bit) & 7))
+#define BITMAP_CLR(bm, bit) (bm)[(bit) >> 3] &= ~(1 << ((bit) & 7))
+
+#endif //NEON_BITMAP_H
diff --git a/pgxn/neon/file_cache.c b/pgxn/neon/file_cache.c
index 479209a537..ab6739465b 100644
--- a/pgxn/neon/file_cache.c
+++ b/pgxn/neon/file_cache.c
@@ -27,6 +27,7 @@
 #include "pagestore_client.h"
 #include "common/hashfn.h"
 #include "pgstat.h"
+#include "port/pg_iovec.h"
 #include "postmaster/bgworker.h"
 #include RELFILEINFO_HDR
 #include "storage/buf_internals.h"
@@ -40,6 +41,7 @@
 #include "utils/guc.h"
 
 #include "hll.h"
+#include "bitmap.h"
 
 #define CriticalAssert(cond) do if (!(cond)) elog(PANIC, "Assertion %s failed at %s:%d: ", #cond, __FILE__, __LINE__); while (0)
 
@@ -469,6 +471,99 @@ lfc_cache_contains(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno)
 	return found;
 }
 
+/*
+ * Check if page is present in the cache.
+ * Returns true if page is found in local cache.
+ */
+int
+lfc_cache_containsv(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
+					int nblocks, bits8 *bitmap)
+{
+	BufferTag	tag;
+	FileCacheEntry *entry;
+	uint32		chunk_offs;
+	int			found = 0;
+	uint32		hash;
+	int			i = 0;
+
+	if (lfc_maybe_disabled())	/* fast exit if file cache is disabled */
+		return 0;
+
+	CopyNRelFileInfoToBufTag(tag, rinfo);
+	tag.forkNum = forkNum;
+
+	CriticalAssert(BufTagGetRelNumber(&tag) != InvalidRelFileNumber);
+
+	tag.blockNum = (blkno + i) & ~(BLOCKS_PER_CHUNK - 1);
+	hash = get_hash_value(lfc_hash, &tag);
+	chunk_offs = (blkno + i) & (BLOCKS_PER_CHUNK - 1);
+
+	LWLockAcquire(lfc_lock, LW_SHARED);
+
+	while (true)
+	{
+		int		this_chunk = Min(nblocks, BLOCKS_PER_CHUNK - chunk_offs);
+		if (LFC_ENABLED())
+		{
+			entry = hash_search_with_hash_value(lfc_hash, &tag, hash, HASH_FIND, NULL);
+
+			if (entry != NULL)
+			{
+				for (; chunk_offs < BLOCKS_PER_CHUNK && i < nblocks; chunk_offs++, i++)
+				{
+					if ((entry->bitmap[chunk_offs >> 5] & 
+						(1 << (chunk_offs & 31))) != 0)
+					{
+						BITMAP_SET(bitmap, i);
+						found++;
+					}
+				}
+			}
+			else
+			{
+				i += this_chunk;
+			}
+		}
+		else
+		{
+			return found;
+		}
+
+		/*
+		 * Break out of the iteration before doing expensive stuff for
+		 * a next iteration
+		 */
+		if (i + 1 >= nblocks)
+			break;
+
+		/*
+		 * Prepare for the next iteration. We don't unlock here, as that'd
+		 * probably be more expensive than the gains it'd get us.
+		 */
+		tag.blockNum = (blkno + i) & ~(BLOCKS_PER_CHUNK - 1);
+		hash = get_hash_value(lfc_hash, &tag);
+		chunk_offs = (blkno + i) & (BLOCKS_PER_CHUNK - 1);
+	}
+
+	LWLockRelease(lfc_lock);
+
+#if USE_ASSERT_CHECKING
+	do {
+		int count = 0;
+
+		for (int j = 0; j < nblocks; j++)
+		{
+			if (BITMAP_ISSET(bitmap, j))
+				count++;
+		}
+
+		Assert(count == found);
+	} while (false);
+#endif
+
+	return found;
+}
+
 /*
  * Evict a page (if present) from the local file cache
  */
@@ -548,91 +643,171 @@ lfc_evict(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno)
 }
 
 /*
- * Try to read page from local cache.
- * Returns true if page is found in local cache.
- * In case of error local file cache is disabled (lfc->limit is set to zero).
+ * Try to read pages from local cache.
+ * Returns the number of pages read from the local cache, and sets bits in
+ * 'read' for the pages which were read. This may scribble over buffers not
+ * marked in 'read', so be careful with operation ordering.
+ *
+ * In case of error local file cache is disabled (lfc->limit is set to zero),
+ * and -1 is returned. Note that 'read' and the buffers may be touched and in
+ * an otherwise invalid state.
+ *
+ * If the mask argument is supplied, bits will be set at the offsets of pages
+ * that were present and read from the LFC.
  */
-bool
-lfc_read(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
-		 char *buffer)
+int
+lfc_readv_select(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
+				 void **buffers, BlockNumber nblocks, bits8 *mask)
 {
 	BufferTag	tag;
 	FileCacheEntry *entry;
 	ssize_t		rc;
-	int			chunk_offs = blkno & (BLOCKS_PER_CHUNK - 1);
 	bool		result = true;
 	uint32		hash;
 	uint64		generation;
 	uint32		entry_offset;
+	int			blocks_read = 0;
+	int			buf_offset = 0;
 
 	if (lfc_maybe_disabled())	/* fast exit if file cache is disabled */
-		return false;
+		return 0;
 
 	if (!lfc_ensure_opened())
-		return false;
+		return 0;
 
 	CopyNRelFileInfoToBufTag(tag, rinfo);
 	tag.forkNum = forkNum;
-	tag.blockNum = blkno & ~(BLOCKS_PER_CHUNK - 1);
 
 	CriticalAssert(BufTagGetRelNumber(&tag) != InvalidRelFileNumber);
-	hash = get_hash_value(lfc_hash, &tag);
 
-	LWLockAcquire(lfc_lock, LW_EXCLUSIVE);
-
-	if (!LFC_ENABLED())
+	/* 
+	 * For every chunk that has blocks we're interested in, we
+	 * 1. get the chunk header
+	 * 2. Check if the chunk actually has the blocks we're interested in
+	 * 3. Read the blocks we're looking for (in one preadv), assuming they exist
+	 * 4. Update the statistics for the read call.
+	 *
+	 * If there is an error, we do an early return.
+	 */
+	while (nblocks > 0)
 	{
+		struct iovec iov[PG_IOV_MAX];
+		int		chunk_offs = blkno & (BLOCKS_PER_CHUNK - 1);
+		int		blocks_in_chunk = Min(nblocks, BLOCKS_PER_CHUNK - (blkno % BLOCKS_PER_CHUNK));
+		int		iteration_hits = 0;
+		int		iteration_misses = 0;
+		Assert(blocks_in_chunk > 0);
+
+		for (int i = 0; i < blocks_in_chunk; i++)
+		{
+			iov[i].iov_base = buffers[buf_offset + i];
+			iov[i].iov_len = BLCKSZ;
+		}
+
+		tag.blockNum = blkno - chunk_offs;
+		hash = get_hash_value(lfc_hash, &tag);
+
+		LWLockAcquire(lfc_lock, LW_EXCLUSIVE);
+
+		/* We can return the blocks we've read before LFC got disabled;
+		 * assuming we read any. */
+		if (!LFC_ENABLED())
+		{
+			LWLockRelease(lfc_lock);
+			return blocks_read;
+		}
+
+		entry = hash_search_with_hash_value(lfc_hash, &tag, hash, HASH_FIND, NULL);
+
+		/* Approximate working set for the blocks assumed in this entry */
+		for (int i = 0; i < blocks_in_chunk; i++)
+		{
+			tag.blockNum = blkno + i;
+			addSHLL(&lfc_ctl->wss_estimation, hash_bytes((uint8_t const*)&tag, sizeof(tag)));
+		}
+
+		if (entry == NULL)
+		{
+			/* Pages are not cached */
+			lfc_ctl->misses += blocks_in_chunk;
+			pgBufferUsage.file_cache.misses += blocks_in_chunk;
+			LWLockRelease(lfc_lock);
+
+			buf_offset += blocks_in_chunk;
+			nblocks -= blocks_in_chunk;
+			blkno += blocks_in_chunk;
+
+			continue;
+		}
+
+		/* Unlink entry from LRU list to pin it for the duration of IO operation */
+		if (entry->access_count++ == 0)
+			dlist_delete(&entry->list_node);
+
+		generation = lfc_ctl->generation;
+		entry_offset = entry->offset;
+
 		LWLockRelease(lfc_lock);
-		return false;
-	}
 
-	entry = hash_search_with_hash_value(lfc_hash, &tag, hash, HASH_FIND, NULL);
+		for (int i = 0; i < blocks_in_chunk; i++)
+		{
+			/*
+			 * If the page is valid, we consider it "read".
+			 * All other pages will be fetched separately by the next cache
+			 */
+			if (entry->bitmap[(chunk_offs + i) / 32] & (1 << ((chunk_offs + i) % 32)))
+			{
+				BITMAP_SET(mask, buf_offset + i);
+				iteration_hits++;
+			}
+			else
+				iteration_misses++;
+		}
 
-	/* Approximate working set */
-	tag.blockNum = blkno;
-	addSHLL(&lfc_ctl->wss_estimation, hash_bytes((uint8_t const*)&tag, sizeof(tag)));
+		Assert(iteration_hits + iteration_misses > 0);
+
+		if (iteration_hits != 0)
+		{
+			rc = preadv(lfc_desc, iov, blocks_in_chunk,
+						((off_t) entry_offset * BLOCKS_PER_CHUNK + chunk_offs) * BLCKSZ);
+
+			if (rc != (BLCKSZ * blocks_in_chunk))
+			{
+				lfc_disable("read");
+				return -1;
+			}
+		}
+
+		/* Place entry to the head of LRU list */
+		LWLockAcquire(lfc_lock, LW_EXCLUSIVE);
+
+		if (lfc_ctl->generation == generation)
+		{
+			CriticalAssert(LFC_ENABLED());
+			lfc_ctl->hits += iteration_hits;
+			lfc_ctl->misses += iteration_misses;
+			pgBufferUsage.file_cache.hits += iteration_hits;
+			pgBufferUsage.file_cache.misses += iteration_misses;
+			CriticalAssert(entry->access_count > 0);
+			if (--entry->access_count == 0)
+				dlist_push_tail(&lfc_ctl->lru, &entry->list_node);
+		}
+		else
+		{
+			/* generation mismatch, assume error condition */
+			LWLockRelease(lfc_lock);
+			return -1;
+		}
 
-	if (entry == NULL || (entry->bitmap[chunk_offs >> 5] & (1 << (chunk_offs & 31))) == 0)
-	{
-		/* Page is not cached */
-		lfc_ctl->misses += 1;
-		pgBufferUsage.file_cache.misses += 1;
 		LWLockRelease(lfc_lock);
-		return false;
-	}
-	/* Unlink entry from LRU list to pin it for the duration of IO operation */
-	if (entry->access_count++ == 0)
-		dlist_delete(&entry->list_node);
-	generation = lfc_ctl->generation;
-	entry_offset = entry->offset;
 
-	LWLockRelease(lfc_lock);
-
-	rc = pread(lfc_desc, buffer, BLCKSZ, ((off_t) entry_offset * BLOCKS_PER_CHUNK + chunk_offs) * BLCKSZ);
-	if (rc != BLCKSZ)
-	{
-		lfc_disable("read");
-		return false;
+		buf_offset += blocks_in_chunk;
+		nblocks -= blocks_in_chunk;
+		blkno += blocks_in_chunk;
+		blocks_read += iteration_hits;
 	}
 
-	/* Place entry to the head of LRU list */
-	LWLockAcquire(lfc_lock, LW_EXCLUSIVE);
-
-	if (lfc_ctl->generation == generation)
-	{
-		CriticalAssert(LFC_ENABLED());
-		lfc_ctl->hits += 1;
-		pgBufferUsage.file_cache.hits += 1;
-		CriticalAssert(entry->access_count > 0);
-		if (--entry->access_count == 0)
-			dlist_push_tail(&lfc_ctl->lru, &entry->list_node);
-	}
-	else
-		result = false;
-
-	LWLockRelease(lfc_lock);
-
-	return result;
+	return blocks_read;
 }
 
 /*
@@ -640,20 +815,17 @@ lfc_read(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
  * If cache is full then evict some other page.
  */
 void
-#if PG_MAJORVERSION_NUM < 16
-lfc_write(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, char *buffer)
-#else
-lfc_write(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, const void *buffer)
-#endif
+lfc_writev(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
+		   const void *const *buffers, BlockNumber nblocks)
 {
 	BufferTag	tag;
 	FileCacheEntry *entry;
 	ssize_t		rc;
 	bool		found;
-	int			chunk_offs = blkno & (BLOCKS_PER_CHUNK - 1);
 	uint32		hash;
 	uint64		generation;
 	uint32		entry_offset;
+	int			buf_offset = 0;
 
 	if (lfc_maybe_disabled())	/* fast exit if file cache is disabled */
 		return;
@@ -661,110 +833,142 @@ lfc_write(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, const void
 	if (!lfc_ensure_opened())
 		return;
 
-	tag.forkNum = forkNum;
-	tag.blockNum = blkno & ~(BLOCKS_PER_CHUNK - 1);
 	CopyNRelFileInfoToBufTag(tag, rinfo);
+	tag.forkNum = forkNum;
 
 	CriticalAssert(BufTagGetRelNumber(&tag) != InvalidRelFileNumber);
-	hash = get_hash_value(lfc_hash, &tag);
 
-	LWLockAcquire(lfc_lock, LW_EXCLUSIVE);
-
-	if (!LFC_ENABLED())
+	/* 
+	 * For every chunk that has blocks we're interested in, we
+	 * 1. get the chunk header
+	 * 2. Check if the chunk actually has the blocks we're interested in
+	 * 3. Read the blocks we're looking for (in one preadv), assuming they exist
+	 * 4. Update the statistics for the read call.
+	 *
+	 * If there is an error, we do an early return.
+	 */
+	while (nblocks > 0)
 	{
-		LWLockRelease(lfc_lock);
-		return;
-	}
+		struct iovec iov[PG_IOV_MAX];
+		int		chunk_offs = blkno & (BLOCKS_PER_CHUNK - 1);
+		int		blocks_in_chunk = Min(nblocks, BLOCKS_PER_CHUNK - (blkno % BLOCKS_PER_CHUNK));
+		Assert(blocks_in_chunk > 0);
 
-	entry = hash_search_with_hash_value(lfc_hash, &tag, hash, HASH_ENTER, &found);
-
-	if (found)
-	{
-		/*
-		 * Unlink entry from LRU list to pin it for the duration of IO
-		 * operation
-		 */
-		if (entry->access_count++ == 0)
-			dlist_delete(&entry->list_node);
-	}
-	else
-	{
-		/*
-		 * We have two choices if all cache pages are pinned (i.e. used in IO
-		 * operations):
-		 *
-		 * 1) Wait until some of this operation is completed and pages is
-		 * unpinned.
-		 *
-		 * 2) Allocate one more chunk, so that specified cache size is more
-		 * recommendation than hard limit.
-		 *
-		 * As far as probability of such event (that all pages are pinned) is
-		 * considered to be very very small: there are should be very large
-		 * number of concurrent IO operations and them are limited by
-		 * max_connections, we prefer not to complicate code and use second
-		 * approach.
-		 */
-		if (lfc_ctl->used >= lfc_ctl->limit && !dlist_is_empty(&lfc_ctl->lru))
+		for (int i = 0; i < blocks_in_chunk; i++)
 		{
-			/* Cache overflow: evict least recently used chunk */
-			FileCacheEntry *victim = dlist_container(FileCacheEntry, list_node, dlist_pop_head_node(&lfc_ctl->lru));
-
-			CriticalAssert(victim->access_count == 0);
-			entry->offset = victim->offset; /* grab victim's chunk */
-			hash_search_with_hash_value(lfc_hash, &victim->key, victim->hash, HASH_REMOVE, NULL);
-			neon_log(DEBUG2, "Swap file cache page");
+			iov[i].iov_base = unconstify(void *, buffers[buf_offset + i]);
+			iov[i].iov_len = BLCKSZ;
 		}
-		else if (!dlist_is_empty(&lfc_ctl->holes))
+
+		tag.blockNum = blkno & ~(BLOCKS_PER_CHUNK - 1);
+		hash = get_hash_value(lfc_hash, &tag);
+
+		LWLockAcquire(lfc_lock, LW_EXCLUSIVE);
+
+		if (!LFC_ENABLED())
 		{
-			/* We can reuse a hole that was left behind when the LFC was shrunk previously */
-			FileCacheEntry *hole = dlist_container(FileCacheEntry, list_node, dlist_pop_head_node(&lfc_ctl->holes));
-			uint32		offset = hole->offset;
-			bool		found;
+			LWLockRelease(lfc_lock);
+			return;
+		}
 
-			hash_search_with_hash_value(lfc_hash, &hole->key, hole->hash, HASH_REMOVE, &found);
-			CriticalAssert(found);
+		entry = hash_search_with_hash_value(lfc_hash, &tag, hash, HASH_ENTER, &found);
 
-			lfc_ctl->used += 1;
-			entry->offset = offset;	/* reuse the hole */
+		if (found)
+		{
+			/*
+			 * Unlink entry from LRU list to pin it for the duration of IO
+			 * operation
+			 */
+			if (entry->access_count++ == 0)
+				dlist_delete(&entry->list_node);
 		}
 		else
 		{
-			lfc_ctl->used += 1;
-			entry->offset = lfc_ctl->size++;	/* allocate new chunk at end
-												 * of file */
-		}
-		entry->access_count = 1;
-		entry->hash = hash;
-		memset(entry->bitmap, 0, sizeof entry->bitmap);
-	}
-
-	generation = lfc_ctl->generation;
-	entry_offset = entry->offset;
-	lfc_ctl->writes += 1;
-	LWLockRelease(lfc_lock);
-
-	rc = pwrite(lfc_desc, buffer, BLCKSZ, ((off_t) entry_offset * BLOCKS_PER_CHUNK + chunk_offs) * BLCKSZ);
-	if (rc != BLCKSZ)
-	{
-		lfc_disable("write");
-	}
-	else
-	{
-		LWLockAcquire(lfc_lock, LW_EXCLUSIVE);
-
-		if (lfc_ctl->generation == generation)
-		{
-			CriticalAssert(LFC_ENABLED());
-			/* Place entry to the head of LRU list */
-			CriticalAssert(entry->access_count > 0);
-			if (--entry->access_count == 0)
-				dlist_push_tail(&lfc_ctl->lru, &entry->list_node);
-
-			entry->bitmap[chunk_offs >> 5] |= (1 << (chunk_offs & 31));
+			/*
+			 * We have two choices if all cache pages are pinned (i.e. used in IO
+			 * operations):
+			 *
+			 * 1) Wait until some of this operation is completed and pages is
+			 * unpinned.
+			 *
+			 * 2) Allocate one more chunk, so that specified cache size is more
+			 * recommendation than hard limit.
+			 *
+			 * As far as probability of such event (that all pages are pinned) is
+			 * considered to be very very small: there are should be very large
+			 * number of concurrent IO operations and them are limited by
+			 * max_connections, we prefer not to complicate code and use second
+			 * approach.
+			 */
+			if (lfc_ctl->used >= lfc_ctl->limit && !dlist_is_empty(&lfc_ctl->lru))
+			{
+				/* Cache overflow: evict least recently used chunk */
+				FileCacheEntry *victim = dlist_container(FileCacheEntry, list_node, dlist_pop_head_node(&lfc_ctl->lru));
+	
+				CriticalAssert(victim->access_count == 0);
+				entry->offset = victim->offset; /* grab victim's chunk */
+				hash_search_with_hash_value(lfc_hash, &victim->key, victim->hash, HASH_REMOVE, NULL);
+				neon_log(DEBUG2, "Swap file cache page");
+			}
+			else if (!dlist_is_empty(&lfc_ctl->holes))
+			{
+				/* We can reuse a hole that was left behind when the LFC was shrunk previously */
+				FileCacheEntry *hole = dlist_container(FileCacheEntry, list_node, dlist_pop_head_node(&lfc_ctl->holes));
+				uint32		offset = hole->offset;
+				bool		found;
+	
+				hash_search_with_hash_value(lfc_hash, &hole->key, hole->hash, HASH_REMOVE, &found);
+				CriticalAssert(found);
+	
+				lfc_ctl->used += 1;
+				entry->offset = offset;	/* reuse the hole */
+			}
+			else
+			{
+				lfc_ctl->used += 1;
+				entry->offset = lfc_ctl->size++;	/* allocate new chunk at end
+													 * of file */
+			}
+			entry->access_count = 1;
+			entry->hash = hash;
+			memset(entry->bitmap, 0, sizeof entry->bitmap);
 		}
 
+		generation = lfc_ctl->generation;
+		entry_offset = entry->offset;
+		lfc_ctl->writes += blocks_in_chunk;
 		LWLockRelease(lfc_lock);
+
+		rc = pwritev(lfc_desc, iov, blocks_in_chunk,
+					 ((off_t) entry_offset * BLOCKS_PER_CHUNK + chunk_offs) * BLCKSZ);
+		if (rc != BLCKSZ * blocks_in_chunk)
+		{
+			lfc_disable("write");
+		}
+		else
+		{
+			LWLockAcquire(lfc_lock, LW_EXCLUSIVE);
+
+			if (lfc_ctl->generation == generation)
+			{
+				CriticalAssert(LFC_ENABLED());
+				/* Place entry to the head of LRU list */
+				CriticalAssert(entry->access_count > 0);
+				if (--entry->access_count == 0)
+					dlist_push_tail(&lfc_ctl->lru, &entry->list_node);
+
+				for (int i = 0; i < blocks_in_chunk; i++)
+				{
+					entry->bitmap[(chunk_offs + i) >> 5] |=
+						(1 << ((chunk_offs + i) & 31));
+				}
+			}
+
+			LWLockRelease(lfc_lock);
+		}
+		blkno += blocks_in_chunk;
+		buf_offset += blocks_in_chunk;
+		nblocks -= blocks_in_chunk;
 	}
 }
 
diff --git a/pgxn/neon/libpagestore.c b/pgxn/neon/libpagestore.c
index 5126c26c5d..df7000acc0 100644
--- a/pgxn/neon/libpagestore.c
+++ b/pgxn/neon/libpagestore.c
@@ -537,7 +537,11 @@ pageserver_connect(shardno_t shard_no, int elevel)
 		/* No more polling needed; connection succeeded */
 		shard->last_connect_time = GetCurrentTimestamp();
 
+#if PG_MAJORVERSION_NUM >= 17
+		shard->wes_read = CreateWaitEventSet(NULL, 3);
+#else
 		shard->wes_read = CreateWaitEventSet(TopMemoryContext, 3);
+#endif
 		AddWaitEventToSet(shard->wes_read, WL_LATCH_SET, PGINVALID_SOCKET,
 						  MyLatch, NULL);
 		AddWaitEventToSet(shard->wes_read, WL_EXIT_ON_PM_DEATH, PGINVALID_SOCKET,
diff --git a/pgxn/neon/neon_pgversioncompat.h b/pgxn/neon/neon_pgversioncompat.h
index addb6ccce6..59b97d64fe 100644
--- a/pgxn/neon/neon_pgversioncompat.h
+++ b/pgxn/neon/neon_pgversioncompat.h
@@ -6,7 +6,11 @@
 #ifndef NEON_PGVERSIONCOMPAT_H
 #define NEON_PGVERSIONCOMPAT_H
 
+#if PG_MAJORVERSION_NUM < 17
 #define NRelFileInfoBackendIsTemp(rinfo) (rinfo.backend != InvalidBackendId)
+#else
+#define NRelFileInfoBackendIsTemp(rinfo) (rinfo.backend != INVALID_PROC_NUMBER)
+#endif
 
 #define RelFileInfoEquals(a, b) ( \
 	NInfoGetSpcOid(a) == NInfoGetSpcOid(b) && \
@@ -50,7 +54,7 @@
 #define CopyNRelFileInfoToBufTag(tag, rinfo) \
 	do { \
 		(tag).rnode = (rinfo); \
-	} while (false);
+	} while (false)
 
 #define BufTagGetNRelFileInfo(tag) tag.rnode
 
@@ -98,7 +102,7 @@
 		(tag).spcOid = (rinfo).spcOid; \
 		(tag).dbOid = (rinfo).dbOid; \
 		(tag).relNumber = (rinfo).relNumber; \
-	} while (false);
+	} while (false)
 
 #define BufTagGetNRelFileInfo(tag) \
 	((RelFileLocator) { \
@@ -113,4 +117,10 @@
 #define DropRelationAllLocalBuffers DropRelationAllLocalBuffers
 #endif
 
+#if PG_MAJORVERSION_NUM < 17
+#define ProcNumber BackendId
+#define INVALID_PROC_NUMBER InvalidBackendId
+#define AmAutoVacuumWorkerProcess() (IsAutoVacuumWorkerProcess())
+#endif
+
 #endif							/* NEON_PGVERSIONCOMPAT_H */
diff --git a/pgxn/neon/pagestore_client.h b/pgxn/neon/pagestore_client.h
index 1f196d016c..4c9e40a063 100644
--- a/pgxn/neon/pagestore_client.h
+++ b/pgxn/neon/pagestore_client.h
@@ -6,8 +6,6 @@
  * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * contrib/neon/pagestore_client.h
- *
  *-------------------------------------------------------------------------
  */
 #ifndef pageserver_h
@@ -187,7 +185,7 @@ extern char *nm_to_string(NeonMessage *msg);
  * API
  */
 
-typedef unsigned shardno_t;
+typedef uint16 shardno_t;
 
 typedef struct
 {
@@ -211,7 +209,7 @@ extern int  neon_protocol_version;
 
 extern shardno_t get_shard_number(BufferTag* tag);
 
-extern const f_smgr *smgr_neon(BackendId backend, NRelFileInfo rinfo);
+extern const f_smgr *smgr_neon(ProcNumber backend, NRelFileInfo rinfo);
 extern void smgr_init_neon(void);
 extern void readahead_buffer_resize(int newsize, void *extra);
 
@@ -233,8 +231,13 @@ extern void neon_zeroextend(SMgrRelation reln, ForkNumber forknum,
 							BlockNumber blocknum, int nbuffers, bool skipFsync);
 #endif
 
+#if PG_MAJORVERSION_NUM >=17
+extern bool neon_prefetch(SMgrRelation reln, ForkNumber forknum,
+						  BlockNumber blocknum, int nblocks);
+#else
 extern bool neon_prefetch(SMgrRelation reln, ForkNumber forknum,
 						  BlockNumber blocknum);
+#endif
 
 /*
  * LSN values associated with each request to the pageserver
@@ -269,19 +272,11 @@ typedef struct
 } neon_request_lsns;
 
 #if PG_MAJORVERSION_NUM < 16
-extern void neon_read(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
-					  char *buffer);
 extern PGDLLEXPORT void neon_read_at_lsn(NRelFileInfo rnode, ForkNumber forkNum, BlockNumber blkno,
 										 neon_request_lsns request_lsns, char *buffer);
-extern void neon_write(SMgrRelation reln, ForkNumber forknum,
-					   BlockNumber blocknum, char *buffer, bool skipFsync);
 #else
-extern void neon_read(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
-					  void *buffer);
 extern PGDLLEXPORT void neon_read_at_lsn(NRelFileInfo rnode, ForkNumber forkNum, BlockNumber blkno,
 										 neon_request_lsns request_lsns, void *buffer);
-extern void neon_write(SMgrRelation reln, ForkNumber forknum,
-					   BlockNumber blocknum, const void *buffer, bool skipFsync);
 #endif
 extern void neon_writeback(SMgrRelation reln, ForkNumber forknum,
 						   BlockNumber blocknum, BlockNumber nblocks);
@@ -299,17 +294,34 @@ extern void update_cached_relsize(NRelFileInfo rinfo, ForkNumber forknum, BlockN
 extern void forget_cached_relsize(NRelFileInfo rinfo, ForkNumber forknum);
 
 /* functions for local file cache */
-#if PG_MAJORVERSION_NUM < 16
-extern void lfc_write(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
-					  char *buffer);
-#else
-extern void lfc_write(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
-					  const void *buffer);
-#endif
-extern bool lfc_read(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, char *buffer);
-extern bool lfc_cache_contains(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno);
+extern void lfc_writev(NRelFileInfo rinfo, ForkNumber forkNum,
+					   BlockNumber blkno, const void *const *buffers,
+					   BlockNumber nblocks);
+/* returns number of blocks read, with one bit set in *read for each  */
+extern int lfc_readv_select(NRelFileInfo rinfo, ForkNumber forkNum,
+							BlockNumber blkno, void **buffers,
+							BlockNumber nblocks, bits8 *mask);
+
+extern bool lfc_cache_contains(NRelFileInfo rinfo, ForkNumber forkNum,
+							   BlockNumber blkno);
+extern int lfc_cache_containsv(NRelFileInfo rinfo, ForkNumber forkNum,
+							   BlockNumber blkno, int nblocks, bits8 *bitmap);
 extern void lfc_evict(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno);
 extern void lfc_init(void);
 
+static inline bool
+lfc_read(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
+		 void *buffer)
+{
+	bits8		rv = 0;
+	return lfc_readv_select(rinfo, forkNum, blkno, &buffer, 1, &rv) == 1;
+}
+
+static inline void
+lfc_write(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
+		  const void *buffer)
+{
+	return lfc_writev(rinfo, forkNum, blkno, &buffer, 1);
+}
 
 #endif
diff --git a/pgxn/neon/pagestore_smgr.c b/pgxn/neon/pagestore_smgr.c
index 7f39c7d026..36538ea5e2 100644
--- a/pgxn/neon/pagestore_smgr.c
+++ b/pgxn/neon/pagestore_smgr.c
@@ -58,6 +58,7 @@
 #include "pgstat.h"
 #include "postmaster/autovacuum.h"
 #include "postmaster/interrupt.h"
+#include "port/pg_iovec.h"
 #include "replication/walsender.h"
 #include "storage/bufmgr.h"
 #include "storage/buf_internals.h"
@@ -66,6 +67,7 @@
 #include "storage/smgr.h"
 
 #include "pagestore_client.h"
+#include "bitmap.h"
 
 #if PG_VERSION_NUM >= 150000
 #include "access/xlogrecovery.h"
@@ -170,16 +172,28 @@ typedef enum PrefetchStatus
 								 * valid */
 } PrefetchStatus;
 
+/* must fit in uint8; bits 0x1 are used */
+typedef enum {
+	PRFSF_NONE	= 0x0,
+	PRFSF_SEQ	= 0x1,
+} PrefetchRequestFlags;
+
 typedef struct PrefetchRequest
 {
 	BufferTag	buftag;			/* must be first entry in the struct */
+	shardno_t	shard_no;
+	uint8		status;		/* see PrefetchStatus for valid values */
+	uint8		flags;		/* see PrefetchRequestFlags */
 	neon_request_lsns request_lsns;
 	NeonResponse *response;		/* may be null */
-	PrefetchStatus status;
-	shardno_t   shard_no;
 	uint64		my_ring_index;
 } PrefetchRequest;
 
+StaticAssertDecl(sizeof(PrefetchRequest) == 64,
+				 "We prefer to have a power-of-2 size for this struct. Please"
+				 " try to find an alternative solution before reaching to"
+				 " increase the expected size here");
+
 /* prefetch buffer lookup hash table */
 
 typedef struct PrfHashEntry
@@ -251,17 +265,17 @@ typedef struct PrefetchState
 	PrefetchRequest prf_buffer[];	/* prefetch buffers */
 } PrefetchState;
 
-#define BITMAP_ISSET(bm, bit) ((bm)[(bit) >> 3] & (1 << ((bit) & 7)))
-#define BITMAP_SET(bm, bit) (bm)[(bit) >> 3] |= (1 << ((bit) & 7))
-#define BITMAP_CLR(bm, bit) (bm)[(bit) >> 3] &= ~(1 << ((bit) & 7))
-
 static PrefetchState *MyPState;
 
+#define GetPrfSlotNoCheck(ring_index) ( \
+	&MyPState->prf_buffer[((ring_index) % readahead_buffer_size)] \
+)
+
 #define GetPrfSlot(ring_index) ( \
 	( \
 		AssertMacro((ring_index) < MyPState->ring_unused && \
 					(ring_index) >= MyPState->ring_last), \
-		&MyPState->prf_buffer[((ring_index) % readahead_buffer_size)] \
+		GetPrfSlotNoCheck(ring_index) \
 	) \
 )
 
@@ -281,9 +295,17 @@ static void prefetch_do_request(PrefetchRequest *slot, neon_request_lsns *force_
 static bool prefetch_wait_for(uint64 ring_index);
 static void prefetch_cleanup_trailing_unused(void);
 static inline void prefetch_set_unused(uint64 ring_index);
+#if PG_MAJORVERSION_NUM < 17
+static void
+GetLastWrittenLSNv(NRelFileInfo relfilenode, ForkNumber forknum,
+				   BlockNumber blkno, int nblocks, XLogRecPtr *lsns);
+#endif
 
-static neon_request_lsns neon_get_request_lsns(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno);
-static bool neon_prefetch_response_usable(neon_request_lsns request_lsns,
+static void
+neon_get_request_lsns(NRelFileInfo rinfo, ForkNumber forknum,
+					  BlockNumber blkno, neon_request_lsns *output,
+					  BlockNumber nblocks, const bits8 *mask);
+static bool neon_prefetch_response_usable(neon_request_lsns *request_lsns,
 										  PrefetchRequest *slot);
 
 static bool
@@ -729,9 +751,9 @@ prefetch_do_request(PrefetchRequest *slot, neon_request_lsns *force_request_lsns
 	if (force_request_lsns)
 		slot->request_lsns = *force_request_lsns;
 	else
-		slot->request_lsns = neon_get_request_lsns(BufTagGetNRelFileInfo(slot->buftag),
-												   slot->buftag.forkNum,
-												   slot->buftag.blockNum);
+		neon_get_request_lsns(BufTagGetNRelFileInfo(slot->buftag),
+							  slot->buftag.forkNum, slot->buftag.blockNum,
+							  &slot->request_lsns, 1, NULL);
 	request.req.lsn = slot->request_lsns.request_lsn;
 	request.req.not_modified_since = slot->request_lsns.not_modified_since;
 
@@ -771,141 +793,194 @@ prefetch_do_request(PrefetchRequest *slot, neon_request_lsns *force_request_lsns
  */
 
 static uint64
-prefetch_register_buffer(BufferTag tag, neon_request_lsns *force_request_lsns)
+prefetch_register_bufferv(BufferTag tag, neon_request_lsns *frlsns,
+						  BlockNumber nblocks, const bits8 *mask)
 {
-	uint64		ring_index;
+	uint64		min_ring_index;
 	PrefetchRequest req;
-	PrefetchRequest *slot;
-	PrfHashEntry *entry;
+#if USE_ASSERT_CHECKING
+	bool		any_hits = false;
+#endif
+	/* We will never read further ahead than our buffer can store. */
+	nblocks = Max(1, Min(nblocks, readahead_buffer_size));
 
 	/* use an intermediate PrefetchRequest struct to ensure correct alignment */
 	req.buftag = tag;
+
 Retry:
-	entry = prfh_lookup(MyPState->prf_hash, (PrefetchRequest *) &req);
-
-	if (entry != NULL)
+	min_ring_index = UINT64_MAX;
+	for (int i = 0; i < nblocks; i++)
 	{
-		slot = entry->slot;
-		ring_index = slot->my_ring_index;
-		Assert(slot == GetPrfSlot(ring_index));
+		PrefetchRequest *slot = NULL;
+		PrfHashEntry *entry = NULL;
+		uint64		ring_index;
+		neon_request_lsns *lsns;
+		if (PointerIsValid(mask) && !BITMAP_ISSET(mask, i))
+			continue;
 
-		Assert(slot->status != PRFS_UNUSED);
-		Assert(MyPState->ring_last <= ring_index &&
-			   ring_index < MyPState->ring_unused);
-		Assert(BUFFERTAGS_EQUAL(slot->buftag, tag));
+		if (frlsns)
+			lsns = &frlsns[i];
+		else
+			lsns = NULL;
 
-		/*
-		 * If the caller specified a request LSN to use, only accept prefetch
-		 * responses that satisfy that request.
-		 */
-		if (force_request_lsns)
-		{
-			if (!neon_prefetch_response_usable(*force_request_lsns, slot))
-			{
-				/* Wait for the old request to finish and discard it */
-				if (!prefetch_wait_for(ring_index))
-					goto Retry;
-				prefetch_set_unused(ring_index);
-				entry = NULL;
-			}
-		}
+#if USE_ASSERT_CHECKING
+		any_hits = true;
+#endif
+
+		slot = NULL;
+		entry = NULL;
+
+		req.buftag.blockNum = tag.blockNum + i;
+		entry = prfh_lookup(MyPState->prf_hash, (PrefetchRequest *) &req);
 
 		if (entry != NULL)
 		{
+			slot = entry->slot;
+			ring_index = slot->my_ring_index;
+			Assert(slot == GetPrfSlot(ring_index));
+
+			Assert(slot->status != PRFS_UNUSED);
+			Assert(MyPState->ring_last <= ring_index &&
+				   ring_index < MyPState->ring_unused);
+			Assert(BUFFERTAGS_EQUAL(slot->buftag, req.buftag));
+
 			/*
-			 * We received a prefetch for a page that was recently read and
-			 * removed from the buffers. Remove that request from the buffers.
+			 * If the caller specified a request LSN to use, only accept
+			 * prefetch responses that satisfy that request.
 			 */
-			if (slot->status == PRFS_TAG_REMAINS)
+			if (lsns)
 			{
-				prefetch_set_unused(ring_index);
-				entry = NULL;
+				if (!neon_prefetch_response_usable(lsns, slot))
+				{
+					/* Wait for the old request to finish and discard it */
+					if (!prefetch_wait_for(ring_index))
+						goto Retry;
+					prefetch_set_unused(ring_index);
+					entry = NULL;
+					slot = NULL;
+				}
+			}
+
+			if (entry != NULL)
+			{
+				/*
+				 * We received a prefetch for a page that was recently read
+				 * and removed from the buffers. Remove that request from the
+				 * buffers.
+				 */
+				if (slot->status == PRFS_TAG_REMAINS)
+				{
+					prefetch_set_unused(ring_index);
+					entry = NULL;
+					slot = NULL;
+				}
+				else
+				{
+					min_ring_index = Min(min_ring_index, ring_index);
+					/* The buffered request is good enough, return that index */
+					pgBufferUsage.prefetch.duplicates++;
+					continue;
+				}
+			}
+		}
+
+		/*
+		 * We can only leave the block above by finding that there's
+		 * no entry that can satisfy this request, either because there
+		 * was no entry, or because the entry was invalid or didn't satisfy
+		 * the LSNs provided.
+		 *
+		 * The code should've made sure to clear up the data.
+		 */
+		Assert(entry == NULL);
+		Assert(slot == NULL);
+
+		/*
+		 * If the prefetch queue is full, we need to make room by clearing the
+		 * oldest slot. If the oldest slot holds a buffer that was already
+		 * received, we can just throw it away; we fetched the page
+		 * unnecessarily in that case. If the oldest slot holds a request that
+		 * we haven't received a response for yet, we have to wait for the
+		 * response to that before we can continue. We might not have even
+		 * flushed the request to the pageserver yet, it might be just sitting
+		 * in the output buffer. In that case, we flush it and wait for the
+		 * response. (We could decide not to send it, but it's hard to abort
+		 * when the request is already in the output buffer, and 'not sending'
+		 * a prefetch request kind of goes against the principles of
+		 * prefetching)
+		 */
+		if (MyPState->ring_last + readahead_buffer_size - 1 == MyPState->ring_unused)
+		{
+			uint64		cleanup_index = MyPState->ring_last;
+
+			slot = GetPrfSlot(cleanup_index);
+
+			Assert(slot->status != PRFS_UNUSED);
+
+			/*
+			 * If there is good reason to run compaction on the prefetch buffers,
+			 * try to do that.
+			 */
+			if (ReceiveBufferNeedsCompaction() && compact_prefetch_buffers())
+			{
+				Assert(slot->status == PRFS_UNUSED);
 			}
 			else
 			{
-				/* The buffered request is good enough, return that index */
-				pgBufferUsage.prefetch.duplicates++;
-				return ring_index;
+				/*
+				 * We have the slot for ring_last, so that must still be in
+				 * progress
+				 */
+				switch (slot->status)
+				{
+					case PRFS_REQUESTED:
+						Assert(MyPState->ring_receive == cleanup_index);
+						if (!prefetch_wait_for(cleanup_index))
+							goto Retry;
+						prefetch_set_unused(cleanup_index);
+						break;
+					case PRFS_RECEIVED:
+					case PRFS_TAG_REMAINS:
+						prefetch_set_unused(cleanup_index);
+						break;
+					default:
+						pg_unreachable();
+				}
 			}
 		}
-	}
-
-	/*
-	 * If the prefetch queue is full, we need to make room by clearing the
-	 * oldest slot. If the oldest slot holds a buffer that was already
-	 * received, we can just throw it away; we fetched the page unnecessarily
-	 * in that case. If the oldest slot holds a request that we haven't
-	 * received a response for yet, we have to wait for the response to that
-	 * before we can continue. We might not have even flushed the request to
-	 * the pageserver yet, it might be just sitting in the output buffer. In
-	 * that case, we flush it and wait for the response. (We could decide not
-	 * to send it, but it's hard to abort when the request is already in the
-	 * output buffer, and 'not sending' a prefetch request kind of goes
-	 * against the principles of prefetching)
-	 */
-	if (MyPState->ring_last + readahead_buffer_size - 1 == MyPState->ring_unused)
-	{
-		uint64		cleanup_index = MyPState->ring_last;
-
-		slot = GetPrfSlot(cleanup_index);
-
-		Assert(slot->status != PRFS_UNUSED);
 
 		/*
-		 * If there is good reason to run compaction on the prefetch buffers,
-		 * try to do that.
+		 * The next buffer pointed to by `ring_unused` is now definitely empty, so
+		 * we can insert the new request to it.
 		 */
-		if (ReceiveBufferNeedsCompaction() && compact_prefetch_buffers())
-		{
-			Assert(slot->status == PRFS_UNUSED);
-		}
-		else
-		{
-			/*
-			 * We have the slot for ring_last, so that must still be in
-			 * progress
-			 */
-			switch (slot->status)
-			{
-				case PRFS_REQUESTED:
-					Assert(MyPState->ring_receive == cleanup_index);
-					if (!prefetch_wait_for(cleanup_index))
-						goto Retry;
-					prefetch_set_unused(cleanup_index);
-					break;
-				case PRFS_RECEIVED:
-				case PRFS_TAG_REMAINS:
-					prefetch_set_unused(cleanup_index);
-					break;
-				default:
-					pg_unreachable();
-			}
-		}
+		ring_index = MyPState->ring_unused;
+
+		Assert(MyPState->ring_last <= ring_index &&
+			   ring_index <= MyPState->ring_unused);
+
+		slot = GetPrfSlotNoCheck(ring_index);
+
+		Assert(slot->status == PRFS_UNUSED);
+
+		/*
+		 * We must update the slot data before insertion, because the hash
+		 * function reads the buffer tag from the slot.
+		 */
+		slot->buftag = req.buftag;
+		slot->shard_no = get_shard_number(&tag);
+		slot->my_ring_index = ring_index;
+
+		min_ring_index = Min(min_ring_index, ring_index);
+
+		prefetch_do_request(slot, lsns);
 	}
 
-	/*
-	 * The next buffer pointed to by `ring_unused` is now definitely empty, so
-	 * we can insert the new request to it.
-	 */
-	ring_index = MyPState->ring_unused;
-	slot = &MyPState->prf_buffer[((ring_index) % readahead_buffer_size)];
+	Assert(any_hits);
 
-	Assert(MyPState->ring_last <= ring_index);
-
-	Assert(slot->status == PRFS_UNUSED);
-
-	/*
-	 * We must update the slot data before insertion, because the hash
-	 * function reads the buffer tag from the slot.
-	 */
-	slot->buftag = tag;
-	slot->shard_no = get_shard_number(&tag);
-	slot->my_ring_index = ring_index;
-
-	prefetch_do_request(slot, force_request_lsns);
-	Assert(slot->status == PRFS_REQUESTED);
-	Assert(MyPState->ring_last <= ring_index &&
-		   ring_index < MyPState->ring_unused);
+	Assert(GetPrfSlot(min_ring_index)->status == PRFS_REQUESTED ||
+		   GetPrfSlot(min_ring_index)->status == PRFS_RECEIVED);
+	Assert(MyPState->ring_last <= min_ring_index &&
+		   min_ring_index < MyPState->ring_unused);
 
 	if (flush_every_n_requests > 0 &&
 		MyPState->ring_unused - MyPState->ring_flush >= flush_every_n_requests)
@@ -921,9 +996,17 @@ Retry:
 		MyPState->ring_flush = MyPState->ring_unused;
 	}
 
-	return ring_index;
+	return min_ring_index;
 }
 
+
+static uint64
+prefetch_register_buffer(BufferTag tag, neon_request_lsns *force_request_lsns)
+{
+	return prefetch_register_bufferv(tag, force_request_lsns, 1, NULL);
+}
+
+
 /*
  * Note: this function can get canceled and use a long jump to the next catch
  * context. Take care.
@@ -1348,6 +1431,50 @@ log_newpage_copy(NRelFileInfo * rinfo, ForkNumber forkNum, BlockNumber blkno,
 	return log_newpage(rinfo, forkNum, blkno, copied_buffer.data, page_std);
 }
 
+#if PG_MAJORVERSION_NUM >= 17
+/*
+ * Wrapper around log_newpages() that makes a temporary copy of the block and
+ * WAL-logs that. This makes it safe to use while holding only a shared lock
+ * on the page, see XLogSaveBufferForHint. We don't use XLogSaveBufferForHint
+ * directly because it skips the logging if the LSN is new enough.
+ */
+static XLogRecPtr
+log_newpages_copy(NRelFileInfo * rinfo, ForkNumber forkNum, BlockNumber blkno,
+				  BlockNumber nblocks, Page *pages, bool page_std)
+{
+	PGAlignedBlock copied_buffer[XLR_MAX_BLOCK_ID];
+	BlockNumber	blknos[XLR_MAX_BLOCK_ID];
+	Page		pageptrs[XLR_MAX_BLOCK_ID];
+	int			nregistered = 0;
+	XLogRecPtr	result = 0;
+
+	for (int i = 0; i < nblocks; i++)
+	{
+		Page	page = copied_buffer[nregistered].data;
+		memcpy(page, pages[i], BLCKSZ);
+		pageptrs[nregistered] = page;
+		blknos[nregistered] = blkno + i;
+
+		++nregistered;
+
+		if (nregistered >= XLR_MAX_BLOCK_ID)
+		{
+			log_newpages(rinfo, forkNum, nregistered, blknos, pageptrs,
+						 page_std);
+			nregistered = 0;
+		}
+	}
+
+	if (nregistered != 0)
+	{
+		log_newpages(rinfo, forkNum, nregistered, blknos, pageptrs,
+					 page_std);
+	}
+
+	return ProcLastRecPtr;
+}
+#endif /* PG_MAJORVERSION_NUM >= 17 */
+
 /*
  * Is 'buffer' identical to a freshly initialized empty heap page?
  */
@@ -1361,14 +1488,160 @@ PageIsEmptyHeapPage(char *buffer)
 	return memcmp(buffer, empty_page.data, BLCKSZ) == 0;
 }
 
+#if PG_MAJORVERSION_NUM >= 17
+static void
+neon_wallog_pagev(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
+				  BlockNumber nblocks, const char **buffers, bool force)
+{
+#define BLOCK_BATCH_SIZE	16
+	bool		log_pages;
+	BlockNumber	batch_blockno = blocknum;
+	XLogRecPtr	lsns[BLOCK_BATCH_SIZE];
+	int			batch_size = 0;
+
+	/*
+	 * Whenever a VM or FSM page is evicted, WAL-log it. FSM and (some) VM
+	 * changes are not WAL-logged when the changes are made, so this is our
+	 * last chance to log them, otherwise they're lost. That's OK for
+	 * correctness, the non-logged updates are not critical. But we want to
+	 * have a reasonably up-to-date VM and FSM in the page server.
+	 */
+	log_pages = false;
+	if (force)
+	{
+		Assert(XLogInsertAllowed());
+		log_pages = true;
+	}
+	else if (XLogInsertAllowed() &&
+			 !ShutdownRequestPending &&
+			 (forknum == FSM_FORKNUM || forknum == VISIBILITYMAP_FORKNUM))
+	{
+		log_pages = true;
+	}
+
+	if (log_pages)
+	{
+		XLogRecPtr	recptr;
+		recptr = log_newpages_copy(&InfoFromSMgrRel(reln), forknum, blocknum,
+								   nblocks, (Page *) buffers, false);
+
+		for (int i = 0; i < nblocks; i++)
+			PageSetLSN(unconstify(char *, buffers[i]), recptr);
+
+		ereport(SmgrTrace,
+				(errmsg(NEON_TAG "Page %u through %u of relation %u/%u/%u.%u "
+								 "were force logged, lsn=%X/%X",
+						blocknum, blocknum + nblocks,
+						RelFileInfoFmt(InfoFromSMgrRel(reln)),
+						forknum, LSN_FORMAT_ARGS(recptr))));
+	}
+
+	for (int i = 0; i < nblocks; i++)
+	{
+		Page		page = (Page) buffers[i];
+		BlockNumber blkno = blocknum + i;
+		XLogRecPtr	lsn = PageGetLSN(page);
+
+		if (lsn == InvalidXLogRecPtr)
+		{
+			/*
+			 * When PostgreSQL extends a relation, it calls smgrextend() with an
+			 * all-zeros pages, and we can just ignore that in Neon. We do need to
+			 * remember the new size, though, so that smgrnblocks() returns the
+			 * right answer after the rel has been extended. We rely on the
+			 * relsize cache for that.
+			 *
+			 * A completely empty heap page doesn't need to be WAL-logged, either.
+			 * The heapam can leave such a page behind, if e.g. an insert errors
+			 * out after initializing the page, but before it has inserted the
+			 * tuple and WAL-logged the change. When we read the page from the
+			 * page server, it will come back as all-zeros. That's OK, the heapam
+			 * will initialize an all-zeros page on first use.
+			 *
+			 * In other scenarios, evicting a dirty page with no LSN is a bad
+			 * sign: it implies that the page was not WAL-logged, and its contents
+			 * will be lost when it's evicted.
+			 */
+			if (PageIsNew(page))
+			{
+				ereport(SmgrTrace,
+						(errmsg(NEON_TAG "Page %u of relation %u/%u/%u.%u is all-zeros",
+								blkno,
+								RelFileInfoFmt(InfoFromSMgrRel(reln)),
+								forknum)));
+			}
+			else if (PageIsEmptyHeapPage(page))
+			{
+				ereport(SmgrTrace,
+						(errmsg(NEON_TAG "Page %u of relation %u/%u/%u.%u is an empty heap page with no LSN",
+								blkno,
+								RelFileInfoFmt(InfoFromSMgrRel(reln)),
+								forknum)));
+			}
+			else if (forknum != FSM_FORKNUM && forknum != VISIBILITYMAP_FORKNUM)
+			{
+				/*
+				 * Its a bad sign if there is a page with zero LSN in the buffer
+				 * cache in a standby, too. However, PANICing seems like a cure
+				 * worse than the disease, as the damage has likely already been
+				 * done in the primary. So in a standby, make this an assertion,
+				 * and in a release build just LOG the error and soldier on. We
+				 * update the last-written LSN of the page with a conservative
+				 * value in that case, which is the last replayed LSN.
+				 */
+				ereport(RecoveryInProgress() ? LOG : PANIC,
+						(errmsg(NEON_TAG "Page %u of relation %u/%u/%u.%u is evicted with zero LSN",
+								blkno,
+								RelFileInfoFmt(InfoFromSMgrRel(reln)),
+								forknum)));
+				Assert(false);
+
+				lsn = GetXLogReplayRecPtr(NULL); /* in standby mode, soldier on */
+			}
+		}
+		else
+		{
+			ereport(SmgrTrace,
+					(errmsg(NEON_TAG "Evicting page %u of relation %u/%u/%u.%u with lsn=%X/%X",
+							blkno,
+							RelFileInfoFmt(InfoFromSMgrRel(reln)),
+							forknum, LSN_FORMAT_ARGS(lsn))));
+		}
+
+		/*
+		 * Remember the LSN on this page. When we read the page again, we must
+		 * read the same or newer version of it.
+		 */
+		lsns[batch_size++] = lsn;
+
+		if (batch_size >= BLOCK_BATCH_SIZE)
+		{
+			SetLastWrittenLSNForBlockv(lsns, InfoFromSMgrRel(reln), forknum,
+									   batch_blockno,
+									   batch_size);
+			batch_blockno += batch_size;
+			batch_size = 0;
+		}
+	}
+
+	if (batch_size != 0)
+	{
+		SetLastWrittenLSNForBlockv(lsns, InfoFromSMgrRel(reln), forknum,
+								   batch_blockno,
+								   batch_size);
+	}
+}
+#endif
+
 /*
  * A page is being evicted from the shared buffer cache. Update the
  * last-written LSN of the page, and WAL-log it if needed.
  */
-static void
 #if PG_MAJORVERSION_NUM < 16
+static void
 neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, char *buffer, bool force)
 #else
+static void
 neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const char *buffer, bool force)
 #endif
 {
@@ -1548,18 +1821,39 @@ nm_adjust_lsn(XLogRecPtr lsn)
 	return lsn;
 }
 
+
+/*
+ * Since PG17 we use vetorized version,
+ * so add compatibility function for older versions
+ */
+#if PG_MAJORVERSION_NUM < 17
+static void
+GetLastWrittenLSNv(NRelFileInfo relfilenode, ForkNumber forknum,
+				   BlockNumber blkno, int nblocks, XLogRecPtr *lsns)
+{
+	lsns[0] = GetLastWrittenLSN(relfilenode, forknum, blkno);
+}
+#endif
+
 /*
  * Return LSN for requesting pages and number of blocks from page server
  */
-static neon_request_lsns
-neon_get_request_lsns(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno)
+static void
+neon_get_request_lsns(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno,
+					  neon_request_lsns *output, BlockNumber nblocks,
+					  const bits8 *mask)
 {
-	XLogRecPtr	last_written_lsn;
-	neon_request_lsns result;
+	XLogRecPtr	last_written_lsns[PG_IOV_MAX];
 
-	last_written_lsn = GetLastWrittenLSN(rinfo, forknum, blkno);
-	last_written_lsn = nm_adjust_lsn(last_written_lsn);
-	Assert(last_written_lsn != InvalidXLogRecPtr);
+	Assert(nblocks <= PG_IOV_MAX);
+
+	GetLastWrittenLSNv(rinfo, forknum, blkno, (int) nblocks, last_written_lsns);
+
+	for (int i = 0; i < nblocks; i++)
+	{
+		last_written_lsns[i] = nm_adjust_lsn(last_written_lsns[i]);
+		Assert(last_written_lsns[i] != InvalidXLogRecPtr);
+	}
 
 	if (RecoveryInProgress())
 	{
@@ -1630,95 +1924,111 @@ neon_get_request_lsns(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno)
 		/* Request the page at the end of the last fully replayed LSN. */
 		XLogRecPtr replay_lsn = GetXLogReplayRecPtr(NULL);
 
-		if (last_written_lsn > replay_lsn)
+		for (int i = 0; i < nblocks; i++)
 		{
-			/* GetCurrentReplayRecPtr was introduced in v15 */
+			neon_request_lsns *result = &output[i];
+			XLogRecPtr	last_written_lsn = last_written_lsns[i];
+
+			if (PointerIsValid(mask) && !BITMAP_ISSET(mask, i))
+				continue;
+
+			if (last_written_lsn > replay_lsn)
+			{
+				/* GetCurrentReplayRecPtr was introduced in v15 */
 #if PG_VERSION_NUM >= 150000
-			Assert(last_written_lsn == GetCurrentReplayRecPtr(NULL));
+				Assert(last_written_lsn == GetCurrentReplayRecPtr(NULL));
 #endif
 
-			/*
-			 * Cases 2 and 4. If this is a backend (case 4), the
-			 * neon_read_at_lsn() call later will wait for the WAL record to be
-			 * fully replayed.
-			 */
-			result.request_lsn = last_written_lsn;
-		}
-		else
-		{
-			/* cases 1 and 3 */
-			result.request_lsn = replay_lsn;
-		}
-		result.not_modified_since = last_written_lsn;
-		result.effective_request_lsn = result.request_lsn;
-		Assert(last_written_lsn <= result.request_lsn);
+				/*
+				 * Cases 2 and 4. If this is a backend (case 4), the
+				 * neon_read_at_lsn() call later will wait for the WAL record to be
+				 * fully replayed.
+				 */
+				result->request_lsn = last_written_lsn;
+			}
+			else
+			{
+				/* cases 1 and 3 */
+				result->request_lsn = replay_lsn;
+			}
 
-		neon_log(DEBUG1, "neon_get_request_lsns request lsn %X/%X, not_modified_since %X/%X",
-				 LSN_FORMAT_ARGS(result.request_lsn), LSN_FORMAT_ARGS(result.not_modified_since));
+			result->not_modified_since = last_written_lsn;
+			result->effective_request_lsn = result->request_lsn;
+			Assert(last_written_lsn <= result->request_lsn);
+
+			neon_log(DEBUG1, "neon_get_request_lsns request lsn %X/%X, not_modified_since %X/%X",
+					 LSN_FORMAT_ARGS(result->request_lsn), LSN_FORMAT_ARGS(result->not_modified_since));
+		}
 	}
 	else
 	{
 		XLogRecPtr	flushlsn;
-
-		/*
-		 * Use the latest LSN that was evicted from the buffer cache as the
-		 * 'not_modified_since' hint. Any pages modified by later WAL records
-		 * must still in the buffer cache, so our request cannot concern
-		 * those.
-		 */
-		neon_log(DEBUG1, "neon_get_request_lsns GetLastWrittenLSN lsn %X/%X",
-				 LSN_FORMAT_ARGS(last_written_lsn));
-
-		/*
-		 * Is it possible that the last-written LSN is ahead of last flush
-		 * LSN? Generally not, we shouldn't evict a page from the buffer cache
-		 * before all its modifications have been safely flushed. That's the
-		 * "WAL before data" rule. However, such case does exist at index
-		 * building, _bt_blwritepage logs the full page without flushing WAL
-		 * before smgrextend (files are fsynced before build ends).
-		 */
 #if PG_VERSION_NUM >= 150000
 		flushlsn = GetFlushRecPtr(NULL);
 #else
 		flushlsn = GetFlushRecPtr();
 #endif
-		if (last_written_lsn > flushlsn)
+
+		for (int i = 0; i < nblocks; i++)
 		{
-			neon_log(DEBUG5, "last-written LSN %X/%X is ahead of last flushed LSN %X/%X",
-					 LSN_FORMAT_ARGS(last_written_lsn),
-					 LSN_FORMAT_ARGS(flushlsn));
-			XLogFlush(last_written_lsn);
-			flushlsn = last_written_lsn;
+			neon_request_lsns *result = &output[i];
+			XLogRecPtr	last_written_lsn = last_written_lsns[i];
+
+			if (PointerIsValid(mask) && !BITMAP_ISSET(mask, i))
+				continue;
+			/*
+			 * Use the latest LSN that was evicted from the buffer cache as the
+			 * 'not_modified_since' hint. Any pages modified by later WAL records
+			 * must still in the buffer cache, so our request cannot concern
+			 * those.
+			 */
+			neon_log(DEBUG1, "neon_get_request_lsns GetLastWrittenLSN lsn %X/%X",
+					 LSN_FORMAT_ARGS(last_written_lsn));
+
+			/*
+			 * Is it possible that the last-written LSN is ahead of last flush
+			 * LSN? Generally not, we shouldn't evict a page from the buffer cache
+			 * before all its modifications have been safely flushed. That's the
+			 * "WAL before data" rule. However, such case does exist at index
+			 * building, _bt_blwritepage logs the full page without flushing WAL
+			 * before smgrextend (files are fsynced before build ends).
+			 */
+			if (last_written_lsn > flushlsn)
+			{
+				neon_log(DEBUG5, "last-written LSN %X/%X is ahead of last flushed LSN %X/%X",
+						 LSN_FORMAT_ARGS(last_written_lsn),
+						 LSN_FORMAT_ARGS(flushlsn));
+				XLogFlush(last_written_lsn);
+				flushlsn = last_written_lsn;
+			}
+
+			/*
+			 * Request the very latest version of the page. In principle we
+			 * want to read the page at the current insert LSN, and we could
+			 * use that value in the request. However, there's a corner case
+			 * with pageserver's garbage collection. If the GC horizon is
+			 * set to a very small value, it's possible that by the time
+			 * that the pageserver processes our request, the GC horizon has
+			 * already moved past the LSN we calculate here. Standby servers
+			 * always have that problem as the can always lag behind the
+			 * primary, but for the primary we can avoid it by always
+			 * requesting the latest page, by setting request LSN to
+			 * UINT64_MAX.
+			 *
+			 * Remember the current LSN, however, so that we can later
+			 * correctly determine if the response to the request is still
+			 * valid. The most up-to-date LSN we could use for that purpose
+			 * would be the current insert LSN, but to avoid the overhead of
+			 * looking it up, use 'flushlsn' instead. This relies on the
+			 * assumption that if the page was modified since the last WAL
+			 * flush, it should still be in the buffer cache, and we
+			 * wouldn't be requesting it.
+			 */
+			result->request_lsn = UINT64_MAX;
+			result->not_modified_since = last_written_lsn;
+			result->effective_request_lsn = flushlsn;
 		}
-
-		/*
-		 * Request the very latest version of the page. In principle we
-		 * want to read the page at the current insert LSN, and we could
-		 * use that value in the request. However, there's a corner case
-		 * with pageserver's garbage collection. If the GC horizon is
-		 * set to a very small value, it's possible that by the time
-		 * that the pageserver processes our request, the GC horizon has
-		 * already moved past the LSN we calculate here. Standby servers
-		 * always have that problem as the can always lag behind the
-		 * primary, but for the primary we can avoid it by always
-		 * requesting the latest page, by setting request LSN to
-		 * UINT64_MAX.
-		 *
-		 * Remember the current LSN, however, so that we can later
-		 * correctly determine if the response to the request is still
-		 * valid. The most up-to-date LSN we could use for that purpose
-		 * would be the current insert LSN, but to avoid the overhead of
-		 * looking it up, use 'flushlsn' instead. This relies on the
-		 * assumption that if the page was modified since the last WAL
-		 * flush, it should still be in the buffer cache, and we
-		 * wouldn't be requesting it.
-		 */
-		result.request_lsn = UINT64_MAX;
-		result.not_modified_since = last_written_lsn;
-		result.effective_request_lsn = flushlsn;
 	}
-
-	return result;
 }
 
 /*
@@ -1728,13 +2038,13 @@ neon_get_request_lsns(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno)
  * satisfy a page read now.
  */
 static bool
-neon_prefetch_response_usable(neon_request_lsns request_lsns,
+neon_prefetch_response_usable(neon_request_lsns *request_lsns,
 							  PrefetchRequest *slot)
 {
 	/* sanity check the LSN's on the old and the new request */
-	Assert(request_lsns.request_lsn >= request_lsns.not_modified_since);
-	Assert(request_lsns.effective_request_lsn >= request_lsns.not_modified_since);
-	Assert(request_lsns.effective_request_lsn <= request_lsns.request_lsn);
+	Assert(request_lsns->request_lsn >= request_lsns->not_modified_since);
+	Assert(request_lsns->effective_request_lsn >= request_lsns->not_modified_since);
+	Assert(request_lsns->effective_request_lsn <= request_lsns->request_lsn);
 	Assert(slot->request_lsns.request_lsn >= slot->request_lsns.not_modified_since);
 	Assert(slot->request_lsns.effective_request_lsn >= slot->request_lsns.not_modified_since);
 	Assert(slot->request_lsns.effective_request_lsn <= slot->request_lsns.request_lsn);
@@ -1755,15 +2065,15 @@ neon_prefetch_response_usable(neon_request_lsns request_lsns,
 	 * calculate LSNs "out of order" with each other, but the prefetch queue
 	 * is backend-private at the moment.)
 	 */
-	if (request_lsns.effective_request_lsn < slot->request_lsns.effective_request_lsn ||
-		request_lsns.not_modified_since < slot->request_lsns.not_modified_since)
+	if (request_lsns->effective_request_lsn < slot->request_lsns.effective_request_lsn ||
+		request_lsns->not_modified_since < slot->request_lsns.not_modified_since)
 	{
 		ereport(LOG,
 				(errcode(ERRCODE_IO_ERROR),
 				 errmsg(NEON_TAG "request with unexpected LSN after prefetch"),
 				 errdetail("Request %X/%X not_modified_since %X/%X, prefetch %X/%X not_modified_since %X/%X)",
-						   LSN_FORMAT_ARGS(request_lsns.effective_request_lsn),
-						   LSN_FORMAT_ARGS(request_lsns.not_modified_since),
+						   LSN_FORMAT_ARGS(request_lsns->effective_request_lsn),
+						   LSN_FORMAT_ARGS(request_lsns->not_modified_since),
 						   LSN_FORMAT_ARGS(slot->request_lsns.effective_request_lsn),
 						   LSN_FORMAT_ARGS(slot->request_lsns.not_modified_since))));
 		return false;
@@ -1817,9 +2127,9 @@ neon_prefetch_response_usable(neon_request_lsns request_lsns,
 	 */
 
 	/* this follows from the checks above */
-	Assert(request_lsns.effective_request_lsn >= slot->request_lsns.not_modified_since);
+	Assert(request_lsns->effective_request_lsn >= slot->request_lsns.not_modified_since);
 
-	return request_lsns.not_modified_since <= slot->request_lsns.effective_request_lsn;
+	return request_lsns->not_modified_since <= slot->request_lsns.effective_request_lsn;
 }
 
 /*
@@ -1886,7 +2196,8 @@ neon_exists(SMgrRelation reln, ForkNumber forkNum)
 		return false;
 	}
 
-	request_lsns = neon_get_request_lsns(InfoFromSMgrRel(reln), forkNum, REL_METADATA_PSEUDO_BLOCKNO);
+	neon_get_request_lsns(InfoFromSMgrRel(reln), forkNum,
+						  REL_METADATA_PSEUDO_BLOCKNO, &request_lsns, 1, NULL);
 	{
 		NeonExistsRequest request = {
 			.req.tag = T_NeonExistsRequest,
@@ -2068,7 +2379,7 @@ neon_extend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno,
 	 */
 	if (max_cluster_size > 0 &&
 		reln->smgr_relpersistence == RELPERSISTENCE_PERMANENT &&
-		!IsAutoVacuumWorkerProcess())
+		!AmAutoVacuumWorkerProcess())
 	{
 		uint64		current_size = GetNeonCurrentClusterSize();
 
@@ -2149,7 +2460,7 @@ neon_zeroextend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blocknum,
 
 	if (max_cluster_size > 0 &&
 		reln->smgr_relpersistence == RELPERSISTENCE_PERMANENT &&
-		!IsAutoVacuumWorkerProcess())
+		!AmAutoVacuumWorkerProcess())
 	{
 		uint64		current_size = GetNeonCurrentClusterSize();
 
@@ -2247,6 +2558,73 @@ neon_close(SMgrRelation reln, ForkNumber forknum)
 }
 
 
+#if PG_MAJORVERSION_NUM >= 17
+/*
+ *	neon_prefetch() -- Initiate asynchronous read of the specified block of a relation
+ */
+bool
+neon_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
+			  int nblocks)
+{
+	uint64		ring_index PG_USED_FOR_ASSERTS_ONLY;
+	BufferTag	tag;
+	bool		io_initiated = false;
+
+	switch (reln->smgr_relpersistence)
+	{
+		case 0:					/* probably shouldn't happen, but ignore it */
+		case RELPERSISTENCE_PERMANENT:
+			break;
+
+		case RELPERSISTENCE_TEMP:
+		case RELPERSISTENCE_UNLOGGED:
+			return mdprefetch(reln, forknum, blocknum, nblocks);
+
+		default:
+			neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
+	}
+
+	tag.spcOid = reln->smgr_rlocator.locator.spcOid;
+	tag.dbOid = reln->smgr_rlocator.locator.dbOid;
+	tag.relNumber = reln->smgr_rlocator.locator.relNumber;
+	tag.forkNum = forknum;
+
+	while (nblocks > 0)
+	{
+		int		iterblocks = Min(nblocks, PG_IOV_MAX);
+		int		seqlen = 0;
+		bits8		lfc_present[PG_IOV_MAX / 8];
+		memset(lfc_present, 0, sizeof(lfc_present));
+
+		if (lfc_cache_containsv(InfoFromSMgrRel(reln), forknum, blocknum,
+								iterblocks, lfc_present) == iterblocks)
+		{
+			nblocks -= iterblocks;
+			blocknum += iterblocks;
+			continue;
+		}
+
+		io_initiated = true;
+
+		tag.blockNum = blocknum;
+		
+		for (int i = 0; i < PG_IOV_MAX / 8; i++)
+			lfc_present[i] = ~(lfc_present[i]);
+
+		ring_index = prefetch_register_bufferv(tag, NULL, iterblocks,
+											   lfc_present);
+		nblocks -= iterblocks;
+		blocknum += iterblocks;
+
+		Assert(ring_index < MyPState->ring_unused &&
+			   MyPState->ring_last <= ring_index);
+	}
+
+	return false;
+}
+
+
+#else /* PG_MAJORVERSION_NUM >= 17 */
 /*
  *	neon_prefetch() -- Initiate asynchronous read of the specified block of a relation
  */
@@ -2285,6 +2663,8 @@ neon_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum)
 
 	return false;
 }
+#endif /* PG_MAJORVERSION_NUM < 17 */
+
 
 /*
  * neon_writeback() -- Tell the kernel to write pages back to storage.
@@ -2315,7 +2695,12 @@ neon_writeback(SMgrRelation reln, ForkNumber forknum,
 			neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
 	}
 
-	/* not implemented */
+	/*
+	 * TODO: WAL sync up to lwLsn for the indicated blocks
+	 * Without that sync, writeback doesn't actually guarantee the data is
+	 * persistently written, which does seem to be one of the assumed
+	 * properties of this smgr API call.
+	 */
 	neon_log(SmgrTrace, "writeback noop");
 
 #ifdef DEBUG_COMPARE_LOCAL
@@ -2324,30 +2709,27 @@ neon_writeback(SMgrRelation reln, ForkNumber forknum,
 #endif
 }
 
-/*
- * While function is defined in the neon extension it's used within neon_test_utils directly.
- * To avoid breaking tests in the runtime please keep function signature in sync.
- */
-void
+static void
 #if PG_MAJORVERSION_NUM < 16
-neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
-				 neon_request_lsns request_lsns, char *buffer)
+neon_read_at_lsnv(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber base_blockno, neon_request_lsns *request_lsns,
+				  char **buffers, BlockNumber nblocks, const bits8 *mask)
 #else
-neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
-				 neon_request_lsns request_lsns, void *buffer)
+neon_read_at_lsnv(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber base_blockno, neon_request_lsns *request_lsns,
+				  void **buffers, BlockNumber nblocks, const bits8 *mask)
 #endif
 {
 	NeonResponse *resp;
 	uint64		ring_index;
 	PrfHashEntry *entry;
 	PrefetchRequest *slot;
-	BufferTag	buftag =
-	{
-		.forkNum = forkNum,
-		.blockNum = blkno,
-	};
+	BufferTag	buftag = {0};
+
+	Assert(PointerIsValid(request_lsns));
+	Assert(nblocks >= 1);
 
 	CopyNRelFileInfoToBufTag(buftag, rinfo);
+	buftag.forkNum = forkNum;
+	buftag.blockNum = base_blockno;
 
 	/*
 	 * The redo process does not lock pages that it needs to replay but are
@@ -2365,115 +2747,147 @@ neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
 	 * weren't for the behaviour of the LwLsn cache that uses the highest
 	 * value of the LwLsn cache when the entry is not found.
 	 */
-	if (RecoveryInProgress() && !(MyBackendType == B_STARTUP))
-		XLogWaitForReplayOf(request_lsns.request_lsn);
+	prefetch_register_bufferv(buftag, request_lsns, nblocks, mask);
 
-	/*
-	 * Try to find prefetched page in the list of received pages.
-	 */
+	for (int i = 0; i < nblocks; i++)
+	{
+		void	   *buffer = buffers[i];
+		BlockNumber blockno = base_blockno + i;
+		neon_request_lsns *reqlsns = &request_lsns[i];
+
+		if (PointerIsValid(mask) && !BITMAP_ISSET(mask, i))
+			continue;
+
+		if (RecoveryInProgress() && MyBackendType != B_STARTUP)
+			XLogWaitForReplayOf(reqlsns[0].request_lsn);
+
+		/*
+		 * Try to find prefetched page in the list of received pages.
+		 */
 Retry:
-	entry = prfh_lookup(MyPState->prf_hash, (PrefetchRequest *) &buftag);
+		buftag.blockNum = blockno;
+		entry = prfh_lookup(MyPState->prf_hash, (PrefetchRequest *) &buftag);
 
-	if (entry != NULL)
-	{
-		slot = entry->slot;
-		if (neon_prefetch_response_usable(request_lsns, slot))
+		if (entry != NULL)
 		{
-			ring_index = slot->my_ring_index;
-			pgBufferUsage.prefetch.hits += 1;
-		}
-		else
-		{
-			/*
-			 * Cannot use this prefetch, discard it
-			 *
-			 * We can't drop cache for not-yet-received requested items. It is
-			 * unlikely this happens, but it can happen if prefetch distance
-			 * is large enough and a backend didn't consume all prefetch
-			 * requests.
-			 */
-			if (slot->status == PRFS_REQUESTED)
+			slot = entry->slot;
+			if (neon_prefetch_response_usable(reqlsns, slot))
 			{
-				if (!prefetch_wait_for(slot->my_ring_index))
-					goto Retry;
+				ring_index = slot->my_ring_index;
+				pgBufferUsage.prefetch.hits += 1;
+			}
+			else
+			{
+				/*
+				 * Cannot use this prefetch, discard it
+				 *
+				 * We can't drop cache for not-yet-received requested items. It is
+				 * unlikely this happens, but it can happen if prefetch distance
+				 * is large enough and a backend didn't consume all prefetch
+				 * requests.
+				 */
+				if (slot->status == PRFS_REQUESTED)
+				{
+					if (!prefetch_wait_for(slot->my_ring_index))
+						goto Retry;
+				}
+				/* drop caches */
+				prefetch_set_unused(slot->my_ring_index);
+				pgBufferUsage.prefetch.expired += 1;
+				/* make it look like a prefetch cache miss */
+				entry = NULL;
 			}
-			/* drop caches */
-			prefetch_set_unused(slot->my_ring_index);
-			pgBufferUsage.prefetch.expired += 1;
-			/* make it look like a prefetch cache miss */
-			entry = NULL;
 		}
-	}
 
-	do
-	{
-		if (entry == NULL)
+		do
 		{
-			pgBufferUsage.prefetch.misses += 1;
+			if (entry == NULL)
+			{
+				pgBufferUsage.prefetch.misses += 1;
 
-			ring_index = prefetch_register_buffer(buftag, &request_lsns);
-			slot = GetPrfSlot(ring_index);
-		}
-		else
+				ring_index = prefetch_register_bufferv(buftag, reqlsns, 1, NULL);
+				Assert(ring_index != UINT64_MAX);
+				slot = GetPrfSlot(ring_index);
+			}
+			else
+			{
+				/*
+				 * Empty our reference to the prefetch buffer's hash entry. When
+				 * we wait for prefetches, the entry reference is invalidated by
+				 * potential updates to the hash, and when we reconnect to the
+				 * pageserver the prefetch we're waiting for may be dropped, in
+				 * which case we need to retry and take the branch above.
+				 */
+				entry = NULL;
+			}
+
+			Assert(slot->my_ring_index == ring_index);
+			Assert(MyPState->ring_last <= ring_index &&
+				   MyPState->ring_unused > ring_index);
+			Assert(slot->status != PRFS_UNUSED);
+			Assert(GetPrfSlot(ring_index) == slot);
+
+		} while (!prefetch_wait_for(ring_index));
+
+		Assert(slot->status == PRFS_RECEIVED);
+		Assert(memcmp(&buftag, &slot->buftag, sizeof(BufferTag)) == 0);
+		Assert(buftag.blockNum == base_blockno + i);
+
+		resp = slot->response;
+
+		switch (resp->tag)
 		{
-			/*
-			 * Empty our reference to the prefetch buffer's hash entry. When
-			 * we wait for prefetches, the entry reference is invalidated by
-			 * potential updates to the hash, and when we reconnect to the
-			 * pageserver the prefetch we're waiting for may be dropped, in
-			 * which case we need to retry and take the branch above.
-			 */
-			entry = NULL;
+			case T_NeonGetPageResponse:
+				memcpy(buffer, ((NeonGetPageResponse *) resp)->page, BLCKSZ);
+				lfc_write(rinfo, forkNum, blockno, buffer);
+				break;
+
+			case T_NeonErrorResponse:
+				ereport(ERROR,
+						(errcode(ERRCODE_IO_ERROR),
+						 errmsg(NEON_TAG "[shard %d] could not read block %u in rel %u/%u/%u.%u from page server at lsn %X/%08X",
+								slot->shard_no, blockno, RelFileInfoFmt(rinfo),
+								forkNum, LSN_FORMAT_ARGS(reqlsns->effective_request_lsn)),
+						 errdetail("page server returned error: %s",
+								   ((NeonErrorResponse *) resp)->message)));
+				break;
+			default:
+				NEON_PANIC_CONNECTION_STATE(slot->shard_no, PANIC,
+											"Expected GetPage (0x%02x) or Error (0x%02x) response to GetPageRequest, but got 0x%02x",
+											T_NeonGetPageResponse, T_NeonErrorResponse, resp->tag);
 		}
 
-		Assert(slot->my_ring_index == ring_index);
-		Assert(MyPState->ring_last <= ring_index &&
-			   MyPState->ring_unused > ring_index);
-		Assert(slot->status != PRFS_UNUSED);
-		Assert(GetPrfSlot(ring_index) == slot);
-
-	} while (!prefetch_wait_for(ring_index));
-
-	Assert(slot->status == PRFS_RECEIVED);
-
-	resp = slot->response;
-
-	switch (resp->tag)
-	{
-		case T_NeonGetPageResponse:
-			memcpy(buffer, ((NeonGetPageResponse *) resp)->page, BLCKSZ);
-			lfc_write(rinfo, forkNum, blkno, buffer);
-			break;
-
-		case T_NeonErrorResponse:
-			ereport(ERROR,
-					(errcode(ERRCODE_IO_ERROR),
-					 errmsg(NEON_TAG "[shard %d] could not read block %u in rel %u/%u/%u.%u from page server at lsn %X/%08X",
-							slot->shard_no, blkno,
-							RelFileInfoFmt(rinfo),
-							forkNum,
-							LSN_FORMAT_ARGS(request_lsns.effective_request_lsn)),
-					 errdetail("page server returned error: %s",
-							   ((NeonErrorResponse *) resp)->message)));
-			break;
-		default:
-			NEON_PANIC_CONNECTION_STATE(slot->shard_no, PANIC,
-										"Expected GetPage (0x%02x) or Error (0x%02x) response to GetPageRequest, but got 0x%02x",
-										T_NeonGetPageResponse, T_NeonErrorResponse, resp->tag);
+		/* buffer was used, clean up for later reuse */
+		prefetch_set_unused(ring_index);
+		prefetch_cleanup_trailing_unused();
 	}
-
-	/* buffer was used, clean up for later reuse */
-	prefetch_set_unused(ring_index);
-	prefetch_cleanup_trailing_unused();
 }
 
 /*
- *	neon_read() -- Read the specified block from a relation.
+ * While function is defined in the neon extension it's used within neon_test_utils directly.
+ * To avoid breaking tests in the runtime please keep function signature in sync.
  */
 void
 #if PG_MAJORVERSION_NUM < 16
+neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
+				 neon_request_lsns request_lsns, char *buffer)
+#else
+neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
+				 neon_request_lsns request_lsns, void *buffer)
+#endif
+{
+	neon_read_at_lsnv(rinfo, forkNum, blkno, &request_lsns, &buffer, 1, NULL);
+}
+
+#if PG_MAJORVERSION_NUM < 17
+/*
+ *	neon_read() -- Read the specified block from a relation.
+ */
+#if PG_MAJORVERSION_NUM < 16
+void
 neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, char *buffer)
 #else
+void
 neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void *buffer)
 #endif
 {
@@ -2502,7 +2916,7 @@ neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void *buffer
 		return;
 	}
 
-	request_lsns = neon_get_request_lsns(InfoFromSMgrRel(reln), forkNum, blkno);
+	neon_get_request_lsns(InfoFromSMgrRel(reln), forkNum, blkno, &request_lsns, 1, NULL);
 	neon_read_at_lsn(InfoFromSMgrRel(reln), forkNum, blkno, request_lsns, buffer);
 
 #ifdef DEBUG_COMPARE_LOCAL
@@ -2578,6 +2992,148 @@ neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void *buffer
 	}
 #endif
 }
+#endif /* PG_MAJORVERSION_NUM <= 16 */
+
+#if PG_MAJORVERSION_NUM >= 17
+void
+neon_readv(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
+		void **buffers, BlockNumber nblocks)
+{
+	bits8		read[PG_IOV_MAX / 8];
+	neon_request_lsns request_lsns[PG_IOV_MAX];
+	int			lfc_result;
+
+	switch (reln->smgr_relpersistence)
+	{
+		case 0:
+			neon_log(ERROR, "cannot call smgrread() on rel with unknown persistence");
+
+		case RELPERSISTENCE_PERMANENT:
+			break;
+
+		case RELPERSISTENCE_TEMP:
+		case RELPERSISTENCE_UNLOGGED:
+			mdreadv(reln, forknum, blocknum, buffers, nblocks);
+			return;
+
+		default:
+			neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
+	}
+
+	if (nblocks > PG_IOV_MAX)
+		neon_log(ERROR, "Read request too large: %d is larger than max %d",
+				 nblocks, PG_IOV_MAX);
+
+	memset(read, 0, sizeof(read));
+
+	/* Try to read from local file cache */
+	lfc_result = lfc_readv_select(InfoFromSMgrRel(reln), forknum, blocknum, buffers,
+								  nblocks, read);
+
+	/* Read all blocks from LFC, so we're done */
+	if (lfc_result == nblocks)
+		return;
+
+	if (lfc_result == -1)
+	{
+		/* can't use the LFC result, so read all blocks from PS */
+		for (int i = 0; i < PG_IOV_MAX / 8; i++)
+			read[i] = 0xFF;
+	}
+	else
+	{
+		/* invert the result: exclude blocks read from lfc */
+		for (int i = 0; i < PG_IOV_MAX / 8; i++)
+			read[i] = ~(read[i]);
+	}
+
+	neon_get_request_lsns(InfoFromSMgrRel(reln), forknum, blocknum,
+						  request_lsns, nblocks, read);
+
+	neon_read_at_lsnv(InfoFromSMgrRel(reln), forknum, blocknum, request_lsns,
+					  buffers, nblocks, read);
+
+#ifdef DEBUG_COMPARE_LOCAL
+	if (forkNum == MAIN_FORKNUM && IS_LOCAL_REL(reln))
+	{
+		char		pageserver_masked[BLCKSZ];
+		char		mdbuf[BLCKSZ];
+		char		mdbuf_masked[BLCKSZ];
+
+		for (int i = 0; i < nblocks; i++)
+		{
+#if PG_MAJORVERSION_NUM >= 17
+			mdreadv(reln, forkNum, blkno + i, &mdbuf, 1);
+#else
+			mdread(reln, forkNum, blkno + i, mdbuf);
+#endif
+
+			memcpy(pageserver_masked, buffer, BLCKSZ);
+			memcpy(mdbuf_masked, mdbuf, BLCKSZ);
+
+			if (PageIsNew((Page) mdbuf))
+			{
+				if (!PageIsNew((Page) pageserver_masked))
+				{
+					neon_log(PANIC, "page is new in MD but not in Page Server at blk %u in rel %u/%u/%u fork %u (request LSN %X/%08X):\n%s\n",
+						 blkno,
+						 RelFileInfoFmt(InfoFromSMgrRel(reln)),
+						 forkNum,
+						 (uint32) (request_lsn >> 32), (uint32) request_lsn,
+						 hexdump_page(buffer));
+				}
+			}
+			else if (PageIsNew((Page) buffer))
+			{
+				neon_log(PANIC, "page is new in Page Server but not in MD at blk %u in rel %u/%u/%u fork %u (request LSN %X/%08X):\n%s\n",
+					 blkno,
+					 RelFileInfoFmt(InfoFromSMgrRel(reln)),
+					 forkNum,
+					 (uint32) (request_lsn >> 32), (uint32) request_lsn,
+					 hexdump_page(mdbuf));
+			}
+			else if (PageGetSpecialSize(mdbuf) == 0)
+			{
+				/* assume heap */
+				RmgrTable[RM_HEAP_ID].rm_mask(mdbuf_masked, blkno);
+				RmgrTable[RM_HEAP_ID].rm_mask(pageserver_masked, blkno);
+	
+				if (memcmp(mdbuf_masked, pageserver_masked, BLCKSZ) != 0)
+				{
+					neon_log(PANIC, "heap buffers differ at blk %u in rel %u/%u/%u fork %u (request LSN %X/%08X):\n------ MD ------\n%s\n------ Page Server ------\n%s\n",
+						 blkno,
+						 RelFileInfoFmt(InfoFromSMgrRel(reln)),
+						 forkNum,
+						 (uint32) (request_lsn >> 32), (uint32) request_lsn,
+						 hexdump_page(mdbuf_masked),
+						 hexdump_page(pageserver_masked));
+				}
+			}
+			else if (PageGetSpecialSize(mdbuf) == MAXALIGN(sizeof(BTPageOpaqueData)))
+			{
+				if (((BTPageOpaqueData *) PageGetSpecialPointer(mdbuf))->btpo_cycleid < MAX_BT_CYCLE_ID)
+				{
+					/* assume btree */
+					RmgrTable[RM_BTREE_ID].rm_mask(mdbuf_masked, blkno);
+					RmgrTable[RM_BTREE_ID].rm_mask(pageserver_masked, blkno);
+	
+					if (memcmp(mdbuf_masked, pageserver_masked, BLCKSZ) != 0)
+					{
+						neon_log(PANIC, "btree buffers differ at blk %u in rel %u/%u/%u fork %u (request LSN %X/%08X):\n------ MD ------\n%s\n------ Page Server ------\n%s\n",
+							 blkno,
+							 RelFileInfoFmt(InfoFromSMgrRel(reln)),
+							 forkNum,
+							 (uint32) (request_lsn >> 32), (uint32) request_lsn,
+							 hexdump_page(mdbuf_masked),
+							 hexdump_page(pageserver_masked));
+					}
+				}
+			}
+		}
+	}
+#endif
+}
+#endif
 
 #ifdef DEBUG_COMPARE_LOCAL
 static char *
@@ -2623,7 +3179,72 @@ neon_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const vo
 			if (mdexists(reln, forknum))
 			{
 				/* It exists locally. Guess it's unlogged then. */
+#if PG_MAJORVERSION_NUM >= 17
+				mdwritev(reln, forknum, blocknum, &buffer, 1, skipFsync);
+#else
 				mdwrite(reln, forknum, blocknum, buffer, skipFsync);
+#endif
+				/*
+				 * We could set relpersistence now that we have determined
+				 * that it's local. But we don't dare to do it, because that
+				 * would immediately allow reads as well, which shouldn't
+				 * happen. We could cache it with a different 'relpersistence'
+				 * value, but this isn't performance critical.
+				 */
+				return;
+			}
+			break;
+
+		case RELPERSISTENCE_PERMANENT:
+			break;
+
+		case RELPERSISTENCE_TEMP:
+		case RELPERSISTENCE_UNLOGGED:
+			#if PG_MAJORVERSION_NUM >= 17
+			mdwritev(reln, forknum, blocknum, &buffer, 1, skipFsync);
+			#else
+			mdwrite(reln, forknum, blocknum, buffer, skipFsync);
+			#endif
+			return;
+		default:
+			neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
+	}
+
+	neon_wallog_page(reln, forknum, blocknum, buffer, false);
+
+	lsn = PageGetLSN((Page) buffer);
+	neon_log(SmgrTrace, "smgrwrite called for %u/%u/%u.%u blk %u, page LSN: %X/%08X",
+		 RelFileInfoFmt(InfoFromSMgrRel(reln)),
+		 forknum, blocknum,
+		 (uint32) (lsn >> 32), (uint32) lsn);
+
+	lfc_write(InfoFromSMgrRel(reln), forknum, blocknum, buffer);
+
+#ifdef DEBUG_COMPARE_LOCAL
+	if (IS_LOCAL_REL(reln))
+		#if PG_MAJORVERSION_NUM >= 17
+		mdwritev(reln, forknum, blocknum, &buffer, 1, skipFsync);
+		#else
+		mdwrite(reln, forknum, blocknum, buffer, skipFsync);
+		#endif
+#endif
+}
+
+
+
+#if PG_MAJORVERSION_NUM >= 17
+void
+neon_writev(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno,
+			 const void **buffers, BlockNumber nblocks, bool skipFsync)
+{
+	switch (reln->smgr_relpersistence)
+	{
+		case 0:
+			/* This is a bit tricky. Check if the relation exists locally */
+			if (mdexists(reln, forknum))
+			{
+				/* It exists locally. Guess it's unlogged then. */
+				mdwritev(reln, forknum, blkno, buffers, nblocks, skipFsync);
 
 				/*
 				 * We could set relpersistence now that we have determined
@@ -2641,29 +3262,24 @@ neon_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const vo
 
 		case RELPERSISTENCE_TEMP:
 		case RELPERSISTENCE_UNLOGGED:
-			mdwrite(reln, forknum, blocknum, buffer, skipFsync);
+			mdwritev(reln, forknum, blkno, buffers, nblocks, skipFsync);
 			return;
-
 		default:
 			neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
 	}
 
-	neon_wallog_page(reln, forknum, blocknum, buffer, false);
+	neon_wallog_pagev(reln, forknum, blkno, nblocks, (const char **) buffers, false);
 
-	lsn = PageGetLSN((Page) buffer);
-	neon_log(SmgrTrace, "smgrwrite called for %u/%u/%u.%u blk %u, page LSN: %X/%08X",
-		 RelFileInfoFmt(InfoFromSMgrRel(reln)),
-		 forknum, blocknum,
-		 (uint32) (lsn >> 32), (uint32) lsn);
-
-	lfc_write(InfoFromSMgrRel(reln), forknum, blocknum, buffer);
+	lfc_writev(InfoFromSMgrRel(reln), forknum, blkno, buffers, nblocks);
 
 #ifdef DEBUG_COMPARE_LOCAL
 	if (IS_LOCAL_REL(reln))
-		mdwrite(reln, forknum, blocknum, buffer, skipFsync);
+		mdwritev(reln, forknum, blocknum, &buffer, 1, skipFsync);
 #endif
 }
 
+#endif
+
 /*
  *	neon_nblocks() -- Get the number of blocks stored in a relation.
  */
@@ -2699,7 +3315,9 @@ neon_nblocks(SMgrRelation reln, ForkNumber forknum)
 		return n_blocks;
 	}
 
-	request_lsns = neon_get_request_lsns(InfoFromSMgrRel(reln), forknum, REL_METADATA_PSEUDO_BLOCKNO);
+	neon_get_request_lsns(InfoFromSMgrRel(reln), forknum,
+						  REL_METADATA_PSEUDO_BLOCKNO, &request_lsns, 1, NULL);
+
 	{
 		NeonNblocksRequest request = {
 			.req.tag = T_NeonNblocksRequest,
@@ -2757,7 +3375,9 @@ neon_dbsize(Oid dbNode)
 	neon_request_lsns request_lsns;
 	NRelFileInfo dummy_node = {0};
 
-	request_lsns = neon_get_request_lsns(dummy_node, MAIN_FORKNUM, REL_METADATA_PSEUDO_BLOCKNO);
+	neon_get_request_lsns(dummy_node, MAIN_FORKNUM,
+						  REL_METADATA_PSEUDO_BLOCKNO, &request_lsns, 1, NULL);
+
 	{
 		NeonDbSizeRequest request = {
 			.req.tag = T_NeonDbSizeRequest,
@@ -2898,6 +3518,38 @@ neon_immedsync(SMgrRelation reln, ForkNumber forknum)
 #endif
 }
 
+#if PG_MAJORVERSION_NUM >= 17
+void
+neon_regisersync(SMgrRelation reln, ForkNumber forknum)
+{
+	switch (reln->smgr_relpersistence)
+	{
+		case 0:
+			neon_log(ERROR, "cannot call smgrregistersync() on rel with unknown persistence");
+			break;
+
+		case RELPERSISTENCE_PERMANENT:
+			break;
+
+		case RELPERSISTENCE_TEMP:
+		case RELPERSISTENCE_UNLOGGED:
+			mdregistersync(reln, forknum);
+			return;
+
+		default:
+			neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
+	}
+
+	neon_log(SmgrTrace, "[NEON_SMGR] registersync noop");
+
+#ifdef DEBUG_COMPARE_LOCAL
+	if (IS_LOCAL_REL(reln))
+		mdimmedsync(reln, forknum);
+#endif
+}
+#endif
+
+
 /*
  * neon_start_unlogged_build() -- Starting build operation on a rel.
  *
@@ -3047,8 +3699,11 @@ neon_end_unlogged_build(SMgrRelation reln)
 static int
 neon_read_slru_segment(SMgrRelation reln, const char* path, int segno, void* buffer)
 {
-	XLogRecPtr request_lsn,
-		not_modified_since;
+	XLogRecPtr	request_lsn,
+				not_modified_since;
+	SlruKind	kind;
+	int			n_blocks;
+	shardno_t	shard_no = 0; /* All SLRUs are at shard 0 */
 
 	/*
 	 * Compute a request LSN to use, similar to neon_get_request_lsns() but the
@@ -3078,32 +3733,30 @@ neon_read_slru_segment(SMgrRelation reln, const char* path, int segno, void* buf
 	 */
 	not_modified_since = nm_adjust_lsn(GetRedoStartLsn());
 
-	SlruKind kind;
-
-    if (STRPREFIX(path, "pg_xact"))
-        kind = SLRU_CLOG;
-    else if (STRPREFIX(path, "pg_multixact/members"))
-        kind = SLRU_MULTIXACT_MEMBERS;
-    else if (STRPREFIX(path, "pg_multixact/offsets"))
-        kind = SLRU_MULTIXACT_OFFSETS;
-    else
-        return -1;
+	if (STRPREFIX(path, "pg_xact"))
+		kind = SLRU_CLOG;
+	else if (STRPREFIX(path, "pg_multixact/members"))
+		kind = SLRU_MULTIXACT_MEMBERS;
+	else if (STRPREFIX(path, "pg_multixact/offsets"))
+		kind = SLRU_MULTIXACT_OFFSETS;
+	else
+		return -1;
 
 	NeonResponse *resp;
 	NeonGetSlruSegmentRequest request = {
 		.req.tag = T_NeonGetSlruSegmentRequest,
 		.req.lsn = request_lsn,
 		.req.not_modified_since = not_modified_since,
-
 		.kind = kind,
 		.segno = segno
 	};
-	int n_blocks;
-	shardno_t shard_no = 0; /* All SLRUs are at shard 0 */
+
 	do
 	{
 		while (!page_server->send(shard_no, &request.req) || !page_server->flush(shard_no));
+
 		consume_prefetch_responses();
+
 		resp = page_server->receive(shard_no);
 	} while (resp == NULL);
 
@@ -3182,14 +3835,23 @@ static const struct f_smgr neon_smgr =
 #if PG_MAJORVERSION_NUM >= 16
 	.smgr_zeroextend = neon_zeroextend,
 #endif
+#if PG_MAJORVERSION_NUM >= 17
+	.smgr_prefetch = neon_prefetch,
+	.smgr_readv = neon_readv,
+	.smgr_writev = neon_writev,
+#else
 	.smgr_prefetch = neon_prefetch,
 	.smgr_read = neon_read,
 	.smgr_write = neon_write,
+#endif
+
 	.smgr_writeback = neon_writeback,
 	.smgr_nblocks = neon_nblocks,
 	.smgr_truncate = neon_truncate,
 	.smgr_immedsync = neon_immedsync,
-
+#if PG_MAJORVERSION_NUM >= 17
+	.smgr_registersync = neon_regisersync,
+#endif
 	.smgr_start_unlogged_build = neon_start_unlogged_build,
 	.smgr_finish_unlogged_build_phase_1 = neon_finish_unlogged_build_phase_1,
 	.smgr_end_unlogged_build = neon_end_unlogged_build,
@@ -3198,11 +3860,11 @@ static const struct f_smgr neon_smgr =
 };
 
 const f_smgr *
-smgr_neon(BackendId backend, NRelFileInfo rinfo)
+smgr_neon(ProcNumber backend, NRelFileInfo rinfo)
 {
 
 	/* Don't use page server for temp relations */
-	if (backend != InvalidBackendId)
+	if (backend != INVALID_PROC_NUMBER)
 		return smgr_standard(backend, rinfo);
 	else
 		return &neon_smgr;
diff --git a/pgxn/neon/walproposer_pg.c b/pgxn/neon/walproposer_pg.c
index 65ef588ba5..4d0d06e6de 100644
--- a/pgxn/neon/walproposer_pg.c
+++ b/pgxn/neon/walproposer_pg.c
@@ -81,6 +81,7 @@ static void nwp_register_gucs(void);
 static void assign_neon_safekeepers(const char *newval, void *extra);
 static void nwp_prepare_shmem(void);
 static uint64 backpressure_lag_impl(void);
+static uint64 startup_backpressure_wrap(void);
 static bool backpressure_throttling_impl(void);
 static void walprop_register_bgworker(void);
 
@@ -90,7 +91,7 @@ static void walprop_pg_init_bgworker(void);
 static TimestampTz walprop_pg_get_current_timestamp(WalProposer *wp);
 static void walprop_pg_load_libpqwalreceiver(void);
 
-static process_interrupts_callback_t PrevProcessInterruptsCallback;
+static process_interrupts_callback_t PrevProcessInterruptsCallback = NULL;
 static shmem_startup_hook_type prev_shmem_startup_hook_type;
 #if PG_VERSION_NUM >= 150000
 static shmem_request_hook_type prev_shmem_request_hook = NULL;
@@ -178,7 +179,7 @@ pg_init_walproposer(void)
 
 	nwp_prepare_shmem();
 
-	delay_backend_us = &backpressure_lag_impl;
+	delay_backend_us = &startup_backpressure_wrap;
 	PrevProcessInterruptsCallback = ProcessInterruptsCallback;
 	ProcessInterruptsCallback = backpressure_throttling_impl;
 
@@ -352,6 +353,22 @@ backpressure_lag_impl(void)
 	return 0;
 }
 
+/*
+ * We don't apply backpressure when we're the postmaster, or the startup
+ * process, because in postmaster we can't apply backpressure, and in
+ * the startup process we can't afford to slow down.
+ */
+static uint64
+startup_backpressure_wrap(void)
+{
+	if (AmStartupProcess() || !IsUnderPostmaster)
+		return 0;
+
+	delay_backend_us = &backpressure_lag_impl;
+
+	return backpressure_lag_impl();
+}
+
 /*
  * WalproposerShmemSize --- report amount of shared memory space needed
  */
@@ -401,12 +418,13 @@ WalproposerShmemInit_SyncSafekeeper(void)
 static bool
 backpressure_throttling_impl(void)
 {
-	int64		lag;
+	uint64		lag;
 	TimestampTz start,
 				stop;
-	bool		retry = PrevProcessInterruptsCallback
-		? PrevProcessInterruptsCallback()
-		: false;
+	bool		retry = false;
+
+	if (PointerIsValid(PrevProcessInterruptsCallback))
+		retry = PrevProcessInterruptsCallback();
 
 	/*
 	 * Don't throttle read only transactions or wal sender. Do throttle CREATE
@@ -602,7 +620,12 @@ walprop_pg_init_walsender(void)
 	/* Create replication slot for WAL proposer if not exists */
 	if (SearchNamedReplicationSlot(WAL_PROPOSER_SLOT_NAME, false) == NULL)
 	{
+#if PG_MAJORVERSION_NUM >= 17
+		ReplicationSlotCreate(WAL_PROPOSER_SLOT_NAME, false, RS_PERSISTENT,
+							  false, false, false);
+#else
 		ReplicationSlotCreate(WAL_PROPOSER_SLOT_NAME, false, RS_PERSISTENT, false);
+#endif
 		ReplicationSlotReserveWal();
 		/* Write this slot to disk */
 		ReplicationSlotMarkDirty();
@@ -1509,7 +1532,11 @@ walprop_pg_init_event_set(WalProposer *wp)
 		wpg_log(FATAL, "double-initialization of event set");
 
 	/* for each sk, we have socket plus potentially socket for neon walreader */
+#if PG_MAJORVERSION_NUM >= 17
+	waitEvents = CreateWaitEventSet(NULL, 2 + 2 * wp->n_safekeepers);
+#else
 	waitEvents = CreateWaitEventSet(TopMemoryContext, 2 + 2 * wp->n_safekeepers);
+#endif
 	AddWaitEventToSet(waitEvents, WL_LATCH_SET, PGINVALID_SOCKET,
 					  MyLatch, NULL);
 	AddWaitEventToSet(waitEvents, WL_EXIT_ON_PM_DEATH, PGINVALID_SOCKET,
diff --git a/pgxn/neon_rmgr/neon_rmgr_decode.c b/pgxn/neon_rmgr/neon_rmgr_decode.c
index f327e132e9..66032c88f6 100644
--- a/pgxn/neon_rmgr/neon_rmgr_decode.c
+++ b/pgxn/neon_rmgr/neon_rmgr_decode.c
@@ -1,6 +1,7 @@
 #include "postgres.h"
 
 #if PG_MAJORVERSION_NUM >= 16
+
 #include "access/heapam_xlog.h"
 #include "access/neon_xlog.h"
 #include "replication/decode.h"
@@ -9,6 +10,10 @@
 
 #include "neon_rmgr.h"
 
+#endif /* PG >= 16 */
+
+#if PG_MAJORVERSION_NUM == 16
+
 /* individual record(group)'s handlers */
 static void DecodeNeonInsert(LogicalDecodingContext *ctx, XLogRecordBuffer *buf);
 static void DecodeNeonUpdate(LogicalDecodingContext *ctx, XLogRecordBuffer *buf);
@@ -399,6 +404,398 @@ DecodeXLogTuple(char *data, Size len, ReorderBufferTupleBuf *tuple)
 	header->t_infomask2 = xlhdr.t_infomask2;
 	header->t_hoff = xlhdr.t_hoff;
 }
+#endif
+
+#if PG_MAJORVERSION_NUM == 17
+
+/* individual record(group)'s handlers */
+static void DecodeNeonInsert(LogicalDecodingContext *ctx, XLogRecordBuffer *buf);
+static void DecodeNeonUpdate(LogicalDecodingContext *ctx, XLogRecordBuffer *buf);
+static void DecodeNeonDelete(LogicalDecodingContext *ctx, XLogRecordBuffer *buf);
+static void DecodeNeonMultiInsert(LogicalDecodingContext *ctx, XLogRecordBuffer *buf);
+
+/* common function to decode tuples */
+static void DecodeXLogTuple(char *data, Size len, HeapTuple tuple);
 
 
-#endif
\ No newline at end of file
+void
+neon_rm_decode(LogicalDecodingContext *ctx, XLogRecordBuffer *buf)
+{
+	uint8		info = XLogRecGetInfo(buf->record) & XLOG_NEON_OPMASK;
+	TransactionId xid = XLogRecGetXid(buf->record);
+	SnapBuild  *builder = ctx->snapshot_builder;
+
+	ReorderBufferProcessXid(ctx->reorder, xid, buf->origptr);
+
+	/*
+	 * If we don't have snapshot or we are just fast-forwarding, there is no
+	 * point in decoding data changes.
+	 */
+	if (SnapBuildCurrentState(builder) < SNAPBUILD_FULL_SNAPSHOT ||
+		ctx->fast_forward)
+		return;
+
+	switch (info)
+	{
+		case XLOG_NEON_HEAP_INSERT:
+			if (SnapBuildProcessChange(builder, xid, buf->origptr))
+				DecodeNeonInsert(ctx, buf);
+			break;
+		case XLOG_NEON_HEAP_DELETE:
+			if (SnapBuildProcessChange(builder, xid, buf->origptr))
+				DecodeNeonDelete(ctx, buf);
+			break;
+		case XLOG_NEON_HEAP_UPDATE:
+		case XLOG_NEON_HEAP_HOT_UPDATE:
+			if (SnapBuildProcessChange(builder, xid, buf->origptr))
+				DecodeNeonUpdate(ctx, buf);
+			break;
+		case XLOG_NEON_HEAP_LOCK:
+			break;
+		case XLOG_NEON_HEAP_MULTI_INSERT:
+			if (SnapBuildProcessChange(builder, xid, buf->origptr))
+				DecodeNeonMultiInsert(ctx, buf);
+			break;
+		default:
+			elog(ERROR, "unexpected RM_HEAP_ID record type: %u", info);
+			break;
+	}
+}
+
+static inline bool
+FilterByOrigin(LogicalDecodingContext *ctx, RepOriginId origin_id)
+{
+	if (ctx->callbacks.filter_by_origin_cb == NULL)
+		return false;
+
+	return filter_by_origin_cb_wrapper(ctx, origin_id);
+}
+
+/*
+ * Parse XLOG_HEAP_INSERT (not MULTI_INSERT!) records into tuplebufs.
+ *
+ * Deletes can contain the new tuple.
+ */
+static void
+DecodeNeonInsert(LogicalDecodingContext *ctx, XLogRecordBuffer *buf)
+{
+	Size		datalen;
+	char	   *tupledata;
+	Size		tuplelen;
+	XLogReaderState *r = buf->record;
+	xl_neon_heap_insert *xlrec;
+	ReorderBufferChange *change;
+	RelFileLocator target_locator;
+
+	xlrec = (xl_neon_heap_insert *) XLogRecGetData(r);
+
+	/*
+	 * Ignore insert records without new tuples (this does happen when
+	 * raw_heap_insert marks the TOAST record as HEAP_INSERT_NO_LOGICAL).
+	 */
+	if (!(xlrec->flags & XLH_INSERT_CONTAINS_NEW_TUPLE))
+		return;
+
+	/* only interested in our database */
+	XLogRecGetBlockTag(r, 0, &target_locator, NULL, NULL);
+	if (target_locator.dbOid != ctx->slot->data.database)
+		return;
+
+	/* output plugin doesn't look for this origin, no need to queue */
+	if (FilterByOrigin(ctx, XLogRecGetOrigin(r)))
+		return;
+
+	change = ReorderBufferGetChange(ctx->reorder);
+	if (!(xlrec->flags & XLH_INSERT_IS_SPECULATIVE))
+		change->action = REORDER_BUFFER_CHANGE_INSERT;
+	else
+		change->action = REORDER_BUFFER_CHANGE_INTERNAL_SPEC_INSERT;
+	change->origin_id = XLogRecGetOrigin(r);
+
+	memcpy(&change->data.tp.rlocator, &target_locator, sizeof(RelFileLocator));
+
+	tupledata = XLogRecGetBlockData(r, 0, &datalen);
+	tuplelen = datalen - SizeOfHeapHeader;
+
+	change->data.tp.newtuple =
+		ReorderBufferGetTupleBuf(ctx->reorder, tuplelen);
+
+	DecodeXLogTuple(tupledata, datalen, change->data.tp.newtuple);
+
+	change->data.tp.clear_toast_afterwards = true;
+
+	ReorderBufferQueueChange(ctx->reorder, XLogRecGetXid(r), buf->origptr,
+							 change,
+							 xlrec->flags & XLH_INSERT_ON_TOAST_RELATION);
+}
+
+/*
+ * Parse XLOG_HEAP_DELETE from wal into proper tuplebufs.
+ *
+ * Deletes can possibly contain the old primary key.
+ */
+static void
+DecodeNeonDelete(LogicalDecodingContext *ctx, XLogRecordBuffer *buf)
+{
+	XLogReaderState *r = buf->record;
+	xl_neon_heap_delete *xlrec;
+	ReorderBufferChange *change;
+	RelFileLocator target_locator;
+
+	xlrec = (xl_neon_heap_delete *) XLogRecGetData(r);
+
+	/* only interested in our database */
+	XLogRecGetBlockTag(r, 0, &target_locator, NULL, NULL);
+	if (target_locator.dbOid != ctx->slot->data.database)
+		return;
+
+	/* output plugin doesn't look for this origin, no need to queue */
+	if (FilterByOrigin(ctx, XLogRecGetOrigin(r)))
+		return;
+
+	change = ReorderBufferGetChange(ctx->reorder);
+
+	if (xlrec->flags & XLH_DELETE_IS_SUPER)
+		change->action = REORDER_BUFFER_CHANGE_INTERNAL_SPEC_ABORT;
+	else
+		change->action = REORDER_BUFFER_CHANGE_DELETE;
+
+	change->origin_id = XLogRecGetOrigin(r);
+
+	memcpy(&change->data.tp.rlocator, &target_locator, sizeof(RelFileLocator));
+
+	/* old primary key stored */
+	if (xlrec->flags & XLH_DELETE_CONTAINS_OLD)
+	{
+		Size		datalen = XLogRecGetDataLen(r) - SizeOfNeonHeapHeader;
+		Size		tuplelen = datalen - SizeOfNeonHeapHeader;
+
+		Assert(XLogRecGetDataLen(r) > (SizeOfNeonHeapDelete + SizeOfNeonHeapHeader));
+
+		change->data.tp.oldtuple =
+			ReorderBufferGetTupleBuf(ctx->reorder, tuplelen);
+
+		DecodeXLogTuple((char *) xlrec + SizeOfNeonHeapDelete,
+						datalen, change->data.tp.oldtuple);
+	}
+
+	change->data.tp.clear_toast_afterwards = true;
+
+	ReorderBufferQueueChange(ctx->reorder, XLogRecGetXid(r), buf->origptr,
+							 change, false);
+}
+
+/*
+ * Parse XLOG_HEAP_UPDATE and XLOG_HEAP_HOT_UPDATE, which have the same layout
+ * in the record, from wal into proper tuplebufs.
+ *
+ * Updates can possibly contain a new tuple and the old primary key.
+ */
+static void
+DecodeNeonUpdate(LogicalDecodingContext *ctx, XLogRecordBuffer *buf)
+{
+	XLogReaderState *r = buf->record;
+	xl_neon_heap_update *xlrec;
+	ReorderBufferChange *change;
+	char	   *data;
+	RelFileLocator target_locator;
+
+	xlrec = (xl_neon_heap_update *) XLogRecGetData(r);
+
+	/* only interested in our database */
+	XLogRecGetBlockTag(r, 0, &target_locator, NULL, NULL);
+	if (target_locator.dbOid != ctx->slot->data.database)
+		return;
+
+	/* output plugin doesn't look for this origin, no need to queue */
+	if (FilterByOrigin(ctx, XLogRecGetOrigin(r)))
+		return;
+
+	change = ReorderBufferGetChange(ctx->reorder);
+	change->action = REORDER_BUFFER_CHANGE_UPDATE;
+	change->origin_id = XLogRecGetOrigin(r);
+	memcpy(&change->data.tp.rlocator, &target_locator, sizeof(RelFileLocator));
+
+	if (xlrec->flags & XLH_UPDATE_CONTAINS_NEW_TUPLE)
+	{
+		Size		datalen;
+		Size		tuplelen;
+
+		data = XLogRecGetBlockData(r, 0, &datalen);
+
+		tuplelen = datalen - SizeOfNeonHeapHeader;
+
+		change->data.tp.newtuple =
+			ReorderBufferGetTupleBuf(ctx->reorder, tuplelen);
+
+		DecodeXLogTuple(data, datalen, change->data.tp.newtuple);
+	}
+
+	if (xlrec->flags & XLH_UPDATE_CONTAINS_OLD)
+	{
+		Size		datalen;
+		Size		tuplelen;
+
+		/* caution, remaining data in record is not aligned */
+		data = XLogRecGetData(r) + SizeOfNeonHeapUpdate;
+		datalen = XLogRecGetDataLen(r) - SizeOfNeonHeapUpdate;
+		tuplelen = datalen - SizeOfNeonHeapHeader;
+
+		change->data.tp.oldtuple =
+			ReorderBufferGetTupleBuf(ctx->reorder, tuplelen);
+
+		DecodeXLogTuple(data, datalen, change->data.tp.oldtuple);
+	}
+
+	change->data.tp.clear_toast_afterwards = true;
+
+	ReorderBufferQueueChange(ctx->reorder, XLogRecGetXid(r), buf->origptr,
+							 change, false);
+}
+
+/*
+ * Decode XLOG_HEAP2_MULTI_INSERT_insert record into multiple tuplebufs.
+ *
+ * Currently MULTI_INSERT will always contain the full tuples.
+ */
+static void
+DecodeNeonMultiInsert(LogicalDecodingContext *ctx, XLogRecordBuffer *buf)
+{
+	XLogReaderState *r = buf->record;
+	xl_neon_heap_multi_insert *xlrec;
+	int			i;
+	char	   *data;
+	char	   *tupledata;
+	Size		tuplelen;
+	RelFileLocator rlocator;
+
+	xlrec = (xl_neon_heap_multi_insert *) XLogRecGetData(r);
+
+	/*
+	 * Ignore insert records without new tuples.  This happens when a
+	 * multi_insert is done on a catalog or on a non-persistent relation.
+	 */
+	if (!(xlrec->flags & XLH_INSERT_CONTAINS_NEW_TUPLE))
+		return;
+
+	/* only interested in our database */
+	XLogRecGetBlockTag(r, 0, &rlocator, NULL, NULL);
+	if (rlocator.dbOid != ctx->slot->data.database)
+		return;
+
+	/* output plugin doesn't look for this origin, no need to queue */
+	if (FilterByOrigin(ctx, XLogRecGetOrigin(r)))
+		return;
+
+	/*
+	 * We know that this multi_insert isn't for a catalog, so the block should
+	 * always have data even if a full-page write of it is taken.
+	 */
+	tupledata = XLogRecGetBlockData(r, 0, &tuplelen);
+	Assert(tupledata != NULL);
+
+	data = tupledata;
+	for (i = 0; i < xlrec->ntuples; i++)
+	{
+		ReorderBufferChange *change;
+		xl_neon_multi_insert_tuple *xlhdr;
+		int			datalen;
+		HeapTuple	tuple;
+		HeapTupleHeader header;
+
+		change = ReorderBufferGetChange(ctx->reorder);
+		change->action = REORDER_BUFFER_CHANGE_INSERT;
+		change->origin_id = XLogRecGetOrigin(r);
+
+		memcpy(&change->data.tp.rlocator, &rlocator, sizeof(RelFileLocator));
+
+		xlhdr = (xl_neon_multi_insert_tuple *) SHORTALIGN(data);
+		data = ((char *) xlhdr) + SizeOfNeonMultiInsertTuple;
+		datalen = xlhdr->datalen;
+
+		change->data.tp.newtuple =
+			ReorderBufferGetTupleBuf(ctx->reorder, datalen);
+
+		tuple = change->data.tp.newtuple;
+		header = tuple->t_data;
+
+		/* not a disk based tuple */
+		ItemPointerSetInvalid(&tuple->t_self);
+
+		/*
+		 * We can only figure this out after reassembling the transactions.
+		 */
+		tuple->t_tableOid = InvalidOid;
+
+		tuple->t_len = datalen + SizeofHeapTupleHeader;
+
+		memset(header, 0, SizeofHeapTupleHeader);
+
+		memcpy((char *) tuple->t_data + SizeofHeapTupleHeader,
+			   (char *) data,
+			   datalen);
+		header->t_infomask = xlhdr->t_infomask;
+		header->t_infomask2 = xlhdr->t_infomask2;
+		header->t_hoff = xlhdr->t_hoff;
+
+		/*
+		 * Reset toast reassembly state only after the last row in the last
+		 * xl_multi_insert_tuple record emitted by one heap_multi_insert()
+		 * call.
+		 */
+		if (xlrec->flags & XLH_INSERT_LAST_IN_MULTI &&
+			(i + 1) == xlrec->ntuples)
+			change->data.tp.clear_toast_afterwards = true;
+		else
+			change->data.tp.clear_toast_afterwards = false;
+
+		ReorderBufferQueueChange(ctx->reorder, XLogRecGetXid(r),
+								 buf->origptr, change, false);
+
+		/* move to the next xl_neon_multi_insert_tuple entry */
+		data += datalen;
+	}
+	Assert(data == tupledata + tuplelen);
+}
+
+/*
+ * Read a HeapTuple as WAL logged by heap_insert, heap_update and heap_delete
+ * (but not by heap_multi_insert) into a tuplebuf.
+ *
+ * The size 'len' and the pointer 'data' in the record need to be
+ * computed outside as they are record specific.
+ */
+static void
+DecodeXLogTuple(char *data, Size len, HeapTuple tuple)
+{
+	xl_neon_heap_header xlhdr;
+	int			datalen = len - SizeOfNeonHeapHeader;
+	HeapTupleHeader header;
+
+	Assert(datalen >= 0);
+
+	tuple->t_len = datalen + SizeofHeapTupleHeader;
+	header = tuple->t_data;
+
+	/* not a disk based tuple */
+	ItemPointerSetInvalid(&tuple->t_self);
+
+	/* we can only figure this out after reassembling the transactions */
+	tuple->t_tableOid = InvalidOid;
+
+	/* data is not stored aligned, copy to aligned storage */
+	memcpy((char *) &xlhdr,
+		   data,
+		   SizeOfNeonHeapHeader);
+
+	memset(header, 0, SizeofHeapTupleHeader);
+
+	memcpy(((char *) tuple->t_data) + SizeofHeapTupleHeader,
+		   data + SizeOfNeonHeapHeader,
+		   datalen);
+
+	header->t_infomask = xlhdr.t_infomask;
+	header->t_infomask2 = xlhdr.t_infomask2;
+	header->t_hoff = xlhdr.t_hoff;
+}
+#endif
diff --git a/pgxn/neon_walredo/inmem_smgr.c b/pgxn/neon_walredo/inmem_smgr.c
index 4e604a710c..a45e8f5c4a 100644
--- a/pgxn/neon_walredo/inmem_smgr.c
+++ b/pgxn/neon_walredo/inmem_smgr.c
@@ -68,8 +68,13 @@ static void inmem_close(SMgrRelation reln, ForkNumber forknum);
 static void inmem_create(SMgrRelation reln, ForkNumber forknum, bool isRedo);
 static bool inmem_exists(SMgrRelation reln, ForkNumber forknum);
 static void inmem_unlink(NRelFileInfoBackend rinfo, ForkNumber forknum, bool isRedo);
+#if PG_MAJORVERSION_NUM >= 17
+static bool inmem_prefetch(SMgrRelation reln, ForkNumber forknum,
+						   BlockNumber blocknum, int nblocks);
+#else
 static bool inmem_prefetch(SMgrRelation reln, ForkNumber forknum,
 						   BlockNumber blocknum);
+#endif
 #if PG_MAJORVERSION_NUM < 16
 static void inmem_extend(SMgrRelation reln, ForkNumber forknum,
 						 BlockNumber blocknum, char *buffer, bool skipFsync);
@@ -93,7 +98,9 @@ static BlockNumber inmem_nblocks(SMgrRelation reln, ForkNumber forknum);
 static void inmem_truncate(SMgrRelation reln, ForkNumber forknum,
 						   BlockNumber nblocks);
 static void inmem_immedsync(SMgrRelation reln, ForkNumber forknum);
-
+#if PG_MAJORVERSION_NUM >= 17
+static void inmem_registersync(SMgrRelation reln, ForkNumber forknum);
+#endif
 
 /*
  *	inmem_init() -- Initialize private state
@@ -190,6 +197,14 @@ inmem_close(SMgrRelation reln, ForkNumber forknum)
 {
 }
 
+#if PG_MAJORVERSION_NUM >= 17
+static bool
+inmem_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
+			   int nblocks)
+{
+	return true;
+}
+#else
 /*
  *	inmem_prefetch() -- Initiate asynchronous read of the specified block of a relation
  */
@@ -198,6 +213,7 @@ inmem_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum)
 {
 	return true;
 }
+#endif
 
 /*
  * inmem_writeback() -- Tell the kernel to write pages back to storage.
@@ -211,11 +227,13 @@ inmem_writeback(SMgrRelation reln, ForkNumber forknum,
 /*
  *	inmem_read() -- Read the specified block from a relation.
  */
+#if PG_MAJORVERSION_NUM < 16
 static void
 inmem_read(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno,
-#if PG_MAJORVERSION_NUM < 16
 		   char *buffer)
 #else
+static void
+inmem_read(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno,
 		   void *buffer)
 #endif
 {
@@ -228,6 +246,18 @@ inmem_read(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno,
 		memcpy(buffer, page_body[pg], BLCKSZ);
 }
 
+#if PG_MAJORVERSION_NUM >= 17
+static void
+inmem_readv(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno,
+			void **buffers, BlockNumber nblocks)
+{
+	for (int i = 0; i < nblocks; i++)
+	{
+		inmem_read(reln, forknum, blkno, buffers[i]);
+	}
+}
+#endif
+
 /*
  *	inmem_write() -- Write the supplied block at the appropriate location.
  *
@@ -280,6 +310,18 @@ inmem_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
 	memcpy(page_body[pg], buffer, BLCKSZ);
 }
 
+#if PG_MAJORVERSION_NUM >= 17
+static void
+inmem_writev(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno,
+			 const void **buffers, BlockNumber nblocks, bool skipFsync)
+{
+	for (int i = 0; i < nblocks; i++)
+	{
+		inmem_write(reln, forknum, blkno, buffers[i], skipFsync);
+	}
+}
+#endif
+
 /*
  *	inmem_nblocks() -- Get the number of blocks stored in a relation.
  */
@@ -315,6 +357,13 @@ inmem_immedsync(SMgrRelation reln, ForkNumber forknum)
 {
 }
 
+#if PG_MAJORVERSION_NUM >= 17
+static void
+inmem_registersync(SMgrRelation reln, ForkNumber forknum)
+{
+}
+#endif
+
 static const struct f_smgr inmem_smgr =
 {
 	.smgr_init = inmem_init,
@@ -328,23 +377,39 @@ static const struct f_smgr inmem_smgr =
 #if PG_MAJORVERSION_NUM >= 16
 	.smgr_zeroextend = inmem_zeroextend,
 #endif
+#if PG_MAJORVERSION_NUM >= 17
+	.smgr_prefetch = inmem_prefetch,
+	.smgr_readv = inmem_readv,
+	.smgr_writev = inmem_writev,
+#else
 	.smgr_prefetch = inmem_prefetch,
 	.smgr_read = inmem_read,
 	.smgr_write = inmem_write,
+#endif
 	.smgr_writeback = inmem_writeback,
 	.smgr_nblocks = inmem_nblocks,
 	.smgr_truncate = inmem_truncate,
 	.smgr_immedsync = inmem_immedsync,
+
+#if PG_MAJORVERSION_NUM >= 17
+	.smgr_registersync = inmem_registersync,
+#endif
+
+	.smgr_start_unlogged_build = NULL,
+	.smgr_finish_unlogged_build_phase_1 = NULL,
+	.smgr_end_unlogged_build = NULL,
+	.smgr_read_slru_segment = NULL,
 };
 
 const f_smgr *
-smgr_inmem(BackendId backend, NRelFileInfo rinfo)
+smgr_inmem(ProcNumber backend, NRelFileInfo rinfo)
 {
 	Assert(InRecovery);
-	if (backend != InvalidBackendId)
-		return smgr_standard(backend, rinfo);
-	else
-		return &inmem_smgr;
+	// // What does this code do?
+	// if (backend != INVALID_PROC_NUMBER)
+	// 	return smgr_standard(backend, rinfo);
+	// else
+	return &inmem_smgr;
 }
 
 void
diff --git a/pgxn/neon_walredo/inmem_smgr.h b/pgxn/neon_walredo/inmem_smgr.h
index 58b98b8e6a..91f1c80965 100644
--- a/pgxn/neon_walredo/inmem_smgr.h
+++ b/pgxn/neon_walredo/inmem_smgr.h
@@ -11,7 +11,7 @@
 #ifndef INMEM_SMGR_H
 #define INMEM_SMGR_H
 
-extern const f_smgr *smgr_inmem(BackendId backend, NRelFileInfo rinfo);
+extern const f_smgr *smgr_inmem(ProcNumber backend, NRelFileInfo rinfo);
 extern void smgr_init_inmem(void);
 
 #endif /* INMEM_SMGR_H */
diff --git a/pgxn/neon_walredo/walredoproc.c b/pgxn/neon_walredo/walredoproc.c
index cc545393f5..219ca85207 100644
--- a/pgxn/neon_walredo/walredoproc.c
+++ b/pgxn/neon_walredo/walredoproc.c
@@ -100,6 +100,9 @@
 #include "storage/buf_internals.h"
 #include "storage/bufmgr.h"
 #include "storage/dsm.h"
+#if PG_MAJORVERSION_NUM >= 17
+#include "storage/dsm_registry.h"
+#endif
 #include "storage/ipc.h"
 #include "storage/pg_shmem.h"
 #include "storage/pmsignal.h"
@@ -137,7 +140,7 @@ static BufferTag target_redo_tag;
 
 static XLogReaderState *reader_state;
 
-#define TRACE DEBUG5
+#define TRACE LOG
 
 #ifdef HAVE_LIBSECCOMP
 
@@ -517,6 +520,10 @@ CreateFakeSharedMemoryAndSemaphores()
 	/*
 	 * Set up xlog, clog, and buffers
 	 */
+#if PG_MAJORVERSION_NUM >= 17
+	DSMRegistryShmemInit();
+	VarsupShmemInit();
+#endif
 	XLOGShmemInit();
 	CLOGShmemInit();
 	CommitTsShmemInit();
@@ -566,7 +573,10 @@ CreateFakeSharedMemoryAndSemaphores()
 	/*
 	 * Set up other modules that need some shared memory space
 	 */
+#if PG_MAJORVERSION_NUM < 17
+	/* "snapshot too old" was removed in PG17, and with it the SnapMgr */
 	SnapMgrInit();
+#endif
 	BTreeShmemInit();
 	SyncScanShmemInit();
 	/* Skip due to the 'pg_notify' directory check */
@@ -742,7 +752,7 @@ BeginRedoForBlock(StringInfo input_message)
 		 target_redo_tag.forkNum,
 		 target_redo_tag.blockNum);
 
-	reln = smgropen(rinfo, InvalidBackendId, RELPERSISTENCE_PERMANENT);
+	reln = smgropen(rinfo, INVALID_PROC_NUMBER, RELPERSISTENCE_PERMANENT);
 	if (reln->smgr_cached_nblocks[forknum] == InvalidBlockNumber ||
 		reln->smgr_cached_nblocks[forknum] < blknum + 1)
 	{
diff --git a/test_runner/fixtures/common_types.py b/test_runner/fixtures/common_types.py
index 064a678c96..d8390138c9 100644
--- a/test_runner/fixtures/common_types.py
+++ b/test_runner/fixtures/common_types.py
@@ -13,7 +13,7 @@ DEFAULT_WAL_SEG_SIZE = 16 * 1024 * 1024
 class Lsn:
     """
     Datatype for an LSN. Internally it is a 64-bit integer, but the string
-    representation is like "1/123abcd". See also pg_lsn datatype in Postgres
+    representation is like "1/0123abcd". See also pg_lsn datatype in Postgres
     """
 
     def __init__(self, x: Union[int, str]):
diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index ee62372871..50284a3f5a 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -933,8 +933,11 @@ class NeonEnvBuilder:
 
         for directory_to_clean in reversed(directories_to_clean):
             if not os.listdir(directory_to_clean):
-                log.debug(f"Removing empty directory {directory_to_clean}")
-                directory_to_clean.rmdir()
+                log.info(f"Removing empty directory {directory_to_clean}")
+                try:
+                    directory_to_clean.rmdir()
+                except Exception as e:
+                    log.error(f"Error removing empty directory {directory_to_clean}: {e}")
 
     def cleanup_remote_storage(self):
         for x in [self.pageserver_remote_storage, self.safekeepers_remote_storage]:
@@ -3423,6 +3426,7 @@ class VanillaPostgres(PgProtocol):
         assert not self.running
         with open(os.path.join(self.pgdatadir, "postgresql.conf"), "a") as conf_file:
             conf_file.write("\n".join(options))
+            conf_file.write("\n")
 
     def edit_hba(self, hba: List[str]):
         """Prepend hba lines into pg_hba.conf file."""
@@ -3476,6 +3480,7 @@ def vanilla_pg(
     pg_bin = PgBin(test_output_dir, pg_distrib_dir, pg_version)
     port = port_distributor.get_port()
     with VanillaPostgres(pgdatadir, pg_bin, port) as vanilla_pg:
+        vanilla_pg.configure(["shared_preload_libraries='neon_rmgr'"])
         yield vanilla_pg
 
 
diff --git a/test_runner/fixtures/pg_version.py b/test_runner/fixtures/pg_version.py
index e12c8e5f4a..258935959b 100644
--- a/test_runner/fixtures/pg_version.py
+++ b/test_runner/fixtures/pg_version.py
@@ -16,6 +16,7 @@ class PgVersion(str, enum.Enum):
     V14 = "14"
     V15 = "15"
     V16 = "16"
+    V17 = "17"
     # Instead of making version an optional parameter in methods, we can use this fake entry
     # to explicitly rely on the default server version (could be different from pg_version fixture value)
     NOT_SET = "<-POSTRGRES VERSION IS NOT SET->"
diff --git a/test_runner/regress/data/extension_test/5670669815/v17/ext_index.json b/test_runner/regress/data/extension_test/5670669815/v17/ext_index.json
new file mode 100644
index 0000000000..7990b2c3a2
--- /dev/null
+++ b/test_runner/regress/data/extension_test/5670669815/v17/ext_index.json
@@ -0,0 +1,7 @@
+{
+    "public_extensions": [],
+    "library_index": {
+        "TODO": "We still need PG17 extensions"
+    },
+    "extension_data": {}
+}
\ No newline at end of file
diff --git a/test_runner/regress/test_compatibility.py b/test_runner/regress/test_compatibility.py
index b559be5f18..fb5c1d3115 100644
--- a/test_runner/regress/test_compatibility.py
+++ b/test_runner/regress/test_compatibility.py
@@ -21,7 +21,7 @@ from fixtures.pageserver.http import PageserverApiException
 from fixtures.pageserver.utils import (
     timeline_delete_wait_completed,
 )
-from fixtures.pg_version import PgVersion
+from fixtures.pg_version import PgVersion, skip_on_postgres
 from fixtures.remote_storage import RemoteStorageKind, S3Storage, s3_storage
 from fixtures.workload import Workload
 
@@ -156,6 +156,9 @@ ingest_lag_log_line = ".*ingesting record with timestamp lagging more than wait_
 @check_ondisk_data_compatibility_if_enabled
 @pytest.mark.xdist_group("compatibility")
 @pytest.mark.order(after="test_create_snapshot")
+@skip_on_postgres(
+    PgVersion.V17, "There are no snapshots yet"
+)  # TODO: revert this once we have snapshots
 def test_backward_compatibility(
     neon_env_builder: NeonEnvBuilder,
     test_output_dir: Path,
@@ -203,6 +206,9 @@ def test_backward_compatibility(
 @check_ondisk_data_compatibility_if_enabled
 @pytest.mark.xdist_group("compatibility")
 @pytest.mark.order(after="test_create_snapshot")
+@skip_on_postgres(
+    PgVersion.V17, "There are no snapshots yet"
+)  # TODO: revert this once we have snapshots
 def test_forward_compatibility(
     neon_env_builder: NeonEnvBuilder,
     test_output_dir: Path,
diff --git a/test_runner/regress/test_download_extensions.py b/test_runner/regress/test_download_extensions.py
index 27eb05ac09..7370eb1456 100644
--- a/test_runner/regress/test_download_extensions.py
+++ b/test_runner/regress/test_download_extensions.py
@@ -44,6 +44,8 @@ def test_remote_extensions(
 ):
     if pg_version == PgVersion.V16:
         pytest.skip("TODO: PG16 extension building")
+    if pg_version == PgVersion.V17:
+        pytest.skip("TODO: PG17 extension building")
 
     # setup mock http server
     # that expects request for anon.tar.zst
diff --git a/test_runner/regress/test_postgres_version.py b/test_runner/regress/test_postgres_version.py
index 03e8c7c0df..4145a303c6 100644
--- a/test_runner/regress/test_postgres_version.py
+++ b/test_runner/regress/test_postgres_version.py
@@ -20,16 +20,19 @@ def test_postgres_version(base_dir: Path, pg_bin: PgBin, pg_version: PgVersion):
         output = f.read().strip()
 
     # `postgres --version` prints something like "postgres (PostgreSQL) 15.6 (85d809c124a898847a97d66a211f7d5ef4f8e0cb)".
-    pattern = r"postgres \(PostgreSQL\) (?P<version>\d+\.\d+) \((?P<commit>[0-9a-f]{40})\)"
+    # beta- and release candidate releases would use '17beta1' and '18rc2' instead of .-separated numbers.
+    pattern = (
+        r"postgres \(PostgreSQL\) (?P<version>\d+(?:beta|rc|\.)\d+) \((?P<commit>[0-9a-f]{40})\)"
+    )
     match = re.search(pattern, output, re.IGNORECASE)
     assert match is not None, f"Can't parse {output} with {pattern}"
 
     version = match.group("version")
     commit = match.group("commit")
 
-    assert (
-        pg_version.v_prefixed in expected_revisions
-    ), f"Version `{pg_version.v_prefixed}` doesn't exist in `vendor/revisions.json`, please update it if these changes are intentional"
-
-    msg = f"Unexpected Postgres {pg_version} version: `{output}`, please update `vendor/revisions.json` if these changes are intentional"
-    assert [version, commit] == expected_revisions[pg_version.v_prefixed], msg
+    if "." in version:
+        assert (
+            pg_version.v_prefixed in expected_revisions
+        ), f"Released PostgreSQL version `{pg_version.v_prefixed}` doesn't exist in `vendor/revisions.json`, please update it if these changes are intentional"
+        msg = f"Unexpected Postgres {pg_version} version: `{output}`, please update `vendor/revisions.json` if these changes are intentional"
+        assert [version, commit] == expected_revisions[pg_version.v_prefixed], msg
diff --git a/test_runner/regress/test_timeline_detach_ancestor.py b/test_runner/regress/test_timeline_detach_ancestor.py
index d152d0f41f..f98b53d966 100644
--- a/test_runner/regress/test_timeline_detach_ancestor.py
+++ b/test_runner/regress/test_timeline_detach_ancestor.py
@@ -118,6 +118,9 @@ def test_ancestor_detach_branched_from(
         truncated_layers = 0
     elif branchpoint == Branchpoint.AFTER_L0:
         branch_at = Lsn(last_lsn + 8)
+        # make sure the branch point is not on a page header
+        if 0 < (branch_at.lsn_int % 8192) < 40:
+            branch_at += 40
         rows = 8192
         # as there is no 8 byte walrecord, nothing should get copied from the straddling layer
         truncated_layers = 0
diff --git a/test_runner/regress/test_twophase.py b/test_runner/regress/test_twophase.py
index ea900b07b8..ebe65e7c29 100644
--- a/test_runner/regress/test_twophase.py
+++ b/test_runner/regress/test_twophase.py
@@ -1,19 +1,32 @@
 import os
+from pathlib import Path
 
+from fixtures.common_types import TimelineId
 from fixtures.log_helper import log
-from fixtures.neon_fixtures import NeonEnv, fork_at_current_lsn
+from fixtures.neon_fixtures import (
+    NeonEnv,
+    PgBin,
+    fork_at_current_lsn,
+    import_timeline_from_vanilla_postgres,
+)
 
 
 #
 # Test branching, when a transaction is in prepared state
 #
-def test_twophase(neon_simple_env: NeonEnv):
-    env = neon_simple_env
-    endpoint = env.endpoints.create_start("main", config_lines=["max_prepared_transactions=5"])
+def twophase_test_on_timeline(env: NeonEnv):
+    endpoint = env.endpoints.create_start(
+        "test_twophase", config_lines=["max_prepared_transactions=5"]
+    )
 
     conn = endpoint.connect()
     cur = conn.cursor()
 
+    # FIXME: Switch to the next WAL segment, to work around the bug fixed in
+    # https://github.com/neondatabase/neon/pull/8914.  When that is merged, this can be
+    # removed.
+    cur.execute("select pg_switch_wal()")
+
     cur.execute("CREATE TABLE foo (t text)")
 
     # Prepare a transaction that will insert a row
@@ -53,7 +66,7 @@ def test_twophase(neon_simple_env: NeonEnv):
     assert len(twophase_files) == 2
 
     # Create a branch with the transaction in prepared state
-    fork_at_current_lsn(env, endpoint, "test_twophase_prepared", "main")
+    fork_at_current_lsn(env, endpoint, "test_twophase_prepared", "test_twophase")
 
     # Start compute on the new branch
     endpoint2 = env.endpoints.create_start(
@@ -80,3 +93,50 @@ def test_twophase(neon_simple_env: NeonEnv):
     # Only one committed insert is visible on the original branch
     cur.execute("SELECT * FROM foo")
     assert cur.fetchall() == [("three",)]
+
+
+def test_twophase(neon_simple_env: NeonEnv):
+    """
+    Test branching, when a transaction is in prepared state
+    """
+    env = neon_simple_env
+    env.neon_cli.create_branch("test_twophase")
+
+    twophase_test_on_timeline(env)
+
+
+def test_twophase_nonzero_epoch(
+    neon_simple_env: NeonEnv,
+    test_output_dir: Path,
+    pg_bin: PgBin,
+    vanilla_pg,
+):
+    """
+    Same as 'test_twophase' test, but with a non-zero XID epoch, i.e. after 4 billion XIDs
+    have been consumed. (This is to ensure that we correctly use the full 64-bit XIDs in
+    pg_twophase filenames with PostgreSQL v17.)
+    """
+    env = neon_simple_env
+
+    # Reset the vanilla Postgres instance with a higher XID epoch
+    pg_resetwal_path = os.path.join(pg_bin.pg_bin_path, "pg_resetwal")
+    cmd = [pg_resetwal_path, "--epoch=1000000000", "-D", str(vanilla_pg.pgdatadir)]
+    pg_bin.run_capture(cmd)
+
+    timeline_id = TimelineId.generate()
+
+    # Import the cluster to Neon
+    vanilla_pg.start()
+    vanilla_pg.safe_psql("create user cloud_admin with password 'postgres' superuser")
+    import_timeline_from_vanilla_postgres(
+        test_output_dir,
+        env,
+        pg_bin,
+        env.initial_tenant,
+        timeline_id,
+        "test_twophase",
+        vanilla_pg.connstr(),
+    )
+    vanilla_pg.stop()  # don't need the original server anymore
+
+    twophase_test_on_timeline(env)
diff --git a/vendor/postgres-v17 b/vendor/postgres-v17
new file mode 160000
index 0000000000..9156d63ce2
--- /dev/null
+++ b/vendor/postgres-v17
@@ -0,0 +1 @@
+Subproject commit 9156d63ce253bed9d1f76355ceec610e444eaffa

From 0a8c5e1214fcd3f59767a6ca4adeb68612977e51 Mon Sep 17 00:00:00 2001
From: Matthias van de Meent <matthias@neon.tech>
Date: Fri, 13 Sep 2024 15:10:52 +0100
Subject: [PATCH 133/159] Fix broken image for PG17 (#8998)

Most extensions are not required to run Neon-based PostgreSQL, but the
Neon extension is _quite_ critical, so let's make sure we include it.

## Problem

Staging doesn't have working compute images for PG17

## Summary of changes

Disable some PG17 filters so that we get the critical components into the PG17 image
---
 Dockerfile.compute-node | 63 ++++++++++++++---------------------------
 1 file changed, 21 insertions(+), 42 deletions(-)

diff --git a/Dockerfile.compute-node b/Dockerfile.compute-node
index fe902eb978..6e2510fe60 100644
--- a/Dockerfile.compute-node
+++ b/Dockerfile.compute-node
@@ -81,10 +81,7 @@ RUN cd postgres && \
 FROM build-deps AS postgis-build
 ARG PG_VERSION
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
-RUN case "${PG_VERSION}" in "v17") \
-    echo "v17 extensions are not supported yet. Quit" && exit 0;; \
-    esac && \
-    apt update && \
+RUN apt update && \
     apt install -y cmake gdal-bin libboost-dev libboost-thread-dev libboost-filesystem-dev \
     libboost-system-dev libboost-iostreams-dev libboost-program-options-dev libboost-timer-dev \
     libcgal-dev libgdal-dev libgmp-dev libmpfr-dev libopenscenegraph-dev libprotobuf-c-dev \
@@ -92,8 +89,8 @@ RUN case "${PG_VERSION}" in "v17") \
 
 # SFCGAL > 1.3 requires CGAL > 5.2, Bullseye's libcgal-dev is 5.2
 RUN case "${PG_VERSION}" in "v17") \
-        mkdir -p /sfcgal && \
-    echo "v17 extensions are not supported yet. Quit" && exit 0;; \
+    mkdir -p /sfcgal && \
+    echo "Postgis doensn't yet support PG17 (needs 3.4.3, if not higher)" && exit 0;; \
     esac && \
     wget https://gitlab.com/Oslandia/SFCGAL/-/archive/v1.3.10/SFCGAL-v1.3.10.tar.gz -O SFCGAL.tar.gz && \
     echo "4e39b3b2adada6254a7bdba6d297bb28e1a9835a9f879b74f37e2dab70203232 SFCGAL.tar.gz" | sha256sum --check && \
@@ -105,7 +102,7 @@ RUN case "${PG_VERSION}" in "v17") \
 ENV PATH="/usr/local/pgsql/bin:$PATH"
 
 RUN case "${PG_VERSION}" in "v17") \
-    echo "v17 extensions are not supported yet. Quit" && exit 0;; \
+    echo "Postgis doensn't yet support PG17 (needs 3.4.3, if not higher)" && exit 0;; \
     esac && \
     wget https://download.osgeo.org/postgis/source/postgis-3.3.3.tar.gz -O postgis.tar.gz && \
     echo "74eb356e3f85f14233791013360881b6748f78081cc688ff9d6f0f673a762d13 postgis.tar.gz" | sha256sum --check && \
@@ -666,7 +663,7 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
 
 ENV PATH="/usr/local/pgsql/bin/:$PATH"
 RUN case "${PG_VERSION}" in "v17") \
-    echo "v17 extensions are not supported yet. Quit" && exit 0;; \
+    echo "v17 extensions is not supported yet by pg_roaringbitmap. Quit" && exit 0;; \
     esac && \
     wget https://github.com/ChenHuajun/pg_roaringbitmap/archive/refs/tags/v0.5.4.tar.gz -O pg_roaringbitmap.tar.gz && \
     echo "b75201efcb1c2d1b014ec4ae6a22769cc7a224e6e406a587f5784a37b6b5a2aa pg_roaringbitmap.tar.gz" | sha256sum --check && \
@@ -687,7 +684,7 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
 
 ENV PATH="/usr/local/pgsql/bin/:$PATH"
 RUN case "${PG_VERSION}" in "v17") \
-    echo "v17 extensions are not supported yet. Quit" && exit 0;; \
+    echo "v17 is not supported yet by pg_semver. Quit" && exit 0;; \
     esac && \
     wget https://github.com/theory/pg-semver/archive/refs/tags/v0.32.1.tar.gz -O pg_semver.tar.gz && \
     echo "fbdaf7512026d62eec03fad8687c15ed509b6ba395bff140acd63d2e4fbe25d7 pg_semver.tar.gz" | sha256sum --check && \
@@ -707,10 +704,7 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
 
 ARG PG_VERSION
 ENV PATH="/usr/local/pgsql/bin/:$PATH"
-RUN case "${PG_VERSION}" in "v17") \
-    echo "v17 extensions are not supported yet. Quit" && exit 0;; \
-    esac && \
-    case "${PG_VERSION}" in \
+RUN case "${PG_VERSION}" in \
       "v14" | "v15") \
         export PG_EMBEDDING_VERSION=0.3.5 \
         export PG_EMBEDDING_CHECKSUM=0e95b27b8b6196e2cf0a0c9ec143fe2219b82e54c5bb4ee064e76398cbe69ae9 \
@@ -736,7 +730,7 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
 
 ENV PATH="/usr/local/pgsql/bin/:$PATH"
 RUN case "${PG_VERSION}" in "v17") \
-    echo "v17 extensions are not supported yet. Quit" && exit 0;; \
+    echo "postgresql_anonymizer does not yet support PG17" && exit 0;; \
     esac && \
     wget  https://github.com/neondatabase/postgresql_anonymizer/archive/refs/tags/neon_1.1.1.tar.gz -O pg_anon.tar.gz && \
     echo "321ea8d5c1648880aafde850a2c576e4a9e7b9933a34ce272efc839328999fa9  pg_anon.tar.gz" | sha256sum --check && \
@@ -769,7 +763,7 @@ USER nonroot
 WORKDIR /home/nonroot
 
 RUN case "${PG_VERSION}" in "v17") \
-    echo "v17 extensions are not supported yet. Quit" && exit 0;; \
+    echo "v17 is not supported yet by pgrx. Quit" && exit 0;; \
     esac && \
     curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux-gnu/rustup-init && \
     chmod +x rustup-init && \
@@ -791,7 +785,7 @@ FROM rust-extensions-build AS pg-jsonschema-pg-build
 ARG PG_VERSION
 
 RUN case "${PG_VERSION}" in "v17") \
-    echo "v17 extensions are not supported yet. Quit" && exit 0;; \
+    echo "pg_jsonschema does not yet have a release that supports pg17" && exit 0;; \
     esac && \
     wget https://github.com/supabase/pg_jsonschema/archive/refs/tags/v0.3.1.tar.gz -O pg_jsonschema.tar.gz && \
     echo "61df3db1ed83cf24f6aa39c826f8818bfa4f0bd33b587fd6b2b1747985642297 pg_jsonschema.tar.gz" | sha256sum --check && \
@@ -816,7 +810,7 @@ FROM rust-extensions-build AS pg-graphql-pg-build
 ARG PG_VERSION
 
 RUN case "${PG_VERSION}" in "v17") \
-    echo "v17 extensions are not supported yet. Quit" && exit 0;; \
+    echo "pg_graphql does not yet have a release that supports pg17 as of now" && exit 0;; \
     esac && \
     wget https://github.com/supabase/pg_graphql/archive/refs/tags/v1.5.7.tar.gz -O pg_graphql.tar.gz && \
     echo "2b3e567a5b31019cb97ae0e33263c1bcc28580be5a444ac4c8ece5c4be2aea41 pg_graphql.tar.gz" | sha256sum --check && \
@@ -839,7 +833,7 @@ ARG PG_VERSION
 
 # 26806147b17b60763039c6a6878884c41a262318 made on 26/09/2023
 RUN case "${PG_VERSION}" in "v17") \
-    echo "v17 extensions are not supported yet. Quit" && exit 0;; \
+    echo "pg_tiktoken does not have versions, nor support for pg17" && exit 0;; \
     esac && \
     wget https://github.com/kelvich/pg_tiktoken/archive/26806147b17b60763039c6a6878884c41a262318.tar.gz -O pg_tiktoken.tar.gz && \
     echo "e64e55aaa38c259512d3e27c572da22c4637418cf124caba904cd50944e5004e pg_tiktoken.tar.gz" | sha256sum --check && \
@@ -861,7 +855,7 @@ FROM rust-extensions-build AS pg-pgx-ulid-build
 ARG PG_VERSION
 
 RUN case "${PG_VERSION}" in "v17") \
-    echo "v17 extensions are not supported yet. Quit" && exit 0;; \
+    echo "pgx_ulid does not support pg17 as of the latest version (0.1.5)" && exit 0;; \
     esac && \
     wget https://github.com/pksunkara/pgx_ulid/archive/refs/tags/v0.1.5.tar.gz -O pgx_ulid.tar.gz && \
     echo "9d1659a2da65af0133d5451c454de31b37364e3502087dadf579f790bc8bef17 pgx_ulid.tar.gz" | sha256sum --check && \
@@ -883,7 +877,7 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
 
 ENV PATH="/usr/local/pgsql/bin/:$PATH"
 RUN case "${PG_VERSION}" in "v17") \
-    echo "v17 extensions are not supported yet. Quit" && exit 0;; \
+    echo "We'll need to update wal2json to 2.6+ for pg17 support" && exit 0;; \
     esac && \
     wget https://github.com/eulerto/wal2json/archive/refs/tags/wal2json_2_5.tar.gz && \
     echo "b516653575541cf221b99cf3f8be9b6821f6dbcfc125675c85f35090f824f00e wal2json_2_5.tar.gz" | sha256sum --check && \
@@ -903,7 +897,7 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
 
 ENV PATH="/usr/local/pgsql/bin/:$PATH"
 RUN case "${PG_VERSION}" in "v17") \
-    echo "v17 extensions are not supported yet. Quit" && exit 0;; \
+    echo "We'll need to update pg_ivm to 1.9+ for pg17 support" && exit 0;; \
     esac && \
     wget https://github.com/sraoss/pg_ivm/archive/refs/tags/v1.7.tar.gz -O pg_ivm.tar.gz && \
     echo "ebfde04f99203c7be4b0e873f91104090e2e83e5429c32ac242d00f334224d5e pg_ivm.tar.gz" | sha256sum --check && \
@@ -924,7 +918,7 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
 
 ENV PATH="/usr/local/pgsql/bin/:$PATH"
 RUN case "${PG_VERSION}" in "v17") \
-    echo "v17 extensions are not supported yet. Quit" && exit 0;; \
+    echo "pg_partman doesn't support PG17 yet" && exit 0;; \
     esac && \
     wget https://github.com/pgpartman/pg_partman/archive/refs/tags/v5.0.1.tar.gz -O pg_partman.tar.gz && \
     echo "75b541733a9659a6c90dbd40fccb904a630a32880a6e3044d0c4c5f4c8a65525 pg_partman.tar.gz" | sha256sum --check && \
@@ -977,10 +971,7 @@ COPY --from=pg-ivm-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY --from=pg-partman-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY pgxn/ pgxn/
 
-RUN case "${PG_VERSION}" in "v17") \
-    echo "v17 extensions are not supported yet. Quit" && exit 0;; \
-    esac && \
-    make -j $(getconf _NPROCESSORS_ONLN) \
+RUN make -j $(getconf _NPROCESSORS_ONLN) \
         PG_CONFIG=/usr/local/pgsql/bin/pg_config \
         -C pgxn/neon \
         -s install && \
@@ -1023,10 +1014,7 @@ ENV BUILD_TAG=$BUILD_TAG
 USER nonroot
 # Copy entire project to get Cargo.* files with proper dependencies for the whole project
 COPY --chown=nonroot . .
-RUN case "${PG_VERSION}" in "v17") \
-    echo "v17 extensions are not supported yet. Quit" && exit 0;; \
-    esac && \
-    cd compute_tools && mold -run cargo build --locked --profile release-line-debug-size-lto
+RUN cd compute_tools && mold -run cargo build --locked --profile release-line-debug-size-lto
 
 #########################################################################################
 #
@@ -1047,24 +1035,15 @@ FROM neon-pg-ext-build AS postgres-cleanup-layer
 COPY --from=neon-pg-ext-build /usr/local/pgsql /usr/local/pgsql
 
 # Remove binaries from /bin/ that we won't use (or would manually copy & install otherwise)
-RUN case "${PG_VERSION}" in "v17") \
-    echo "v17 extensions are not supported yet. Quit" && exit 0;; \
-    esac && \
-    cd /usr/local/pgsql/bin && rm ecpg raster2pgsql shp2pgsql pgtopo_export pgtopo_import pgsql2shp
+RUN cd /usr/local/pgsql/bin && rm -f ecpg raster2pgsql shp2pgsql pgtopo_export pgtopo_import pgsql2shp
 
 # Remove headers that we won't need anymore - we've completed installation of all extensions
-RUN case "${PG_VERSION}" in "v17") \
-    echo "v17 extensions are not supported yet. Quit" && exit 0;; \
-    esac && \
-    rm -r /usr/local/pgsql/include
+RUN rm -r /usr/local/pgsql/include
 
 # Remove static postgresql libraries - all compilation is finished, so we
 # can now remove these files - they must be included in other binaries by now
 # if they were to be used by other libraries.
-RUN case "${PG_VERSION}" in "v17") \
-    echo "v17 extensions are not supported yet. Quit" && exit 0;; \
-    esac && \
-    rm /usr/local/pgsql/lib/lib*.a
+RUN rm /usr/local/pgsql/lib/lib*.a
 
 
 #########################################################################################

From b2c83db54d58d46e8ca11d5f1b4a38471322f713 Mon Sep 17 00:00:00 2001
From: Alexander Bayandin <alexander@neon.tech>
Date: Mon, 16 Sep 2024 12:44:26 +0100
Subject: [PATCH 134/159] CI(gather-rust-build-stats): set PQ_LIB_DIR to
 Postgres 17 (#9001)

## Problem

`gather-rust-build-stats` extra CI job fails with
```
"PQ_LIB_DIR" doesn't exist in the configured path: "/__w/neon/neon/pg_install/v16/lib"
```

## Summary of changes
- Use the path to Postgres 17 for the `gather-rust-build-stats` job.
The job uses Postgres built by `make walproposer-lib`
---
 .github/workflows/neon_extra_builds.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/neon_extra_builds.yml b/.github/workflows/neon_extra_builds.yml
index 41c9f5dee5..140aac032a 100644
--- a/.github/workflows/neon_extra_builds.yml
+++ b/.github/workflows/neon_extra_builds.yml
@@ -181,7 +181,7 @@ jobs:
         run: make walproposer-lib -j$(nproc)
 
       - name: Produce the build stats
-        run: PQ_LIB_DIR=$(pwd)/pg_install/v16/lib cargo build --all --release --timings -j$(nproc)
+        run: PQ_LIB_DIR=$(pwd)/pg_install/v17/lib cargo build --all --release --timings -j$(nproc)
 
       - name: Upload the build stats
         id: upload-stats

From 5876c441abc973acca60882192ad46333c075abd Mon Sep 17 00:00:00 2001
From: Tristan Partin <tristan@neon.tech>
Date: Tue, 10 Sep 2024 16:46:32 +0100
Subject: [PATCH 135/159] Grant access to pg_show_replication_origin_status for
 neon_superuser

Signed-off-by: Tristan Partin <tristan@neon.tech>
---
 ...ant_pg_show_replication_origin_status_to_neon_superuser.sql | 1 +
 compute_tools/src/spec.rs                                      | 3 +++
 test_runner/regress/test_migrations.py                         | 2 +-
 3 files changed, 5 insertions(+), 1 deletion(-)
 create mode 100644 compute_tools/src/migrations/0011-grant_pg_show_replication_origin_status_to_neon_superuser.sql

diff --git a/compute_tools/src/migrations/0011-grant_pg_show_replication_origin_status_to_neon_superuser.sql b/compute_tools/src/migrations/0011-grant_pg_show_replication_origin_status_to_neon_superuser.sql
new file mode 100644
index 0000000000..425ed8cd3d
--- /dev/null
+++ b/compute_tools/src/migrations/0011-grant_pg_show_replication_origin_status_to_neon_superuser.sql
@@ -0,0 +1 @@
+GRANT EXECUTE ON FUNCTION pg_show_replication_origin_status TO neon_superuser;
diff --git a/compute_tools/src/spec.rs b/compute_tools/src/spec.rs
index 6a87263821..aa9405d28d 100644
--- a/compute_tools/src/spec.rs
+++ b/compute_tools/src/spec.rs
@@ -793,6 +793,9 @@ pub fn handle_migrations(client: &mut Client) -> Result<()> {
         include_str!(
             "./migrations/0010-grant_snapshot_synchronization_funcs_to_neon_superuser.sql"
         ),
+        include_str!(
+            "./migrations/0011-grant_pg_show_replication_origin_status_to_neon_superuser.sql"
+        ),
     ];
 
     MigrationRunner::new(client, &migrations).run_migrations()?;
diff --git a/test_runner/regress/test_migrations.py b/test_runner/regress/test_migrations.py
index e88e56d030..7211619a99 100644
--- a/test_runner/regress/test_migrations.py
+++ b/test_runner/regress/test_migrations.py
@@ -14,7 +14,7 @@ def test_migrations(neon_simple_env: NeonEnv):
     endpoint.respec(skip_pg_catalog_updates=False)
     endpoint.start()
 
-    num_migrations = 10
+    num_migrations = 11
     endpoint.wait_for_migrations(num_migrations=num_migrations)
 
     with endpoint.cursor() as cur:

From c8bedca5821694633d140a1da2baa4fa9c7dea0c Mon Sep 17 00:00:00 2001
From: Matthias van de Meent <matthias@neon.tech>
Date: Mon, 16 Sep 2024 18:06:31 +0200
Subject: [PATCH 136/159] Fix PG17's extension modifications (#9010)

This also reduces the GRANT statements to one per created _reset
function
---
 Dockerfile.compute-node | 29 +++++++++++++++++------------
 1 file changed, 17 insertions(+), 12 deletions(-)

diff --git a/Dockerfile.compute-node b/Dockerfile.compute-node
index 6e2510fe60..6bf6fb650f 100644
--- a/Dockerfile.compute-node
+++ b/Dockerfile.compute-node
@@ -55,22 +55,27 @@ RUN cd postgres && \
     # We could add the additional grant statements to the postgres repository but it would be hard to maintain,
     # whenever we need to pick up a new postgres version and we want to limit the changes in our postgres fork,
     # so we do it here.
-    old_list="pg_stat_statements--1.0--1.1.sql pg_stat_statements--1.1--1.2.sql pg_stat_statements--1.2--1.3.sql pg_stat_statements--1.3--1.4.sql pg_stat_statements--1.4--1.5.sql pg_stat_statements--1.4.sql pg_stat_statements--1.5--1.6.sql"; \
-    # the first loop is for pg_stat_statement extension version <= 1.6
     for file in /usr/local/pgsql/share/extension/pg_stat_statements--*.sql; do \
         filename=$(basename "$file"); \
-        if echo "$old_list" | grep -q -F "$filename"; then \
+        # Note that there are no downgrade scripts for pg_stat_statements, so we \
+        # don't have to modify any downgrade paths or (much) older versions: we only \
+        # have to make sure every creation of the pg_stat_statements_reset function \
+        # also adds execute permissions to the neon_superuser.
+        case $filename in \
+          pg_stat_statements--1.4.sql) \
+            # pg_stat_statements_reset is first created with 1.4
             echo 'GRANT EXECUTE ON FUNCTION pg_stat_statements_reset() TO neon_superuser;' >> $file; \
-        fi; \
-    done; \
-    # the second loop is for pg_stat_statement extension versions >= 1.7,
-    # where pg_stat_statement_reset() got 3 additional arguments
-    for file in /usr/local/pgsql/share/extension/pg_stat_statements--*.sql; do \
-        filename=$(basename "$file"); \
-        if ! echo "$old_list" | grep -q -F "$filename"; then \
+            ;; \
+          pg_stat_statements--1.6--1.7.sql) \
+            # Then with the 1.6-1.7 migration it is re-created with a new signature, thus add the permissions back
             echo 'GRANT EXECUTE ON FUNCTION pg_stat_statements_reset(Oid, Oid, bigint) TO neon_superuser;' >> $file; \
-        fi; \
-    done
+            ;; \
+          pg_stat_statements--1.10--1.11.sql) \
+            # Then with the 1.10-1.11 migration it is re-created with a new signature again, thus add the permissions back
+            echo 'GRANT EXECUTE ON FUNCTION pg_stat_statements_reset(Oid, Oid, bigint, boolean) TO neon_superuser;' >> $file; \
+            ;; \
+        esac; \
+    done;
 
 #########################################################################################
 #

From 2bbb4d3e1c4d70d5bcdc972eb2b9863d1073338a Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki@neon.tech>
Date: Mon, 16 Sep 2024 21:45:19 +0300
Subject: [PATCH 137/159] Remove misc unused code (#9014)

---
 libs/postgres_ffi/src/xlog_utils.rs |  9 +-------
 libs/utils/src/accum.rs             | 33 -----------------------------
 libs/utils/src/id.rs                | 10 ---------
 libs/utils/src/lib.rs               |  7 ------
 libs/utils/src/lsn.rs               |  9 --------
 libs/utils/src/nonblock.rs          | 17 ---------------
 libs/utils/src/shutdown.rs          |  7 ------
 7 files changed, 1 insertion(+), 91 deletions(-)
 delete mode 100644 libs/utils/src/accum.rs
 delete mode 100644 libs/utils/src/nonblock.rs
 delete mode 100644 libs/utils/src/shutdown.rs

diff --git a/libs/postgres_ffi/src/xlog_utils.rs b/libs/postgres_ffi/src/xlog_utils.rs
index 0cfd56962e..1873734753 100644
--- a/libs/postgres_ffi/src/xlog_utils.rs
+++ b/libs/postgres_ffi/src/xlog_utils.rs
@@ -30,7 +30,7 @@ use std::fs::File;
 use std::io::prelude::*;
 use std::io::ErrorKind;
 use std::io::SeekFrom;
-use std::path::{Path, PathBuf};
+use std::path::Path;
 use std::time::SystemTime;
 use utils::bin_ser::DeserializeError;
 use utils::bin_ser::SerializeError;
@@ -260,13 +260,6 @@ fn open_wal_segment(seg_file_path: &Path) -> anyhow::Result<Option<File>> {
     }
 }
 
-pub fn main() {
-    let mut data_dir = PathBuf::new();
-    data_dir.push(".");
-    let wal_end = find_end_of_wal(&data_dir, WAL_SEGMENT_SIZE, Lsn(0)).unwrap();
-    println!("wal_end={:?}", wal_end);
-}
-
 impl XLogRecord {
     pub fn from_slice(buf: &[u8]) -> Result<XLogRecord, DeserializeError> {
         use utils::bin_ser::LeSer;
diff --git a/libs/utils/src/accum.rs b/libs/utils/src/accum.rs
deleted file mode 100644
index 0fb0190a92..0000000000
--- a/libs/utils/src/accum.rs
+++ /dev/null
@@ -1,33 +0,0 @@
-/// A helper to "accumulate" a value similar to `Iterator::reduce`, but lets you
-/// feed the accumulated values by calling the 'accum' function, instead of having an
-/// iterator.
-///
-/// For example, to calculate the smallest value among some integers:
-///
-/// ```
-/// use utils::accum::Accum;
-///
-/// let values = [1, 2, 3];
-///
-/// let mut min_value: Accum<u32> = Accum(None);
-/// for new_value in &values {
-///     min_value.accum(std::cmp::min, *new_value);
-/// }
-///
-/// assert_eq!(min_value.0.unwrap(), 1);
-/// ```
-pub struct Accum<T>(pub Option<T>);
-impl<T: Copy> Accum<T> {
-    pub fn accum<F>(&mut self, func: F, new_value: T)
-    where
-        F: FnOnce(T, T) -> T,
-    {
-        // If there is no previous value, just store the new value.
-        // Otherwise call the function to decide which one to keep.
-        self.0 = Some(if let Some(accum) = self.0 {
-            func(accum, new_value)
-        } else {
-            new_value
-        });
-    }
-}
diff --git a/libs/utils/src/id.rs b/libs/utils/src/id.rs
index 2cda899b15..eb91839504 100644
--- a/libs/utils/src/id.rs
+++ b/libs/utils/src/id.rs
@@ -88,12 +88,6 @@ impl<'de> Deserialize<'de> for Id {
 }
 
 impl Id {
-    pub fn get_from_buf(buf: &mut impl bytes::Buf) -> Id {
-        let mut arr = [0u8; 16];
-        buf.copy_to_slice(&mut arr);
-        Id::from(arr)
-    }
-
     pub fn from_slice(src: &[u8]) -> Result<Id, IdError> {
         if src.len() != 16 {
             return Err(IdError::SliceParseError(src.len()));
@@ -179,10 +173,6 @@ impl fmt::Debug for Id {
 macro_rules! id_newtype {
     ($t:ident) => {
         impl $t {
-            pub fn get_from_buf(buf: &mut impl bytes::Buf) -> $t {
-                $t(Id::get_from_buf(buf))
-            }
-
             pub fn from_slice(src: &[u8]) -> Result<$t, IdError> {
                 Ok($t(Id::from_slice(src)?))
             }
diff --git a/libs/utils/src/lib.rs b/libs/utils/src/lib.rs
index 218dd468b1..03fb36caf8 100644
--- a/libs/utils/src/lib.rs
+++ b/libs/utils/src/lib.rs
@@ -43,16 +43,9 @@ pub mod logging;
 pub mod lock_file;
 pub mod pid_file;
 
-// Misc
-pub mod accum;
-pub mod shutdown;
-
 // Utility for binding TcpListeners with proper socket options.
 pub mod tcp_listener;
 
-// Utility for putting a raw file descriptor into non-blocking mode
-pub mod nonblock;
-
 // Default signal handling
 pub mod sentry_init;
 pub mod signals;
diff --git a/libs/utils/src/lsn.rs b/libs/utils/src/lsn.rs
index 1aebe91428..06d5c27ebf 100644
--- a/libs/utils/src/lsn.rs
+++ b/libs/utils/src/lsn.rs
@@ -1,6 +1,5 @@
 #![warn(missing_docs)]
 
-use camino::Utf8Path;
 use serde::{de::Visitor, Deserialize, Serialize};
 use std::fmt;
 use std::ops::{Add, AddAssign};
@@ -145,14 +144,6 @@ impl Lsn {
         i128::from(self.0) - i128::from(other)
     }
 
-    /// Parse an LSN from a filename in the form `0000000000000000`
-    pub fn from_filename<F>(filename: F) -> Result<Self, LsnParseError>
-    where
-        F: AsRef<Utf8Path>,
-    {
-        Lsn::from_hex(filename.as_ref().as_str())
-    }
-
     /// Parse an LSN from a string in the form `0000000000000000`
     pub fn from_hex<S>(s: S) -> Result<Self, LsnParseError>
     where
diff --git a/libs/utils/src/nonblock.rs b/libs/utils/src/nonblock.rs
deleted file mode 100644
index 05e2e3af4c..0000000000
--- a/libs/utils/src/nonblock.rs
+++ /dev/null
@@ -1,17 +0,0 @@
-use nix::fcntl::{fcntl, OFlag, F_GETFL, F_SETFL};
-use std::os::unix::io::RawFd;
-
-/// Put a file descriptor into non-blocking mode
-pub fn set_nonblock(fd: RawFd) -> Result<(), std::io::Error> {
-    let bits = fcntl(fd, F_GETFL)?;
-
-    // If F_GETFL returns some unknown bits, they should be valid
-    // for passing back to F_SETFL, too. If we left them out, the F_SETFL
-    // would effectively clear them, which is not what we want.
-    let mut flags = OFlag::from_bits_retain(bits);
-    flags |= OFlag::O_NONBLOCK;
-
-    fcntl(fd, F_SETFL(flags))?;
-
-    Ok(())
-}
diff --git a/libs/utils/src/shutdown.rs b/libs/utils/src/shutdown.rs
deleted file mode 100644
index cb5a44d664..0000000000
--- a/libs/utils/src/shutdown.rs
+++ /dev/null
@@ -1,7 +0,0 @@
-/// Immediately terminate the calling process without calling
-/// atexit callbacks, C runtime destructors etc. We mainly use
-/// this to protect coverage data from concurrent writes.
-pub fn exit_now(code: u8) -> ! {
-    // SAFETY: exiting is safe, the ffi is not safe
-    unsafe { nix::libc::_exit(code as _) };
-}

From 5e16c7bb0b484cc760f4ddb3c0a866157760825e Mon Sep 17 00:00:00 2001
From: Tristan Partin <tristan@neon.tech>
Date: Tue, 10 Sep 2024 15:37:08 +0100
Subject: [PATCH 138/159] Generate pgbench data on the server for most tests

This should generally be faster when running tests, especially those
that run with higher scales.

Ignoring test_lfc_resize since it seems like we are hitting a query
timeout for some reason that I have yet to investigate. A little bit of
improvemnt is better than none.

Signed-off-by: Tristan Partin <tristan@neon.tech>
---
 test_runner/performance/test_branch_creation.py    |  2 +-
 test_runner/performance/test_branching.py          |  2 +-
 .../performance/test_logical_replication.py        | 14 +++++++-------
 .../performance/test_physical_replication.py       |  4 ++--
 test_runner/regress/test_branching.py              |  2 +-
 test_runner/regress/test_disk_usage_eviction.py    |  2 +-
 test_runner/regress/test_hot_standby.py            |  2 +-
 test_runner/regress/test_pageserver_reconnect.py   |  2 +-
 .../test_pageserver_restarts_under_workload.py     |  2 +-
 .../regress/test_threshold_based_eviction.py       |  2 +-
 10 files changed, 17 insertions(+), 17 deletions(-)

diff --git a/test_runner/performance/test_branch_creation.py b/test_runner/performance/test_branch_creation.py
index b3866f1813..f1ab7876f9 100644
--- a/test_runner/performance/test_branch_creation.py
+++ b/test_runner/performance/test_branch_creation.py
@@ -107,7 +107,7 @@ def test_branch_creation_many(neon_compare: NeonCompare, n_branches: int, shape:
     env.neon_cli.create_branch("b0")
 
     endpoint = env.endpoints.create_start("b0")
-    neon_compare.pg_bin.run_capture(["pgbench", "-i", "-s10", endpoint.connstr()])
+    neon_compare.pg_bin.run_capture(["pgbench", "-i", "-I", "dtGvp", "-s10", endpoint.connstr()])
 
     branch_creation_durations = []
 
diff --git a/test_runner/performance/test_branching.py b/test_runner/performance/test_branching.py
index 667d1a4c4a..f8d39487f2 100644
--- a/test_runner/performance/test_branching.py
+++ b/test_runner/performance/test_branching.py
@@ -43,7 +43,7 @@ def test_compare_child_and_root_pgbench_perf(neon_compare: NeonCompare):
 
     env.neon_cli.create_branch("root")
     endpoint_root = env.endpoints.create_start("root")
-    pg_bin.run_capture(["pgbench", "-i", endpoint_root.connstr(), "-s10"])
+    pg_bin.run_capture(["pgbench", "-i", "-I", "dtGvp", endpoint_root.connstr(), "-s10"])
 
     fork_at_current_lsn(env, endpoint_root, "child", "root")
 
diff --git a/test_runner/performance/test_logical_replication.py b/test_runner/performance/test_logical_replication.py
index 29a0380524..dbf94a2cf5 100644
--- a/test_runner/performance/test_logical_replication.py
+++ b/test_runner/performance/test_logical_replication.py
@@ -24,13 +24,13 @@ def test_logical_replication(neon_simple_env: NeonEnv, pg_bin: PgBin, vanilla_pg
 
     endpoint = env.endpoints.create_start("main")
 
-    pg_bin.run_capture(["pgbench", "-i", "-s10", endpoint.connstr()])
+    pg_bin.run_capture(["pgbench", "-i", "-I", "dtGvp", "-s10", endpoint.connstr()])
 
     endpoint.safe_psql("create publication pub1 for table pgbench_accounts, pgbench_history")
 
     # now start subscriber
     vanilla_pg.start()
-    pg_bin.run_capture(["pgbench", "-i", "-s10", vanilla_pg.connstr()])
+    pg_bin.run_capture(["pgbench", "-i", "-I", "dtGvp", "-s10", vanilla_pg.connstr()])
 
     vanilla_pg.safe_psql("truncate table pgbench_accounts")
     vanilla_pg.safe_psql("truncate table pgbench_history")
@@ -99,9 +99,9 @@ def test_subscriber_lag(
     sub_connstr = benchmark_project_sub.connstr
 
     if benchmark_project_pub.is_new:
-        pg_bin.run_capture(["pgbench", "-i", "-s100"], env=pub_env)
+        pg_bin.run_capture(["pgbench", "-i", "-I", "dtGvp", "-s100"], env=pub_env)
     if benchmark_project_sub.is_new:
-        pg_bin.run_capture(["pgbench", "-i", "-s100"], env=sub_env)
+        pg_bin.run_capture(["pgbench", "-i", "-I", "dtGvp", "-s100"], env=sub_env)
 
     pub_conn = psycopg2.connect(pub_connstr)
     sub_conn = psycopg2.connect(sub_connstr)
@@ -193,8 +193,8 @@ def test_publisher_restart(
     pub_connstr = benchmark_project_pub.connstr
     sub_connstr = benchmark_project_sub.connstr
 
-    pg_bin.run_capture(["pgbench", "-i", "-s100"], env=pub_env)
-    pg_bin.run_capture(["pgbench", "-i", "-s100"], env=sub_env)
+    pg_bin.run_capture(["pgbench", "-i", "-I", "dtGvp", "-s100"], env=pub_env)
+    pg_bin.run_capture(["pgbench", "-i", "-I", "dtGvp", "-s100"], env=sub_env)
 
     pub_conn = psycopg2.connect(pub_connstr)
     sub_conn = psycopg2.connect(sub_connstr)
@@ -288,7 +288,7 @@ def test_snap_files(
             is_super = cur.fetchall()[0][0]
             assert is_super, "This benchmark won't work if we don't have superuser"
 
-    pg_bin.run_capture(["pgbench", "-i", "-s100"], env=env)
+    pg_bin.run_capture(["pgbench", "-i", "-I", "dtGvp", "-s100"], env=env)
 
     conn = psycopg2.connect(connstr)
     conn.autocommit = True
diff --git a/test_runner/performance/test_physical_replication.py b/test_runner/performance/test_physical_replication.py
index 7e16197211..49b1176d34 100644
--- a/test_runner/performance/test_physical_replication.py
+++ b/test_runner/performance/test_physical_replication.py
@@ -85,7 +85,7 @@ def test_ro_replica_lag(
             endpoint_id=replica["endpoint"]["id"],
         )["uri"]
 
-        pg_bin.run_capture(["pgbench", "-i", "-s100"], env=master_env)
+        pg_bin.run_capture(["pgbench", "-i", "-I", "dtGvp", "-s100"], env=master_env)
 
         master_workload = pg_bin.run_nonblocking(
             ["pgbench", "-c10", pgbench_duration, "-Mprepared"],
@@ -212,7 +212,7 @@ def test_replication_start_stop(
         for i in range(num_replicas):
             replica_env[i]["PGHOST"] = replicas[i]["endpoint"]["host"]
 
-        pg_bin.run_capture(["pgbench", "-i", "-s10"], env=master_env)
+        pg_bin.run_capture(["pgbench", "-i", "-I", "dtGvp", "-s10"], env=master_env)
 
         # Sync replicas
         with psycopg2.connect(master_connstr) as conn_master:
diff --git a/test_runner/regress/test_branching.py b/test_runner/regress/test_branching.py
index fc74707639..1729e2fc98 100644
--- a/test_runner/regress/test_branching.py
+++ b/test_runner/regress/test_branching.py
@@ -52,7 +52,7 @@ def test_branching_with_pgbench(
     def run_pgbench(connstr: str):
         log.info(f"Start a pgbench workload on pg {connstr}")
 
-        pg_bin.run_capture(["pgbench", "-i", f"-s{scale}", connstr])
+        pg_bin.run_capture(["pgbench", "-i", "-I", "dtGvp", f"-s{scale}", connstr])
         pg_bin.run_capture(["pgbench", "-T15", connstr])
 
     env.neon_cli.create_branch("b0", tenant_id=tenant)
diff --git a/test_runner/regress/test_disk_usage_eviction.py b/test_runner/regress/test_disk_usage_eviction.py
index 85616c3fe2..1fec8b3f18 100644
--- a/test_runner/regress/test_disk_usage_eviction.py
+++ b/test_runner/regress/test_disk_usage_eviction.py
@@ -291,7 +291,7 @@ def pgbench_init_tenant(
     )
 
     with env.endpoints.create_start("main", tenant_id=tenant_id) as endpoint:
-        pg_bin.run(["pgbench", "-i", f"-s{scale}", endpoint.connstr()])
+        pg_bin.run(["pgbench", "-i", "-I", "dtGvp", f"-s{scale}", endpoint.connstr()])
         wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id)
 
     return (tenant_id, timeline_id)
diff --git a/test_runner/regress/test_hot_standby.py b/test_runner/regress/test_hot_standby.py
index ae63136abb..d94704012f 100644
--- a/test_runner/regress/test_hot_standby.py
+++ b/test_runner/regress/test_hot_standby.py
@@ -199,7 +199,7 @@ def test_hot_standby_gc(neon_env_builder: NeonEnvBuilder, pause_apply: bool):
 def run_pgbench(connstr: str, pg_bin: PgBin):
     log.info(f"Start a pgbench workload on pg {connstr}")
     # s10 is about 150MB of data. In debug mode init takes about 15s on SSD.
-    pg_bin.run_capture(["pgbench", "-i", "-s10", connstr])
+    pg_bin.run_capture(["pgbench", "-i", "-I", "dtGvp", "-s10", connstr])
     log.info("pgbench init done")
     pg_bin.run_capture(["pgbench", "-T60", connstr])
 
diff --git a/test_runner/regress/test_pageserver_reconnect.py b/test_runner/regress/test_pageserver_reconnect.py
index 37ff923632..ada6da98ff 100644
--- a/test_runner/regress/test_pageserver_reconnect.py
+++ b/test_runner/regress/test_pageserver_reconnect.py
@@ -22,7 +22,7 @@ def test_pageserver_reconnect(neon_simple_env: NeonEnv, pg_bin: PgBin):
 
     def run_pgbench(connstr: str):
         log.info(f"Start a pgbench workload on pg {connstr}")
-        pg_bin.run_capture(["pgbench", "-i", f"-s{scale}", connstr])
+        pg_bin.run_capture(["pgbench", "-i", "-I", "dtGvp", f"-s{scale}", connstr])
         pg_bin.run_capture(["pgbench", f"-T{int(n_reconnects*timeout)}", connstr])
 
     thread = threading.Thread(target=run_pgbench, args=(endpoint.connstr(),), daemon=True)
diff --git a/test_runner/regress/test_pageserver_restarts_under_workload.py b/test_runner/regress/test_pageserver_restarts_under_workload.py
index 65569f3bac..9bb9b373ad 100644
--- a/test_runner/regress/test_pageserver_restarts_under_workload.py
+++ b/test_runner/regress/test_pageserver_restarts_under_workload.py
@@ -19,7 +19,7 @@ def test_pageserver_restarts_under_worload(neon_simple_env: NeonEnv, pg_bin: PgB
 
     def run_pgbench(connstr: str):
         log.info(f"Start a pgbench workload on pg {connstr}")
-        pg_bin.run_capture(["pgbench", "-i", f"-s{scale}", connstr])
+        pg_bin.run_capture(["pgbench", "-i", "-I", "dtGvp", f"-s{scale}", connstr])
         pg_bin.run_capture(["pgbench", f"-T{n_restarts}", connstr])
 
     thread = threading.Thread(target=run_pgbench, args=(endpoint.connstr(),), daemon=True)
diff --git a/test_runner/regress/test_threshold_based_eviction.py b/test_runner/regress/test_threshold_based_eviction.py
index 840c7159ad..094dd20529 100644
--- a/test_runner/regress/test_threshold_based_eviction.py
+++ b/test_runner/regress/test_threshold_based_eviction.py
@@ -106,7 +106,7 @@ def test_threshold_based_eviction(
 
     # create a bunch of layers
     with env.endpoints.create_start("main", tenant_id=tenant_id) as pg:
-        pg_bin.run(["pgbench", "-i", "-s", "3", pg.connstr()])
+        pg_bin.run(["pgbench", "-i", "-I", "dtGvp", "-s", "3", pg.connstr()])
         last_flush_lsn_upload(env, pg, tenant_id, timeline_id)
     # wrap up and shutdown safekeepers so that no more layers will be created after the final checkpoint
     for sk in env.safekeepers:

From 3a52e356c18440cb3b685d2cf1848fd3a3c86c67 Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki@neon.tech>
Date: Tue, 17 Sep 2024 01:46:58 +0300
Subject: [PATCH 139/159] Remove unused function (#9018)

---
 test_runner/fixtures/neon_fixtures.py | 19 -------------------
 1 file changed, 19 deletions(-)

diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index 50284a3f5a..5f7a32782c 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -182,25 +182,6 @@ def top_output_dir(base_dir: Path) -> Iterator[Path]:
     yield output_dir
 
 
-@pytest.fixture(scope="function")
-def versioned_pg_distrib_dir(pg_distrib_dir: Path, pg_version: PgVersion) -> Iterator[Path]:
-    versioned_dir = pg_distrib_dir / pg_version.v_prefixed
-
-    psql_bin_path = versioned_dir / "bin/psql"
-    postgres_bin_path = versioned_dir / "bin/postgres"
-
-    if os.getenv("REMOTE_ENV"):
-        # When testing against a remote server, we only need the client binary.
-        if not psql_bin_path.exists():
-            raise Exception(f"psql not found at '{psql_bin_path}'")
-    else:
-        if not postgres_bin_path.exists():
-            raise Exception(f"postgres not found at '{postgres_bin_path}'")
-
-    log.info(f"versioned_pg_distrib_dir is {versioned_dir}")
-    yield versioned_dir
-
-
 @pytest.fixture(scope="session")
 def neon_api_key() -> str:
     api_key = os.getenv("NEON_API_KEY")

From fec9321fc04434f84f86d166a6d89c4421110c77 Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki@neon.tech>
Date: Tue, 17 Sep 2024 02:23:49 +0300
Subject: [PATCH 140/159] Use Path type in a few more places in
 neon_fixtures.py (#9018)

This is in preparation of replacing neon_fixtures.get_dir_size with
neon_fixtures.utils.get_dir_size() in next commit.
---
 test_runner/fixtures/compare_fixtures.py |  5 ++--
 test_runner/fixtures/neon_fixtures.py    | 32 ++++++++++++------------
 2 files changed, 19 insertions(+), 18 deletions(-)

diff --git a/test_runner/fixtures/compare_fixtures.py b/test_runner/fixtures/compare_fixtures.py
index 7c4a8db36f..770b32b11e 100644
--- a/test_runner/fixtures/compare_fixtures.py
+++ b/test_runner/fixtures/compare_fixtures.py
@@ -4,6 +4,7 @@ from abc import ABC, abstractmethod
 from contextlib import _GeneratorContextManager, contextmanager
 
 # Type-related stuff
+from pathlib import Path
 from typing import Dict, Iterator, List
 
 import pytest
@@ -229,11 +230,11 @@ class VanillaCompare(PgCompare):
         pass  # TODO find something
 
     def report_size(self):
-        data_size = self.pg.get_subdir_size("base")
+        data_size = self.pg.get_subdir_size(Path("base"))
         self.zenbenchmark.record(
             "data_size", data_size / (1024 * 1024), "MB", report=MetricReport.LOWER_IS_BETTER
         )
-        wal_size = self.pg.get_subdir_size("pg_wal")
+        wal_size = self.pg.get_subdir_size(Path("pg_wal"))
         self.zenbenchmark.record(
             "wal_size", wal_size / (1024 * 1024), "MB", report=MetricReport.LOWER_IS_BETTER
         )
diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index 5f7a32782c..90a351cdb3 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -224,7 +224,7 @@ def worker_base_port(worker_seq_no: int, worker_port_num: int) -> int:
     return BASE_PORT + worker_seq_no * worker_port_num
 
 
-def get_dir_size(path: str) -> int:
+def get_dir_size(path: Path) -> int:
     """Return size in bytes."""
     totalbytes = 0
     for root, _dirs, files in os.walk(path):
@@ -3319,12 +3319,12 @@ class PgBin:
         )
         return base_path
 
-    def get_pg_controldata_checkpoint_lsn(self, pgdata: str) -> Lsn:
+    def get_pg_controldata_checkpoint_lsn(self, pgdata: Path) -> Lsn:
         """
         Run pg_controldata on given datadir and extract checkpoint lsn.
         """
 
-        pg_controldata_path = os.path.join(self.pg_bin_path, "pg_controldata")
+        pg_controldata_path = self.pg_bin_path / "pg_controldata"
         cmd = f"{pg_controldata_path} -D {pgdata}"
         result = subprocess.run(cmd, capture_output=True, text=True, shell=True)
         checkpoint_lsn = re.findall(
@@ -3433,9 +3433,9 @@ class VanillaPostgres(PgProtocol):
         self.running = False
         self.pg_bin.run_capture(["pg_ctl", "-w", "-D", str(self.pgdatadir), "stop"])
 
-    def get_subdir_size(self, subdir) -> int:
+    def get_subdir_size(self, subdir: Path) -> int:
         """Return size of pgdatadir subdirectory in bytes."""
-        return get_dir_size(os.path.join(self.pgdatadir, subdir))
+        return get_dir_size(self.pgdatadir / subdir)
 
     def __enter__(self) -> "VanillaPostgres":
         return self
@@ -3962,7 +3962,7 @@ class Endpoint(PgProtocol, LogUtils):
         self.env = env
         self.branch_name: Optional[str] = None  # dubious
         self.endpoint_id: Optional[str] = None  # dubious, see asserts below
-        self.pgdata_dir: Optional[str] = None  # Path to computenode PGDATA
+        self.pgdata_dir: Optional[Path] = None  # Path to computenode PGDATA
         self.tenant_id = tenant_id
         self.pg_port = pg_port
         self.http_port = http_port
@@ -4019,7 +4019,7 @@ class Endpoint(PgProtocol, LogUtils):
             allow_multiple=allow_multiple,
         )
         path = Path("endpoints") / self.endpoint_id / "pgdata"
-        self.pgdata_dir = os.path.join(self.env.repo_dir, path)
+        self.pgdata_dir = self.env.repo_dir / path
         self.logfile = self.endpoint_path() / "compute.log"
 
         config_lines = config_lines or []
@@ -4072,21 +4072,21 @@ class Endpoint(PgProtocol, LogUtils):
         path = Path("endpoints") / self.endpoint_id
         return self.env.repo_dir / path
 
-    def pg_data_dir_path(self) -> str:
+    def pg_data_dir_path(self) -> Path:
         """Path to Postgres data directory"""
-        return os.path.join(self.endpoint_path(), "pgdata")
+        return self.endpoint_path() / "pgdata"
 
-    def pg_xact_dir_path(self) -> str:
+    def pg_xact_dir_path(self) -> Path:
         """Path to pg_xact dir"""
-        return os.path.join(self.pg_data_dir_path(), "pg_xact")
+        return self.pg_data_dir_path() / "pg_xact"
 
-    def pg_twophase_dir_path(self) -> str:
+    def pg_twophase_dir_path(self) -> Path:
         """Path to pg_twophase dir"""
-        return os.path.join(self.pg_data_dir_path(), "pg_twophase")
+        return self.pg_data_dir_path() / "pg_twophase"
 
-    def config_file_path(self) -> str:
+    def config_file_path(self) -> Path:
         """Path to the postgresql.conf in the endpoint directory (not the one in pgdata)"""
-        return os.path.join(self.endpoint_path(), "postgresql.conf")
+        return self.endpoint_path() / "postgresql.conf"
 
     def config(self, lines: List[str]) -> "Endpoint":
         """
@@ -4251,7 +4251,7 @@ class Endpoint(PgProtocol, LogUtils):
         log.info(f'checkpointing at LSN {self.safe_psql("select pg_current_wal_lsn()")[0][0]}')
         self.safe_psql("checkpoint")
         assert self.pgdata_dir is not None  # please mypy
-        return get_dir_size(os.path.join(self.pgdata_dir, "pg_wal")) / 1024 / 1024
+        return get_dir_size(self.pgdata_dir / "pg_wal") / 1024 / 1024
 
     def clear_shared_buffers(self, cursor: Optional[Any] = None):
         """

From c6f56b8462e16284b77a3176801c9ed7364df04b Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki@neon.tech>
Date: Tue, 17 Sep 2024 01:47:15 +0300
Subject: [PATCH 141/159] Remove redundant get_dir_size() function (#9018)

There was another copy of it in utils.py. The only difference is that
the version in utils.py tolerates files that are concurrently
removed. That seems fine for the few callers in neon_fixtures.py too.
---
 test_runner/fixtures/neon_fixtures.py | 11 +----------
 1 file changed, 1 insertion(+), 10 deletions(-)

diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index 90a351cdb3..92dcd1e3cd 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -93,6 +93,7 @@ from fixtures.utils import (
     allure_add_grafana_links,
     allure_attach_from_dir,
     assert_no_errors,
+    get_dir_size,
     get_self_dir,
     print_gc_result,
     subprocess_capture,
@@ -224,16 +225,6 @@ def worker_base_port(worker_seq_no: int, worker_port_num: int) -> int:
     return BASE_PORT + worker_seq_no * worker_port_num
 
 
-def get_dir_size(path: Path) -> int:
-    """Return size in bytes."""
-    totalbytes = 0
-    for root, _dirs, files in os.walk(path):
-        for name in files:
-            totalbytes += os.path.getsize(os.path.join(root, name))
-
-    return totalbytes
-
-
 @pytest.fixture(scope="session")
 def port_distributor(worker_base_port: int, worker_port_num: int) -> PortDistributor:
     return PortDistributor(base_port=worker_base_port, port_number=worker_port_num)

From 4295ff0f071789c22d0a838f5512e5caba57d54a Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki@neon.tech>
Date: Tue, 17 Sep 2024 01:47:21 +0300
Subject: [PATCH 142/159] Mark a couple of test fixtures as session-scoped
 (#9018)

pg_distrib_dir doesn't include the Postgres version and only depends
on env variables which cannot change during a test run, so it can be
marked as session-scoped. Similarly, the platform cannot change during
a test run.
---
 test_runner/fixtures/neon_fixtures.py | 2 +-
 test_runner/fixtures/parametrize.py   | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index 92dcd1e3cd..d2caee5992 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -159,7 +159,7 @@ def neon_binpath(base_dir: Path, build_type: str) -> Iterator[Path]:
     yield binpath
 
 
-@pytest.fixture(scope="function")
+@pytest.fixture(scope="session")
 def pg_distrib_dir(base_dir: Path) -> Iterator[Path]:
     if env_postgres_bin := os.environ.get("POSTGRES_DISTRIB_DIR"):
         distrib_dir = Path(env_postgres_bin).resolve()
diff --git a/test_runner/fixtures/parametrize.py b/test_runner/fixtures/parametrize.py
index e2dd51802c..2c8e71526c 100644
--- a/test_runner/fixtures/parametrize.py
+++ b/test_runner/fixtures/parametrize.py
@@ -24,7 +24,7 @@ def build_type() -> Optional[str]:
     return None
 
 
-@pytest.fixture(scope="function", autouse=True)
+@pytest.fixture(scope="session", autouse=True)
 def platform() -> Optional[str]:
     return None
 

From 2db840d8b8736530a9653719ed2560e3647539a0 Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki@neon.tech>
Date: Tue, 17 Sep 2024 01:47:26 +0300
Subject: [PATCH 143/159] Move a few test functions related to auth tokens to
 separate file (#9018)

For readability. neon_fixtures.py is huge.
---
 test_runner/fixtures/auth_tokens.py           | 47 +++++++++++++++++++
 test_runner/fixtures/neon_fixtures.py         | 40 +---------------
 .../regress/test_storage_controller.py        |  2 +-
 3 files changed, 49 insertions(+), 40 deletions(-)
 create mode 100644 test_runner/fixtures/auth_tokens.py

diff --git a/test_runner/fixtures/auth_tokens.py b/test_runner/fixtures/auth_tokens.py
new file mode 100644
index 0000000000..8ebaf61e5e
--- /dev/null
+++ b/test_runner/fixtures/auth_tokens.py
@@ -0,0 +1,47 @@
+from __future__ import annotations
+
+from dataclasses import dataclass
+from enum import Enum
+from typing import Any
+
+import jwt
+
+from fixtures.common_types import TenantId
+
+
+@dataclass
+class AuthKeys:
+    priv: str
+
+    def generate_token(self, *, scope: TokenScope, **token_data: Any) -> str:
+        token_data = {key: str(val) for key, val in token_data.items()}
+        token = jwt.encode({"scope": scope, **token_data}, self.priv, algorithm="EdDSA")
+        # cast(Any, self.priv)
+
+        # jwt.encode can return 'bytes' or 'str', depending on Python version or type
+        # hinting or something (not sure what). If it returned 'bytes', convert it to 'str'
+        # explicitly.
+        if isinstance(token, bytes):
+            token = token.decode()
+
+        return token
+
+    def generate_pageserver_token(self) -> str:
+        return self.generate_token(scope=TokenScope.PAGE_SERVER_API)
+
+    def generate_safekeeper_token(self) -> str:
+        return self.generate_token(scope=TokenScope.SAFEKEEPER_DATA)
+
+    # generate token giving access to only one tenant
+    def generate_tenant_token(self, tenant_id: TenantId) -> str:
+        return self.generate_token(scope=TokenScope.TENANT, tenant_id=str(tenant_id))
+
+
+# TODO: Replace with `StrEnum` when we upgrade to python 3.11
+class TokenScope(str, Enum):
+    ADMIN = "admin"
+    PAGE_SERVER_API = "pageserverapi"
+    GENERATIONS_API = "generations_api"
+    SAFEKEEPER_DATA = "safekeeperdata"
+    TENANT = "tenant"
+    SCRUBBER = "scrubber"
diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index d2caee5992..93b93ff019 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -43,7 +43,6 @@ from urllib.parse import quote, urlparse
 import asyncpg
 import backoff
 import httpx
-import jwt
 import psycopg2
 import psycopg2.sql
 import pytest
@@ -60,6 +59,7 @@ from psycopg2.extensions import make_dsn, parse_dsn
 from urllib3.util.retry import Retry
 
 from fixtures import overlayfs
+from fixtures.auth_tokens import AuthKeys, TokenScope
 from fixtures.broker import NeonBroker
 from fixtures.common_types import Lsn, NodeId, TenantId, TenantShardId, TimelineId
 from fixtures.endpoint.http import EndpointHttpClient
@@ -373,44 +373,6 @@ class PgProtocol:
         return self.safe_psql(query, log_query=log_query)[0][0]
 
 
-@dataclass
-class AuthKeys:
-    priv: str
-
-    def generate_token(self, *, scope: TokenScope, **token_data: Any) -> str:
-        token_data = {key: str(val) for key, val in token_data.items()}
-        token = jwt.encode({"scope": scope, **token_data}, self.priv, algorithm="EdDSA")
-        # cast(Any, self.priv)
-
-        # jwt.encode can return 'bytes' or 'str', depending on Python version or type
-        # hinting or something (not sure what). If it returned 'bytes', convert it to 'str'
-        # explicitly.
-        if isinstance(token, bytes):
-            token = token.decode()
-
-        return token
-
-    def generate_pageserver_token(self) -> str:
-        return self.generate_token(scope=TokenScope.PAGE_SERVER_API)
-
-    def generate_safekeeper_token(self) -> str:
-        return self.generate_token(scope=TokenScope.SAFEKEEPER_DATA)
-
-    # generate token giving access to only one tenant
-    def generate_tenant_token(self, tenant_id: TenantId) -> str:
-        return self.generate_token(scope=TokenScope.TENANT, tenant_id=str(tenant_id))
-
-
-# TODO: Replace with `StrEnum` when we upgrade to python 3.11
-class TokenScope(str, Enum):
-    ADMIN = "admin"
-    PAGE_SERVER_API = "pageserverapi"
-    GENERATIONS_API = "generations_api"
-    SAFEKEEPER_DATA = "safekeeperdata"
-    TENANT = "tenant"
-    SCRUBBER = "scrubber"
-
-
 class NeonEnvBuilder:
     """
     Builder object to create a Neon runtime environment
diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py
index eea05d7548..2e21f8fb46 100644
--- a/test_runner/regress/test_storage_controller.py
+++ b/test_runner/regress/test_storage_controller.py
@@ -7,6 +7,7 @@ from datetime import datetime, timezone
 from typing import Any, Dict, List, Optional, Set, Tuple, Union
 
 import pytest
+from fixtures.auth_tokens import TokenScope
 from fixtures.common_types import TenantId, TenantShardId, TimelineId
 from fixtures.compute_reconfigure import ComputeReconfigure
 from fixtures.log_helper import log
@@ -18,7 +19,6 @@ from fixtures.neon_fixtures import (
     PgBin,
     StorageControllerApiException,
     StorageControllerLeadershipStatus,
-    TokenScope,
     last_flush_lsn_upload,
 )
 from fixtures.pageserver.http import PageserverApiException, PageserverHttpClient

From b719d58863f90beb46218d1167e4236b05910924 Mon Sep 17 00:00:00 2001
From: Vlad Lazar <vlad@neon.tech>
Date: Tue, 17 Sep 2024 09:25:42 +0100
Subject: [PATCH 144/159] storcon: forward requests from stepped down instance
 to the current leader (#8954)

## Problem
It turns out that we can't rely on external orchestration to promptly
route trafic to the new leader. This is downtime inducing.
Forwarding provides a safe way out.

## Safety
We forward when:
1. Request is not one of ["/control/v1/step_down", "/status", "/ready",
"/metrics"]
2. Current instance is in [`LeadershipStatus::SteppedDown`] state
3. There is a leader in the database to forward to
4. Leader from step (3) is not the current instance

If a storcon instance is persisted in the database, then we know that it
is the current leader.
There's one exception: time between handling step-down request and the
new leader updating the
database.

Let's treat the happy case first. The stepped down node does not produce
any side effects,
since all request handling happens on the leader.

As for the edge case, we are guaranteed to always have a maximum of two
running instances.
Hence, if we are in the edge case scenario the leader persisted in the
database is the
stepped down instance that received the request. Condition (4) above
covers this scenario.

## Summary of changes
* Conversion utilities for reqwest <-> hyper. I'm not happy with these,
but I don't see a better way. Open to suggestions.
* Add request forwarding logic
* Update each request handler. Again, not happy with this. If anyone
knows a nice to wrap the handlers, lmk. Me and Joonas tried :/
* Update each handler to maybe forward
* Tweak tests to showcase new behaviour
---
 storage_controller/src/http.rs                | 620 +++++++++++++++++-
 .../regress/test_storage_controller.py        |  22 +-
 2 files changed, 607 insertions(+), 35 deletions(-)

diff --git a/storage_controller/src/http.rs b/storage_controller/src/http.rs
index a6638f5191..1745bf5575 100644
--- a/storage_controller/src/http.rs
+++ b/storage_controller/src/http.rs
@@ -1,10 +1,11 @@
+use crate::http;
 use crate::metrics::{
     HttpRequestLatencyLabelGroup, HttpRequestStatusLabelGroup, PageserverRequestLabelGroup,
     METRICS_REGISTRY,
 };
 use crate::persistence::SafekeeperPersistence;
 use crate::reconciler::ReconcileError;
-use crate::service::{LeadershipStatus, Service, STARTUP_RECONCILE_TIMEOUT};
+use crate::service::{LeadershipStatus, Service, RECONCILE_TIMEOUT, STARTUP_RECONCILE_TIMEOUT};
 use anyhow::Context;
 use futures::Future;
 use hyper::header::CONTENT_TYPE;
@@ -22,6 +23,7 @@ use pageserver_api::models::{
 };
 use pageserver_api::shard::TenantShardId;
 use pageserver_client::{mgmt_api, BlockUnblock};
+use std::str::FromStr;
 use std::sync::Arc;
 use std::time::{Duration, Instant};
 use tokio_util::sync::CancellationToken;
@@ -87,9 +89,16 @@ fn get_state(request: &Request<Body>) -> &HttpState {
 }
 
 /// Pageserver calls into this on startup, to learn which tenants it should attach
-async fn handle_re_attach(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
+async fn handle_re_attach(req: Request<Body>) -> Result<Response<Body>, ApiError> {
     check_permissions(&req, Scope::GenerationsApi)?;
 
+    let mut req = match maybe_forward(req).await {
+        ForwardOutcome::Forwarded(res) => {
+            return res;
+        }
+        ForwardOutcome::NotForwarded(req) => req,
+    };
+
     let reattach_req = json_request::<ReAttachRequest>(&mut req).await?;
     let state = get_state(&req);
     json_response(StatusCode::OK, state.service.re_attach(reattach_req).await?)
@@ -97,9 +106,16 @@ async fn handle_re_attach(mut req: Request<Body>) -> Result<Response<Body>, ApiE
 
 /// Pageserver calls into this before doing deletions, to confirm that it still
 /// holds the latest generation for the tenants with deletions enqueued
-async fn handle_validate(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
+async fn handle_validate(req: Request<Body>) -> Result<Response<Body>, ApiError> {
     check_permissions(&req, Scope::GenerationsApi)?;
 
+    let mut req = match maybe_forward(req).await {
+        ForwardOutcome::Forwarded(res) => {
+            return res;
+        }
+        ForwardOutcome::NotForwarded(req) => req,
+    };
+
     let validate_req = json_request::<ValidateRequest>(&mut req).await?;
     let state = get_state(&req);
     json_response(StatusCode::OK, state.service.validate(validate_req).await?)
@@ -108,9 +124,16 @@ async fn handle_validate(mut req: Request<Body>) -> Result<Response<Body>, ApiEr
 /// Call into this before attaching a tenant to a pageserver, to acquire a generation number
 /// (in the real control plane this is unnecessary, because the same program is managing
 ///  generation numbers and doing attachments).
-async fn handle_attach_hook(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
+async fn handle_attach_hook(req: Request<Body>) -> Result<Response<Body>, ApiError> {
     check_permissions(&req, Scope::Admin)?;
 
+    let mut req = match maybe_forward(req).await {
+        ForwardOutcome::Forwarded(res) => {
+            return res;
+        }
+        ForwardOutcome::NotForwarded(req) => req,
+    };
+
     let attach_req = json_request::<AttachHookRequest>(&mut req).await?;
     let state = get_state(&req);
 
@@ -124,9 +147,16 @@ async fn handle_attach_hook(mut req: Request<Body>) -> Result<Response<Body>, Ap
     )
 }
 
-async fn handle_inspect(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
+async fn handle_inspect(req: Request<Body>) -> Result<Response<Body>, ApiError> {
     check_permissions(&req, Scope::Admin)?;
 
+    let mut req = match maybe_forward(req).await {
+        ForwardOutcome::Forwarded(res) => {
+            return res;
+        }
+        ForwardOutcome::NotForwarded(req) => req,
+    };
+
     let inspect_req = json_request::<InspectRequest>(&mut req).await?;
 
     let state = get_state(&req);
@@ -136,10 +166,17 @@ async fn handle_inspect(mut req: Request<Body>) -> Result<Response<Body>, ApiErr
 
 async fn handle_tenant_create(
     service: Arc<Service>,
-    mut req: Request<Body>,
+    req: Request<Body>,
 ) -> Result<Response<Body>, ApiError> {
     check_permissions(&req, Scope::PageServerApi)?;
 
+    let mut req = match maybe_forward(req).await {
+        ForwardOutcome::Forwarded(res) => {
+            return res;
+        }
+        ForwardOutcome::NotForwarded(req) => req,
+    };
+
     let create_req = json_request::<TenantCreateRequest>(&mut req).await?;
 
     json_response(
@@ -150,11 +187,18 @@ async fn handle_tenant_create(
 
 async fn handle_tenant_location_config(
     service: Arc<Service>,
-    mut req: Request<Body>,
+    req: Request<Body>,
 ) -> Result<Response<Body>, ApiError> {
     let tenant_shard_id: TenantShardId = parse_request_param(&req, "tenant_shard_id")?;
     check_permissions(&req, Scope::PageServerApi)?;
 
+    let mut req = match maybe_forward(req).await {
+        ForwardOutcome::Forwarded(res) => {
+            return res;
+        }
+        ForwardOutcome::NotForwarded(req) => req,
+    };
+
     let config_req = json_request::<TenantLocationConfigRequest>(&mut req).await?;
     json_response(
         StatusCode::OK,
@@ -166,10 +210,17 @@ async fn handle_tenant_location_config(
 
 async fn handle_tenant_config_set(
     service: Arc<Service>,
-    mut req: Request<Body>,
+    req: Request<Body>,
 ) -> Result<Response<Body>, ApiError> {
     check_permissions(&req, Scope::PageServerApi)?;
 
+    let mut req = match maybe_forward(req).await {
+        ForwardOutcome::Forwarded(res) => {
+            return res;
+        }
+        ForwardOutcome::NotForwarded(req) => req,
+    };
+
     let config_req = json_request::<TenantConfigRequest>(&mut req).await?;
 
     json_response(StatusCode::OK, service.tenant_config_set(config_req).await?)
@@ -182,16 +233,30 @@ async fn handle_tenant_config_get(
     let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
     check_permissions(&req, Scope::PageServerApi)?;
 
+    match maybe_forward(req).await {
+        ForwardOutcome::Forwarded(res) => {
+            return res;
+        }
+        ForwardOutcome::NotForwarded(_req) => {}
+    };
+
     json_response(StatusCode::OK, service.tenant_config_get(tenant_id)?)
 }
 
 async fn handle_tenant_time_travel_remote_storage(
     service: Arc<Service>,
-    mut req: Request<Body>,
+    req: Request<Body>,
 ) -> Result<Response<Body>, ApiError> {
     let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
     check_permissions(&req, Scope::PageServerApi)?;
 
+    let mut req = match maybe_forward(req).await {
+        ForwardOutcome::Forwarded(res) => {
+            return res;
+        }
+        ForwardOutcome::NotForwarded(req) => req,
+    };
+
     let time_travel_req = json_request::<TenantTimeTravelRequest>(&mut req).await?;
 
     let timestamp_raw = must_get_query_param(&req, "travel_to")?;
@@ -232,6 +297,13 @@ async fn handle_tenant_secondary_download(
     let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
     let wait = parse_query_param(&req, "wait_ms")?.map(Duration::from_millis);
 
+    match maybe_forward(req).await {
+        ForwardOutcome::Forwarded(res) => {
+            return res;
+        }
+        ForwardOutcome::NotForwarded(_req) => {}
+    };
+
     let (status, progress) = service.tenant_secondary_download(tenant_id, wait).await?;
     json_response(map_reqwest_hyper_status(status)?, progress)
 }
@@ -243,6 +315,13 @@ async fn handle_tenant_delete(
     let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
     check_permissions(&req, Scope::PageServerApi)?;
 
+    match maybe_forward(req).await {
+        ForwardOutcome::Forwarded(res) => {
+            return res;
+        }
+        ForwardOutcome::NotForwarded(_req) => {}
+    };
+
     let status_code = service
         .tenant_delete(tenant_id)
         .await
@@ -258,11 +337,18 @@ async fn handle_tenant_delete(
 
 async fn handle_tenant_timeline_create(
     service: Arc<Service>,
-    mut req: Request<Body>,
+    req: Request<Body>,
 ) -> Result<Response<Body>, ApiError> {
     let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
     check_permissions(&req, Scope::PageServerApi)?;
 
+    let mut req = match maybe_forward(req).await {
+        ForwardOutcome::Forwarded(res) => {
+            return res;
+        }
+        ForwardOutcome::NotForwarded(req) => req,
+    };
+
     let create_req = json_request::<TimelineCreateRequest>(&mut req).await?;
     json_response(
         StatusCode::CREATED,
@@ -277,9 +363,16 @@ async fn handle_tenant_timeline_delete(
     req: Request<Body>,
 ) -> Result<Response<Body>, ApiError> {
     let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
+    let timeline_id: TimelineId = parse_request_param(&req, "timeline_id")?;
+
     check_permissions(&req, Scope::PageServerApi)?;
 
-    let timeline_id: TimelineId = parse_request_param(&req, "timeline_id")?;
+    match maybe_forward(req).await {
+        ForwardOutcome::Forwarded(res) => {
+            return res;
+        }
+        ForwardOutcome::NotForwarded(_req) => {}
+    };
 
     // For timeline deletions, which both implement an "initially return 202, then 404 once
     // we're done" semantic, we wrap with a retry loop to expose a simpler API upstream.
@@ -337,12 +430,19 @@ async fn handle_tenant_timeline_delete(
 
 async fn handle_tenant_timeline_archival_config(
     service: Arc<Service>,
-    mut req: Request<Body>,
+    req: Request<Body>,
 ) -> Result<Response<Body>, ApiError> {
     let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
+    let timeline_id: TimelineId = parse_request_param(&req, "timeline_id")?;
+
     check_permissions(&req, Scope::PageServerApi)?;
 
-    let timeline_id: TimelineId = parse_request_param(&req, "timeline_id")?;
+    let mut req = match maybe_forward(req).await {
+        ForwardOutcome::Forwarded(res) => {
+            return res;
+        }
+        ForwardOutcome::NotForwarded(req) => req,
+    };
 
     let create_req = json_request::<TimelineArchivalConfigRequest>(&mut req).await?;
 
@@ -358,9 +458,16 @@ async fn handle_tenant_timeline_detach_ancestor(
     req: Request<Body>,
 ) -> Result<Response<Body>, ApiError> {
     let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
+    let timeline_id: TimelineId = parse_request_param(&req, "timeline_id")?;
+
     check_permissions(&req, Scope::PageServerApi)?;
 
-    let timeline_id: TimelineId = parse_request_param(&req, "timeline_id")?;
+    match maybe_forward(req).await {
+        ForwardOutcome::Forwarded(res) => {
+            return res;
+        }
+        ForwardOutcome::NotForwarded(_req) => {}
+    };
 
     let res = service
         .tenant_timeline_detach_ancestor(tenant_id, timeline_id)
@@ -393,6 +500,13 @@ async fn handle_tenant_timeline_passthrough(
     let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
     check_permissions(&req, Scope::PageServerApi)?;
 
+    let req = match maybe_forward(req).await {
+        ForwardOutcome::Forwarded(res) => {
+            return res;
+        }
+        ForwardOutcome::NotForwarded(req) => req,
+    };
+
     let Some(path) = req.uri().path_and_query() else {
         // This should never happen, our request router only calls us if there is a path
         return Err(ApiError::BadRequest(anyhow::anyhow!("Missing path")));
@@ -460,9 +574,17 @@ async fn handle_tenant_locate(
     service: Arc<Service>,
     req: Request<Body>,
 ) -> Result<Response<Body>, ApiError> {
+    let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
+
     check_permissions(&req, Scope::Admin)?;
 
-    let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
+    match maybe_forward(req).await {
+        ForwardOutcome::Forwarded(res) => {
+            return res;
+        }
+        ForwardOutcome::NotForwarded(_req) => {}
+    };
+
     json_response(StatusCode::OK, service.tenant_locate(tenant_id)?)
 }
 
@@ -473,6 +595,14 @@ async fn handle_tenant_describe(
     check_permissions(&req, Scope::Scrubber)?;
 
     let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
+
+    match maybe_forward(req).await {
+        ForwardOutcome::Forwarded(res) => {
+            return res;
+        }
+        ForwardOutcome::NotForwarded(_req) => {}
+    };
+
     json_response(StatusCode::OK, service.tenant_describe(tenant_id)?)
 }
 
@@ -482,12 +612,26 @@ async fn handle_tenant_list(
 ) -> Result<Response<Body>, ApiError> {
     check_permissions(&req, Scope::Admin)?;
 
+    match maybe_forward(req).await {
+        ForwardOutcome::Forwarded(res) => {
+            return res;
+        }
+        ForwardOutcome::NotForwarded(_req) => {}
+    };
+
     json_response(StatusCode::OK, service.tenant_list())
 }
 
-async fn handle_node_register(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
+async fn handle_node_register(req: Request<Body>) -> Result<Response<Body>, ApiError> {
     check_permissions(&req, Scope::Admin)?;
 
+    let mut req = match maybe_forward(req).await {
+        ForwardOutcome::Forwarded(res) => {
+            return res;
+        }
+        ForwardOutcome::NotForwarded(req) => req,
+    };
+
     let register_req = json_request::<NodeRegisterRequest>(&mut req).await?;
     let state = get_state(&req);
     state.service.node_register(register_req).await?;
@@ -497,6 +641,13 @@ async fn handle_node_register(mut req: Request<Body>) -> Result<Response<Body>,
 async fn handle_node_list(req: Request<Body>) -> Result<Response<Body>, ApiError> {
     check_permissions(&req, Scope::Admin)?;
 
+    let req = match maybe_forward(req).await {
+        ForwardOutcome::Forwarded(res) => {
+            return res;
+        }
+        ForwardOutcome::NotForwarded(req) => req,
+    };
+
     let state = get_state(&req);
     let nodes = state.service.node_list().await?;
     let api_nodes = nodes.into_iter().map(|n| n.describe()).collect::<Vec<_>>();
@@ -507,6 +658,13 @@ async fn handle_node_list(req: Request<Body>) -> Result<Response<Body>, ApiError
 async fn handle_node_drop(req: Request<Body>) -> Result<Response<Body>, ApiError> {
     check_permissions(&req, Scope::Admin)?;
 
+    let req = match maybe_forward(req).await {
+        ForwardOutcome::Forwarded(res) => {
+            return res;
+        }
+        ForwardOutcome::NotForwarded(req) => req,
+    };
+
     let state = get_state(&req);
     let node_id: NodeId = parse_request_param(&req, "node_id")?;
     json_response(StatusCode::OK, state.service.node_drop(node_id).await?)
@@ -515,14 +673,28 @@ async fn handle_node_drop(req: Request<Body>) -> Result<Response<Body>, ApiError
 async fn handle_node_delete(req: Request<Body>) -> Result<Response<Body>, ApiError> {
     check_permissions(&req, Scope::Admin)?;
 
+    let req = match maybe_forward(req).await {
+        ForwardOutcome::Forwarded(res) => {
+            return res;
+        }
+        ForwardOutcome::NotForwarded(req) => req,
+    };
+
     let state = get_state(&req);
     let node_id: NodeId = parse_request_param(&req, "node_id")?;
     json_response(StatusCode::OK, state.service.node_delete(node_id).await?)
 }
 
-async fn handle_node_configure(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
+async fn handle_node_configure(req: Request<Body>) -> Result<Response<Body>, ApiError> {
     check_permissions(&req, Scope::Admin)?;
 
+    let mut req = match maybe_forward(req).await {
+        ForwardOutcome::Forwarded(res) => {
+            return res;
+        }
+        ForwardOutcome::NotForwarded(req) => req,
+    };
+
     let node_id: NodeId = parse_request_param(&req, "node_id")?;
     let config_req = json_request::<NodeConfigureRequest>(&mut req).await?;
     if node_id != config_req.node_id {
@@ -548,6 +720,13 @@ async fn handle_node_configure(mut req: Request<Body>) -> Result<Response<Body>,
 async fn handle_node_status(req: Request<Body>) -> Result<Response<Body>, ApiError> {
     check_permissions(&req, Scope::Admin)?;
 
+    let req = match maybe_forward(req).await {
+        ForwardOutcome::Forwarded(res) => {
+            return res;
+        }
+        ForwardOutcome::NotForwarded(req) => req,
+    };
+
     let state = get_state(&req);
     let node_id: NodeId = parse_request_param(&req, "node_id")?;
 
@@ -570,6 +749,13 @@ async fn handle_node_shards(req: Request<Body>) -> Result<Response<Body>, ApiErr
 async fn handle_get_leader(req: Request<Body>) -> Result<Response<Body>, ApiError> {
     check_permissions(&req, Scope::Admin)?;
 
+    let req = match maybe_forward(req).await {
+        ForwardOutcome::Forwarded(res) => {
+            return res;
+        }
+        ForwardOutcome::NotForwarded(req) => req,
+    };
+
     let state = get_state(&req);
     let leader = state.service.get_leader().await.map_err(|err| {
         ApiError::InternalServerError(anyhow::anyhow!(
@@ -583,6 +769,13 @@ async fn handle_get_leader(req: Request<Body>) -> Result<Response<Body>, ApiErro
 async fn handle_node_drain(req: Request<Body>) -> Result<Response<Body>, ApiError> {
     check_permissions(&req, Scope::Admin)?;
 
+    let req = match maybe_forward(req).await {
+        ForwardOutcome::Forwarded(res) => {
+            return res;
+        }
+        ForwardOutcome::NotForwarded(req) => req,
+    };
+
     let state = get_state(&req);
     let node_id: NodeId = parse_request_param(&req, "node_id")?;
 
@@ -594,6 +787,13 @@ async fn handle_node_drain(req: Request<Body>) -> Result<Response<Body>, ApiErro
 async fn handle_cancel_node_drain(req: Request<Body>) -> Result<Response<Body>, ApiError> {
     check_permissions(&req, Scope::Admin)?;
 
+    let req = match maybe_forward(req).await {
+        ForwardOutcome::Forwarded(res) => {
+            return res;
+        }
+        ForwardOutcome::NotForwarded(req) => req,
+    };
+
     let state = get_state(&req);
     let node_id: NodeId = parse_request_param(&req, "node_id")?;
 
@@ -605,6 +805,13 @@ async fn handle_cancel_node_drain(req: Request<Body>) -> Result<Response<Body>,
 async fn handle_node_fill(req: Request<Body>) -> Result<Response<Body>, ApiError> {
     check_permissions(&req, Scope::Admin)?;
 
+    let req = match maybe_forward(req).await {
+        ForwardOutcome::Forwarded(res) => {
+            return res;
+        }
+        ForwardOutcome::NotForwarded(req) => req,
+    };
+
     let state = get_state(&req);
     let node_id: NodeId = parse_request_param(&req, "node_id")?;
 
@@ -616,6 +823,13 @@ async fn handle_node_fill(req: Request<Body>) -> Result<Response<Body>, ApiError
 async fn handle_cancel_node_fill(req: Request<Body>) -> Result<Response<Body>, ApiError> {
     check_permissions(&req, Scope::Admin)?;
 
+    let req = match maybe_forward(req).await {
+        ForwardOutcome::Forwarded(res) => {
+            return res;
+        }
+        ForwardOutcome::NotForwarded(req) => req,
+    };
+
     let state = get_state(&req);
     let node_id: NodeId = parse_request_param(&req, "node_id")?;
 
@@ -624,9 +838,16 @@ async fn handle_cancel_node_fill(req: Request<Body>) -> Result<Response<Body>, A
     json_response(StatusCode::ACCEPTED, ())
 }
 
-async fn handle_metadata_health_update(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
+async fn handle_metadata_health_update(req: Request<Body>) -> Result<Response<Body>, ApiError> {
     check_permissions(&req, Scope::Scrubber)?;
 
+    let mut req = match maybe_forward(req).await {
+        ForwardOutcome::Forwarded(res) => {
+            return res;
+        }
+        ForwardOutcome::NotForwarded(req) => req,
+    };
+
     let update_req = json_request::<MetadataHealthUpdateRequest>(&mut req).await?;
     let state = get_state(&req);
 
@@ -640,6 +861,13 @@ async fn handle_metadata_health_list_unhealthy(
 ) -> Result<Response<Body>, ApiError> {
     check_permissions(&req, Scope::Admin)?;
 
+    let req = match maybe_forward(req).await {
+        ForwardOutcome::Forwarded(res) => {
+            return res;
+        }
+        ForwardOutcome::NotForwarded(req) => req,
+    };
+
     let state = get_state(&req);
     let unhealthy_tenant_shards = state.service.metadata_health_list_unhealthy().await?;
 
@@ -652,10 +880,17 @@ async fn handle_metadata_health_list_unhealthy(
 }
 
 async fn handle_metadata_health_list_outdated(
-    mut req: Request<Body>,
+    req: Request<Body>,
 ) -> Result<Response<Body>, ApiError> {
     check_permissions(&req, Scope::Admin)?;
 
+    let mut req = match maybe_forward(req).await {
+        ForwardOutcome::Forwarded(res) => {
+            return res;
+        }
+        ForwardOutcome::NotForwarded(req) => req,
+    };
+
     let list_outdated_req = json_request::<MetadataHealthListOutdatedRequest>(&mut req).await?;
     let state = get_state(&req);
     let health_records = state
@@ -671,10 +906,17 @@ async fn handle_metadata_health_list_outdated(
 
 async fn handle_tenant_shard_split(
     service: Arc<Service>,
-    mut req: Request<Body>,
+    req: Request<Body>,
 ) -> Result<Response<Body>, ApiError> {
     check_permissions(&req, Scope::Admin)?;
 
+    let mut req = match maybe_forward(req).await {
+        ForwardOutcome::Forwarded(res) => {
+            return res;
+        }
+        ForwardOutcome::NotForwarded(req) => req,
+    };
+
     let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
     let split_req = json_request::<TenantShardSplitRequest>(&mut req).await?;
 
@@ -686,10 +928,17 @@ async fn handle_tenant_shard_split(
 
 async fn handle_tenant_shard_migrate(
     service: Arc<Service>,
-    mut req: Request<Body>,
+    req: Request<Body>,
 ) -> Result<Response<Body>, ApiError> {
     check_permissions(&req, Scope::Admin)?;
 
+    let mut req = match maybe_forward(req).await {
+        ForwardOutcome::Forwarded(res) => {
+            return res;
+        }
+        ForwardOutcome::NotForwarded(req) => req,
+    };
+
     let tenant_shard_id: TenantShardId = parse_request_param(&req, "tenant_shard_id")?;
     let migrate_req = json_request::<TenantShardMigrateRequest>(&mut req).await?;
     json_response(
@@ -700,9 +949,16 @@ async fn handle_tenant_shard_migrate(
     )
 }
 
-async fn handle_tenant_update_policy(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
+async fn handle_tenant_update_policy(req: Request<Body>) -> Result<Response<Body>, ApiError> {
     check_permissions(&req, Scope::Admin)?;
 
+    let mut req = match maybe_forward(req).await {
+        ForwardOutcome::Forwarded(res) => {
+            return res;
+        }
+        ForwardOutcome::NotForwarded(req) => req,
+    };
+
     let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
     let update_req = json_request::<TenantPolicyRequest>(&mut req).await?;
     let state = get_state(&req);
@@ -716,9 +972,16 @@ async fn handle_tenant_update_policy(mut req: Request<Body>) -> Result<Response<
     )
 }
 
-async fn handle_update_preferred_azs(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
+async fn handle_update_preferred_azs(req: Request<Body>) -> Result<Response<Body>, ApiError> {
     check_permissions(&req, Scope::Admin)?;
 
+    let mut req = match maybe_forward(req).await {
+        ForwardOutcome::Forwarded(res) => {
+            return res;
+        }
+        ForwardOutcome::NotForwarded(req) => req,
+    };
+
     let azs_req = json_request::<ShardsPreferredAzsRequest>(&mut req).await?;
     let state = get_state(&req);
 
@@ -731,23 +994,46 @@ async fn handle_update_preferred_azs(mut req: Request<Body>) -> Result<Response<
 async fn handle_step_down(req: Request<Body>) -> Result<Response<Body>, ApiError> {
     check_permissions(&req, Scope::Admin)?;
 
+    let req = match maybe_forward(req).await {
+        ForwardOutcome::Forwarded(res) => {
+            return res;
+        }
+        ForwardOutcome::NotForwarded(req) => req,
+    };
+
     let state = get_state(&req);
     json_response(StatusCode::OK, state.service.step_down().await)
 }
 
 async fn handle_tenant_drop(req: Request<Body>) -> Result<Response<Body>, ApiError> {
-    let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
     check_permissions(&req, Scope::PageServerApi)?;
 
+    let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
+
+    let req = match maybe_forward(req).await {
+        ForwardOutcome::Forwarded(res) => {
+            return res;
+        }
+        ForwardOutcome::NotForwarded(req) => req,
+    };
+
     let state = get_state(&req);
 
     json_response(StatusCode::OK, state.service.tenant_drop(tenant_id).await?)
 }
 
 async fn handle_tenant_import(req: Request<Body>) -> Result<Response<Body>, ApiError> {
-    let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
     check_permissions(&req, Scope::PageServerApi)?;
 
+    let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
+
+    let req = match maybe_forward(req).await {
+        ForwardOutcome::Forwarded(res) => {
+            return res;
+        }
+        ForwardOutcome::NotForwarded(req) => req,
+    };
+
     let state = get_state(&req);
 
     json_response(
@@ -759,6 +1045,13 @@ async fn handle_tenant_import(req: Request<Body>) -> Result<Response<Body>, ApiE
 async fn handle_tenants_dump(req: Request<Body>) -> Result<Response<Body>, ApiError> {
     check_permissions(&req, Scope::Admin)?;
 
+    let req = match maybe_forward(req).await {
+        ForwardOutcome::Forwarded(res) => {
+            return res;
+        }
+        ForwardOutcome::NotForwarded(req) => req,
+    };
+
     let state = get_state(&req);
     state.service.tenants_dump()
 }
@@ -766,6 +1059,13 @@ async fn handle_tenants_dump(req: Request<Body>) -> Result<Response<Body>, ApiEr
 async fn handle_scheduler_dump(req: Request<Body>) -> Result<Response<Body>, ApiError> {
     check_permissions(&req, Scope::Admin)?;
 
+    let req = match maybe_forward(req).await {
+        ForwardOutcome::Forwarded(res) => {
+            return res;
+        }
+        ForwardOutcome::NotForwarded(req) => req,
+    };
+
     let state = get_state(&req);
     state.service.scheduler_dump()
 }
@@ -773,6 +1073,13 @@ async fn handle_scheduler_dump(req: Request<Body>) -> Result<Response<Body>, Api
 async fn handle_consistency_check(req: Request<Body>) -> Result<Response<Body>, ApiError> {
     check_permissions(&req, Scope::Admin)?;
 
+    let req = match maybe_forward(req).await {
+        ForwardOutcome::Forwarded(res) => {
+            return res;
+        }
+        ForwardOutcome::NotForwarded(req) => req,
+    };
+
     let state = get_state(&req);
 
     json_response(StatusCode::OK, state.service.consistency_check().await?)
@@ -781,19 +1088,40 @@ async fn handle_consistency_check(req: Request<Body>) -> Result<Response<Body>,
 async fn handle_reconcile_all(req: Request<Body>) -> Result<Response<Body>, ApiError> {
     check_permissions(&req, Scope::Admin)?;
 
+    let req = match maybe_forward(req).await {
+        ForwardOutcome::Forwarded(res) => {
+            return res;
+        }
+        ForwardOutcome::NotForwarded(req) => req,
+    };
+
     let state = get_state(&req);
 
     json_response(StatusCode::OK, state.service.reconcile_all_now().await?)
 }
 
 /// Status endpoint is just used for checking that our HTTP listener is up
-async fn handle_status(_req: Request<Body>) -> Result<Response<Body>, ApiError> {
+async fn handle_status(req: Request<Body>) -> Result<Response<Body>, ApiError> {
+    match maybe_forward(req).await {
+        ForwardOutcome::Forwarded(res) => {
+            return res;
+        }
+        ForwardOutcome::NotForwarded(_req) => {}
+    };
+
     json_response(StatusCode::OK, ())
 }
 
 /// Readiness endpoint indicates when we're done doing startup I/O (e.g. reconciling
 /// with remote pageserver nodes).  This is intended for use as a kubernetes readiness probe.
 async fn handle_ready(req: Request<Body>) -> Result<Response<Body>, ApiError> {
+    let req = match maybe_forward(req).await {
+        ForwardOutcome::Forwarded(res) => {
+            return res;
+        }
+        ForwardOutcome::NotForwarded(req) => req,
+    };
+
     let state = get_state(&req);
     if state.service.startup_complete.is_ready() {
         json_response(StatusCode::OK, ())
@@ -816,6 +1144,13 @@ async fn handle_get_safekeeper(req: Request<Body>) -> Result<Response<Body>, Api
 
     let id = parse_request_param::<i64>(&req, "id")?;
 
+    let req = match maybe_forward(req).await {
+        ForwardOutcome::Forwarded(res) => {
+            return res;
+        }
+        ForwardOutcome::NotForwarded(req) => req,
+    };
+
     let state = get_state(&req);
 
     let res = state.service.get_safekeeper(id).await;
@@ -847,6 +1182,13 @@ async fn handle_upsert_safekeeper(mut req: Request<Body>) -> Result<Response<Bod
         )));
     }
 
+    let req = match maybe_forward(req).await {
+        ForwardOutcome::Forwarded(res) => {
+            return res;
+        }
+        ForwardOutcome::NotForwarded(req) => req,
+    };
+
     let state = get_state(&req);
 
     state.service.upsert_safekeeper(body).await?;
@@ -925,10 +1267,7 @@ pub fn prologue_leadership_status_check_middleware<
 
         let allowed_routes = match leadership_status {
             LeadershipStatus::Leader => AllowedRoutes::All,
-            LeadershipStatus::SteppedDown => {
-                // TODO: does it make sense to allow /status here?
-                AllowedRoutes::Some(["/control/v1/step_down", "/status", "/metrics"].to_vec())
-            }
+            LeadershipStatus::SteppedDown => AllowedRoutes::All,
             LeadershipStatus::Candidate => {
                 AllowedRoutes::Some(["/ready", "/status", "/metrics"].to_vec())
             }
@@ -1005,6 +1344,13 @@ fn epilogue_metrics_middleware<B: hyper::body::HttpBody + Send + Sync + 'static>
 pub async fn measured_metrics_handler(req: Request<Body>) -> Result<Response<Body>, ApiError> {
     pub const TEXT_FORMAT: &str = "text/plain; version=0.0.4";
 
+    let req = match maybe_forward(req).await {
+        ForwardOutcome::Forwarded(res) => {
+            return res;
+        }
+        ForwardOutcome::NotForwarded(req) => req,
+    };
+
     let state = get_state(&req);
     let payload = crate::metrics::METRICS_REGISTRY.encode(&state.neon_metrics);
     let response = Response::builder()
@@ -1032,6 +1378,220 @@ where
     request_span(request, handler).await
 }
 
+enum ForwardOutcome {
+    Forwarded(Result<Response<Body>, ApiError>),
+    NotForwarded(Request<Body>),
+}
+
+/// Potentially forward the request to the current storage controler leader.
+/// More specifically we forward when:
+/// 1. Request is not one of ["/control/v1/step_down", "/status", "/ready", "/metrics"]
+/// 2. Current instance is in [`LeadershipStatus::SteppedDown`] state
+/// 3. There is a leader in the database to forward to
+/// 4. Leader from step (3) is not the current instance
+///
+/// Why forward?
+/// It turns out that we can't rely on external orchestration to promptly route trafic to the
+/// new leader. This is downtime inducing. Forwarding provides a safe way out.
+///
+/// Why is it safe?
+/// If a storcon instance is persisted in the database, then we know that it is the current leader.
+/// There's one exception: time between handling step-down request and the new leader updating the
+/// database.
+///
+/// Let's treat the happy case first. The stepped down node does not produce any side effects,
+/// since all request handling happens on the leader.
+///
+/// As for the edge case, we are guaranteed to always have a maximum of two running instances.
+/// Hence, if we are in the edge case scenario the leader persisted in the database is the
+/// stepped down instance that received the request. Condition (4) above covers this scenario.
+async fn maybe_forward(req: Request<Body>) -> ForwardOutcome {
+    const NOT_FOR_FORWARD: [&str; 4] = ["/control/v1/step_down", "/status", "/ready", "/metrics"];
+
+    let uri = req.uri().to_string();
+    let uri_for_forward = !NOT_FOR_FORWARD.contains(&uri.as_str());
+
+    let state = get_state(&req);
+    let leadership_status = state.service.get_leadership_status();
+
+    if leadership_status != LeadershipStatus::SteppedDown || !uri_for_forward {
+        return ForwardOutcome::NotForwarded(req);
+    }
+
+    let leader = state.service.get_leader().await;
+    let leader = {
+        match leader {
+            Ok(Some(leader)) => leader,
+            Ok(None) => {
+                return ForwardOutcome::Forwarded(Err(ApiError::ResourceUnavailable(
+                    "No leader to forward to while in stepped down state".into(),
+                )));
+            }
+            Err(err) => {
+                return ForwardOutcome::Forwarded(Err(ApiError::InternalServerError(
+                    anyhow::anyhow!(
+                        "Failed to get leader for forwarding while in stepped down state: {err}"
+                    ),
+                )));
+            }
+        }
+    };
+
+    let cfg = state.service.get_config();
+    if let Some(ref self_addr) = cfg.address_for_peers {
+        let leader_addr = match Uri::from_str(leader.address.as_str()) {
+            Ok(uri) => uri,
+            Err(err) => {
+                return ForwardOutcome::Forwarded(Err(ApiError::InternalServerError(
+                    anyhow::anyhow!(
+                    "Failed to parse leader uri for forwarding while in stepped down state: {err}"
+                ),
+                )));
+            }
+        };
+
+        if *self_addr == leader_addr {
+            return ForwardOutcome::Forwarded(Err(ApiError::InternalServerError(anyhow::anyhow!(
+                "Leader is stepped down instance"
+            ))));
+        }
+    }
+
+    tracing::info!("Forwarding {} to leader at {}", uri, leader.address);
+
+    // Use [`RECONCILE_TIMEOUT`] as the max amount of time a request should block for and
+    // include some leeway to get the timeout for proxied requests.
+    const PROXIED_REQUEST_TIMEOUT: Duration = Duration::from_secs(RECONCILE_TIMEOUT.as_secs() + 10);
+    let client = reqwest::ClientBuilder::new()
+        .timeout(PROXIED_REQUEST_TIMEOUT)
+        .build();
+    let client = match client {
+        Ok(client) => client,
+        Err(err) => {
+            return ForwardOutcome::Forwarded(Err(ApiError::InternalServerError(anyhow::anyhow!(
+                "Failed to build leader client for forwarding while in stepped down state: {err}"
+            ))));
+        }
+    };
+
+    let request: reqwest::Request = match convert_request(req, &client, leader.address).await {
+        Ok(r) => r,
+        Err(err) => {
+            return ForwardOutcome::Forwarded(Err(ApiError::InternalServerError(anyhow::anyhow!(
+                "Failed to convert request for forwarding while in stepped down state: {err}"
+            ))));
+        }
+    };
+
+    let response = match client.execute(request).await {
+        Ok(r) => r,
+        Err(err) => {
+            return ForwardOutcome::Forwarded(Err(ApiError::InternalServerError(anyhow::anyhow!(
+                "Failed to forward while in stepped down state: {err}"
+            ))));
+        }
+    };
+
+    ForwardOutcome::Forwarded(convert_response(response).await)
+}
+
+/// Convert a [`reqwest::Response`] to a [hyper::Response`] by passing through
+/// a stable representation (string, bytes or integer)
+///
+/// Ideally, we would not have to do this since both types use the http crate
+/// under the hood. However, they use different versions of the crate and keeping
+/// second order dependencies in sync is difficult.
+async fn convert_response(resp: reqwest::Response) -> Result<hyper::Response<Body>, ApiError> {
+    use std::str::FromStr;
+
+    let mut builder = hyper::Response::builder().status(resp.status().as_u16());
+    for (key, value) in resp.headers().into_iter() {
+        let key = hyper::header::HeaderName::from_str(key.as_str()).map_err(|err| {
+            ApiError::InternalServerError(anyhow::anyhow!("Response conversion failed: {err}"))
+        })?;
+
+        let value = hyper::header::HeaderValue::from_bytes(value.as_bytes()).map_err(|err| {
+            ApiError::InternalServerError(anyhow::anyhow!("Response conversion failed: {err}"))
+        })?;
+
+        builder = builder.header(key, value);
+    }
+
+    let body = http::Body::wrap_stream(resp.bytes_stream());
+
+    builder.body(body).map_err(|err| {
+        ApiError::InternalServerError(anyhow::anyhow!("Response conversion failed: {err}"))
+    })
+}
+
+/// Convert a [`reqwest::Request`] to a [hyper::Request`] by passing through
+/// a stable representation (string, bytes or integer)
+///
+/// See [`convert_response`] for why we are doing it this way.
+async fn convert_request(
+    req: hyper::Request<Body>,
+    client: &reqwest::Client,
+    to_address: String,
+) -> Result<reqwest::Request, ApiError> {
+    use std::str::FromStr;
+
+    let (parts, body) = req.into_parts();
+    let method = reqwest::Method::from_str(parts.method.as_str()).map_err(|err| {
+        ApiError::InternalServerError(anyhow::anyhow!("Request conversion failed: {err}"))
+    })?;
+
+    let path_and_query = parts.uri.path_and_query().ok_or_else(|| {
+        ApiError::InternalServerError(anyhow::anyhow!(
+            "Request conversion failed: no path and query"
+        ))
+    })?;
+
+    let uri = reqwest::Url::from_str(
+        format!(
+            "{}{}",
+            to_address.trim_end_matches("/"),
+            path_and_query.as_str()
+        )
+        .as_str(),
+    )
+    .map_err(|err| {
+        ApiError::InternalServerError(anyhow::anyhow!("Request conversion failed: {err}"))
+    })?;
+
+    let mut headers = reqwest::header::HeaderMap::new();
+    for (key, value) in parts.headers.into_iter() {
+        let key = match key {
+            Some(k) => k,
+            None => {
+                continue;
+            }
+        };
+
+        let key = reqwest::header::HeaderName::from_str(key.as_str()).map_err(|err| {
+            ApiError::InternalServerError(anyhow::anyhow!("Request conversion failed: {err}"))
+        })?;
+
+        let value = reqwest::header::HeaderValue::from_bytes(value.as_bytes()).map_err(|err| {
+            ApiError::InternalServerError(anyhow::anyhow!("Request conversion failed: {err}"))
+        })?;
+
+        headers.insert(key, value);
+    }
+
+    let body = hyper::body::to_bytes(body).await.map_err(|err| {
+        ApiError::InternalServerError(anyhow::anyhow!("Request conversion failed: {err}"))
+    })?;
+
+    client
+        .request(method, uri)
+        .headers(headers)
+        .body(body)
+        .build()
+        .map_err(|err| {
+            ApiError::InternalServerError(anyhow::anyhow!("Request conversion failed: {err}"))
+        })
+}
+
 pub fn make_router(
     service: Arc<Service>,
     auth: Option<Arc<SwappableJwtAuth>>,
diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py
index 2e21f8fb46..2d72dbb2df 100644
--- a/test_runner/regress/test_storage_controller.py
+++ b/test_runner/regress/test_storage_controller.py
@@ -2048,8 +2048,11 @@ def test_storage_controller_step_down(neon_env_builder: NeonEnvBuilder):
     # Make a change to the tenant config to trigger a slow reconcile
     virtual_ps_http = PageserverHttpClient(env.storage_controller_port, lambda: True)
     virtual_ps_http.patch_tenant_config_client_side(tid, {"compaction_threshold": 5}, None)
-    env.storage_controller.allowed_errors.append(
-        ".*Accepted configuration update but reconciliation failed.*"
+    env.storage_controller.allowed_errors.extend(
+        [
+            ".*Accepted configuration update but reconciliation failed.*",
+            ".*Leader is stepped down instance",
+        ]
     )
 
     observed_state = env.storage_controller.step_down()
@@ -2072,9 +2075,9 @@ def test_storage_controller_step_down(neon_env_builder: NeonEnvBuilder):
     assert "compaction_threshold" in ps_tenant_conf.effective_config
     assert ps_tenant_conf.effective_config["compaction_threshold"] == 5
 
-    # Validate that the storcon is not replying to the usual requests
-    # once it has stepped down.
-    with pytest.raises(StorageControllerApiException, match="stepped_down"):
+    # Validate that the storcon attempts to forward the request, but stops.
+    # when it realises it is still the current leader.
+    with pytest.raises(StorageControllerApiException, match="Leader is stepped down instance"):
         env.storage_controller.tenant_list()
 
     # Validate that we can step down multiple times and the observed state
@@ -2221,6 +2224,15 @@ def test_storage_controller_leadership_transfer(
     env.storage_controller.wait_until_ready()
     env.storage_controller.consistency_check()
 
+    if not step_down_times_out:
+        # Check that the stepped down instance forwards requests
+        # to the new leader while it's still running.
+        storage_controller_proxy.route_to(f"http://127.0.0.1:{storage_controller_1_port}")
+        env.storage_controller.tenant_list()
+        env.storage_controller.node_configure(env.pageservers[0].id, {"scheduling": "Pause"})
+        status = env.storage_controller.node_status(env.pageservers[0].id)
+        assert status["scheduling"] == "Pause"
+
     if step_down_times_out:
         env.storage_controller.allowed_errors.extend(
             [

From cd4276fd656b9e917ee315240d957d344fc2cfe6 Mon Sep 17 00:00:00 2001
From: Alexander Bayandin <alexander@neon.tech>
Date: Tue, 17 Sep 2024 10:17:48 +0100
Subject: [PATCH 145/159] CI: fix release pipeline (#9017)

## Problem

We've got 2 non-blocking failures on the release pipeline:
- `promote-compatibility-data` job got skipped _presumably_ because one
of the dependencies of `deploy` job (`push-to-acr-dev`) got skipped
(https://github.com/neondatabase/neon/pull/8940)
- `coverage-report` job fails because we don't build debug artifacts in
the release branch (https://github.com/neondatabase/neon/pull/8561)

## Summary of changes
- Always run `push-to-acr-dev` / `push-to-acr-prod` jobs, but add
`skip_if` parameter to the reusable workflow, which can skip the job
internally, without skipping externally
- Do not run `coverage-report` on release branches
---
 .github/workflows/_push-to-acr.yml   | 8 +++++++-
 .github/workflows/build_and_test.yml | 7 ++++---
 2 files changed, 11 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/_push-to-acr.yml b/.github/workflows/_push-to-acr.yml
index 415b3d9cc6..7b6eba2c06 100644
--- a/.github/workflows/_push-to-acr.yml
+++ b/.github/workflows/_push-to-acr.yml
@@ -26,9 +26,15 @@ on:
         description: Azure tenant ID
         required: true
         type: string
+      skip_if:
+        description: Skip the job if this expression is true
+        required: true
+        type: boolean
 
 jobs:
   push-to-acr:
+    if: ${{ !inputs.skip_if }}
+
     runs-on: ubuntu-22.04
     permissions:
       contents: read  # This is required for actions/checkout
@@ -52,5 +58,5 @@ jobs:
           for image in ${images}; do
             docker buildx imagetools create \
               -t ${{ inputs.registry_name }}.azurecr.io/neondatabase/${image}:${{ inputs.image_tag }} \
-                                        neondatabase/${image}:${{ inputs.image_tag }}
+                                                        neondatabase/${image}:${{ inputs.image_tag }}
           done
diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml
index 7c06fd9ab8..7ddd624dd5 100644
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -357,6 +357,7 @@ jobs:
             })
 
   coverage-report:
+    if: ${{ !startsWith(github.ref_name, 'release') }}
     needs: [ check-permissions, build-build-tools-image, build-and-test-locally ]
     runs-on: [ self-hosted, small ]
     container:
@@ -858,7 +859,6 @@ jobs:
           done
 
   push-to-acr-dev:
-    if: github.ref_name == 'main'
     needs: [ tag, promote-images ]
     uses: ./.github/workflows/_push-to-acr.yml
     with:
@@ -868,9 +868,9 @@ jobs:
       registry_name: ${{ vars.AZURE_DEV_REGISTRY_NAME }}
       subscription_id: ${{ vars.AZURE_DEV_SUBSCRIPTION_ID }}
       tenant_id: ${{ vars.AZURE_TENANT_ID }}
+      skip_if: ${{ github.ref_name != 'main' }}
 
   push-to-acr-prod:
-    if: github.ref_name == 'release'|| github.ref_name == 'release-proxy'
     needs: [ tag, promote-images ]
     uses: ./.github/workflows/_push-to-acr.yml
     with:
@@ -880,6 +880,7 @@ jobs:
       registry_name: ${{ vars.AZURE_PROD_REGISTRY_NAME }}
       subscription_id: ${{ vars.AZURE_PROD_SUBSCRIPTION_ID }}
       tenant_id: ${{ vars.AZURE_TENANT_ID }}
+      skip_if: ${{ !startsWith(github.ref_name, 'release') }}
 
   trigger-custom-extensions-build-and-wait:
     needs: [ check-permissions, tag ]
@@ -957,7 +958,7 @@ jobs:
 
   deploy:
     needs: [ check-permissions, promote-images, tag, build-and-test-locally, trigger-custom-extensions-build-and-wait, push-to-acr-dev, push-to-acr-prod ]
-    if: (github.ref_name == 'main' || github.ref_name == 'release' || github.ref_name == 'release-proxy') && !failure() && !cancelled()
+    if: github.ref_name == 'main' || github.ref_name == 'release' || github.ref_name == 'release-proxy'
 
     runs-on: [ self-hosted, small ]
     container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:latest

From d211f00f054b80515684ff0887b76ecec60cc796 Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki@neon.tech>
Date: Tue, 17 Sep 2024 17:55:45 +0300
Subject: [PATCH 146/159] Remove unnecessary dependencies (#9000)

Found by "cargo machete"
---
 Cargo.lock                             | 144 ++-----------------------
 compute_tools/Cargo.toml               |   3 -
 control_plane/Cargo.toml               |   5 -
 control_plane/storcon_cli/Cargo.toml   |   3 -
 libs/compute_api/Cargo.toml            |   1 -
 libs/consumption_metrics/Cargo.toml    |   3 -
 libs/desim/Cargo.toml                  |   1 -
 libs/postgres_backend/Cargo.toml       |   2 -
 libs/postgres_ffi/Cargo.toml           |   3 -
 libs/postgres_ffi/wal_craft/Cargo.toml |   1 -
 libs/pq_proto/Cargo.toml               |   2 -
 libs/remote_storage/Cargo.toml         |   3 -
 libs/safekeeper_api/Cargo.toml         |   1 -
 libs/tracing-utils/Cargo.toml          |   5 +-
 libs/utils/Cargo.toml                  |   1 -
 libs/vm_monitor/Cargo.toml             |   2 -
 pageserver/Cargo.toml                  |   9 --
 pageserver/compaction/Cargo.toml       |  22 ----
 pageserver/ctl/Cargo.toml              |   2 -
 proxy/Cargo.toml                       |   7 --
 safekeeper/Cargo.toml                  |   5 -
 storage_broker/Cargo.toml              |   2 -
 storage_controller/Cargo.toml          |   2 -
 storage_controller/client/Cargo.toml   |  11 --
 storage_scrubber/Cargo.toml            |   8 --
 workspace_hack/Cargo.toml              |   3 +
 26 files changed, 15 insertions(+), 236 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index 3ca6acbc3e..136f07956f 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1209,7 +1209,6 @@ dependencies = [
  "remote_storage",
  "serde",
  "serde_json",
- "serde_with",
  "utils",
 ]
 
@@ -1218,7 +1217,6 @@ name = "compute_tools"
 version = "0.1.0"
 dependencies = [
  "anyhow",
- "async-compression",
  "bytes",
  "cfg-if",
  "chrono",
@@ -1237,7 +1235,6 @@ dependencies = [
  "reqwest 0.12.4",
  "rlimit",
  "rust-ini",
- "serde",
  "serde_json",
  "signal-hook",
  "tar",
@@ -1246,7 +1243,6 @@ dependencies = [
  "tokio-postgres",
  "tokio-stream",
  "tokio-util",
- "toml_edit",
  "tracing",
  "tracing-opentelemetry",
  "tracing-subscriber",
@@ -1317,12 +1313,9 @@ dependencies = [
 name = "consumption_metrics"
 version = "0.1.0"
 dependencies = [
- "anyhow",
  "chrono",
  "rand 0.8.5",
  "serde",
- "serde_with",
- "utils",
 ]
 
 [[package]]
@@ -1334,9 +1327,7 @@ dependencies = [
  "clap",
  "comfy-table",
  "compute_api",
- "futures",
  "git-version",
- "hex",
  "humantime",
  "humantime-serde",
  "hyper 0.14.26",
@@ -1344,7 +1335,6 @@ dependencies = [
  "once_cell",
  "pageserver_api",
  "pageserver_client",
- "postgres",
  "postgres_backend",
  "postgres_connection",
  "regex",
@@ -1353,9 +1343,7 @@ dependencies = [
  "scopeguard",
  "serde",
  "serde_json",
- "serde_with",
  "storage_broker",
- "tar",
  "thiserror",
  "tokio",
  "tokio-postgres",
@@ -1663,7 +1651,6 @@ dependencies = [
  "hex",
  "parking_lot 0.12.1",
  "rand 0.8.5",
- "scopeguard",
  "smallvec",
  "tracing",
  "utils",
@@ -2233,24 +2220,22 @@ checksum = "4271d37baee1b8c7e4b708028c57d816cf9d2434acb33a549475f78c181f6253"
 
 [[package]]
 name = "git-version"
-version = "0.3.5"
+version = "0.3.9"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f6b0decc02f4636b9ccad390dcbe77b722a77efedfa393caf8379a51d5c61899"
+checksum = "1ad568aa3db0fcbc81f2f116137f263d7304f512a1209b35b85150d3ef88ad19"
 dependencies = [
  "git-version-macro",
- "proc-macro-hack",
 ]
 
 [[package]]
 name = "git-version-macro"
-version = "0.3.5"
+version = "0.3.9"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "fe69f1cbdb6e28af2bac214e943b99ce8a0a06b447d15d3e61161b0423139f3f"
+checksum = "53010ccb100b96a67bc32c0175f0ed1426b31b655d562898e57325f81c023ac0"
 dependencies = [
- "proc-macro-hack",
  "proc-macro2",
  "quote",
- "syn 1.0.109",
+ "syn 2.0.52",
 ]
 
 [[package]]
@@ -2744,19 +2729,6 @@ dependencies = [
  "libc",
 ]
 
-[[package]]
-name = "inotify"
-version = "0.10.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "fdd168d97690d0b8c412d6b6c10360277f4d7ee495c5d0d5d5fe0854923255cc"
-dependencies = [
- "bitflags 1.3.2",
- "futures-core",
- "inotify-sys",
- "libc",
- "tokio",
-]
-
 [[package]]
 name = "inotify-sys"
 version = "0.1.5"
@@ -3251,7 +3223,7 @@ dependencies = [
  "crossbeam-channel",
  "filetime",
  "fsevent-sys",
- "inotify 0.9.6",
+ "inotify",
  "kqueue",
  "libc",
  "log",
@@ -3642,7 +3614,6 @@ name = "pagectl"
 version = "0.1.0"
 dependencies = [
  "anyhow",
- "bytes",
  "camino",
  "clap",
  "git-version",
@@ -3651,7 +3622,6 @@ dependencies = [
  "pageserver_api",
  "postgres_ffi",
  "remote_storage",
- "serde",
  "serde_json",
  "svg_fmt",
  "thiserror",
@@ -3670,7 +3640,6 @@ dependencies = [
  "arc-swap",
  "async-compression",
  "async-stream",
- "async-trait",
  "bit_field",
  "byteorder",
  "bytes",
@@ -3678,16 +3647,13 @@ dependencies = [
  "camino-tempfile",
  "chrono",
  "clap",
- "const_format",
  "consumption_metrics",
  "crc32c",
  "criterion",
- "crossbeam-utils",
  "either",
  "enum-map",
  "enumset",
  "fail",
- "flate2",
  "futures",
  "git-version",
  "hex",
@@ -3726,13 +3692,9 @@ dependencies = [
  "serde_json",
  "serde_path_to_error",
  "serde_with",
- "signal-hook",
- "smallvec",
  "storage_broker",
  "strum",
  "strum_macros",
- "svg_fmt",
- "sync_wrapper",
  "sysinfo",
  "tenant_size_model",
  "thiserror",
@@ -3746,7 +3708,6 @@ dependencies = [
  "tokio-util",
  "toml_edit",
  "tracing",
- "twox-hash",
  "url",
  "utils",
  "walkdir",
@@ -3810,44 +3771,22 @@ name = "pageserver_compaction"
 version = "0.1.0"
 dependencies = [
  "anyhow",
- "async-compression",
  "async-stream",
- "byteorder",
- "bytes",
- "chrono",
  "clap",
- "const_format",
- "consumption_metrics",
  "criterion",
- "crossbeam-utils",
- "either",
- "fail",
- "flate2",
  "futures",
  "git-version",
- "hex",
  "hex-literal",
- "humantime",
- "humantime-serde",
  "itertools 0.10.5",
- "metrics",
  "once_cell",
  "pageserver_api",
  "pin-project-lite",
  "rand 0.8.5",
- "smallvec",
  "svg_fmt",
- "sync_wrapper",
- "thiserror",
  "tokio",
- "tokio-io-timeout",
- "tokio-util",
  "tracing",
- "tracing-error",
  "tracing-subscriber",
- "url",
  "utils",
- "walkdir",
  "workspace_hack",
 ]
 
@@ -4164,9 +4103,7 @@ name = "postgres_backend"
 version = "0.1.0"
 dependencies = [
  "anyhow",
- "async-trait",
  "bytes",
- "futures",
  "once_cell",
  "pq_proto",
  "rustls 0.22.4",
@@ -4199,16 +4136,13 @@ version = "0.1.0"
 dependencies = [
  "anyhow",
  "bindgen",
- "byteorder",
  "bytes",
  "crc32c",
  "env_logger",
- "hex",
  "log",
  "memoffset 0.8.0",
  "once_cell",
  "postgres",
- "rand 0.8.5",
  "regex",
  "serde",
  "thiserror",
@@ -4243,13 +4177,11 @@ dependencies = [
  "byteorder",
  "bytes",
  "itertools 0.10.5",
- "pin-project-lite",
  "postgres-protocol",
  "rand 0.8.5",
  "serde",
  "thiserror",
  "tokio",
- "tracing",
 ]
 
 [[package]]
@@ -4281,12 +4213,6 @@ dependencies = [
  "elliptic-curve 0.13.8",
 ]
 
-[[package]]
-name = "proc-macro-hack"
-version = "0.5.20+deprecated"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "dc375e1527247fe1a97d8b7156678dfe7c1af2fc075c9a4db3690ecd2a148068"
-
 [[package]]
 name = "proc-macro2"
 version = "1.0.78"
@@ -4405,7 +4331,6 @@ dependencies = [
  "aws-config",
  "aws-sdk-iam",
  "aws-sigv4",
- "aws-types",
  "base64 0.13.1",
  "bstr",
  "bytes",
@@ -4414,7 +4339,6 @@ dependencies = [
  "chrono",
  "clap",
  "consumption_metrics",
- "crossbeam-deque",
  "dashmap",
  "ecdsa 0.16.9",
  "env_logger",
@@ -4440,11 +4364,9 @@ dependencies = [
  "jose-jwa",
  "jose-jwk",
  "lasso",
- "md5",
  "measured",
  "metrics",
  "once_cell",
- "opentelemetry",
  "p256 0.13.2",
  "parking_lot 0.12.1",
  "parquet",
@@ -4465,7 +4387,6 @@ dependencies = [
  "reqwest-middleware",
  "reqwest-retry",
  "reqwest-tracing",
- "routerify",
  "rsa",
  "rstest",
  "rustc-hash",
@@ -4481,7 +4402,6 @@ dependencies = [
  "smol_str",
  "socket2 0.5.5",
  "subtle",
- "task-local-extensions",
  "thiserror",
  "tikv-jemalloc-ctl",
  "tikv-jemallocator",
@@ -4491,7 +4411,6 @@ dependencies = [
  "tokio-rustls 0.25.0",
  "tokio-tungstenite",
  "tokio-util",
- "tower-service",
  "tracing",
  "tracing-opentelemetry",
  "tracing-subscriber",
@@ -4781,7 +4700,6 @@ dependencies = [
  "async-stream",
  "async-trait",
  "aws-config",
- "aws-credential-types",
  "aws-sdk-s3",
  "aws-smithy-async",
  "aws-smithy-types",
@@ -4795,7 +4713,6 @@ dependencies = [
  "futures",
  "futures-util",
  "http-types",
- "humantime",
  "humantime-serde",
  "hyper 0.14.26",
  "itertools 0.10.5",
@@ -5275,14 +5192,12 @@ version = "0.1.0"
 dependencies = [
  "anyhow",
  "async-stream",
- "async-trait",
  "byteorder",
  "bytes",
  "camino",
  "camino-tempfile",
  "chrono",
  "clap",
- "const_format",
  "crc32c",
  "desim",
  "fail",
@@ -5308,9 +5223,7 @@ dependencies = [
  "sd-notify",
  "serde",
  "serde_json",
- "serde_with",
  "sha2",
- "signal-hook",
  "storage_broker",
  "strum",
  "strum_macros",
@@ -5321,7 +5234,6 @@ dependencies = [
  "tokio-stream",
  "tokio-tar",
  "tokio-util",
- "toml_edit",
  "tracing",
  "tracing-subscriber",
  "url",
@@ -5336,7 +5248,6 @@ version = "0.1.0"
 dependencies = [
  "const_format",
  "serde",
- "serde_with",
  "utils",
 ]
 
@@ -5865,7 +5776,6 @@ version = "0.1.0"
 dependencies = [
  "anyhow",
  "async-stream",
- "bytes",
  "clap",
  "const_format",
  "futures",
@@ -5879,7 +5789,6 @@ dependencies = [
  "parking_lot 0.12.1",
  "prost",
  "tokio",
- "tokio-stream",
  "tonic",
  "tonic-build",
  "tracing",
@@ -5892,9 +5801,7 @@ name = "storage_controller"
 version = "0.1.0"
 dependencies = [
  "anyhow",
- "aws-config",
  "bytes",
- "camino",
  "chrono",
  "clap",
  "control_plane",
@@ -5935,20 +5842,9 @@ dependencies = [
 name = "storage_controller_client"
 version = "0.1.0"
 dependencies = [
- "anyhow",
- "bytes",
- "futures",
- "pageserver_api",
  "pageserver_client",
- "postgres",
  "reqwest 0.12.4",
  "serde",
- "thiserror",
- "tokio",
- "tokio-postgres",
- "tokio-stream",
- "tokio-util",
- "utils",
  "workspace_hack",
 ]
 
@@ -5960,13 +5856,9 @@ dependencies = [
  "async-stream",
  "aws-config",
  "aws-sdk-s3",
- "aws-smithy-async",
- "bincode",
- "bytes",
  "camino",
  "chrono",
  "clap",
- "crc32c",
  "either",
  "futures",
  "futures-util",
@@ -5978,20 +5870,16 @@ dependencies = [
  "pageserver",
  "pageserver_api",
  "postgres_ffi",
- "rand 0.8.5",
  "remote_storage",
  "reqwest 0.12.4",
  "rustls 0.22.4",
  "rustls-native-certs 0.7.0",
  "serde",
  "serde_json",
- "serde_with",
  "storage_controller_client",
- "thiserror",
  "tokio",
  "tokio-postgres",
  "tokio-postgres-rustls",
- "tokio-rustls 0.25.0",
  "tokio-stream",
  "tokio-util",
  "tracing",
@@ -6010,14 +5898,11 @@ dependencies = [
  "comfy-table",
  "futures",
  "humantime",
- "hyper 0.14.26",
  "pageserver_api",
  "pageserver_client",
  "reqwest 0.12.4",
- "serde",
  "serde_json",
  "storage_controller_client",
- "thiserror",
  "tokio",
  "tracing",
  "utils",
@@ -6140,15 +6025,6 @@ dependencies = [
  "xattr",
 ]
 
-[[package]]
-name = "task-local-extensions"
-version = "0.1.4"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ba323866e5d033818e3240feeb9f7db2c4296674e4d9e16b97b7bf8f490434e8"
-dependencies = [
- "pin-utils",
-]
-
 [[package]]
 name = "tempfile"
 version = "3.9.0"
@@ -6739,7 +6615,6 @@ dependencies = [
  "opentelemetry",
  "opentelemetry-otlp",
  "opentelemetry-semantic-conventions",
- "reqwest 0.12.4",
  "tokio",
  "tracing",
  "tracing-opentelemetry",
@@ -6943,7 +6818,6 @@ dependencies = [
  "serde_assert",
  "serde_json",
  "serde_path_to_error",
- "serde_with",
  "signal-hook",
  "strum",
  "strum_macros",
@@ -6999,13 +6873,11 @@ dependencies = [
  "cgroups-rs",
  "clap",
  "futures",
- "inotify 0.10.2",
  "serde",
  "serde_json",
  "sysinfo",
  "tokio",
  "tokio-postgres",
- "tokio-stream",
  "tokio-util",
  "tracing",
  "tracing-subscriber",
@@ -7032,7 +6904,6 @@ dependencies = [
  "clap",
  "env_logger",
  "log",
- "once_cell",
  "postgres",
  "postgres_ffi",
  "regex",
@@ -7555,6 +7426,7 @@ dependencies = [
  "digest",
  "either",
  "fail",
+ "futures",
  "futures-channel",
  "futures-executor",
  "futures-io",
@@ -7610,6 +7482,8 @@ dependencies = [
  "tower",
  "tracing",
  "tracing-core",
+ "tracing-log",
+ "tracing-subscriber",
  "url",
  "uuid",
  "zeroize",
diff --git a/compute_tools/Cargo.toml b/compute_tools/Cargo.toml
index 8af0ed43ce..00a82e4be6 100644
--- a/compute_tools/Cargo.toml
+++ b/compute_tools/Cargo.toml
@@ -11,7 +11,6 @@ testing = []
 
 [dependencies]
 anyhow.workspace = true
-async-compression.workspace = true
 chrono.workspace = true
 cfg-if.workspace = true
 clap.workspace = true
@@ -24,7 +23,6 @@ num_cpus.workspace = true
 opentelemetry.workspace = true
 postgres.workspace = true
 regex.workspace = true
-serde.workspace = true
 serde_json.workspace = true
 signal-hook.workspace = true
 tar.workspace = true
@@ -43,7 +41,6 @@ url.workspace = true
 compute_api.workspace = true
 utils.workspace = true
 workspace_hack.workspace = true
-toml_edit.workspace = true
 remote_storage = { version = "0.1", path = "../libs/remote_storage/" }
 vm_monitor = { version = "0.1", path = "../libs/vm_monitor/" }
 zstd = "0.13"
diff --git a/control_plane/Cargo.toml b/control_plane/Cargo.toml
index 6fca59b368..c185d20484 100644
--- a/control_plane/Cargo.toml
+++ b/control_plane/Cargo.toml
@@ -9,13 +9,10 @@ anyhow.workspace = true
 camino.workspace = true
 clap.workspace = true
 comfy-table.workspace = true
-futures.workspace = true
 git-version.workspace = true
 humantime.workspace = true
 nix.workspace = true
 once_cell.workspace = true
-postgres.workspace = true
-hex.workspace = true
 humantime-serde.workspace = true
 hyper.workspace = true
 regex.workspace = true
@@ -23,8 +20,6 @@ reqwest = { workspace = true, features = ["blocking", "json"] }
 scopeguard.workspace = true
 serde.workspace = true
 serde_json.workspace = true
-serde_with.workspace = true
-tar.workspace = true
 thiserror.workspace = true
 toml.workspace = true
 toml_edit.workspace = true
diff --git a/control_plane/storcon_cli/Cargo.toml b/control_plane/storcon_cli/Cargo.toml
index be69208d0d..ce89116691 100644
--- a/control_plane/storcon_cli/Cargo.toml
+++ b/control_plane/storcon_cli/Cargo.toml
@@ -11,14 +11,11 @@ clap.workspace = true
 comfy-table.workspace = true
 futures.workspace = true
 humantime.workspace = true
-hyper.workspace = true
 pageserver_api.workspace = true
 pageserver_client.workspace = true
 reqwest.workspace = true
-serde.workspace = true
 serde_json = { workspace = true, features = ["raw_value"] }
 storage_controller_client.workspace = true
-thiserror.workspace = true
 tokio.workspace = true
 tracing.workspace = true
 utils.workspace = true
diff --git a/libs/compute_api/Cargo.toml b/libs/compute_api/Cargo.toml
index 8aaa481f8c..c0ec40a6c2 100644
--- a/libs/compute_api/Cargo.toml
+++ b/libs/compute_api/Cargo.toml
@@ -8,7 +8,6 @@ license.workspace = true
 anyhow.workspace = true
 chrono.workspace = true
 serde.workspace = true
-serde_with.workspace = true
 serde_json.workspace = true
 regex.workspace = true
 
diff --git a/libs/consumption_metrics/Cargo.toml b/libs/consumption_metrics/Cargo.toml
index a40b74b952..0e517e3856 100644
--- a/libs/consumption_metrics/Cargo.toml
+++ b/libs/consumption_metrics/Cargo.toml
@@ -5,9 +5,6 @@ edition = "2021"
 license = "Apache-2.0"
 
 [dependencies]
-anyhow.workspace = true
 chrono = { workspace = true, features = ["serde"] }
 rand.workspace = true
 serde.workspace = true
-serde_with.workspace = true
-utils.workspace = true
diff --git a/libs/desim/Cargo.toml b/libs/desim/Cargo.toml
index 0c4be90267..473f3a2a13 100644
--- a/libs/desim/Cargo.toml
+++ b/libs/desim/Cargo.toml
@@ -12,5 +12,4 @@ bytes.workspace = true
 utils.workspace = true
 parking_lot.workspace = true
 hex.workspace = true
-scopeguard.workspace = true
 smallvec = { workspace = true, features = ["write"] }
diff --git a/libs/postgres_backend/Cargo.toml b/libs/postgres_backend/Cargo.toml
index f6854328fc..a0c87263ed 100644
--- a/libs/postgres_backend/Cargo.toml
+++ b/libs/postgres_backend/Cargo.toml
@@ -5,10 +5,8 @@ edition.workspace = true
 license.workspace = true
 
 [dependencies]
-async-trait.workspace = true
 anyhow.workspace = true
 bytes.workspace = true
-futures.workspace = true
 rustls.workspace = true
 serde.workspace = true
 thiserror.workspace = true
diff --git a/libs/postgres_ffi/Cargo.toml b/libs/postgres_ffi/Cargo.toml
index ee69878f69..ef17833a48 100644
--- a/libs/postgres_ffi/Cargo.toml
+++ b/libs/postgres_ffi/Cargo.toml
@@ -5,13 +5,10 @@ edition.workspace = true
 license.workspace = true
 
 [dependencies]
-rand.workspace = true
 regex.workspace = true
 bytes.workspace = true
-byteorder.workspace = true
 anyhow.workspace = true
 crc32c.workspace = true
-hex.workspace = true
 once_cell.workspace = true
 log.workspace = true
 memoffset.workspace = true
diff --git a/libs/postgres_ffi/wal_craft/Cargo.toml b/libs/postgres_ffi/wal_craft/Cargo.toml
index 29dd01a936..14c7d2e340 100644
--- a/libs/postgres_ffi/wal_craft/Cargo.toml
+++ b/libs/postgres_ffi/wal_craft/Cargo.toml
@@ -9,7 +9,6 @@ anyhow.workspace = true
 clap.workspace = true
 env_logger.workspace = true
 log.workspace = true
-once_cell.workspace = true
 postgres.workspace = true
 postgres_ffi.workspace = true
 camino-tempfile.workspace = true
diff --git a/libs/pq_proto/Cargo.toml b/libs/pq_proto/Cargo.toml
index 66bbe03ebc..9524a1490d 100644
--- a/libs/pq_proto/Cargo.toml
+++ b/libs/pq_proto/Cargo.toml
@@ -8,10 +8,8 @@ license.workspace = true
 bytes.workspace = true
 byteorder.workspace = true
 itertools.workspace = true
-pin-project-lite.workspace = true
 postgres-protocol.workspace = true
 rand.workspace = true
 tokio = { workspace = true, features = ["io-util"] }
-tracing.workspace = true
 thiserror.workspace = true
 serde.workspace = true
diff --git a/libs/remote_storage/Cargo.toml b/libs/remote_storage/Cargo.toml
index 02adee058f..f48f1801a4 100644
--- a/libs/remote_storage/Cargo.toml
+++ b/libs/remote_storage/Cargo.toml
@@ -13,14 +13,11 @@ aws-smithy-async.workspace = true
 aws-smithy-types.workspace = true
 aws-config.workspace = true
 aws-sdk-s3.workspace = true
-aws-credential-types.workspace = true
 bytes.workspace = true
 camino = { workspace = true, features = ["serde1"] }
-humantime.workspace = true
 humantime-serde.workspace = true
 hyper = { workspace = true, features = ["stream"] }
 futures.workspace = true
-rand.workspace = true
 serde.workspace = true
 serde_json.workspace = true
 tokio = { workspace = true, features = ["sync", "fs", "io-util"] }
diff --git a/libs/safekeeper_api/Cargo.toml b/libs/safekeeper_api/Cargo.toml
index e1f4bcca46..14811232d3 100644
--- a/libs/safekeeper_api/Cargo.toml
+++ b/libs/safekeeper_api/Cargo.toml
@@ -6,6 +6,5 @@ license.workspace = true
 
 [dependencies]
 serde.workspace = true
-serde_with.workspace = true
 const_format.workspace = true
 utils.workspace = true
diff --git a/libs/tracing-utils/Cargo.toml b/libs/tracing-utils/Cargo.toml
index 5ea8db6b42..05eb538d42 100644
--- a/libs/tracing-utils/Cargo.toml
+++ b/libs/tracing-utils/Cargo.toml
@@ -9,8 +9,9 @@ hyper.workspace = true
 opentelemetry = { workspace = true, features=["rt-tokio"] }
 opentelemetry-otlp = { workspace = true, default-features=false, features = ["http-proto", "trace", "http", "reqwest-client"] }
 opentelemetry-semantic-conventions.workspace = true
-reqwest = { workspace = true, default-features = false, features = ["rustls-tls"] }
 tokio = { workspace = true, features = ["rt", "rt-multi-thread"] }
 tracing.workspace = true
 tracing-opentelemetry.workspace = true
-tracing-subscriber.workspace = true
+
+[dev-dependencies]
+tracing-subscriber.workspace = true    # For examples in docs
diff --git a/libs/utils/Cargo.toml b/libs/utils/Cargo.toml
index 19deaab63f..f199b15554 100644
--- a/libs/utils/Cargo.toml
+++ b/libs/utils/Cargo.toml
@@ -42,7 +42,6 @@ tracing.workspace = true
 tracing-error.workspace = true
 tracing-subscriber = { workspace = true, features = ["json", "registry"] }
 rand.workspace = true
-serde_with.workspace = true
 strum.workspace = true
 strum_macros.workspace = true
 url.workspace = true
diff --git a/libs/vm_monitor/Cargo.toml b/libs/vm_monitor/Cargo.toml
index 46e9f880a1..ba73902d38 100644
--- a/libs/vm_monitor/Cargo.toml
+++ b/libs/vm_monitor/Cargo.toml
@@ -15,13 +15,11 @@ anyhow.workspace = true
 axum.workspace = true
 clap.workspace = true
 futures.workspace = true
-inotify.workspace = true
 serde.workspace = true
 serde_json.workspace = true
 sysinfo.workspace = true
 tokio = { workspace = true, features = ["rt-multi-thread"] }
 tokio-postgres.workspace = true
-tokio-stream.workspace = true
 tokio-util.workspace = true
 tracing.workspace = true
 tracing-subscriber.workspace = true
diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml
index 24373afca3..0eb48d6823 100644
--- a/pageserver/Cargo.toml
+++ b/pageserver/Cargo.toml
@@ -15,7 +15,6 @@ anyhow.workspace = true
 arc-swap.workspace = true
 async-compression.workspace = true
 async-stream.workspace = true
-async-trait.workspace = true
 bit_field.workspace = true
 byteorder.workspace = true
 bytes.workspace = true
@@ -23,12 +22,9 @@ camino.workspace = true
 camino-tempfile.workspace = true
 chrono = { workspace = true, features = ["serde"] }
 clap = { workspace = true, features = ["string"] }
-const_format.workspace = true
 consumption_metrics.workspace = true
 crc32c.workspace = true
-crossbeam-utils.workspace = true
 either.workspace = true
-flate2.workspace = true
 fail.workspace = true
 futures.workspace = true
 git-version.workspace = true
@@ -57,10 +53,6 @@ serde.workspace = true
 serde_json = { workspace = true, features = ["raw_value"] }
 serde_path_to_error.workspace = true
 serde_with.workspace = true
-signal-hook.workspace = true
-smallvec = { workspace = true, features = ["write"] }
-svg_fmt.workspace = true
-sync_wrapper.workspace = true
 sysinfo.workspace = true
 tokio-tar.workspace = true
 thiserror.workspace = true
@@ -73,7 +65,6 @@ tokio-stream.workspace = true
 tokio-util.workspace = true
 toml_edit = { workspace = true, features = [ "serde" ] }
 tracing.workspace = true
-twox-hash.workspace = true
 url.workspace = true
 walkdir.workspace = true
 metrics.workspace = true
diff --git a/pageserver/compaction/Cargo.toml b/pageserver/compaction/Cargo.toml
index 0fd1d81845..52b58fc298 100644
--- a/pageserver/compaction/Cargo.toml
+++ b/pageserver/compaction/Cargo.toml
@@ -9,41 +9,19 @@ default = []
 
 [dependencies]
 anyhow.workspace = true
-async-compression.workspace = true
 async-stream.workspace = true
-byteorder.workspace = true
-bytes.workspace = true
-chrono = { workspace = true, features = ["serde"] }
 clap = { workspace = true, features = ["string"] }
-const_format.workspace = true
-consumption_metrics.workspace = true
-crossbeam-utils.workspace = true
-either.workspace = true
-flate2.workspace = true
-fail.workspace = true
 futures.workspace = true
 git-version.workspace = true
-hex.workspace = true
-humantime.workspace = true
-humantime-serde.workspace = true
 itertools.workspace = true
 once_cell.workspace = true
 pageserver_api.workspace = true
 pin-project-lite.workspace = true
 rand.workspace = true
-smallvec = { workspace = true, features = ["write"] }
 svg_fmt.workspace = true
-sync_wrapper.workspace = true
-thiserror.workspace = true
 tokio = { workspace = true, features = ["process", "sync", "fs", "rt", "io-util", "time"] }
-tokio-io-timeout.workspace = true
-tokio-util.workspace = true
 tracing.workspace = true
-tracing-error.workspace = true
 tracing-subscriber.workspace = true
-url.workspace = true
-walkdir.workspace = true
-metrics.workspace = true
 utils.workspace = true
 workspace_hack.workspace = true
 
diff --git a/pageserver/ctl/Cargo.toml b/pageserver/ctl/Cargo.toml
index be5626040b..9592002de1 100644
--- a/pageserver/ctl/Cargo.toml
+++ b/pageserver/ctl/Cargo.toml
@@ -8,7 +8,6 @@ license.workspace = true
 
 [dependencies]
 anyhow.workspace = true
-bytes.workspace = true
 camino.workspace = true
 clap = { workspace = true, features = ["string"] }
 git-version.workspace = true
@@ -24,5 +23,4 @@ toml_edit.workspace = true
 utils.workspace = true
 svg_fmt.workspace = true
 workspace_hack.workspace = true
-serde.workspace = true
 serde_json.workspace = true
diff --git a/proxy/Cargo.toml b/proxy/Cargo.toml
index 21d92abb20..6703eb06eb 100644
--- a/proxy/Cargo.toml
+++ b/proxy/Cargo.toml
@@ -18,7 +18,6 @@ atomic-take.workspace = true
 aws-config.workspace = true
 aws-sdk-iam.workspace = true
 aws-sigv4.workspace = true
-aws-types.workspace = true
 base64.workspace = true
 bstr.workspace = true
 bytes = { workspace = true, features = ["serde"] }
@@ -26,7 +25,6 @@ camino.workspace = true
 chrono.workspace = true
 clap.workspace = true
 consumption_metrics.workspace = true
-crossbeam-deque.workspace = true
 dashmap.workspace = true
 env_logger.workspace = true
 framed-websockets.workspace = true
@@ -48,11 +46,9 @@ indexmap.workspace = true
 ipnet.workspace = true
 itertools.workspace = true
 lasso = { workspace = true, features = ["multi-threaded"] }
-md5.workspace = true
 measured = { workspace = true, features = ["lasso"] }
 metrics.workspace = true
 once_cell.workspace = true
-opentelemetry.workspace = true
 parking_lot.workspace = true
 parquet.workspace = true
 parquet_derive.workspace = true
@@ -67,7 +63,6 @@ reqwest.workspace = true
 reqwest-middleware = { workspace = true, features = ["json"] }
 reqwest-retry.workspace = true
 reqwest-tracing.workspace = true
-routerify.workspace = true
 rustc-hash.workspace = true
 rustls-pemfile.workspace = true
 rustls.workspace = true
@@ -79,7 +74,6 @@ smol_str.workspace = true
 smallvec.workspace = true
 socket2.workspace = true
 subtle.workspace = true
-task-local-extensions.workspace = true
 thiserror.workspace = true
 tikv-jemallocator.workspace = true
 tikv-jemalloc-ctl = { workspace = true, features = ["use_std"] }
@@ -88,7 +82,6 @@ tokio-postgres-rustls.workspace = true
 tokio-rustls.workspace = true
 tokio-util.workspace = true
 tokio = { workspace = true, features = ["signal"] }
-tower-service.workspace = true
 tracing-opentelemetry.workspace = true
 tracing-subscriber.workspace = true
 tracing-utils.workspace = true
diff --git a/safekeeper/Cargo.toml b/safekeeper/Cargo.toml
index 0fdb3147bf..daf21c70b0 100644
--- a/safekeeper/Cargo.toml
+++ b/safekeeper/Cargo.toml
@@ -13,14 +13,12 @@ testing = ["fail/failpoints"]
 [dependencies]
 async-stream.workspace = true
 anyhow.workspace = true
-async-trait.workspace = true
 byteorder.workspace = true
 bytes.workspace = true
 camino.workspace = true
 camino-tempfile.workspace = true
 chrono.workspace = true
 clap = { workspace = true, features = ["derive"] }
-const_format.workspace = true
 crc32c.workspace = true
 fail.workspace = true
 git-version.workspace = true
@@ -38,8 +36,6 @@ scopeguard.workspace = true
 reqwest = { workspace = true, features = ["json"] }
 serde.workspace = true
 serde_json.workspace = true
-serde_with.workspace = true
-signal-hook.workspace = true
 strum.workspace = true
 strum_macros.workspace = true
 thiserror.workspace = true
@@ -48,7 +44,6 @@ tokio-util = { workspace = true }
 tokio-io-timeout.workspace = true
 tokio-postgres.workspace = true
 tokio-tar.workspace = true
-toml_edit.workspace = true
 tracing.workspace = true
 url.workspace = true
 metrics.workspace = true
diff --git a/storage_broker/Cargo.toml b/storage_broker/Cargo.toml
index ac4b00669e..82ec0aa272 100644
--- a/storage_broker/Cargo.toml
+++ b/storage_broker/Cargo.toml
@@ -10,7 +10,6 @@ bench = []
 [dependencies]
 anyhow.workspace = true
 async-stream.workspace = true
-bytes.workspace = true
 clap = { workspace = true, features = ["derive"] }
 const_format.workspace = true
 futures.workspace = true
@@ -24,7 +23,6 @@ parking_lot.workspace = true
 prost.workspace = true
 tonic.workspace = true
 tokio = { workspace = true, features = ["rt-multi-thread"] }
-tokio-stream.workspace = true
 tracing.workspace = true
 metrics.workspace = true
 utils.workspace = true
diff --git a/storage_controller/Cargo.toml b/storage_controller/Cargo.toml
index ecaac04915..a96d64e096 100644
--- a/storage_controller/Cargo.toml
+++ b/storage_controller/Cargo.toml
@@ -15,9 +15,7 @@ testing = []
 
 [dependencies]
 anyhow.workspace = true
-aws-config.workspace = true
 bytes.workspace = true
-camino.workspace = true
 chrono.workspace = true
 clap.workspace = true
 fail.workspace = true
diff --git a/storage_controller/client/Cargo.toml b/storage_controller/client/Cargo.toml
index e7a4264fd0..9fa89176af 100644
--- a/storage_controller/client/Cargo.toml
+++ b/storage_controller/client/Cargo.toml
@@ -5,18 +5,7 @@ edition.workspace = true
 license.workspace = true
 
 [dependencies]
-pageserver_api.workspace = true
 pageserver_client.workspace = true
-thiserror.workspace = true
 reqwest.workspace = true
-utils.workspace = true
 serde.workspace = true
 workspace_hack = { version = "0.1", path = "../../workspace_hack" }
-tokio-postgres.workspace = true
-tokio-stream.workspace = true
-tokio.workspace = true
-futures.workspace = true
-tokio-util.workspace = true
-anyhow.workspace = true
-postgres.workspace = true
-bytes.workspace = true
diff --git a/storage_scrubber/Cargo.toml b/storage_scrubber/Cargo.toml
index d19119990b..f9987662b9 100644
--- a/storage_scrubber/Cargo.toml
+++ b/storage_scrubber/Cargo.toml
@@ -6,21 +6,13 @@ license.workspace = true
 
 [dependencies]
 aws-sdk-s3.workspace = true
-aws-smithy-async.workspace = true
 either.workspace = true
-tokio-rustls.workspace = true
 anyhow.workspace = true
 git-version.workspace = true
 hex.workspace = true
 humantime.workspace = true
-thiserror.workspace = true
-rand.workspace = true
-bytes.workspace = true
-bincode.workspace = true
-crc32c.workspace = true
 serde.workspace = true
 serde_json.workspace = true
-serde_with.workspace = true
 workspace_hack.workspace = true
 utils.workspace = true
 async-stream.workspace = true
diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml
index 140c43639e..662916d42c 100644
--- a/workspace_hack/Cargo.toml
+++ b/workspace_hack/Cargo.toml
@@ -38,6 +38,7 @@ deranged = { version = "0.3", default-features = false, features = ["powerfmt",
 digest = { version = "0.10", features = ["mac", "oid", "std"] }
 either = { version = "1" }
 fail = { version = "0.5", default-features = false, features = ["failpoints"] }
+futures = { version = "0.3" }
 futures-channel = { version = "0.3", features = ["sink"] }
 futures-executor = { version = "0.3" }
 futures-io = { version = "0.3" }
@@ -88,6 +89,8 @@ tonic = { version = "0.9", features = ["tls-roots"] }
 tower = { version = "0.4", default-features = false, features = ["balance", "buffer", "limit", "log", "timeout", "util"] }
 tracing = { version = "0.1", features = ["log"] }
 tracing-core = { version = "0.1" }
+tracing-log = { version = "0.1", default-features = false, features = ["log-tracer", "std"] }
+tracing-subscriber = { version = "0.3", default-features = false, features = ["env-filter", "fmt", "json", "smallvec", "tracing-log"] }
 url = { version = "2", features = ["serde"] }
 uuid = { version = "1", features = ["serde", "v4", "v7"] }
 zeroize = { version = "1", features = ["derive", "serde"] }

From 6138eb50e9fc5fb6bc7af57cb95cbad8aaedc186 Mon Sep 17 00:00:00 2001
From: Tristan Partin <tristan@neon.tech>
Date: Mon, 16 Sep 2024 17:53:57 -0500
Subject: [PATCH 147/159] Fix test code related to migrations

We added another migration in 5876c441abc973acca60882192ad46333c075abd,
but didn't bump this value. This had no effect, but best to fix it
anyway.

Signed-off-by: Tristan Partin <tristan@neon.tech>
---
 test_runner/fixtures/neon_fixtures.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index 93b93ff019..69284bfdfc 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -4094,7 +4094,7 @@ class Endpoint(PgProtocol, LogUtils):
             json.dump(dict(data_dict, **kwargs), file, indent=4)
 
     # Please note: Migrations only run if pg_skip_catalog_updates is false
-    def wait_for_migrations(self, num_migrations: int = 10):
+    def wait_for_migrations(self, num_migrations: int = 11):
         with self.cursor() as cur:
 
             def check_migrations_done():

From a1b71b73fe8f1193ef841604af962053c0bf1061 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Arpad=20M=C3=BCller?= <arpad-m@users.noreply.github.com>
Date: Tue, 17 Sep 2024 19:15:01 +0200
Subject: [PATCH 148/159] Rename some S3 usages to "remote storage" in exposed
 messages (#8999)

In exposed messages like log messages we mentioned "S3", which is not
entirely accurate as we support Azure blob storage now as well.
---
 pageserver/src/consumption_metrics.rs                    | 2 +-
 pageserver/src/metrics.rs                                | 2 +-
 test_runner/fixtures/pageserver/many_tenants.py          | 2 +-
 test_runner/regress/test_pageserver_metric_collection.py | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/pageserver/src/consumption_metrics.rs b/pageserver/src/consumption_metrics.rs
index 64a267e0e4..0c7630edca 100644
--- a/pageserver/src/consumption_metrics.rs
+++ b/pageserver/src/consumption_metrics.rs
@@ -178,7 +178,7 @@ async fn collect_metrics(
                 )
                 .await;
                 if let Err(e) = res {
-                    tracing::error!("failed to upload to S3: {e:#}");
+                    tracing::error!("failed to upload to remote storage: {e:#}");
                 }
             }
         };
diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs
index 9197505876..72229d80be 100644
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -1777,7 +1777,7 @@ pub(crate) static SECONDARY_MODE: Lazy<SecondaryModeMetrics> = Lazy::new(|| {
     .expect("failed to define a metric"),
     upload_heatmap_duration: register_histogram!(
         "pageserver_secondary_upload_heatmap_duration",
-        "Time to build and upload a heatmap, including any waiting inside the S3 client"
+        "Time to build and upload a heatmap, including any waiting inside the remote storage client"
     )
     .expect("failed to define a metric"),
     download_heatmap: register_int_counter!(
diff --git a/test_runner/fixtures/pageserver/many_tenants.py b/test_runner/fixtures/pageserver/many_tenants.py
index 3e0ffabf74..97e63ed4ba 100644
--- a/test_runner/fixtures/pageserver/many_tenants.py
+++ b/test_runner/fixtures/pageserver/many_tenants.py
@@ -39,7 +39,7 @@ def single_timeline(
     log.info("detach template tenant form pageserver")
     env.pageserver.tenant_detach(template_tenant)
 
-    log.info(f"duplicating template tenant {ncopies} times in S3")
+    log.info(f"duplicating template tenant {ncopies} times in remote storage")
     tenants = fixtures.pageserver.remote_storage.duplicate_tenant(env, template_tenant, ncopies)
 
     # In theory we could just attach all the tenants, force on-demand downloads via mgmt API, and be done.
diff --git a/test_runner/regress/test_pageserver_metric_collection.py b/test_runner/regress/test_pageserver_metric_collection.py
index 24a37b04ec..37ab51f9fb 100644
--- a/test_runner/regress/test_pageserver_metric_collection.py
+++ b/test_runner/regress/test_pageserver_metric_collection.py
@@ -74,7 +74,7 @@ def test_metric_collection(
     env.pageserver.allowed_errors.extend(
         [
             ".*metrics endpoint refused the sent metrics*",
-            ".*metrics_collection: failed to upload to S3: Failed to upload data of length .* to storage path.*",
+            ".*metrics_collection: failed to upload to remote storage: Failed to upload data of length .* to storage path.*",
         ]
     )
 

From d78f5ce6da2866cb982203d1bab33c49a990cd5f Mon Sep 17 00:00:00 2001
From: Alexander Bayandin <alexander@neon.tech>
Date: Tue, 17 Sep 2024 18:40:05 +0100
Subject: [PATCH 149/159] CI: don't fetch the whole git history if it's not
 required (#9021)

## Problem
We do use `actions/checkout` with `fetch-depth: 0` when it's not
required

## Summary of changes
- Remove unneeded `fetch-depth: 0`
- Add a comment if `fetch-depth: 0` is required
---
 .github/workflows/build_and_test.yml    | 31 +++++++------------------
 .github/workflows/trigger-e2e-tests.yml |  4 ++--
 2 files changed, 11 insertions(+), 24 deletions(-)

diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml
index 7ddd624dd5..d46b8dc1f5 100644
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -54,8 +54,8 @@ jobs:
       build-tag: ${{steps.build-tag.outputs.tag}}
 
     steps:
-      - name: Checkout
-        uses: actions/checkout@v4
+      # Need `fetch-depth: 0` to count the number of commits in the branch
+      - uses: actions/checkout@v4
         with:
           fetch-depth: 0
 
@@ -374,8 +374,8 @@ jobs:
         coverage-html: ${{ steps.upload-coverage-report-new.outputs.report-url }}
         coverage-json: ${{ steps.upload-coverage-report-new.outputs.summary-json }}
     steps:
-      - name: Checkout
-        uses: actions/checkout@v4
+      # Need `fetch-depth: 0` for differential coverage (to get diff between two commits)
+      - uses: actions/checkout@v4
         with:
           submodules: true
           fetch-depth: 0
@@ -476,11 +476,9 @@ jobs:
     runs-on: ${{ fromJson(format('["self-hosted", "{0}"]', matrix.arch == 'arm64' && 'large-arm64' || 'large')) }}
 
     steps:
-      - name: Checkout
-        uses: actions/checkout@v4
+      - uses: actions/checkout@v4
         with:
           submodules: true
-          fetch-depth: 0
 
       - uses: ./.github/actions/set-docker-config-dir
       - uses: docker/setup-buildx-action@v3
@@ -555,11 +553,9 @@ jobs:
     runs-on: ${{ fromJson(format('["self-hosted", "{0}"]', matrix.arch == 'arm64' && 'large-arm64' || 'large')) }}
 
     steps:
-      - name: Checkout
-        uses: actions/checkout@v4
+      - uses: actions/checkout@v4
         with:
           submodules: true
-          fetch-depth: 0
 
       - uses: ./.github/actions/set-docker-config-dir
       - uses: docker/setup-buildx-action@v3
@@ -706,10 +702,7 @@ jobs:
       VM_BUILDER_VERSION: v0.29.3
 
     steps:
-      - name: Checkout
-        uses: actions/checkout@v4
-        with:
-          fetch-depth: 0
+      - uses: actions/checkout@v4
 
       - name: Downloading vm-builder
         run: |
@@ -749,10 +742,7 @@ jobs:
     runs-on: ${{ fromJson(format('["self-hosted", "{0}"]', matrix.arch == 'arm64' && 'small-arm64' || 'small')) }}
 
     steps:
-      - name: Checkout
-        uses: actions/checkout@v4
-        with:
-          fetch-depth: 0
+      - uses: actions/checkout@v4
 
       - uses: ./.github/actions/set-docker-config-dir
       - uses: docker/login-action@v3
@@ -977,10 +967,7 @@ jobs:
             git config --global --add safe.directory "${GITHUB_WORKSPACE}/vendor/postgres-v$r"
           done
 
-      - name: Checkout
-        uses: actions/checkout@v4
-        with:
-          fetch-depth: 0
+      - uses: actions/checkout@v4
 
       - name: Trigger deploy workflow
         env:
diff --git a/.github/workflows/trigger-e2e-tests.yml b/.github/workflows/trigger-e2e-tests.yml
index 6fbe785c56..b299cf9b99 100644
--- a/.github/workflows/trigger-e2e-tests.yml
+++ b/.github/workflows/trigger-e2e-tests.yml
@@ -34,8 +34,8 @@ jobs:
       build-tag: ${{ steps.build-tag.outputs.tag }}
 
     steps:
-      - name: Checkout
-        uses: actions/checkout@v4
+      # Need `fetch-depth: 0` to count the number of commits in the branch
+      - uses: actions/checkout@v4
         with:
           fetch-depth: 0
 

From 3cd2a3f9317f54e75c00a813680060dcaa612a36 Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Tue, 17 Sep 2024 20:16:33 +0100
Subject: [PATCH 150/159] refactor(walredo): process launch & kill-on-error
 machinery (#8951)

Immediate benefit: easier to spot what's going on.

Later benefit: use the extracted method in PR

- https://github.com/neondatabase/neon/pull/8952

which adds a `ping` command to walredo.

Found this useful during investigation
https://github.com/neondatabase/cloud/issues/16886.
---
 pageserver/src/walredo.rs | 255 +++++++++++++++++++++-----------------
 1 file changed, 140 insertions(+), 115 deletions(-)

diff --git a/pageserver/src/walredo.rs b/pageserver/src/walredo.rs
index a36955fa21..0fe7def8b0 100644
--- a/pageserver/src/walredo.rs
+++ b/pageserver/src/walredo.rs
@@ -35,6 +35,7 @@ use anyhow::Context;
 use bytes::{Bytes, BytesMut};
 use pageserver_api::models::{WalRedoManagerProcessStatus, WalRedoManagerStatus};
 use pageserver_api::shard::TenantShardId;
+use std::future::Future;
 use std::sync::Arc;
 use std::time::Duration;
 use std::time::Instant;
@@ -296,6 +297,97 @@ impl PostgresRedoManager {
         }
     }
 
+    async fn do_with_walredo_process<
+        F: FnOnce(Arc<Process>) -> Fut,
+        Fut: Future<Output = Result<O, Error>>,
+        O,
+    >(
+        &self,
+        pg_version: u32,
+        closure: F,
+    ) -> Result<O, Error> {
+        let proc: Arc<Process> = match self.redo_process.get_or_init_detached().await {
+            Ok(guard) => match &*guard {
+                ProcessOnceCell::Spawned(proc) => Arc::clone(proc),
+                ProcessOnceCell::ManagerShutDown => {
+                    return Err(Error::Cancelled);
+                }
+            },
+            Err(permit) => {
+                let start = Instant::now();
+                // acquire guard before spawning process, so that we don't spawn new processes
+                // if the gate is already closed.
+                let _launched_processes_guard = match self.launched_processes.enter() {
+                    Ok(guard) => guard,
+                    Err(GateError::GateClosed) => unreachable!(
+                        "shutdown sets the once cell to `ManagerShutDown` state before closing the gate"
+                    ),
+                };
+                let proc = Arc::new(Process {
+                    process: process::WalRedoProcess::launch(
+                        self.conf,
+                        self.tenant_shard_id,
+                        pg_version,
+                    )
+                    .context("launch walredo process")?,
+                    _launched_processes_guard,
+                });
+                let duration = start.elapsed();
+                WAL_REDO_PROCESS_LAUNCH_DURATION_HISTOGRAM.observe(duration.as_secs_f64());
+                info!(
+                    elapsed_ms = duration.as_millis(),
+                    pid = proc.id(),
+                    "launched walredo process"
+                );
+                self.redo_process
+                    .set(ProcessOnceCell::Spawned(Arc::clone(&proc)), permit);
+                proc
+            }
+        };
+
+        // async closures are unstable, would support &Process
+        let result = closure(proc.clone()).await;
+
+        if result.is_err() {
+            // Avoid concurrent callers hitting the same issue by taking `proc` out of the rotation.
+            // Note that there may be other tasks concurrent with us that also hold `proc`.
+            // We have to deal with that here.
+            // Also read the doc comment on field `self.redo_process`.
+            //
+            // NB: there may still be other concurrent threads using `proc`.
+            // The last one will send SIGKILL when the underlying Arc reaches refcount 0.
+            //
+            // NB: the drop impl blocks the dropping thread with a wait() system call for
+            // the child process. In some ways the blocking is actually good: if we
+            // deferred the waiting into the background / to tokio if we used `tokio::process`,
+            // it could happen that if walredo always fails immediately, we spawn processes faster
+            // than we can SIGKILL & `wait` for them to exit. By doing it the way we do here,
+            // we limit this risk of run-away to at most $num_runtimes * $num_executor_threads.
+            // This probably needs revisiting at some later point.
+            match self.redo_process.get() {
+                None => (),
+                Some(guard) => {
+                    match &*guard {
+                        ProcessOnceCell::ManagerShutDown => {}
+                        ProcessOnceCell::Spawned(guard_proc) => {
+                            if Arc::ptr_eq(&proc, guard_proc) {
+                                // We're the first to observe an error from `proc`, it's our job to take it out of rotation.
+                                guard.take_and_deinit();
+                            } else {
+                                // Another task already spawned another redo process (further up in this method)
+                                // and put it into `redo_process`. Do nothing, our view of the world is behind.
+                            }
+                        }
+                    }
+                }
+            }
+            // The last task that does this `drop()` of `proc` will do a blocking `wait()` syscall.
+            drop(proc);
+        }
+
+        result
+    }
+
     ///
     /// Process one request for WAL redo using wal-redo postgres
     ///
@@ -319,130 +411,63 @@ impl PostgresRedoManager {
         const MAX_RETRY_ATTEMPTS: u32 = 1;
         let mut n_attempts = 0u32;
         loop {
-            let proc: Arc<Process> = match self.redo_process.get_or_init_detached().await {
-                Ok(guard) => match &*guard {
-                    ProcessOnceCell::Spawned(proc) => Arc::clone(proc),
-                    ProcessOnceCell::ManagerShutDown => {
-                        return Err(Error::Cancelled);
-                    }
-                },
-                Err(permit) => {
-                    let start = Instant::now();
-                    // acquire guard before spawning process, so that we don't spawn new processes
-                    // if the gate is already closed.
-                    let _launched_processes_guard = match self.launched_processes.enter() {
-                                Ok(guard) => guard,
-                                Err(GateError::GateClosed) => unreachable!(
-                                    "shutdown sets the once cell to `ManagerShutDown` state before closing the gate"
-                                ),
-                            };
-                    let proc = Arc::new(Process {
-                        process: process::WalRedoProcess::launch(
-                            self.conf,
-                            self.tenant_shard_id,
-                            pg_version,
-                        )
-                        .context("launch walredo process")?,
-                        _launched_processes_guard,
-                    });
-                    let duration = start.elapsed();
-                    WAL_REDO_PROCESS_LAUNCH_DURATION_HISTOGRAM.observe(duration.as_secs_f64());
-                    info!(
-                        duration_ms = duration.as_millis(),
-                        pid = proc.id(),
-                        "launched walredo process"
-                    );
-                    self.redo_process
-                        .set(ProcessOnceCell::Spawned(Arc::clone(&proc)), permit);
-                    proc
-                }
-            };
+            let base_img = &base_img;
+            let closure = |proc: Arc<Process>| async move {
+                let started_at = std::time::Instant::now();
 
-            let started_at = std::time::Instant::now();
+                // Relational WAL records are applied using wal-redo-postgres
+                let result = proc
+                    .apply_wal_records(rel, blknum, base_img, records, wal_redo_timeout)
+                    .await
+                    .context("apply_wal_records");
 
-            // Relational WAL records are applied using wal-redo-postgres
-            let result = proc
-                .apply_wal_records(rel, blknum, &base_img, records, wal_redo_timeout)
-                .await
-                .context("apply_wal_records");
+                let duration = started_at.elapsed();
 
-            let duration = started_at.elapsed();
-
-            let len = records.len();
-            let nbytes = records.iter().fold(0, |acumulator, record| {
-                acumulator
-                    + match &record.1 {
-                        NeonWalRecord::Postgres { rec, .. } => rec.len(),
-                        _ => unreachable!("Only PostgreSQL records are accepted in this batch"),
-                    }
-            });
-
-            WAL_REDO_TIME.observe(duration.as_secs_f64());
-            WAL_REDO_RECORDS_HISTOGRAM.observe(len as f64);
-            WAL_REDO_BYTES_HISTOGRAM.observe(nbytes as f64);
-
-            debug!(
-                "postgres applied {} WAL records ({} bytes) in {} us to reconstruct page image at LSN {}",
-                len,
-                nbytes,
-                duration.as_micros(),
-                lsn
-            );
-
-            // If something went wrong, don't try to reuse the process. Kill it, and
-            // next request will launch a new one.
-            if let Err(e) = result.as_ref() {
-                error!(
-                    "error applying {} WAL records {}..{} ({} bytes) to key {key}, from base image with LSN {} to reconstruct page image at LSN {} n_attempts={}: {:?}",
-                    records.len(),
-                    records.first().map(|p| p.0).unwrap_or(Lsn(0)),
-                    records.last().map(|p| p.0).unwrap_or(Lsn(0)),
-                    nbytes,
-                    base_img_lsn,
-                    lsn,
-                    n_attempts,
-                    e,
-                );
-                // Avoid concurrent callers hitting the same issue by taking `proc` out of the rotation.
-                // Note that there may be other tasks concurrent with us that also hold `proc`.
-                // We have to deal with that here.
-                // Also read the doc comment on field `self.redo_process`.
-                //
-                // NB: there may still be other concurrent threads using `proc`.
-                // The last one will send SIGKILL when the underlying Arc reaches refcount 0.
-                //
-                // NB: the drop impl blocks the dropping thread with a wait() system call for
-                // the child process. In some ways the blocking is actually good: if we
-                // deferred the waiting into the background / to tokio if we used `tokio::process`,
-                // it could happen that if walredo always fails immediately, we spawn processes faster
-                // than we can SIGKILL & `wait` for them to exit. By doing it the way we do here,
-                // we limit this risk of run-away to at most $num_runtimes * $num_executor_threads.
-                // This probably needs revisiting at some later point.
-                match self.redo_process.get() {
-                    None => (),
-                    Some(guard) => {
-                        match &*guard {
-                            ProcessOnceCell::ManagerShutDown => {}
-                            ProcessOnceCell::Spawned(guard_proc) => {
-                                if Arc::ptr_eq(&proc, guard_proc) {
-                                    // We're the first to observe an error from `proc`, it's our job to take it out of rotation.
-                                    guard.take_and_deinit();
-                                } else {
-                                    // Another task already spawned another redo process (further up in this method)
-                                    // and put it into `redo_process`. Do nothing, our view of the world is behind.
-                                }
-                            }
+                let len = records.len();
+                let nbytes = records.iter().fold(0, |acumulator, record| {
+                    acumulator
+                        + match &record.1 {
+                            NeonWalRecord::Postgres { rec, .. } => rec.len(),
+                            _ => unreachable!("Only PostgreSQL records are accepted in this batch"),
                         }
-                    }
+                });
+
+                WAL_REDO_TIME.observe(duration.as_secs_f64());
+                WAL_REDO_RECORDS_HISTOGRAM.observe(len as f64);
+                WAL_REDO_BYTES_HISTOGRAM.observe(nbytes as f64);
+
+                debug!(
+                    "postgres applied {} WAL records ({} bytes) in {} us to reconstruct page image at LSN {}",
+                    len,
+                    nbytes,
+                    duration.as_micros(),
+                    lsn
+                );
+
+                if let Err(e) = result.as_ref() {
+                    error!(
+                        "error applying {} WAL records {}..{} ({} bytes) to key {key}, from base image with LSN {} to reconstruct page image at LSN {} n_attempts={}: {:?}",
+                        records.len(),
+                        records.first().map(|p| p.0).unwrap_or(Lsn(0)),
+                        records.last().map(|p| p.0).unwrap_or(Lsn(0)),
+                        nbytes,
+                        base_img_lsn,
+                        lsn,
+                        n_attempts,
+                        e,
+                    );
                 }
-                // The last task that does this `drop()` of `proc` will do a blocking `wait()` syscall.
-                drop(proc);
-            } else if n_attempts != 0 {
+
+                result.map_err(Error::Other)
+            };
+            let result = self.do_with_walredo_process(pg_version, closure).await;
+
+            if result.is_ok() && n_attempts != 0 {
                 info!(n_attempts, "retried walredo succeeded");
             }
             n_attempts += 1;
             if n_attempts > MAX_RETRY_ATTEMPTS || result.is_ok() {
-                return result.map_err(Error::Other);
+                return result;
             }
         }
     }

From 135e7e4306978d79a133086abb81707d8d9fb8a9 Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Wed, 18 Sep 2024 09:10:27 +0200
Subject: [PATCH 151/159] add `neon_local` subcommand for the broker & use that
 from regression tests (#8948)

There's currently no way to just start/stop broker from `neon_local`.

This PR
* adds a sub-command
* uses that sub-command from the test suite instead of the pre-existing
Python `subprocess` based approach.

Found this useful during investigation
https://github.com/neondatabase/cloud/issues/16886.
---
 control_plane/src/bin/neon_local.rs           | 40 ++++++++++
 test_runner/fixtures/broker.py                | 63 ---------------
 test_runner/fixtures/neon_fixtures.py         | 76 +++++++++++++------
 test_runner/regress/test_neon_cli.py          |  2 +
 .../regress/test_pageserver_generations.py    |  2 +-
 .../regress/test_storage_controller.py        |  6 +-
 test_runner/regress/test_wal_acceptor.py      |  9 +--
 7 files changed, 99 insertions(+), 99 deletions(-)
 delete mode 100644 test_runner/fixtures/broker.py

diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs
index 144cd647c9..1c94c42865 100644
--- a/control_plane/src/bin/neon_local.rs
+++ b/control_plane/src/bin/neon_local.rs
@@ -106,6 +106,7 @@ fn main() -> Result<()> {
             "stop" => rt.block_on(handle_stop_all(sub_args, &env)),
             "pageserver" => rt.block_on(handle_pageserver(sub_args, &env)),
             "storage_controller" => rt.block_on(handle_storage_controller(sub_args, &env)),
+            "storage_broker" => rt.block_on(handle_storage_broker(sub_args, &env)),
             "safekeeper" => rt.block_on(handle_safekeeper(sub_args, &env)),
             "endpoint" => rt.block_on(handle_endpoint(sub_args, &env)),
             "mappings" => handle_mappings(sub_args, &mut env),
@@ -1245,6 +1246,32 @@ async fn handle_safekeeper(sub_match: &ArgMatches, env: &local_env::LocalEnv) ->
     Ok(())
 }
 
+async fn handle_storage_broker(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
+    let (sub_name, sub_args) = match sub_match.subcommand() {
+        Some(broker_command_data) => broker_command_data,
+        None => bail!("no broker subcommand provided"),
+    };
+
+    match sub_name {
+        "start" => {
+            if let Err(e) = broker::start_broker_process(env, get_start_timeout(sub_args)).await {
+                eprintln!("broker start failed: {e}");
+                exit(1);
+            }
+        }
+
+        "stop" => {
+            if let Err(e) = broker::stop_broker_process(env) {
+                eprintln!("broker stop failed: {e}");
+                exit(1);
+            }
+        }
+
+        _ => bail!("Unexpected broker subcommand '{}'", sub_name),
+    }
+    Ok(())
+}
+
 async fn handle_start_all(
     env: &local_env::LocalEnv,
     retry_timeout: &Duration,
@@ -1672,6 +1699,19 @@ fn cli() -> Command {
                             .arg(stop_mode_arg.clone())
                             .arg(instance_id))
         )
+        .subcommand(
+            Command::new("storage_broker")
+                .arg_required_else_help(true)
+                .about("Manage broker")
+                .subcommand(Command::new("start")
+                            .about("Start broker")
+                            .arg(timeout_arg.clone())
+                )
+                .subcommand(Command::new("stop")
+                            .about("Stop broker")
+                            .arg(stop_mode_arg.clone())
+                )
+        )
         .subcommand(
             Command::new("safekeeper")
                 .arg_required_else_help(true)
diff --git a/test_runner/fixtures/broker.py b/test_runner/fixtures/broker.py
deleted file mode 100644
index 8aca90a097..0000000000
--- a/test_runner/fixtures/broker.py
+++ /dev/null
@@ -1,63 +0,0 @@
-import subprocess
-import time
-from dataclasses import dataclass
-from pathlib import Path
-from typing import Any, Optional
-
-from fixtures.log_helper import log
-
-
-@dataclass
-class NeonBroker:
-    """An object managing storage_broker instance"""
-
-    logfile: Path
-    port: int
-    neon_binpath: Path
-    handle: Optional[subprocess.Popen[Any]] = None  # handle of running daemon
-
-    def listen_addr(self):
-        return f"127.0.0.1:{self.port}"
-
-    def client_url(self):
-        return f"http://{self.listen_addr()}"
-
-    def check_status(self):
-        return True  # TODO
-
-    def try_start(self):
-        if self.handle is not None:
-            log.debug(f"storage_broker is already running on port {self.port}")
-            return
-
-        listen_addr = self.listen_addr()
-        log.info(f'starting storage_broker to listen incoming connections at "{listen_addr}"')
-        with open(self.logfile, "wb") as logfile:
-            args = [
-                str(self.neon_binpath / "storage_broker"),
-                f"--listen-addr={listen_addr}",
-            ]
-            self.handle = subprocess.Popen(args, stdout=logfile, stderr=logfile)
-
-        # wait for start
-        started_at = time.time()
-        while True:
-            try:
-                self.check_status()
-            except Exception as e:
-                elapsed = time.time() - started_at
-                if elapsed > 5:
-                    raise RuntimeError(
-                        f"timed out waiting {elapsed:.0f}s for storage_broker start: {e}"
-                    ) from e
-                time.sleep(0.5)
-            else:
-                break  # success
-
-    def stop(self, immediate: bool = False):
-        if self.handle is not None:
-            if immediate:
-                self.handle.kill()
-            else:
-                self.handle.terminate()
-            self.handle.wait()
diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index 69284bfdfc..cbbb162cc6 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -60,7 +60,6 @@ from urllib3.util.retry import Retry
 
 from fixtures import overlayfs
 from fixtures.auth_tokens import AuthKeys, TokenScope
-from fixtures.broker import NeonBroker
 from fixtures.common_types import Lsn, NodeId, TenantId, TenantShardId, TimelineId
 from fixtures.endpoint.http import EndpointHttpClient
 from fixtures.log_helper import log
@@ -230,21 +229,6 @@ def port_distributor(worker_base_port: int, worker_port_num: int) -> PortDistrib
     return PortDistributor(base_port=worker_base_port, port_number=worker_port_num)
 
 
-@pytest.fixture(scope="function")
-def default_broker(
-    port_distributor: PortDistributor,
-    test_output_dir: Path,
-    neon_binpath: Path,
-) -> Iterator[NeonBroker]:
-    # multiple pytest sessions could get launched in parallel, get them different ports/datadirs
-    client_port = port_distributor.get_port()
-    broker_logfile = test_output_dir / "repo" / "storage_broker.log"
-
-    broker = NeonBroker(logfile=broker_logfile, port=client_port, neon_binpath=neon_binpath)
-    yield broker
-    broker.stop()
-
-
 @pytest.fixture(scope="session")
 def run_id() -> Iterator[uuid.UUID]:
     yield uuid.uuid4()
@@ -387,7 +371,6 @@ class NeonEnvBuilder:
         self,
         repo_dir: Path,
         port_distributor: PortDistributor,
-        broker: NeonBroker,
         run_id: uuid.UUID,
         mock_s3_server: MockS3Server,
         neon_binpath: Path,
@@ -428,7 +411,6 @@ class NeonEnvBuilder:
         # Safekeepers remote storage
         self.safekeepers_remote_storage: Optional[RemoteStorage] = None
 
-        self.broker = broker
         self.run_id = run_id
         self.mock_s3_server: MockS3Server = mock_s3_server
         self.pageserver_config_override = pageserver_config_override
@@ -940,6 +922,8 @@ class NeonEnvBuilder:
 
             self.env.storage_controller.assert_no_errors()
 
+            self.env.broker.assert_no_errors()
+
         try:
             self.overlay_cleanup_teardown()
         except Exception as e:
@@ -993,7 +977,7 @@ class NeonEnv:
         self.endpoints = EndpointFactory(self)
         self.safekeepers: List[Safekeeper] = []
         self.pageservers: List[NeonPageserver] = []
-        self.broker = config.broker
+        self.broker = NeonBroker(self)
         self.pageserver_remote_storage = config.pageserver_remote_storage
         self.safekeepers_remote_storage = config.safekeepers_remote_storage
         self.pg_version = config.pg_version
@@ -1168,7 +1152,7 @@ class NeonEnv:
             max_workers=2 + len(self.pageservers) + len(self.safekeepers)
         ) as executor:
             futs.append(
-                executor.submit(lambda: self.broker.try_start() or None)
+                executor.submit(lambda: self.broker.start() or None)
             )  # The `or None` is for the linter
 
             for pageserver in self.pageservers:
@@ -1225,7 +1209,7 @@ class NeonEnv:
                 pageserver.stop(immediate=immediate)
             except RuntimeError:
                 stop_later.append(pageserver)
-        self.broker.stop(immediate=immediate)
+        self.broker.stop()
 
         # TODO: for nice logging we need python 3.11 ExceptionGroup
         for ps in stop_later:
@@ -1339,7 +1323,6 @@ def neon_simple_env(
     pytestconfig: Config,
     port_distributor: PortDistributor,
     mock_s3_server: MockS3Server,
-    default_broker: NeonBroker,
     run_id: uuid.UUID,
     top_output_dir: Path,
     test_output_dir: Path,
@@ -1364,7 +1347,6 @@ def neon_simple_env(
         top_output_dir=top_output_dir,
         repo_dir=repo_dir,
         port_distributor=port_distributor,
-        broker=default_broker,
         mock_s3_server=mock_s3_server,
         neon_binpath=neon_binpath,
         pg_distrib_dir=pg_distrib_dir,
@@ -1392,7 +1374,6 @@ def neon_env_builder(
     neon_binpath: Path,
     pg_distrib_dir: Path,
     pg_version: PgVersion,
-    default_broker: NeonBroker,
     run_id: uuid.UUID,
     request: FixtureRequest,
     test_overlay_dir: Path,
@@ -1428,7 +1409,6 @@ def neon_env_builder(
         neon_binpath=neon_binpath,
         pg_distrib_dir=pg_distrib_dir,
         pg_version=pg_version,
-        broker=default_broker,
         run_id=run_id,
         preserve_database_files=cast(bool, pytestconfig.getoption("--preserve-database-files")),
         pageserver_virtual_file_io_engine=pageserver_virtual_file_io_engine,
@@ -1850,6 +1830,18 @@ class NeonCli(AbstractNeonCli):
             args.extend(["-m", "immediate"])
         return self.raw_cli(args)
 
+    def broker_start(
+        self, timeout_in_seconds: Optional[int] = None
+    ) -> "subprocess.CompletedProcess[str]":
+        cmd = ["storage_broker", "start"]
+        if timeout_in_seconds is not None:
+            cmd.append(f"--start-timeout={timeout_in_seconds}s")
+        return self.raw_cli(cmd)
+
+    def broker_stop(self) -> "subprocess.CompletedProcess[str]":
+        cmd = ["storage_broker", "stop"]
+        return self.raw_cli(cmd)
+
     def endpoint_create(
         self,
         branch_name: str,
@@ -4573,6 +4565,40 @@ class Safekeeper(LogUtils):
         wait_until(20, 0.5, paused)
 
 
+class NeonBroker(LogUtils):
+    """An object managing storage_broker instance"""
+
+    def __init__(self, env: NeonEnv):
+        super().__init__(logfile=env.repo_dir / "storage_broker.log")
+        self.env = env
+        self.port: int = self.env.port_distributor.get_port()
+        self.running = False
+
+    def start(
+        self,
+        timeout_in_seconds: Optional[int] = None,
+    ):
+        assert not self.running
+        self.env.neon_cli.broker_start(timeout_in_seconds)
+        self.running = True
+        return self
+
+    def stop(self):
+        if self.running:
+            self.env.neon_cli.broker_stop()
+            self.running = False
+        return self
+
+    def listen_addr(self):
+        return f"127.0.0.1:{self.port}"
+
+    def client_url(self):
+        return f"http://{self.listen_addr()}"
+
+    def assert_no_errors(self):
+        assert_no_errors(self.logfile, "storage_controller", [])
+
+
 # TODO: Replace with `StrEnum` when we upgrade to python 3.11
 class NodeKind(str, Enum):
     PAGESERVER = "pageserver"
diff --git a/test_runner/regress/test_neon_cli.py b/test_runner/regress/test_neon_cli.py
index ba170cfb4c..b65430ff49 100644
--- a/test_runner/regress/test_neon_cli.py
+++ b/test_runner/regress/test_neon_cli.py
@@ -134,6 +134,7 @@ def test_cli_start_stop(neon_env_builder: NeonEnvBuilder):
     env.neon_cli.pageserver_stop(env.pageserver.id)
     env.neon_cli.safekeeper_stop()
     env.neon_cli.storage_controller_stop(False)
+    env.neon_cli.broker_stop()
 
     # Keep NeonEnv state up to date, it usually owns starting/stopping services
     env.pageserver.running = False
@@ -176,6 +177,7 @@ def test_cli_start_stop_multi(neon_env_builder: NeonEnvBuilder):
 
     # Stop this to get out of the way of the following `start`
     env.neon_cli.storage_controller_stop(False)
+    env.neon_cli.broker_stop()
 
     # Default start
     res = env.neon_cli.raw_cli(["start"])
diff --git a/test_runner/regress/test_pageserver_generations.py b/test_runner/regress/test_pageserver_generations.py
index ebf58d2bd1..c923713432 100644
--- a/test_runner/regress/test_pageserver_generations.py
+++ b/test_runner/regress/test_pageserver_generations.py
@@ -134,7 +134,7 @@ def test_generations_upgrade(neon_env_builder: NeonEnvBuilder):
     )
 
     env = neon_env_builder.init_configs()
-    env.broker.try_start()
+    env.broker.start()
     for sk in env.safekeepers:
         sk.start()
     env.storage_controller.start()
diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py
index 2d72dbb2df..dc90a6e9a0 100644
--- a/test_runner/regress/test_storage_controller.py
+++ b/test_runner/regress/test_storage_controller.py
@@ -69,7 +69,7 @@ def test_storage_controller_smoke(
     env = neon_env_builder.init_configs()
 
     # Start services by hand so that we can skip a pageserver (this will start + register later)
-    env.broker.try_start()
+    env.broker.start()
     env.storage_controller.start()
     env.pageservers[0].start()
     env.pageservers[1].start()
@@ -292,7 +292,7 @@ def test_storage_controller_onboarding(neon_env_builder: NeonEnvBuilder, warm_up
 
     # Start services by hand so that we can skip registration on one of the pageservers
     env = neon_env_builder.init_configs()
-    env.broker.try_start()
+    env.broker.start()
     env.storage_controller.start()
 
     # This is the pageserver where we'll initially create the tenant.  Run it in emergency
@@ -2126,7 +2126,7 @@ def start_env(env: NeonEnv, storage_controller_port: int):
         max_workers=2 + len(env.pageservers) + len(env.safekeepers)
     ) as executor:
         futs.append(
-            executor.submit(lambda: env.broker.try_start() or None)
+            executor.submit(lambda: env.broker.start() or None)
         )  # The `or None` is for the linter
 
         for pageserver in env.pageservers:
diff --git a/test_runner/regress/test_wal_acceptor.py b/test_runner/regress/test_wal_acceptor.py
index 50fac441c0..4bf8cfe88f 100644
--- a/test_runner/regress/test_wal_acceptor.py
+++ b/test_runner/regress/test_wal_acceptor.py
@@ -19,7 +19,6 @@ import psycopg2.errors
 import psycopg2.extras
 import pytest
 import requests
-from fixtures.broker import NeonBroker
 from fixtures.common_types import Lsn, TenantId, TimelineId
 from fixtures.log_helper import log
 from fixtures.metrics import parse_metrics
@@ -1439,11 +1438,7 @@ class SafekeeperEnv:
     ):
         self.repo_dir = repo_dir
         self.port_distributor = port_distributor
-        self.broker = NeonBroker(
-            logfile=Path(self.repo_dir) / "storage_broker.log",
-            port=self.port_distributor.get_port(),
-            neon_binpath=neon_binpath,
-        )
+        self.fake_broker_endpoint = f"http://127.0.0.1:{port_distributor.get_port()}"
         self.pg_bin = pg_bin
         self.num_safekeepers = num_safekeepers
         self.bin_safekeeper = str(neon_binpath / "safekeeper")
@@ -1492,7 +1487,7 @@ class SafekeeperEnv:
             "--id",
             str(i),
             "--broker-endpoint",
-            self.broker.client_url(),
+            self.fake_broker_endpoint,
         ]
         log.info(f'Running command "{" ".join(cmd)}"')
 

From 3454ef7507000f05707418324dad392b366f8de3 Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki@neon.tech>
Date: Wed, 18 Sep 2024 13:16:51 +0300
Subject: [PATCH 152/159] Refactor ImageLayerWriter to avoid passing a Timeline
 to finish() (#9028)

Commit ca5390a89d made a similar change to DeltaLayerWriter.

We bumped into this with Stas with our hackathon project, to create a
standalong program to create image layers directly from a Postgres data
directory. It needs to create image layers without having a Timeline and
other pageserver machinery.

This downgrades the "created image layer {}" message from INFO to TRACE
level. TRACE is used for the corresponding message on delta layer
creation too. The path logged in the message is now the temporary path,
before the file is renamed to its final name. Again commit ca5390a89d
made the same change for the message on delta layer creation.
---
 .../src/tenant/storage_layer/image_layer.rs   | 43 ++++++++-----------
 .../src/tenant/storage_layer/split_writer.rs  | 16 +++----
 pageserver/src/tenant/timeline.rs             |  9 ++--
 pageserver/src/tenant/timeline/compaction.rs  |  6 ++-
 4 files changed, 35 insertions(+), 39 deletions(-)

diff --git a/pageserver/src/tenant/storage_layer/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs
index 875e223c9c..5de2582ab7 100644
--- a/pageserver/src/tenant/storage_layer/image_layer.rs
+++ b/pageserver/src/tenant/storage_layer/image_layer.rs
@@ -38,7 +38,7 @@ use crate::tenant::timeline::GetVectoredError;
 use crate::tenant::vectored_blob_io::{
     BlobFlag, StreamingVectoredReadPlanner, VectoredBlobReader, VectoredRead, VectoredReadPlanner,
 };
-use crate::tenant::{PageReconstructError, Timeline};
+use crate::tenant::PageReconstructError;
 use crate::virtual_file::owned_buffers_io::io_buf_ext::IoBufExt;
 use crate::virtual_file::{self, VirtualFile};
 use crate::{IMAGE_FILE_MAGIC, STORAGE_FORMAT_VERSION, TEMP_FILE_SUFFIX};
@@ -58,7 +58,6 @@ use std::io::SeekFrom;
 use std::ops::Range;
 use std::os::unix::prelude::FileExt;
 use std::str::FromStr;
-use std::sync::Arc;
 use tokio::sync::OnceCell;
 use tokio_stream::StreamExt;
 use tracing::*;
@@ -70,9 +69,7 @@ use utils::{
 };
 
 use super::layer_name::ImageLayerName;
-use super::{
-    AsLayerDesc, Layer, LayerName, PersistentLayerDesc, ResidentLayer, ValuesReconstructState,
-};
+use super::{AsLayerDesc, LayerName, PersistentLayerDesc, ValuesReconstructState};
 
 ///
 /// Header stored in the beginning of the file
@@ -800,10 +797,9 @@ impl ImageLayerWriterInner {
     ///
     async fn finish(
         self,
-        timeline: &Arc<Timeline>,
         ctx: &RequestContext,
         end_key: Option<Key>,
-    ) -> anyhow::Result<ResidentLayer> {
+    ) -> anyhow::Result<(PersistentLayerDesc, Utf8PathBuf)> {
         let index_start_blk =
             ((self.blob_writer.size() + PAGE_SZ as u64 - 1) / PAGE_SZ as u64) as u32;
 
@@ -879,12 +875,9 @@ impl ImageLayerWriterInner {
         // fsync the file
         file.sync_all().await?;
 
-        // FIXME: why not carry the virtualfile here, it supports renaming?
-        let layer = Layer::finish_creating(self.conf, timeline, desc, &self.path)?;
+        trace!("created image layer {}", self.path);
 
-        info!("created image layer {}", layer.local_path());
-
-        Ok(layer)
+        Ok((desc, self.path))
     }
 }
 
@@ -963,24 +956,18 @@ impl ImageLayerWriter {
     ///
     pub(crate) async fn finish(
         mut self,
-        timeline: &Arc<Timeline>,
         ctx: &RequestContext,
-    ) -> anyhow::Result<super::ResidentLayer> {
-        self.inner.take().unwrap().finish(timeline, ctx, None).await
+    ) -> anyhow::Result<(PersistentLayerDesc, Utf8PathBuf)> {
+        self.inner.take().unwrap().finish(ctx, None).await
     }
 
     /// Finish writing the image layer with an end key, used in [`super::split_writer::SplitImageLayerWriter`]. The end key determines the end of the image layer's covered range and is exclusive.
     pub(super) async fn finish_with_end_key(
         mut self,
-        timeline: &Arc<Timeline>,
         end_key: Key,
         ctx: &RequestContext,
-    ) -> anyhow::Result<super::ResidentLayer> {
-        self.inner
-            .take()
-            .unwrap()
-            .finish(timeline, ctx, Some(end_key))
-            .await
+    ) -> anyhow::Result<(PersistentLayerDesc, Utf8PathBuf)> {
+        self.inner.take().unwrap().finish(ctx, Some(end_key)).await
     }
 }
 
@@ -1084,7 +1071,7 @@ mod test {
         tenant::{
             config::TenantConf,
             harness::{TenantHarness, TIMELINE_ID},
-            storage_layer::ResidentLayer,
+            storage_layer::{Layer, ResidentLayer},
             vectored_blob_io::StreamingVectoredReadPlanner,
             Tenant, Timeline,
         },
@@ -1155,7 +1142,8 @@ mod test {
 
                 key = key.next();
             }
-            writer.finish(&timeline, &ctx).await.unwrap()
+            let (desc, path) = writer.finish(&ctx).await.unwrap();
+            Layer::finish_creating(tenant.conf, &timeline, desc, &path).unwrap()
         };
         let original_size = resident.metadata().file_size;
 
@@ -1217,7 +1205,9 @@ mod test {
                 .await
                 .unwrap();
             let replacement = if wrote_keys > 0 {
-                Some(filtered_writer.finish(&timeline, &ctx).await.unwrap())
+                let (desc, path) = filtered_writer.finish(&ctx).await.unwrap();
+                let resident = Layer::finish_creating(tenant.conf, &timeline, desc, &path).unwrap();
+                Some(resident)
             } else {
                 None
             };
@@ -1290,7 +1280,8 @@ mod test {
         for (key, img) in images {
             writer.put_image(key, img, ctx).await?;
         }
-        let img_layer = writer.finish(tline, ctx).await?;
+        let (desc, path) = writer.finish(ctx).await?;
+        let img_layer = Layer::finish_creating(tenant.conf, tline, desc, &path)?;
 
         Ok::<_, anyhow::Error>(img_layer)
     }
diff --git a/pageserver/src/tenant/storage_layer/split_writer.rs b/pageserver/src/tenant/storage_layer/split_writer.rs
index 40a6a77a50..b499a0eef4 100644
--- a/pageserver/src/tenant/storage_layer/split_writer.rs
+++ b/pageserver/src/tenant/storage_layer/split_writer.rs
@@ -121,11 +121,11 @@ impl SplitImageLayerWriter {
                 self.generated_layers
                     .push(SplitWriterResult::Discarded(layer_key));
             } else {
-                self.generated_layers.push(SplitWriterResult::Produced(
-                    prev_image_writer
-                        .finish_with_end_key(tline, key, ctx)
-                        .await?,
-                ));
+                let (desc, path) = prev_image_writer.finish_with_end_key(key, ctx).await?;
+
+                let layer = Layer::finish_creating(self.conf, tline, desc, &path)?;
+                self.generated_layers
+                    .push(SplitWriterResult::Produced(layer));
             }
         }
         self.inner.put_image(key, img, ctx).await
@@ -170,9 +170,9 @@ impl SplitImageLayerWriter {
         if discard(&layer_key).await {
             generated_layers.push(SplitWriterResult::Discarded(layer_key));
         } else {
-            generated_layers.push(SplitWriterResult::Produced(
-                inner.finish_with_end_key(tline, end_key, ctx).await?,
-            ));
+            let (desc, path) = inner.finish_with_end_key(end_key, ctx).await?;
+            let layer = Layer::finish_creating(self.conf, tline, desc, &path)?;
+            generated_layers.push(SplitWriterResult::Produced(layer));
         }
         Ok(generated_layers)
     }
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index 262dccac7d..f66491d962 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -4013,7 +4013,8 @@ impl Timeline {
         if wrote_keys {
             // Normal path: we have written some data into the new image layer for this
             // partition, so flush it to disk.
-            let image_layer = image_layer_writer.finish(self, ctx).await?;
+            let (desc, path) = image_layer_writer.finish(ctx).await?;
+            let image_layer = Layer::finish_creating(self.conf, self, desc, &path)?;
             Ok(ImageLayerCreationOutcome {
                 image: Some(image_layer),
                 next_start_key: img_range.end,
@@ -4101,7 +4102,8 @@ impl Timeline {
         if wrote_any_image {
             // Normal path: we have written some data into the new image layer for this
             // partition, so flush it to disk.
-            let image_layer = image_layer_writer.finish(self, ctx).await?;
+            let (desc, path) = image_layer_writer.finish(ctx).await?;
+            let image_layer = Layer::finish_creating(self.conf, self, desc, &path)?;
             Ok(ImageLayerCreationOutcome {
                 image: Some(image_layer),
                 next_start_key: img_range.end,
@@ -5403,7 +5405,8 @@ impl Timeline {
         for (key, img) in images {
             image_layer_writer.put_image(key, img, ctx).await?;
         }
-        let image_layer = image_layer_writer.finish(self, ctx).await?;
+        let (desc, path) = image_layer_writer.finish(ctx).await?;
+        let image_layer = Layer::finish_creating(self.conf, self, desc, &path)?;
 
         {
             let mut guard = self.layers.write().await;
diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs
index 0b5c520ba7..d1f06e3480 100644
--- a/pageserver/src/tenant/timeline/compaction.rs
+++ b/pageserver/src/tenant/timeline/compaction.rs
@@ -563,10 +563,12 @@ impl Timeline {
                 .await?;
 
             if keys_written > 0 {
-                let new_layer = image_layer_writer
-                    .finish(self, ctx)
+                let (desc, path) = image_layer_writer
+                    .finish(ctx)
                     .await
                     .map_err(CompactionError::Other)?;
+                let new_layer = Layer::finish_creating(self.conf, self, desc, &path)
+                    .map_err(CompactionError::Other)?;
                 tracing::info!(layer=%new_layer, "Rewrote layer, {} -> {} bytes",
                     layer.metadata().file_size,
                     new_layer.metadata().file_size);

From c5cd8577ff6d96e8153dd22af17373c4351e52e4 Mon Sep 17 00:00:00 2001
From: Folke Behrens <folke@neon.tech>
Date: Wed, 18 Sep 2024 13:58:51 +0200
Subject: [PATCH 153/159] proxy: make sql-over-http max request/response sizes
 configurable (#9029)

---
 proxy/src/bin/local_proxy.rs          |   8 ++
 proxy/src/bin/proxy.rs                |   8 ++
 proxy/src/config.rs                   |   2 +
 proxy/src/serverless/conn_pool.rs     |   2 +
 proxy/src/serverless/sql_over_http.rs | 114 +++++++++++++++-----------
 5 files changed, 85 insertions(+), 49 deletions(-)

diff --git a/proxy/src/bin/local_proxy.rs b/proxy/src/bin/local_proxy.rs
index 6eba71df1b..94365ddf05 100644
--- a/proxy/src/bin/local_proxy.rs
+++ b/proxy/src/bin/local_proxy.rs
@@ -92,6 +92,12 @@ struct SqlOverHttpArgs {
 
     #[clap(long, default_value_t = 16)]
     sql_over_http_cancel_set_shards: usize,
+
+    #[clap(long, default_value_t = 10 * 1024 * 1024)] // 10 MiB
+    sql_over_http_max_request_size_bytes: u64,
+
+    #[clap(long, default_value_t = 10 * 1024 * 1024)] // 10 MiB
+    sql_over_http_max_response_size_bytes: usize,
 }
 
 #[tokio::main]
@@ -208,6 +214,8 @@ fn build_config(args: &LocalProxyCliArgs) -> anyhow::Result<&'static ProxyConfig
         },
         cancel_set: CancelSet::new(args.sql_over_http.sql_over_http_cancel_set_shards),
         client_conn_threshold: args.sql_over_http.sql_over_http_client_conn_threshold,
+        max_request_size_bytes: args.sql_over_http.sql_over_http_max_request_size_bytes,
+        max_response_size_bytes: args.sql_over_http.sql_over_http_max_response_size_bytes,
     };
 
     Ok(Box::leak(Box::new(ProxyConfig {
diff --git a/proxy/src/bin/proxy.rs b/proxy/src/bin/proxy.rs
index ca9aeb04d8..e5c5b47795 100644
--- a/proxy/src/bin/proxy.rs
+++ b/proxy/src/bin/proxy.rs
@@ -268,6 +268,12 @@ struct SqlOverHttpArgs {
 
     #[clap(long, default_value_t = 64)]
     sql_over_http_cancel_set_shards: usize,
+
+    #[clap(long, default_value_t = 10 * 1024 * 1024)] // 10 MiB
+    sql_over_http_max_request_size_bytes: u64,
+
+    #[clap(long, default_value_t = 10 * 1024 * 1024)] // 10 MiB
+    sql_over_http_max_response_size_bytes: usize,
 }
 
 #[tokio::main]
@@ -679,6 +685,8 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> {
         },
         cancel_set: CancelSet::new(args.sql_over_http.sql_over_http_cancel_set_shards),
         client_conn_threshold: args.sql_over_http.sql_over_http_client_conn_threshold,
+        max_request_size_bytes: args.sql_over_http.sql_over_http_max_request_size_bytes,
+        max_response_size_bytes: args.sql_over_http.sql_over_http_max_response_size_bytes,
     };
     let authentication_config = AuthenticationConfig {
         thread_pool,
diff --git a/proxy/src/config.rs b/proxy/src/config.rs
index 1cda6d200c..373e4cf650 100644
--- a/proxy/src/config.rs
+++ b/proxy/src/config.rs
@@ -56,6 +56,8 @@ pub struct HttpConfig {
     pub pool_options: GlobalConnPoolOptions,
     pub cancel_set: CancelSet,
     pub client_conn_threshold: u64,
+    pub max_request_size_bytes: u64,
+    pub max_response_size_bytes: usize,
 }
 
 pub struct AuthenticationConfig {
diff --git a/proxy/src/serverless/conn_pool.rs b/proxy/src/serverless/conn_pool.rs
index bea599e9b9..6c32d5df0e 100644
--- a/proxy/src/serverless/conn_pool.rs
+++ b/proxy/src/serverless/conn_pool.rs
@@ -776,6 +776,8 @@ mod tests {
             },
             cancel_set: CancelSet::new(0),
             client_conn_threshold: u64::MAX,
+            max_request_size_bytes: u64::MAX,
+            max_response_size_bytes: usize::MAX,
         }));
         let pool = GlobalConnPool::new(config);
         let conn_info = ConnInfo {
diff --git a/proxy/src/serverless/sql_over_http.rs b/proxy/src/serverless/sql_over_http.rs
index 2188edc8c5..06e540d149 100644
--- a/proxy/src/serverless/sql_over_http.rs
+++ b/proxy/src/serverless/sql_over_http.rs
@@ -87,9 +87,6 @@ enum Payload {
     Batch(BatchQueryData),
 }
 
-const MAX_RESPONSE_SIZE: usize = 10 * 1024 * 1024; // 10 MiB
-const MAX_REQUEST_SIZE: u64 = 10 * 1024 * 1024; // 10 MiB
-
 static CONN_STRING: HeaderName = HeaderName::from_static("neon-connection-string");
 static RAW_TEXT_OUTPUT: HeaderName = HeaderName::from_static("neon-raw-text-output");
 static ARRAY_MODE: HeaderName = HeaderName::from_static("neon-array-mode");
@@ -366,10 +363,10 @@ pub(crate) enum SqlOverHttpError {
     ConnectCompute(#[from] HttpConnError),
     #[error("{0}")]
     ConnInfo(#[from] ConnInfoError),
-    #[error("request is too large (max is {MAX_REQUEST_SIZE} bytes)")]
-    RequestTooLarge,
-    #[error("response is too large (max is {MAX_RESPONSE_SIZE} bytes)")]
-    ResponseTooLarge,
+    #[error("request is too large (max is {0} bytes)")]
+    RequestTooLarge(u64),
+    #[error("response is too large (max is {0} bytes)")]
+    ResponseTooLarge(usize),
     #[error("invalid isolation level")]
     InvalidIsolationLevel,
     #[error("{0}")]
@@ -386,8 +383,8 @@ impl ReportableError for SqlOverHttpError {
             SqlOverHttpError::ReadPayload(e) => e.get_error_kind(),
             SqlOverHttpError::ConnectCompute(e) => e.get_error_kind(),
             SqlOverHttpError::ConnInfo(e) => e.get_error_kind(),
-            SqlOverHttpError::RequestTooLarge => ErrorKind::User,
-            SqlOverHttpError::ResponseTooLarge => ErrorKind::User,
+            SqlOverHttpError::RequestTooLarge(_) => ErrorKind::User,
+            SqlOverHttpError::ResponseTooLarge(_) => ErrorKind::User,
             SqlOverHttpError::InvalidIsolationLevel => ErrorKind::User,
             SqlOverHttpError::Postgres(p) => p.get_error_kind(),
             SqlOverHttpError::JsonConversion(_) => ErrorKind::Postgres,
@@ -402,8 +399,8 @@ impl UserFacingError for SqlOverHttpError {
             SqlOverHttpError::ReadPayload(p) => p.to_string(),
             SqlOverHttpError::ConnectCompute(c) => c.to_string_client(),
             SqlOverHttpError::ConnInfo(c) => c.to_string_client(),
-            SqlOverHttpError::RequestTooLarge => self.to_string(),
-            SqlOverHttpError::ResponseTooLarge => self.to_string(),
+            SqlOverHttpError::RequestTooLarge(_) => self.to_string(),
+            SqlOverHttpError::ResponseTooLarge(_) => self.to_string(),
             SqlOverHttpError::InvalidIsolationLevel => self.to_string(),
             SqlOverHttpError::Postgres(p) => p.to_string(),
             SqlOverHttpError::JsonConversion(_) => "could not parse postgres response".to_string(),
@@ -537,7 +534,7 @@ async fn handle_inner(
 
     let request_content_length = match request.body().size_hint().upper() {
         Some(v) => v,
-        None => MAX_REQUEST_SIZE + 1,
+        None => config.http_config.max_request_size_bytes + 1,
     };
     info!(request_content_length, "request size in bytes");
     Metrics::get()
@@ -547,8 +544,10 @@ async fn handle_inner(
 
     // we don't have a streaming request support yet so this is to prevent OOM
     // from a malicious user sending an extremely large request body
-    if request_content_length > MAX_REQUEST_SIZE {
-        return Err(SqlOverHttpError::RequestTooLarge);
+    if request_content_length > config.http_config.max_request_size_bytes {
+        return Err(SqlOverHttpError::RequestTooLarge(
+            config.http_config.max_request_size_bytes,
+        ));
     }
 
     let fetch_and_process_request = Box::pin(
@@ -612,7 +611,10 @@ async fn handle_inner(
 
     // Now execute the query and return the result.
     let json_output = match payload {
-        Payload::Single(stmt) => stmt.process(cancel, &mut client, parsed_headers).await?,
+        Payload::Single(stmt) => {
+            stmt.process(config, cancel, &mut client, parsed_headers)
+                .await?
+        }
         Payload::Batch(statements) => {
             if parsed_headers.txn_read_only {
                 response = response.header(TXN_READ_ONLY.clone(), &HEADER_VALUE_TRUE);
@@ -628,7 +630,7 @@ async fn handle_inner(
             }
 
             statements
-                .process(cancel, &mut client, parsed_headers)
+                .process(config, cancel, &mut client, parsed_headers)
                 .await?
         }
     };
@@ -656,6 +658,7 @@ async fn handle_inner(
 impl QueryData {
     async fn process(
         self,
+        config: &'static ProxyConfig,
         cancel: CancellationToken,
         client: &mut Client<tokio_postgres::Client>,
         parsed_headers: HttpHeaders,
@@ -664,7 +667,7 @@ impl QueryData {
         let cancel_token = inner.cancel_token();
 
         let res = match select(
-            pin!(query_to_json(&*inner, self, &mut 0, parsed_headers)),
+            pin!(query_to_json(config, &*inner, self, &mut 0, parsed_headers)),
             pin!(cancel.cancelled()),
         )
         .await
@@ -727,6 +730,7 @@ impl QueryData {
 impl BatchQueryData {
     async fn process(
         self,
+        config: &'static ProxyConfig,
         cancel: CancellationToken,
         client: &mut Client<tokio_postgres::Client>,
         parsed_headers: HttpHeaders,
@@ -751,44 +755,52 @@ impl BatchQueryData {
             discard.discard();
         })?;
 
-        let json_output =
-            match query_batch(cancel.child_token(), &transaction, self, parsed_headers).await {
-                Ok(json_output) => {
-                    info!("commit");
-                    let status = transaction.commit().await.inspect_err(|_| {
-                        // if we cannot commit - for now don't return connection to pool
-                        // TODO: get a query status from the error
-                        discard.discard();
-                    })?;
-                    discard.check_idle(status);
-                    json_output
-                }
-                Err(SqlOverHttpError::Cancelled(_)) => {
-                    if let Err(err) = cancel_token.cancel_query(NoTls).await {
-                        tracing::error!(?err, "could not cancel query");
-                    }
-                    // TODO: after cancelling, wait to see if we can get a status. maybe the connection is still safe.
+        let json_output = match query_batch(
+            config,
+            cancel.child_token(),
+            &transaction,
+            self,
+            parsed_headers,
+        )
+        .await
+        {
+            Ok(json_output) => {
+                info!("commit");
+                let status = transaction.commit().await.inspect_err(|_| {
+                    // if we cannot commit - for now don't return connection to pool
+                    // TODO: get a query status from the error
                     discard.discard();
+                })?;
+                discard.check_idle(status);
+                json_output
+            }
+            Err(SqlOverHttpError::Cancelled(_)) => {
+                if let Err(err) = cancel_token.cancel_query(NoTls).await {
+                    tracing::error!(?err, "could not cancel query");
+                }
+                // TODO: after cancelling, wait to see if we can get a status. maybe the connection is still safe.
+                discard.discard();
 
-                    return Err(SqlOverHttpError::Cancelled(SqlOverHttpCancel::Postgres));
-                }
-                Err(err) => {
-                    info!("rollback");
-                    let status = transaction.rollback().await.inspect_err(|_| {
-                        // if we cannot rollback - for now don't return connection to pool
-                        // TODO: get a query status from the error
-                        discard.discard();
-                    })?;
-                    discard.check_idle(status);
-                    return Err(err);
-                }
-            };
+                return Err(SqlOverHttpError::Cancelled(SqlOverHttpCancel::Postgres));
+            }
+            Err(err) => {
+                info!("rollback");
+                let status = transaction.rollback().await.inspect_err(|_| {
+                    // if we cannot rollback - for now don't return connection to pool
+                    // TODO: get a query status from the error
+                    discard.discard();
+                })?;
+                discard.check_idle(status);
+                return Err(err);
+            }
+        };
 
         Ok(json_output)
     }
 }
 
 async fn query_batch(
+    config: &'static ProxyConfig,
     cancel: CancellationToken,
     transaction: &Transaction<'_>,
     queries: BatchQueryData,
@@ -798,6 +810,7 @@ async fn query_batch(
     let mut current_size = 0;
     for stmt in queries.queries {
         let query = pin!(query_to_json(
+            config,
             transaction,
             stmt,
             &mut current_size,
@@ -826,6 +839,7 @@ async fn query_batch(
 }
 
 async fn query_to_json<T: GenericClient>(
+    config: &'static ProxyConfig,
     client: &T,
     data: QueryData,
     current_size: &mut usize,
@@ -846,8 +860,10 @@ async fn query_to_json<T: GenericClient>(
         rows.push(row);
         // we don't have a streaming response support yet so this is to prevent OOM
         // from a malicious query (eg a cross join)
-        if *current_size > MAX_RESPONSE_SIZE {
-            return Err(SqlOverHttpError::ResponseTooLarge);
+        if *current_size > config.http_config.max_response_size_bytes {
+            return Err(SqlOverHttpError::ResponseTooLarge(
+                config.http_config.max_response_size_bytes,
+            ));
         }
     }
 

From e161a2fa424e392c74a5f76e8964227e11f72e9c Mon Sep 17 00:00:00 2001
From: Alexander Bayandin <alexander@neon.tech>
Date: Wed, 18 Sep 2024 14:26:47 +0100
Subject: [PATCH 154/159] CI(deploy): fix deploy to staging and prod (#9030)

## Problem

It turns out the previous approach (with `skip_if` input) doesn't work
(from https://github.com/neondatabase/neon/pull/9017).
Revert it and use more straightforward if-conditions

## Summary of changes
- Revert efbe8db7f1bd7775f7dff276ee95fd187cd33c2c
- Add if-condition to`promote-compatibility-data` job and relevant
comments
---
 .github/workflows/_push-to-acr.yml   |  6 ------
 .github/workflows/build_and_test.yml | 10 ++++++----
 2 files changed, 6 insertions(+), 10 deletions(-)

diff --git a/.github/workflows/_push-to-acr.yml b/.github/workflows/_push-to-acr.yml
index 7b6eba2c06..c304172ff7 100644
--- a/.github/workflows/_push-to-acr.yml
+++ b/.github/workflows/_push-to-acr.yml
@@ -26,15 +26,9 @@ on:
         description: Azure tenant ID
         required: true
         type: string
-      skip_if:
-        description: Skip the job if this expression is true
-        required: true
-        type: boolean
 
 jobs:
   push-to-acr:
-    if: ${{ !inputs.skip_if }}
-
     runs-on: ubuntu-22.04
     permissions:
       contents: read  # This is required for actions/checkout
diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml
index d46b8dc1f5..a210c962cb 100644
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -849,6 +849,7 @@ jobs:
           done
 
   push-to-acr-dev:
+    if: github.ref_name == 'main'
     needs: [ tag, promote-images ]
     uses: ./.github/workflows/_push-to-acr.yml
     with:
@@ -858,9 +859,9 @@ jobs:
       registry_name: ${{ vars.AZURE_DEV_REGISTRY_NAME }}
       subscription_id: ${{ vars.AZURE_DEV_SUBSCRIPTION_ID }}
       tenant_id: ${{ vars.AZURE_TENANT_ID }}
-      skip_if: ${{ github.ref_name != 'main' }}
 
   push-to-acr-prod:
+    if: github.ref_name == 'release'|| github.ref_name == 'release-proxy'
     needs: [ tag, promote-images ]
     uses: ./.github/workflows/_push-to-acr.yml
     with:
@@ -870,7 +871,6 @@ jobs:
       registry_name: ${{ vars.AZURE_PROD_REGISTRY_NAME }}
       subscription_id: ${{ vars.AZURE_PROD_SUBSCRIPTION_ID }}
       tenant_id: ${{ vars.AZURE_TENANT_ID }}
-      skip_if: ${{ !startsWith(github.ref_name, 'release') }}
 
   trigger-custom-extensions-build-and-wait:
     needs: [ check-permissions, tag ]
@@ -948,7 +948,8 @@ jobs:
 
   deploy:
     needs: [ check-permissions, promote-images, tag, build-and-test-locally, trigger-custom-extensions-build-and-wait, push-to-acr-dev, push-to-acr-prod ]
-    if: github.ref_name == 'main' || github.ref_name == 'release' || github.ref_name == 'release-proxy'
+    # `!failure() && !cancelled()` is required because the workflow depends on the job that can be skipped: `push-to-acr-dev` and `push-to-acr-prod`
+    if: (github.ref_name == 'main' || github.ref_name == 'release' || github.ref_name == 'release-proxy') && !failure() && !cancelled()
 
     runs-on: [ self-hosted, small ]
     container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:latest
@@ -1046,7 +1047,8 @@ jobs:
   # The job runs on `release` branch and copies compatibility data and Neon artifact from the last *release PR* to the latest directory
   promote-compatibility-data:
     needs: [ deploy ]
-    if: github.ref_name == 'release'
+    # `!failure() && !cancelled()` is required because the workflow transitively depends on the job that can be skipped: `push-to-acr-dev` and `push-to-acr-prod`
+    if: github.ref_name == 'release' && !failure() && !cancelled()
 
     runs-on: ubuntu-22.04
     steps:

From 2f37f0384c1d122fe23512be3b9518288f69b5fe Mon Sep 17 00:00:00 2001
From: Tristan Partin <tristan@neon.tech>
Date: Tue, 17 Sep 2024 12:28:29 -0500
Subject: [PATCH 155/159] Add v17 to revisions.json

Signed-off-by: Tristan Partin <tristan@neon.tech>
---
 vendor/revisions.json | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/vendor/revisions.json b/vendor/revisions.json
index 3a65a507f3..c2c34962bb 100644
--- a/vendor/revisions.json
+++ b/vendor/revisions.json
@@ -1,4 +1,8 @@
 {
+  "v17": [
+    "17rc1",
+    "9156d63ce253bed9d1f76355ceec610e444eaffa"
+  ],
   "v16": [
     "16.4",
     "0baa7346dfd42d61912eeca554c9bb0a190f0a1e"

From ac6a1151aeaf8fcc007b833a97dca57328c44d05 Mon Sep 17 00:00:00 2001
From: Alexander Bayandin <alexander@neon.tech>
Date: Tue, 17 Sep 2024 19:08:47 +0100
Subject: [PATCH 156/159] test_postgres_version: reenable version check for
 prereleased versions

---
 test_runner/regress/test_postgres_version.py | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/test_runner/regress/test_postgres_version.py b/test_runner/regress/test_postgres_version.py
index 4145a303c6..d8626c15a5 100644
--- a/test_runner/regress/test_postgres_version.py
+++ b/test_runner/regress/test_postgres_version.py
@@ -30,9 +30,8 @@ def test_postgres_version(base_dir: Path, pg_bin: PgBin, pg_version: PgVersion):
     version = match.group("version")
     commit = match.group("commit")
 
-    if "." in version:
-        assert (
-            pg_version.v_prefixed in expected_revisions
-        ), f"Released PostgreSQL version `{pg_version.v_prefixed}` doesn't exist in `vendor/revisions.json`, please update it if these changes are intentional"
-        msg = f"Unexpected Postgres {pg_version} version: `{output}`, please update `vendor/revisions.json` if these changes are intentional"
-        assert [version, commit] == expected_revisions[pg_version.v_prefixed], msg
+    assert (
+        pg_version.v_prefixed in expected_revisions
+    ), f"Released PostgreSQL version `{pg_version.v_prefixed}` doesn't exist in `vendor/revisions.json`, please update it if these changes are intentional"
+    msg = f"Unexpected Postgres {pg_version} version: `{output}`, please update `vendor/revisions.json` if these changes are intentional"
+    assert [version, commit] == expected_revisions[pg_version.v_prefixed], msg

From 794bd4b866a385a104e257deae438581b0fd250d Mon Sep 17 00:00:00 2001
From: Folke Behrens <folke@neon.tech>
Date: Wed, 18 Sep 2024 17:14:53 +0200
Subject: [PATCH 157/159] proxy: mock cplane usable without allowed-ips table
 (#9046)

---
 proxy/src/bin/proxy.rs             | 21 ++++++++-------
 proxy/src/console/provider/mock.rs | 42 ++++++++++++++++++------------
 2 files changed, 38 insertions(+), 25 deletions(-)

diff --git a/proxy/src/bin/proxy.rs b/proxy/src/bin/proxy.rs
index e5c5b47795..2ac66ffe8c 100644
--- a/proxy/src/bin/proxy.rs
+++ b/proxy/src/bin/proxy.rs
@@ -62,12 +62,13 @@ static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc;
 #[derive(Clone, Debug, ValueEnum)]
 enum AuthBackendType {
     Console,
-    #[cfg(feature = "testing")]
-    Postgres,
     // clap only shows the name, not the alias, in usage text.
     // TODO: swap name/alias and deprecate "link"
     #[value(name("link"), alias("web"))]
     Web,
+
+    #[cfg(feature = "testing")]
+    Postgres,
 }
 
 /// Neon proxy/router
@@ -639,17 +640,19 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> {
             let api = console::provider::ConsoleBackend::Console(api);
             auth::Backend::Console(MaybeOwned::Owned(api), ())
         }
-        #[cfg(feature = "testing")]
-        AuthBackendType::Postgres => {
-            let url = args.auth_endpoint.parse()?;
-            let api = console::provider::mock::Api::new(url);
-            let api = console::provider::ConsoleBackend::Postgres(api);
-            auth::Backend::Console(MaybeOwned::Owned(api), ())
-        }
+
         AuthBackendType::Web => {
             let url = args.uri.parse()?;
             auth::Backend::Web(MaybeOwned::Owned(url), ())
         }
+
+        #[cfg(feature = "testing")]
+        AuthBackendType::Postgres => {
+            let url = args.auth_endpoint.parse()?;
+            let api = console::provider::mock::Api::new(url, !args.is_private_access_proxy);
+            let api = console::provider::ConsoleBackend::Postgres(api);
+            auth::Backend::Console(MaybeOwned::Owned(api), ())
+        }
     };
 
     let config::ConcurrencyLockOptions {
diff --git a/proxy/src/console/provider/mock.rs b/proxy/src/console/provider/mock.rs
index 08b87cd87a..1b77418de6 100644
--- a/proxy/src/console/provider/mock.rs
+++ b/proxy/src/console/provider/mock.rs
@@ -41,11 +41,15 @@ impl From<tokio_postgres::Error> for ApiError {
 #[derive(Clone)]
 pub struct Api {
     endpoint: ApiUrl,
+    ip_allowlist_check_enabled: bool,
 }
 
 impl Api {
-    pub fn new(endpoint: ApiUrl) -> Self {
-        Self { endpoint }
+    pub fn new(endpoint: ApiUrl, ip_allowlist_check_enabled: bool) -> Self {
+        Self {
+            endpoint,
+            ip_allowlist_check_enabled,
+        }
     }
 
     pub(crate) fn url(&self) -> &str {
@@ -64,6 +68,7 @@ impl Api {
                 tokio_postgres::connect(self.endpoint.as_str(), tokio_postgres::NoTls).await?;
 
             tokio::spawn(connection);
+
             let secret = if let Some(entry) = get_execute_postgres_query(
                 &client,
                 "select rolpassword from pg_catalog.pg_authid where rolname = $1",
@@ -79,21 +84,26 @@ impl Api {
                 warn!("user '{}' does not exist", user_info.user);
                 None
             };
-            let allowed_ips = match get_execute_postgres_query(
-                &client,
-                "select allowed_ips from neon_control_plane.endpoints where endpoint_id = $1",
-                &[&user_info.endpoint.as_str()],
-                "allowed_ips",
-            )
-            .await?
-            {
-                Some(s) => {
-                    info!("got allowed_ips: {s}");
-                    s.split(',')
-                        .map(|s| IpPattern::from_str(s).unwrap())
-                        .collect()
+
+            let allowed_ips = if self.ip_allowlist_check_enabled {
+                match get_execute_postgres_query(
+                    &client,
+                    "select allowed_ips from neon_control_plane.endpoints where endpoint_id = $1",
+                    &[&user_info.endpoint.as_str()],
+                    "allowed_ips",
+                )
+                .await?
+                {
+                    Some(s) => {
+                        info!("got allowed_ips: {s}");
+                        s.split(',')
+                            .map(|s| IpPattern::from_str(s).unwrap())
+                            .collect()
+                    }
+                    None => vec![],
                 }
-                None => vec![],
+            } else {
+                vec![]
             };
 
             Ok((secret, allowed_ips))

From 035a49a6b2e9ffa35dbf94d78042c147e83e896f Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Wed, 18 Sep 2024 18:17:55 +0200
Subject: [PATCH 158/159] `neon_local start`: parallel startup to break cyclic
 dependency (#8950)

(Found this useful during investigation
https://github.com/neondatabase/cloud/issues/16886.)

Problem
-------

Before this PR, `neon_local` sequentially does the following:
1. launch storcon process
2. wait for storcon to signal readiness
[here](https://github.com/neondatabase/neon/blob/75310fe441b87d399213e365f1364aa9f08aa40d/control_plane/src/storage_controller.rs#L804-L808)
3. start pageserver
4. wait for pageserver to become ready
[here](https://github.com/neondatabase/neon/blob/c43e664ff577d4568722e4e7a2b2c6267b609607/control_plane/src/pageserver.rs#L343-L346)
5. etc

The problem is that storcon's readiness waits for the
[`startup_reconcile`](https://github.com/neondatabase/neon/blob/cbcd4058edb7a2c2bb3bfe1a6fc1ffb0d820b870/storage_controller/src/service.rs#L520-L523)
to complete.

But pageservers aren't started at this point.

So, worst case we wait for `STARTUP_RECONCILE_TIMEOUT/2`, i.e., 15s.

This is more than the 10s default timeout allowed by neon_local.

So, the result is that `neon_local start` fails to start storcon and
stops everything.

Solution
--------

In this PR I choose the the radical solution to start everything in
parallel.

It junks up the output because we do stuff like `print!(".")` to
indicate progress.
We should just abandon that.
And switch to `utils::logging` + `tracing` with separate spans for each
component.
I can do that in this PR or we leave it as a follow-up.

Alternatives Considered
-----------------------

The Pageserver's `/v1/status` or in fact any endpoint of the mgmt API
will not `accept()` on the mgmt API socket until after the `re-attach`
call to storcon returned success.

So, it's insufficient to change the startup order to start Pageservers
first.

We cannot easily change Pageserver startup order because
`init_tenant_mgr` must complete before we start serving the mgmt API.
Otherwise tenant detach calls et al can race with `init_tenant_mgr`.

We'd have to add a "loading" state to tenant mgr and make all API
endpoints except `/v1/status` wait for _that_ to complete.


Related
-------

- https://github.com/neondatabase/neon/pull/6475
---
 control_plane/src/background_process.rs |   2 +-
 control_plane/src/bin/neon_local.rs     | 145 ++++++++++++++++--------
 control_plane/src/endpoint.rs           |   2 +-
 3 files changed, 99 insertions(+), 50 deletions(-)

diff --git a/control_plane/src/background_process.rs b/control_plane/src/background_process.rs
index 619c5bce3e..94a072e394 100644
--- a/control_plane/src/background_process.rs
+++ b/control_plane/src/background_process.rs
@@ -151,7 +151,7 @@ where
                     print!(".");
                     io::stdout().flush().unwrap();
                 }
-                thread::sleep(RETRY_INTERVAL);
+                tokio::time::sleep(RETRY_INTERVAL).await;
             }
             Err(e) => {
                 println!("error starting process {process_name:?}: {e:#}");
diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs
index 1c94c42865..92f609761a 100644
--- a/control_plane/src/bin/neon_local.rs
+++ b/control_plane/src/bin/neon_local.rs
@@ -34,12 +34,14 @@ use safekeeper_api::{
     DEFAULT_HTTP_LISTEN_PORT as DEFAULT_SAFEKEEPER_HTTP_PORT,
     DEFAULT_PG_LISTEN_PORT as DEFAULT_SAFEKEEPER_PG_PORT,
 };
+use std::borrow::Cow;
 use std::collections::{BTreeSet, HashMap};
 use std::path::PathBuf;
 use std::process::exit;
 use std::str::FromStr;
 use std::time::Duration;
 use storage_broker::DEFAULT_LISTEN_ADDR as DEFAULT_BROKER_ADDR;
+use tokio::task::JoinSet;
 use url::Host;
 use utils::{
     auth::{Claims, Scope},
@@ -87,35 +89,35 @@ fn main() -> Result<()> {
 
     // Check for 'neon init' command first.
     let subcommand_result = if sub_name == "init" {
-        handle_init(sub_args).map(Some)
+        handle_init(sub_args).map(|env| Some(Cow::Owned(env)))
     } else {
         // all other commands need an existing config
-        let mut env =
-            LocalEnv::load_config(&local_env::base_path()).context("Error loading config")?;
-        let original_env = env.clone();
 
+        let env = LocalEnv::load_config(&local_env::base_path()).context("Error loading config")?;
+        let original_env = env.clone();
+        let env = Box::leak(Box::new(env));
         let rt = tokio::runtime::Builder::new_current_thread()
             .enable_all()
             .build()
             .unwrap();
 
         let subcommand_result = match sub_name {
-            "tenant" => rt.block_on(handle_tenant(sub_args, &mut env)),
-            "timeline" => rt.block_on(handle_timeline(sub_args, &mut env)),
-            "start" => rt.block_on(handle_start_all(&env, get_start_timeout(sub_args))),
-            "stop" => rt.block_on(handle_stop_all(sub_args, &env)),
-            "pageserver" => rt.block_on(handle_pageserver(sub_args, &env)),
-            "storage_controller" => rt.block_on(handle_storage_controller(sub_args, &env)),
-            "storage_broker" => rt.block_on(handle_storage_broker(sub_args, &env)),
-            "safekeeper" => rt.block_on(handle_safekeeper(sub_args, &env)),
-            "endpoint" => rt.block_on(handle_endpoint(sub_args, &env)),
-            "mappings" => handle_mappings(sub_args, &mut env),
+            "tenant" => rt.block_on(handle_tenant(sub_args, env)),
+            "timeline" => rt.block_on(handle_timeline(sub_args, env)),
+            "start" => rt.block_on(handle_start_all(env, get_start_timeout(sub_args))),
+            "stop" => rt.block_on(handle_stop_all(sub_args, env)),
+            "pageserver" => rt.block_on(handle_pageserver(sub_args, env)),
+            "storage_controller" => rt.block_on(handle_storage_controller(sub_args, env)),
+            "storage_broker" => rt.block_on(handle_storage_broker(sub_args, env)),
+            "safekeeper" => rt.block_on(handle_safekeeper(sub_args, env)),
+            "endpoint" => rt.block_on(handle_endpoint(sub_args, env)),
+            "mappings" => handle_mappings(sub_args, env),
             "pg" => bail!("'pg' subcommand has been renamed to 'endpoint'"),
             _ => bail!("unexpected subcommand {sub_name}"),
         };
 
-        if original_env != env {
-            subcommand_result.map(|()| Some(env))
+        if &original_env != env {
+            subcommand_result.map(|()| Some(Cow::Borrowed(env)))
         } else {
             subcommand_result.map(|()| None)
         }
@@ -1273,48 +1275,95 @@ async fn handle_storage_broker(sub_match: &ArgMatches, env: &local_env::LocalEnv
 }
 
 async fn handle_start_all(
-    env: &local_env::LocalEnv,
+    env: &'static local_env::LocalEnv,
     retry_timeout: &Duration,
 ) -> anyhow::Result<()> {
+    let Err(errors) = handle_start_all_impl(env, *retry_timeout).await else {
+        neon_start_status_check(env, retry_timeout)
+            .await
+            .context("status check after successful startup of all services")?;
+        return Ok(());
+    };
+
+    eprintln!("startup failed because one or more services could not be started");
+
+    for e in errors {
+        eprintln!("{e}");
+        let debug_repr = format!("{e:?}");
+        for line in debug_repr.lines() {
+            eprintln!("  {line}");
+        }
+    }
+
+    try_stop_all(env, true).await;
+
+    exit(2);
+}
+
+/// Returns Ok() if and only if all services could be started successfully.
+/// Otherwise, returns the list of errors that occurred during startup.
+async fn handle_start_all_impl(
+    env: &'static local_env::LocalEnv,
+    retry_timeout: Duration,
+) -> Result<(), Vec<anyhow::Error>> {
     // Endpoints are not started automatically
 
-    broker::start_broker_process(env, retry_timeout).await?;
+    let mut js = JoinSet::new();
 
-    // Only start the storage controller if the pageserver is configured to need it
-    if env.control_plane_api.is_some() {
-        let storage_controller = StorageController::from_env(env);
-        if let Err(e) = storage_controller
-            .start(NeonStorageControllerStartArgs::with_default_instance_id(
-                (*retry_timeout).into(),
-            ))
-            .await
-        {
-            eprintln!("storage_controller start failed: {:#}", e);
-            try_stop_all(env, true).await;
-            exit(1);
+    // force infalliblity through closure
+    #[allow(clippy::redundant_closure_call)]
+    (|| {
+        js.spawn(async move {
+            let retry_timeout = retry_timeout;
+            broker::start_broker_process(env, &retry_timeout).await
+        });
+
+        // Only start the storage controller if the pageserver is configured to need it
+        if env.control_plane_api.is_some() {
+            js.spawn(async move {
+                let storage_controller = StorageController::from_env(env);
+                storage_controller
+                    .start(NeonStorageControllerStartArgs::with_default_instance_id(
+                        retry_timeout.into(),
+                    ))
+                    .await
+                    .map_err(|e| e.context("start storage_controller"))
+            });
+        }
+
+        for ps_conf in &env.pageservers {
+            js.spawn(async move {
+                let pageserver = PageServerNode::from_env(env, ps_conf);
+                pageserver
+                    .start(&retry_timeout)
+                    .await
+                    .map_err(|e| e.context(format!("start pageserver {}", ps_conf.id)))
+            });
+        }
+
+        for node in env.safekeepers.iter() {
+            js.spawn(async move {
+                let safekeeper = SafekeeperNode::from_env(env, node);
+                safekeeper
+                    .start(vec![], &retry_timeout)
+                    .await
+                    .map_err(|e| e.context(format!("start safekeeper {}", safekeeper.id)))
+            });
+        }
+    })();
+
+    let mut errors = Vec::new();
+    while let Some(result) = js.join_next().await {
+        let result = result.expect("we don't panic or cancel the tasks");
+        if let Err(e) = result {
+            errors.push(e);
         }
     }
 
-    for ps_conf in &env.pageservers {
-        let pageserver = PageServerNode::from_env(env, ps_conf);
-        if let Err(e) = pageserver.start(retry_timeout).await {
-            eprintln!("pageserver {} start failed: {:#}", ps_conf.id, e);
-            try_stop_all(env, true).await;
-            exit(1);
-        }
+    if !errors.is_empty() {
+        return Err(errors);
     }
 
-    for node in env.safekeepers.iter() {
-        let safekeeper = SafekeeperNode::from_env(env, node);
-        if let Err(e) = safekeeper.start(vec![], retry_timeout).await {
-            eprintln!("safekeeper {} start failed: {:#}", safekeeper.id, e);
-            try_stop_all(env, false).await;
-            exit(1);
-        }
-    }
-
-    neon_start_status_check(env, retry_timeout).await?;
-
     Ok(())
 }
 
diff --git a/control_plane/src/endpoint.rs b/control_plane/src/endpoint.rs
index 9f879c4b08..7554a03a68 100644
--- a/control_plane/src/endpoint.rs
+++ b/control_plane/src/endpoint.rs
@@ -702,7 +702,7 @@ impl Endpoint {
                     }
                 }
             }
-            std::thread::sleep(ATTEMPT_INTERVAL);
+            tokio::time::sleep(ATTEMPT_INTERVAL).await;
         }
 
         // disarm the scopeguard, let the child outlive this function (and neon_local invoction)

From 9d1c6f23d312f43ecd0d7134d7566020626548b2 Mon Sep 17 00:00:00 2001
From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com>
Date: Wed, 18 Sep 2024 14:13:57 -0400
Subject: [PATCH 159/159] fix(storage-scrubber): log version after initialize
 the logger (#9049)

When I checked the log in Grafana I couldn't find the scrubber version.
Then I realized that it should be logged after the logger gets
initialized.

## Summary of changes

Log after initializing the logger for the scrubber.

Signed-off-by: Alex Chi Z <chi@neon.tech>
---
 storage_scrubber/src/main.rs | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/storage_scrubber/src/main.rs b/storage_scrubber/src/main.rs
index c5961753c5..ee133e2e58 100644
--- a/storage_scrubber/src/main.rs
+++ b/storage_scrubber/src/main.rs
@@ -121,8 +121,6 @@ enum Command {
 async fn main() -> anyhow::Result<()> {
     let cli = Cli::parse();
 
-    tracing::info!("version: {}, build_tag {}", GIT_VERSION, BUILD_TAG);
-
     let bucket_config = BucketConfig::from_env()?;
 
     let command_log_name = match &cli.command {
@@ -142,6 +140,8 @@ async fn main() -> anyhow::Result<()> {
         chrono::Utc::now().format("%Y_%m_%d__%H_%M_%S")
     ));
 
+    tracing::info!("version: {}, build_tag {}", GIT_VERSION, BUILD_TAG);
+
     let controller_client = cli.controller_api.map(|controller_api| {
         ControllerClientConfig {
             controller_api,