From c7187be8a11a43a0bc74d8745912df4a6c5c1db7 Mon Sep 17 00:00:00 2001
From: Arseny Sher <sher-ars@yandex.ru>
Date: Wed, 7 Aug 2024 19:26:06 +0300
Subject: [PATCH 001/142] safekeeper: check for non-consecutive writes in
 safekeeper.rs

wal_storage.rs already checks this, but since this is quite a legitimate
scenario, check it at safekeeper.rs (consensus level) as well.

ref https://github.com/neondatabase/neon/issues/8212

This is take 2; the previous PR #8640 was reverted because its interplay
with another change broke test_last_log_term_switch.
---
 safekeeper/src/safekeeper.rs                  | 126 ++++++++++++++----
 safekeeper/src/wal_storage.rs                 |   6 +
 .../tests/walproposer_sim/safekeeper_disk.rs  |   4 +
 3 files changed, 113 insertions(+), 23 deletions(-)

diff --git a/safekeeper/src/safekeeper.rs b/safekeeper/src/safekeeper.rs
index 486954c7b9..dbe0034de2 100644
--- a/safekeeper/src/safekeeper.rs
+++ b/safekeeper/src/safekeeper.rs
@@ -875,6 +875,29 @@ where
             return Ok(Some(AcceptorProposerMessage::AppendResponse(resp)));
         }
 
+        // Disallow any non-sequential writes, which can result in gaps or
+        // overwrites. If we need to move the pointer, ProposerElected message
+        // should have truncated WAL first accordingly. Note that the first
+        // condition (WAL rewrite) is quite expected in real world; it happens
+        // when walproposer reconnects to safekeeper and writes some more data
+        // while first connection still gets some packets later. It might be
+        // better to not log this as error! above.
+        let write_lsn = self.wal_store.write_lsn();
+        if write_lsn > msg.h.begin_lsn {
+            bail!(
+                "append request rewrites WAL written before, write_lsn={}, msg lsn={}",
+                write_lsn,
+                msg.h.begin_lsn
+            );
+        }
+        if write_lsn < msg.h.begin_lsn && write_lsn != Lsn(0) {
+            bail!(
+                "append request creates gap in written WAL, write_lsn={}, msg lsn={}",
+                write_lsn,
+                msg.h.begin_lsn,
+            );
+        }
+
         // Now we know that we are in the same term as the proposer,
         // processing the message.
 
@@ -960,10 +983,7 @@ mod tests {
     use postgres_ffi::{XLogSegNo, WAL_SEGMENT_SIZE};
 
     use super::*;
-    use crate::{
-        state::{EvictionState, PersistedPeers, TimelinePersistentState},
-        wal_storage::Storage,
-    };
+    use crate::state::{EvictionState, PersistedPeers, TimelinePersistentState};
     use std::{ops::Deref, str::FromStr, time::Instant};
 
     // fake storage for tests
@@ -1003,6 +1023,10 @@ mod tests {
     }
 
     impl wal_storage::Storage for DummyWalStore {
+        fn write_lsn(&self) -> Lsn {
+            self.lsn
+        }
+
         fn flush_lsn(&self) -> Lsn {
             self.lsn
         }
@@ -1076,7 +1100,7 @@ mod tests {
         let mut sk = SafeKeeper::new(TimelineState::new(storage), wal_store, NodeId(0)).unwrap();
 
         let mut ar_hdr = AppendRequestHeader {
-            term: 1,
+            term: 2,
             term_start_lsn: Lsn(3),
             begin_lsn: Lsn(1),
             end_lsn: Lsn(2),
@@ -1090,24 +1114,29 @@ mod tests {
         };
 
         let pem = ProposerElected {
-            term: 1,
-            start_streaming_at: Lsn(3),
-            term_history: TermHistory(vec![TermLsn {
-                term: 1,
-                lsn: Lsn(3),
-            }]),
-            timeline_start_lsn: Lsn(0),
+            term: 2,
+            start_streaming_at: Lsn(1),
+            term_history: TermHistory(vec![
+                TermLsn {
+                    term: 1,
+                    lsn: Lsn(1),
+                },
+                TermLsn {
+                    term: 2,
+                    lsn: Lsn(3),
+                },
+            ]),
+            timeline_start_lsn: Lsn(1),
         };
         sk.process_msg(&ProposerAcceptorMessage::Elected(pem))
             .await
             .unwrap();
 
         // check that AppendRequest before term_start_lsn doesn't switch last_log_term.
-        let resp = sk
-            .process_msg(&ProposerAcceptorMessage::AppendRequest(append_request))
-            .await;
-        assert!(resp.is_ok());
-        assert_eq!(sk.get_last_log_term(), 0);
+        sk.process_msg(&ProposerAcceptorMessage::AppendRequest(append_request))
+            .await
+            .unwrap();
+        assert_eq!(sk.get_last_log_term(), 1);
 
         // but record at term_start_lsn does the switch
         ar_hdr.begin_lsn = Lsn(2);
@@ -1116,12 +1145,63 @@ mod tests {
             h: ar_hdr,
             wal_data: Bytes::from_static(b"b"),
         };
-        let resp = sk
-            .process_msg(&ProposerAcceptorMessage::AppendRequest(append_request))
-            .await;
-        assert!(resp.is_ok());
-        sk.wal_store.truncate_wal(Lsn(3)).await.unwrap(); // imitate the complete record at 3 %)
-        assert_eq!(sk.get_last_log_term(), 1);
+        sk.process_msg(&ProposerAcceptorMessage::AppendRequest(append_request))
+            .await
+            .unwrap();
+        assert_eq!(sk.get_last_log_term(), 2);
+    }
+
+    #[tokio::test]
+    async fn test_non_consecutive_write() {
+        let storage = InMemoryState {
+            persisted_state: test_sk_state(),
+        };
+        let wal_store = DummyWalStore { lsn: Lsn(0) };
+
+        let mut sk = SafeKeeper::new(TimelineState::new(storage), wal_store, NodeId(0)).unwrap();
+        // Deliver ProposerElected for term 1, streaming from LSN 1.
+        let pem = ProposerElected {
+            term: 1,
+            start_streaming_at: Lsn(1),
+            term_history: TermHistory(vec![TermLsn {
+                term: 1,
+                lsn: Lsn(1),
+            }]),
+            timeline_start_lsn: Lsn(1),
+        };
+        sk.process_msg(&ProposerAcceptorMessage::Elected(pem))
+            .await
+            .unwrap();
+
+        let ar_hdr = AppendRequestHeader {
+            term: 1,
+            term_start_lsn: Lsn(3),
+            begin_lsn: Lsn(1),
+            end_lsn: Lsn(2),
+            commit_lsn: Lsn(0),
+            truncate_lsn: Lsn(0),
+            proposer_uuid: [0; 16],
+        };
+        let append_request = AppendRequest {
+            h: ar_hdr.clone(),
+            wal_data: Bytes::from_static(b"b"),
+        };
+
+        // do write ending at 2, it should be ok
+        sk.process_msg(&ProposerAcceptorMessage::AppendRequest(append_request))
+            .await
+            .unwrap();
+        let mut ar_hdr2 = ar_hdr.clone();
+        ar_hdr2.begin_lsn = Lsn(4);
+        ar_hdr2.end_lsn = Lsn(5);
+        let append_request = AppendRequest {
+            h: ar_hdr2,
+            wal_data: Bytes::from_static(b"b"),
+        };
+        // and now starting at 4 (leaving a gap after the previous write), it must fail
+        sk.process_msg(&ProposerAcceptorMessage::AppendRequest(append_request))
+            .await
+            .unwrap_err();
     }
 
     #[test]
diff --git a/safekeeper/src/wal_storage.rs b/safekeeper/src/wal_storage.rs
index 6fd7c91a68..89c2e98a94 100644
--- a/safekeeper/src/wal_storage.rs
+++ b/safekeeper/src/wal_storage.rs
@@ -37,6 +37,8 @@ use pq_proto::SystemId;
 use utils::{id::TenantTimelineId, lsn::Lsn};
 
 pub trait Storage {
+    /// Last written LSN.
+    fn write_lsn(&self) -> Lsn;
     /// LSN of last durably stored WAL record.
     fn flush_lsn(&self) -> Lsn;
 
@@ -329,6 +331,10 @@ impl PhysicalStorage {
 }
 
 impl Storage for PhysicalStorage {
+    // Last written LSN.
+    fn write_lsn(&self) -> Lsn {
+        self.write_lsn
+    }
     /// flush_lsn returns LSN of last durably stored WAL record.
     fn flush_lsn(&self) -> Lsn {
         self.flush_record_lsn
diff --git a/safekeeper/tests/walproposer_sim/safekeeper_disk.rs b/safekeeper/tests/walproposer_sim/safekeeper_disk.rs
index 6b31edb1f2..b854754ecf 100644
--- a/safekeeper/tests/walproposer_sim/safekeeper_disk.rs
+++ b/safekeeper/tests/walproposer_sim/safekeeper_disk.rs
@@ -175,6 +175,10 @@ impl DiskWALStorage {
 }
 
 impl wal_storage::Storage for DiskWALStorage {
+    // Last written LSN.
+    fn write_lsn(&self) -> Lsn {
+        self.write_lsn
+    }
     /// LSN of last durably stored WAL record.
     fn flush_lsn(&self) -> Lsn {
         self.flush_record_lsn

From c4fe6641c1695b1d7c450358af2cec6018fb2359 Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Tue, 3 Sep 2024 18:16:49 +0100
Subject: [PATCH 002/142] pageserver: separate metadata and data pages in
 DatadirModification (#8621)

## Problem

Currently, DatadirModification keeps a key-indexed map of all pending
writes, even though we (almost) never need to read back dirty pages for
anything other than metadata pages (e.g. relation sizes).

Related: https://github.com/neondatabase/neon/issues/6345

## Summary of changes

- commit() modifications before ingesting database creation wal records,
so that they are guaranteed to be able to get() everything they need
directly from the underlying Timeline.
- Split dirty pages in DatadirModification into pending_metadata_pages
and pending_data_pages. The data ones don't need to be in a
key-addressable format, so they just go in a Vec instead.
- Special case handling of zero-page writes in DatadirModification,
putting them in a map which is flushed on the end of a WAL record. This
handles the case where during ingest, we might first write a zero page,
and then ingest a postgres write to that page. We used to do this via
the key-indexed map of writes, but in this PR we change the data page
write path to not bother indexing these by key.

My least favorite thing about this PR is that I needed to change the
DatadirModification interface to add the on_record_end call. This is not
very invasive because there's really only one place we use it, but it
changes the object's behaviour from being clearly an aggregation of many
records to having some per-record state. I could avoid this by
implicitly doing the work when someone calls set_lsn or commit -- I'm
open to opinions on whether that's cleaner or dirtier.

## Performance

There may be some efficiency improvement here, but the primary
motivation is to enable an earlier stage of ingest to operate without
access to a Timeline. The `pending_data_pages` part is the "fast path"
bulk write data that can in principle be generated without a Timeline,
in parallel with other ingest batches, and ultimately on the safekeeper.

`test_bulk_insert` on AX102 shows approximately the same results as in
the previous PR #8591:

```
------------------------------ Benchmark results -------------------------------
test_bulk_insert[neon-release-pg16].insert: 23.577 s
test_bulk_insert[neon-release-pg16].pageserver_writes: 5,428 MB
test_bulk_insert[neon-release-pg16].peak_mem: 637 MB
test_bulk_insert[neon-release-pg16].size: 0 MB
test_bulk_insert[neon-release-pg16].data_uploaded: 1,922 MB
test_bulk_insert[neon-release-pg16].num_files_uploaded: 8
test_bulk_insert[neon-release-pg16].wal_written: 1,382 MB
test_bulk_insert[neon-release-pg16].wal_recovery: 18.264 s
test_bulk_insert[neon-release-pg16].compaction: 0.052 s
```
---
 pageserver/src/import_datadir.rs              |  12 +-
 pageserver/src/pgdatadir_mapping.rs           | 228 +++++++++++++-----
 .../tenant/storage_layer/inmemory_layer.rs    |   9 +-
 .../walreceiver/walreceiver_connection.rs     |  64 ++++-
 pageserver/src/walingest.rs                   |  42 +++-
 pageserver/src/walrecord.rs                   |  24 ++
 6 files changed, 281 insertions(+), 98 deletions(-)

diff --git a/pageserver/src/import_datadir.rs b/pageserver/src/import_datadir.rs
index ed409d3130..5a0894cd1b 100644
--- a/pageserver/src/import_datadir.rs
+++ b/pageserver/src/import_datadir.rs
@@ -19,6 +19,7 @@ use crate::metrics::WAL_INGEST;
 use crate::pgdatadir_mapping::*;
 use crate::tenant::Timeline;
 use crate::walingest::WalIngest;
+use crate::walrecord::decode_wal_record;
 use crate::walrecord::DecodedWALRecord;
 use pageserver_api::reltag::{RelTag, SlruKind};
 use postgres_ffi::pg_constants;
@@ -310,11 +311,13 @@ async fn import_wal(
 
         let mut nrecords = 0;
         let mut modification = tline.begin_modification(last_lsn);
-        let mut decoded = DecodedWALRecord::default();
         while last_lsn <= endpoint {
             if let Some((lsn, recdata)) = waldecoder.poll_decode()? {
+                let mut decoded = DecodedWALRecord::default();
+                decode_wal_record(recdata, &mut decoded, tline.pg_version)?;
+
                 walingest
-                    .ingest_record(recdata, lsn, &mut modification, &mut decoded, ctx)
+                    .ingest_record(decoded, lsn, &mut modification, ctx)
                     .await?;
                 WAL_INGEST.records_committed.inc();
 
@@ -449,11 +452,12 @@ pub async fn import_wal_from_tar(
         waldecoder.feed_bytes(&bytes[offset..]);
 
         let mut modification = tline.begin_modification(last_lsn);
-        let mut decoded = DecodedWALRecord::default();
         while last_lsn <= end_lsn {
             if let Some((lsn, recdata)) = waldecoder.poll_decode()? {
+                let mut decoded = DecodedWALRecord::default();
+                decode_wal_record(recdata, &mut decoded, tline.pg_version)?;
                 walingest
-                    .ingest_record(recdata, lsn, &mut modification, &mut decoded, ctx)
+                    .ingest_record(decoded, lsn, &mut modification, ctx)
                     .await?;
                 modification.commit(ctx).await?;
                 last_lsn = lsn;
diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs
index edcbac970b..c26abca1f7 100644
--- a/pageserver/src/pgdatadir_mapping.rs
+++ b/pageserver/src/pgdatadir_mapping.rs
@@ -12,7 +12,7 @@ use crate::keyspace::{KeySpace, KeySpaceAccum};
 use crate::span::debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id;
 use crate::walrecord::NeonWalRecord;
 use crate::{aux_file, repository::*};
-use anyhow::{bail, ensure, Context};
+use anyhow::{ensure, Context};
 use bytes::{Buf, Bytes, BytesMut};
 use enum_map::Enum;
 use pageserver_api::key::{
@@ -168,7 +168,9 @@ impl Timeline {
         DatadirModification {
             tline: self,
             pending_lsns: Vec::new(),
-            pending_updates: HashMap::new(),
+            pending_metadata_pages: HashMap::new(),
+            pending_data_pages: Vec::new(),
+            pending_zero_data_pages: Default::default(),
             pending_deletions: Vec::new(),
             pending_nblocks: 0,
             pending_directory_entries: Vec::new(),
@@ -1031,10 +1033,24 @@ pub struct DatadirModification<'a> {
     // The put-functions add the modifications here, and they are flushed to the
     // underlying key-value store by the 'finish' function.
     pending_lsns: Vec<Lsn>,
-    pending_updates: HashMap<Key, Vec<(Lsn, usize, Value)>>,
     pending_deletions: Vec<(Range<Key>, Lsn)>,
     pending_nblocks: i64,
 
+    /// Metadata writes, indexed by key so that they can be read from not-yet-committed modifications
+    /// while ingesting subsequent records. See [`Self::is_data_key`] for the definition of 'metadata'.
+    pending_metadata_pages: HashMap<CompactKey, Vec<(Lsn, usize, Value)>>,
+
+    /// Data writes, ready to be flushed into an ephemeral layer. See [`Self::is_data_key`] for
+    /// which keys are stored here.
+    pending_data_pages: Vec<(CompactKey, Lsn, usize, Value)>,
+
+    // Sometimes during ingest, for example when extending a relation, we would like to write a zero page.  However,
+    // if we encounter a write from postgres in the same wal record, we will drop this entry.
+    //
+    // Unlike other 'pending' fields, this does not last until the next call to commit(): it is flushed
+    // at the end of each wal record, and all these writes implicitly are at lsn Self::lsn
+    pending_zero_data_pages: HashSet<CompactKey>,
+
     /// For special "directory" keys that store key-value maps, track the size of the map
     /// if it was updated in this modification.
     pending_directory_entries: Vec<(DirectoryKind, usize)>,
@@ -1058,6 +1074,10 @@ impl<'a> DatadirModification<'a> {
         self.pending_bytes
     }
 
+    pub(crate) fn has_dirty_data_pages(&self) -> bool {
+        (!self.pending_data_pages.is_empty()) || (!self.pending_zero_data_pages.is_empty())
+    }
+
     /// Set the current lsn
     pub(crate) fn set_lsn(&mut self, lsn: Lsn) -> anyhow::Result<()> {
         ensure!(
@@ -1066,6 +1086,10 @@ impl<'a> DatadirModification<'a> {
             lsn,
             self.lsn
         );
+
+        // If we are advancing LSN, then state from previous wal record should have been flushed.
+        assert!(self.pending_zero_data_pages.is_empty());
+
         if lsn > self.lsn {
             self.pending_lsns.push(self.lsn);
             self.lsn = lsn;
@@ -1073,6 +1097,17 @@ impl<'a> DatadirModification<'a> {
         Ok(())
     }
 
+    /// In this context, 'metadata' means keys that are only read by the pageserver internally, and 'data' means
+    /// keys that represent literal blocks that postgres can read.  So data includes relation blocks and
+    /// SLRU blocks, which are read directly by postgres, and everything else is considered metadata.
+    ///
+    /// The distinction is important because data keys are handled on a fast path where dirty writes are
+    /// not readable until this modification is committed, whereas metadata keys are visible for read
+    /// via [`Self::get`] as soon as their record has been ingested.
+    fn is_data_key(key: &Key) -> bool {
+        key.is_rel_block_key() || key.is_slru_block_key()
+    }
+
     /// Initialize a completely new repository.
     ///
     /// This inserts the directory metadata entries that are assumed to
@@ -1180,6 +1215,31 @@ impl<'a> DatadirModification<'a> {
         Ok(())
     }
 
+    pub(crate) fn put_rel_page_image_zero(&mut self, rel: RelTag, blknum: BlockNumber) {
+        self.pending_zero_data_pages
+            .insert(rel_block_to_key(rel, blknum).to_compact());
+        self.pending_bytes += ZERO_PAGE.len();
+    }
+
+    pub(crate) fn put_slru_page_image_zero(
+        &mut self,
+        kind: SlruKind,
+        segno: u32,
+        blknum: BlockNumber,
+    ) {
+        self.pending_zero_data_pages
+            .insert(slru_block_to_key(kind, segno, blknum).to_compact());
+        self.pending_bytes += ZERO_PAGE.len();
+    }
+
+    /// Call this at the end of each WAL record.
+    pub(crate) fn on_record_end(&mut self) {
+        let pending_zero_data_pages = std::mem::take(&mut self.pending_zero_data_pages);
+        for key in pending_zero_data_pages {
+            self.put_data(key, Value::Image(ZERO_PAGE.clone()));
+        }
+    }
+
     /// Store a relmapper file (pg_filenode.map) in the repository
     pub async fn put_relmap_file(
         &mut self,
@@ -1778,7 +1838,7 @@ impl<'a> DatadirModification<'a> {
     /// retains all the metadata, but data pages are flushed. That's again OK
     /// for bulk import, where you are just loading data pages and won't try to
     /// modify the same pages twice.
-    pub async fn flush(&mut self, ctx: &RequestContext) -> anyhow::Result<()> {
+    pub(crate) async fn flush(&mut self, ctx: &RequestContext) -> anyhow::Result<()> {
         // Unless we have accumulated a decent amount of changes, it's not worth it
         // to scan through the pending_updates list.
         let pending_nblocks = self.pending_nblocks;
@@ -1789,31 +1849,11 @@ impl<'a> DatadirModification<'a> {
         let mut writer = self.tline.writer().await;
 
         // Flush relation and  SLRU data blocks, keep metadata.
-        let mut retained_pending_updates = HashMap::<_, Vec<_>>::new();
-        for (key, values) in self.pending_updates.drain() {
-            if !key.is_valid_key_on_write_path() {
-                bail!(
-                    "the request contains data not supported by pageserver at TimelineWriter::put: {}", key
-                );
-            }
-            let mut write_batch = Vec::new();
-            for (lsn, value_ser_size, value) in values {
-                if key.is_rel_block_key() || key.is_slru_block_key() {
-                    // This bails out on first error without modifying pending_updates.
-                    // That's Ok, cf this function's doc comment.
-                    write_batch.push((key.to_compact(), lsn, value_ser_size, value));
-                } else {
-                    retained_pending_updates.entry(key).or_default().push((
-                        lsn,
-                        value_ser_size,
-                        value,
-                    ));
-                }
-            }
-            writer.put_batch(write_batch, ctx).await?;
-        }
+        let pending_data_pages = std::mem::take(&mut self.pending_data_pages);
 
-        self.pending_updates = retained_pending_updates;
+        // This bails out on first error; pending_data_pages has already been taken above.
+        // That's Ok, cf this function's doc comment.
+        writer.put_batch(pending_data_pages, ctx).await?;
         self.pending_bytes = 0;
 
         if pending_nblocks != 0 {
@@ -1834,29 +1874,31 @@ impl<'a> DatadirModification<'a> {
     /// All the modifications in this atomic update are stamped by the specified LSN.
     ///
     pub async fn commit(&mut self, ctx: &RequestContext) -> anyhow::Result<()> {
+        // Commit should never be called mid-wal-record
+        assert!(self.pending_zero_data_pages.is_empty());
+
         let mut writer = self.tline.writer().await;
 
         let pending_nblocks = self.pending_nblocks;
         self.pending_nblocks = 0;
 
-        if !self.pending_updates.is_empty() {
-            // Ordering: the items in this batch do not need to be in any global order, but values for
-            // a particular Key must be in Lsn order relative to one another.  InMemoryLayer relies on
-            // this to do efficient updates to its index.
-            let batch: Vec<(CompactKey, Lsn, usize, Value)> = self
-                .pending_updates
+        // Ordering: the items in this batch do not need to be in any global order, but values for
+        // a particular Key must be in Lsn order relative to one another.  InMemoryLayer relies on
+        // this to do efficient updates to its index.
+        let mut write_batch = std::mem::take(&mut self.pending_data_pages);
+
+        write_batch.extend(
+            self.pending_metadata_pages
                 .drain()
                 .flat_map(|(key, values)| {
-                    values.into_iter().map(move |(lsn, val_ser_size, value)| {
-                        if !key.is_valid_key_on_write_path() {
-                            bail!("the request contains data not supported by pageserver at TimelineWriter::put: {}", key);
-                        }
-                        Ok((key.to_compact(), lsn, val_ser_size, value))
-                    })
-                })
-                .collect::<anyhow::Result<Vec<_>>>()?;
+                    values
+                        .into_iter()
+                        .map(move |(lsn, value_size, value)| (key, lsn, value_size, value))
+                }),
+        );
 
-            writer.put_batch(batch, ctx).await?;
+        if !write_batch.is_empty() {
+            writer.put_batch(write_batch, ctx).await?;
         }
 
         if !self.pending_deletions.is_empty() {
@@ -1887,33 +1929,58 @@ impl<'a> DatadirModification<'a> {
     }
 
     pub(crate) fn len(&self) -> usize {
-        self.pending_updates.len() + self.pending_deletions.len()
+        self.pending_metadata_pages.len()
+            + self.pending_data_pages.len()
+            + self.pending_deletions.len()
     }
 
-    // Internal helper functions to batch the modifications
-
+    /// Read a page from the Timeline we are writing to.  For metadata pages, this passes through
+    /// a cache in Self, which makes writes earlier in this modification visible to WAL records later
+    /// in the modification.
+    ///
+    /// For data pages, reads pass directly to the owning Timeline: any ingest code which reads a data
+    /// page must ensure that the pages they read are already committed in Timeline, for example
+    /// DB create operations are always preceded by a call to commit().  This is special cased because
+    /// it's rare: all the 'normal' WAL operations will only read metadata pages such as relation sizes,
+    /// and not data pages.
     async fn get(&self, key: Key, ctx: &RequestContext) -> Result<Bytes, PageReconstructError> {
-        // Have we already updated the same key? Read the latest pending updated
-        // version in that case.
-        //
-        // Note: we don't check pending_deletions. It is an error to request a
-        // value that has been removed, deletion only avoids leaking storage.
-        if let Some(values) = self.pending_updates.get(&key) {
-            if let Some((_, _, value)) = values.last() {
-                return if let Value::Image(img) = value {
-                    Ok(img.clone())
-                } else {
-                    // Currently, we never need to read back a WAL record that we
-                    // inserted in the same "transaction". All the metadata updates
-                    // work directly with Images, and we never need to read actual
-                    // data pages. We could handle this if we had to, by calling
-                    // the walredo manager, but let's keep it simple for now.
-                    Err(PageReconstructError::Other(anyhow::anyhow!(
-                        "unexpected pending WAL record"
-                    )))
-                };
+        if !Self::is_data_key(&key) {
+            // Have we already updated the same key? Read the latest pending updated
+            // version in that case.
+            //
+            // Note: we don't check pending_deletions. It is an error to request a
+            // value that has been removed, deletion only avoids leaking storage.
+            if let Some(values) = self.pending_metadata_pages.get(&key.to_compact()) {
+                if let Some((_, _, value)) = values.last() {
+                    return if let Value::Image(img) = value {
+                        Ok(img.clone())
+                    } else {
+                        // Currently, we never need to read back a WAL record that we
+                        // inserted in the same "transaction". All the metadata updates
+                        // work directly with Images, and we never need to read actual
+                        // data pages. We could handle this if we had to, by calling
+                        // the walredo manager, but let's keep it simple for now.
+                        Err(PageReconstructError::Other(anyhow::anyhow!(
+                            "unexpected pending WAL record"
+                        )))
+                    };
+                }
+            }
+        } else {
+            // This is an expensive check, so we only do it in debug mode. If reading a data key,
+            // this key should never be present in pending_data_pages. We ensure this by committing
+            // modifications before ingesting DB create operations, which are the only kind that reads
+            // data pages during ingest.
+            if cfg!(debug_assertions) {
+                for (dirty_key, _, _, _) in &self.pending_data_pages {
+                    debug_assert!(&key.to_compact() != dirty_key);
+                }
+
+                debug_assert!(!self.pending_zero_data_pages.contains(&key.to_compact()))
             }
         }
+
+        // Metadata page cache miss, or we're reading a data page.
         let lsn = Lsn::max(self.tline.get_last_record_lsn(), self.lsn);
         self.tline.get(key, lsn, ctx).await
     }
@@ -1925,11 +1992,40 @@ impl<'a> DatadirModification<'a> {
     }
 
     fn put(&mut self, key: Key, val: Value) {
-        let values = self.pending_updates.entry(key).or_default();
+        if Self::is_data_key(&key) {
+            self.put_data(key.to_compact(), val)
+        } else {
+            self.put_metadata(key.to_compact(), val)
+        }
+    }
+
+    fn put_data(&mut self, key: CompactKey, val: Value) {
+        let val_serialized_size = val.serialized_size().unwrap() as usize;
+
+        // If this page was previously zero'd in the same WalRecord, then drop the previous zero page write.  This
+        // is an optimization that avoids persisting both the zero page generated by us (e.g. during a relation extend),
+        // and the subsequent postgres-originating write
+        if self.pending_zero_data_pages.remove(&key) {
+            self.pending_bytes -= ZERO_PAGE.len();
+        }
+
+        self.pending_bytes += val_serialized_size;
+        self.pending_data_pages
+            .push((key, self.lsn, val_serialized_size, val))
+    }
+
+    fn put_metadata(&mut self, key: CompactKey, val: Value) {
+        let values = self.pending_metadata_pages.entry(key).or_default();
         // Replace the previous value if it exists at the same lsn
         if let Some((last_lsn, last_value_ser_size, last_value)) = values.last_mut() {
             if *last_lsn == self.lsn {
+                // Update the pending_bytes contribution from this entry, and update the serialized size in place
+                self.pending_bytes -= *last_value_ser_size;
                 *last_value_ser_size = val.serialized_size().unwrap() as usize;
+                self.pending_bytes += *last_value_ser_size;
+
+                // Use the latest value, this replaces any earlier write to the same (key,lsn), such as might
+                // have been generated by synthesized zero page writes prior to the first real write to a page.
                 *last_value = val;
                 return;
             }
diff --git a/pageserver/src/tenant/storage_layer/inmemory_layer.rs b/pageserver/src/tenant/storage_layer/inmemory_layer.rs
index f31ab4b1e8..2c19e5b19f 100644
--- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs
+++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs
@@ -692,8 +692,13 @@ impl InMemoryLayer {
             let vec_map = inner.index.entry(key).or_default();
             let old = vec_map.append_or_update_last(lsn, index_entry).unwrap().0;
             if old.is_some() {
-                // We already had an entry for this LSN. That's odd..
-                warn!("Key {} at {} already exists", key, lsn);
+                // This should not break anything, but is unexpected: ingestion code aims to filter out
+                // multiple writes to the same key at the same LSN.  This happens in cases where our
+                // ingestion code generates some write like an empty page, and we see a write from postgres
+                // to the same key in the same wal record.  If one such write makes it through, we
+                // index the most recent write, implicitly ignoring the earlier write.  We log a warning
+                // because this case is unexpected, and we would like tests to fail if this happens.
+                warn!("Key {} at {} written twice at same LSN", key, lsn);
             }
         }
 
diff --git a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs
index 0114473eda..cee259e2e0 100644
--- a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs
+++ b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs
@@ -31,7 +31,7 @@ use crate::{
     task_mgr::{TaskKind, WALRECEIVER_RUNTIME},
     tenant::{debug_assert_current_span_has_tenant_and_timeline_id, Timeline, WalReceiverInfo},
     walingest::WalIngest,
-    walrecord::DecodedWALRecord,
+    walrecord::{decode_wal_record, DecodedWALRecord},
 };
 use postgres_backend::is_expected_io_error;
 use postgres_connection::PgConnectionConfig;
@@ -312,10 +312,25 @@ pub(super) async fn handle_walreceiver_connection(
                 waldecoder.feed_bytes(data);
 
                 {
-                    let mut decoded = DecodedWALRecord::default();
                     let mut modification = timeline.begin_modification(startlsn);
                     let mut uncommitted_records = 0;
                     let mut filtered_records = 0;
+
+                    async fn commit(
+                        modification: &mut DatadirModification<'_>,
+                        uncommitted: &mut u64,
+                        filtered: &mut u64,
+                        ctx: &RequestContext,
+                    ) -> anyhow::Result<()> {
+                        WAL_INGEST
+                            .records_committed
+                            .inc_by(*uncommitted - *filtered);
+                        modification.commit(ctx).await?;
+                        *uncommitted = 0;
+                        *filtered = 0;
+                        Ok(())
+                    }
+
                     while let Some((lsn, recdata)) = waldecoder.poll_decode()? {
                         // It is important to deal with the aligned records as lsn in getPage@LSN is
                         // aligned and can be several bytes bigger. Without this alignment we are
@@ -324,9 +339,28 @@ pub(super) async fn handle_walreceiver_connection(
                             return Err(WalReceiverError::Other(anyhow!("LSN not aligned")));
                         }
 
+                        // Deserialize WAL record
+                        let mut decoded = DecodedWALRecord::default();
+                        decode_wal_record(recdata, &mut decoded, modification.tline.pg_version)?;
+
+                        if decoded.is_dbase_create_copy(timeline.pg_version)
+                            && uncommitted_records > 0
+                        {
+                            // Special case: legacy PG database creations operate by reading pages from a 'template' database:
+                            // these are the only kinds of WAL record that require reading data blocks while ingesting.  Ensure
+                            // all earlier writes of data blocks are visible by committing any modification in flight.
+                            commit(
+                                &mut modification,
+                                &mut uncommitted_records,
+                                &mut filtered_records,
+                                &ctx,
+                            )
+                            .await?;
+                        }
+
                         // Ingest the records without immediately committing them.
                         let ingested = walingest
-                            .ingest_record(recdata, lsn, &mut modification, &mut decoded, &ctx)
+                            .ingest_record(decoded, lsn, &mut modification, &ctx)
                             .await
                             .with_context(|| format!("could not ingest record at {lsn}"))?;
                         if !ingested {
@@ -349,21 +383,25 @@ pub(super) async fn handle_walreceiver_connection(
                             || modification.approx_pending_bytes()
                                 > DatadirModification::MAX_PENDING_BYTES
                         {
-                            WAL_INGEST
-                                .records_committed
-                                .inc_by(uncommitted_records - filtered_records);
-                            modification.commit(&ctx).await?;
-                            uncommitted_records = 0;
-                            filtered_records = 0;
+                            commit(
+                                &mut modification,
+                                &mut uncommitted_records,
+                                &mut filtered_records,
+                                &ctx,
+                            )
+                            .await?;
                         }
                     }
 
                     // Commit the remaining records.
                     if uncommitted_records > 0 {
-                        WAL_INGEST
-                            .records_committed
-                            .inc_by(uncommitted_records - filtered_records);
-                        modification.commit(&ctx).await?;
+                        commit(
+                            &mut modification,
+                            &mut uncommitted_records,
+                            &mut filtered_records,
+                            &ctx,
+                        )
+                        .await?;
                     }
                 }
 
diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs
index 8ccd20adb1..2d3841881b 100644
--- a/pageserver/src/walingest.rs
+++ b/pageserver/src/walingest.rs
@@ -57,6 +57,7 @@ use utils::lsn::Lsn;
 
 pub struct WalIngest {
     shard: ShardIdentity,
+    pg_version: u32,
     checkpoint: CheckPoint,
     checkpoint_modified: bool,
     warn_ingest_lag: WarnIngestLag,
@@ -82,6 +83,7 @@ impl WalIngest {
 
         Ok(WalIngest {
             shard: *timeline.get_shard_identity(),
+            pg_version: timeline.pg_version,
             checkpoint,
             checkpoint_modified: false,
             warn_ingest_lag: WarnIngestLag {
@@ -104,10 +106,9 @@ impl WalIngest {
     ///
     pub async fn ingest_record(
         &mut self,
-        recdata: Bytes,
+        decoded: DecodedWALRecord,
         lsn: Lsn,
         modification: &mut DatadirModification<'_>,
-        decoded: &mut DecodedWALRecord,
         ctx: &RequestContext,
     ) -> anyhow::Result<bool> {
         WAL_INGEST.records_received.inc();
@@ -115,7 +116,12 @@ impl WalIngest {
         let prev_len = modification.len();
 
         modification.set_lsn(lsn)?;
-        decode_wal_record(recdata, decoded, pg_version)?;
+
+        if decoded.is_dbase_create_copy(self.pg_version) {
+            // Records of this type should always be preceded by a commit(), as they
+            // rely on reading data pages back from the Timeline.
+            assert!(!modification.has_dirty_data_pages());
+        }
 
         let mut buf = decoded.record.clone();
         buf.advance(decoded.main_data_offset);
@@ -133,11 +139,11 @@ impl WalIngest {
             pg_constants::RM_HEAP_ID | pg_constants::RM_HEAP2_ID => {
                 // Heap AM records need some special handling, because they modify VM pages
                 // without registering them with the standard mechanism.
-                self.ingest_heapam_record(&mut buf, modification, decoded, ctx)
+                self.ingest_heapam_record(&mut buf, modification, &decoded, ctx)
                     .await?;
             }
             pg_constants::RM_NEON_ID => {
-                self.ingest_neonrmgr_record(&mut buf, modification, decoded, ctx)
+                self.ingest_neonrmgr_record(&mut buf, modification, &decoded, ctx)
                     .await?;
             }
             // Handle other special record types
@@ -325,7 +331,7 @@ impl WalIngest {
             }
             pg_constants::RM_RELMAP_ID => {
                 let xlrec = XlRelmapUpdate::decode(&mut buf);
-                self.ingest_relmap_page(modification, &xlrec, decoded, ctx)
+                self.ingest_relmap_page(modification, &xlrec, &decoded, ctx)
                     .await?;
             }
             pg_constants::RM_XLOG_ID => {
@@ -470,7 +476,7 @@ impl WalIngest {
 
                 continue;
             }
-            self.ingest_decoded_block(modification, lsn, decoded, blk, ctx)
+            self.ingest_decoded_block(modification, lsn, &decoded, blk, ctx)
                 .await?;
         }
 
@@ -486,6 +492,8 @@ impl WalIngest {
         // until commit() is called to flush the data into the repository and update
         // the latest LSN.
 
+        modification.on_record_end();
+
         Ok(modification.len() > prev_len)
     }
 
@@ -557,6 +565,7 @@ impl WalIngest {
                 page_set_lsn(&mut image, lsn)
             }
             assert_eq!(image.len(), BLCKSZ as usize);
+
             self.put_rel_page_image(modification, rel, blk.blkno, image.freeze(), ctx)
                 .await?;
         } else {
@@ -1195,7 +1204,7 @@ impl WalIngest {
             if rec.blkno % pg_constants::SLOTS_PER_FSM_PAGE != 0 {
                 // Tail of last remaining FSM page has to be zeroed.
                 // We are not precise here and instead of digging in FSM bitmap format just clear the whole page.
-                modification.put_rel_page_image(rel, fsm_physical_page_no, ZERO_PAGE.clone())?;
+                modification.put_rel_page_image_zero(rel, fsm_physical_page_no);
                 fsm_physical_page_no += 1;
             }
             let nblocks = get_relsize(modification, rel, ctx).await?;
@@ -1217,7 +1226,7 @@ impl WalIngest {
             if rec.blkno % pg_constants::VM_HEAPBLOCKS_PER_PAGE != 0 {
                 // Tail of last remaining vm page has to be zeroed.
                 // We are not precise here and instead of digging in VM bitmap format just clear the whole page.
-                modification.put_rel_page_image(rel, vm_page_no, ZERO_PAGE.clone())?;
+                modification.put_rel_page_image_zero(rel, vm_page_no);
                 vm_page_no += 1;
             }
             let nblocks = get_relsize(modification, rel, ctx).await?;
@@ -1687,7 +1696,7 @@ impl WalIngest {
                     continue;
                 }
 
-                modification.put_rel_page_image(rel, gap_blknum, ZERO_PAGE.clone())?;
+                modification.put_rel_page_image_zero(rel, gap_blknum);
             }
         }
         Ok(())
@@ -1753,7 +1762,7 @@ impl WalIngest {
 
             // fill the gap with zeros
             for gap_blknum in old_nblocks..blknum {
-                modification.put_slru_page_image(kind, segno, gap_blknum, ZERO_PAGE.clone())?;
+                modification.put_slru_page_image_zero(kind, segno, gap_blknum);
             }
         }
         Ok(())
@@ -1827,21 +1836,25 @@ mod tests {
         walingest
             .put_rel_page_image(&mut m, TESTREL_A, 0, test_img("foo blk 0 at 2"), &ctx)
             .await?;
+        m.on_record_end();
         m.commit(&ctx).await?;
         let mut m = tline.begin_modification(Lsn(0x30));
         walingest
             .put_rel_page_image(&mut m, TESTREL_A, 0, test_img("foo blk 0 at 3"), &ctx)
             .await?;
+        m.on_record_end();
         m.commit(&ctx).await?;
         let mut m = tline.begin_modification(Lsn(0x40));
         walingest
             .put_rel_page_image(&mut m, TESTREL_A, 1, test_img("foo blk 1 at 4"), &ctx)
             .await?;
+        m.on_record_end();
         m.commit(&ctx).await?;
         let mut m = tline.begin_modification(Lsn(0x50));
         walingest
             .put_rel_page_image(&mut m, TESTREL_A, 2, test_img("foo blk 2 at 5"), &ctx)
             .await?;
+        m.on_record_end();
         m.commit(&ctx).await?;
 
         assert_current_logical_size(&tline, Lsn(0x50));
@@ -1983,6 +1996,7 @@ mod tests {
         walingest
             .put_rel_page_image(&mut m, TESTREL_A, 1, test_img("foo blk 1"), &ctx)
             .await?;
+        m.on_record_end();
         m.commit(&ctx).await?;
         assert_eq!(
             tline
@@ -2008,6 +2022,7 @@ mod tests {
         walingest
             .put_rel_page_image(&mut m, TESTREL_A, 1500, test_img("foo blk 1500"), &ctx)
             .await?;
+        m.on_record_end();
         m.commit(&ctx).await?;
         assert_eq!(
             tline
@@ -2409,7 +2424,6 @@ mod tests {
             .await
             .unwrap();
         let mut modification = tline.begin_modification(startpoint);
-        let mut decoded = DecodedWALRecord::default();
         println!("decoding {} bytes", bytes.len() - xlogoff);
 
         // Decode and ingest wal. We process the wal in chunks because
@@ -2417,8 +2431,10 @@ mod tests {
         for chunk in bytes[xlogoff..].chunks(50) {
             decoder.feed_bytes(chunk);
             while let Some((lsn, recdata)) = decoder.poll_decode().unwrap() {
+                let mut decoded = DecodedWALRecord::default();
+                decode_wal_record(recdata, &mut decoded, modification.tline.pg_version).unwrap();
                 walingest
-                    .ingest_record(recdata, lsn, &mut modification, &mut decoded, &ctx)
+                    .ingest_record(decoded, lsn, &mut modification, &ctx)
                     .instrument(span.clone())
                     .await
                     .unwrap();
diff --git a/pageserver/src/walrecord.rs b/pageserver/src/walrecord.rs
index edddcefbe1..0c4d575de8 100644
--- a/pageserver/src/walrecord.rs
+++ b/pageserver/src/walrecord.rs
@@ -160,6 +160,30 @@ pub struct DecodedWALRecord {
     pub origin_id: u16,
 }
 
+impl DecodedWALRecord {
+    /// Check if this WAL record represents a legacy "copy" database creation, which populates new relations
+    /// by reading other existing relations' data blocks.  This is more complex to apply than new-style database
+    /// creations which simply include all the desired blocks in the WAL, so we need a helper function to detect this case.
+    pub(crate) fn is_dbase_create_copy(&self, pg_version: u32) -> bool {
+        if self.xl_rmid == pg_constants::RM_DBASE_ID {
+            let info = self.xl_info & pg_constants::XLR_RMGR_INFO_MASK;
+            match pg_version {
+                14 => {
+                    // Postgres 14 database creations are always the legacy kind
+                    info == postgres_ffi::v14::bindings::XLOG_DBASE_CREATE
+                }
+                15 => info == postgres_ffi::v15::bindings::XLOG_DBASE_CREATE_FILE_COPY,
+                16 => info == postgres_ffi::v16::bindings::XLOG_DBASE_CREATE_FILE_COPY,
+                _ => {
+                    panic!("Unsupported postgres version {pg_version}")
+                }
+            }
+        } else {
+            false
+        }
+    }
+}
+
 #[repr(C)]
 #[derive(Debug, Clone, Copy)]
 pub struct RelFileNode {

From 1a874a3e863ac613f52eb0bbfe5e8d83bcfaba55 Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Tue, 3 Sep 2024 17:31:42 +0000
Subject: [PATCH 003/142] build(deps): bump flask-cors from 4.0.1 to 5.0.0
 (#8899)

---
 poetry.lock | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/poetry.lock b/poetry.lock
index 7db91e51f7..b8ef08b02d 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -1110,13 +1110,13 @@ dotenv = ["python-dotenv"]
 
 [[package]]
 name = "flask-cors"
-version = "4.0.1"
+version = "5.0.0"
 description = "A Flask extension adding a decorator for CORS support"
 optional = false
 python-versions = "*"
 files = [
-    {file = "Flask_Cors-4.0.1-py2.py3-none-any.whl", hash = "sha256:f2a704e4458665580c074b714c4627dd5a306b333deb9074d0b1794dfa2fb677"},
-    {file = "flask_cors-4.0.1.tar.gz", hash = "sha256:eeb69b342142fdbf4766ad99357a7f3876a2ceb77689dc10ff912aac06c389e4"},
+    {file = "Flask_Cors-5.0.0-py2.py3-none-any.whl", hash = "sha256:b9e307d082a9261c100d8fb0ba909eec6a228ed1b60a8315fd85f783d61910bc"},
+    {file = "flask_cors-5.0.0.tar.gz", hash = "sha256:5aadb4b950c4e93745034594d9f3ea6591f734bb3662e16e255ffbf5e89c88ef"},
 ]
 
 [package.dependencies]

From 3d9001d83ff54e8bd6a297c3328408323c4e21ff Mon Sep 17 00:00:00 2001
From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com>
Date: Wed, 4 Sep 2024 02:05:06 +0800
Subject: [PATCH 004/142] fix(pageserver): is_archived should be optional
 (#8902)

Make the field optional; otherwise there will be decode errors when a
newer version of the storage controller receives JSON from an older
version of the pageserver.

Signed-off-by: Alex Chi Z <chi@neon.tech>
---
 libs/pageserver_api/src/models.rs | 7 ++++++-
 pageserver/src/http/routes.rs     | 2 +-
 2 files changed, 7 insertions(+), 2 deletions(-)

diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs
index 1d896863df..87e8f8305a 100644
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -716,12 +716,17 @@ pub struct TimelineInfo {
     pub pg_version: u32,
 
     pub state: TimelineState,
-    pub is_archived: bool,
 
     pub walreceiver_status: String,
 
+    // ALWAYS add new fields at the end of the struct with `Option` to ensure forward/backward compatibility.
+    // Backward compatibility: you will get a JSON not containing the newly-added field.
+    // Forward compatibility: a previous version of the pageserver will receive a JSON. serde::Deserialize does
+    // not deny unknown fields by default so it's safe to set the field to some value, though it won't be
+    // read.
     /// The last aux file policy being used on this timeline
     pub last_aux_file_policy: Option<AuxFilePolicy>,
+    pub is_archived: Option<bool>,
 }
 
 #[derive(Debug, Clone, Serialize, Deserialize)]
diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs
index 8cf2c99c09..90ae6c5557 100644
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -468,7 +468,7 @@ async fn build_timeline_info_common(
         pg_version: timeline.pg_version,
 
         state,
-        is_archived,
+        is_archived: Some(is_archived),
 
         walreceiver_status,
 

From ecfa3d9de9eec824800db55f5e9592fe0502c96e Mon Sep 17 00:00:00 2001
From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com>
Date: Wed, 4 Sep 2024 05:39:56 +0800
Subject: [PATCH 005/142] fix(storage-scrubber): wrong trial condition (#8905)

ref https://github.com/neondatabase/neon/issues/8872

## Summary of changes

We saw stuck storage scrubber in staging caused by infinite retries. I
believe here we should use `min` instead of `max` to avoid getting
minutes or hours of retry backoff.

Signed-off-by: Alex Chi Z <chi@neon.tech>
---
 storage_scrubber/src/lib.rs | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/storage_scrubber/src/lib.rs b/storage_scrubber/src/lib.rs
index 3c21d2f8cf..3f08cddf50 100644
--- a/storage_scrubber/src/lib.rs
+++ b/storage_scrubber/src/lib.rs
@@ -422,7 +422,7 @@ fn stream_objects_with_retries<'a>(
                     let yield_err = if err.is_permanent() {
                         true
                     } else {
-                        let backoff_time = 1 << trial.max(5);
+                        let backoff_time = 1 << trial.min(5);
                         tokio::time::sleep(Duration::from_secs(backoff_time)).await;
                         trial += 1;
                         trial == MAX_RETRIES - 1
@@ -473,7 +473,7 @@ async fn list_objects_with_retries(
                     s3_target.delimiter,
                     DisplayErrorContext(e),
                 );
-                let backoff_time = 1 << trial.max(5);
+                let backoff_time = 1 << trial.min(5);
                 tokio::time::sleep(Duration::from_secs(backoff_time)).await;
             }
         }
@@ -492,7 +492,7 @@ async fn download_object_with_retries(
             Ok(response) => response,
             Err(e) => {
                 error!("Failed to download object for key {key}: {e}");
-                let backoff_time = 1 << trial.max(5);
+                let backoff_time = 1 << trial.min(5);
                 tokio::time::sleep(Duration::from_secs(backoff_time)).await;
                 continue;
             }
@@ -508,7 +508,7 @@ async fn download_object_with_retries(
             }
             Err(e) => {
                 error!("Failed to stream object body for key {key}: {e}");
-                let backoff_time = 1 << trial.max(5);
+                let backoff_time = 1 << trial.min(5);
                 tokio::time::sleep(Duration::from_secs(backoff_time)).await;
             }
         }

From 75310fe441b87d399213e365f1364aa9f08aa40d Mon Sep 17 00:00:00 2001
From: Vlad Lazar <vlad@neon.tech>
Date: Wed, 4 Sep 2024 10:09:41 +0100
Subject: [PATCH 006/142] storcon: make hb interval an argument and speed up
 tests (#8880)

## Problem
Each test might wait for up to 5s in order to HB the pageserver.

## Summary of changes
Make the heartbeat interval configurable and use a really tight one for
neon local => startup quicker
---
 control_plane/src/local_env.rs          |  7 +++++++
 control_plane/src/storage_controller.rs |  2 ++
 storage_controller/src/main.rs          | 12 ++++++++++--
 storage_controller/src/service.rs       |  9 ++++++---
 test_runner/regress/test_tenants.py     |  4 +++-
 5 files changed, 28 insertions(+), 6 deletions(-)

diff --git a/control_plane/src/local_env.rs b/control_plane/src/local_env.rs
index 74caba2b56..5dbc3bcbbc 100644
--- a/control_plane/src/local_env.rs
+++ b/control_plane/src/local_env.rs
@@ -165,6 +165,9 @@ pub struct NeonStorageControllerConf {
     pub split_threshold: Option<u64>,
 
     pub max_secondary_lag_bytes: Option<u64>,
+
+    #[serde(with = "humantime_serde")]
+    pub heartbeat_interval: Duration,
 }
 
 impl NeonStorageControllerConf {
@@ -172,6 +175,9 @@ impl NeonStorageControllerConf {
     const DEFAULT_MAX_OFFLINE_INTERVAL: std::time::Duration = std::time::Duration::from_secs(10);
 
     const DEFAULT_MAX_WARMING_UP_INTERVAL: std::time::Duration = std::time::Duration::from_secs(30);
+
+    // Very tight heartbeat interval to speed up tests
+    const DEFAULT_HEARTBEAT_INTERVAL: std::time::Duration = std::time::Duration::from_millis(100);
 }
 
 impl Default for NeonStorageControllerConf {
@@ -183,6 +189,7 @@ impl Default for NeonStorageControllerConf {
             database_url: None,
             split_threshold: None,
             max_secondary_lag_bytes: None,
+            heartbeat_interval: Self::DEFAULT_HEARTBEAT_INTERVAL,
         }
     }
 }
diff --git a/control_plane/src/storage_controller.rs b/control_plane/src/storage_controller.rs
index 27d8e2de0c..c715d6b789 100644
--- a/control_plane/src/storage_controller.rs
+++ b/control_plane/src/storage_controller.rs
@@ -437,6 +437,8 @@ impl StorageController {
             &humantime::Duration::from(self.config.max_offline).to_string(),
             "--max-warming-up-interval",
             &humantime::Duration::from(self.config.max_warming_up).to_string(),
+            "--heartbeat-interval",
+            &humantime::Duration::from(self.config.heartbeat_interval).to_string(),
             "--address-for-peers",
             &address_for_peers.to_string(),
         ]
diff --git a/storage_controller/src/main.rs b/storage_controller/src/main.rs
index e3f29b84e7..00e90f4467 100644
--- a/storage_controller/src/main.rs
+++ b/storage_controller/src/main.rs
@@ -11,8 +11,8 @@ use storage_controller::metrics::preinitialize_metrics;
 use storage_controller::persistence::Persistence;
 use storage_controller::service::chaos_injector::ChaosInjector;
 use storage_controller::service::{
-    Config, Service, MAX_OFFLINE_INTERVAL_DEFAULT, MAX_WARMING_UP_INTERVAL_DEFAULT,
-    RECONCILER_CONCURRENCY_DEFAULT,
+    Config, Service, HEARTBEAT_INTERVAL_DEFAULT, MAX_OFFLINE_INTERVAL_DEFAULT,
+    MAX_WARMING_UP_INTERVAL_DEFAULT, RECONCILER_CONCURRENCY_DEFAULT,
 };
 use tokio::signal::unix::SignalKind;
 use tokio_util::sync::CancellationToken;
@@ -104,6 +104,10 @@ struct Cli {
     // a pageserver
     #[arg(long)]
     max_secondary_lag_bytes: Option<u64>,
+
+    // Period with which to send heartbeats to registered nodes
+    #[arg(long)]
+    heartbeat_interval: Option<humantime::Duration>,
 }
 
 enum StrictMode {
@@ -285,6 +289,10 @@ async fn async_main() -> anyhow::Result<()> {
         split_threshold: args.split_threshold,
         neon_local_repo_dir: args.neon_local_repo_dir,
         max_secondary_lag_bytes: args.max_secondary_lag_bytes,
+        heartbeat_interval: args
+            .heartbeat_interval
+            .map(humantime::Duration::into)
+            .unwrap_or(HEARTBEAT_INTERVAL_DEFAULT),
         address_for_peers: args.address_for_peers,
         start_as_candidate: args.start_as_candidate,
         http_service_port: args.listen.port() as i32,
diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs
index 95821827e2..49253cb4e0 100644
--- a/storage_controller/src/service.rs
+++ b/storage_controller/src/service.rs
@@ -121,6 +121,9 @@ pub const MAX_OFFLINE_INTERVAL_DEFAULT: Duration = Duration::from_secs(30);
 /// being handled on the pageserver side.
 pub const MAX_WARMING_UP_INTERVAL_DEFAULT: Duration = Duration::from_secs(300);
 
+/// How often to send heartbeats to registered nodes?
+pub const HEARTBEAT_INTERVAL_DEFAULT: Duration = Duration::from_secs(5);
+
 #[derive(Clone, strum_macros::Display)]
 enum TenantOperations {
     Create,
@@ -326,6 +329,8 @@ pub struct Config {
     // upgraded to primary.
     pub max_secondary_lag_bytes: Option<u64>,
 
+    pub heartbeat_interval: Duration,
+
     pub address_for_peers: Option<Uri>,
 
     pub start_as_candidate: bool,
@@ -909,9 +914,7 @@ impl Service {
     async fn spawn_heartbeat_driver(&self) {
         self.startup_complete.clone().wait().await;
 
-        const HEARTBEAT_INTERVAL: Duration = Duration::from_secs(5);
-
-        let mut interval = tokio::time::interval(HEARTBEAT_INTERVAL);
+        let mut interval = tokio::time::interval(self.config.heartbeat_interval);
         while !self.cancel.is_cancelled() {
             tokio::select! {
               _ = interval.tick() => { }
diff --git a/test_runner/regress/test_tenants.py b/test_runner/regress/test_tenants.py
index 0ebf714de0..b63ff7f6bd 100644
--- a/test_runner/regress/test_tenants.py
+++ b/test_runner/regress/test_tenants.py
@@ -372,8 +372,10 @@ def test_create_churn_during_restart(neon_env_builder: NeonEnvBuilder):
     tenant_id: TenantId = env.initial_tenant
     timeline_id = env.initial_timeline
 
-    # Multiple creation requests which race will generate this error
+    # Multiple creation requests which race will generate this error on the pageserver
+    # and storage controller respectively
     env.pageserver.allowed_errors.append(".*Conflict: Tenant is already being modified.*")
+    env.storage_controller.allowed_errors.append(".*Conflict: Tenant is already being modified.*")
 
     # Tenant creation requests which arrive out of order will generate complaints about
     # generation nubmers out of order.

From 7a1397cf376cc4169385f6f19c371179396ada5f Mon Sep 17 00:00:00 2001
From: Joonas Koivunen <joonas@neon.tech>
Date: Wed, 4 Sep 2024 13:10:05 +0300
Subject: [PATCH 007/142] storcon: boilerplate to upsert safekeeper records on
 deploy (#8879)

We currently do not record safekeepers in the storage controller
database. We want to migrate timelines across safekeepers eventually, so
start recording the safekeepers on deploy.

Cc: #8698
---
 .../2024-08-23-102952_safekeepers/down.sql    |  2 +
 .../2024-08-23-102952_safekeepers/up.sql      | 15 ++++
 storage_controller/src/http.rs                | 57 ++++++++++++
 storage_controller/src/persistence.rs         | 86 +++++++++++++++++++
 storage_controller/src/schema.rs              | 14 +++
 storage_controller/src/service.rs             | 14 +++
 test_runner/fixtures/neon_fixtures.py         | 23 +++++
 .../regress/test_storage_controller.py        | 68 ++++++++++++++-
 8 files changed, 278 insertions(+), 1 deletion(-)
 create mode 100644 storage_controller/migrations/2024-08-23-102952_safekeepers/down.sql
 create mode 100644 storage_controller/migrations/2024-08-23-102952_safekeepers/up.sql

diff --git a/storage_controller/migrations/2024-08-23-102952_safekeepers/down.sql b/storage_controller/migrations/2024-08-23-102952_safekeepers/down.sql
new file mode 100644
index 0000000000..9dfc750586
--- /dev/null
+++ b/storage_controller/migrations/2024-08-23-102952_safekeepers/down.sql
@@ -0,0 +1,2 @@
+-- This file should undo anything in `up.sql`
+DROP TABLE safekeepers;
diff --git a/storage_controller/migrations/2024-08-23-102952_safekeepers/up.sql b/storage_controller/migrations/2024-08-23-102952_safekeepers/up.sql
new file mode 100644
index 0000000000..c78716660f
--- /dev/null
+++ b/storage_controller/migrations/2024-08-23-102952_safekeepers/up.sql
@@ -0,0 +1,15 @@
+-- started out as a copy of cplane schema, removed the unnecessary columns.
+CREATE TABLE safekeepers (
+	-- the surrogate identifier defined by control plane database sequence
+	id BIGINT PRIMARY KEY,
+	region_id TEXT NOT NULL,
+	version BIGINT NOT NULL,
+	-- the natural id on whatever cloud platform, not needed in storage controller
+	-- instance_id TEXT UNIQUE NOT NULL,
+	host TEXT NOT NULL,
+	port INTEGER NOT NULL,
+	active BOOLEAN NOT NULL DEFAULT false,
+	-- projects_count INTEGER NOT NULL DEFAULT 0,
+	http_port INTEGER NOT NULL,
+	availability_zone_id TEXT NOT NULL
+);
diff --git a/storage_controller/src/http.rs b/storage_controller/src/http.rs
index d3eb081be4..0fa4f4fd0e 100644
--- a/storage_controller/src/http.rs
+++ b/storage_controller/src/http.rs
@@ -2,6 +2,7 @@ use crate::metrics::{
     HttpRequestLatencyLabelGroup, HttpRequestStatusLabelGroup, PageserverRequestLabelGroup,
     METRICS_REGISTRY,
 };
+use crate::persistence::SafekeeperPersistence;
 use crate::reconciler::ReconcileError;
 use crate::service::{LeadershipStatus, Service, STARTUP_RECONCILE_TIMEOUT};
 use anyhow::Context;
@@ -767,6 +768,55 @@ impl From<ReconcileError> for ApiError {
     }
 }
 
+/// Return the safekeeper record by its `id` (the primary key), or 404.
+///
+/// Not used by anything except manual testing.
+async fn handle_get_safekeeper(req: Request<Body>) -> Result<Response<Body>, ApiError> {
+    check_permissions(&req, Scope::Admin)?;
+
+    let id = parse_request_param::<i64>(&req, "id")?;
+
+    let state = get_state(&req);
+
+    let res = state.service.get_safekeeper(id).await;
+
+    match res {
+        Ok(b) => json_response(StatusCode::OK, b),
+        Err(crate::persistence::DatabaseError::Query(diesel::result::Error::NotFound)) => {
+            Err(ApiError::NotFound("unknown instance_id".into()))
+        }
+        Err(other) => Err(other.into()),
+    }
+}
+
+/// Used as part of deployment scripts.
+///
+/// Assumes information is only relayed to storage controller after first selecting a unique id on
+/// control plane database, which means we have an id field in the request and payload.
+async fn handle_upsert_safekeeper(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
+    check_permissions(&req, Scope::Admin)?;
+
+    let body = json_request::<SafekeeperPersistence>(&mut req).await?;
+    let id = parse_request_param::<i64>(&req, "id")?;
+
+    if id != body.id {
+        // it should be repeated
+        return Err(ApiError::BadRequest(anyhow::anyhow!(
+            "id mismatch: url={id:?}, body={:?}",
+            body.id
+        )));
+    }
+
+    let state = get_state(&req);
+
+    state.service.upsert_safekeeper(body).await?;
+
+    Ok(Response::builder()
+        .status(StatusCode::NO_CONTENT)
+        .body(Body::empty())
+        .unwrap())
+}
+
 /// Common wrapper for request handlers that call into Service and will operate on tenants: they must only
 /// be allowed to run if Service has finished its initial reconciliation.
 async fn tenant_service_handler<R, H>(
@@ -1127,6 +1177,13 @@ pub fn make_router(
         .put("/control/v1/step_down", |r| {
             named_request_span(r, handle_step_down, RequestName("control_v1_step_down"))
         })
+        .get("/control/v1/safekeeper/:id", |r| {
+            named_request_span(r, handle_get_safekeeper, RequestName("v1_safekeeper"))
+        })
+        .post("/control/v1/safekeeper/:id", |r| {
+            // id is repeated in the body and must match the one in the path
+            named_request_span(r, handle_upsert_safekeeper, RequestName("v1_safekeeper"))
+        })
         // Tenant operations
         // The ^/v1/ endpoints act as a "Virtual Pageserver", enabling shard-naive clients to call into
         // this service to manage tenants that actually consist of many tenant shards, as if they are a single entity.
diff --git a/storage_controller/src/persistence.rs b/storage_controller/src/persistence.rs
index 6e1c2016ff..d03eb87242 100644
--- a/storage_controller/src/persistence.rs
+++ b/storage_controller/src/persistence.rs
@@ -938,6 +938,48 @@ impl Persistence {
 
         Ok(())
     }
+
+    pub(crate) async fn safekeeper_get(
+        &self,
+        id: i64,
+    ) -> Result<SafekeeperPersistence, DatabaseError> {
+        use crate::schema::safekeepers::dsl::{id as id_column, safekeepers};
+        self.with_conn(move |conn| -> DatabaseResult<SafekeeperPersistence> {
+            Ok(safekeepers
+                .filter(id_column.eq(&id))
+                .select(SafekeeperPersistence::as_select())
+                .get_result(conn)?)
+        })
+        .await
+    }
+
+    pub(crate) async fn safekeeper_upsert(
+        &self,
+        record: SafekeeperPersistence,
+    ) -> Result<(), DatabaseError> {
+        use crate::schema::safekeepers::dsl::*;
+
+        self.with_conn(move |conn| -> DatabaseResult<()> {
+            let bind = record.as_insert_or_update();
+
+            let inserted_updated = diesel::insert_into(safekeepers)
+                .values(&bind)
+                .on_conflict(id)
+                .do_update()
+                .set(&bind)
+                .execute(conn)?;
+
+            if inserted_updated != 1 {
+                return Err(DatabaseError::Logical(format!(
+                    "unexpected number of rows ({})",
+                    inserted_updated
+                )));
+            }
+
+            Ok(())
+        })
+        .await
+    }
 }
 
 /// Parts of [`crate::tenant_shard::TenantShard`] that are stored durably
@@ -1073,3 +1115,47 @@ pub(crate) struct ControllerPersistence {
     pub(crate) address: String,
     pub(crate) started_at: chrono::DateTime<chrono::Utc>,
 }
+
+#[derive(Serialize, Deserialize, Queryable, Selectable, Eq, PartialEq, Debug, Clone)]
+#[diesel(table_name = crate::schema::safekeepers)]
+pub(crate) struct SafekeeperPersistence {
+    pub(crate) id: i64,
+    pub(crate) region_id: String,
+    /// 1 is special, it means just created (not currently posted to storcon).
+    /// Zero or negative is not really expected.
+    /// Otherwise the number from `release-$(number_of_commits_on_branch)` tag.
+    pub(crate) version: i64,
+    pub(crate) host: String,
+    pub(crate) port: i32,
+    pub(crate) active: bool,
+    pub(crate) http_port: i32,
+    pub(crate) availability_zone_id: String,
+}
+
+impl SafekeeperPersistence {
+    fn as_insert_or_update(&self) -> InsertUpdateSafekeeper<'_> {
+        InsertUpdateSafekeeper {
+            id: self.id,
+            region_id: &self.region_id,
+            version: self.version,
+            host: &self.host,
+            port: self.port,
+            active: self.active,
+            http_port: self.http_port,
+            availability_zone_id: &self.availability_zone_id,
+        }
+    }
+}
+
+#[derive(Insertable, AsChangeset)]
+#[diesel(table_name = crate::schema::safekeepers)]
+struct InsertUpdateSafekeeper<'a> {
+    id: i64,
+    region_id: &'a str,
+    version: i64,
+    host: &'a str,
+    port: i32,
+    active: bool,
+    http_port: i32,
+    availability_zone_id: &'a str,
+}
diff --git a/storage_controller/src/schema.rs b/storage_controller/src/schema.rs
index 1e8379500c..e0f515daea 100644
--- a/storage_controller/src/schema.rs
+++ b/storage_controller/src/schema.rs
@@ -45,3 +45,17 @@ diesel::table! {
 }
 
 diesel::allow_tables_to_appear_in_same_query!(controllers, metadata_health, nodes, tenant_shards,);
+
+diesel::table! {
+    safekeepers {
+        id -> Int8,
+        region_id -> Text,
+        version -> Int8,
+        instance_id -> Text,
+        host -> Text,
+        port -> Int4,
+        active -> Bool,
+        http_port -> Int4,
+        availability_zone_id -> Text,
+    }
+}
diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs
index 49253cb4e0..4ccc5c951c 100644
--- a/storage_controller/src/service.rs
+++ b/storage_controller/src/service.rs
@@ -6476,4 +6476,18 @@ impl Service {
 
         global_observed
     }
+
+    pub(crate) async fn get_safekeeper(
+        &self,
+        id: i64,
+    ) -> Result<crate::persistence::SafekeeperPersistence, DatabaseError> {
+        self.persistence.safekeeper_get(id).await
+    }
+
+    pub(crate) async fn upsert_safekeeper(
+        &self,
+        record: crate::persistence::SafekeeperPersistence,
+    ) -> Result<(), DatabaseError> {
+        self.persistence.safekeeper_upsert(record).await
+    }
 }
diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index 8c99408cfb..890538b86a 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -2845,6 +2845,29 @@ class NeonStorageController(MetricsGetter, LogUtils):
 
         raise AssertionError("unreachable")
 
+    def on_safekeeper_deploy(self, id: int, body: dict[str, Any]):
+        self.request(
+            "POST",
+            f"{self.api}/control/v1/safekeeper/{id}",
+            headers=self.headers(TokenScope.ADMIN),
+            json=body,
+        )
+
+    def get_safekeeper(self, id: int) -> Optional[dict[str, Any]]:
+        try:
+            response = self.request(
+                "GET",
+                f"{self.api}/control/v1/safekeeper/{id}",
+                headers=self.headers(TokenScope.ADMIN),
+            )
+            json = response.json()
+            assert isinstance(json, dict)
+            return json
+        except StorageControllerApiException as e:
+            if e.status_code == 404:
+                return None
+            raise e
+
     def __enter__(self) -> "NeonStorageController":
         return self
 
diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py
index 03eb7628be..13f5ec1b4f 100644
--- a/test_runner/regress/test_storage_controller.py
+++ b/test_runner/regress/test_storage_controller.py
@@ -31,7 +31,7 @@ from fixtures.pageserver.utils import (
     remote_storage_delete_key,
     timeline_delete_wait_completed,
 )
-from fixtures.pg_version import PgVersion
+from fixtures.pg_version import PgVersion, run_only_on_default_postgres
 from fixtures.port_distributor import PortDistributor
 from fixtures.remote_storage import RemoteStorageKind, s3_storage
 from fixtures.storage_controller_proxy import StorageControllerProxy
@@ -2330,3 +2330,69 @@ def test_storage_controller_timeline_crud_race(neon_env_builder: NeonEnvBuilder)
             connect=0,  # Disable retries: we want to see the 503
         )
     ).timeline_create(PgVersion.NOT_SET, tenant_id, create_timeline_id)
+
+
+@run_only_on_default_postgres("this is like a 'unit test' against storcon db")
+def test_safekeeper_deployment_time_update(neon_env_builder: NeonEnvBuilder):
+    env = neon_env_builder.init_configs()
+    env.start()
+
+    fake_id = 5
+
+    target = env.storage_controller
+
+    assert target.get_safekeeper(fake_id) is None
+
+    body = {
+        "active": True,
+        "id": fake_id,
+        "created_at": "2023-10-25T09:11:25Z",
+        "updated_at": "2024-08-28T11:32:43Z",
+        "region_id": "aws-us-east-2",
+        "host": "safekeeper-333.us-east-2.aws.neon.build",
+        "port": 6401,
+        "http_port": 7676,
+        "version": 5957,
+        "availability_zone_id": "us-east-2b",
+    }
+
+    target.on_safekeeper_deploy(fake_id, body)
+
+    inserted = target.get_safekeeper(fake_id)
+    assert inserted is not None
+    assert eq_safekeeper_records(body, inserted)
+
+    # error out if pk is changed (unexpected)
+    with pytest.raises(StorageControllerApiException) as exc:
+        different_pk = dict(body)
+        different_pk["id"] = 4
+        assert different_pk["id"] != body["id"]
+        target.on_safekeeper_deploy(fake_id, different_pk)
+    assert exc.value.status_code == 400
+
+    inserted_again = target.get_safekeeper(fake_id)
+    assert inserted_again is not None
+    assert eq_safekeeper_records(inserted, inserted_again)
+
+    # the most common case, version goes up:
+    assert isinstance(body["version"], int)
+    body["version"] += 1
+    target.on_safekeeper_deploy(fake_id, body)
+    inserted_now = target.get_safekeeper(fake_id)
+    assert inserted_now is not None
+
+    assert eq_safekeeper_records(body, inserted_now)
+
+
+def eq_safekeeper_records(a: dict[str, Any], b: dict[str, Any]) -> bool:
+    compared = [dict(a), dict(b)]
+
+    masked_keys = ["created_at", "updated_at"]
+
+    for d in compared:
+        # keep deleting these in case we are comparing the body as it will be uploaded by real scripts
+        for key in masked_keys:
+            if key in d:
+                del d[key]
+
+    return compared[0] == compared[1]

From a046717a2409b5291ad341c1f4d26cb1df1a55bd Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki@neon.tech>
Date: Wed, 4 Sep 2024 14:41:51 +0300
Subject: [PATCH 008/142] Fix submodule refs to point to the correct
 REL_X_STABLE_neon branches (#8910)

Commit cfa45ff5ee (PR #8860) updated the vendor/postgres submodules, but
didn't use the same commit SHAs that were pushed as the corresponding
REL_*_STABLE_neon branches in the postgres repository. The contents were
the same, but the REL_*_STABLE_neon branches pointed to squashed
versions of the commits, whereas the SHAs used in the submodules
referred to the pre-squash revisions.

Note: The vendor/postgres-v14 submodule still doesn't match the tip of
the REL_14_STABLE_neon branch, because there has been one more commit on
that branch since then. That's another source of confusion that we
should fix, but let's do that separately. This commit doesn't change the
code that gets built in any way; it only changes the submodule references
to point to the correct SHAs in the REL_*_STABLE_neon branch histories,
rather than some detached commits.
---
 vendor/postgres-v14   | 2 +-
 vendor/postgres-v15   | 2 +-
 vendor/postgres-v16   | 2 +-
 vendor/revisions.json | 6 +++---
 4 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/vendor/postgres-v14 b/vendor/postgres-v14
index 48388a5b59..7602e907ab 160000
--- a/vendor/postgres-v14
+++ b/vendor/postgres-v14
@@ -1 +1 @@
-Subproject commit 48388a5b597c81c09e28c016650a7156b48717a1
+Subproject commit 7602e907ab30f16188bebfd66b8f297c2889d339
diff --git a/vendor/postgres-v15 b/vendor/postgres-v15
index 8aa1ded772..49d5e576a5 160000
--- a/vendor/postgres-v15
+++ b/vendor/postgres-v15
@@ -1 +1 @@
-Subproject commit 8aa1ded7726d416ac8e02600aad387a353478fc7
+Subproject commit 49d5e576a56e4cc59cd6a6a0791b2324b9fa675e
diff --git a/vendor/postgres-v16 b/vendor/postgres-v16
index 95132feffe..6e9a4ff624 160000
--- a/vendor/postgres-v16
+++ b/vendor/postgres-v16
@@ -1 +1 @@
-Subproject commit 95132feffe277ce84309d93a42e9aadfd2cb0437
+Subproject commit 6e9a4ff6249ac02b8175054b7b3f7dfb198be48b
diff --git a/vendor/revisions.json b/vendor/revisions.json
index 319e648488..751b9e8679 100644
--- a/vendor/revisions.json
+++ b/vendor/revisions.json
@@ -1,14 +1,14 @@
 {
   "v16": [
     "16.4",
-    "95132feffe277ce84309d93a42e9aadfd2cb0437"
+    "6e9a4ff6249ac02b8175054b7b3f7dfb198be48b"
   ],
   "v15": [
     "15.8",
-    "8aa1ded7726d416ac8e02600aad387a353478fc7"
+    "49d5e576a56e4cc59cd6a6a0791b2324b9fa675e"
   ],
   "v14": [
     "14.13",
-    "48388a5b597c81c09e28c016650a7156b48717a1"
+    "7602e907ab30f16188bebfd66b8f297c2889d339"
   ]
 }

From 3f43823a9b333140ccf21a55ff1316c351bacd58 Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Wed, 4 Sep 2024 13:41:10 +0100
Subject: [PATCH 009/142] build(deps): bump cryptography from 42.0.4 to 43.0.1
 (#8908)

---
 poetry.lock | 63 ++++++++++++++++++++++++-----------------------------
 1 file changed, 29 insertions(+), 34 deletions(-)

diff --git a/poetry.lock b/poetry.lock
index b8ef08b02d..48943a73e9 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -985,43 +985,38 @@ files = [
 
 [[package]]
 name = "cryptography"
-version = "42.0.4"
+version = "43.0.1"
 description = "cryptography is a package which provides cryptographic recipes and primitives to Python developers."
 optional = false
 python-versions = ">=3.7"
 files = [
-    {file = "cryptography-42.0.4-cp37-abi3-macosx_10_12_universal2.whl", hash = "sha256:ffc73996c4fca3d2b6c1c8c12bfd3ad00def8621da24f547626bf06441400449"},
-    {file = "cryptography-42.0.4-cp37-abi3-macosx_10_12_x86_64.whl", hash = "sha256:db4b65b02f59035037fde0998974d84244a64c3265bdef32a827ab9b63d61b18"},
-    {file = "cryptography-42.0.4-cp37-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:dad9c385ba8ee025bb0d856714f71d7840020fe176ae0229de618f14dae7a6e2"},
-    {file = "cryptography-42.0.4-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:69b22ab6506a3fe483d67d1ed878e1602bdd5912a134e6202c1ec672233241c1"},
-    {file = "cryptography-42.0.4-cp37-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:e09469a2cec88fb7b078e16d4adec594414397e8879a4341c6ace96013463d5b"},
-    {file = "cryptography-42.0.4-cp37-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:3e970a2119507d0b104f0a8e281521ad28fc26f2820687b3436b8c9a5fcf20d1"},
-    {file = "cryptography-42.0.4-cp37-abi3-musllinux_1_1_aarch64.whl", hash = "sha256:e53dc41cda40b248ebc40b83b31516487f7db95ab8ceac1f042626bc43a2f992"},
-    {file = "cryptography-42.0.4-cp37-abi3-musllinux_1_1_x86_64.whl", hash = "sha256:c3a5cbc620e1e17009f30dd34cb0d85c987afd21c41a74352d1719be33380885"},
-    {file = "cryptography-42.0.4-cp37-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:6bfadd884e7280df24d26f2186e4e07556a05d37393b0f220a840b083dc6a824"},
-    {file = "cryptography-42.0.4-cp37-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:01911714117642a3f1792c7f376db572aadadbafcd8d75bb527166009c9f1d1b"},
-    {file = "cryptography-42.0.4-cp37-abi3-win32.whl", hash = "sha256:fb0cef872d8193e487fc6bdb08559c3aa41b659a7d9be48b2e10747f47863925"},
-    {file = "cryptography-42.0.4-cp37-abi3-win_amd64.whl", hash = "sha256:c1f25b252d2c87088abc8bbc4f1ecbf7c919e05508a7e8628e6875c40bc70923"},
-    {file = "cryptography-42.0.4-cp39-abi3-macosx_10_12_universal2.whl", hash = "sha256:15a1fb843c48b4a604663fa30af60818cd28f895572386e5f9b8a665874c26e7"},
-    {file = "cryptography-42.0.4-cp39-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a1327f280c824ff7885bdeef8578f74690e9079267c1c8bd7dc5cc5aa065ae52"},
-    {file = "cryptography-42.0.4-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6ffb03d419edcab93b4b19c22ee80c007fb2d708429cecebf1dd3258956a563a"},
-    {file = "cryptography-42.0.4-cp39-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:1df6fcbf60560d2113b5ed90f072dc0b108d64750d4cbd46a21ec882c7aefce9"},
-    {file = "cryptography-42.0.4-cp39-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:44a64043f743485925d3bcac548d05df0f9bb445c5fcca6681889c7c3ab12764"},
-    {file = "cryptography-42.0.4-cp39-abi3-musllinux_1_1_aarch64.whl", hash = "sha256:3c6048f217533d89f2f8f4f0fe3044bf0b2090453b7b73d0b77db47b80af8dff"},
-    {file = "cryptography-42.0.4-cp39-abi3-musllinux_1_1_x86_64.whl", hash = "sha256:6d0fbe73728c44ca3a241eff9aefe6496ab2656d6e7a4ea2459865f2e8613257"},
-    {file = "cryptography-42.0.4-cp39-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:887623fe0d70f48ab3f5e4dbf234986b1329a64c066d719432d0698522749929"},
-    {file = "cryptography-42.0.4-cp39-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:ce8613beaffc7c14f091497346ef117c1798c202b01153a8cc7b8e2ebaaf41c0"},
-    {file = "cryptography-42.0.4-cp39-abi3-win32.whl", hash = "sha256:810bcf151caefc03e51a3d61e53335cd5c7316c0a105cc695f0959f2c638b129"},
-    {file = "cryptography-42.0.4-cp39-abi3-win_amd64.whl", hash = "sha256:a0298bdc6e98ca21382afe914c642620370ce0470a01e1bef6dd9b5354c36854"},
-    {file = "cryptography-42.0.4-pp310-pypy310_pp73-macosx_10_12_x86_64.whl", hash = "sha256:5f8907fcf57392cd917892ae83708761c6ff3c37a8e835d7246ff0ad251d9298"},
-    {file = "cryptography-42.0.4-pp310-pypy310_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:12d341bd42cdb7d4937b0cabbdf2a94f949413ac4504904d0cdbdce4a22cbf88"},
-    {file = "cryptography-42.0.4-pp310-pypy310_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:1cdcdbd117681c88d717437ada72bdd5be9de117f96e3f4d50dab3f59fd9ab20"},
-    {file = "cryptography-42.0.4-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:0e89f7b84f421c56e7ff69f11c441ebda73b8a8e6488d322ef71746224c20fce"},
-    {file = "cryptography-42.0.4-pp39-pypy39_pp73-macosx_10_12_x86_64.whl", hash = "sha256:f1e85a178384bf19e36779d91ff35c7617c885da487d689b05c1366f9933ad74"},
-    {file = "cryptography-42.0.4-pp39-pypy39_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:d2a27aca5597c8a71abbe10209184e1a8e91c1fd470b5070a2ea60cafec35bcd"},
-    {file = "cryptography-42.0.4-pp39-pypy39_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:4e36685cb634af55e0677d435d425043967ac2f3790ec652b2b88ad03b85c27b"},
-    {file = "cryptography-42.0.4-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:f47be41843200f7faec0683ad751e5ef11b9a56a220d57f300376cd8aba81660"},
-    {file = "cryptography-42.0.4.tar.gz", hash = "sha256:831a4b37accef30cccd34fcb916a5d7b5be3cbbe27268a02832c3e450aea39cb"},
+    {file = "cryptography-43.0.1-cp37-abi3-macosx_10_9_universal2.whl", hash = "sha256:8385d98f6a3bf8bb2d65a73e17ed87a3ba84f6991c155691c51112075f9ffc5d"},
+    {file = "cryptography-43.0.1-cp37-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:27e613d7077ac613e399270253259d9d53872aaf657471473ebfc9a52935c062"},
+    {file = "cryptography-43.0.1-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:68aaecc4178e90719e95298515979814bda0cbada1256a4485414860bd7ab962"},
+    {file = "cryptography-43.0.1-cp37-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:de41fd81a41e53267cb020bb3a7212861da53a7d39f863585d13ea11049cf277"},
+    {file = "cryptography-43.0.1-cp37-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:f98bf604c82c416bc829e490c700ca1553eafdf2912a91e23a79d97d9801372a"},
+    {file = "cryptography-43.0.1-cp37-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:61ec41068b7b74268fa86e3e9e12b9f0c21fcf65434571dbb13d954bceb08042"},
+    {file = "cryptography-43.0.1-cp37-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:014f58110f53237ace6a408b5beb6c427b64e084eb451ef25a28308270086494"},
+    {file = "cryptography-43.0.1-cp37-abi3-win32.whl", hash = "sha256:2bd51274dcd59f09dd952afb696bf9c61a7a49dfc764c04dd33ef7a6b502a1e2"},
+    {file = "cryptography-43.0.1-cp37-abi3-win_amd64.whl", hash = "sha256:666ae11966643886c2987b3b721899d250855718d6d9ce41b521252a17985f4d"},
+    {file = "cryptography-43.0.1-cp39-abi3-macosx_10_9_universal2.whl", hash = "sha256:ac119bb76b9faa00f48128b7f5679e1d8d437365c5d26f1c2c3f0da4ce1b553d"},
+    {file = "cryptography-43.0.1-cp39-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1bbcce1a551e262dfbafb6e6252f1ae36a248e615ca44ba302df077a846a8806"},
+    {file = "cryptography-43.0.1-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:58d4e9129985185a06d849aa6df265bdd5a74ca6e1b736a77959b498e0505b85"},
+    {file = "cryptography-43.0.1-cp39-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:d03a475165f3134f773d1388aeb19c2d25ba88b6a9733c5c590b9ff7bbfa2e0c"},
+    {file = "cryptography-43.0.1-cp39-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:511f4273808ab590912a93ddb4e3914dfd8a388fed883361b02dea3791f292e1"},
+    {file = "cryptography-43.0.1-cp39-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:80eda8b3e173f0f247f711eef62be51b599b5d425c429b5d4ca6a05e9e856baa"},
+    {file = "cryptography-43.0.1-cp39-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:38926c50cff6f533f8a2dae3d7f19541432610d114a70808f0926d5aaa7121e4"},
+    {file = "cryptography-43.0.1-cp39-abi3-win32.whl", hash = "sha256:a575913fb06e05e6b4b814d7f7468c2c660e8bb16d8d5a1faf9b33ccc569dd47"},
+    {file = "cryptography-43.0.1-cp39-abi3-win_amd64.whl", hash = "sha256:d75601ad10b059ec832e78823b348bfa1a59f6b8d545db3a24fd44362a1564cb"},
+    {file = "cryptography-43.0.1-pp310-pypy310_pp73-macosx_10_9_x86_64.whl", hash = "sha256:ea25acb556320250756e53f9e20a4177515f012c9eaea17eb7587a8c4d8ae034"},
+    {file = "cryptography-43.0.1-pp310-pypy310_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:c1332724be35d23a854994ff0b66530119500b6053d0bd3363265f7e5e77288d"},
+    {file = "cryptography-43.0.1-pp310-pypy310_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:fba1007b3ef89946dbbb515aeeb41e30203b004f0b4b00e5e16078b518563289"},
+    {file = "cryptography-43.0.1-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:5b43d1ea6b378b54a1dc99dd8a2b5be47658fe9a7ce0a58ff0b55f4b43ef2b84"},
+    {file = "cryptography-43.0.1-pp39-pypy39_pp73-macosx_10_9_x86_64.whl", hash = "sha256:88cce104c36870d70c49c7c8fd22885875d950d9ee6ab54df2745f83ba0dc365"},
+    {file = "cryptography-43.0.1-pp39-pypy39_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:9d3cdb25fa98afdd3d0892d132b8d7139e2c087da1712041f6b762e4f807cc96"},
+    {file = "cryptography-43.0.1-pp39-pypy39_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:e710bf40870f4db63c3d7d929aa9e09e4e7ee219e703f949ec4073b4294f6172"},
+    {file = "cryptography-43.0.1-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:7c05650fe8023c5ed0d46793d4b7d7e6cd9c04e68eabe5b0aeea836e37bdcec2"},
+    {file = "cryptography-43.0.1.tar.gz", hash = "sha256:203e92a75716d8cfb491dc47c79e17d0d9207ccffcbcb35f598fbe463ae3444d"},
 ]
 
 [package.dependencies]
@@ -1034,7 +1029,7 @@ nox = ["nox"]
 pep8test = ["check-sdist", "click", "mypy", "ruff"]
 sdist = ["build"]
 ssh = ["bcrypt (>=3.1.5)"]
-test = ["certifi", "pretend", "pytest (>=6.2.0)", "pytest-benchmark", "pytest-cov", "pytest-xdist"]
+test = ["certifi", "cryptography-vectors (==43.0.1)", "pretend", "pytest (>=6.2.0)", "pytest-benchmark", "pytest-cov", "pytest-xdist"]
 test-randomorder = ["pytest-randomly"]
 
 [[package]]

From 1a9b54f1d99fb373eddc7f3ff57174031d34c7b6 Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Wed, 4 Sep 2024 15:00:40 +0100
Subject: [PATCH 010/142] storage controller: read from database in validate
 API (#8784)

## Problem

The initial implementation of the validate API treats the in-memory
generations as authoritative.
- This is true when only one storage controller is running, but if a
rogue controller was running that hadn't been shut down properly, and
some pageserver requests were routed to that bad controller, it could
incorrectly return valid=true for stale generations.
- The generation in the main in-memory map gets out of date while a live
migration is in flight, and if the origin location for the migration
tries to do some deletions even though it is in AttachedStale (for
example because it had already started compaction), these might be
wrongly validated + executed.

## Summary of changes

- Continue to do the in-memory check: if this returns valid=false it is
sufficient to reject requests.
- When valid=true, do an additional read from the database to confirm
the generation is fresh.
- Revise behavior for validation on missing shards: this used to always
return valid=true as a convenience for deletions and shard splits, so
that pageservers weren't prevented from completing any enqueued
deletions for these shards after they're gone. However, this becomes
unsafe when we consider split brain scenarios. We could reinstate this
in future if we wanted to store some tombstones for deleted shards.
- Update the test_scrubber_physical_gc tests to cope with the behavioral
change: they must now explicitly flush the deletion queue before splits,
to avoid tripping up on deletions that are enqueued at the time of the
split (these tests assert "scrubber deletes nothing", a check that fails
if the split leaves behind some remote objects that are legitimately
GC'able).
- Add `test_storage_controller_validate_during_migration`, which uses
failpoints to create a situation where incorrect generation validation
during a live migration could result in corruption.

The rate of validate calls for tenants is pretty low: they happen as a
consequence of deletions from GC and compaction, which are both
concurrency-limited on the pageserver side.
---
 storage_controller/src/http.rs                |   2 +-
 storage_controller/src/persistence.rs         |  70 ++++++++++-
 storage_controller/src/reconciler.rs          |   3 +
 storage_controller/src/service.rs             |  91 ++++++++++----
 .../regress/test_storage_controller.py        | 116 ++++++++++++++++++
 test_runner/regress/test_storage_scrubber.py  |  11 ++
 6 files changed, 261 insertions(+), 32 deletions(-)

diff --git a/storage_controller/src/http.rs b/storage_controller/src/http.rs
index 0fa4f4fd0e..32882c201a 100644
--- a/storage_controller/src/http.rs
+++ b/storage_controller/src/http.rs
@@ -102,7 +102,7 @@ async fn handle_validate(mut req: Request<Body>) -> Result<Response<Body>, ApiEr
 
     let validate_req = json_request::<ValidateRequest>(&mut req).await?;
     let state = get_state(&req);
-    json_response(StatusCode::OK, state.service.validate(validate_req))
+    json_response(StatusCode::OK, state.service.validate(validate_req).await?)
 }
 
 /// Call into this before attaching a tenant to a pageserver, to acquire a generation number
diff --git a/storage_controller/src/persistence.rs b/storage_controller/src/persistence.rs
index d03eb87242..e801289752 100644
--- a/storage_controller/src/persistence.rs
+++ b/storage_controller/src/persistence.rs
@@ -8,6 +8,7 @@ use self::split_state::SplitState;
 use diesel::pg::PgConnection;
 use diesel::prelude::*;
 use diesel::Connection;
+use itertools::Itertools;
 use pageserver_api::controller_api::MetadataHealthRecord;
 use pageserver_api::controller_api::ShardSchedulingPolicy;
 use pageserver_api::controller_api::{NodeSchedulingPolicy, PlacementPolicy};
@@ -91,7 +92,8 @@ pub(crate) enum DatabaseOperation {
     Detach,
     ReAttach,
     IncrementGeneration,
-    PeekGenerations,
+    TenantGenerations,
+    ShardGenerations,
     ListTenantShards,
     InsertTenantShards,
     UpdateTenantShard,
@@ -544,13 +546,13 @@ impl Persistence {
     /// If the tenant doesn't exist, an empty vector is returned.
     ///
     /// Output is sorted by shard number
-    pub(crate) async fn peek_generations(
+    pub(crate) async fn tenant_generations(
         &self,
         filter_tenant_id: TenantId,
     ) -> Result<Vec<ShardGenerationState>, DatabaseError> {
         use crate::schema::tenant_shards::dsl::*;
         let rows = self
-            .with_measured_conn(DatabaseOperation::PeekGenerations, move |conn| {
+            .with_measured_conn(DatabaseOperation::TenantGenerations, move |conn| {
                 let result = tenant_shards
                     .filter(tenant_id.eq(filter_tenant_id.to_string()))
                     .select(TenantShardPersistence::as_select())
@@ -572,6 +574,64 @@ impl Persistence {
             .collect())
     }
 
+    /// Read the generation number of specific tenant shards
+    ///
+    /// Output is unsorted.  Output may not include values for all inputs, if they are missing in the database.
+    pub(crate) async fn shard_generations(
+        &self,
+        mut tenant_shard_ids: impl Iterator<Item = &TenantShardId>,
+    ) -> Result<Vec<(TenantShardId, Option<Generation>)>, DatabaseError> {
+        let mut rows = Vec::with_capacity(tenant_shard_ids.size_hint().0);
+
+        // We will chunk our input to avoid composing arbitrarily long `IN` clauses.  Typically we are
+        // called with a single digit number of IDs, but in principle we could be called with tens
+        // of thousands (all the shards on one pageserver) from the generation validation API.
+        loop {
+            // A modest hardcoded chunk size to handle typical cases in a single query but never generate particularly
+            // large query strings.
+            let chunk_ids = tenant_shard_ids.by_ref().take(32);
+
+            // Compose a comma separated list of tuples for matching on (tenant_id, shard_number, shard_count)
+            let in_clause = chunk_ids
+                .map(|tsid| {
+                    format!(
+                        "('{}', {}, {})",
+                        tsid.tenant_id, tsid.shard_number.0, tsid.shard_count.0
+                    )
+                })
+                .join(",");
+
+            // We are done when our iterator gives us nothing to filter on
+            if in_clause.is_empty() {
+                break;
+            }
+
+            let chunk_rows = self
+                .with_measured_conn(DatabaseOperation::ShardGenerations, move |conn| {
+                    // diesel doesn't support multi-column IN queries, so we compose raw SQL.  No escaping is required because
+                    // the inputs are strongly typed and cannot carry any user-supplied raw string content.
+                    let result : Vec<TenantShardPersistence> = diesel::sql_query(
+                        format!("SELECT * from tenant_shards where (tenant_id, shard_number, shard_count) in ({in_clause});").as_str()
+                    ).load(conn)?;
+
+                    Ok(result)
+                })
+                .await?;
+            rows.extend(chunk_rows.into_iter())
+        }
+
+        Ok(rows
+            .into_iter()
+            .map(|tsp| {
+                (
+                    tsp.get_tenant_shard_id()
+                        .expect("Bad tenant ID in database"),
+                    tsp.generation.map(|g| Generation::new(g as u32)),
+                )
+            })
+            .collect())
+    }
+
     #[allow(non_local_definitions)]
     /// For use when updating a persistent property of a tenant, such as its config or placement_policy.
     ///
@@ -983,7 +1043,9 @@ impl Persistence {
 }
 
 /// Parts of [`crate::tenant_shard::TenantShard`] that are stored durably
-#[derive(Queryable, Selectable, Insertable, Serialize, Deserialize, Clone, Eq, PartialEq)]
+#[derive(
+    QueryableByName, Queryable, Selectable, Insertable, Serialize, Deserialize, Clone, Eq, PartialEq,
+)]
 #[diesel(table_name = crate::schema::tenant_shards)]
 pub(crate) struct TenantShardPersistence {
     #[serde(default)]
diff --git a/storage_controller/src/reconciler.rs b/storage_controller/src/reconciler.rs
index 102a3124d2..83b7b2b4f2 100644
--- a/storage_controller/src/reconciler.rs
+++ b/storage_controller/src/reconciler.rs
@@ -17,6 +17,7 @@ use utils::failpoint_support;
 use utils::generation::Generation;
 use utils::id::{NodeId, TimelineId};
 use utils::lsn::Lsn;
+use utils::pausable_failpoint;
 use utils::sync::gate::GateGuard;
 
 use crate::compute_hook::{ComputeHook, NotifyError};
@@ -593,6 +594,8 @@ impl Reconciler {
             notify_attempts += 1;
         }
 
+        pausable_failpoint!("reconciler-live-migrate-post-notify");
+
         // Downgrade the origin to secondary.  If the tenant's policy is PlacementPolicy::Attached(0), then
         // this location will be deleted in the general case reconciliation that runs after this.
         let origin_secondary_conf = build_location_config(
diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs
index 4ccc5c951c..90334d10a7 100644
--- a/storage_controller/src/service.rs
+++ b/storage_controller/src/service.rs
@@ -1854,37 +1854,74 @@ impl Service {
         Ok(response)
     }
 
-    pub(crate) fn validate(&self, validate_req: ValidateRequest) -> ValidateResponse {
-        let locked = self.inner.read().unwrap();
+    pub(crate) async fn validate(
+        &self,
+        validate_req: ValidateRequest,
+    ) -> Result<ValidateResponse, DatabaseError> {
+        // Fast in-memory check: we may reject validation on anything that doesn't match our
+        // in-memory generation for a shard
+        let in_memory_result = {
+            let mut in_memory_result = Vec::new();
+            let locked = self.inner.read().unwrap();
+            for req_tenant in validate_req.tenants {
+                if let Some(tenant_shard) = locked.tenants.get(&req_tenant.id) {
+                    let valid = tenant_shard.generation == Some(Generation::new(req_tenant.gen));
+                    tracing::info!(
+                        "handle_validate: {}(gen {}): valid={valid} (latest {:?})",
+                        req_tenant.id,
+                        req_tenant.gen,
+                        tenant_shard.generation
+                    );
+
+                    in_memory_result.push((req_tenant.id, Generation::new(req_tenant.gen), valid));
+                } else {
+                    // This is legal: for example during a shard split the pageserver may still
+                    // have deletions in its queue from the old pre-split shard, or after deletion
+                    // of a tenant that was busy with compaction/gc while being deleted.
+                    tracing::info!(
+                        "Refusing deletion validation for missing shard {}",
+                        req_tenant.id
+                    );
+                }
+            }
+
+            in_memory_result
+        };
+
+        // Database calls to confirm validity for anything that passed the in-memory check.  We must do this
+        // in case of controller split-brain, where some other controller process might have incremented the generation.
+        let db_generations = self
+            .persistence
+            .shard_generations(in_memory_result.iter().filter_map(|i| {
+                if i.2 {
+                    Some(&i.0)
+                } else {
+                    None
+                }
+            }))
+            .await?;
+        let db_generations = db_generations.into_iter().collect::<HashMap<_, _>>();
 
         let mut response = ValidateResponse {
             tenants: Vec::new(),
         };
-
-        for req_tenant in validate_req.tenants {
-            if let Some(tenant_shard) = locked.tenants.get(&req_tenant.id) {
-                let valid = tenant_shard.generation == Some(Generation::new(req_tenant.gen));
-                tracing::info!(
-                    "handle_validate: {}(gen {}): valid={valid} (latest {:?})",
-                    req_tenant.id,
-                    req_tenant.gen,
-                    tenant_shard.generation
-                );
-                response.tenants.push(ValidateResponseTenant {
-                    id: req_tenant.id,
-                    valid,
-                });
+        for (tenant_shard_id, validate_generation, valid) in in_memory_result.into_iter() {
+            let valid = if valid {
+                let db_generation = db_generations.get(&tenant_shard_id);
+                db_generation == Some(&Some(validate_generation))
             } else {
-                // After tenant deletion, we may approve any validation.  This avoids
-                // spurious warnings on the pageserver if it has pending LSN updates
-                // at the point a deletion happens.
-                response.tenants.push(ValidateResponseTenant {
-                    id: req_tenant.id,
-                    valid: true,
-                });
-            }
+                // If in-memory state says it's invalid, trust that.  It's always safe to fail a validation, at worst
+                // this prevents a pageserver from cleaning up an object in S3.
+                false
+            };
+
+            response.tenants.push(ValidateResponseTenant {
+                id: tenant_shard_id,
+                valid,
+            })
         }
-        response
+
+        Ok(response)
     }
 
     pub(crate) async fn tenant_create(
@@ -3179,7 +3216,7 @@ impl Service {
             // run concurrently with reconciliations, and it is not guaranteed that the node we find here
             // will still be the latest when we're done: we will check generations again at the end of
             // this function to handle that.
-            let generations = self.persistence.peek_generations(tenant_id).await?;
+            let generations = self.persistence.tenant_generations(tenant_id).await?;
 
             if generations
                 .iter()
@@ -3236,7 +3273,7 @@ impl Service {
         // Post-check: are all the generations of all the shards the same as they were initially?  This proves that
         // our remote operation executed on the latest generation and is therefore persistent.
         {
-            let latest_generations = self.persistence.peek_generations(tenant_id).await?;
+            let latest_generations = self.persistence.tenant_generations(tenant_id).await?;
             if latest_generations
                 .into_iter()
                 .map(
diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py
index 13f5ec1b4f..8da42294b0 100644
--- a/test_runner/regress/test_storage_controller.py
+++ b/test_runner/regress/test_storage_controller.py
@@ -2332,6 +2332,122 @@ def test_storage_controller_timeline_crud_race(neon_env_builder: NeonEnvBuilder)
     ).timeline_create(PgVersion.NOT_SET, tenant_id, create_timeline_id)
 
 
+def test_storage_controller_validate_during_migration(neon_env_builder: NeonEnvBuilder):
+    """
+    A correctness edge case: while we are live migrating and a shard's generation is
+    visible to the Reconciler but not to the central Service, the generation validation
+    API should still prevent stale generations from doing deletions.
+    """
+    neon_env_builder.num_pageservers = 2
+    neon_env_builder.enable_pageserver_remote_storage(s3_storage())
+    env = neon_env_builder.init_configs()
+    env.start()
+
+    TENANT_CONF = {
+        # small checkpointing and compaction targets to ensure we generate many upload operations
+        "checkpoint_distance": 128 * 1024,
+        "compaction_threshold": 1,
+        "compaction_target_size": 128 * 1024,
+        # disable background compaction and GC. We invoke it manually when we want it to happen.
+        "gc_period": "0s",
+        "compaction_period": "0s",
+    }
+
+    tenant_id = env.initial_tenant
+    timeline_id = env.initial_timeline
+    env.neon_cli.create_tenant(tenant_id, timeline_id)
+    env.storage_controller.pageserver_api().set_tenant_config(tenant_id, TENANT_CONF)
+
+    # Write enough data that a compaction would do some work (deleting some L0s)
+    workload = Workload(env, tenant_id, timeline_id)
+    workload.init()
+    workload.write_rows(64)
+    for _i in range(0, 2):
+        workload.churn_rows(64, upload=False)
+
+    # Upload but don't compact
+    origin_pageserver = env.get_tenant_pageserver(tenant_id)
+    dest_ps_id = [p.id for p in env.pageservers if p.id != origin_pageserver.id][0]
+    origin_pageserver.http_client().timeline_checkpoint(
+        tenant_id, timeline_id, wait_until_uploaded=True, compact=False
+    )
+
+    # Start a compaction that will pause on a failpoint.
+    compaction_failpoint = "before-upload-index-pausable"
+    origin_pageserver.http_client().configure_failpoints((compaction_failpoint, "pause"))
+
+    # This failpoint can also cause migration code to time out trying to politely flush
+    # during migrations
+    origin_pageserver.allowed_errors.append(".*Timed out waiting for flush to remote storage.*")
+
+    try:
+        with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor:
+            compact_fut = executor.submit(
+                origin_pageserver.http_client().timeline_compact,
+                tenant_id,
+                timeline_id,
+                wait_until_uploaded=True,
+            )
+
+            # Let the compaction start and then get stuck uploading an index: when we live migrate, the new generation's
+            # index will be initialized from the pre-compaction index, referencing layers that the compaction will try to delete
+            def has_hit_compaction_failpoint():
+                assert origin_pageserver.log_contains(f"at failpoint {compaction_failpoint}")
+
+            wait_until(10, 1, has_hit_compaction_failpoint)
+
+            # While the compaction is running, start a live migration which will pause long enough for the compaction to sleep,
+            # after incrementing generation and attaching the new location
+            migration_failpoint = "reconciler-live-migrate-post-notify"
+            env.storage_controller.configure_failpoints((migration_failpoint, "pause"))
+            migrate_fut = executor.submit(
+                env.storage_controller.tenant_shard_migrate,
+                TenantShardId(tenant_id, 0, 0),
+                dest_ps_id,
+            )
+
+            def has_hit_migration_failpoint():
+                assert env.storage_controller.log_contains(f"at failpoint {migration_failpoint}")
+
+            # Long wait because the migration will have to time out during transition to AttachedStale
+            # before it reaches this point.  The timeout is because the AttachedStale transition includes
+            # a flush of remote storage, and if the compaction already enqueued an index upload this cannot
+            # make progress.
+            wait_until(60, 1, has_hit_migration_failpoint)
+
+            # Origin pageserver has succeeded with compaction before the migration completed. It has done all the writes it wanted to do in its own (stale) generation
+            origin_pageserver.http_client().configure_failpoints((compaction_failpoint, "off"))
+            compact_fut.result()
+            origin_pageserver.http_client().deletion_queue_flush(execute=True)
+
+            # Eventually migration completes
+            env.storage_controller.configure_failpoints((migration_failpoint, "off"))
+            migrate_fut.result()
+    except:
+        # Always disable 'pause' failpoints, even on failure, to avoid hanging in shutdown
+        env.storage_controller.configure_failpoints((migration_failpoint, "off"))
+        origin_pageserver.http_client().configure_failpoints((compaction_failpoint, "off"))
+        raise
+
+    # Ensure the destination of the migration writes an index, so that if it has corrupt state that is
+    # visible to the scrubber.
+    workload.write_rows(1, upload=False)
+    env.get_pageserver(dest_ps_id).http_client().timeline_checkpoint(
+        tenant_id, timeline_id, wait_until_uploaded=True, compact=False
+    )
+
+    # The destination of the live migration would now have a corrupt index (referencing deleted L0s) if
+    # the controller had not properly applied validation rules.
+    healthy, _summary = env.storage_scrubber.scan_metadata()
+    try:
+        log.info(f"scrubbed, healthy={healthy}")
+        assert healthy
+    except:
+        # On failures, we want to report them FAIL during the test, not as ERROR during teardown
+        neon_env_builder.enable_scrub_on_exit = False
+        raise
+
+
 @run_only_on_default_postgres("this is like a 'unit test' against storcon db")
 def test_safekeeper_deployment_time_update(neon_env_builder: NeonEnvBuilder):
     env = neon_env_builder.init_configs()
diff --git a/test_runner/regress/test_storage_scrubber.py b/test_runner/regress/test_storage_scrubber.py
index 292a9a1010..848e214c5e 100644
--- a/test_runner/regress/test_storage_scrubber.py
+++ b/test_runner/regress/test_storage_scrubber.py
@@ -217,6 +217,13 @@ def test_scrubber_physical_gc_ancestors(
     workload.init()
     workload.write_rows(100)
 
+    # Issue a deletion queue flush so that the parent shard can't leave behind layers
+    # that will look like unexpected garbage to the scrubber
+    for pre_split_shard in env.storage_controller.locate(tenant_id):
+        env.get_pageserver(pre_split_shard["node_id"]).http_client().deletion_queue_flush(
+            execute=True
+        )
+
     new_shard_count = 4
     assert shard_count is None or new_shard_count > shard_count
     shards = env.storage_controller.tenant_shard_split(tenant_id, shard_count=new_shard_count)
@@ -321,6 +328,10 @@ def test_scrubber_physical_gc_timeline_deletion(neon_env_builder: NeonEnvBuilder
     workload.write_rows(100, upload=False)
     workload.stop()
 
+    # Issue a deletion queue flush so that the parent shard can't leave behind layers
+    # that will look like unexpected garbage to the scrubber
+    env.get_tenant_pageserver(tenant_id).http_client().deletion_queue_flush(execute=True)
+
     new_shard_count = 4
     shards = env.storage_controller.tenant_shard_split(tenant_id, shard_count=new_shard_count)
     for shard in shards:

From 0205ce184967f4510b6034bf2051a495bf464b44 Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki@neon.tech>
Date: Wed, 4 Sep 2024 17:41:51 +0300
Subject: [PATCH 011/142] Update submodule reference for vendor/postgres-v14
 (#8913)

There was a confusion on the REL_14_STABLE_neon branch. PR
https://github.com/neondatabase/postgres/pull/471 was merged to the
branch, but the corresponding PRs on the other REL_15_STABLE_neon and
REL_16_STABLE_neon branches were not merged. Also, the submodule
reference in the neon repository was never updated, so even though the
REL_14_STABLE_neon branch contained the commit, it was never used.

That PR https://github.com/neondatabase/postgres/pull/471 was a few
bricks shy of a load (no tests, some differences between the different
branches), so to get us to a good state, revert that change from the
REL_14_STABLE_neon branch. This PR in the neon repository updates the
submodule reference past two commits on the REL_14_STABLE_neon branch:
first the commit from PR
https://github.com/neondatabase/postgres/pull/471, and immediately after
that the revert of the same commit. This brings us back to square one,
but now the submodule reference matches the tip of the
REL_14_STABLE_neon branch again.
---
 vendor/postgres-v14   | 2 +-
 vendor/revisions.json | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/vendor/postgres-v14 b/vendor/postgres-v14
index 7602e907ab..a317b9b5b9 160000
--- a/vendor/postgres-v14
+++ b/vendor/postgres-v14
@@ -1 +1 @@
-Subproject commit 7602e907ab30f16188bebfd66b8f297c2889d339
+Subproject commit a317b9b5b96978b49e78986697f3dd80d06f99a7
diff --git a/vendor/revisions.json b/vendor/revisions.json
index 751b9e8679..e52576e61f 100644
--- a/vendor/revisions.json
+++ b/vendor/revisions.json
@@ -9,6 +9,6 @@
   ],
   "v14": [
     "14.13",
-    "7602e907ab30f16188bebfd66b8f297c2889d339"
+    "a317b9b5b96978b49e78986697f3dd80d06f99a7"
   ]
 }

From 99fa1c36004d710c65a47ffefaf66b4b5c6b4ce1 Mon Sep 17 00:00:00 2001
From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com>
Date: Thu, 5 Sep 2024 04:45:04 +0800
Subject: [PATCH 012/142] fix(pageserver): more information on aux v1 warnings
 (#8906)

Part of https://github.com/neondatabase/neon/issues/8623

## Summary of changes

It seems that we have tenants with aux policy set to v1 but don't have
any aux files in the storage. It is still safe to force migrate them
without notifying the customers. This patch adds more details to the
warning to identify the cases where we have to reach out to the users
before retiring aux v1.

---------

Signed-off-by: Alex Chi Z <chi@neon.tech>
---
 pageserver/src/pgdatadir_mapping.rs | 10 +++++++---
 pageserver/src/tenant/timeline.rs   |  2 +-
 2 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs
index c26abca1f7..d28a214265 100644
--- a/pageserver/src/pgdatadir_mapping.rs
+++ b/pageserver/src/pgdatadir_mapping.rs
@@ -729,8 +729,12 @@ impl Timeline {
         let current_policy = self.last_aux_file_policy.load();
         match current_policy {
             Some(AuxFilePolicy::V1) => {
-                warn!("this timeline is using deprecated aux file policy V1 (policy=V1)");
-                self.list_aux_files_v1(lsn, ctx).await
+                let res = self.list_aux_files_v1(lsn, ctx).await?;
+                let empty_str = if res.is_empty() { ", empty" } else { "" };
+                warn!(
+                    "this timeline is using deprecated aux file policy V1 (policy=v1{empty_str})"
+                );
+                Ok(res)
             }
             None => {
                 let res = self.list_aux_files_v1(lsn, ctx).await?;
@@ -1657,7 +1661,7 @@ impl<'a> DatadirModification<'a> {
                 if aux_files_key_v1.is_empty() {
                     None
                 } else {
-                    warn!("this timeline is using deprecated aux file policy V1");
+                    warn!("this timeline is using deprecated aux file policy V1 (detected existing v1 files)");
                     self.tline.do_switch_aux_policy(AuxFilePolicy::V1)?;
                     Some(AuxFilePolicy::V1)
                 }
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index 6eadf9a564..3b8f19a6c0 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -2243,7 +2243,7 @@ impl Timeline {
             };
 
             if aux_file_policy == Some(AuxFilePolicy::V1) {
-                warn!("this timeline is using deprecated aux file policy V1");
+                warn!("this timeline is using deprecated aux file policy V1 (when loading the timeline)");
             }
 
             result.repartition_threshold =

From 708322ce3c0d55bcee5ee9e3632ecfb8c37415f5 Mon Sep 17 00:00:00 2001
From: Vlad Lazar <vlad@neon.tech>
Date: Thu, 5 Sep 2024 09:56:26 +0100
Subject: [PATCH 013/142] storcon: handle fills including high tput tenants
 more gracefully (#8865)

## Problem
A tenant may ingest a lot of data between being drained for node restart
and being moved back
in the fill phase. This is expensive and causes the fill to stall.

## Summary of changes
We make a tactical change to reduce secondary warm-up time for
migrations in fills.
---
 storage_controller/src/service.rs | 18 ++++++++++++------
 1 file changed, 12 insertions(+), 6 deletions(-)

diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs
index 90334d10a7..ca416095bb 100644
--- a/storage_controller/src/service.rs
+++ b/storage_controller/src/service.rs
@@ -6297,9 +6297,13 @@ impl Service {
         node_id: NodeId,
         cancel: CancellationToken,
     ) -> Result<(), OperationError> {
-        // TODO(vlad): Currently this operates on the assumption that all
-        // secondaries are warm. This is not always true (e.g. we just migrated the
-        // tenant). Take that into consideration by checking the secondary status.
+        const SECONDARY_WARMUP_TIMEOUT: Duration = Duration::from_secs(20);
+        const SECONDARY_DOWNLOAD_REQUEST_TIMEOUT: Duration = Duration::from_secs(5);
+        let reconciler_config = ReconcilerConfigBuilder::new()
+            .secondary_warmup_timeout(SECONDARY_WARMUP_TIMEOUT)
+            .secondary_download_request_timeout(SECONDARY_DOWNLOAD_REQUEST_TIMEOUT)
+            .build();
+
         let mut tids_to_promote = self.fill_node_plan(node_id);
         let mut waiters = Vec::new();
 
@@ -6367,9 +6371,11 @@ impl Service {
                                         node_id
                                     );
 
-                                    if let Some(waiter) =
-                                        self.maybe_reconcile_shard(tenant_shard, nodes)
-                                    {
+                                    if let Some(waiter) = self.maybe_configured_reconcile_shard(
+                                        tenant_shard,
+                                        nodes,
+                                        reconciler_config,
+                                    ) {
                                         waiters.push(waiter);
                                     }
                                 }

From 6dfbf49128c4392464d6832ccc2e6bdc390b0b37 Mon Sep 17 00:00:00 2001
From: Folke Behrens <folke@neon.tech>
Date: Thu, 5 Sep 2024 13:34:27 +0200
Subject: [PATCH 014/142] proxy: don't let one timeout eat entire retry budget
 (#8924)

This reduces the per-request timeout to 10sec while keeping the total
retry duration at 1min.

Relates: neondatabase/cloud#15944
---
 proxy/src/http.rs          | 9 ++++++---
 proxy/src/usage_metrics.rs | 8 ++++++--
 2 files changed, 12 insertions(+), 5 deletions(-)

diff --git a/proxy/src/http.rs b/proxy/src/http.rs
index fee634f67f..c77d95f47d 100644
--- a/proxy/src/http.rs
+++ b/proxy/src/http.rs
@@ -35,14 +35,17 @@ pub fn new_client() -> ClientWithMiddleware {
         .build()
 }
 
-pub(crate) fn new_client_with_timeout(default_timout: Duration) -> ClientWithMiddleware {
+pub(crate) fn new_client_with_timeout(
+    request_timeout: Duration,
+    total_retry_duration: Duration,
+) -> ClientWithMiddleware {
     let timeout_client = reqwest::ClientBuilder::new()
-        .timeout(default_timout)
+        .timeout(request_timeout)
         .build()
         .expect("Failed to create http client with timeout");
 
     let retry_policy =
-        ExponentialBackoff::builder().build_with_total_retry_duration(default_timout);
+        ExponentialBackoff::builder().build_with_total_retry_duration(total_retry_duration);
 
     reqwest_middleware::ClientBuilder::new(timeout_client)
         .with(reqwest_tracing::TracingMiddleware::default())
diff --git a/proxy/src/usage_metrics.rs b/proxy/src/usage_metrics.rs
index aa8c7ba319..fd8599bcb3 100644
--- a/proxy/src/usage_metrics.rs
+++ b/proxy/src/usage_metrics.rs
@@ -33,7 +33,8 @@ use uuid::{NoContext, Timestamp};
 
 const PROXY_IO_BYTES_PER_CLIENT: &str = "proxy_io_bytes_per_client";
 
-const DEFAULT_HTTP_REPORTING_TIMEOUT: Duration = Duration::from_secs(60);
+const HTTP_REPORTING_REQUEST_TIMEOUT: Duration = Duration::from_secs(10);
+const HTTP_REPORTING_RETRY_DURATION: Duration = Duration::from_secs(60);
 
 /// Key that uniquely identifies the object, this metric describes.
 /// Currently, endpoint_id is enough, but this may change later,
@@ -223,7 +224,10 @@ pub async fn task_main(config: &MetricCollectionConfig) -> anyhow::Result<Infall
         info!("metrics collector has shut down");
     }
 
-    let http_client = http::new_client_with_timeout(DEFAULT_HTTP_REPORTING_TIMEOUT);
+    let http_client = http::new_client_with_timeout(
+        HTTP_REPORTING_REQUEST_TIMEOUT,
+        HTTP_REPORTING_RETRY_DURATION,
+    );
     let hostname = hostname::get()?.as_os_str().to_string_lossy().into_owned();
 
     let mut prev = Utc::now();

From 850421ec06dae634b762af0d4a38194eba103884 Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Thu, 5 Sep 2024 14:59:49 +0200
Subject: [PATCH 015/142] refactor(pageserver): rely on serde derive for toml
 deserialization (#7656)

This PR simplifies the pageserver configuration parsing as follows:

* introduce the `pageserver_api::config::ConfigToml` type
* implement `Default` for `ConfigToml`
* use serde derive to do the brain-dead leg-work of processing the toml
document
  * use `serde(default)` to fill in default values
* in `pageserver` crate:
* use `toml_edit` to deserialize the pageserver.toml string into a
`ConfigToml`
  * `PageServerConfig::parse_and_validate` then
    * consumes the `ConfigToml`
    * destructures it exhaustively into its constituent fields
    * constructs the `PageServerConfig`

The rules are:

* in `ConfigToml`, use `deny_unknown_fields` everywhere
* static default values go in `pageserver_api`
* if there cannot be a static default value (e.g. which default IO
engine to use, because it depends on the runtime), make the field in
`ConfigToml` an `Option`
* if runtime-augmentation of a value is needed, do that in
`parse_and_validate`
* a good example is `virtual_file_io_engine` or `l0_flush`, both of
which need to execute code to determine the effective value in
`PageServerConf`

The benefits:

* massive amount of brain-dead repetitive code can be deleted
* "unused variable" compile-time errors when removing a config value,
due to the exhaustive destructuring in `parse_and_validate`
* compile-time errors guide you when adding a new config field

Drawbacks:

* serde derive is sometimes a bit too magical
* `deny_unknown_fields` is easy to miss

Future Work / Benefits:
* make `neon_local` use `pageserver_api` to construct `ConfigToml` and
write it to `pageserver.toml`
* This provides more type safety / compile-time errors than the current
approach.

### Refs

Fixes #3682

### Future Work

* `remote_storage` deser doesn't reject unknown fields
https://github.com/neondatabase/neon/issues/8915
* clean up `libs/pageserver_api/src/config.rs` further
  * break up into multiple files, at least for tenant config
* move `models` as appropriate / refine distinction between config and
API models / be explicit about when it's the same
  * use `pub(crate)` visibility on `mod defaults` to detect stale values
---
 Cargo.lock                                    |   13 +
 Cargo.toml                                    |    1 +
 libs/pageserver_api/Cargo.toml                |   10 +
 libs/pageserver_api/src/config.rs             |  527 +++++-
 libs/pageserver_api/src/models.rs             |   71 +-
 libs/remote_storage/src/config.rs             |   25 +
 libs/utils/src/logging.rs                     |   12 +-
 pageserver/Cargo.toml                         |    3 +-
 pageserver/benches/bench_ingest.rs            |    4 +-
 pageserver/ctl/src/layer_map_analyzer.rs      |    3 +-
 pageserver/ctl/src/layers.rs                  |    3 +-
 pageserver/ctl/src/main.rs                    |    3 +-
 pageserver/src/bin/pageserver.rs              |   31 +-
 pageserver/src/config.rs                      | 1539 +++--------------
 pageserver/src/disk_usage_eviction_task.rs    |   48 +-
 pageserver/src/http/routes.rs                 |    4 +-
 pageserver/src/l0_flush.rs                    |   14 +-
 pageserver/src/statvfs.rs                     |   28 +-
 pageserver/src/tenant/config.rs               |  196 +--
 .../src/tenant/storage_layer/delta_layer.rs   |    3 +-
 .../src/tenant/storage_layer/image_layer.rs   |    4 +-
 .../tenant/storage_layer/inmemory_layer.rs    |    2 +-
 pageserver/src/tenant/tasks.rs                |    9 +-
 pageserver/src/tenant/timeline.rs             |    2 +-
 pageserver/src/tenant/timeline/compaction.rs  |   42 +-
 pageserver/src/tenant/vectored_blob_io.rs     |    4 -
 pageserver/src/virtual_file.rs                |    2 +-
 pageserver/src/virtual_file/io_engine.rs      |   11 +-
 test_runner/fixtures/neon_fixtures.py         |   22 +-
 .../regress/test_pageserver_generations.py    |   15 +-
 test_runner/regress/test_timeline_size.py     |    6 +-
 31 files changed, 1001 insertions(+), 1656 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index 5af3ef3804..91917d5351 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -2727,6 +2727,12 @@ dependencies = [
  "hashbrown 0.14.5",
 ]
 
+[[package]]
+name = "indoc"
+version = "2.0.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b248f5224d1d606005e02c97f5aa4e88eeb230488bcc03bc9ca4d7991399f2b5"
+
 [[package]]
 name = "infer"
 version = "0.2.3"
@@ -3701,6 +3707,7 @@ dependencies = [
  "humantime",
  "humantime-serde",
  "hyper 0.14.26",
+ "indoc",
  "itertools 0.10.5",
  "md5",
  "metrics",
@@ -3766,6 +3773,7 @@ dependencies = [
  "bincode",
  "byteorder",
  "bytes",
+ "camino",
  "chrono",
  "const_format",
  "enum-map",
@@ -3773,11 +3781,16 @@ dependencies = [
  "humantime",
  "humantime-serde",
  "itertools 0.10.5",
+ "nix 0.27.1",
+ "postgres_backend",
  "postgres_ffi",
  "rand 0.8.5",
+ "remote_storage",
+ "reqwest 0.12.4",
  "serde",
  "serde_json",
  "serde_with",
+ "storage_broker",
  "strum",
  "strum_macros",
  "thiserror",
diff --git a/Cargo.toml b/Cargo.toml
index fa949f9757..4fea3e8d80 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -103,6 +103,7 @@ humantime-serde = "1.1.1"
 hyper = "0.14"
 tokio-tungstenite = "0.20.0"
 indexmap = "2"
+indoc = "2"
 inotify = "0.10.2"
 ipnet = "2.9.0"
 itertools = "0.10"
diff --git a/libs/pageserver_api/Cargo.toml b/libs/pageserver_api/Cargo.toml
index cb28359ac3..8710904cec 100644
--- a/libs/pageserver_api/Cargo.toml
+++ b/libs/pageserver_api/Cargo.toml
@@ -4,6 +4,10 @@ version = "0.1.0"
 edition.workspace = true
 license.workspace = true
 
+[features]
+# See pageserver/Cargo.toml
+testing = ["dep:nix"]
+
 [dependencies]
 serde.workspace = true
 serde_with.workspace = true
@@ -23,6 +27,12 @@ thiserror.workspace = true
 humantime-serde.workspace = true
 chrono = { workspace = true, features = ["serde"] }
 itertools.workspace = true
+storage_broker.workspace = true
+camino = {workspace = true, features = ["serde1"]}
+remote_storage.workspace = true
+postgres_backend.workspace = true
+nix = {workspace = true, optional = true}
+reqwest.workspace = true
 
 [dev-dependencies]
 bincode.workspace = true
diff --git a/libs/pageserver_api/src/config.rs b/libs/pageserver_api/src/config.rs
index d996a62349..b2662c562a 100644
--- a/libs/pageserver_api/src/config.rs
+++ b/libs/pageserver_api/src/config.rs
@@ -1,15 +1,28 @@
-use std::collections::HashMap;
-
-use const_format::formatcp;
+use camino::Utf8PathBuf;
 
 #[cfg(test)]
 mod tests;
 
+use const_format::formatcp;
 pub const DEFAULT_PG_LISTEN_PORT: u16 = 64000;
 pub const DEFAULT_PG_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_PG_LISTEN_PORT}");
 pub const DEFAULT_HTTP_LISTEN_PORT: u16 = 9898;
 pub const DEFAULT_HTTP_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_HTTP_LISTEN_PORT}");
 
+use postgres_backend::AuthType;
+use remote_storage::RemoteStorageConfig;
+use serde_with::serde_as;
+use std::{
+    collections::HashMap,
+    num::{NonZeroU64, NonZeroUsize},
+    str::FromStr,
+    time::Duration,
+};
+use utils::logging::LogFormat;
+
+use crate::models::ImageCompressionAlgorithm;
+use crate::models::LsnLease;
+
 // Certain metadata (e.g. externally-addressable name, AZ) is delivered
 // as a separate structure.  This information is not neeed by the pageserver
 // itself, it is only used for registering the pageserver with the control
@@ -29,3 +42,511 @@ pub struct NodeMetadata {
     #[serde(flatten)]
     pub other: HashMap<String, serde_json::Value>,
 }
+
+/// `pageserver.toml`
+///
+/// We use serde derive with `#[serde(default)]` to generate a deserializer
+/// that fills in the default values for each config field.
+///
+/// If there cannot be a static default value because we need to make runtime
+/// checks to determine the default, make it an `Option` (which defaults to None).
+/// The runtime check should be done in the consuming crate, i.e., `pageserver`.
+#[serde_as]
+#[derive(Clone, Debug, serde::Deserialize, serde::Serialize)]
+#[serde(default, deny_unknown_fields)]
+pub struct ConfigToml {
+    // types mapped 1:1 into the runtime PageServerConfig type
+    pub listen_pg_addr: String,
+    pub listen_http_addr: String,
+    pub availability_zone: Option<String>,
+    #[serde(with = "humantime_serde")]
+    pub wait_lsn_timeout: Duration,
+    #[serde(with = "humantime_serde")]
+    pub wal_redo_timeout: Duration,
+    pub superuser: String,
+    pub page_cache_size: usize,
+    pub max_file_descriptors: usize,
+    pub pg_distrib_dir: Option<Utf8PathBuf>,
+    #[serde_as(as = "serde_with::DisplayFromStr")]
+    pub http_auth_type: AuthType,
+    #[serde_as(as = "serde_with::DisplayFromStr")]
+    pub pg_auth_type: AuthType,
+    pub auth_validation_public_key_path: Option<Utf8PathBuf>,
+    pub remote_storage: Option<RemoteStorageConfig>,
+    pub tenant_config: TenantConfigToml,
+    #[serde_as(as = "serde_with::DisplayFromStr")]
+    pub broker_endpoint: storage_broker::Uri,
+    #[serde(with = "humantime_serde")]
+    pub broker_keepalive_interval: Duration,
+    #[serde_as(as = "serde_with::DisplayFromStr")]
+    pub log_format: LogFormat,
+    pub concurrent_tenant_warmup: NonZeroUsize,
+    pub concurrent_tenant_size_logical_size_queries: NonZeroUsize,
+    #[serde(with = "humantime_serde")]
+    pub metric_collection_interval: Duration,
+    pub metric_collection_endpoint: Option<reqwest::Url>,
+    pub metric_collection_bucket: Option<RemoteStorageConfig>,
+    #[serde(with = "humantime_serde")]
+    pub synthetic_size_calculation_interval: Duration,
+    pub disk_usage_based_eviction: Option<DiskUsageEvictionTaskConfig>,
+    pub test_remote_failures: u64,
+    pub ondemand_download_behavior_treat_error_as_warn: bool,
+    #[serde(with = "humantime_serde")]
+    pub background_task_maximum_delay: Duration,
+    pub control_plane_api: Option<reqwest::Url>,
+    pub control_plane_api_token: Option<String>,
+    pub control_plane_emergency_mode: bool,
+    pub heatmap_upload_concurrency: usize,
+    pub secondary_download_concurrency: usize,
+    pub virtual_file_io_engine: Option<crate::models::virtual_file::IoEngineKind>,
+    pub ingest_batch_size: u64,
+    pub max_vectored_read_bytes: MaxVectoredReadBytes,
+    pub image_compression: ImageCompressionAlgorithm,
+    pub ephemeral_bytes_per_memory_kb: usize,
+    pub l0_flush: Option<crate::models::L0FlushConfig>,
+    pub compact_level0_phase1_value_access: CompactL0Phase1ValueAccess,
+    pub virtual_file_direct_io: crate::models::virtual_file::DirectIoMode,
+    pub io_buffer_alignment: usize,
+}
+
+#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
+#[serde(deny_unknown_fields)]
+pub struct DiskUsageEvictionTaskConfig {
+    pub max_usage_pct: utils::serde_percent::Percent,
+    pub min_avail_bytes: u64,
+    #[serde(with = "humantime_serde")]
+    pub period: Duration,
+    #[cfg(feature = "testing")]
+    pub mock_statvfs: Option<statvfs::mock::Behavior>,
+    /// Select sorting for evicted layers
+    #[serde(default)]
+    pub eviction_order: EvictionOrder,
+}
+
+pub mod statvfs {
+    pub mod mock {
+        #[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
+        #[serde(tag = "type")]
+        pub enum Behavior {
+            Success {
+                blocksize: u64,
+                total_blocks: u64,
+                name_filter: Option<utils::serde_regex::Regex>,
+            },
+            #[cfg(feature = "testing")]
+            Failure { mocked_error: MockedError },
+        }
+
+        #[cfg(feature = "testing")]
+        #[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
+        #[allow(clippy::upper_case_acronyms)]
+        pub enum MockedError {
+            EIO,
+        }
+
+        #[cfg(feature = "testing")]
+        impl From<MockedError> for nix::Error {
+            fn from(e: MockedError) -> Self {
+                match e {
+                    MockedError::EIO => nix::Error::EIO,
+                }
+            }
+        }
+    }
+}
+
+#[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
+#[serde(tag = "type", content = "args")]
+pub enum EvictionOrder {
+    RelativeAccessed {
+        highest_layer_count_loses_first: bool,
+    },
+}
+
+impl Default for EvictionOrder {
+    fn default() -> Self {
+        Self::RelativeAccessed {
+            highest_layer_count_loses_first: true,
+        }
+    }
+}
+
+#[derive(
+    Eq,
+    PartialEq,
+    Debug,
+    Copy,
+    Clone,
+    strum_macros::EnumString,
+    strum_macros::Display,
+    serde_with::DeserializeFromStr,
+    serde_with::SerializeDisplay,
+)]
+#[strum(serialize_all = "kebab-case")]
+pub enum GetVectoredImpl {
+    Sequential,
+    Vectored,
+}
+
+#[derive(
+    Eq,
+    PartialEq,
+    Debug,
+    Copy,
+    Clone,
+    strum_macros::EnumString,
+    strum_macros::Display,
+    serde_with::DeserializeFromStr,
+    serde_with::SerializeDisplay,
+)]
+#[strum(serialize_all = "kebab-case")]
+pub enum GetImpl {
+    Legacy,
+    Vectored,
+}
+
+#[derive(Copy, Clone, Debug, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
+#[serde(transparent)]
+pub struct MaxVectoredReadBytes(pub NonZeroUsize);
+
+#[derive(Debug, PartialEq, Eq, Clone, serde::Deserialize, serde::Serialize)]
+#[serde(tag = "mode", rename_all = "kebab-case", deny_unknown_fields)]
+pub enum CompactL0Phase1ValueAccess {
+    /// The old way.
+    PageCachedBlobIo,
+    /// The new way.
+    StreamingKmerge {
+        /// If set, we run both the old way and the new way, validate that
+        /// they are identical (=> [`CompactL0BypassPageCacheValidation`]),
+        /// and if the validation fails,
+        /// - in tests: fail them with a panic or
+        /// - in prod, log a rate-limited warning and use the old way's results.
+        ///
+        /// If not set, we only run the new way and trust its results.
+        validate: Option<CompactL0BypassPageCacheValidation>,
+    },
+}
+
+/// See [`CompactL0Phase1ValueAccess::StreamingKmerge`].
+#[derive(Debug, PartialEq, Eq, Clone, serde::Deserialize, serde::Serialize)]
+#[serde(rename_all = "kebab-case")]
+pub enum CompactL0BypassPageCacheValidation {
+    /// Validate that the series of (key, lsn) pairs are the same.
+    KeyLsn,
+    /// Validate that the entire output of old and new way is identical.
+    KeyLsnValue,
+}
+
+impl Default for CompactL0Phase1ValueAccess {
+    fn default() -> Self {
+        CompactL0Phase1ValueAccess::StreamingKmerge {
+            // TODO(https://github.com/neondatabase/neon/issues/8184): change to None once confident
+            validate: Some(CompactL0BypassPageCacheValidation::KeyLsnValue),
+        }
+    }
+}
+
+/// A tenant's calculated configuration, which is the result of merging a
+/// tenant's TenantConfOpt with the global TenantConf from PageServerConf.
+///
+/// For storing and transmitting individual tenant's configuration, see
+/// TenantConfOpt.
+#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
+#[serde(deny_unknown_fields, default)]
+pub struct TenantConfigToml {
+    // Flush out an inmemory layer, if it's holding WAL older than this
+    // This puts a backstop on how much WAL needs to be re-digested if the
+    // page server crashes.
+    // This parameter actually determines L0 layer file size.
+    pub checkpoint_distance: u64,
+    // Inmemory layer is also flushed at least once in checkpoint_timeout to
+    // eventually upload WAL after activity is stopped.
+    #[serde(with = "humantime_serde")]
+    pub checkpoint_timeout: Duration,
+    // Target file size, when creating image and delta layers.
+    // This parameter determines L1 layer file size.
+    pub compaction_target_size: u64,
+    // How often to check if there's compaction work to be done.
+    // Duration::ZERO means automatic compaction is disabled.
+    #[serde(with = "humantime_serde")]
+    pub compaction_period: Duration,
+    // Level0 delta layer threshold for compaction.
+    pub compaction_threshold: usize,
+    pub compaction_algorithm: crate::models::CompactionAlgorithmSettings,
+    // Determines how much history is retained, to allow
+    // branching and read replicas at an older point in time.
+    // The unit is #of bytes of WAL.
+    // Page versions older than this are garbage collected away.
+    pub gc_horizon: u64,
+    // Interval at which garbage collection is triggered.
+    // Duration::ZERO means automatic GC is disabled
+    #[serde(with = "humantime_serde")]
+    pub gc_period: Duration,
+    // Delta layer churn threshold to create L1 image layers.
+    pub image_creation_threshold: usize,
+    // Determines how much history is retained, to allow
+    // branching and read replicas at an older point in time.
+    // The unit is time.
+    // Page versions older than this are garbage collected away.
+    #[serde(with = "humantime_serde")]
+    pub pitr_interval: Duration,
+    /// Maximum amount of time to wait while opening a connection to receive wal, before erroring.
+    #[serde(with = "humantime_serde")]
+    pub walreceiver_connect_timeout: Duration,
+    /// Considers safekeepers stalled after no WAL updates were received longer than this threshold.
+    /// A stalled safekeeper will be changed to a newer one when it appears.
+    #[serde(with = "humantime_serde")]
+    pub lagging_wal_timeout: Duration,
+    /// Considers safekeepers lagging when their WAL is behind another safekeeper for more than this threshold.
+    /// A lagging safekeeper will be changed after `lagging_wal_timeout` time elapses since the last WAL update,
+    /// to avoid eager reconnects.
+    pub max_lsn_wal_lag: NonZeroU64,
+    pub eviction_policy: crate::models::EvictionPolicy,
+    pub min_resident_size_override: Option<u64>,
+    // See the corresponding metric's help string.
+    #[serde(with = "humantime_serde")]
+    pub evictions_low_residence_duration_metric_threshold: Duration,
+
+    /// If non-zero, the period between uploads of a heatmap from attached tenants.  This
+    /// may be disabled if a Tenant will not have secondary locations: only secondary
+    /// locations will use the heatmap uploaded by attached locations.
+    #[serde(with = "humantime_serde")]
+    pub heatmap_period: Duration,
+
+    /// If true then SLRU segments are downloaded on demand, if false SLRU segments are included in basebackup
+    pub lazy_slru_download: bool,
+
+    pub timeline_get_throttle: crate::models::ThrottleConfig,
+
+    // How much WAL must be ingested before checking again whether a new image layer is required.
+    // Expressed in multiples of checkpoint distance.
+    pub image_layer_creation_check_threshold: u8,
+
+    /// Switch to a new aux file policy. Switching this flag requires the user has not written any aux file into
+    /// the storage before, and this flag cannot be switched back. Otherwise there will be data corruptions.
+    /// There is a `last_aux_file_policy` flag which gets persisted in `index_part.json` once the first aux
+    /// file is written.
+    pub switch_aux_file_policy: crate::models::AuxFilePolicy,
+
+    /// The length for an explicit LSN lease request.
+    /// Layers needed to reconstruct pages at LSN will not be GC-ed during this interval.
+    #[serde(with = "humantime_serde")]
+    pub lsn_lease_length: Duration,
+
+    /// The length for an implicit LSN lease granted as part of `get_lsn_by_timestamp` request.
+    /// Layers needed to reconstruct pages at LSN will not be GC-ed during this interval.
+    #[serde(with = "humantime_serde")]
+    pub lsn_lease_length_for_ts: Duration,
+}
+
+pub mod defaults {
+    use crate::models::ImageCompressionAlgorithm;
+
+    pub use storage_broker::DEFAULT_ENDPOINT as BROKER_DEFAULT_ENDPOINT;
+
+    pub const DEFAULT_WAIT_LSN_TIMEOUT: &str = "300 s";
+    pub const DEFAULT_WAL_REDO_TIMEOUT: &str = "60 s";
+
+    pub const DEFAULT_SUPERUSER: &str = "cloud_admin";
+
+    pub const DEFAULT_PAGE_CACHE_SIZE: usize = 8192;
+    pub const DEFAULT_MAX_FILE_DESCRIPTORS: usize = 100;
+
+    pub const DEFAULT_LOG_FORMAT: &str = "plain";
+
+    pub const DEFAULT_CONCURRENT_TENANT_WARMUP: usize = 8;
+
+    pub const DEFAULT_CONCURRENT_TENANT_SIZE_LOGICAL_SIZE_QUERIES: usize = 1;
+
+    pub const DEFAULT_METRIC_COLLECTION_INTERVAL: &str = "10 min";
+    pub const DEFAULT_METRIC_COLLECTION_ENDPOINT: Option<reqwest::Url> = None;
+    pub const DEFAULT_SYNTHETIC_SIZE_CALCULATION_INTERVAL: &str = "10 min";
+    pub const DEFAULT_BACKGROUND_TASK_MAXIMUM_DELAY: &str = "10s";
+
+    pub const DEFAULT_HEATMAP_UPLOAD_CONCURRENCY: usize = 8;
+    pub const DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY: usize = 1;
+
+    pub const DEFAULT_INGEST_BATCH_SIZE: u64 = 100;
+
+    pub const DEFAULT_MAX_VECTORED_READ_BYTES: usize = 128 * 1024; // 128 KiB
+
+    pub const DEFAULT_IMAGE_COMPRESSION: ImageCompressionAlgorithm =
+        ImageCompressionAlgorithm::Zstd { level: Some(1) };
+
+    pub const DEFAULT_VALIDATE_VECTORED_GET: bool = false;
+
+    pub const DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB: usize = 0;
+
+    pub const DEFAULT_IO_BUFFER_ALIGNMENT: usize = 512;
+}
+
+impl Default for ConfigToml {
+    fn default() -> Self {
+        use defaults::*;
+
+        Self {
+            listen_pg_addr: (DEFAULT_PG_LISTEN_ADDR.to_string()),
+            listen_http_addr: (DEFAULT_HTTP_LISTEN_ADDR.to_string()),
+            availability_zone: (None),
+            wait_lsn_timeout: (humantime::parse_duration(DEFAULT_WAIT_LSN_TIMEOUT)
+                .expect("cannot parse default wait lsn timeout")),
+            wal_redo_timeout: (humantime::parse_duration(DEFAULT_WAL_REDO_TIMEOUT)
+                .expect("cannot parse default wal redo timeout")),
+            superuser: (DEFAULT_SUPERUSER.to_string()),
+            page_cache_size: (DEFAULT_PAGE_CACHE_SIZE),
+            max_file_descriptors: (DEFAULT_MAX_FILE_DESCRIPTORS),
+            pg_distrib_dir: None, // Utf8PathBuf::from("./pg_install"), // TODO: formerly, this was std::env::current_dir()
+            http_auth_type: (AuthType::Trust),
+            pg_auth_type: (AuthType::Trust),
+            auth_validation_public_key_path: (None),
+            remote_storage: None,
+            broker_endpoint: (storage_broker::DEFAULT_ENDPOINT
+                .parse()
+                .expect("failed to parse default broker endpoint")),
+            broker_keepalive_interval: (humantime::parse_duration(
+                storage_broker::DEFAULT_KEEPALIVE_INTERVAL,
+            )
+            .expect("cannot parse default keepalive interval")),
+            log_format: (LogFormat::from_str(DEFAULT_LOG_FORMAT).unwrap()),
+
+            concurrent_tenant_warmup: (NonZeroUsize::new(DEFAULT_CONCURRENT_TENANT_WARMUP)
+                .expect("Invalid default constant")),
+            concurrent_tenant_size_logical_size_queries: NonZeroUsize::new(1).unwrap(),
+            metric_collection_interval: (humantime::parse_duration(
+                DEFAULT_METRIC_COLLECTION_INTERVAL,
+            )
+            .expect("cannot parse default metric collection interval")),
+            synthetic_size_calculation_interval: (humantime::parse_duration(
+                DEFAULT_SYNTHETIC_SIZE_CALCULATION_INTERVAL,
+            )
+            .expect("cannot parse default synthetic size calculation interval")),
+            metric_collection_endpoint: (DEFAULT_METRIC_COLLECTION_ENDPOINT),
+
+            metric_collection_bucket: (None),
+
+            disk_usage_based_eviction: (None),
+
+            test_remote_failures: (0),
+
+            ondemand_download_behavior_treat_error_as_warn: (false),
+
+            background_task_maximum_delay: (humantime::parse_duration(
+                DEFAULT_BACKGROUND_TASK_MAXIMUM_DELAY,
+            )
+            .unwrap()),
+
+            control_plane_api: (None),
+            control_plane_api_token: (None),
+            control_plane_emergency_mode: (false),
+
+            heatmap_upload_concurrency: (DEFAULT_HEATMAP_UPLOAD_CONCURRENCY),
+            secondary_download_concurrency: (DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY),
+
+            ingest_batch_size: (DEFAULT_INGEST_BATCH_SIZE),
+
+            virtual_file_io_engine: None,
+
+            max_vectored_read_bytes: (MaxVectoredReadBytes(
+                NonZeroUsize::new(DEFAULT_MAX_VECTORED_READ_BYTES).unwrap(),
+            )),
+            image_compression: (DEFAULT_IMAGE_COMPRESSION),
+            ephemeral_bytes_per_memory_kb: (DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB),
+            l0_flush: None,
+            compact_level0_phase1_value_access: CompactL0Phase1ValueAccess::default(),
+            virtual_file_direct_io: crate::models::virtual_file::DirectIoMode::default(),
+
+            io_buffer_alignment: DEFAULT_IO_BUFFER_ALIGNMENT,
+
+            tenant_config: TenantConfigToml::default(),
+        }
+    }
+}
+
+pub mod tenant_conf_defaults {
+
+    // FIXME: This current value is very low. I would imagine something like 1 GB or 10 GB
+    // would be more appropriate. But a low value forces the code to be exercised more,
+    // which is good for now to trigger bugs.
+    // This parameter actually determines L0 layer file size.
+    pub const DEFAULT_CHECKPOINT_DISTANCE: u64 = 256 * 1024 * 1024;
+    pub const DEFAULT_CHECKPOINT_TIMEOUT: &str = "10 m";
+
+    // FIXME the below configs are only used by legacy algorithm. The new algorithm
+    // has different parameters.
+
+    // Target file size, when creating image and delta layers.
+    // This parameter determines L1 layer file size.
+    pub const DEFAULT_COMPACTION_TARGET_SIZE: u64 = 128 * 1024 * 1024;
+
+    pub const DEFAULT_COMPACTION_PERIOD: &str = "20 s";
+    pub const DEFAULT_COMPACTION_THRESHOLD: usize = 10;
+    pub const DEFAULT_COMPACTION_ALGORITHM: crate::models::CompactionAlgorithm =
+        crate::models::CompactionAlgorithm::Legacy;
+
+    pub const DEFAULT_GC_HORIZON: u64 = 64 * 1024 * 1024;
+
+    // Large DEFAULT_GC_PERIOD is fine as long as PITR_INTERVAL is larger.
+    // If there's a need to decrease this value, first make sure that GC
+    // doesn't hold a layer map write lock for non-trivial operations.
+    // Relevant: https://github.com/neondatabase/neon/issues/3394
+    pub const DEFAULT_GC_PERIOD: &str = "1 hr";
+    pub const DEFAULT_IMAGE_CREATION_THRESHOLD: usize = 3;
+    pub const DEFAULT_PITR_INTERVAL: &str = "7 days";
+    pub const DEFAULT_WALRECEIVER_CONNECT_TIMEOUT: &str = "10 seconds";
+    pub const DEFAULT_WALRECEIVER_LAGGING_WAL_TIMEOUT: &str = "10 seconds";
+    // The default limit on WAL lag should be set to avoid causing disconnects under high throughput
+    // scenarios: since the broker stats are updated ~1/s, a value of 1GiB should be sufficient for
+    // throughputs up to 1GiB/s per timeline.
+    pub const DEFAULT_MAX_WALRECEIVER_LSN_WAL_LAG: u64 = 1024 * 1024 * 1024;
+    pub const DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD: &str = "24 hour";
+    // By default ingest enough WAL for two new L0 layers before checking if new image
+    // layers should be created.
+    pub const DEFAULT_IMAGE_LAYER_CREATION_CHECK_THRESHOLD: u8 = 2;
+
+    pub const DEFAULT_INGEST_BATCH_SIZE: u64 = 100;
+}
+
+impl Default for TenantConfigToml {
+    fn default() -> Self {
+        use tenant_conf_defaults::*;
+        Self {
+            checkpoint_distance: DEFAULT_CHECKPOINT_DISTANCE,
+            checkpoint_timeout: humantime::parse_duration(DEFAULT_CHECKPOINT_TIMEOUT)
+                .expect("cannot parse default checkpoint timeout"),
+            compaction_target_size: DEFAULT_COMPACTION_TARGET_SIZE,
+            compaction_period: humantime::parse_duration(DEFAULT_COMPACTION_PERIOD)
+                .expect("cannot parse default compaction period"),
+            compaction_threshold: DEFAULT_COMPACTION_THRESHOLD,
+            compaction_algorithm: crate::models::CompactionAlgorithmSettings {
+                kind: DEFAULT_COMPACTION_ALGORITHM,
+            },
+            gc_horizon: DEFAULT_GC_HORIZON,
+            gc_period: humantime::parse_duration(DEFAULT_GC_PERIOD)
+                .expect("cannot parse default gc period"),
+            image_creation_threshold: DEFAULT_IMAGE_CREATION_THRESHOLD,
+            pitr_interval: humantime::parse_duration(DEFAULT_PITR_INTERVAL)
+                .expect("cannot parse default PITR interval"),
+            walreceiver_connect_timeout: humantime::parse_duration(
+                DEFAULT_WALRECEIVER_CONNECT_TIMEOUT,
+            )
+            .expect("cannot parse default walreceiver connect timeout"),
+            lagging_wal_timeout: humantime::parse_duration(DEFAULT_WALRECEIVER_LAGGING_WAL_TIMEOUT)
+                .expect("cannot parse default walreceiver lagging wal timeout"),
+            max_lsn_wal_lag: NonZeroU64::new(DEFAULT_MAX_WALRECEIVER_LSN_WAL_LAG)
+                .expect("cannot parse default max walreceiver Lsn wal lag"),
+            eviction_policy: crate::models::EvictionPolicy::NoEviction,
+            min_resident_size_override: None,
+            evictions_low_residence_duration_metric_threshold: humantime::parse_duration(
+                DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD,
+            )
+            .expect("cannot parse default evictions_low_residence_duration_metric_threshold"),
+            heatmap_period: Duration::ZERO,
+            lazy_slru_download: false,
+            timeline_get_throttle: crate::models::ThrottleConfig::disabled(),
+            image_layer_creation_check_threshold: DEFAULT_IMAGE_LAYER_CREATION_CHECK_THRESHOLD,
+            switch_aux_file_policy: crate::models::AuxFilePolicy::default_tenant_config(),
+            lsn_lease_length: LsnLease::DEFAULT_LENGTH,
+            lsn_lease_length_for_ts: LsnLease::DEFAULT_LENGTH_FOR_TS,
+        }
+    }
+}
diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs
index 87e8f8305a..d13d04eb1b 100644
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -6,6 +6,7 @@ pub use utilization::PageserverUtilization;
 
 use std::{
     collections::HashMap,
+    fmt::Display,
     io::{BufRead, Read},
     num::{NonZeroU32, NonZeroU64, NonZeroUsize},
     str::FromStr,
@@ -435,7 +436,9 @@ pub enum CompactionAlgorithm {
     Tiered,
 }
 
-#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
+#[derive(
+    Debug, Clone, Copy, PartialEq, Eq, serde_with::DeserializeFromStr, serde_with::SerializeDisplay,
+)]
 pub enum ImageCompressionAlgorithm {
     // Disabled for writes, support decompressing during read path
     Disabled,
@@ -470,11 +473,33 @@ impl FromStr for ImageCompressionAlgorithm {
     }
 }
 
+impl Display for ImageCompressionAlgorithm {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        match self {
+            ImageCompressionAlgorithm::Disabled => write!(f, "disabled"),
+            ImageCompressionAlgorithm::Zstd { level } => {
+                if let Some(level) = level {
+                    write!(f, "zstd({})", level)
+                } else {
+                    write!(f, "zstd")
+                }
+            }
+        }
+    }
+}
+
 #[derive(Eq, PartialEq, Debug, Clone, Serialize, Deserialize)]
 pub struct CompactionAlgorithmSettings {
     pub kind: CompactionAlgorithm,
 }
 
+#[derive(Debug, PartialEq, Eq, Clone, serde::Deserialize, serde::Serialize)]
+#[serde(tag = "mode", rename_all = "kebab-case", deny_unknown_fields)]
+pub enum L0FlushConfig {
+    #[serde(rename_all = "snake_case")]
+    Direct { max_concurrency: NonZeroUsize },
+}
+
 #[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
 pub struct EvictionPolicyLayerAccessThreshold {
     #[serde(with = "humantime_serde")]
@@ -1656,21 +1681,33 @@ mod tests {
     #[test]
     fn test_image_compression_algorithm_parsing() {
         use ImageCompressionAlgorithm::*;
-        assert_eq!(
-            ImageCompressionAlgorithm::from_str("disabled").unwrap(),
-            Disabled
-        );
-        assert_eq!(
-            ImageCompressionAlgorithm::from_str("zstd").unwrap(),
-            Zstd { level: None }
-        );
-        assert_eq!(
-            ImageCompressionAlgorithm::from_str("zstd(18)").unwrap(),
-            Zstd { level: Some(18) }
-        );
-        assert_eq!(
-            ImageCompressionAlgorithm::from_str("zstd(-3)").unwrap(),
-            Zstd { level: Some(-3) }
-        );
+        let cases = [
+            ("disabled", Disabled),
+            ("zstd", Zstd { level: None }),
+            ("zstd(18)", Zstd { level: Some(18) }),
+            ("zstd(-3)", Zstd { level: Some(-3) }),
+        ];
+
+        for (display, expected) in cases {
+            assert_eq!(
+                ImageCompressionAlgorithm::from_str(display).unwrap(),
+                expected,
+                "parsing works"
+            );
+            assert_eq!(format!("{expected}"), display, "Display FromStr roundtrip");
+
+            let ser = serde_json::to_string(&expected).expect("serialization");
+            assert_eq!(
+                serde_json::from_str::<ImageCompressionAlgorithm>(&ser).unwrap(),
+                expected,
+                "serde roundtrip"
+            );
+
+            assert_eq!(
+                serde_json::Value::String(display.to_string()),
+                serde_json::to_value(expected).unwrap(),
+                "Display is the serde serialization"
+            );
+        }
     }
 }
diff --git a/libs/remote_storage/src/config.rs b/libs/remote_storage/src/config.rs
index fa3f2cba58..f819a1572a 100644
--- a/libs/remote_storage/src/config.rs
+++ b/libs/remote_storage/src/config.rs
@@ -235,6 +235,31 @@ timeout = '5s'";
         );
     }
 
+    #[test]
+    fn test_storage_class_serde_roundtrip() {
+        let classes = [
+            None,
+            Some(StorageClass::Standard),
+            Some(StorageClass::IntelligentTiering),
+        ];
+        for class in classes {
+            #[derive(Serialize, Deserialize)]
+            struct Wrapper {
+                #[serde(
+                    deserialize_with = "deserialize_storage_class",
+                    serialize_with = "serialize_storage_class"
+                )]
+                class: Option<StorageClass>,
+            }
+            let wrapped = Wrapper {
+                class: class.clone(),
+            };
+            let serialized = serde_json::to_string(&wrapped).unwrap();
+            let deserialized: Wrapper = serde_json::from_str(&serialized).unwrap();
+            assert_eq!(class, deserialized.class);
+        }
+    }
+
     #[test]
     fn test_azure_parsing() {
         let toml = "\
diff --git a/libs/utils/src/logging.rs b/libs/utils/src/logging.rs
index f7b73dc984..71af43a4da 100644
--- a/libs/utils/src/logging.rs
+++ b/libs/utils/src/logging.rs
@@ -5,7 +5,9 @@ use metrics::{IntCounter, IntCounterVec};
 use once_cell::sync::Lazy;
 use strum_macros::{EnumString, EnumVariantNames};
 
-#[derive(EnumString, EnumVariantNames, Eq, PartialEq, Debug, Clone, Copy)]
+#[derive(
+    EnumString, strum_macros::Display, EnumVariantNames, Eq, PartialEq, Debug, Clone, Copy,
+)]
 #[strum(serialize_all = "snake_case")]
 pub enum LogFormat {
     Plain,
@@ -274,6 +276,14 @@ impl From<String> for SecretString {
     }
 }
 
+impl FromStr for SecretString {
+    type Err = std::convert::Infallible;
+
+    fn from_str(s: &str) -> Result<Self, Self::Err> {
+        Ok(Self(s.to_string()))
+    }
+}
+
 impl std::fmt::Debug for SecretString {
     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
         write!(f, "[SECRET]")
diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml
index 9c02ce3fbc..24373afca3 100644
--- a/pageserver/Cargo.toml
+++ b/pageserver/Cargo.toml
@@ -8,7 +8,7 @@ license.workspace = true
 default = []
 # Enables test-only APIs, incuding failpoints. In particular, enables the `fail_point!` macro,
 # which adds some runtime cost to run tests on outage conditions
-testing = ["fail/failpoints"]
+testing = ["fail/failpoints", "pageserver_api/testing" ]
 
 [dependencies]
 anyhow.workspace = true
@@ -101,6 +101,7 @@ procfs.workspace = true
 criterion.workspace = true
 hex-literal.workspace = true
 tokio = { workspace = true, features = ["process", "sync", "fs", "rt", "io-util", "time", "test-util"] }
+indoc.workspace = true
 
 [[bench]]
 name = "bench_layer_map"
diff --git a/pageserver/benches/bench_ingest.rs b/pageserver/benches/bench_ingest.rs
index 1be4391d81..72cbb6beab 100644
--- a/pageserver/benches/bench_ingest.rs
+++ b/pageserver/benches/bench_ingest.rs
@@ -4,7 +4,7 @@ use bytes::Bytes;
 use camino::Utf8PathBuf;
 use criterion::{criterion_group, criterion_main, Criterion};
 use pageserver::{
-    config::{defaults::DEFAULT_IO_BUFFER_ALIGNMENT, PageServerConf},
+    config::PageServerConf,
     context::{DownloadBehavior, RequestContext},
     l0_flush::{L0FlushConfig, L0FlushGlobalState},
     page_cache,
@@ -167,7 +167,7 @@ fn criterion_benchmark(c: &mut Criterion) {
     virtual_file::init(
         16384,
         virtual_file::io_engine_for_bench(),
-        DEFAULT_IO_BUFFER_ALIGNMENT,
+        pageserver_api::config::defaults::DEFAULT_IO_BUFFER_ALIGNMENT,
     );
     page_cache::init(conf.page_cache_size);
 
diff --git a/pageserver/ctl/src/layer_map_analyzer.rs b/pageserver/ctl/src/layer_map_analyzer.rs
index 8092c203c3..a07107753e 100644
--- a/pageserver/ctl/src/layer_map_analyzer.rs
+++ b/pageserver/ctl/src/layer_map_analyzer.rs
@@ -4,7 +4,6 @@
 
 use anyhow::Result;
 use camino::{Utf8Path, Utf8PathBuf};
-use pageserver::config::defaults::DEFAULT_IO_BUFFER_ALIGNMENT;
 use pageserver::context::{DownloadBehavior, RequestContext};
 use pageserver::task_mgr::TaskKind;
 use pageserver::tenant::{TENANTS_SEGMENT_NAME, TIMELINES_SEGMENT_NAME};
@@ -148,7 +147,7 @@ pub(crate) async fn main(cmd: &AnalyzeLayerMapCmd) -> Result<()> {
     pageserver::virtual_file::init(
         10,
         virtual_file::api::IoEngineKind::StdFs,
-        DEFAULT_IO_BUFFER_ALIGNMENT,
+        pageserver_api::config::defaults::DEFAULT_IO_BUFFER_ALIGNMENT,
     );
     pageserver::page_cache::init(100);
 
diff --git a/pageserver/ctl/src/layers.rs b/pageserver/ctl/src/layers.rs
index e0f978eaa2..dd753398e2 100644
--- a/pageserver/ctl/src/layers.rs
+++ b/pageserver/ctl/src/layers.rs
@@ -3,7 +3,6 @@ use std::path::{Path, PathBuf};
 use anyhow::Result;
 use camino::{Utf8Path, Utf8PathBuf};
 use clap::Subcommand;
-use pageserver::config::defaults::DEFAULT_IO_BUFFER_ALIGNMENT;
 use pageserver::context::{DownloadBehavior, RequestContext};
 use pageserver::task_mgr::TaskKind;
 use pageserver::tenant::block_io::BlockCursor;
@@ -194,7 +193,7 @@ pub(crate) async fn main(cmd: &LayerCmd) -> Result<()> {
             pageserver::virtual_file::init(
                 10,
                 virtual_file::api::IoEngineKind::StdFs,
-                DEFAULT_IO_BUFFER_ALIGNMENT,
+                pageserver_api::config::defaults::DEFAULT_IO_BUFFER_ALIGNMENT,
             );
             pageserver::page_cache::init(100);
 
diff --git a/pageserver/ctl/src/main.rs b/pageserver/ctl/src/main.rs
index 7a6c7675bb..3b66b0c4aa 100644
--- a/pageserver/ctl/src/main.rs
+++ b/pageserver/ctl/src/main.rs
@@ -20,14 +20,13 @@ use clap::{Parser, Subcommand};
 use index_part::IndexPartCmd;
 use layers::LayerCmd;
 use pageserver::{
-    config::defaults::DEFAULT_IO_BUFFER_ALIGNMENT,
     context::{DownloadBehavior, RequestContext},
     page_cache,
     task_mgr::TaskKind,
     tenant::{dump_layerfile_from_path, metadata::TimelineMetadata},
     virtual_file,
 };
-use pageserver_api::shard::TenantShardId;
+use pageserver_api::{config::defaults::DEFAULT_IO_BUFFER_ALIGNMENT, shard::TenantShardId};
 use postgres_ffi::ControlFileData;
 use remote_storage::{RemotePath, RemoteStorageConfig};
 use tokio_util::sync::CancellationToken;
diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs
index 850bd87b95..2c60e8d7d1 100644
--- a/pageserver/src/bin/pageserver.rs
+++ b/pageserver/src/bin/pageserver.rs
@@ -5,6 +5,7 @@
 use std::env;
 use std::env::{var, VarError};
 use std::io::Read;
+use std::str::FromStr;
 use std::sync::Arc;
 use std::time::Duration;
 
@@ -223,27 +224,15 @@ fn initialize_config(
         }
     };
 
-    let config: toml_edit::Document = match std::fs::File::open(cfg_file_path) {
-        Ok(mut f) => {
-            let md = f.metadata().context("stat config file")?;
-            if md.is_file() {
-                let mut s = String::new();
-                f.read_to_string(&mut s).context("read config file")?;
-                s.parse().context("parse config file toml")?
-            } else {
-                anyhow::bail!("directory entry exists but is not a file: {cfg_file_path}");
-            }
-        }
-        Err(e) => {
-            anyhow::bail!("open pageserver config: {e}: {cfg_file_path}");
-        }
-    };
-
-    debug!("Using pageserver toml: {config}");
-
-    // Construct the runtime representation
-    let conf = PageServerConf::parse_and_validate(identity.id, &config, workdir)
-        .context("Failed to parse pageserver configuration")?;
+    let config_file_contents =
+        std::fs::read_to_string(cfg_file_path).context("read config file from filesystem")?;
+    let config_toml = serde_path_to_error::deserialize(
+        toml_edit::de::Deserializer::from_str(&config_file_contents)
+            .context("build toml deserializer")?,
+    )
+    .context("deserialize config toml")?;
+    let conf = PageServerConf::parse_and_validate(identity.id, config_toml, workdir)
+        .context("runtime-validation of config toml")?;
 
     Ok(Box::leak(Box::new(conf)))
 }
diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs
index 9e4530ba3c..c159b66905 100644
--- a/pageserver/src/config.rs
+++ b/pageserver/src/config.rs
@@ -4,11 +4,13 @@
 //! file, or on the command line.
 //! See also `settings.md` for better description on every parameter.
 
-use anyhow::{anyhow, bail, ensure, Context, Result};
-use pageserver_api::{models::ImageCompressionAlgorithm, shard::TenantShardId};
+use anyhow::{bail, ensure, Context};
+use pageserver_api::models::ImageCompressionAlgorithm;
+use pageserver_api::{
+    config::{DiskUsageEvictionTaskConfig, MaxVectoredReadBytes},
+    shard::TenantShardId,
+};
 use remote_storage::{RemotePath, RemoteStorageConfig};
-use serde::de::IntoDeserializer;
-use serde::{self, Deserialize};
 use std::env;
 use storage_broker::Uri;
 use utils::crashsafe::path_with_suffix_extension;
@@ -17,10 +19,8 @@ use utils::logging::SecretString;
 use once_cell::sync::OnceCell;
 use reqwest::Url;
 use std::num::NonZeroUsize;
-use std::str::FromStr;
 use std::sync::Arc;
 use std::time::Duration;
-use toml_edit::{Document, Item};
 
 use camino::{Utf8Path, Utf8PathBuf};
 use postgres_backend::AuthType;
@@ -29,139 +29,27 @@ use utils::{
     logging::LogFormat,
 };
 
-use crate::l0_flush::L0FlushConfig;
-use crate::tenant::config::TenantConfOpt;
 use crate::tenant::storage_layer::inmemory_layer::IndexEntry;
-use crate::tenant::timeline::compaction::CompactL0Phase1ValueAccess;
-use crate::tenant::vectored_blob_io::MaxVectoredReadBytes;
 use crate::tenant::{TENANTS_SEGMENT_NAME, TIMELINES_SEGMENT_NAME};
-use crate::{disk_usage_eviction_task::DiskUsageEvictionTaskConfig, virtual_file::io_engine};
-use crate::{tenant::config::TenantConf, virtual_file};
+use crate::virtual_file;
+use crate::virtual_file::io_engine;
 use crate::{TENANT_HEATMAP_BASENAME, TENANT_LOCATION_CONFIG_NAME, TIMELINE_DELETE_MARK_SUFFIX};
 
-use self::defaults::DEFAULT_CONCURRENT_TENANT_WARMUP;
-
-use self::defaults::DEFAULT_VIRTUAL_FILE_IO_ENGINE;
-
-pub mod defaults {
-    use crate::tenant::config::defaults::*;
-    use const_format::formatcp;
-
-    pub use pageserver_api::config::{
-        DEFAULT_HTTP_LISTEN_ADDR, DEFAULT_HTTP_LISTEN_PORT, DEFAULT_PG_LISTEN_ADDR,
-        DEFAULT_PG_LISTEN_PORT,
-    };
-    pub use storage_broker::DEFAULT_ENDPOINT as BROKER_DEFAULT_ENDPOINT;
-
-    pub const DEFAULT_WAIT_LSN_TIMEOUT: &str = "300 s";
-    pub const DEFAULT_WAL_REDO_TIMEOUT: &str = "60 s";
-
-    pub const DEFAULT_SUPERUSER: &str = "cloud_admin";
-
-    pub const DEFAULT_PAGE_CACHE_SIZE: usize = 8192;
-    pub const DEFAULT_MAX_FILE_DESCRIPTORS: usize = 100;
-
-    pub const DEFAULT_LOG_FORMAT: &str = "plain";
-
-    pub const DEFAULT_CONCURRENT_TENANT_WARMUP: usize = 8;
-
-    pub const DEFAULT_CONCURRENT_TENANT_SIZE_LOGICAL_SIZE_QUERIES: usize =
-        super::ConfigurableSemaphore::DEFAULT_INITIAL.get();
-
-    pub const DEFAULT_METRIC_COLLECTION_INTERVAL: &str = "10 min";
-    pub const DEFAULT_METRIC_COLLECTION_ENDPOINT: Option<reqwest::Url> = None;
-    pub const DEFAULT_SYNTHETIC_SIZE_CALCULATION_INTERVAL: &str = "10 min";
-    pub const DEFAULT_BACKGROUND_TASK_MAXIMUM_DELAY: &str = "10s";
-
-    pub const DEFAULT_HEATMAP_UPLOAD_CONCURRENCY: usize = 8;
-    pub const DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY: usize = 1;
-
-    pub const DEFAULT_INGEST_BATCH_SIZE: u64 = 100;
-
-    #[cfg(target_os = "linux")]
-    pub const DEFAULT_VIRTUAL_FILE_IO_ENGINE: &str = "tokio-epoll-uring";
-
-    #[cfg(not(target_os = "linux"))]
-    pub const DEFAULT_VIRTUAL_FILE_IO_ENGINE: &str = "std-fs";
-
-    pub const DEFAULT_GET_VECTORED_IMPL: &str = "vectored";
-
-    pub const DEFAULT_GET_IMPL: &str = "vectored";
-
-    pub const DEFAULT_MAX_VECTORED_READ_BYTES: usize = 128 * 1024; // 128 KiB
-
-    pub const DEFAULT_IMAGE_COMPRESSION: &str = "zstd(1)";
-
-    pub const DEFAULT_VALIDATE_VECTORED_GET: bool = false;
-
-    pub const DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB: usize = 0;
-
-    pub const DEFAULT_IO_BUFFER_ALIGNMENT: usize = 512;
-
-    ///
-    /// Default built-in configuration file.
-    ///
-    pub const DEFAULT_CONFIG_FILE: &str = formatcp!(
-        r#"
-# Initial configuration file created by 'pageserver --init'
-#listen_pg_addr = '{DEFAULT_PG_LISTEN_ADDR}'
-#listen_http_addr = '{DEFAULT_HTTP_LISTEN_ADDR}'
-
-#wait_lsn_timeout = '{DEFAULT_WAIT_LSN_TIMEOUT}'
-#wal_redo_timeout = '{DEFAULT_WAL_REDO_TIMEOUT}'
-
-#page_cache_size = {DEFAULT_PAGE_CACHE_SIZE}
-#max_file_descriptors = {DEFAULT_MAX_FILE_DESCRIPTORS}
-
-# initial superuser role name to use when creating a new tenant
-#initial_superuser_name = '{DEFAULT_SUPERUSER}'
-
-#broker_endpoint = '{BROKER_DEFAULT_ENDPOINT}'
-
-#log_format = '{DEFAULT_LOG_FORMAT}'
-
-#concurrent_tenant_size_logical_size_queries = '{DEFAULT_CONCURRENT_TENANT_SIZE_LOGICAL_SIZE_QUERIES}'
-#concurrent_tenant_warmup = '{DEFAULT_CONCURRENT_TENANT_WARMUP}'
-
-#metric_collection_interval = '{DEFAULT_METRIC_COLLECTION_INTERVAL}'
-#synthetic_size_calculation_interval = '{DEFAULT_SYNTHETIC_SIZE_CALCULATION_INTERVAL}'
-
-#disk_usage_based_eviction = {{ max_usage_pct = .., min_avail_bytes = .., period = "10s"}}
-
-#background_task_maximum_delay = '{DEFAULT_BACKGROUND_TASK_MAXIMUM_DELAY}'
-
-#ingest_batch_size = {DEFAULT_INGEST_BATCH_SIZE}
-
-#virtual_file_io_engine = '{DEFAULT_VIRTUAL_FILE_IO_ENGINE}'
-
-#max_vectored_read_bytes = '{DEFAULT_MAX_VECTORED_READ_BYTES}'
-
-[tenant_config]
-#checkpoint_distance = {DEFAULT_CHECKPOINT_DISTANCE} # in bytes
-#checkpoint_timeout = {DEFAULT_CHECKPOINT_TIMEOUT}
-#compaction_target_size = {DEFAULT_COMPACTION_TARGET_SIZE} # in bytes
-#compaction_period = '{DEFAULT_COMPACTION_PERIOD}'
-#compaction_threshold = {DEFAULT_COMPACTION_THRESHOLD}
-
-#gc_period = '{DEFAULT_GC_PERIOD}'
-#gc_horizon = {DEFAULT_GC_HORIZON}
-#image_creation_threshold = {DEFAULT_IMAGE_CREATION_THRESHOLD}
-#pitr_interval = '{DEFAULT_PITR_INTERVAL}'
-
-#min_resident_size_override = .. # in bytes
-#evictions_low_residence_duration_metric_threshold = '{DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD}'
-
-#heatmap_upload_concurrency = {DEFAULT_HEATMAP_UPLOAD_CONCURRENCY}
-#secondary_download_concurrency = {DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY}
-
-#ephemeral_bytes_per_memory_kb = {DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB}
-
-#[remote_storage]
-
-"#
-    );
-}
-
+/// Global state of pageserver.
+///
+/// It's mostly immutable configuration, but some semaphores and the
+/// like crept in over time and the name stuck.
+///
+/// Instantiated by deserializing `pageserver.toml` into [`pageserver_api::config::ConfigToml`]
+/// and passing that to [`PageServerConf::parse_and_validate`].
+///
+/// # Adding a New Field
+///
+/// 1. Add the field to `pageserver_api::config::ConfigToml`.
+/// 2. Fix compiler errors (exhaustive destructuring will guide you).
+///
+/// For fields that require additional validation or filling in of defaults at runtime,
+/// check for examples in the [`PageServerConf::parse_and_validate`] method.
 #[derive(Debug, Clone, PartialEq, Eq)]
 pub struct PageServerConf {
     // Identifier of that particular pageserver so e g safekeepers
@@ -207,7 +95,7 @@ pub struct PageServerConf {
 
     pub remote_storage_config: Option<RemoteStorageConfig>,
 
-    pub default_tenant_conf: TenantConf,
+    pub default_tenant_conf: crate::tenant::config::TenantConf,
 
     /// Storage broker endpoints to connect to.
     pub broker_endpoint: Uri,
@@ -284,11 +172,11 @@ pub struct PageServerConf {
     /// Setting this to zero disables limits on total ephemeral layer size.
     pub ephemeral_bytes_per_memory_kb: usize,
 
-    pub l0_flush: L0FlushConfig,
+    pub l0_flush: crate::l0_flush::L0FlushConfig,
 
     /// This flag is temporary and will be removed after gradual rollout.
     /// See <https://github.com/neondatabase/neon/issues/8184>.
-    pub compact_level0_phase1_value_access: CompactL0Phase1ValueAccess,
+    pub compact_level0_phase1_value_access: pageserver_api::config::CompactL0Phase1ValueAccess,
 
     /// Direct IO settings
     pub virtual_file_direct_io: virtual_file::DirectIoMode,
@@ -304,472 +192,6 @@ pub struct PageServerConf {
 /// startup code to the connection code through a dozen layers.
 pub static SAFEKEEPER_AUTH_TOKEN: OnceCell<Arc<String>> = OnceCell::new();
 
-// use dedicated enum for builder to better indicate the intention
-// and avoid possible confusion with nested options
-#[derive(Clone, Default)]
-pub enum BuilderValue<T> {
-    Set(T),
-    #[default]
-    NotSet,
-}
-
-impl<T: Clone> BuilderValue<T> {
-    pub fn ok_or(&self, field_name: &'static str, default: BuilderValue<T>) -> anyhow::Result<T> {
-        match self {
-            Self::Set(v) => Ok(v.clone()),
-            Self::NotSet => match default {
-                BuilderValue::Set(v) => Ok(v.clone()),
-                BuilderValue::NotSet => {
-                    anyhow::bail!("missing config value {field_name:?}")
-                }
-            },
-        }
-    }
-}
-
-// needed to simplify config construction
-#[derive(Default)]
-struct PageServerConfigBuilder {
-    listen_pg_addr: BuilderValue<String>,
-
-    listen_http_addr: BuilderValue<String>,
-
-    availability_zone: BuilderValue<Option<String>>,
-
-    wait_lsn_timeout: BuilderValue<Duration>,
-    wal_redo_timeout: BuilderValue<Duration>,
-
-    superuser: BuilderValue<String>,
-
-    page_cache_size: BuilderValue<usize>,
-    max_file_descriptors: BuilderValue<usize>,
-
-    workdir: BuilderValue<Utf8PathBuf>,
-
-    pg_distrib_dir: BuilderValue<Utf8PathBuf>,
-
-    http_auth_type: BuilderValue<AuthType>,
-    pg_auth_type: BuilderValue<AuthType>,
-
-    //
-    auth_validation_public_key_path: BuilderValue<Option<Utf8PathBuf>>,
-    remote_storage_config: BuilderValue<Option<RemoteStorageConfig>>,
-
-    broker_endpoint: BuilderValue<Uri>,
-    broker_keepalive_interval: BuilderValue<Duration>,
-
-    log_format: BuilderValue<LogFormat>,
-
-    concurrent_tenant_warmup: BuilderValue<NonZeroUsize>,
-    concurrent_tenant_size_logical_size_queries: BuilderValue<NonZeroUsize>,
-
-    metric_collection_interval: BuilderValue<Duration>,
-    metric_collection_endpoint: BuilderValue<Option<Url>>,
-    synthetic_size_calculation_interval: BuilderValue<Duration>,
-    metric_collection_bucket: BuilderValue<Option<RemoteStorageConfig>>,
-
-    disk_usage_based_eviction: BuilderValue<Option<DiskUsageEvictionTaskConfig>>,
-
-    test_remote_failures: BuilderValue<u64>,
-
-    ondemand_download_behavior_treat_error_as_warn: BuilderValue<bool>,
-
-    background_task_maximum_delay: BuilderValue<Duration>,
-
-    control_plane_api: BuilderValue<Option<Url>>,
-    control_plane_api_token: BuilderValue<Option<SecretString>>,
-    control_plane_emergency_mode: BuilderValue<bool>,
-
-    heatmap_upload_concurrency: BuilderValue<usize>,
-    secondary_download_concurrency: BuilderValue<usize>,
-
-    ingest_batch_size: BuilderValue<u64>,
-
-    virtual_file_io_engine: BuilderValue<virtual_file::IoEngineKind>,
-
-    max_vectored_read_bytes: BuilderValue<MaxVectoredReadBytes>,
-
-    image_compression: BuilderValue<ImageCompressionAlgorithm>,
-
-    ephemeral_bytes_per_memory_kb: BuilderValue<usize>,
-
-    l0_flush: BuilderValue<L0FlushConfig>,
-
-    compact_level0_phase1_value_access: BuilderValue<CompactL0Phase1ValueAccess>,
-
-    virtual_file_direct_io: BuilderValue<virtual_file::DirectIoMode>,
-
-    io_buffer_alignment: BuilderValue<usize>,
-}
-
-impl PageServerConfigBuilder {
-    fn new() -> Self {
-        Self::default()
-    }
-
-    #[inline(always)]
-    fn default_values() -> Self {
-        use self::BuilderValue::*;
-        use defaults::*;
-        Self {
-            listen_pg_addr: Set(DEFAULT_PG_LISTEN_ADDR.to_string()),
-            listen_http_addr: Set(DEFAULT_HTTP_LISTEN_ADDR.to_string()),
-            availability_zone: Set(None),
-            wait_lsn_timeout: Set(humantime::parse_duration(DEFAULT_WAIT_LSN_TIMEOUT)
-                .expect("cannot parse default wait lsn timeout")),
-            wal_redo_timeout: Set(humantime::parse_duration(DEFAULT_WAL_REDO_TIMEOUT)
-                .expect("cannot parse default wal redo timeout")),
-            superuser: Set(DEFAULT_SUPERUSER.to_string()),
-            page_cache_size: Set(DEFAULT_PAGE_CACHE_SIZE),
-            max_file_descriptors: Set(DEFAULT_MAX_FILE_DESCRIPTORS),
-            workdir: Set(Utf8PathBuf::new()),
-            pg_distrib_dir: Set(Utf8PathBuf::from_path_buf(
-                env::current_dir().expect("cannot access current directory"),
-            )
-            .expect("non-Unicode path")
-            .join("pg_install")),
-            http_auth_type: Set(AuthType::Trust),
-            pg_auth_type: Set(AuthType::Trust),
-            auth_validation_public_key_path: Set(None),
-            remote_storage_config: Set(None),
-            broker_endpoint: Set(storage_broker::DEFAULT_ENDPOINT
-                .parse()
-                .expect("failed to parse default broker endpoint")),
-            broker_keepalive_interval: Set(humantime::parse_duration(
-                storage_broker::DEFAULT_KEEPALIVE_INTERVAL,
-            )
-            .expect("cannot parse default keepalive interval")),
-            log_format: Set(LogFormat::from_str(DEFAULT_LOG_FORMAT).unwrap()),
-
-            concurrent_tenant_warmup: Set(NonZeroUsize::new(DEFAULT_CONCURRENT_TENANT_WARMUP)
-                .expect("Invalid default constant")),
-            concurrent_tenant_size_logical_size_queries: Set(
-                ConfigurableSemaphore::DEFAULT_INITIAL,
-            ),
-            metric_collection_interval: Set(humantime::parse_duration(
-                DEFAULT_METRIC_COLLECTION_INTERVAL,
-            )
-            .expect("cannot parse default metric collection interval")),
-            synthetic_size_calculation_interval: Set(humantime::parse_duration(
-                DEFAULT_SYNTHETIC_SIZE_CALCULATION_INTERVAL,
-            )
-            .expect("cannot parse default synthetic size calculation interval")),
-            metric_collection_endpoint: Set(DEFAULT_METRIC_COLLECTION_ENDPOINT),
-
-            metric_collection_bucket: Set(None),
-
-            disk_usage_based_eviction: Set(None),
-
-            test_remote_failures: Set(0),
-
-            ondemand_download_behavior_treat_error_as_warn: Set(false),
-
-            background_task_maximum_delay: Set(humantime::parse_duration(
-                DEFAULT_BACKGROUND_TASK_MAXIMUM_DELAY,
-            )
-            .unwrap()),
-
-            control_plane_api: Set(None),
-            control_plane_api_token: Set(None),
-            control_plane_emergency_mode: Set(false),
-
-            heatmap_upload_concurrency: Set(DEFAULT_HEATMAP_UPLOAD_CONCURRENCY),
-            secondary_download_concurrency: Set(DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY),
-
-            ingest_batch_size: Set(DEFAULT_INGEST_BATCH_SIZE),
-
-            virtual_file_io_engine: Set(DEFAULT_VIRTUAL_FILE_IO_ENGINE.parse().unwrap()),
-
-            max_vectored_read_bytes: Set(MaxVectoredReadBytes(
-                NonZeroUsize::new(DEFAULT_MAX_VECTORED_READ_BYTES).unwrap(),
-            )),
-            image_compression: Set(DEFAULT_IMAGE_COMPRESSION.parse().unwrap()),
-            ephemeral_bytes_per_memory_kb: Set(DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB),
-            l0_flush: Set(L0FlushConfig::default()),
-            compact_level0_phase1_value_access: Set(CompactL0Phase1ValueAccess::default()),
-            virtual_file_direct_io: Set(virtual_file::DirectIoMode::default()),
-            io_buffer_alignment: Set(DEFAULT_IO_BUFFER_ALIGNMENT),
-        }
-    }
-}
-
-impl PageServerConfigBuilder {
-    pub fn listen_pg_addr(&mut self, listen_pg_addr: String) {
-        self.listen_pg_addr = BuilderValue::Set(listen_pg_addr)
-    }
-
-    pub fn listen_http_addr(&mut self, listen_http_addr: String) {
-        self.listen_http_addr = BuilderValue::Set(listen_http_addr)
-    }
-
-    pub fn availability_zone(&mut self, availability_zone: Option<String>) {
-        self.availability_zone = BuilderValue::Set(availability_zone)
-    }
-
-    pub fn wait_lsn_timeout(&mut self, wait_lsn_timeout: Duration) {
-        self.wait_lsn_timeout = BuilderValue::Set(wait_lsn_timeout)
-    }
-
-    pub fn wal_redo_timeout(&mut self, wal_redo_timeout: Duration) {
-        self.wal_redo_timeout = BuilderValue::Set(wal_redo_timeout)
-    }
-
-    pub fn superuser(&mut self, superuser: String) {
-        self.superuser = BuilderValue::Set(superuser)
-    }
-
-    pub fn page_cache_size(&mut self, page_cache_size: usize) {
-        self.page_cache_size = BuilderValue::Set(page_cache_size)
-    }
-
-    pub fn max_file_descriptors(&mut self, max_file_descriptors: usize) {
-        self.max_file_descriptors = BuilderValue::Set(max_file_descriptors)
-    }
-
-    pub fn workdir(&mut self, workdir: Utf8PathBuf) {
-        self.workdir = BuilderValue::Set(workdir)
-    }
-
-    pub fn pg_distrib_dir(&mut self, pg_distrib_dir: Utf8PathBuf) {
-        self.pg_distrib_dir = BuilderValue::Set(pg_distrib_dir)
-    }
-
-    pub fn http_auth_type(&mut self, auth_type: AuthType) {
-        self.http_auth_type = BuilderValue::Set(auth_type)
-    }
-
-    pub fn pg_auth_type(&mut self, auth_type: AuthType) {
-        self.pg_auth_type = BuilderValue::Set(auth_type)
-    }
-
-    pub fn auth_validation_public_key_path(
-        &mut self,
-        auth_validation_public_key_path: Option<Utf8PathBuf>,
-    ) {
-        self.auth_validation_public_key_path = BuilderValue::Set(auth_validation_public_key_path)
-    }
-
-    pub fn remote_storage_config(&mut self, remote_storage_config: Option<RemoteStorageConfig>) {
-        self.remote_storage_config = BuilderValue::Set(remote_storage_config)
-    }
-
-    pub fn broker_endpoint(&mut self, broker_endpoint: Uri) {
-        self.broker_endpoint = BuilderValue::Set(broker_endpoint)
-    }
-
-    pub fn broker_keepalive_interval(&mut self, broker_keepalive_interval: Duration) {
-        self.broker_keepalive_interval = BuilderValue::Set(broker_keepalive_interval)
-    }
-
-    pub fn log_format(&mut self, log_format: LogFormat) {
-        self.log_format = BuilderValue::Set(log_format)
-    }
-
-    pub fn concurrent_tenant_warmup(&mut self, u: NonZeroUsize) {
-        self.concurrent_tenant_warmup = BuilderValue::Set(u);
-    }
-
-    pub fn concurrent_tenant_size_logical_size_queries(&mut self, u: NonZeroUsize) {
-        self.concurrent_tenant_size_logical_size_queries = BuilderValue::Set(u);
-    }
-
-    pub fn metric_collection_interval(&mut self, metric_collection_interval: Duration) {
-        self.metric_collection_interval = BuilderValue::Set(metric_collection_interval)
-    }
-
-    pub fn metric_collection_endpoint(&mut self, metric_collection_endpoint: Option<Url>) {
-        self.metric_collection_endpoint = BuilderValue::Set(metric_collection_endpoint)
-    }
-
-    pub fn metric_collection_bucket(
-        &mut self,
-        metric_collection_bucket: Option<RemoteStorageConfig>,
-    ) {
-        self.metric_collection_bucket = BuilderValue::Set(metric_collection_bucket)
-    }
-
-    pub fn synthetic_size_calculation_interval(
-        &mut self,
-        synthetic_size_calculation_interval: Duration,
-    ) {
-        self.synthetic_size_calculation_interval =
-            BuilderValue::Set(synthetic_size_calculation_interval)
-    }
-
-    pub fn test_remote_failures(&mut self, fail_first: u64) {
-        self.test_remote_failures = BuilderValue::Set(fail_first);
-    }
-
-    pub fn disk_usage_based_eviction(&mut self, value: Option<DiskUsageEvictionTaskConfig>) {
-        self.disk_usage_based_eviction = BuilderValue::Set(value);
-    }
-
-    pub fn ondemand_download_behavior_treat_error_as_warn(
-        &mut self,
-        ondemand_download_behavior_treat_error_as_warn: bool,
-    ) {
-        self.ondemand_download_behavior_treat_error_as_warn =
-            BuilderValue::Set(ondemand_download_behavior_treat_error_as_warn);
-    }
-
-    pub fn background_task_maximum_delay(&mut self, delay: Duration) {
-        self.background_task_maximum_delay = BuilderValue::Set(delay);
-    }
-
-    pub fn control_plane_api(&mut self, api: Option<Url>) {
-        self.control_plane_api = BuilderValue::Set(api)
-    }
-
-    pub fn control_plane_api_token(&mut self, token: Option<SecretString>) {
-        self.control_plane_api_token = BuilderValue::Set(token)
-    }
-
-    pub fn control_plane_emergency_mode(&mut self, enabled: bool) {
-        self.control_plane_emergency_mode = BuilderValue::Set(enabled)
-    }
-
-    pub fn heatmap_upload_concurrency(&mut self, value: usize) {
-        self.heatmap_upload_concurrency = BuilderValue::Set(value)
-    }
-
-    pub fn secondary_download_concurrency(&mut self, value: usize) {
-        self.secondary_download_concurrency = BuilderValue::Set(value)
-    }
-
-    pub fn ingest_batch_size(&mut self, ingest_batch_size: u64) {
-        self.ingest_batch_size = BuilderValue::Set(ingest_batch_size)
-    }
-
-    pub fn virtual_file_io_engine(&mut self, value: virtual_file::IoEngineKind) {
-        self.virtual_file_io_engine = BuilderValue::Set(value);
-    }
-
-    pub fn get_max_vectored_read_bytes(&mut self, value: MaxVectoredReadBytes) {
-        self.max_vectored_read_bytes = BuilderValue::Set(value);
-    }
-
-    pub fn get_image_compression(&mut self, value: ImageCompressionAlgorithm) {
-        self.image_compression = BuilderValue::Set(value);
-    }
-
-    pub fn get_ephemeral_bytes_per_memory_kb(&mut self, value: usize) {
-        self.ephemeral_bytes_per_memory_kb = BuilderValue::Set(value);
-    }
-
-    pub fn l0_flush(&mut self, value: L0FlushConfig) {
-        self.l0_flush = BuilderValue::Set(value);
-    }
-
-    pub fn compact_level0_phase1_value_access(&mut self, value: CompactL0Phase1ValueAccess) {
-        self.compact_level0_phase1_value_access = BuilderValue::Set(value);
-    }
-
-    pub fn virtual_file_direct_io(&mut self, value: virtual_file::DirectIoMode) {
-        self.virtual_file_direct_io = BuilderValue::Set(value);
-    }
-
-    pub fn io_buffer_alignment(&mut self, value: usize) {
-        self.io_buffer_alignment = BuilderValue::Set(value);
-    }
-
-    pub fn build(self, id: NodeId) -> anyhow::Result<PageServerConf> {
-        let default = Self::default_values();
-
-        macro_rules! conf {
-            (USING DEFAULT { $($field:ident,)* } CUSTOM LOGIC { $($custom_field:ident : $custom_value:expr,)* } ) => {
-                PageServerConf {
-                    $(
-                        $field: self.$field.ok_or(stringify!($field), default.$field)?,
-                    )*
-                    $(
-                        $custom_field: $custom_value,
-                    )*
-                }
-            };
-        }
-
-        Ok(conf!(
-            USING DEFAULT
-            {
-                listen_pg_addr,
-                listen_http_addr,
-                availability_zone,
-                wait_lsn_timeout,
-                wal_redo_timeout,
-                superuser,
-                page_cache_size,
-                max_file_descriptors,
-                workdir,
-                pg_distrib_dir,
-                http_auth_type,
-                pg_auth_type,
-                auth_validation_public_key_path,
-                remote_storage_config,
-                broker_endpoint,
-                broker_keepalive_interval,
-                log_format,
-                metric_collection_interval,
-                metric_collection_endpoint,
-                metric_collection_bucket,
-                synthetic_size_calculation_interval,
-                disk_usage_based_eviction,
-                test_remote_failures,
-                ondemand_download_behavior_treat_error_as_warn,
-                background_task_maximum_delay,
-                control_plane_api,
-                control_plane_api_token,
-                control_plane_emergency_mode,
-                heatmap_upload_concurrency,
-                secondary_download_concurrency,
-                ingest_batch_size,
-                max_vectored_read_bytes,
-                image_compression,
-                ephemeral_bytes_per_memory_kb,
-                l0_flush,
-                compact_level0_phase1_value_access,
-                virtual_file_direct_io,
-                io_buffer_alignment,
-            }
-            CUSTOM LOGIC
-            {
-                id: id,
-                // TenantConf is handled separately
-                default_tenant_conf: TenantConf::default(),
-                concurrent_tenant_warmup: ConfigurableSemaphore::new({
-                    self
-                        .concurrent_tenant_warmup
-                        .ok_or("concurrent_tenant_warmpup",
-                               default.concurrent_tenant_warmup)?
-                }),
-                concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore::new(
-                    self
-                        .concurrent_tenant_size_logical_size_queries
-                        .ok_or("concurrent_tenant_size_logical_size_queries",
-                               default.concurrent_tenant_size_logical_size_queries.clone())?
-                ),
-                eviction_task_immitated_concurrent_logical_size_queries: ConfigurableSemaphore::new(
-                    // re-use `concurrent_tenant_size_logical_size_queries`
-                    self
-                        .concurrent_tenant_size_logical_size_queries
-                        .ok_or("eviction_task_immitated_concurrent_logical_size_queries",
-                               default.concurrent_tenant_size_logical_size_queries.clone())?,
-                ),
-                virtual_file_io_engine: match self.virtual_file_io_engine {
-                    BuilderValue::Set(v) => v,
-                    BuilderValue::NotSet => match crate::virtual_file::io_engine_feature_test().context("auto-detect virtual_file_io_engine")? {
-                        io_engine::FeatureTestResult::PlatformPreferred(v) => v, // make no noise
-                        io_engine::FeatureTestResult::Worse { engine, remark } => {
-                            // TODO: bubble this up to the caller so we can tracing::warn! it.
-                            eprintln!("auto-detected IO engine is not platform-preferred: engine={engine:?} remark={remark:?}");
-                            engine
-                        }
-                    },
-                },
-            }
-        ))
-    }
-}
-
 impl PageServerConf {
     //
     // Repository paths, relative to workdir.
@@ -878,134 +300,135 @@ impl PageServerConf {
     ///
     /// This leaves any options not present in the file in the built-in defaults.
     pub fn parse_and_validate(
-        node_id: NodeId,
-        toml: &Document,
+        id: NodeId,
+        config_toml: pageserver_api::config::ConfigToml,
         workdir: &Utf8Path,
     ) -> anyhow::Result<Self> {
-        let mut builder = PageServerConfigBuilder::new();
-        builder.workdir(workdir.to_owned());
+        let pageserver_api::config::ConfigToml {
+            listen_pg_addr,
+            listen_http_addr,
+            availability_zone,
+            wait_lsn_timeout,
+            wal_redo_timeout,
+            superuser,
+            page_cache_size,
+            max_file_descriptors,
+            pg_distrib_dir,
+            http_auth_type,
+            pg_auth_type,
+            auth_validation_public_key_path,
+            remote_storage,
+            broker_endpoint,
+            broker_keepalive_interval,
+            log_format,
+            metric_collection_interval,
+            metric_collection_endpoint,
+            metric_collection_bucket,
+            synthetic_size_calculation_interval,
+            disk_usage_based_eviction,
+            test_remote_failures,
+            ondemand_download_behavior_treat_error_as_warn,
+            background_task_maximum_delay,
+            control_plane_api,
+            control_plane_api_token,
+            control_plane_emergency_mode,
+            heatmap_upload_concurrency,
+            secondary_download_concurrency,
+            ingest_batch_size,
+            max_vectored_read_bytes,
+            image_compression,
+            ephemeral_bytes_per_memory_kb,
+            compact_level0_phase1_value_access,
+            l0_flush,
+            virtual_file_direct_io,
+            concurrent_tenant_warmup,
+            concurrent_tenant_size_logical_size_queries,
+            virtual_file_io_engine,
+            io_buffer_alignment,
+            tenant_config,
+        } = config_toml;
 
-        let mut t_conf = TenantConfOpt::default();
+        let mut conf = PageServerConf {
+            // ------------------------------------------------------------
+            // fields that are already fully validated by the ConfigToml Deserialize impl
+            // ------------------------------------------------------------
+            listen_pg_addr,
+            listen_http_addr,
+            availability_zone,
+            wait_lsn_timeout,
+            wal_redo_timeout,
+            superuser,
+            page_cache_size,
+            max_file_descriptors,
+            http_auth_type,
+            pg_auth_type,
+            auth_validation_public_key_path,
+            remote_storage_config: remote_storage,
+            broker_endpoint,
+            broker_keepalive_interval,
+            log_format,
+            metric_collection_interval,
+            metric_collection_endpoint,
+            metric_collection_bucket,
+            synthetic_size_calculation_interval,
+            disk_usage_based_eviction,
+            test_remote_failures,
+            ondemand_download_behavior_treat_error_as_warn,
+            background_task_maximum_delay,
+            control_plane_api,
+            control_plane_emergency_mode,
+            heatmap_upload_concurrency,
+            secondary_download_concurrency,
+            ingest_batch_size,
+            max_vectored_read_bytes,
+            image_compression,
+            ephemeral_bytes_per_memory_kb,
+            compact_level0_phase1_value_access,
+            virtual_file_direct_io,
+            io_buffer_alignment,
 
-        for (key, item) in toml.iter() {
-            match key {
-                "listen_pg_addr" => builder.listen_pg_addr(parse_toml_string(key, item)?),
-                "listen_http_addr" => builder.listen_http_addr(parse_toml_string(key, item)?),
-                "availability_zone" => builder.availability_zone(Some(parse_toml_string(key, item)?)),
-                "wait_lsn_timeout" => builder.wait_lsn_timeout(parse_toml_duration(key, item)?),
-                "wal_redo_timeout" => builder.wal_redo_timeout(parse_toml_duration(key, item)?),
-                "initial_superuser_name" => builder.superuser(parse_toml_string(key, item)?),
-                "page_cache_size" => builder.page_cache_size(parse_toml_u64(key, item)? as usize),
-                "max_file_descriptors" => {
-                    builder.max_file_descriptors(parse_toml_u64(key, item)? as usize)
-                }
-                "pg_distrib_dir" => {
-                    builder.pg_distrib_dir(Utf8PathBuf::from(parse_toml_string(key, item)?))
-                }
-                "auth_validation_public_key_path" => builder.auth_validation_public_key_path(Some(
-                    Utf8PathBuf::from(parse_toml_string(key, item)?),
-                )),
-                "http_auth_type" => builder.http_auth_type(parse_toml_from_str(key, item)?),
-                "pg_auth_type" => builder.pg_auth_type(parse_toml_from_str(key, item)?),
-                "remote_storage" => {
-                    builder.remote_storage_config(Some(RemoteStorageConfig::from_toml(item).context("remote_storage")?))
-                }
-                "tenant_config" => {
-                    t_conf = TenantConfOpt::try_from(item.to_owned()).context(format!("failed to parse: '{key}'"))?;
-                }
-                "broker_endpoint" => builder.broker_endpoint(parse_toml_string(key, item)?.parse().context("failed to parse broker endpoint")?),
-                "broker_keepalive_interval" => builder.broker_keepalive_interval(parse_toml_duration(key, item)?),
-                "log_format" => builder.log_format(
-                    LogFormat::from_config(&parse_toml_string(key, item)?)?
-                ),
-                "concurrent_tenant_warmup" => builder.concurrent_tenant_warmup({
-                    let input = parse_toml_string(key, item)?;
-                    let permits = input.parse::<usize>().context("expected a number of initial permits, not {s:?}")?;
-                    NonZeroUsize::new(permits).context("initial semaphore permits out of range: 0, use other configuration to disable a feature")?
-                }),
-                "concurrent_tenant_size_logical_size_queries" => builder.concurrent_tenant_size_logical_size_queries({
-                    let input = parse_toml_string(key, item)?;
-                    let permits = input.parse::<usize>().context("expected a number of initial permits, not {s:?}")?;
-                    NonZeroUsize::new(permits).context("initial semaphore permits out of range: 0, use other configuration to disable a feature")?
-                }),
-                "metric_collection_interval" => builder.metric_collection_interval(parse_toml_duration(key, item)?),
-                "metric_collection_endpoint" => {
-                    let endpoint = parse_toml_string(key, item)?.parse().context("failed to parse metric_collection_endpoint")?;
-                    builder.metric_collection_endpoint(Some(endpoint));
-                },
-                "metric_collection_bucket" => {
-                    builder.metric_collection_bucket(Some(RemoteStorageConfig::from_toml(item)?))
-                }
-                "synthetic_size_calculation_interval" =>
-                    builder.synthetic_size_calculation_interval(parse_toml_duration(key, item)?),
-                "test_remote_failures" => builder.test_remote_failures(parse_toml_u64(key, item)?),
-                "disk_usage_based_eviction" => {
-                    tracing::info!("disk_usage_based_eviction: {:#?}", &item);
-                    builder.disk_usage_based_eviction(
-                        deserialize_from_item("disk_usage_based_eviction", item)
-                            .context("parse disk_usage_based_eviction")?
-                    )
-                },
-                "ondemand_download_behavior_treat_error_as_warn" => builder.ondemand_download_behavior_treat_error_as_warn(parse_toml_bool(key, item)?),
-                "background_task_maximum_delay" => builder.background_task_maximum_delay(parse_toml_duration(key, item)?),
-                "control_plane_api" => {
-                    let parsed = parse_toml_string(key, item)?;
-                    if parsed.is_empty() {
-                        builder.control_plane_api(None)
-                    } else {
-                        builder.control_plane_api(Some(parsed.parse().context("failed to parse control plane URL")?))
+            // ------------------------------------------------------------
+            // fields that require additional validation or custom handling
+            // ------------------------------------------------------------
+            workdir: workdir.to_owned(),
+            pg_distrib_dir: pg_distrib_dir.unwrap_or_else(|| {
+                std::env::current_dir()
+                    .expect("current_dir() failed")
+                    .try_into()
+                    .expect("current_dir() is not a valid Utf8Path")
+            }),
+            control_plane_api_token: control_plane_api_token.map(SecretString::from),
+            id,
+            default_tenant_conf: tenant_config,
+            concurrent_tenant_warmup: ConfigurableSemaphore::new(concurrent_tenant_warmup),
+            concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore::new(
+                concurrent_tenant_size_logical_size_queries,
+            ),
+            eviction_task_immitated_concurrent_logical_size_queries: ConfigurableSemaphore::new(
+                // re-use `concurrent_tenant_size_logical_size_queries`
+                concurrent_tenant_size_logical_size_queries,
+            ),
+            virtual_file_io_engine: match virtual_file_io_engine {
+                Some(v) => v,
+                None => match crate::virtual_file::io_engine_feature_test()
+                    .context("auto-detect virtual_file_io_engine")?
+                {
+                    io_engine::FeatureTestResult::PlatformPreferred(v) => v, // make no noise
+                    io_engine::FeatureTestResult::Worse { engine, remark } => {
+                        // TODO: bubble this up to the caller so we can tracing::warn! it.
+                        eprintln!("auto-detected IO engine is not platform-preferred: engine={engine:?} remark={remark:?}");
+                        engine
                     }
                 },
-                "control_plane_api_token" => {
-                    let parsed = parse_toml_string(key, item)?;
-                    if parsed.is_empty() {
-                        builder.control_plane_api_token(None)
-                    } else {
-                        builder.control_plane_api_token(Some(parsed.into()))
-                    }
-                },
-                "control_plane_emergency_mode" => {
-                    builder.control_plane_emergency_mode(parse_toml_bool(key, item)?)
-                },
-                "heatmap_upload_concurrency" => {
-                    builder.heatmap_upload_concurrency(parse_toml_u64(key, item)? as usize)
-                },
-                "secondary_download_concurrency" => {
-                    builder.secondary_download_concurrency(parse_toml_u64(key, item)? as usize)
-                },
-                "ingest_batch_size" => builder.ingest_batch_size(parse_toml_u64(key, item)?),
-                "virtual_file_io_engine" => {
-                    builder.virtual_file_io_engine(parse_toml_from_str("virtual_file_io_engine", item)?)
-                }
-                "max_vectored_read_bytes" => {
-                    let bytes = parse_toml_u64("max_vectored_read_bytes", item)? as usize;
-                    builder.get_max_vectored_read_bytes(
-                        MaxVectoredReadBytes(
-                            NonZeroUsize::new(bytes).expect("Max byte size of vectored read must be greater than 0")))
-                }
-                "image_compression" => {
-                    builder.get_image_compression(parse_toml_from_str("image_compression", item)?)
-                }
-                "ephemeral_bytes_per_memory_kb" => {
-                    builder.get_ephemeral_bytes_per_memory_kb(parse_toml_u64("ephemeral_bytes_per_memory_kb", item)? as usize)
-                }
-                "l0_flush" => {
-                    builder.l0_flush(utils::toml_edit_ext::deserialize_item(item).context("l0_flush")?)
-                }
-                "compact_level0_phase1_value_access" => {
-                    builder.compact_level0_phase1_value_access(utils::toml_edit_ext::deserialize_item(item).context("compact_level0_phase1_value_access")?)
-                }
-                "virtual_file_direct_io" => {
-                    builder.virtual_file_direct_io(utils::toml_edit_ext::deserialize_item(item).context("virtual_file_direct_io")?)
-                }
-                "io_buffer_alignment" => {
-                    builder.io_buffer_alignment(parse_toml_u64("io_buffer_alignment", item)? as usize)
-                }
-                _ => bail!("unrecognized pageserver option '{key}'"),
-            }
-        }
+            },
+            l0_flush: l0_flush
+                .map(crate::l0_flush::L0FlushConfig::from)
+                .unwrap_or_default(),
+        };
 
-        let mut conf = builder.build(node_id).context("invalid config")?;
+        // ------------------------------------------------------------
+        // custom validation code that covers more than one field in isolation
+        // ------------------------------------------------------------
 
         if conf.http_auth_type == AuthType::NeonJWT || conf.pg_auth_type == AuthType::NeonJWT {
             let auth_validation_public_key_path = conf
@@ -1019,10 +442,8 @@ impl PageServerConf {
             );
         }
 
-        conf.default_tenant_conf = t_conf.merge(TenantConf::default());
-
         IndexEntry::validate_checkpoint_distance(conf.default_tenant_conf.checkpoint_distance)
-            .map_err(|msg| anyhow::anyhow!("{msg}"))
+            .map_err(anyhow::Error::msg)
             .with_context(|| {
                 format!(
                     "effective checkpoint distance is unsupported: {}",
@@ -1042,130 +463,25 @@ impl PageServerConf {
     pub fn dummy_conf(repo_dir: Utf8PathBuf) -> Self {
         let pg_distrib_dir = Utf8PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("../pg_install");
 
-        PageServerConf {
-            id: NodeId(0),
+        let config_toml = pageserver_api::config::ConfigToml {
             wait_lsn_timeout: Duration::from_secs(60),
             wal_redo_timeout: Duration::from_secs(60),
-            page_cache_size: defaults::DEFAULT_PAGE_CACHE_SIZE,
-            max_file_descriptors: defaults::DEFAULT_MAX_FILE_DESCRIPTORS,
-            listen_pg_addr: defaults::DEFAULT_PG_LISTEN_ADDR.to_string(),
-            listen_http_addr: defaults::DEFAULT_HTTP_LISTEN_ADDR.to_string(),
-            availability_zone: None,
-            superuser: "cloud_admin".to_string(),
-            workdir: repo_dir,
-            pg_distrib_dir,
-            http_auth_type: AuthType::Trust,
-            pg_auth_type: AuthType::Trust,
-            auth_validation_public_key_path: None,
-            remote_storage_config: None,
-            default_tenant_conf: TenantConf::default(),
-            broker_endpoint: storage_broker::DEFAULT_ENDPOINT.parse().unwrap(),
-            broker_keepalive_interval: Duration::from_secs(5000),
-            log_format: LogFormat::from_str(defaults::DEFAULT_LOG_FORMAT).unwrap(),
-            concurrent_tenant_warmup: ConfigurableSemaphore::new(
-                NonZeroUsize::new(DEFAULT_CONCURRENT_TENANT_WARMUP)
-                    .expect("Invalid default constant"),
-            ),
-            concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore::default(),
-            eviction_task_immitated_concurrent_logical_size_queries: ConfigurableSemaphore::default(
-            ),
+            pg_distrib_dir: Some(pg_distrib_dir),
             metric_collection_interval: Duration::from_secs(60),
-            metric_collection_endpoint: defaults::DEFAULT_METRIC_COLLECTION_ENDPOINT,
-            metric_collection_bucket: None,
             synthetic_size_calculation_interval: Duration::from_secs(60),
-            disk_usage_based_eviction: None,
-            test_remote_failures: 0,
-            ondemand_download_behavior_treat_error_as_warn: false,
             background_task_maximum_delay: Duration::ZERO,
-            control_plane_api: None,
-            control_plane_api_token: None,
-            control_plane_emergency_mode: false,
-            heatmap_upload_concurrency: defaults::DEFAULT_HEATMAP_UPLOAD_CONCURRENCY,
-            secondary_download_concurrency: defaults::DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY,
-            ingest_batch_size: defaults::DEFAULT_INGEST_BATCH_SIZE,
-            virtual_file_io_engine: DEFAULT_VIRTUAL_FILE_IO_ENGINE.parse().unwrap(),
-            max_vectored_read_bytes: MaxVectoredReadBytes(
-                NonZeroUsize::new(defaults::DEFAULT_MAX_VECTORED_READ_BYTES)
-                    .expect("Invalid default constant"),
-            ),
-            image_compression: defaults::DEFAULT_IMAGE_COMPRESSION.parse().unwrap(),
-            ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB,
-            l0_flush: L0FlushConfig::default(),
-            compact_level0_phase1_value_access: CompactL0Phase1ValueAccess::default(),
-            virtual_file_direct_io: virtual_file::DirectIoMode::default(),
-            io_buffer_alignment: defaults::DEFAULT_IO_BUFFER_ALIGNMENT,
-        }
+            ..Default::default()
+        };
+        PageServerConf::parse_and_validate(NodeId(0), config_toml, &repo_dir).unwrap()
     }
 }
 
-#[derive(Deserialize)]
+#[derive(serde::Deserialize, serde::Serialize)]
 #[serde(deny_unknown_fields)]
 pub struct PageserverIdentity {
     pub id: NodeId,
 }
 
-// Helper functions to parse a toml Item
-
-fn parse_toml_string(name: &str, item: &Item) -> Result<String> {
-    let s = item
-        .as_str()
-        .with_context(|| format!("configure option {name} is not a string"))?;
-    Ok(s.to_string())
-}
-
-fn parse_toml_u64(name: &str, item: &Item) -> Result<u64> {
-    // A toml integer is signed, so it cannot represent the full range of an u64. That's OK
-    // for our use, though.
-    let i: i64 = item
-        .as_integer()
-        .with_context(|| format!("configure option {name} is not an integer"))?;
-    if i < 0 {
-        bail!("configure option {name} cannot be negative");
-    }
-    Ok(i as u64)
-}
-
-fn parse_toml_bool(name: &str, item: &Item) -> Result<bool> {
-    item.as_bool()
-        .with_context(|| format!("configure option {name} is not a bool"))
-}
-
-fn parse_toml_duration(name: &str, item: &Item) -> Result<Duration> {
-    let s = item
-        .as_str()
-        .with_context(|| format!("configure option {name} is not a string"))?;
-
-    Ok(humantime::parse_duration(s)?)
-}
-
-fn parse_toml_from_str<T>(name: &str, item: &Item) -> anyhow::Result<T>
-where
-    T: FromStr,
-    <T as FromStr>::Err: std::fmt::Display,
-{
-    let v = item
-        .as_str()
-        .with_context(|| format!("configure option {name} is not a string"))?;
-    T::from_str(v).map_err(|e| {
-        anyhow!(
-            "Failed to parse string as {parse_type} for configure option {name}: {e}",
-            parse_type = stringify!(T)
-        )
-    })
-}
-
-fn deserialize_from_item<T>(name: &str, item: &Item) -> anyhow::Result<T>
-where
-    T: serde::de::DeserializeOwned,
-{
-    // ValueDeserializer::new is not public, so use the ValueDeserializer's documented way
-    let deserializer = match item.clone().into_value() {
-        Ok(value) => value.into_deserializer(),
-        Err(item) => anyhow::bail!("toml_edit::Item '{item}' is not a toml_edit::Value"),
-    };
-    T::deserialize(deserializer).with_context(|| format!("deserializing item for node {name}"))
-}
-
 /// Configurable semaphore permits setting.
 ///
 /// Does not allow semaphore permits to be zero, because at runtime initially zero permits and empty
@@ -1227,469 +543,108 @@ impl ConfigurableSemaphore {
 
 #[cfg(test)]
 mod tests {
-    use std::{fs, num::NonZeroU32};
 
-    use camino_tempfile::{tempdir, Utf8TempDir};
-    use pageserver_api::models::EvictionPolicy;
-    use remote_storage::{RemoteStorageKind, S3Config};
-    use utils::serde_percent::Percent;
+    use camino::Utf8PathBuf;
+    use utils::id::NodeId;
 
-    use super::*;
-    use crate::DEFAULT_PG_VERSION;
-
-    const ALL_BASE_VALUES_TOML: &str = r#"
-# Initial configuration file created by 'pageserver --init'
-
-listen_pg_addr = '127.0.0.1:64000'
-listen_http_addr = '127.0.0.1:9898'
-
-wait_lsn_timeout = '111 s'
-wal_redo_timeout = '111 s'
-
-page_cache_size = 444
-max_file_descriptors = 333
-
-# initial superuser role name to use when creating a new tenant
-initial_superuser_name = 'zzzz'
-
-metric_collection_interval = '222 s'
-metric_collection_endpoint = 'http://localhost:80/metrics'
-synthetic_size_calculation_interval = '333 s'
-
-log_format = 'json'
-background_task_maximum_delay = '334 s'
-
-"#;
+    use super::PageServerConf;
 
     #[test]
-    fn parse_defaults() -> anyhow::Result<()> {
-        let tempdir = tempdir()?;
-        let (workdir, pg_distrib_dir) = prepare_fs(&tempdir)?;
-        let broker_endpoint = storage_broker::DEFAULT_ENDPOINT;
-        // we have to create dummy values to overcome the validation errors
-        let config_string =
-            format!("pg_distrib_dir='{pg_distrib_dir}'\nbroker_endpoint = '{broker_endpoint}'",);
-        let toml = config_string.parse()?;
-
-        let parsed_config = PageServerConf::parse_and_validate(NodeId(10), &toml, &workdir)
-            .unwrap_or_else(|e| panic!("Failed to parse config '{config_string}', reason: {e:?}"));
-
-        assert_eq!(
-            parsed_config,
-            PageServerConf {
-                id: NodeId(10),
-                listen_pg_addr: defaults::DEFAULT_PG_LISTEN_ADDR.to_string(),
-                listen_http_addr: defaults::DEFAULT_HTTP_LISTEN_ADDR.to_string(),
-                availability_zone: None,
-                wait_lsn_timeout: humantime::parse_duration(defaults::DEFAULT_WAIT_LSN_TIMEOUT)?,
-                wal_redo_timeout: humantime::parse_duration(defaults::DEFAULT_WAL_REDO_TIMEOUT)?,
-                superuser: defaults::DEFAULT_SUPERUSER.to_string(),
-                page_cache_size: defaults::DEFAULT_PAGE_CACHE_SIZE,
-                max_file_descriptors: defaults::DEFAULT_MAX_FILE_DESCRIPTORS,
-                workdir,
-                pg_distrib_dir,
-                http_auth_type: AuthType::Trust,
-                pg_auth_type: AuthType::Trust,
-                auth_validation_public_key_path: None,
-                remote_storage_config: None,
-                default_tenant_conf: TenantConf::default(),
-                broker_endpoint: storage_broker::DEFAULT_ENDPOINT.parse().unwrap(),
-                broker_keepalive_interval: humantime::parse_duration(
-                    storage_broker::DEFAULT_KEEPALIVE_INTERVAL
-                )?,
-                log_format: LogFormat::from_str(defaults::DEFAULT_LOG_FORMAT).unwrap(),
-                concurrent_tenant_warmup: ConfigurableSemaphore::new(
-                    NonZeroUsize::new(DEFAULT_CONCURRENT_TENANT_WARMUP).unwrap()
-                ),
-                concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore::default(),
-                eviction_task_immitated_concurrent_logical_size_queries:
-                    ConfigurableSemaphore::default(),
-                metric_collection_interval: humantime::parse_duration(
-                    defaults::DEFAULT_METRIC_COLLECTION_INTERVAL
-                )?,
-                metric_collection_endpoint: defaults::DEFAULT_METRIC_COLLECTION_ENDPOINT,
-                metric_collection_bucket: None,
-                synthetic_size_calculation_interval: humantime::parse_duration(
-                    defaults::DEFAULT_SYNTHETIC_SIZE_CALCULATION_INTERVAL
-                )?,
-                disk_usage_based_eviction: None,
-                test_remote_failures: 0,
-                ondemand_download_behavior_treat_error_as_warn: false,
-                background_task_maximum_delay: humantime::parse_duration(
-                    defaults::DEFAULT_BACKGROUND_TASK_MAXIMUM_DELAY
-                )?,
-                control_plane_api: None,
-                control_plane_api_token: None,
-                control_plane_emergency_mode: false,
-                heatmap_upload_concurrency: defaults::DEFAULT_HEATMAP_UPLOAD_CONCURRENCY,
-                secondary_download_concurrency: defaults::DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY,
-                ingest_batch_size: defaults::DEFAULT_INGEST_BATCH_SIZE,
-                virtual_file_io_engine: DEFAULT_VIRTUAL_FILE_IO_ENGINE.parse().unwrap(),
-                max_vectored_read_bytes: MaxVectoredReadBytes(
-                    NonZeroUsize::new(defaults::DEFAULT_MAX_VECTORED_READ_BYTES)
-                        .expect("Invalid default constant")
-                ),
-                image_compression: defaults::DEFAULT_IMAGE_COMPRESSION.parse().unwrap(),
-                ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB,
-                l0_flush: L0FlushConfig::default(),
-                compact_level0_phase1_value_access: CompactL0Phase1ValueAccess::default(),
-                virtual_file_direct_io: virtual_file::DirectIoMode::default(),
-                io_buffer_alignment: defaults::DEFAULT_IO_BUFFER_ALIGNMENT,
-            },
-            "Correct defaults should be used when no config values are provided"
-        );
-
-        Ok(())
-    }
-
-    #[test]
-    fn parse_basic_config() -> anyhow::Result<()> {
-        let tempdir = tempdir()?;
-        let (workdir, pg_distrib_dir) = prepare_fs(&tempdir)?;
-        let broker_endpoint = storage_broker::DEFAULT_ENDPOINT;
-
-        let config_string = format!(
-            "{ALL_BASE_VALUES_TOML}pg_distrib_dir='{pg_distrib_dir}'\nbroker_endpoint = '{broker_endpoint}'",
-        );
-        let toml = config_string.parse()?;
-
-        let parsed_config = PageServerConf::parse_and_validate(NodeId(10), &toml, &workdir)
-            .unwrap_or_else(|e| panic!("Failed to parse config '{config_string}', reason: {e:?}"));
-
-        assert_eq!(
-            parsed_config,
-            PageServerConf {
-                id: NodeId(10),
-                listen_pg_addr: "127.0.0.1:64000".to_string(),
-                listen_http_addr: "127.0.0.1:9898".to_string(),
-                availability_zone: None,
-                wait_lsn_timeout: Duration::from_secs(111),
-                wal_redo_timeout: Duration::from_secs(111),
-                superuser: "zzzz".to_string(),
-                page_cache_size: 444,
-                max_file_descriptors: 333,
-                workdir,
-                pg_distrib_dir,
-                http_auth_type: AuthType::Trust,
-                pg_auth_type: AuthType::Trust,
-                auth_validation_public_key_path: None,
-                remote_storage_config: None,
-                default_tenant_conf: TenantConf::default(),
-                broker_endpoint: storage_broker::DEFAULT_ENDPOINT.parse().unwrap(),
-                broker_keepalive_interval: Duration::from_secs(5),
-                log_format: LogFormat::Json,
-                concurrent_tenant_warmup: ConfigurableSemaphore::new(
-                    NonZeroUsize::new(DEFAULT_CONCURRENT_TENANT_WARMUP).unwrap()
-                ),
-                concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore::default(),
-                eviction_task_immitated_concurrent_logical_size_queries:
-                    ConfigurableSemaphore::default(),
-                metric_collection_interval: Duration::from_secs(222),
-                metric_collection_endpoint: Some(Url::parse("http://localhost:80/metrics")?),
-                metric_collection_bucket: None,
-                synthetic_size_calculation_interval: Duration::from_secs(333),
-                disk_usage_based_eviction: None,
-                test_remote_failures: 0,
-                ondemand_download_behavior_treat_error_as_warn: false,
-                background_task_maximum_delay: Duration::from_secs(334),
-                control_plane_api: None,
-                control_plane_api_token: None,
-                control_plane_emergency_mode: false,
-                heatmap_upload_concurrency: defaults::DEFAULT_HEATMAP_UPLOAD_CONCURRENCY,
-                secondary_download_concurrency: defaults::DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY,
-                ingest_batch_size: 100,
-                virtual_file_io_engine: DEFAULT_VIRTUAL_FILE_IO_ENGINE.parse().unwrap(),
-                max_vectored_read_bytes: MaxVectoredReadBytes(
-                    NonZeroUsize::new(defaults::DEFAULT_MAX_VECTORED_READ_BYTES)
-                        .expect("Invalid default constant")
-                ),
-                image_compression: defaults::DEFAULT_IMAGE_COMPRESSION.parse().unwrap(),
-                ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB,
-                l0_flush: L0FlushConfig::default(),
-                compact_level0_phase1_value_access: CompactL0Phase1ValueAccess::default(),
-                virtual_file_direct_io: virtual_file::DirectIoMode::default(),
-                io_buffer_alignment: defaults::DEFAULT_IO_BUFFER_ALIGNMENT,
-            },
-            "Should be able to parse all basic config values correctly"
-        );
-
-        Ok(())
-    }
-
-    #[test]
-    fn parse_remote_fs_storage_config() -> anyhow::Result<()> {
-        let tempdir = tempdir()?;
-        let (workdir, pg_distrib_dir) = prepare_fs(&tempdir)?;
-        let broker_endpoint = "http://127.0.0.1:7777";
-
-        let local_storage_path = tempdir.path().join("local_remote_storage");
-
-        let identical_toml_declarations = &[
-            format!(
-                r#"[remote_storage]
-local_path = '{local_storage_path}'"#,
-            ),
-            format!("remote_storage={{local_path='{local_storage_path}'}}"),
-        ];
-
-        for remote_storage_config_str in identical_toml_declarations {
-            let config_string = format!(
-                r#"{ALL_BASE_VALUES_TOML}
-pg_distrib_dir='{pg_distrib_dir}'
-broker_endpoint = '{broker_endpoint}'
-
-{remote_storage_config_str}"#,
-            );
-
-            let toml = config_string.parse()?;
-
-            let parsed_remote_storage_config =
-                PageServerConf::parse_and_validate(NodeId(10), &toml, &workdir)
-                    .unwrap_or_else(|e| {
-                        panic!("Failed to parse config '{config_string}', reason: {e:?}")
-                    })
-                    .remote_storage_config
-                    .expect("Should have remote storage config for the local FS");
-
-            assert_eq!(
-                parsed_remote_storage_config,
-                RemoteStorageConfig {
-                    storage: RemoteStorageKind::LocalFs { local_path: local_storage_path.clone() },
-                    timeout: RemoteStorageConfig::DEFAULT_TIMEOUT,
-                },
-                "Remote storage config should correctly parse the local FS config and fill other storage defaults"
-            );
-        }
-        Ok(())
-    }
-
-    #[test]
-    fn parse_remote_s3_storage_config() -> anyhow::Result<()> {
-        let tempdir = tempdir()?;
-        let (workdir, pg_distrib_dir) = prepare_fs(&tempdir)?;
-
-        let bucket_name = "some-sample-bucket".to_string();
-        let bucket_region = "eu-north-1".to_string();
-        let prefix_in_bucket = "test_prefix".to_string();
-        let endpoint = "http://localhost:5000".to_string();
-        let max_concurrent_syncs = NonZeroUsize::new(111).unwrap();
-        let max_sync_errors = NonZeroU32::new(222).unwrap();
-        let s3_concurrency_limit = NonZeroUsize::new(333).unwrap();
-        let broker_endpoint = "http://127.0.0.1:7777";
-
-        let identical_toml_declarations = &[
-            format!(
-                r#"[remote_storage]
-max_concurrent_syncs = {max_concurrent_syncs}
-max_sync_errors = {max_sync_errors}
-bucket_name = '{bucket_name}'
-bucket_region = '{bucket_region}'
-prefix_in_bucket = '{prefix_in_bucket}'
-endpoint = '{endpoint}'
-concurrency_limit = {s3_concurrency_limit}"#
-            ),
-            format!(
-                "remote_storage={{max_concurrent_syncs={max_concurrent_syncs}, max_sync_errors={max_sync_errors}, bucket_name='{bucket_name}',\
-                bucket_region='{bucket_region}', prefix_in_bucket='{prefix_in_bucket}', endpoint='{endpoint}', concurrency_limit={s3_concurrency_limit}}}",
-            ),
-        ];
-
-        for remote_storage_config_str in identical_toml_declarations {
-            let config_string = format!(
-                r#"{ALL_BASE_VALUES_TOML}
-pg_distrib_dir='{pg_distrib_dir}'
-broker_endpoint = '{broker_endpoint}'
-
-{remote_storage_config_str}"#,
-            );
-
-            let toml = config_string.parse()?;
-
-            let parsed_remote_storage_config =
-                PageServerConf::parse_and_validate(NodeId(10), &toml, &workdir)
-                    .unwrap_or_else(|e| {
-                        panic!("Failed to parse config '{config_string}', reason: {e:?}")
-                    })
-                    .remote_storage_config
-                    .expect("Should have remote storage config for S3");
-
-            assert_eq!(
-                parsed_remote_storage_config,
-                RemoteStorageConfig {
-                    storage: RemoteStorageKind::AwsS3(S3Config {
-                        bucket_name: bucket_name.clone(),
-                        bucket_region: bucket_region.clone(),
-                        prefix_in_bucket: Some(prefix_in_bucket.clone()),
-                        endpoint: Some(endpoint.clone()),
-                        concurrency_limit: s3_concurrency_limit,
-                        max_keys_per_list_response: None,
-                        upload_storage_class: None,
-                    }),
-                    timeout: RemoteStorageConfig::DEFAULT_TIMEOUT,
-                },
-                "Remote storage config should correctly parse the S3 config"
-            );
-        }
-        Ok(())
-    }
-
-    #[test]
-    fn parse_incorrect_tenant_config() -> anyhow::Result<()> {
-        let config_string = r#"
-            [tenant_config]
-            checkpoint_distance = -1 # supposed to be an u64
-        "#
-        .to_string();
-
-        let toml: Document = config_string.parse()?;
-        let item = toml.get("tenant_config").unwrap();
-        let error = TenantConfOpt::try_from(item.to_owned()).unwrap_err();
-
-        let expected_error_str = "checkpoint_distance: invalid value: integer `-1`, expected u64";
-        assert_eq!(error.to_string(), expected_error_str);
-
-        Ok(())
-    }
-
-    #[test]
-    fn parse_override_tenant_config() -> anyhow::Result<()> {
-        let config_string = r#"tenant_config={ min_resident_size_override =  400 }"#.to_string();
-
-        let toml: Document = config_string.parse()?;
-        let item = toml.get("tenant_config").unwrap();
-        let conf = TenantConfOpt::try_from(item.to_owned()).unwrap();
-
-        assert_eq!(conf.min_resident_size_override, Some(400));
-
-        Ok(())
-    }
-
-    #[test]
-    fn eviction_pageserver_config_parse() -> anyhow::Result<()> {
-        let tempdir = tempdir()?;
-        let (workdir, pg_distrib_dir) = prepare_fs(&tempdir)?;
-
-        let pageserver_conf_toml = format!(
-            r#"pg_distrib_dir = "{pg_distrib_dir}"
-metric_collection_endpoint = "http://sample.url"
-metric_collection_interval = "10min"
-
-[disk_usage_based_eviction]
-max_usage_pct = 80
-min_avail_bytes = 0
-period = "10s"
-
-[tenant_config]
-evictions_low_residence_duration_metric_threshold = "20m"
-
-[tenant_config.eviction_policy]
-kind = "LayerAccessThreshold"
-period = "20m"
-threshold = "20m"
-"#,
-        );
-        let toml: Document = pageserver_conf_toml.parse()?;
-        let conf = PageServerConf::parse_and_validate(NodeId(333), &toml, &workdir)?;
-
-        assert_eq!(conf.pg_distrib_dir, pg_distrib_dir);
-        assert_eq!(
-            conf.metric_collection_endpoint,
-            Some("http://sample.url".parse().unwrap())
-        );
-        assert_eq!(
-            conf.metric_collection_interval,
-            Duration::from_secs(10 * 60)
-        );
-        assert_eq!(
-            conf.default_tenant_conf
-                .evictions_low_residence_duration_metric_threshold,
-            Duration::from_secs(20 * 60)
-        );
-
-        // Assert that the node id provided by the indentity file (threaded
-        // through the call to [`PageServerConf::parse_and_validate`] is
-        // used.
-        assert_eq!(conf.id, NodeId(333));
-        assert_eq!(
-            conf.disk_usage_based_eviction,
-            Some(DiskUsageEvictionTaskConfig {
-                max_usage_pct: Percent::new(80).unwrap(),
-                min_avail_bytes: 0,
-                period: Duration::from_secs(10),
-                #[cfg(feature = "testing")]
-                mock_statvfs: None,
-                eviction_order: Default::default(),
-            })
-        );
-
-        match &conf.default_tenant_conf.eviction_policy {
-            EvictionPolicy::LayerAccessThreshold(eviction_threshold) => {
-                assert_eq!(eviction_threshold.period, Duration::from_secs(20 * 60));
-                assert_eq!(eviction_threshold.threshold, Duration::from_secs(20 * 60));
-            }
-            other => unreachable!("Unexpected eviction policy tenant settings: {other:?}"),
-        }
-
-        Ok(())
-    }
-
-    #[test]
-    fn parse_imitation_only_pageserver_config() {
-        let tempdir = tempdir().unwrap();
-        let (workdir, pg_distrib_dir) = prepare_fs(&tempdir).unwrap();
-
-        let pageserver_conf_toml = format!(
-            r#"pg_distrib_dir = "{pg_distrib_dir}"
-metric_collection_endpoint = "http://sample.url"
-metric_collection_interval = "10min"
-
-[tenant_config]
-evictions_low_residence_duration_metric_threshold = "20m"
-
-[tenant_config.eviction_policy]
-kind = "OnlyImitiate"
-period = "20m"
-threshold = "20m"
-"#,
-        );
-        let toml: Document = pageserver_conf_toml.parse().unwrap();
-        let conf = PageServerConf::parse_and_validate(NodeId(222), &toml, &workdir).unwrap();
-
-        match &conf.default_tenant_conf.eviction_policy {
-            EvictionPolicy::OnlyImitiate(t) => {
-                assert_eq!(t.period, Duration::from_secs(20 * 60));
-                assert_eq!(t.threshold, Duration::from_secs(20 * 60));
-            }
-            other => unreachable!("Unexpected eviction policy tenant settings: {other:?}"),
-        }
-    }
-
-    #[test]
-    fn empty_remote_storage_is_error() {
-        let tempdir = tempdir().unwrap();
-        let (workdir, _) = prepare_fs(&tempdir).unwrap();
+    fn test_empty_config_toml_is_valid() {
+        // we use Default impl of everything in this situation
         let input = r#"
-remote_storage = {}
         "#;
-        let doc = toml_edit::Document::from_str(input).unwrap();
-        let err = PageServerConf::parse_and_validate(NodeId(222), &doc, &workdir)
-            .expect_err("empty remote_storage field should fail, don't specify it if you want no remote_storage");
-        assert!(format!("{err}").contains("remote_storage"), "{err}");
+        let config_toml = toml_edit::de::from_str::<pageserver_api::config::ConfigToml>(input)
+            .expect("empty config is valid");
+        let workdir = Utf8PathBuf::from("/nonexistent");
+        PageServerConf::parse_and_validate(NodeId(0), config_toml, &workdir)
+            .expect("parse_and_validate");
     }
 
-    fn prepare_fs(tempdir: &Utf8TempDir) -> anyhow::Result<(Utf8PathBuf, Utf8PathBuf)> {
-        let tempdir_path = tempdir.path();
+    /// If there's a typo in the pageserver config, we'd rather catch that typo
+    /// and fail pageserver startup than silently ignore the typo, leaving whoever
+    /// made it in the belief that their config change is effective.
+    ///
+    /// The default in serde is to allow unknown fields, so, we rely
+    /// on developer+review discipline to add `deny_unknown_fields` when adding
+    /// new structs to the config, and these tests here as a regression test.
+    ///
+    /// The alternative to all of this would be to allow unknown fields in the config.
+    /// To catch them, we could have a config check tool or mgmt API endpoint that
+    /// compares the effective config with the TOML on disk and makes sure that
+    /// the on-disk TOML is a strict subset of the effective config.
+    mod unknown_fields_handling {
+        macro_rules! test {
+            ($short_name:ident, $input:expr) => {
+                #[test]
+                fn $short_name() {
+                    let input = $input;
+                    let err = toml_edit::de::from_str::<pageserver_api::config::ConfigToml>(&input)
+                        .expect_err("some_invalid_field is an invalid field");
+                    dbg!(&err);
+                    assert!(err.to_string().contains("some_invalid_field"));
+                }
+            };
+        }
+        use indoc::indoc;
 
-        let workdir = tempdir_path.join("workdir");
-        fs::create_dir_all(&workdir)?;
+        test!(
+            toplevel,
+            indoc! {r#"
+                some_invalid_field = 23
+            "#}
+        );
 
-        let pg_distrib_dir = tempdir_path.join("pg_distrib");
-        let pg_distrib_dir_versioned = pg_distrib_dir.join(format!("v{DEFAULT_PG_VERSION}"));
-        fs::create_dir_all(&pg_distrib_dir_versioned)?;
-        let postgres_bin_dir = pg_distrib_dir_versioned.join("bin");
-        fs::create_dir_all(&postgres_bin_dir)?;
-        fs::write(postgres_bin_dir.join("postgres"), "I'm postgres, trust me")?;
+        test!(
+            toplevel_nested,
+            indoc! {r#"
+                [some_invalid_field]
+                foo = 23
+            "#}
+        );
 
-        Ok((workdir, pg_distrib_dir))
+        test!(
+            disk_usage_based_eviction,
+            indoc! {r#"
+                [disk_usage_based_eviction]
+                some_invalid_field = 23
+            "#}
+        );
+
+        test!(
+            tenant_config,
+            indoc! {r#"
+                [tenant_config]
+                some_invalid_field = 23
+            "#}
+        );
+
+        test!(
+            l0_flush,
+            indoc! {r#"
+                [l0_flush]
+                mode = "direct"
+                some_invalid_field = 23
+            "#}
+        );
+
+        // TODO: fix this => https://github.com/neondatabase/neon/issues/8915
+        // test!(
+        //     remote_storage_config,
+        //     indoc! {r#"
+        //         [remote_storage_config]
+        //         local_path = "/nonexistent"
+        //         some_invalid_field = 23
+        //     "#}
+        // );
+
+        test!(
+            compact_level0_phase1_value_access,
+            indoc! {r#"
+                [compact_level0_phase1_value_access]
+                mode = "streaming-kmerge"
+                some_invalid_field = 23
+            "#}
+        );
     }
 }
diff --git a/pageserver/src/disk_usage_eviction_task.rs b/pageserver/src/disk_usage_eviction_task.rs
index 5e4a49bc56..a58fa2c0b1 100644
--- a/pageserver/src/disk_usage_eviction_task.rs
+++ b/pageserver/src/disk_usage_eviction_task.rs
@@ -41,19 +41,15 @@
 // - The `#[allow(dead_code)]` above various structs are to suppress warnings about only the Debug impl
 //   reading these fields. We use the Debug impl for semi-structured logging, though.
 
-use std::{
-    sync::Arc,
-    time::{Duration, SystemTime},
-};
+use std::{sync::Arc, time::SystemTime};
 
 use anyhow::Context;
-use pageserver_api::shard::TenantShardId;
+use pageserver_api::{config::DiskUsageEvictionTaskConfig, shard::TenantShardId};
 use remote_storage::GenericRemoteStorage;
-use serde::{Deserialize, Serialize};
+use serde::Serialize;
 use tokio::time::Instant;
 use tokio_util::sync::CancellationToken;
 use tracing::{debug, error, info, instrument, warn, Instrument};
-use utils::serde_percent::Percent;
 use utils::{completion, id::TimelineId};
 
 use crate::{
@@ -69,23 +65,9 @@ use crate::{
     CancellableTask, DiskUsageEvictionTask,
 };
 
-#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
-pub struct DiskUsageEvictionTaskConfig {
-    pub max_usage_pct: Percent,
-    pub min_avail_bytes: u64,
-    #[serde(with = "humantime_serde")]
-    pub period: Duration,
-    #[cfg(feature = "testing")]
-    pub mock_statvfs: Option<crate::statvfs::mock::Behavior>,
-    /// Select sorting for evicted layers
-    #[serde(default)]
-    pub eviction_order: EvictionOrder,
-}
-
 /// Selects the sort order for eviction candidates *after* per tenant `min_resident_size`
 /// partitioning.
-#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
-#[serde(tag = "type", content = "args")]
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
 pub enum EvictionOrder {
     /// Order the layers to be evicted by how recently they have been accessed relatively within
     /// the set of resident layers of a tenant.
@@ -96,23 +78,22 @@ pub enum EvictionOrder {
         /// we read tenants is deterministic. If we find the need to use this as `false`, we need
         /// to ensure nondeterminism by adding in a random number to break the
         /// `relative_last_activity==0.0` ties.
-        #[serde(default = "default_highest_layer_count_loses_first")]
         highest_layer_count_loses_first: bool,
     },
 }
 
-impl Default for EvictionOrder {
-    fn default() -> Self {
-        Self::RelativeAccessed {
-            highest_layer_count_loses_first: true,
+impl From<pageserver_api::config::EvictionOrder> for EvictionOrder {
+    fn from(value: pageserver_api::config::EvictionOrder) -> Self {
+        match value {
+            pageserver_api::config::EvictionOrder::RelativeAccessed {
+                highest_layer_count_loses_first,
+            } => Self::RelativeAccessed {
+                highest_layer_count_loses_first,
+            },
         }
     }
 }
 
-fn default_highest_layer_count_loses_first() -> bool {
-    true
-}
-
 impl EvictionOrder {
     fn sort(&self, candidates: &mut [(EvictionPartition, EvictionCandidate)]) {
         use EvictionOrder::*;
@@ -295,7 +276,7 @@ async fn disk_usage_eviction_task_iteration(
         storage,
         usage_pre,
         tenant_manager,
-        task_config.eviction_order,
+        task_config.eviction_order.into(),
         cancel,
     )
     .await;
@@ -1257,7 +1238,6 @@ mod filesystem_level_usage {
 
     #[test]
     fn max_usage_pct_pressure() {
-        use super::EvictionOrder;
         use super::Usage as _;
         use std::time::Duration;
         use utils::serde_percent::Percent;
@@ -1269,7 +1249,7 @@ mod filesystem_level_usage {
                 period: Duration::MAX,
                 #[cfg(feature = "testing")]
                 mock_statvfs: None,
-                eviction_order: EvictionOrder::default(),
+                eviction_order: pageserver_api::config::EvictionOrder::default(),
             },
             total_bytes: 100_000,
             avail_bytes: 0,
diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs
index 90ae6c5557..d645f3b7b6 100644
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -2076,7 +2076,7 @@ async fn disk_usage_eviction_run(
         evict_bytes: u64,
 
         #[serde(default)]
-        eviction_order: crate::disk_usage_eviction_task::EvictionOrder,
+        eviction_order: pageserver_api::config::EvictionOrder,
     }
 
     #[derive(Debug, Clone, Copy, serde::Serialize)]
@@ -2112,7 +2112,7 @@ async fn disk_usage_eviction_run(
         &state.remote_storage,
         usage,
         &state.tenant_manager,
-        config.eviction_order,
+        config.eviction_order.into(),
         &cancel,
     )
     .await;
diff --git a/pageserver/src/l0_flush.rs b/pageserver/src/l0_flush.rs
index 313a7961a6..491c9fb96c 100644
--- a/pageserver/src/l0_flush.rs
+++ b/pageserver/src/l0_flush.rs
@@ -1,9 +1,7 @@
 use std::{num::NonZeroUsize, sync::Arc};
 
-#[derive(Debug, PartialEq, Eq, Clone, serde::Deserialize)]
-#[serde(tag = "mode", rename_all = "kebab-case", deny_unknown_fields)]
+#[derive(Debug, PartialEq, Eq, Clone)]
 pub enum L0FlushConfig {
-    #[serde(rename_all = "snake_case")]
     Direct { max_concurrency: NonZeroUsize },
 }
 
@@ -16,6 +14,16 @@ impl Default for L0FlushConfig {
     }
 }
 
+impl From<pageserver_api::models::L0FlushConfig> for L0FlushConfig {
+    fn from(config: pageserver_api::models::L0FlushConfig) -> Self {
+        match config {
+            pageserver_api::models::L0FlushConfig::Direct { max_concurrency } => {
+                Self::Direct { max_concurrency }
+            }
+        }
+    }
+}
+
 #[derive(Clone)]
 pub struct L0FlushGlobalState(Arc<Inner>);
 
diff --git a/pageserver/src/statvfs.rs b/pageserver/src/statvfs.rs
index ede1791afa..5a6f6e5176 100644
--- a/pageserver/src/statvfs.rs
+++ b/pageserver/src/statvfs.rs
@@ -60,32 +60,7 @@ pub mod mock {
     use regex::Regex;
     use tracing::log::info;
 
-    #[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
-    #[serde(tag = "type")]
-    pub enum Behavior {
-        Success {
-            blocksize: u64,
-            total_blocks: u64,
-            name_filter: Option<utils::serde_regex::Regex>,
-        },
-        Failure {
-            mocked_error: MockedError,
-        },
-    }
-
-    #[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
-    #[allow(clippy::upper_case_acronyms)]
-    pub enum MockedError {
-        EIO,
-    }
-
-    impl From<MockedError> for nix::Error {
-        fn from(e: MockedError) -> Self {
-            match e {
-                MockedError::EIO => nix::Error::EIO,
-            }
-        }
-    }
+    pub use pageserver_api::config::statvfs::mock::Behavior;
 
     pub fn get(tenants_dir: &Utf8Path, behavior: &Behavior) -> nix::Result<Statvfs> {
         info!("running mocked statvfs");
@@ -116,6 +91,7 @@ pub mod mock {
                     block_size: *blocksize,
                 })
             }
+            #[cfg(feature = "testing")]
             Behavior::Failure { mocked_error } => Err((*mocked_error).into()),
         }
     }
diff --git a/pageserver/src/tenant/config.rs b/pageserver/src/tenant/config.rs
index 48ff17db94..7e0344666b 100644
--- a/pageserver/src/tenant/config.rs
+++ b/pageserver/src/tenant/config.rs
@@ -9,11 +9,10 @@
 //! may lead to a data loss.
 //!
 use anyhow::bail;
+pub(crate) use pageserver_api::config::TenantConfigToml as TenantConf;
 use pageserver_api::models::AuxFilePolicy;
-use pageserver_api::models::CompactionAlgorithm;
 use pageserver_api::models::CompactionAlgorithmSettings;
 use pageserver_api::models::EvictionPolicy;
-use pageserver_api::models::LsnLease;
 use pageserver_api::models::{self, ThrottleConfig};
 use pageserver_api::shard::{ShardCount, ShardIdentity, ShardNumber, ShardStripeSize};
 use serde::de::IntoDeserializer;
@@ -23,50 +22,6 @@ use std::num::NonZeroU64;
 use std::time::Duration;
 use utils::generation::Generation;
 
-pub mod defaults {
-
-    // FIXME: This current value is very low. I would imagine something like 1 GB or 10 GB
-    // would be more appropriate. But a low value forces the code to be exercised more,
-    // which is good for now to trigger bugs.
-    // This parameter actually determines L0 layer file size.
-    pub const DEFAULT_CHECKPOINT_DISTANCE: u64 = 256 * 1024 * 1024;
-    pub const DEFAULT_CHECKPOINT_TIMEOUT: &str = "10 m";
-
-    // FIXME the below configs are only used by legacy algorithm. The new algorithm
-    // has different parameters.
-
-    // Target file size, when creating image and delta layers.
-    // This parameter determines L1 layer file size.
-    pub const DEFAULT_COMPACTION_TARGET_SIZE: u64 = 128 * 1024 * 1024;
-
-    pub const DEFAULT_COMPACTION_PERIOD: &str = "20 s";
-    pub const DEFAULT_COMPACTION_THRESHOLD: usize = 10;
-    pub const DEFAULT_COMPACTION_ALGORITHM: super::CompactionAlgorithm =
-        super::CompactionAlgorithm::Legacy;
-
-    pub const DEFAULT_GC_HORIZON: u64 = 64 * 1024 * 1024;
-
-    // Large DEFAULT_GC_PERIOD is fine as long as PITR_INTERVAL is larger.
-    // If there's a need to decrease this value, first make sure that GC
-    // doesn't hold a layer map write lock for non-trivial operations.
-    // Relevant: https://github.com/neondatabase/neon/issues/3394
-    pub const DEFAULT_GC_PERIOD: &str = "1 hr";
-    pub const DEFAULT_IMAGE_CREATION_THRESHOLD: usize = 3;
-    pub const DEFAULT_PITR_INTERVAL: &str = "7 days";
-    pub const DEFAULT_WALRECEIVER_CONNECT_TIMEOUT: &str = "10 seconds";
-    pub const DEFAULT_WALRECEIVER_LAGGING_WAL_TIMEOUT: &str = "10 seconds";
-    // The default limit on WAL lag should be set to avoid causing disconnects under high throughput
-    // scenarios: since the broker stats are updated ~1/s, a value of 1GiB should be sufficient for
-    // throughputs up to 1GiB/s per timeline.
-    pub const DEFAULT_MAX_WALRECEIVER_LSN_WAL_LAG: u64 = 1024 * 1024 * 1024;
-    pub const DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD: &str = "24 hour";
-    // By default ingest enough WAL for two new L0 layers before checking if new image
-    // image layers should be created.
-    pub const DEFAULT_IMAGE_LAYER_CREATION_CHECK_THRESHOLD: u8 = 2;
-
-    pub const DEFAULT_INGEST_BATCH_SIZE: u64 = 100;
-}
-
 #[derive(Debug, Copy, Clone, Serialize, Deserialize, PartialEq, Eq)]
 pub(crate) enum AttachmentMode {
     /// Our generation is current as far as we know, and as far as we know we are the only attached
@@ -281,96 +236,20 @@ impl LocationConf {
     }
 }
 
-/// A tenant's calcuated configuration, which is the result of merging a
-/// tenant's TenantConfOpt with the global TenantConf from PageServerConf.
-///
-/// For storing and transmitting individual tenant's configuration, see
-/// TenantConfOpt.
-#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
-pub struct TenantConf {
-    // Flush out an inmemory layer, if it's holding WAL older than this
-    // This puts a backstop on how much WAL needs to be re-digested if the
-    // page server crashes.
-    // This parameter actually determines L0 layer file size.
-    pub checkpoint_distance: u64,
-    // Inmemory layer is also flushed at least once in checkpoint_timeout to
-    // eventually upload WAL after activity is stopped.
-    #[serde(with = "humantime_serde")]
-    pub checkpoint_timeout: Duration,
-    // Target file size, when creating image and delta layers.
-    // This parameter determines L1 layer file size.
-    pub compaction_target_size: u64,
-    // How often to check if there's compaction work to be done.
-    // Duration::ZERO means automatic compaction is disabled.
-    #[serde(with = "humantime_serde")]
-    pub compaction_period: Duration,
-    // Level0 delta layer threshold for compaction.
-    pub compaction_threshold: usize,
-    pub compaction_algorithm: CompactionAlgorithmSettings,
-    // Determines how much history is retained, to allow
-    // branching and read replicas at an older point in time.
-    // The unit is #of bytes of WAL.
-    // Page versions older than this are garbage collected away.
-    pub gc_horizon: u64,
-    // Interval at which garbage collection is triggered.
-    // Duration::ZERO means automatic GC is disabled
-    #[serde(with = "humantime_serde")]
-    pub gc_period: Duration,
-    // Delta layer churn threshold to create L1 image layers.
-    pub image_creation_threshold: usize,
-    // Determines how much history is retained, to allow
-    // branching and read replicas at an older point in time.
-    // The unit is time.
-    // Page versions older than this are garbage collected away.
-    #[serde(with = "humantime_serde")]
-    pub pitr_interval: Duration,
-    /// Maximum amount of time to wait while opening a connection to receive wal, before erroring.
-    #[serde(with = "humantime_serde")]
-    pub walreceiver_connect_timeout: Duration,
-    /// Considers safekeepers stalled after no WAL updates were received longer than this threshold.
-    /// A stalled safekeeper will be changed to a newer one when it appears.
-    #[serde(with = "humantime_serde")]
-    pub lagging_wal_timeout: Duration,
-    /// Considers safekeepers lagging when their WAL is behind another safekeeper for more than this threshold.
-    /// A lagging safekeeper will be changed after `lagging_wal_timeout` time elapses since the last WAL update,
-    /// to avoid eager reconnects.
-    pub max_lsn_wal_lag: NonZeroU64,
-    pub eviction_policy: EvictionPolicy,
-    pub min_resident_size_override: Option<u64>,
-    // See the corresponding metric's help string.
-    #[serde(with = "humantime_serde")]
-    pub evictions_low_residence_duration_metric_threshold: Duration,
-
-    /// If non-zero, the period between uploads of a heatmap from attached tenants.  This
-    /// may be disabled if a Tenant will not have secondary locations: only secondary
-    /// locations will use the heatmap uploaded by attached locations.
-    #[serde(with = "humantime_serde")]
-    pub heatmap_period: Duration,
-
-    /// If true then SLRU segments are dowloaded on demand, if false SLRU segments are included in basebackup
-    pub lazy_slru_download: bool,
-
-    pub timeline_get_throttle: pageserver_api::models::ThrottleConfig,
-
-    // How much WAL must be ingested before checking again whether a new image layer is required.
-    // Expresed in multiples of checkpoint distance.
-    pub image_layer_creation_check_threshold: u8,
-
-    /// Switch to a new aux file policy. Switching this flag requires the user has not written any aux file into
-    /// the storage before, and this flag cannot be switched back. Otherwise there will be data corruptions.
-    /// There is a `last_aux_file_policy` flag which gets persisted in `index_part.json` once the first aux
-    /// file is written.
-    pub switch_aux_file_policy: AuxFilePolicy,
-
-    /// The length for an explicit LSN lease request.
-    /// Layers needed to reconstruct pages at LSN will not be GC-ed during this interval.
-    #[serde(with = "humantime_serde")]
-    pub lsn_lease_length: Duration,
-
-    /// The length for an implicit LSN lease granted as part of `get_lsn_by_timestamp` request.
-    /// Layers needed to reconstruct pages at LSN will not be GC-ed during this interval.
-    #[serde(with = "humantime_serde")]
-    pub lsn_lease_length_for_ts: Duration,
+impl Default for LocationConf {
+    // TODO: this should be removed once tenant loading can guarantee that we are never
+    // loading from a directory without a configuration.
+    // => tech debt since https://github.com/neondatabase/neon/issues/1555
+    fn default() -> Self {
+        Self {
+            mode: LocationMode::Attached(AttachedLocationConfig {
+                generation: Generation::none(),
+                attach_mode: AttachmentMode::Single,
+            }),
+            tenant_conf: TenantConfOpt::default(),
+            shard: ShardIdentity::unsharded(),
+        }
+    }
 }
 
 /// Same as TenantConf, but this struct preserves the information about
@@ -545,51 +424,6 @@ impl TenantConfOpt {
     }
 }
 
-impl Default for TenantConf {
-    fn default() -> Self {
-        use defaults::*;
-        Self {
-            checkpoint_distance: DEFAULT_CHECKPOINT_DISTANCE,
-            checkpoint_timeout: humantime::parse_duration(DEFAULT_CHECKPOINT_TIMEOUT)
-                .expect("cannot parse default checkpoint timeout"),
-            compaction_target_size: DEFAULT_COMPACTION_TARGET_SIZE,
-            compaction_period: humantime::parse_duration(DEFAULT_COMPACTION_PERIOD)
-                .expect("cannot parse default compaction period"),
-            compaction_threshold: DEFAULT_COMPACTION_THRESHOLD,
-            compaction_algorithm: CompactionAlgorithmSettings {
-                kind: DEFAULT_COMPACTION_ALGORITHM,
-            },
-            gc_horizon: DEFAULT_GC_HORIZON,
-            gc_period: humantime::parse_duration(DEFAULT_GC_PERIOD)
-                .expect("cannot parse default gc period"),
-            image_creation_threshold: DEFAULT_IMAGE_CREATION_THRESHOLD,
-            pitr_interval: humantime::parse_duration(DEFAULT_PITR_INTERVAL)
-                .expect("cannot parse default PITR interval"),
-            walreceiver_connect_timeout: humantime::parse_duration(
-                DEFAULT_WALRECEIVER_CONNECT_TIMEOUT,
-            )
-            .expect("cannot parse default walreceiver connect timeout"),
-            lagging_wal_timeout: humantime::parse_duration(DEFAULT_WALRECEIVER_LAGGING_WAL_TIMEOUT)
-                .expect("cannot parse default walreceiver lagging wal timeout"),
-            max_lsn_wal_lag: NonZeroU64::new(DEFAULT_MAX_WALRECEIVER_LSN_WAL_LAG)
-                .expect("cannot parse default max walreceiver Lsn wal lag"),
-            eviction_policy: EvictionPolicy::NoEviction,
-            min_resident_size_override: None,
-            evictions_low_residence_duration_metric_threshold: humantime::parse_duration(
-                DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD,
-            )
-            .expect("cannot parse default evictions_low_residence_duration_metric_threshold"),
-            heatmap_period: Duration::ZERO,
-            lazy_slru_download: false,
-            timeline_get_throttle: crate::tenant::throttle::Config::disabled(),
-            image_layer_creation_check_threshold: DEFAULT_IMAGE_LAYER_CREATION_CHECK_THRESHOLD,
-            switch_aux_file_policy: AuxFilePolicy::default_tenant_config(),
-            lsn_lease_length: LsnLease::DEFAULT_LENGTH,
-            lsn_lease_length_for_ts: LsnLease::DEFAULT_LENGTH_FOR_TS,
-        }
-    }
-}
-
 impl TryFrom<&'_ models::TenantConfig> for TenantConfOpt {
     type Error = anyhow::Error;
 
diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs
index b8e9a98149..6a2cd94232 100644
--- a/pageserver/src/tenant/storage_layer/delta_layer.rs
+++ b/pageserver/src/tenant/storage_layer/delta_layer.rs
@@ -39,7 +39,7 @@ use crate::tenant::disk_btree::{
 use crate::tenant::storage_layer::layer::S3_UPLOAD_LIMIT;
 use crate::tenant::timeline::GetVectoredError;
 use crate::tenant::vectored_blob_io::{
-    BlobFlag, MaxVectoredReadBytes, StreamingVectoredReadPlanner, VectoredBlobReader, VectoredRead,
+    BlobFlag, StreamingVectoredReadPlanner, VectoredBlobReader, VectoredRead,
     VectoredReadCoalesceMode, VectoredReadPlanner,
 };
 use crate::tenant::PageReconstructError;
@@ -52,6 +52,7 @@ use bytes::BytesMut;
 use camino::{Utf8Path, Utf8PathBuf};
 use futures::StreamExt;
 use itertools::Itertools;
+use pageserver_api::config::MaxVectoredReadBytes;
 use pageserver_api::keyspace::KeySpace;
 use pageserver_api::models::ImageCompressionAlgorithm;
 use pageserver_api::shard::TenantShardId;
diff --git a/pageserver/src/tenant/storage_layer/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs
index 4a095c564d..77ce1ae670 100644
--- a/pageserver/src/tenant/storage_layer/image_layer.rs
+++ b/pageserver/src/tenant/storage_layer/image_layer.rs
@@ -34,8 +34,7 @@ use crate::tenant::disk_btree::{
 };
 use crate::tenant::timeline::GetVectoredError;
 use crate::tenant::vectored_blob_io::{
-    BlobFlag, MaxVectoredReadBytes, StreamingVectoredReadPlanner, VectoredBlobReader, VectoredRead,
-    VectoredReadPlanner,
+    BlobFlag, StreamingVectoredReadPlanner, VectoredBlobReader, VectoredRead, VectoredReadPlanner,
 };
 use crate::tenant::{PageReconstructError, Timeline};
 use crate::virtual_file::owned_buffers_io::io_buf_ext::IoBufExt;
@@ -46,6 +45,7 @@ use bytes::{Bytes, BytesMut};
 use camino::{Utf8Path, Utf8PathBuf};
 use hex;
 use itertools::Itertools;
+use pageserver_api::config::MaxVectoredReadBytes;
 use pageserver_api::keyspace::KeySpace;
 use pageserver_api::shard::{ShardIdentity, TenantShardId};
 use rand::{distributions::Alphanumeric, Rng};
diff --git a/pageserver/src/tenant/storage_layer/inmemory_layer.rs b/pageserver/src/tenant/storage_layer/inmemory_layer.rs
index 2c19e5b19f..e487bee1f2 100644
--- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs
+++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs
@@ -215,7 +215,7 @@ impl IndexEntry {
 
     const _ASSERT_DEFAULT_CHECKPOINT_DISTANCE_IS_VALID: () = {
         let res = Self::validate_checkpoint_distance(
-            crate::tenant::config::defaults::DEFAULT_CHECKPOINT_DISTANCE,
+            pageserver_api::config::tenant_conf_defaults::DEFAULT_CHECKPOINT_DISTANCE,
         );
         if res.is_err() {
             panic!("default checkpoint distance is valid")
diff --git a/pageserver/src/tenant/tasks.rs b/pageserver/src/tenant/tasks.rs
index f5680ced90..478e9bb4f0 100644
--- a/pageserver/src/tenant/tasks.rs
+++ b/pageserver/src/tenant/tasks.rs
@@ -10,7 +10,6 @@ use crate::context::{DownloadBehavior, RequestContext};
 use crate::metrics::TENANT_TASK_EVENTS;
 use crate::task_mgr;
 use crate::task_mgr::{TaskKind, BACKGROUND_RUNTIME};
-use crate::tenant::config::defaults::DEFAULT_COMPACTION_PERIOD;
 use crate::tenant::throttle::Stats;
 use crate::tenant::timeline::CompactionError;
 use crate::tenant::{Tenant, TenantState};
@@ -456,9 +455,11 @@ async fn ingest_housekeeping_loop(tenant: Arc<Tenant>, cancel: CancellationToken
 
             // If compaction period is set to zero (to disable it), then we will use a reasonable default
             let period = if period == Duration::ZERO {
-                humantime::Duration::from_str(DEFAULT_COMPACTION_PERIOD)
-                    .unwrap()
-                    .into()
+                humantime::Duration::from_str(
+                    pageserver_api::config::tenant_conf_defaults::DEFAULT_COMPACTION_PERIOD,
+                )
+                .unwrap()
+                .into()
             } else {
                 period
             };
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index 3b8f19a6c0..262dccac7d 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -66,7 +66,6 @@ use std::{
 use crate::{
     aux_file::AuxFileSizeEstimator,
     tenant::{
-        config::defaults::DEFAULT_PITR_INTERVAL,
         layer_map::{LayerMap, SearchResult},
         metadata::TimelineMetadata,
         storage_layer::{inmemory_layer::IndexEntry, PersistentLayerDesc},
@@ -102,6 +101,7 @@ use crate::{
     pgdatadir_mapping::{AuxFilesDirectory, DirectoryKind},
     virtual_file::{MaybeFatalIo, VirtualFile},
 };
+use pageserver_api::config::tenant_conf_defaults::DEFAULT_PITR_INTERVAL;
 
 use crate::config::PageServerConf;
 use crate::keyspace::{KeyPartitioning, KeySpace};
diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs
index aad75ac59c..6b9c8386f7 100644
--- a/pageserver/src/tenant/timeline/compaction.rs
+++ b/pageserver/src/tenant/timeline/compaction.rs
@@ -19,6 +19,7 @@ use bytes::Bytes;
 use enumset::EnumSet;
 use fail::fail_point;
 use itertools::Itertools;
+use pageserver_api::config::{CompactL0BypassPageCacheValidation, CompactL0Phase1ValueAccess};
 use pageserver_api::key::KEY_SIZE;
 use pageserver_api::keyspace::ShardedRange;
 use pageserver_api::shard::{ShardCount, ShardIdentity, TenantShardId};
@@ -29,7 +30,6 @@ use utils::id::TimelineId;
 
 use crate::context::{AccessStatsBehavior, RequestContext, RequestContextBuilder};
 use crate::page_cache;
-use crate::tenant::config::defaults::{DEFAULT_CHECKPOINT_DISTANCE, DEFAULT_COMPACTION_THRESHOLD};
 use crate::tenant::remote_timeline_client::WaitCompletionError;
 use crate::tenant::storage_layer::merge_iterator::MergeIterator;
 use crate::tenant::storage_layer::split_writer::{
@@ -43,6 +43,9 @@ use crate::tenant::timeline::{drop_rlock, DeltaLayerWriter, ImageLayerWriter};
 use crate::tenant::timeline::{Layer, ResidentLayer};
 use crate::tenant::DeltaLayer;
 use crate::virtual_file::{MaybeFatalIo, VirtualFile};
+use pageserver_api::config::tenant_conf_defaults::{
+    DEFAULT_CHECKPOINT_DISTANCE, DEFAULT_COMPACTION_THRESHOLD,
+};
 
 use crate::keyspace::KeySpace;
 use crate::repository::{Key, Value};
@@ -1433,43 +1436,6 @@ impl TryFrom<CompactLevel0Phase1StatsBuilder> for CompactLevel0Phase1Stats {
     }
 }
 
-#[derive(Debug, PartialEq, Eq, Clone, serde::Deserialize, serde::Serialize)]
-#[serde(tag = "mode", rename_all = "kebab-case", deny_unknown_fields)]
-pub enum CompactL0Phase1ValueAccess {
-    /// The old way.
-    PageCachedBlobIo,
-    /// The new way.
-    StreamingKmerge {
-        /// If set, we run both the old way and the new way, validate that
-        /// they are identical (=> [`CompactL0BypassPageCacheValidation`]),
-        /// and if the validation fails,
-        /// - in tests: fail them with a panic or
-        /// - in prod, log a rate-limited warning and use the old way's results.
-        ///
-        /// If not set, we only run the new way and trust its results.
-        validate: Option<CompactL0BypassPageCacheValidation>,
-    },
-}
-
-/// See [`CompactL0Phase1ValueAccess::StreamingKmerge`].
-#[derive(Debug, PartialEq, Eq, Clone, serde::Deserialize, serde::Serialize)]
-#[serde(rename_all = "kebab-case")]
-pub enum CompactL0BypassPageCacheValidation {
-    /// Validate that the series of (key, lsn) pairs are the same.
-    KeyLsn,
-    /// Validate that the entire output of old and new way is identical.
-    KeyLsnValue,
-}
-
-impl Default for CompactL0Phase1ValueAccess {
-    fn default() -> Self {
-        CompactL0Phase1ValueAccess::StreamingKmerge {
-            // TODO(https://github.com/neondatabase/neon/issues/8184): change to None once confident
-            validate: Some(CompactL0BypassPageCacheValidation::KeyLsnValue),
-        }
-    }
-}
-
 impl Timeline {
     /// Entry point for new tiered compaction algorithm.
     ///
diff --git a/pageserver/src/tenant/vectored_blob_io.rs b/pageserver/src/tenant/vectored_blob_io.rs
index 146bcf0e35..4d51dc442d 100644
--- a/pageserver/src/tenant/vectored_blob_io.rs
+++ b/pageserver/src/tenant/vectored_blob_io.rs
@@ -16,7 +16,6 @@
 //! Note that the vectored blob api does *not* go through the page cache.
 
 use std::collections::BTreeMap;
-use std::num::NonZeroUsize;
 
 use bytes::BytesMut;
 use pageserver_api::key::Key;
@@ -29,9 +28,6 @@ use crate::context::RequestContext;
 use crate::tenant::blob_io::{BYTE_UNCOMPRESSED, BYTE_ZSTD, LEN_COMPRESSION_BIT_MASK};
 use crate::virtual_file::{self, VirtualFile};
 
-#[derive(Copy, Clone, Debug, PartialEq, Eq)]
-pub struct MaxVectoredReadBytes(pub NonZeroUsize);
-
 /// Metadata bundled with the start and end offset of a blob.
 #[derive(Copy, Clone, Debug)]
 pub struct BlobMeta {
diff --git a/pageserver/src/virtual_file.rs b/pageserver/src/virtual_file.rs
index 97d966e2da..ed6ff86c10 100644
--- a/pageserver/src/virtual_file.rs
+++ b/pageserver/src/virtual_file.rs
@@ -10,7 +10,6 @@
 //! This is similar to PostgreSQL's virtual file descriptor facility in
 //! src/backend/storage/file/fd.c
 //!
-use crate::config::defaults::DEFAULT_IO_BUFFER_ALIGNMENT;
 use crate::context::RequestContext;
 use crate::metrics::{StorageIoOperation, STORAGE_IO_SIZE, STORAGE_IO_TIME_METRIC};
 
@@ -19,6 +18,7 @@ use crate::tenant::TENANTS_SEGMENT_NAME;
 use camino::{Utf8Path, Utf8PathBuf};
 use once_cell::sync::OnceCell;
 use owned_buffers_io::io_buf_ext::FullSlice;
+use pageserver_api::config::defaults::DEFAULT_IO_BUFFER_ALIGNMENT;
 use pageserver_api::shard::TenantShardId;
 use std::fs::File;
 use std::io::{Error, ErrorKind, Seek, SeekFrom};
diff --git a/pageserver/src/virtual_file/io_engine.rs b/pageserver/src/virtual_file/io_engine.rs
index faef1ba9ff..ccde90ee1a 100644
--- a/pageserver/src/virtual_file/io_engine.rs
+++ b/pageserver/src/virtual_file/io_engine.rs
@@ -84,9 +84,14 @@ pub(crate) fn get() -> IoEngine {
                         }
                     },
                     Err(std::env::VarError::NotPresent) => {
-                        crate::config::defaults::DEFAULT_VIRTUAL_FILE_IO_ENGINE
-                            .parse()
-                            .unwrap()
+                        #[cfg(target_os = "linux")]
+                        {
+                            IoEngineKind::TokioEpollUring
+                        }
+                        #[cfg(not(target_os = "linux"))]
+                        {
+                            IoEngineKind::StdFs
+                        }
                     }
                     Err(std::env::VarError::NotUnicode(_)) => {
                         panic!("env var {env_var_name} is not unicode");
diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index 890538b86a..2df45a7e0e 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -24,7 +24,20 @@ from functools import cached_property, partial
 from itertools import chain, product
 from pathlib import Path
 from types import TracebackType
-from typing import Any, Callable, Dict, Iterable, Iterator, List, Optional, Tuple, Type, Union, cast
+from typing import (
+    Any,
+    Callable,
+    Dict,
+    Iterable,
+    Iterator,
+    List,
+    Optional,
+    Tuple,
+    Type,
+    TypeVar,
+    Union,
+    cast,
+)
 from urllib.parse import quote, urlparse
 
 import asyncpg
@@ -90,6 +103,8 @@ from fixtures.utils import AuxFileStore as AuxFileStore  # reexport
 
 from .neon_api import NeonAPI, NeonApiEndpoint
 
+T = TypeVar("T")
+
 """
 This file contains pytest fixtures. A fixture is a test resource that can be
 summoned by placing its name in the test's arguments.
@@ -2986,16 +3001,17 @@ class NeonPageserver(PgProtocol, LogUtils):
     def config_toml_path(self) -> Path:
         return self.workdir / "pageserver.toml"
 
-    def edit_config_toml(self, edit_fn: Callable[[Dict[str, Any]], None]):
+    def edit_config_toml(self, edit_fn: Callable[[Dict[str, Any]], T]) -> T:
         """
         Edit the pageserver's config toml file in place.
         """
         path = self.config_toml_path
         with open(path, "r") as f:
             config = toml.load(f)
-        edit_fn(config)
+        res = edit_fn(config)
         with open(path, "w") as f:
             toml.dump(config, f)
+        return res
 
     def patch_config_toml_nonrecursive(self, patch: Dict[str, Any]) -> Dict[str, Any]:
         """
diff --git a/test_runner/regress/test_pageserver_generations.py b/test_runner/regress/test_pageserver_generations.py
index 73af7950f1..ebf58d2bd1 100644
--- a/test_runner/regress/test_pageserver_generations.py
+++ b/test_runner/regress/test_pageserver_generations.py
@@ -142,11 +142,10 @@ def test_generations_upgrade(neon_env_builder: NeonEnvBuilder):
     # We will start a pageserver with no control_plane_api set, so it won't be able to self-register
     env.storage_controller.node_register(env.pageserver)
 
-    replaced_config = env.pageserver.patch_config_toml_nonrecursive(
-        {
-            "control_plane_api": "",
-        }
-    )
+    def remove_control_plane_api_field(config):
+        return config.pop("control_plane_api")
+
+    control_plane_api = env.pageserver.edit_config_toml(remove_control_plane_api_field)
     env.pageserver.start()
     env.storage_controller.node_configure(env.pageserver.id, {"availability": "Active"})
 
@@ -179,7 +178,11 @@ def test_generations_upgrade(neon_env_builder: NeonEnvBuilder):
 
     env.pageserver.stop()
     # Starting without the override that disabled control_plane_api
-    env.pageserver.patch_config_toml_nonrecursive(replaced_config)
+    env.pageserver.patch_config_toml_nonrecursive(
+        {
+            "control_plane_api": control_plane_api,
+        }
+    )
     env.pageserver.start()
 
     generate_uploads_and_deletions(env, pageserver=env.pageserver, init=False)
diff --git a/test_runner/regress/test_timeline_size.py b/test_runner/regress/test_timeline_size.py
index 642b9e449b..9bf5f8680b 100644
--- a/test_runner/regress/test_timeline_size.py
+++ b/test_runner/regress/test_timeline_size.py
@@ -733,7 +733,7 @@ def test_ondemand_activation(neon_env_builder: NeonEnvBuilder):
 
     # We will run with the limit set to 1, so that once we have one tenant stuck
     # in a pausable failpoint, the rest are prevented from proceeding through warmup.
-    neon_env_builder.pageserver_config_override = "concurrent_tenant_warmup = '1'"
+    neon_env_builder.pageserver_config_override = "concurrent_tenant_warmup = 1"
 
     env = neon_env_builder.init_start()
     pageserver_http = env.pageserver.http_client()
@@ -984,7 +984,7 @@ def test_timeline_logical_size_task_priority(neon_env_builder: NeonEnvBuilder):
 
 
 def test_eager_attach_does_not_queue_up(neon_env_builder: NeonEnvBuilder):
-    neon_env_builder.pageserver_config_override = "concurrent_tenant_warmup = '1'"
+    neon_env_builder.pageserver_config_override = "concurrent_tenant_warmup = 1"
 
     env = neon_env_builder.init_start()
 
@@ -1062,7 +1062,7 @@ def test_eager_attach_does_not_queue_up(neon_env_builder: NeonEnvBuilder):
 @pytest.mark.parametrize("activation_method", ["endpoint", "branch", "delete"])
 def test_lazy_attach_activation(neon_env_builder: NeonEnvBuilder, activation_method: str):
     # env.initial_tenant will take up this permit when attaching with lazy because of a failpoint activated after restart
-    neon_env_builder.pageserver_config_override = "concurrent_tenant_warmup = '1'"
+    neon_env_builder.pageserver_config_override = "concurrent_tenant_warmup = 1"
 
     env = neon_env_builder.init_start()
 

From efe03d5a1ccce8e0f53e733d61fd0e3d0dd904f8 Mon Sep 17 00:00:00 2001
From: Joonas Koivunen <joonas@neon.tech>
Date: Thu, 5 Sep 2024 16:29:48 +0300
Subject: [PATCH 016/142] build: sync between benchies (#8919)

Sometimes, the benchmarks fail to start up pageserver in 10s without any
obvious reason. Benchmarks run sequentially on otherwise idle runners.
Try running `sync(2)` after each bench to force a cleaner slate.

Implement this via:
- an autouse fixture that is enabled by the SYNC_AFTER_EACH_TEST
  environment variable
- the autouse fixture seems to be the outermost fixture, so it works as
  expected
- setting SYNC_AFTER_EACH_TEST=true for benchmarks in the build_and_test
  workflow

Evidence:
https://neon-github-public-dev.s3.amazonaws.com/reports/main/10678984691/index.html#suites/5008d72a1ba3c0d618a030a938fc035c/1210266507534c0f/

---------

Co-authored-by: Alexander Bayandin <alexander@neon.tech>
---
 .github/workflows/build_and_test.yml     |  1 +
 test_runner/fixtures/compare_fixtures.py | 26 ++++++++++++++++++++++++
 2 files changed, 27 insertions(+)

diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml
index 53d33b420f..ee5fd1b0c6 100644
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -286,6 +286,7 @@ jobs:
           PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
           TEST_RESULT_CONNSTR: "${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}"
           PAGESERVER_VIRTUAL_FILE_IO_ENGINE: tokio-epoll-uring
+          SYNC_AFTER_EACH_TEST: true
       # XXX: no coverage data handling here, since benchmarks are run on release builds,
       # while coverage is currently collected for the debug ones
 
diff --git a/test_runner/fixtures/compare_fixtures.py b/test_runner/fixtures/compare_fixtures.py
index 98a9dd7184..7c4a8db36f 100644
--- a/test_runner/fixtures/compare_fixtures.py
+++ b/test_runner/fixtures/compare_fixtures.py
@@ -1,3 +1,5 @@
+import os
+import time
 from abc import ABC, abstractmethod
 from contextlib import _GeneratorContextManager, contextmanager
 
@@ -8,6 +10,7 @@ import pytest
 from _pytest.fixtures import FixtureRequest
 
 from fixtures.benchmark_fixture import MetricReport, NeonBenchmarker
+from fixtures.log_helper import log
 from fixtures.neon_fixtures import (
     NeonEnv,
     PgBin,
@@ -333,3 +336,26 @@ def neon_with_baseline(request: FixtureRequest) -> PgCompare:
     fixture = request.getfixturevalue(request.param)
     assert isinstance(fixture, PgCompare), f"test error: fixture {fixture} is not PgCompare"
     return fixture
+
+
+@pytest.fixture(scope="function", autouse=True)
+def sync_after_each_test():
+    # The fixture calls `sync(2)` after each test if `SYNC_AFTER_EACH_TEST` env var is `true`
+    #
+    # In CI, `SYNC_AFTER_EACH_TEST` is set to `true` only for benchmarks (`test_runner/performance`)
+    # that are run on self-hosted runners because some of these tests are pretty write-heavy
+    # and can prevent the processes from starting within 10s
+    key = "SYNC_AFTER_EACH_TEST"
+    enabled = os.environ.get(key) == "true"
+
+    yield
+
+    if not enabled:
+        # regress test, or running locally
+        return
+
+    start = time.time()
+    # we only run benches on unices, the method might not exist on windows
+    os.sync()
+    elapsed = time.time() - start
+    log.info(f"called sync after test {elapsed=}")

From ebddda5b7f85587998df00dbf7dc88679459b494 Mon Sep 17 00:00:00 2001
From: vladov <vladov3000@gmail.com>
Date: Thu, 5 Sep 2024 08:06:57 -0700
Subject: [PATCH 017/142] Fix precedence issue causing yielding loop to never
 yield. (#8922)

There is a bug in `yielding_loop` that causes it to never yield.

## Summary of changes

Fixed the bug. Because `%` binds tighter than `+`, `i + 1 % interval == 0`
parses as `i + (1 % interval) == 0`, i.e. `i + 1 == 0` for any
`interval > 1`, which is always false
([Playground](https://play.rust-lang.org/?version=stable&mode=debug&edition=2021&gist=68e6ca393a02113cb7720115c2842e75)).
This function is called in 2 places
[here](https://github.com/neondatabase/neon/blob/99fa1c36004d710c65a47ffefaf66b4b5c6b4ce1/pageserver/src/tenant/secondary/scheduler.rs#L389)
and
[here](https://github.com/neondatabase/neon/blob/99fa1c36004d710c65a47ffefaf66b4b5c6b4ce1/pageserver/src/tenant/secondary/heatmap_uploader.rs#L152)
with `interval == 1000` in both cases.

This may change the performance of the system, since the loop now
actually yields to tokio. It may also expose previously hidden bugs,
since the task can now be rescheduled (for example, moved to another
worker thread) at each yield point. However, yielding was the original
intention of the author of the code.
---
 libs/utils/src/yielding_loop.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/libs/utils/src/yielding_loop.rs b/libs/utils/src/yielding_loop.rs
index 963279eb4c..41c4cee45d 100644
--- a/libs/utils/src/yielding_loop.rs
+++ b/libs/utils/src/yielding_loop.rs
@@ -23,7 +23,7 @@ where
     for (i, item) in iter.enumerate() {
         visitor(item);
 
-        if i + 1 % interval == 0 {
+        if (i + 1) % interval == 0 {
             tokio::task::yield_now().await;
             if cancel.is_cancelled() {
                 return Err(YieldingLoopError::Cancelled);

From fd12dd942f61a0a22016fa219f4b3a87c81dc0b0 Mon Sep 17 00:00:00 2001
From: Stefan Radig <stefan@neon.tech>
Date: Thu, 5 Sep 2024 17:48:51 +0200
Subject: [PATCH 018/142] Add installation instructions for m4 on mac (#8929)

## Problem
Building on macOS failed due to a missing `m4`. A window popped up
offering to install m4, but this did not help.

## Summary of changes
Add instructions to install m4 using brew and link it (thanks to Folke
for helping).
---
 README.md | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/README.md b/README.md
index 735edef0fc..b54956f773 100644
--- a/README.md
+++ b/README.md
@@ -64,6 +64,12 @@ brew install protobuf openssl flex bison icu4c pkg-config
 echo 'export PATH="$(brew --prefix openssl)/bin:$PATH"' >> ~/.zshrc
 ```
 
+If you get errors about missing `m4` you may have to install it manually:
+```
+brew install m4
+brew link --force m4
+```
+
 2. [Install Rust](https://www.rust-lang.org/tools/install)
 ```
 # recommended approach from https://www.rust-lang.org/tools/install

From 04f99a87bfee4da41df2bd5724e73b3646c2bf3e Mon Sep 17 00:00:00 2001
From: Vlad Lazar <vlad@neon.tech>
Date: Thu, 5 Sep 2024 19:14:21 +0100
Subject: [PATCH 019/142] storcon: make pageserver AZ id mandatory (#8856)

## Problem
https://github.com/neondatabase/neon/pull/8852 introduced a new nullable
column for the `nodes` table: `availability_zone_id`

## Summary of changes
* Make neon local and the test suite always provide an az id
* Make the az id field in the ps registration request mandatory
* Migrate the column to non-nullable and adjust in memory state
accordingly
* Remove the code that was used to populate the az id for pre-existing
nodes
---
 Dockerfile                                    |  1 +
 control_plane/storcon_cli/src/main.rs         |  2 +-
 libs/pageserver_api/src/controller_api.rs     |  2 +-
 pageserver/src/control_plane_client.rs        | 24 +++++++---
 .../down.sql                                  |  1 +
 .../up.sql                                    |  1 +
 storage_controller/src/node.rs                | 23 +++-------
 storage_controller/src/persistence.rs         | 28 +-----------
 storage_controller/src/scheduler.rs           |  2 +-
 storage_controller/src/schema.rs              |  2 +-
 storage_controller/src/service.rs             | 44 ++-----------------
 test_runner/fixtures/neon_fixtures.py         |  3 ++
 12 files changed, 41 insertions(+), 92 deletions(-)
 create mode 100644 storage_controller/migrations/2024-08-28-150530_pageserver_az_not_null/down.sql
 create mode 100644 storage_controller/migrations/2024-08-28-150530_pageserver_az_not_null/up.sql

diff --git a/Dockerfile b/Dockerfile
index d3d12330c6..1efedfa9bc 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -87,6 +87,7 @@ RUN mkdir -p /data/.neon/ && \
        "pg_distrib_dir='/usr/local/'\n" \
        "listen_pg_addr='0.0.0.0:6400'\n" \
        "listen_http_addr='0.0.0.0:9898'\n" \
+       "availability_zone='local'\n" \
   > /data/.neon/pageserver.toml && \
   chown -R neon:neon /data/.neon
 
diff --git a/control_plane/storcon_cli/src/main.rs b/control_plane/storcon_cli/src/main.rs
index 5cce6cf3ae..2a81a3d825 100644
--- a/control_plane/storcon_cli/src/main.rs
+++ b/control_plane/storcon_cli/src/main.rs
@@ -336,7 +336,7 @@ async fn main() -> anyhow::Result<()> {
                         listen_pg_port,
                         listen_http_addr,
                         listen_http_port,
-                        availability_zone_id: Some(availability_zone_id),
+                        availability_zone_id,
                     }),
                 )
                 .await?;
diff --git a/libs/pageserver_api/src/controller_api.rs b/libs/pageserver_api/src/controller_api.rs
index 345abd69b6..6fb5a9a139 100644
--- a/libs/pageserver_api/src/controller_api.rs
+++ b/libs/pageserver_api/src/controller_api.rs
@@ -57,7 +57,7 @@ pub struct NodeRegisterRequest {
     pub listen_http_addr: String,
     pub listen_http_port: u16,
 
-    pub availability_zone_id: Option<String>,
+    pub availability_zone_id: String,
 }
 
 #[derive(Serialize, Deserialize)]
diff --git a/pageserver/src/control_plane_client.rs b/pageserver/src/control_plane_client.rs
index 56a536c387..f6d1c35a8c 100644
--- a/pageserver/src/control_plane_client.rs
+++ b/pageserver/src/control_plane_client.rs
@@ -141,10 +141,24 @@ impl ControlPlaneGenerationsApi for ControlPlaneClient {
                         m.other
                     );
 
-                    let az_id = m
-                        .other
-                        .get("availability_zone_id")
-                        .and_then(|jv| jv.as_str().map(|str| str.to_owned()));
+                    let az_id = {
+                        let az_id_from_metadata = m
+                            .other
+                            .get("availability_zone_id")
+                            .and_then(|jv| jv.as_str().map(|str| str.to_owned()));
+
+                        match az_id_from_metadata {
+                            Some(az_id) => Some(az_id),
+                            None => {
+                                tracing::warn!("metadata.json does not contain an 'availability_zone_id' field");
+                                conf.availability_zone.clone()
+                            }
+                        }
+                    };
+
+                    if az_id.is_none() {
+                        panic!("Availablity zone id could not be inferred from metadata.json or pageserver config");
+                    }
 
                     Some(NodeRegisterRequest {
                         node_id: conf.id,
@@ -152,7 +166,7 @@ impl ControlPlaneGenerationsApi for ControlPlaneClient {
                         listen_pg_port: m.postgres_port,
                         listen_http_addr: m.http_host,
                         listen_http_port: m.http_port,
-                        availability_zone_id: az_id,
+                        availability_zone_id: az_id.expect("Checked above"),
                     })
                 }
                 Err(e) => {
diff --git a/storage_controller/migrations/2024-08-28-150530_pageserver_az_not_null/down.sql b/storage_controller/migrations/2024-08-28-150530_pageserver_az_not_null/down.sql
new file mode 100644
index 0000000000..4fcb928533
--- /dev/null
+++ b/storage_controller/migrations/2024-08-28-150530_pageserver_az_not_null/down.sql
@@ -0,0 +1 @@
+ALTER TABLE nodes ALTER availability_zone_id DROP NOT NULL;
diff --git a/storage_controller/migrations/2024-08-28-150530_pageserver_az_not_null/up.sql b/storage_controller/migrations/2024-08-28-150530_pageserver_az_not_null/up.sql
new file mode 100644
index 0000000000..c5b4534087
--- /dev/null
+++ b/storage_controller/migrations/2024-08-28-150530_pageserver_az_not_null/up.sql
@@ -0,0 +1 @@
+ALTER TABLE nodes ALTER availability_zone_id SET NOT NULL;
diff --git a/storage_controller/src/node.rs b/storage_controller/src/node.rs
index 73cecc491d..cb9ce10d23 100644
--- a/storage_controller/src/node.rs
+++ b/storage_controller/src/node.rs
@@ -36,7 +36,7 @@ pub(crate) struct Node {
     listen_pg_addr: String,
     listen_pg_port: u16,
 
-    availability_zone_id: Option<String>,
+    availability_zone_id: String,
 
     // This cancellation token means "stop any RPCs in flight to this node, and don't start
     // any more". It is not related to process shutdown.
@@ -63,8 +63,9 @@ impl Node {
         self.id
     }
 
-    pub(crate) fn get_availability_zone_id(&self) -> Option<&str> {
-        self.availability_zone_id.as_deref()
+    #[allow(unused)]
+    pub(crate) fn get_availability_zone_id(&self) -> &str {
+        self.availability_zone_id.as_str()
     }
 
     pub(crate) fn get_scheduling(&self) -> NodeSchedulingPolicy {
@@ -78,22 +79,12 @@ impl Node {
     /// Does this registration request match `self`?  This is used when deciding whether a registration
     /// request should be allowed to update an existing record with the same node ID.
     pub(crate) fn registration_match(&self, register_req: &NodeRegisterRequest) -> bool {
-        let az_ids_match = {
-            match (
-                self.availability_zone_id.as_deref(),
-                register_req.availability_zone_id.as_deref(),
-            ) {
-                (Some(current_az), Some(register_req_az)) => current_az == register_req_az,
-                _ => true,
-            }
-        };
-
-        az_ids_match
-            && self.id == register_req.node_id
+        self.id == register_req.node_id
             && self.listen_http_addr == register_req.listen_http_addr
             && self.listen_http_port == register_req.listen_http_port
             && self.listen_pg_addr == register_req.listen_pg_addr
             && self.listen_pg_port == register_req.listen_pg_port
+            && self.availability_zone_id == register_req.availability_zone_id
     }
 
     /// For a shard located on this node, populate a response object
@@ -190,7 +181,7 @@ impl Node {
         listen_http_port: u16,
         listen_pg_addr: String,
         listen_pg_port: u16,
-        availability_zone_id: Option<String>,
+        availability_zone_id: String,
     ) -> Self {
         Self {
             id,
diff --git a/storage_controller/src/persistence.rs b/storage_controller/src/persistence.rs
index e801289752..6df05ebd13 100644
--- a/storage_controller/src/persistence.rs
+++ b/storage_controller/src/persistence.rs
@@ -105,7 +105,6 @@ pub(crate) enum DatabaseOperation {
     ListMetadataHealthOutdated,
     GetLeader,
     UpdateLeader,
-    SetNodeAzId,
 }
 
 #[must_use]
@@ -325,31 +324,6 @@ impl Persistence {
         }
     }
 
-    pub(crate) async fn set_node_availability_zone_id(
-        &self,
-        input_node_id: NodeId,
-        input_az_id: String,
-    ) -> DatabaseResult<()> {
-        use crate::schema::nodes::dsl::*;
-        let updated = self
-            .with_measured_conn(DatabaseOperation::SetNodeAzId, move |conn| {
-                let updated = diesel::update(nodes)
-                    .filter(node_id.eq(input_node_id.0 as i64))
-                    .set((availability_zone_id.eq(input_az_id.clone()),))
-                    .execute(conn)?;
-                Ok(updated)
-            })
-            .await?;
-
-        if updated != 1 {
-            Err(DatabaseError::Logical(format!(
-                "Node {node_id:?} not found for setting az id",
-            )))
-        } else {
-            Ok(())
-        }
-    }
-
     /// At startup, load the high level state for shards, such as their config + policy.  This will
     /// be enriched at runtime with state discovered on pageservers.
     pub(crate) async fn list_tenant_shards(&self) -> DatabaseResult<Vec<TenantShardPersistence>> {
@@ -1110,7 +1084,7 @@ pub(crate) struct NodePersistence {
     pub(crate) listen_http_port: i32,
     pub(crate) listen_pg_addr: String,
     pub(crate) listen_pg_port: i32,
-    pub(crate) availability_zone_id: Option<String>,
+    pub(crate) availability_zone_id: String,
 }
 
 /// Tenant metadata health status that are stored durably.
diff --git a/storage_controller/src/scheduler.rs b/storage_controller/src/scheduler.rs
index ef4da6861c..deb5f27226 100644
--- a/storage_controller/src/scheduler.rs
+++ b/storage_controller/src/scheduler.rs
@@ -528,7 +528,7 @@ pub(crate) mod test_utils {
                         80 + i as u16,
                         format!("pghost-{i}"),
                         5432 + i as u16,
-                        None,
+                        "test-az".to_string(),
                     );
                     node.set_availability(NodeAvailability::Active(test_utilization::simple(0, 0)));
                     assert!(node.is_available());
diff --git a/storage_controller/src/schema.rs b/storage_controller/src/schema.rs
index e0f515daea..93ab774b5f 100644
--- a/storage_controller/src/schema.rs
+++ b/storage_controller/src/schema.rs
@@ -25,7 +25,7 @@ diesel::table! {
         listen_http_port -> Int4,
         listen_pg_addr -> Varchar,
         listen_pg_port -> Int4,
-        availability_zone_id -> Nullable<Varchar>,
+        availability_zone_id -> Varchar,
     }
 }
 
diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs
index ca416095bb..2911cd5ac4 100644
--- a/storage_controller/src/service.rs
+++ b/storage_controller/src/service.rs
@@ -1264,7 +1264,7 @@ impl Service {
                     123,
                     "".to_string(),
                     123,
-                    None,
+                    "test_az".to_string(),
                 );
 
                 scheduler.node_upsert(&node);
@@ -4825,15 +4825,8 @@ impl Service {
         )
         .await;
 
-        if register_req.availability_zone_id.is_none() {
-            tracing::warn!(
-                "Node {} registering without specific availability zone id",
-                register_req.node_id
-            );
-        }
-
         enum RegistrationStatus {
-            Matched(Node),
+            Matched,
             Mismatched,
             New,
         }
@@ -4842,7 +4835,7 @@ impl Service {
             let locked = self.inner.read().unwrap();
             if let Some(node) = locked.nodes.get(&register_req.node_id) {
                 if node.registration_match(&register_req) {
-                    RegistrationStatus::Matched(node.clone())
+                    RegistrationStatus::Matched
                 } else {
                     RegistrationStatus::Mismatched
                 }
@@ -4852,41 +4845,12 @@ impl Service {
         };
 
         match registration_status {
-            RegistrationStatus::Matched(node) => {
+            RegistrationStatus::Matched => {
                 tracing::info!(
                     "Node {} re-registered with matching address",
                     register_req.node_id
                 );
 
-                if node.get_availability_zone_id().is_none() {
-                    if let Some(az_id) = register_req.availability_zone_id.clone() {
-                        tracing::info!("Extracting availability zone id from registration request for node {}: {}",
-                                       register_req.node_id, az_id);
-
-                        // Persist to the database and update in memory state. See comment below
-                        // on ordering.
-                        self.persistence
-                            .set_node_availability_zone_id(register_req.node_id, az_id)
-                            .await?;
-                        let node_with_az = Node::new(
-                            register_req.node_id,
-                            register_req.listen_http_addr,
-                            register_req.listen_http_port,
-                            register_req.listen_pg_addr,
-                            register_req.listen_pg_port,
-                            register_req.availability_zone_id,
-                        );
-
-                        let mut locked = self.inner.write().unwrap();
-                        let mut new_nodes = (*locked.nodes).clone();
-
-                        locked.scheduler.node_upsert(&node_with_az);
-                        new_nodes.insert(register_req.node_id, node_with_az);
-
-                        locked.nodes = Arc::new(new_nodes);
-                    }
-                }
-
                 return Ok(());
             }
             RegistrationStatus::Mismatched => {
diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index 2df45a7e0e..0c692ceb69 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -758,6 +758,9 @@ class NeonEnvBuilder:
         patch_script = ""
         for ps in self.env.pageservers:
             patch_script += f"UPDATE nodes SET listen_http_port={ps.service_port.http}, listen_pg_port={ps.service_port.pg}  WHERE node_id = '{ps.id}';"
+            # This is a temporary workaround to keep the backward compat test happy,
+            # since the compat snapshot was generated with an older version of neon local
+            patch_script += f"UPDATE nodes SET availability_zone_id='{ps.az_id}'  WHERE node_id = '{ps.id}' AND availability_zone_id IS NULL;"
         patch_script_path.write_text(patch_script)
 
         # Update the config with info about tenants and timelines

From cf11c8ab6aa234b59354425116da98d58fa1826d Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Fri, 6 Sep 2024 10:52:29 +0200
Subject: [PATCH 020/142] update svg_fmt to 0.4.3 (#8930)

Audited

```
diff -r -u ~/.cargo/registry/src/index.crates.io-6f17d22bba15001f/svg_fmt-0.4.{2,3}
```

fixes https://github.com/neondatabase/neon/issues/7763
---
 Cargo.lock | 5 +++--
 Cargo.toml | 3 +--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index 91917d5351..3f2787f15b 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -6094,8 +6094,9 @@ checksum = "81cdd64d312baedb58e21336b31bc043b77e01cc99033ce76ef539f78e965ebc"
 
 [[package]]
 name = "svg_fmt"
-version = "0.4.2"
-source = "git+https://github.com/nical/rust_debug?rev=28a7d96eecff2f28e75b1ea09f2d499a60d0e3b4#28a7d96eecff2f28e75b1ea09f2d499a60d0e3b4"
+version = "0.4.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "20e16a0f46cf5fd675563ef54f26e83e20f2366bcf027bcb3cc3ed2b98aaf2ca"
 
 [[package]]
 name = "syn"
diff --git a/Cargo.toml b/Cargo.toml
index 4fea3e8d80..2415337110 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -161,8 +161,7 @@ socket2 = "0.5"
 strum = "0.24"
 strum_macros = "0.24"
 "subtle"  = "2.5.0"
-# Our PR https://github.com/nical/rust_debug/pull/4 has been merged but no new version released yet
-svg_fmt = { git = "https://github.com/nical/rust_debug", rev = "28a7d96eecff2f28e75b1ea09f2d499a60d0e3b4" }
+svg_fmt = "0.4.3"
 sync_wrapper = "0.1.2"
 tar = "0.4"
 task-local-extensions = "0.1.4"

From 06e840b884c242550e2a5ad0e72bfa762bce1709 Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Fri, 6 Sep 2024 10:58:48 +0200
Subject: [PATCH 021/142] compact_level0_phase1: ignore access mode config,
 always do streaming-kmerge without validation (#8934)

refs https://github.com/neondatabase/neon/issues/8184

PR https://github.com/neondatabase/infra/pull/1905 enabled
streaming-kmerge without validation everywhere.

That infra change rolls into prod before, or in the same release as, this PR.
---
 libs/pageserver_api/src/config.rs            |  43 +-----
 pageserver/src/bin/pageserver.rs             |   1 -
 pageserver/src/config.rs                     |  26 ++--
 pageserver/src/tenant/timeline/compaction.rs | 139 +------------------
 4 files changed, 22 insertions(+), 187 deletions(-)

diff --git a/libs/pageserver_api/src/config.rs b/libs/pageserver_api/src/config.rs
index b2662c562a..1194ee93ef 100644
--- a/libs/pageserver_api/src/config.rs
+++ b/libs/pageserver_api/src/config.rs
@@ -104,7 +104,9 @@ pub struct ConfigToml {
     pub image_compression: ImageCompressionAlgorithm,
     pub ephemeral_bytes_per_memory_kb: usize,
     pub l0_flush: Option<crate::models::L0FlushConfig>,
-    pub compact_level0_phase1_value_access: CompactL0Phase1ValueAccess,
+    #[serde(skip_serializing)]
+    // TODO(https://github.com/neondatabase/neon/issues/8184): remove after this field is removed from all pageserver.toml's
+    pub compact_level0_phase1_value_access: serde::de::IgnoredAny,
     pub virtual_file_direct_io: crate::models::virtual_file::DirectIoMode,
     pub io_buffer_alignment: usize,
 }
@@ -209,43 +211,6 @@ pub enum GetImpl {
 #[serde(transparent)]
 pub struct MaxVectoredReadBytes(pub NonZeroUsize);
 
-#[derive(Debug, PartialEq, Eq, Clone, serde::Deserialize, serde::Serialize)]
-#[serde(tag = "mode", rename_all = "kebab-case", deny_unknown_fields)]
-pub enum CompactL0Phase1ValueAccess {
-    /// The old way.
-    PageCachedBlobIo,
-    /// The new way.
-    StreamingKmerge {
-        /// If set, we run both the old way and the new way, validate that
-        /// they are identical (=> [`CompactL0BypassPageCacheValidation`]),
-        /// and if the validation fails,
-        /// - in tests: fail them with a panic or
-        /// - in prod, log a rate-limited warning and use the old way's results.
-        ///
-        /// If not set, we only run the new way and trust its results.
-        validate: Option<CompactL0BypassPageCacheValidation>,
-    },
-}
-
-/// See [`CompactL0Phase1ValueAccess::StreamingKmerge`].
-#[derive(Debug, PartialEq, Eq, Clone, serde::Deserialize, serde::Serialize)]
-#[serde(rename_all = "kebab-case")]
-pub enum CompactL0BypassPageCacheValidation {
-    /// Validate that the series of (key, lsn) pairs are the same.
-    KeyLsn,
-    /// Validate that the entire output of old and new way is identical.
-    KeyLsnValue,
-}
-
-impl Default for CompactL0Phase1ValueAccess {
-    fn default() -> Self {
-        CompactL0Phase1ValueAccess::StreamingKmerge {
-            // TODO(https://github.com/neondatabase/neon/issues/8184): change to None once confident
-            validate: Some(CompactL0BypassPageCacheValidation::KeyLsnValue),
-        }
-    }
-}
-
 /// A tenant's calcuated configuration, which is the result of merging a
 /// tenant's TenantConfOpt with the global TenantConf from PageServerConf.
 ///
@@ -452,7 +417,7 @@ impl Default for ConfigToml {
             image_compression: (DEFAULT_IMAGE_COMPRESSION),
             ephemeral_bytes_per_memory_kb: (DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB),
             l0_flush: None,
-            compact_level0_phase1_value_access: CompactL0Phase1ValueAccess::default(),
+            compact_level0_phase1_value_access: Default::default(),
             virtual_file_direct_io: crate::models::virtual_file::DirectIoMode::default(),
 
             io_buffer_alignment: DEFAULT_IO_BUFFER_ALIGNMENT,
diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs
index 2c60e8d7d1..59194ab4bd 100644
--- a/pageserver/src/bin/pageserver.rs
+++ b/pageserver/src/bin/pageserver.rs
@@ -125,7 +125,6 @@ fn main() -> anyhow::Result<()> {
     // after setting up logging, log the effective IO engine choice and read path implementations
     info!(?conf.virtual_file_io_engine, "starting with virtual_file IO engine");
     info!(?conf.virtual_file_direct_io, "starting with virtual_file Direct IO settings");
-    info!(?conf.compact_level0_phase1_value_access, "starting with setting for compact_level0_phase1_value_access");
     info!(?conf.io_buffer_alignment, "starting with setting for IO buffer alignment");
 
     // The tenants directory contains all the pageserver local disk state.
diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs
index c159b66905..4e68e276d3 100644
--- a/pageserver/src/config.rs
+++ b/pageserver/src/config.rs
@@ -174,10 +174,6 @@ pub struct PageServerConf {
 
     pub l0_flush: crate::l0_flush::L0FlushConfig,
 
-    /// This flag is temporary and will be removed after gradual rollout.
-    /// See <https://github.com/neondatabase/neon/issues/8184>.
-    pub compact_level0_phase1_value_access: pageserver_api::config::CompactL0Phase1ValueAccess,
-
     /// Direct IO settings
     pub virtual_file_direct_io: virtual_file::DirectIoMode,
 
@@ -338,7 +334,7 @@ impl PageServerConf {
             max_vectored_read_bytes,
             image_compression,
             ephemeral_bytes_per_memory_kb,
-            compact_level0_phase1_value_access,
+            compact_level0_phase1_value_access: _,
             l0_flush,
             virtual_file_direct_io,
             concurrent_tenant_warmup,
@@ -383,7 +379,6 @@ impl PageServerConf {
             max_vectored_read_bytes,
             image_compression,
             ephemeral_bytes_per_memory_kb,
-            compact_level0_phase1_value_access,
             virtual_file_direct_io,
             io_buffer_alignment,
 
@@ -561,6 +556,16 @@ mod tests {
             .expect("parse_and_validate");
     }
 
+    #[test]
+    fn test_compactl0_phase1_access_mode_is_ignored_silently() {
+        let input = indoc::indoc! {r#"
+            [compact_level0_phase1_value_access]
+            mode = "streaming-kmerge"
+            validate = "key-lsn-value"
+        "#};
+        toml_edit::de::from_str::<pageserver_api::config::ConfigToml>(input).unwrap();
+    }
+
     /// If there's a typo in the pageserver config, we'd rather catch that typo
     /// and fail pageserver startup than silently ignoring the typo, leaving whoever
     /// made it in the believe that their config change is effective.
@@ -637,14 +642,5 @@ mod tests {
         //         some_invalid_field = 23
         //     "#}
         // );
-
-        test!(
-            compact_level0_phase1_value_access,
-            indoc! {r#"
-                [compact_level0_phase1_value_access]
-                mode = "streaming-kmerge"
-                some_invalid_field = 23
-            "#}
-        );
     }
 }
diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs
index 6b9c8386f7..a87b502cd6 100644
--- a/pageserver/src/tenant/timeline/compaction.rs
+++ b/pageserver/src/tenant/timeline/compaction.rs
@@ -19,7 +19,6 @@ use bytes::Bytes;
 use enumset::EnumSet;
 use fail::fail_point;
 use itertools::Itertools;
-use pageserver_api::config::{CompactL0BypassPageCacheValidation, CompactL0Phase1ValueAccess};
 use pageserver_api::key::KEY_SIZE;
 use pageserver_api::keyspace::ShardedRange;
 use pageserver_api::shard::{ShardCount, ShardIdentity, TenantShardId};
@@ -912,137 +911,13 @@ impl Timeline {
         // we're compacting, in key, LSN order.
         // If there's both a Value::Image and Value::WalRecord for the same (key,lsn),
         // then the Value::Image is ordered before Value::WalRecord.
-        //
-        // TODO(https://github.com/neondatabase/neon/issues/8184): remove the page cached blob_io
-        // option and validation code once we've reached confidence.
-        enum AllValuesIter<'a> {
-            PageCachedBlobIo {
-                all_keys_iter: VecIter<'a>,
-            },
-            StreamingKmergeBypassingPageCache {
-                merge_iter: MergeIterator<'a>,
-            },
-            ValidatingStreamingKmergeBypassingPageCache {
-                mode: CompactL0BypassPageCacheValidation,
-                merge_iter: MergeIterator<'a>,
-                all_keys_iter: VecIter<'a>,
-            },
-        }
-        type VecIter<'a> = std::slice::Iter<'a, DeltaEntry<'a>>; // TODO: distinguished lifetimes
-        impl AllValuesIter<'_> {
-            async fn next_all_keys_iter(
-                iter: &mut VecIter<'_>,
-                ctx: &RequestContext,
-            ) -> anyhow::Result<Option<(Key, Lsn, Value)>> {
-                let Some(DeltaEntry {
-                    key,
-                    lsn,
-                    val: value_ref,
-                    ..
-                }) = iter.next()
-                else {
-                    return Ok(None);
-                };
-                let value = value_ref.load(ctx).await?;
-                Ok(Some((*key, *lsn, value)))
-            }
-            async fn next(
-                &mut self,
-                ctx: &RequestContext,
-            ) -> anyhow::Result<Option<(Key, Lsn, Value)>> {
-                match self {
-                    AllValuesIter::PageCachedBlobIo { all_keys_iter: iter } => {
-                      Self::next_all_keys_iter(iter, ctx).await
-                    }
-                    AllValuesIter::StreamingKmergeBypassingPageCache { merge_iter } => merge_iter.next().await,
-                    AllValuesIter::ValidatingStreamingKmergeBypassingPageCache { mode, merge_iter, all_keys_iter } => async {
-                        // advance both iterators
-                        let all_keys_iter_item = Self::next_all_keys_iter(all_keys_iter, ctx).await;
-                        let merge_iter_item = merge_iter.next().await;
-                        // compare results & log warnings as needed
-                        macro_rules! rate_limited_warn {
-                            ($($arg:tt)*) => {{
-                                if cfg!(debug_assertions) || cfg!(feature = "testing") {
-                                    warn!($($arg)*);
-                                    panic!("CompactL0BypassPageCacheValidation failure, check logs");
-                                }
-                                use once_cell::sync::Lazy;
-                                use utils::rate_limit::RateLimit;
-                                use std::sync::Mutex;
-                                use std::time::Duration;
-                                static LOGGED: Lazy<Mutex<RateLimit>> =
-                                    Lazy::new(|| Mutex::new(RateLimit::new(Duration::from_secs(10))));
-                                let mut rate_limit = LOGGED.lock().unwrap();
-                                rate_limit.call(|| {
-                                    warn!($($arg)*);
-                                });
-                            }}
-                        }
-                        match (&all_keys_iter_item, &merge_iter_item) {
-                            (Err(_), Err(_)) => {
-                                // don't bother asserting equivality of the errors
-                            }
-                            (Err(all_keys), Ok(merge)) => {
-                                rate_limited_warn!(?merge, "all_keys_iter returned an error where merge did not: {all_keys:?}");
-                            },
-                            (Ok(all_keys), Err(merge)) => {
-                                rate_limited_warn!(?all_keys, "merge returned an error where all_keys_iter did not: {merge:?}");
-                            },
-                            (Ok(None), Ok(None)) => { }
-                            (Ok(Some(all_keys)), Ok(None)) => {
-                                rate_limited_warn!(?all_keys, "merge returned None where all_keys_iter returned Some");
-                            }
-                            (Ok(None), Ok(Some(merge))) => {
-                                rate_limited_warn!(?merge, "all_keys_iter returned None where merge returned Some");
-                            }
-                            (Ok(Some((all_keys_key, all_keys_lsn, all_keys_value))), Ok(Some((merge_key, merge_lsn, merge_value)))) => {
-                                match mode {
-                                    // TODO: in this mode, we still load the value from disk for both iterators, even though we only need the all_keys_iter one
-                                    CompactL0BypassPageCacheValidation::KeyLsn => {
-                                        let all_keys = (all_keys_key, all_keys_lsn);
-                                        let merge = (merge_key, merge_lsn);
-                                        if all_keys != merge {
-                                            rate_limited_warn!(?all_keys, ?merge, "merge returned a different (Key,LSN) than all_keys_iter");
-                                        }
-                                    }
-                                    CompactL0BypassPageCacheValidation::KeyLsnValue => {
-                                        let all_keys = (all_keys_key, all_keys_lsn, all_keys_value);
-                                        let merge = (merge_key, merge_lsn, merge_value);
-                                        if all_keys != merge {
-                                            rate_limited_warn!(?all_keys, ?merge, "merge returned a different (Key,LSN,Value) than all_keys_iter");
-                                        }
-                                    }
-                                }
-                            }
-                        }
-                        // in case of mismatch, trust the legacy all_keys_iter_item
-                        all_keys_iter_item
-                    }.instrument(info_span!("next")).await
-                }
-            }
-        }
-        let mut all_values_iter = match &self.conf.compact_level0_phase1_value_access {
-            CompactL0Phase1ValueAccess::PageCachedBlobIo => AllValuesIter::PageCachedBlobIo {
-                all_keys_iter: all_keys.iter(),
-            },
-            CompactL0Phase1ValueAccess::StreamingKmerge { validate } => {
-                let merge_iter = {
-                    let mut deltas = Vec::with_capacity(deltas_to_compact.len());
-                    for l in deltas_to_compact.iter() {
-                        let l = l.get_as_delta(ctx).await.map_err(CompactionError::Other)?;
-                        deltas.push(l);
-                    }
-                    MergeIterator::create(&deltas, &[], ctx)
-                };
-                match validate {
-                    None => AllValuesIter::StreamingKmergeBypassingPageCache { merge_iter },
-                    Some(validate) => AllValuesIter::ValidatingStreamingKmergeBypassingPageCache {
-                        mode: validate.clone(),
-                        merge_iter,
-                        all_keys_iter: all_keys.iter(),
-                    },
-                }
+        let mut all_values_iter = {
+            let mut deltas = Vec::with_capacity(deltas_to_compact.len());
+            for l in deltas_to_compact.iter() {
+                let l = l.get_as_delta(ctx).await.map_err(CompactionError::Other)?;
+                deltas.push(l);
             }
+            MergeIterator::create(&deltas, &[], ctx)
         };
 
         // This iterator walks through all keys and is needed to calculate size used by each key
@@ -1119,7 +994,7 @@ impl Timeline {
         let mut keys = 0;
 
         while let Some((key, lsn, value)) = all_values_iter
-            .next(ctx)
+            .next()
             .await
             .map_err(CompactionError::Other)?
         {

From a1323231bc65539f55eb1bfd341fb65d06d0ed22 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Arpad=20M=C3=BCller?= <arpad-m@users.noreply.github.com>
Date: Fri, 6 Sep 2024 12:40:19 +0200
Subject: [PATCH 022/142] Update Rust to 1.81.0 (#8939)

We keep the practice of keeping the compiler up to date, pointing to the
latest release. This is done by many other projects in the Rust
ecosystem as well.

[Release notes](https://github.com/rust-lang/rust/blob/master/RELEASES.md#version-1810-2024-09-05).

Prior updates were in #8667 and #8518
---
 Cargo.lock                                    | 30 +++++--------------
 Cargo.toml                                    |  2 +-
 Dockerfile.build-tools                        |  2 +-
 libs/postgres_ffi/build.rs                    |  2 +-
 libs/walproposer/build.rs                     | 21 ++++++++++---
 libs/walproposer/src/api_bindings.rs          | 10 +++----
 .../tenant/remote_timeline_client/download.rs |  3 +-
 proxy/src/console/provider/neon.rs            |  5 +---
 rust-toolchain.toml                           |  2 +-
 safekeeper/src/send_wal.rs                    |  5 ++--
 workspace_hack/Cargo.toml                     |  6 ++--
 11 files changed, 42 insertions(+), 46 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index 3f2787f15b..634af67198 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -915,25 +915,22 @@ dependencies = [
 
 [[package]]
 name = "bindgen"
-version = "0.65.1"
+version = "0.70.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "cfdf7b466f9a4903edc73f95d6d2bcd5baf8ae620638762244d3f60143643cc5"
+checksum = "f49d8fed880d473ea71efb9bf597651e77201bdd4893efe54c9e5d65ae04ce6f"
 dependencies = [
- "bitflags 1.3.2",
+ "bitflags 2.4.1",
  "cexpr",
  "clang-sys",
- "lazy_static",
- "lazycell",
+ "itertools 0.12.1",
  "log",
- "peeking_take_while",
- "prettyplease 0.2.6",
+ "prettyplease 0.2.17",
  "proc-macro2",
  "quote",
  "regex",
  "rustc-hash",
  "shlex",
  "syn 2.0.52",
- "which",
 ]
 
 [[package]]
@@ -2949,12 +2946,6 @@ dependencies = [
  "spin 0.5.2",
 ]
 
-[[package]]
-name = "lazycell"
-version = "1.3.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "830d08ce1d1d941e6b30645f1a0eb5643013d835ce3779a5fc208261dbe10f55"
-
 [[package]]
 name = "libc"
 version = "0.2.150"
@@ -3977,12 +3968,6 @@ dependencies = [
  "sha2",
 ]
 
-[[package]]
-name = "peeking_take_while"
-version = "0.1.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "19b17cddbe7ec3f8bc800887bab5e717348c95ea2ca0b1bf0837fb964dc67099"
-
 [[package]]
 name = "pem"
 version = "3.0.3"
@@ -4280,9 +4265,9 @@ dependencies = [
 
 [[package]]
 name = "prettyplease"
-version = "0.2.6"
+version = "0.2.17"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "3b69d39aab54d069e7f2fe8cb970493e7834601ca2d8c65fd7bbd183578080d1"
+checksum = "8d3928fb5db768cb86f891ff014f0144589297e3c6a1aba6ed7cecfdace270c7"
 dependencies = [
  "proc-macro2",
  "syn 2.0.52",
@@ -7628,6 +7613,7 @@ dependencies = [
  "hyper 0.14.26",
  "indexmap 1.9.3",
  "itertools 0.10.5",
+ "itertools 0.12.1",
  "lazy_static",
  "libc",
  "log",
diff --git a/Cargo.toml b/Cargo.toml
index 2415337110..5045ee0d4d 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -64,7 +64,7 @@ aws-types = "1.2.0"
 axum = { version = "0.6.20", features = ["ws"] }
 base64 = "0.13.0"
 bincode = "1.3"
-bindgen = "0.65"
+bindgen = "0.70"
 bit_field = "0.10.2"
 bstr = "1.0"
 byteorder = "1.4"
diff --git a/Dockerfile.build-tools b/Dockerfile.build-tools
index d6beb61369..a9cbed85fb 100644
--- a/Dockerfile.build-tools
+++ b/Dockerfile.build-tools
@@ -192,7 +192,7 @@ WORKDIR /home/nonroot
 
 # Rust
 # Please keep the version of llvm (installed above) in sync with rust llvm (`rustc --version --verbose | grep LLVM`)
-ENV RUSTC_VERSION=1.80.1
+ENV RUSTC_VERSION=1.81.0
 ENV RUSTUP_HOME="/home/nonroot/.rustup"
 ENV PATH="/home/nonroot/.cargo/bin:${PATH}"
 ARG RUSTFILT_VERSION=0.2.1
diff --git a/libs/postgres_ffi/build.rs b/libs/postgres_ffi/build.rs
index 370d9e9a6f..d3e3ce648f 100644
--- a/libs/postgres_ffi/build.rs
+++ b/libs/postgres_ffi/build.rs
@@ -14,7 +14,7 @@ impl ParseCallbacks for PostgresFfiCallbacks {
     fn include_file(&self, filename: &str) {
         // This does the equivalent of passing bindgen::CargoCallbacks
         // to the builder .parse_callbacks() method.
-        let cargo_callbacks = bindgen::CargoCallbacks;
+        let cargo_callbacks = bindgen::CargoCallbacks::new();
         cargo_callbacks.include_file(filename)
     }
 
diff --git a/libs/walproposer/build.rs b/libs/walproposer/build.rs
index 7bb077062b..28547f52bf 100644
--- a/libs/walproposer/build.rs
+++ b/libs/walproposer/build.rs
@@ -4,7 +4,6 @@
 use std::{env, path::PathBuf, process::Command};
 
 use anyhow::{anyhow, Context};
-use bindgen::CargoCallbacks;
 
 fn main() -> anyhow::Result<()> {
     // Tell cargo to invalidate the built crate whenever the wrapper changes
@@ -64,16 +63,25 @@ fn main() -> anyhow::Result<()> {
             .map_err(|s| anyhow!("Bad postgres server path {s:?}"))?
     };
 
+    let unwind_abi_functions = [
+        "log_internal",
+        "recovery_download",
+        "start_streaming",
+        "finish_sync_safekeepers",
+        "wait_event_set",
+        "WalProposerStart",
+    ];
+
     // The bindgen::Builder is the main entry point
     // to bindgen, and lets you build up options for
     // the resulting bindings.
-    let bindings = bindgen::Builder::default()
+    let mut builder = bindgen::Builder::default()
         // The input header we would like to generate
         // bindings for.
         .header("bindgen_deps.h")
         // Tell cargo to invalidate the built crate whenever any of the
         // included header files changed.
-        .parse_callbacks(Box::new(CargoCallbacks))
+        .parse_callbacks(Box::new(bindgen::CargoCallbacks::new()))
         .allowlist_type("WalProposer")
         .allowlist_type("WalProposerConfig")
         .allowlist_type("walproposer_api")
@@ -105,7 +113,12 @@ fn main() -> anyhow::Result<()> {
         .allowlist_var("WL_SOCKET_MASK")
         .clang_arg("-DWALPROPOSER_LIB")
         .clang_arg(format!("-I{pgxn_neon}"))
-        .clang_arg(format!("-I{inc_server_path}"))
+        .clang_arg(format!("-I{inc_server_path}"));
+
+    for name in unwind_abi_functions {
+        builder = builder.override_abi(bindgen::Abi::CUnwind, name);
+    }
+    let bindings = builder
         // Finish the builder and generate the bindings.
         .generate()
         // Unwrap the Result and panic on failure.
diff --git a/libs/walproposer/src/api_bindings.rs b/libs/walproposer/src/api_bindings.rs
index bbc3663402..2fbea3fe45 100644
--- a/libs/walproposer/src/api_bindings.rs
+++ b/libs/walproposer/src/api_bindings.rs
@@ -33,7 +33,7 @@ extern "C" fn get_shmem_state(wp: *mut WalProposer) -> *mut WalproposerShmemStat
     }
 }
 
-extern "C" fn start_streaming(wp: *mut WalProposer, startpos: XLogRecPtr) {
+extern "C-unwind" fn start_streaming(wp: *mut WalProposer, startpos: XLogRecPtr) {
     unsafe {
         let callback_data = (*(*wp).config).callback_data;
         let api = callback_data as *mut Box<dyn ApiImpl>;
@@ -187,7 +187,7 @@ extern "C" fn conn_blocking_write(
     }
 }
 
-extern "C" fn recovery_download(wp: *mut WalProposer, sk: *mut Safekeeper) -> bool {
+extern "C-unwind" fn recovery_download(wp: *mut WalProposer, sk: *mut Safekeeper) -> bool {
     unsafe {
         let callback_data = (*(*(*sk).wp).config).callback_data;
         let api = callback_data as *mut Box<dyn ApiImpl>;
@@ -272,7 +272,7 @@ extern "C" fn rm_safekeeper_event_set(sk: *mut Safekeeper) {
     }
 }
 
-extern "C" fn wait_event_set(
+extern "C-unwind" fn wait_event_set(
     wp: *mut WalProposer,
     timeout: ::std::os::raw::c_long,
     event_sk: *mut *mut Safekeeper,
@@ -324,7 +324,7 @@ extern "C" fn get_redo_start_lsn(wp: *mut WalProposer) -> XLogRecPtr {
     }
 }
 
-extern "C" fn finish_sync_safekeepers(wp: *mut WalProposer, lsn: XLogRecPtr) {
+extern "C-unwind" fn finish_sync_safekeepers(wp: *mut WalProposer, lsn: XLogRecPtr) {
     unsafe {
         let callback_data = (*(*wp).config).callback_data;
         let api = callback_data as *mut Box<dyn ApiImpl>;
@@ -340,7 +340,7 @@ extern "C" fn process_safekeeper_feedback(wp: *mut WalProposer, sk: *mut Safekee
     }
 }
 
-extern "C" fn log_internal(
+extern "C-unwind" fn log_internal(
     wp: *mut WalProposer,
     level: ::std::os::raw::c_int,
     line: *const ::std::os::raw::c_char,
diff --git a/pageserver/src/tenant/remote_timeline_client/download.rs b/pageserver/src/tenant/remote_timeline_client/download.rs
index d9725ad756..9fbe2f0da5 100644
--- a/pageserver/src/tenant/remote_timeline_client/download.rs
+++ b/pageserver/src/tenant/remote_timeline_client/download.rs
@@ -548,7 +548,7 @@ pub(crate) async fn download_initdb_tar_zst(
         cancel,
     )
     .await
-    .map_err(|e| {
+    .inspect_err(|_e| {
         // Do a best-effort attempt at deleting the temporary file upon encountering an error.
         // We don't have async here nor do we want to pile on any extra errors.
         if let Err(e) = std::fs::remove_file(&temp_path) {
@@ -556,7 +556,6 @@ pub(crate) async fn download_initdb_tar_zst(
                 warn!("error deleting temporary file {temp_path}: {e}");
             }
         }
-        e
     })?;
 
     Ok((temp_path, file))
diff --git a/proxy/src/console/provider/neon.rs b/proxy/src/console/provider/neon.rs
index 33eda72e65..b004bf4ecf 100644
--- a/proxy/src/console/provider/neon.rs
+++ b/proxy/src/console/provider/neon.rs
@@ -38,10 +38,7 @@ impl Api {
         locks: &'static ApiLocks<EndpointCacheKey>,
         wake_compute_endpoint_rate_limiter: Arc<WakeComputeRateLimiter>,
     ) -> Self {
-        let jwt = match std::env::var("NEON_PROXY_TO_CONTROLPLANE_TOKEN") {
-            Ok(v) => v,
-            Err(_) => String::new(),
-        };
+        let jwt = std::env::var("NEON_PROXY_TO_CONTROLPLANE_TOKEN").unwrap_or_default();
         Self {
             endpoint,
             caches,
diff --git a/rust-toolchain.toml b/rust-toolchain.toml
index 368b8d300a..e78c4d6790 100644
--- a/rust-toolchain.toml
+++ b/rust-toolchain.toml
@@ -1,5 +1,5 @@
 [toolchain]
-channel = "1.80.1"
+channel = "1.81.0"
 profile = "default"
 # The default profile includes rustc, rust-std, cargo, rust-docs, rustfmt and clippy.
 # https://rust-lang.github.io/rustup/concepts/profiles.html
diff --git a/safekeeper/src/send_wal.rs b/safekeeper/src/send_wal.rs
index 90b1604adb..6d677f405a 100644
--- a/safekeeper/src/send_wal.rs
+++ b/safekeeper/src/send_wal.rs
@@ -758,9 +758,8 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> ReplyReader<IO> {
                 // pq_sendint32(&reply_message, xmin);
                 // pq_sendint32(&reply_message, xmin_epoch);
                 // So it is two big endian 32-bit words in low endian order!
-                hs_feedback.xmin = (hs_feedback.xmin >> 32) | (hs_feedback.xmin << 32);
-                hs_feedback.catalog_xmin =
-                    (hs_feedback.catalog_xmin >> 32) | (hs_feedback.catalog_xmin << 32);
+                hs_feedback.xmin = hs_feedback.xmin.rotate_left(32);
+                hs_feedback.catalog_xmin = hs_feedback.catalog_xmin.rotate_left(32);
                 self.ws_guard
                     .walsenders
                     .record_hs_feedback(self.ws_guard.id, &hs_feedback);
diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml
index 20693ad63d..3d2fa8c214 100644
--- a/workspace_hack/Cargo.toml
+++ b/workspace_hack/Cargo.toml
@@ -47,7 +47,8 @@ hex = { version = "0.4", features = ["serde"] }
 hmac = { version = "0.12", default-features = false, features = ["reset"] }
 hyper = { version = "0.14", features = ["full"] }
 indexmap = { version = "1", default-features = false, features = ["std"] }
-itertools = { version = "0.10" }
+itertools-5ef9efb8ec2df382 = { package = "itertools", version = "0.12", default-features = false, features = ["use_std"] }
+itertools-93f6ce9d446188ac = { package = "itertools", version = "0.10" }
 lazy_static = { version = "1", default-features = false, features = ["spin_no_std"] }
 libc = { version = "0.2", features = ["extra_traits", "use_std"] }
 log = { version = "0.4", default-features = false, features = ["std"] }
@@ -101,7 +102,8 @@ either = { version = "1" }
 getrandom = { version = "0.2", default-features = false, features = ["std"] }
 hashbrown = { version = "0.14", features = ["raw"] }
 indexmap = { version = "1", default-features = false, features = ["std"] }
-itertools = { version = "0.10" }
+itertools-5ef9efb8ec2df382 = { package = "itertools", version = "0.12", default-features = false, features = ["use_std"] }
+itertools-93f6ce9d446188ac = { package = "itertools", version = "0.10" }
 lazy_static = { version = "1", default-features = false, features = ["spin_no_std"] }
 libc = { version = "0.2", features = ["extra_traits", "use_std"] }
 log = { version = "0.4", default-features = false, features = ["std"] }

From e86fef05ddbc276170ec29d035d86d03e3ad4ec2 Mon Sep 17 00:00:00 2001
From: Vlad Lazar <vlad@neon.tech>
Date: Fri, 6 Sep 2024 13:11:17 +0100
Subject: [PATCH 023/142] storcon: track preferred AZ for each tenant shard
 (#8937)

## Problem
We want to do AZ aware scheduling, but don't have enough metadata.

## Summary of changes
Introduce a `preferred_az_id` concept for each managed tenant shard.

In a future PR, the scheduler will use this as a soft preference.
The idea is to try and keep the shard attachments within the same AZ.
Under the assumption that the compute was placed in the correct AZ,
this reduces the chances of cross-AZ traffic between compute and PS.

In terms of code changes we:
1. Add a new nullable `preferred_az_id` column to the `tenant_shards`
table. Also include an in-memory counterpart.
2. Populate the preferred az on tenant creation and shard splits.
3. Add an endpoint which allows bulk-setting preferred AZs.

(3) gives us the migration path. I'll write a script which queries the
cplane db in the region and sets the preferred az of all shards with an
active compute to the AZ of said compute. For shards without an active compute,
I'll use the AZ of the currently attached pageserver
since this is what cplane uses now to schedule computes.
---
 libs/pageserver_api/src/controller_api.rs     |  15 +-
 .../down.sql                                  |   1 +
 .../up.sql                                    |   1 +
 storage_controller/src/http.rs                |  21 +-
 storage_controller/src/persistence.rs         |  33 ++
 storage_controller/src/schema.rs              |   1 +
 storage_controller/src/service.rs             | 327 +++++++++++++-----
 storage_controller/src/tenant_shard.rs        |  15 +
 test_runner/fixtures/neon_fixtures.py         |  13 +-
 .../regress/test_storage_controller.py        |  52 +++
 10 files changed, 384 insertions(+), 95 deletions(-)
 create mode 100644 storage_controller/migrations/2024-09-05-104500_tenant_shard_preferred_az/down.sql
 create mode 100644 storage_controller/migrations/2024-09-05-104500_tenant_shard_preferred_az/up.sql

diff --git a/libs/pageserver_api/src/controller_api.rs b/libs/pageserver_api/src/controller_api.rs
index 6fb5a9a139..94104af002 100644
--- a/libs/pageserver_api/src/controller_api.rs
+++ b/libs/pageserver_api/src/controller_api.rs
@@ -1,4 +1,4 @@
-use std::collections::HashSet;
+use std::collections::{HashMap, HashSet};
 use std::str::FromStr;
 use std::time::{Duration, Instant};
 
@@ -74,6 +74,17 @@ pub struct TenantPolicyRequest {
     pub scheduling: Option<ShardSchedulingPolicy>,
 }
 
+#[derive(Serialize, Deserialize)]
+pub struct ShardsPreferredAzsRequest {
+    #[serde(flatten)]
+    pub preferred_az_ids: HashMap<TenantShardId, String>,
+}
+
+#[derive(Serialize, Deserialize)]
+pub struct ShardsPreferredAzsResponse {
+    pub updated: Vec<TenantShardId>,
+}
+
 #[derive(Serialize, Deserialize, Debug)]
 pub struct TenantLocateResponseShard {
     pub shard_id: TenantShardId,
@@ -132,6 +143,8 @@ pub struct TenantDescribeResponseShard {
     pub is_splitting: bool,
 
     pub scheduling_policy: ShardSchedulingPolicy,
+
+    pub preferred_az_id: Option<String>,
 }
 
 /// Explicitly migrating a particular shard is a low level operation
diff --git a/storage_controller/migrations/2024-09-05-104500_tenant_shard_preferred_az/down.sql b/storage_controller/migrations/2024-09-05-104500_tenant_shard_preferred_az/down.sql
new file mode 100644
index 0000000000..127972a2e4
--- /dev/null
+++ b/storage_controller/migrations/2024-09-05-104500_tenant_shard_preferred_az/down.sql
@@ -0,0 +1 @@
+ALTER TABLE tenant_shards DROP preferred_az_id;
diff --git a/storage_controller/migrations/2024-09-05-104500_tenant_shard_preferred_az/up.sql b/storage_controller/migrations/2024-09-05-104500_tenant_shard_preferred_az/up.sql
new file mode 100644
index 0000000000..641a54feb2
--- /dev/null
+++ b/storage_controller/migrations/2024-09-05-104500_tenant_shard_preferred_az/up.sql
@@ -0,0 +1 @@
+ALTER TABLE tenant_shards ADD preferred_az_id VARCHAR;
diff --git a/storage_controller/src/http.rs b/storage_controller/src/http.rs
index 32882c201a..5d4d0460be 100644
--- a/storage_controller/src/http.rs
+++ b/storage_controller/src/http.rs
@@ -14,7 +14,7 @@ use metrics::{BuildInfo, NeonMetrics};
 use pageserver_api::controller_api::{
     MetadataHealthListOutdatedRequest, MetadataHealthListOutdatedResponse,
     MetadataHealthListUnhealthyResponse, MetadataHealthUpdateRequest, MetadataHealthUpdateResponse,
-    TenantCreateRequest,
+    ShardsPreferredAzsRequest, TenantCreateRequest,
 };
 use pageserver_api::models::{
     TenantConfigRequest, TenantLocationConfigRequest, TenantShardSplitRequest,
@@ -688,6 +688,18 @@ async fn handle_tenant_update_policy(mut req: Request<Body>) -> Result<Response<
     )
 }
 
+async fn handle_update_preferred_azs(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
+    check_permissions(&req, Scope::Admin)?;
+
+    let azs_req = json_request::<ShardsPreferredAzsRequest>(&mut req).await?;
+    let state = get_state(&req);
+
+    json_response(
+        StatusCode::OK,
+        state.service.update_shards_preferred_azs(azs_req).await?,
+    )
+}
+
 async fn handle_step_down(req: Request<Body>) -> Result<Response<Body>, ApiError> {
     check_permissions(&req, Scope::Admin)?;
 
@@ -1174,6 +1186,13 @@ pub fn make_router(
                 RequestName("control_v1_tenant_policy"),
             )
         })
+        .put("/control/v1/preferred_azs", |r| {
+            named_request_span(
+                r,
+                handle_update_preferred_azs,
+                RequestName("control_v1_preferred_azs"),
+            )
+        })
         .put("/control/v1/step_down", |r| {
             named_request_span(r, handle_step_down, RequestName("control_v1_step_down"))
         })
diff --git a/storage_controller/src/persistence.rs b/storage_controller/src/persistence.rs
index 6df05ebd13..1dc1040d96 100644
--- a/storage_controller/src/persistence.rs
+++ b/storage_controller/src/persistence.rs
@@ -105,6 +105,7 @@ pub(crate) enum DatabaseOperation {
     ListMetadataHealthOutdated,
     GetLeader,
     UpdateLeader,
+    SetPreferredAzs,
 }
 
 #[must_use]
@@ -664,6 +665,33 @@ impl Persistence {
         Ok(())
     }
 
+    pub(crate) async fn set_tenant_shard_preferred_azs(
+        &self,
+        preferred_azs: Vec<(TenantShardId, String)>,
+    ) -> DatabaseResult<Vec<(TenantShardId, String)>> {
+        use crate::schema::tenant_shards::dsl::*;
+
+        self.with_measured_conn(DatabaseOperation::SetPreferredAzs, move |conn| {
+            let mut shards_updated = Vec::default();
+
+            for (tenant_shard_id, preferred_az) in preferred_azs.iter() {
+                let updated = diesel::update(tenant_shards)
+                    .filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string()))
+                    .filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32))
+                    .filter(shard_count.eq(tenant_shard_id.shard_count.literal() as i32))
+                    .set(preferred_az_id.eq(preferred_az))
+                    .execute(conn)?;
+
+                if updated == 1 {
+                    shards_updated.push((*tenant_shard_id, preferred_az.clone()));
+                }
+            }
+
+            Ok(shards_updated)
+        })
+        .await
+    }
+
     pub(crate) async fn detach(&self, tenant_shard_id: TenantShardId) -> anyhow::Result<()> {
         use crate::schema::tenant_shards::dsl::*;
         self.with_measured_conn(DatabaseOperation::Detach, move |conn| {
@@ -1050,6 +1078,11 @@ pub(crate) struct TenantShardPersistence {
     pub(crate) config: String,
     #[serde(default)]
     pub(crate) scheduling_policy: String,
+
+    // Hint that we should attempt to schedule this tenant shard in the given
+    // availability zone in order to minimise the chances of cross-AZ communication
+    // with compute.
+    pub(crate) preferred_az_id: Option<String>,
 }
 
 impl TenantShardPersistence {
diff --git a/storage_controller/src/schema.rs b/storage_controller/src/schema.rs
index 93ab774b5f..1717a9369d 100644
--- a/storage_controller/src/schema.rs
+++ b/storage_controller/src/schema.rs
@@ -41,6 +41,7 @@ diesel::table! {
         splitting -> Int2,
         config -> Text,
         scheduling_policy -> Varchar,
+        preferred_az_id -> Nullable<Varchar>,
     }
 }
 
diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs
index 2911cd5ac4..324f864291 100644
--- a/storage_controller/src/service.rs
+++ b/storage_controller/src/service.rs
@@ -25,7 +25,7 @@ use crate::{
         ShardGenerationState, TenantFilter,
     },
     reconciler::{ReconcileError, ReconcileUnits, ReconcilerConfig, ReconcilerConfigBuilder},
-    scheduler::{MaySchedule, ScheduleContext, ScheduleMode},
+    scheduler::{MaySchedule, ScheduleContext, ScheduleError, ScheduleMode},
     tenant_shard::{
         MigrateAttachment, ReconcileNeeded, ReconcilerStatus, ScheduleOptimization,
         ScheduleOptimizationAction,
@@ -41,10 +41,11 @@ use itertools::Itertools;
 use pageserver_api::{
     controller_api::{
         MetadataHealthRecord, MetadataHealthUpdateRequest, NodeAvailability, NodeRegisterRequest,
-        NodeSchedulingPolicy, PlacementPolicy, ShardSchedulingPolicy, TenantCreateRequest,
-        TenantCreateResponse, TenantCreateResponseShard, TenantDescribeResponse,
-        TenantDescribeResponseShard, TenantLocateResponse, TenantPolicyRequest,
-        TenantShardMigrateRequest, TenantShardMigrateResponse,
+        NodeSchedulingPolicy, PlacementPolicy, ShardSchedulingPolicy, ShardsPreferredAzsRequest,
+        ShardsPreferredAzsResponse, TenantCreateRequest, TenantCreateResponse,
+        TenantCreateResponseShard, TenantDescribeResponse, TenantDescribeResponseShard,
+        TenantLocateResponse, TenantPolicyRequest, TenantShardMigrateRequest,
+        TenantShardMigrateResponse,
     },
     models::{
         SecondaryProgress, TenantConfigRequest, TimelineArchivalConfigRequest,
@@ -353,6 +354,12 @@ impl From<DatabaseError> for ApiError {
     }
 }
 
+enum InitialShardScheduleOutcome {
+    Scheduled(TenantCreateResponseShard),
+    NotScheduled,
+    ShardScheduleError(ScheduleError),
+}
+
 pub struct Service {
     inner: Arc<std::sync::RwLock<ServiceState>>,
     config: Config,
@@ -1452,6 +1459,7 @@ impl Service {
                 splitting: SplitState::default(),
                 scheduling_policy: serde_json::to_string(&ShardSchedulingPolicy::default())
                     .unwrap(),
+                preferred_az_id: None,
             };
 
             match self.persistence.insert_tenant_shards(vec![tsp]).await {
@@ -2023,6 +2031,7 @@ impl Service {
                 splitting: SplitState::default(),
                 scheduling_policy: serde_json::to_string(&ShardSchedulingPolicy::default())
                     .unwrap(),
+                preferred_az_id: None,
             })
             .collect();
 
@@ -2046,99 +2055,87 @@ impl Service {
         };
 
         let mut schedule_context = ScheduleContext::default();
+        let mut schedule_error = None;
+        let mut response_shards = Vec::new();
+        for tenant_shard_id in create_ids {
+            tracing::info!("Creating shard {tenant_shard_id}...");
 
-        let (waiters, response_shards) = {
+            let outcome = self
+                .do_initial_shard_scheduling(
+                    tenant_shard_id,
+                    initial_generation,
+                    &create_req.shard_parameters,
+                    create_req.config.clone(),
+                    placement_policy.clone(),
+                    &mut schedule_context,
+                )
+                .await;
+
+            match outcome {
+                InitialShardScheduleOutcome::Scheduled(resp) => response_shards.push(resp),
+                InitialShardScheduleOutcome::NotScheduled => {}
+                InitialShardScheduleOutcome::ShardScheduleError(err) => {
+                    schedule_error = Some(err);
+                }
+            }
+        }
+
+        let preferred_azs = {
+            let locked = self.inner.read().unwrap();
+            response_shards
+                .iter()
+                .filter_map(|resp| {
+                    let az_id = locked
+                        .nodes
+                        .get(&resp.node_id)
+                        .map(|n| n.get_availability_zone_id().to_string())?;
+
+                    Some((resp.shard_id, az_id))
+                })
+                .collect::<Vec<_>>()
+        };
+
+        // Note that we persist the preferred AZ for the new shards separately.
+        // In theory, we could "peek" the scheduler to determine where the shard will
+        // land, but the subsequent "real" call into the scheduler might select a different
+        // node. Hence, we do this awkward update to keep things consistent.
+        let updated = self
+            .persistence
+            .set_tenant_shard_preferred_azs(preferred_azs)
+            .await
+            .map_err(|err| {
+                ApiError::InternalServerError(anyhow::anyhow!(
+                    "Failed to persist preferred az ids: {err}"
+                ))
+            })?;
+
+        {
             let mut locked = self.inner.write().unwrap();
-            let (nodes, tenants, scheduler) = locked.parts_mut();
-
-            let mut response_shards = Vec::new();
-            let mut schcedule_error = None;
-
-            for tenant_shard_id in create_ids {
-                tracing::info!("Creating shard {tenant_shard_id}...");
-
-                use std::collections::btree_map::Entry;
-                match tenants.entry(tenant_shard_id) {
-                    Entry::Occupied(mut entry) => {
-                        tracing::info!(
-                            "Tenant shard {tenant_shard_id} already exists while creating"
-                        );
-
-                        // TODO: schedule() should take an anti-affinity expression that pushes
-                        // attached and secondary locations (independently) away frorm those
-                        // pageservers also holding a shard for this tenant.
-
-                        entry
-                            .get_mut()
-                            .schedule(scheduler, &mut schedule_context)
-                            .map_err(|e| {
-                                ApiError::Conflict(format!(
-                                    "Failed to schedule shard {tenant_shard_id}: {e}"
-                                ))
-                            })?;
-
-                        if let Some(node_id) = entry.get().intent.get_attached() {
-                            let generation = entry
-                                .get()
-                                .generation
-                                .expect("Generation is set when in attached mode");
-                            response_shards.push(TenantCreateResponseShard {
-                                shard_id: tenant_shard_id,
-                                node_id: *node_id,
-                                generation: generation.into().unwrap(),
-                            });
-                        }
-
-                        continue;
-                    }
-                    Entry::Vacant(entry) => {
-                        let state = entry.insert(TenantShard::new(
-                            tenant_shard_id,
-                            ShardIdentity::from_params(
-                                tenant_shard_id.shard_number,
-                                &create_req.shard_parameters,
-                            ),
-                            placement_policy.clone(),
-                        ));
-
-                        state.generation = initial_generation;
-                        state.config = create_req.config.clone();
-                        if let Err(e) = state.schedule(scheduler, &mut schedule_context) {
-                            schcedule_error = Some(e);
-                        }
-
-                        // Only include shards in result if we are attaching: the purpose
-                        // of the response is to tell the caller where the shards are attached.
-                        if let Some(node_id) = state.intent.get_attached() {
-                            let generation = state
-                                .generation
-                                .expect("Generation is set when in attached mode");
-                            response_shards.push(TenantCreateResponseShard {
-                                shard_id: tenant_shard_id,
-                                node_id: *node_id,
-                                generation: generation.into().unwrap(),
-                            });
-                        }
-                    }
-                };
+            for (tid, az_id) in updated {
+                if let Some(shard) = locked.tenants.get_mut(&tid) {
+                    shard.set_preferred_az(az_id);
+                }
             }
+        }
 
-            // If we failed to schedule shards, then they are still created in the controller,
-            // but we return an error to the requester to avoid a silent failure when someone
-            // tries to e.g. create a tenant whose placement policy requires more nodes than
-            // are present in the system.  We do this here rather than in the above loop, to
-            // avoid situations where we only create a subset of shards in the tenant.
-            if let Some(e) = schcedule_error {
-                return Err(ApiError::Conflict(format!(
-                    "Failed to schedule shard(s): {e}"
-                )));
-            }
+        // If we failed to schedule shards, then they are still created in the controller,
+        // but we return an error to the requester to avoid a silent failure when someone
+        // tries to e.g. create a tenant whose placement policy requires more nodes than
+        // are present in the system.  We do this here rather than in the above loop, to
+        // avoid situations where we only create a subset of shards in the tenant.
+        if let Some(e) = schedule_error {
+            return Err(ApiError::Conflict(format!(
+                "Failed to schedule shard(s): {e}"
+            )));
+        }
 
-            let waiters = tenants
+        let waiters = {
+            let mut locked = self.inner.write().unwrap();
+            let (nodes, tenants, _scheduler) = locked.parts_mut();
+            tenants
                 .range_mut(TenantShardId::tenant_range(tenant_id))
                 .filter_map(|(_shard_id, shard)| self.maybe_reconcile_shard(shard, nodes))
-                .collect::<Vec<_>>();
-            (waiters, response_shards)
+                .collect::<Vec<_>>()
         };
 
         Ok((
@@ -2149,6 +2146,78 @@ impl Service {
         ))
     }
 
+    /// Helper for tenant creation that does the scheduling for an individual shard. Covers both the
+    /// case of a new tenant and a pre-existing one.
+    async fn do_initial_shard_scheduling(
+        &self,
+        tenant_shard_id: TenantShardId,
+        initial_generation: Option<Generation>,
+        shard_params: &ShardParameters,
+        config: TenantConfig,
+        placement_policy: PlacementPolicy,
+        schedule_context: &mut ScheduleContext,
+    ) -> InitialShardScheduleOutcome {
+        let mut locked = self.inner.write().unwrap();
+        let (_nodes, tenants, scheduler) = locked.parts_mut();
+
+        use std::collections::btree_map::Entry;
+        match tenants.entry(tenant_shard_id) {
+            Entry::Occupied(mut entry) => {
+                tracing::info!("Tenant shard {tenant_shard_id} already exists while creating");
+
+                // TODO: schedule() should take an anti-affinity expression that pushes
+                // attached and secondary locations (independently) away from those
+                // pageservers also holding a shard for this tenant.
+
+                if let Err(err) = entry.get_mut().schedule(scheduler, schedule_context) {
+                    return InitialShardScheduleOutcome::ShardScheduleError(err);
+                }
+
+                if let Some(node_id) = entry.get().intent.get_attached() {
+                    let generation = entry
+                        .get()
+                        .generation
+                        .expect("Generation is set when in attached mode");
+                    InitialShardScheduleOutcome::Scheduled(TenantCreateResponseShard {
+                        shard_id: tenant_shard_id,
+                        node_id: *node_id,
+                        generation: generation.into().unwrap(),
+                    })
+                } else {
+                    InitialShardScheduleOutcome::NotScheduled
+                }
+            }
+            Entry::Vacant(entry) => {
+                let state = entry.insert(TenantShard::new(
+                    tenant_shard_id,
+                    ShardIdentity::from_params(tenant_shard_id.shard_number, shard_params),
+                    placement_policy,
+                ));
+
+                state.generation = initial_generation;
+                state.config = config;
+                if let Err(e) = state.schedule(scheduler, schedule_context) {
+                    return InitialShardScheduleOutcome::ShardScheduleError(e);
+                }
+
+                // Only include shards in result if we are attaching: the purpose
+                // of the response is to tell the caller where the shards are attached.
+                if let Some(node_id) = state.intent.get_attached() {
+                    let generation = state
+                        .generation
+                        .expect("Generation is set when in attached mode");
+                    InitialShardScheduleOutcome::Scheduled(TenantCreateResponseShard {
+                        shard_id: tenant_shard_id,
+                        node_id: *node_id,
+                        generation: generation.into().unwrap(),
+                    })
+                } else {
+                    InitialShardScheduleOutcome::NotScheduled
+                }
+            }
+        }
+    }
+
     /// Helper for functions that reconcile a number of shards, and would like to do a timeout-bounded
     /// wait for reconciliation to complete before responding.
     async fn await_waiters(
@@ -3511,6 +3580,7 @@ impl Service {
                 is_pending_compute_notification: shard.pending_compute_notification,
                 is_splitting: matches!(shard.splitting, SplitState::Splitting),
                 scheduling_policy: *shard.get_scheduling_policy(),
+                preferred_az_id: shard.preferred_az().map(ToString::to_string),
             })
         }
 
@@ -4214,9 +4284,10 @@ impl Service {
                     config: serde_json::to_string(&config).unwrap(),
                     splitting: SplitState::Splitting,
 
-                    // Scheduling policies do not carry through to children
+                    // Scheduling policies and preferred AZ do not carry through to children
                     scheduling_policy: serde_json::to_string(&ShardSchedulingPolicy::default())
                         .unwrap(),
+                    preferred_az_id: None,
                 });
             }
 
@@ -4336,6 +4407,47 @@ impl Service {
         let (response, child_locations, waiters) =
             self.tenant_shard_split_commit_inmem(tenant_id, new_shard_count, new_stripe_size);
 
+        // Now that we have scheduled the child shards, attempt to set their preferred AZ
+        // to that of the pageserver they've been attached on.
+        let preferred_azs = {
+            let locked = self.inner.read().unwrap();
+            child_locations
+                .iter()
+                .filter_map(|(tid, node_id, _stripe_size)| {
+                    let az_id = locked
+                        .nodes
+                        .get(node_id)
+                        .map(|n| n.get_availability_zone_id().to_string())?;
+
+                    Some((*tid, az_id))
+                })
+                .collect::<Vec<_>>()
+        };
+
+        let updated = self
+            .persistence
+            .set_tenant_shard_preferred_azs(preferred_azs)
+            .await
+            .map_err(|err| {
+                ApiError::InternalServerError(anyhow::anyhow!(
+                    "Failed to persist preferred az ids: {err}"
+                ))
+            });
+
+        match updated {
+            Ok(updated) => {
+                let mut locked = self.inner.write().unwrap();
+                for (tid, az_id) in updated {
+                    if let Some(shard) = locked.tenants.get_mut(&tid) {
+                        shard.set_preferred_az(az_id);
+                    }
+                }
+            }
+            Err(err) => {
+                tracing::warn!("Failed to persist preferred AZs after split: {err}");
+            }
+        }
+
         // Send compute notifications for all the new shards
         let mut failed_notifications = Vec::new();
         for (child_id, child_ps, stripe_size) in child_locations {
@@ -6497,4 +6609,35 @@ impl Service {
     ) -> Result<(), DatabaseError> {
         self.persistence.safekeeper_upsert(record).await
     }
+
+    pub(crate) async fn update_shards_preferred_azs(
+        &self,
+        req: ShardsPreferredAzsRequest,
+    ) -> Result<ShardsPreferredAzsResponse, ApiError> {
+        let preferred_azs = req.preferred_az_ids.into_iter().collect::<Vec<_>>();
+        let updated = self
+            .persistence
+            .set_tenant_shard_preferred_azs(preferred_azs)
+            .await
+            .map_err(|err| {
+                ApiError::InternalServerError(anyhow::anyhow!(
+                    "Failed to persist preferred AZs: {err}"
+                ))
+            })?;
+
+        let mut updated_in_mem_and_db = Vec::default();
+
+        let mut locked = self.inner.write().unwrap();
+        for (tid, az_id) in updated {
+            let shard = locked.tenants.get_mut(&tid);
+            if let Some(shard) = shard {
+                shard.set_preferred_az(az_id);
+                updated_in_mem_and_db.push(tid);
+            }
+        }
+
+        Ok(ShardsPreferredAzsResponse {
+            updated: updated_in_mem_and_db,
+        })
+    }
 }
diff --git a/storage_controller/src/tenant_shard.rs b/storage_controller/src/tenant_shard.rs
index 30723a3b36..cdb0633e2b 100644
--- a/storage_controller/src/tenant_shard.rs
+++ b/storage_controller/src/tenant_shard.rs
@@ -140,6 +140,10 @@ pub(crate) struct TenantShard {
     // Support/debug tool: if something is going wrong or flapping with scheduling, this may
     // be set to a non-active state to avoid making changes while the issue is fixed.
     scheduling_policy: ShardSchedulingPolicy,
+
+    // We should attempt to schedule this shard in the provided AZ to
+    // decrease the chances of cross-AZ communication with compute.
+    preferred_az_id: Option<String>,
 }
 
 #[derive(Default, Clone, Debug, Serialize)]
@@ -463,6 +467,7 @@ impl TenantShard {
             last_error: Arc::default(),
             pending_compute_notification: false,
             scheduling_policy: ShardSchedulingPolicy::default(),
+            preferred_az_id: None,
         }
     }
 
@@ -1297,6 +1302,7 @@ impl TenantShard {
             pending_compute_notification: false,
             delayed_reconcile: false,
             scheduling_policy: serde_json::from_str(&tsp.scheduling_policy).unwrap(),
+            preferred_az_id: tsp.preferred_az_id,
         })
     }
 
@@ -1312,8 +1318,17 @@ impl TenantShard {
             config: serde_json::to_string(&self.config).unwrap(),
             splitting: SplitState::default(),
             scheduling_policy: serde_json::to_string(&self.scheduling_policy).unwrap(),
+            preferred_az_id: self.preferred_az_id.clone(),
         }
     }
+
+    pub(crate) fn preferred_az(&self) -> Option<&str> {
+        self.preferred_az_id.as_deref()
+    }
+
+    pub(crate) fn set_preferred_az(&mut self, preferred_az_id: String) {
+        self.preferred_az_id = Some(preferred_az_id);
+    }
 }
 
 #[cfg(test)]
diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index 0c692ceb69..18fbbde637 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -2560,7 +2560,7 @@ class NeonStorageController(MetricsGetter, LogUtils):
 
     def tenant_describe(self, tenant_id: TenantId):
         """
-        :return: list of {"shard_id": "", "node_id": int, "listen_pg_addr": str, "listen_pg_port": int, "listen_http_addr: str, "listen_http_port: int}
+        :return: list of {"shard_id": "", "node_id": int, "listen_pg_addr": str, "listen_pg_port": int, "listen_http_addr": str, "listen_http_port": int, "preferred_az_id": str}
         """
         response = self.request(
             "GET",
@@ -2886,6 +2886,17 @@ class NeonStorageController(MetricsGetter, LogUtils):
                 return None
             raise e
 
+    def set_preferred_azs(self, preferred_azs: dict[TenantShardId, str]) -> list[TenantShardId]:
+        response = self.request(
+            "PUT",
+            f"{self.api}/control/v1/preferred_azs",
+            headers=self.headers(TokenScope.ADMIN),
+            json={str(tid): az for tid, az in preferred_azs.items()},
+        )
+
+        response.raise_for_status()
+        return [TenantShardId.parse(tid) for tid in response.json()["updated"]]
+
     def __enter__(self) -> "NeonStorageController":
         return self
 
diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py
index 8da42294b0..92cd74eba5 100644
--- a/test_runner/regress/test_storage_controller.py
+++ b/test_runner/regress/test_storage_controller.py
@@ -2512,3 +2512,55 @@ def eq_safekeeper_records(a: dict[str, Any], b: dict[str, Any]) -> bool:
                 del d[key]
 
     return compared[0] == compared[1]
+
+
+@run_only_on_default_postgres("this is like a 'unit test' against storcon db")
+def test_shard_preferred_azs(neon_env_builder: NeonEnvBuilder):
+    def assign_az(ps_cfg):
+        az = f"az-{ps_cfg['id']}"
+        ps_cfg["availability_zone"] = az
+
+    neon_env_builder.pageserver_config_override = assign_az
+    neon_env_builder.num_pageservers = 2
+    env = neon_env_builder.init_configs()
+    env.start()
+
+    tids = [TenantId.generate() for _ in range(0, 3)]
+    for tid in tids:
+        env.storage_controller.tenant_create(tid)
+
+        shards = env.storage_controller.tenant_describe(tid)["shards"]
+        assert len(shards) == 1
+        attached_to = shards[0]["node_attached"]
+        expected_az = env.get_pageserver(attached_to).az_id
+
+        assert shards[0]["preferred_az_id"] == expected_az
+
+    updated = env.storage_controller.set_preferred_azs(
+        {TenantShardId(tid, 0, 0): "foo" for tid in tids}
+    )
+
+    assert set(updated) == set([TenantShardId(tid, 0, 0) for tid in tids])
+
+    for tid in tids:
+        shards = env.storage_controller.tenant_describe(tid)["shards"]
+        assert len(shards) == 1
+        assert shards[0]["preferred_az_id"] == "foo"
+
+    # Generate a layer to avoid shard split handling on ps from tripping
+    # up on debug assert.
+    timeline_id = TimelineId.generate()
+    env.neon_cli.create_timeline("bar", tids[0], timeline_id)
+
+    workload = Workload(env, tids[0], timeline_id, branch_name="bar")
+    workload.init()
+    workload.write_rows(256)
+    workload.validate()
+
+    env.storage_controller.tenant_shard_split(tids[0], shard_count=2)
+    shards = env.storage_controller.tenant_describe(tids[0])["shards"]
+    assert len(shards) == 2
+    for shard in shards:
+        attached_to = shard["node_attached"]
+        expected_az = env.get_pageserver(attached_to).az_id
+        assert shard["preferred_az_id"] == expected_az

From cbcd4058edb7a2c2bb3bfe1a6fc1ffb0d820b870 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Arpad=20M=C3=BCller?= <arpad-m@users.noreply.github.com>
Date: Fri, 6 Sep 2024 14:33:52 +0200
Subject: [PATCH 024/142] Fix 1.82 clippy lint too_long_first_doc_paragraph
 (#8941)

Addresses the 1.82 beta clippy lint `too_long_first_doc_paragraph` by
adding newlines to the first sentence if it is short enough, and making
a short first sentence if there is the need.
---
 compute_tools/src/pg_helpers.rs                    |  7 ++++---
 libs/metrics/src/lib.rs                            |  1 +
 libs/pageserver_api/src/controller_api.rs          |  2 ++
 libs/pageserver_api/src/models.rs                  | 10 +++++++---
 libs/postgres_backend/src/lib.rs                   |  6 ++++--
 libs/postgres_connection/src/lib.rs                |  1 +
 libs/remote_storage/src/lib.rs                     |  6 +++++-
 libs/tenant_size_model/src/lib.rs                  |  7 ++++---
 libs/utils/src/circuit_breaker.rs                  |  6 ++++--
 libs/utils/src/id.rs                               |  6 ++++--
 libs/utils/src/lock_file.rs                        |  4 +++-
 libs/utils/src/pageserver_feedback.rs              |  1 +
 libs/utils/src/poison.rs                           |  2 ++
 libs/utils/src/shard.rs                            |  9 +++++----
 libs/utils/src/simple_rcu.rs                       |  7 +++----
 libs/utils/src/sync/heavier_once_cell.rs           |  4 +++-
 libs/utils/src/vec_map.rs                          |  1 +
 libs/utils/src/yielding_loop.rs                    |  7 ++++---
 pageserver/src/config.rs                           |  2 ++
 pageserver/src/context.rs                          | 10 ++++++----
 pageserver/src/pgdatadir_mapping.rs                |  8 +++++---
 pageserver/src/tenant.rs                           |  9 +++++----
 pageserver/src/tenant/metadata.rs                  |  9 +++++----
 pageserver/src/tenant/mgr.rs                       | 12 +++++++-----
 pageserver/src/tenant/remote_timeline_client.rs    |  2 ++
 .../src/tenant/remote_timeline_client/index.rs     |  1 +
 pageserver/src/tenant/storage_layer.rs             |  9 +++++----
 pageserver/src/tenant/storage_layer/delta_layer.rs |  9 +++++----
 pageserver/src/tenant/storage_layer/image_layer.rs |  8 +++++---
 pageserver/src/tenant/storage_layer/layer_desc.rs  |  6 ++++--
 pageserver/src/tenant/storage_layer/layer_name.rs  |  5 +++--
 .../src/tenant/storage_layer/merge_iterator.rs     |  8 +++++---
 .../src/tenant/storage_layer/split_writer.rs       | 14 ++++++++------
 pageserver/src/tenant/vectored_blob_io.rs          |  6 ++++--
 pageserver/src/virtual_file.rs                     |  5 +++--
 pageserver/src/walredo.rs                          | 11 +++++------
 proxy/src/stream.rs                                |  1 +
 safekeeper/src/pull_timeline.rs                    |  1 +
 safekeeper/src/receive_wal.rs                      |  6 ++++--
 safekeeper/src/state.rs                            |  8 +++++---
 safekeeper/src/timeline.rs                         |  1 +
 safekeeper/src/timeline_eviction.rs                |  8 +++++---
 safekeeper/src/timeline_guard.rs                   |  4 +++-
 safekeeper/src/timeline_manager.rs                 |  1 +
 safekeeper/src/timelines_set.rs                    |  3 ++-
 safekeeper/src/wal_backup_partial.rs               |  6 ++++--
 safekeeper/src/wal_service.rs                      |  1 +
 storage_controller/src/service.rs                  |  4 +++-
 storage_scrubber/src/garbage.rs                    |  7 ++++---
 storage_scrubber/src/metadata_stream.rs            |  4 +++-
 storage_scrubber/src/pageserver_physical_gc.rs     |  7 ++++---
 51 files changed, 180 insertions(+), 103 deletions(-)

diff --git a/compute_tools/src/pg_helpers.rs b/compute_tools/src/pg_helpers.rs
index 863fa9468f..b2dc265864 100644
--- a/compute_tools/src/pg_helpers.rs
+++ b/compute_tools/src/pg_helpers.rs
@@ -22,9 +22,10 @@ use compute_api::spec::{Database, GenericOption, GenericOptions, PgIdent, Role};
 
 const POSTGRES_WAIT_TIMEOUT: Duration = Duration::from_millis(60 * 1000); // milliseconds
 
-/// Escape a string for including it in a SQL literal. Wrapping the result
-/// with `E'{}'` or `'{}'` is not required, as it returns a ready-to-use
-/// SQL string literal, e.g. `'db'''` or `E'db\\'`.
+/// Escape a string for including it in a SQL literal.
+///
+/// Wrapping the result with `E'{}'` or `'{}'` is not required,
+/// as it returns a ready-to-use SQL string literal, e.g. `'db'''` or `E'db\\'`.
 /// See <https://github.com/postgres/postgres/blob/da98d005cdbcd45af563d0c4ac86d0e9772cd15f/src/backend/utils/adt/quote.c#L47>
 /// for the original implementation.
 pub fn escape_literal(s: &str) -> String {
diff --git a/libs/metrics/src/lib.rs b/libs/metrics/src/lib.rs
index df000cd0fb..cd4526c089 100644
--- a/libs/metrics/src/lib.rs
+++ b/libs/metrics/src/lib.rs
@@ -68,6 +68,7 @@ macro_rules! register_uint_gauge {
 static INTERNAL_REGISTRY: Lazy<Registry> = Lazy::new(Registry::new);
 
 /// Register a collector in the internal registry. MUST be called before the first call to `gather()`.
+///
 /// Otherwise, we can have a deadlock in the `gather()` call, trying to register a new collector
 /// while holding the lock.
 pub fn register_internal(c: Box<dyn Collector>) -> prometheus::Result<()> {
diff --git a/libs/pageserver_api/src/controller_api.rs b/libs/pageserver_api/src/controller_api.rs
index 94104af002..5c8dcbf571 100644
--- a/libs/pageserver_api/src/controller_api.rs
+++ b/libs/pageserver_api/src/controller_api.rs
@@ -147,6 +147,8 @@ pub struct TenantDescribeResponseShard {
     pub preferred_az_id: Option<String>,
 }
 
+/// Migration request for a given tenant shard to a given node.
+///
 /// Explicitly migrating a particular shard is a low level operation
 /// TODO: higher level "Reschedule tenant" operation where the request
 /// specifies some constraints, e.g. asking it to get off particular node(s)
diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs
index d13d04eb1b..ffe79c8350 100644
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -305,8 +305,10 @@ pub struct TenantConfig {
     pub lsn_lease_length_for_ts: Option<String>,
 }
 
-/// The policy for the aux file storage. It can be switched through `switch_aux_file_policy`
-/// tenant config. When the first aux file written, the policy will be persisted in the
+/// The policy for the aux file storage.
+///
+/// It can be switched through `switch_aux_file_policy` tenant config.
+/// When the first aux file is written, the policy will be persisted in the
 /// `index_part.json` file and has a limited migration path.
 ///
 /// Currently, we only allow the following migration path:
@@ -896,7 +898,9 @@ pub struct WalRedoManagerStatus {
     pub process: Option<WalRedoManagerProcessStatus>,
 }
 
-/// The progress of a secondary tenant is mostly useful when doing a long running download: e.g. initiating
+/// The progress of a secondary tenant.
+///
+/// It is mostly useful when doing a long running download: e.g. initiating
 /// a download job, timing out while waiting for it to run, and then inspecting this status to understand
 /// what's happening.
 #[derive(Default, Debug, Serialize, Deserialize, Clone)]
diff --git a/libs/postgres_backend/src/lib.rs b/libs/postgres_backend/src/lib.rs
index 7c7c6535b3..600f1d728c 100644
--- a/libs/postgres_backend/src/lib.rs
+++ b/libs/postgres_backend/src/lib.rs
@@ -69,8 +69,10 @@ impl QueryError {
 }
 
 /// Returns true if the given error is a normal consequence of a network issue,
-/// or the client closing the connection. These errors can happen during normal
-/// operations, and don't indicate a bug in our code.
+/// or the client closing the connection.
+///
+/// These errors can happen during normal operations,
+/// and don't indicate a bug in our code.
 pub fn is_expected_io_error(e: &io::Error) -> bool {
     use io::ErrorKind::*;
     matches!(
diff --git a/libs/postgres_connection/src/lib.rs b/libs/postgres_connection/src/lib.rs
index 9f57f3d507..ddf9f7b610 100644
--- a/libs/postgres_connection/src/lib.rs
+++ b/libs/postgres_connection/src/lib.rs
@@ -7,6 +7,7 @@ use std::fmt;
 use url::Host;
 
 /// Parses a string of format either `host:port` or `host` into a corresponding pair.
+///
 /// The `host` part should be a correct `url::Host`, while `port` (if present) should be
 /// a valid decimal u16 of digits only.
 pub fn parse_host_port<S: AsRef<str>>(host_port: S) -> Result<(Host, Option<u16>), anyhow::Error> {
diff --git a/libs/remote_storage/src/lib.rs b/libs/remote_storage/src/lib.rs
index cc1d3e0ae4..b5b69c9faf 100644
--- a/libs/remote_storage/src/lib.rs
+++ b/libs/remote_storage/src/lib.rs
@@ -45,6 +45,8 @@ pub use azure_core::Etag;
 
 pub use error::{DownloadError, TimeTravelError, TimeoutOrCancel};
 
+/// Default concurrency limit for S3 operations
+///
 /// Currently, sync happens with AWS S3, that has two limits on requests per second:
 /// ~200 RPS for IAM services
 /// <https://docs.aws.amazon.com/AmazonRDS/latest/AuroraUserGuide/UsingWithRDS.IAMDBAuth.html>
@@ -300,7 +302,9 @@ pub trait RemoteStorage: Send + Sync + 'static {
     ) -> Result<(), TimeTravelError>;
 }
 
-/// DownloadStream is sensitive to the timeout and cancellation used with the original
+/// Data part of an ongoing [`Download`].
+///
+/// `DownloadStream` is sensitive to the timeout and cancellation used with the original
 /// [`RemoteStorage::download`] request. The type yields `std::io::Result<Bytes>` to be compatible
 /// with `tokio::io::copy_buf`.
 // This has 'static because safekeepers do not use cancellation tokens (yet)
diff --git a/libs/tenant_size_model/src/lib.rs b/libs/tenant_size_model/src/lib.rs
index a3e12cf0e3..974a498404 100644
--- a/libs/tenant_size_model/src/lib.rs
+++ b/libs/tenant_size_model/src/lib.rs
@@ -5,9 +5,10 @@
 mod calculation;
 pub mod svg;
 
-/// StorageModel is the input to the synthetic size calculation. It represents
-/// a tree of timelines, with just the information that's needed for the
-/// calculation. This doesn't track timeline names or where each timeline
+/// StorageModel is the input to the synthetic size calculation.
+///
+/// It represents a tree of timelines, with just the information that's needed
+/// for the calculation. This doesn't track timeline names or where each timeline
 /// begins and ends, for example. Instead, it consists of "points of interest"
 /// on the timelines. A point of interest could be the timeline start or end point,
 /// the oldest point on a timeline that needs to be retained because of PITR
diff --git a/libs/utils/src/circuit_breaker.rs b/libs/utils/src/circuit_breaker.rs
index 720ea39d4f..e1ddfd8650 100644
--- a/libs/utils/src/circuit_breaker.rs
+++ b/libs/utils/src/circuit_breaker.rs
@@ -5,8 +5,10 @@ use std::{
 
 use metrics::IntCounter;
 
-/// Circuit breakers are for operations that are expensive and fallible: if they fail repeatedly,
-/// we will stop attempting them for some period of time, to avoid denial-of-service from retries, and
+/// Circuit breakers are for operations that are expensive and fallible.
+///
+/// If such an operation fails repeatedly, we will stop attempting it for some
+/// period of time, to avoid denial-of-service from retries, and
 /// to mitigate the log spam from repeated failures.
 pub struct CircuitBreaker {
     /// An identifier that enables us to log useful errors when a circuit is broken
diff --git a/libs/utils/src/id.rs b/libs/utils/src/id.rs
index db468e3054..2cda899b15 100644
--- a/libs/utils/src/id.rs
+++ b/libs/utils/src/id.rs
@@ -249,8 +249,10 @@ macro_rules! id_newtype {
     };
 }
 
-/// Neon timeline IDs are different from PostgreSQL timeline
-/// IDs. They serve a similar purpose though: they differentiate
+/// Neon timeline ID.
+///
+/// Neon timeline IDs are different from PostgreSQL timeline
+/// IDs, but serve a similar purpose: they differentiate
 /// between different "histories" of the same cluster.  However,
 /// PostgreSQL timeline IDs are a bit cumbersome, because they are only
 /// 32-bits wide, and they must be in ascending order in any given
diff --git a/libs/utils/src/lock_file.rs b/libs/utils/src/lock_file.rs
index 59c66ca757..3a2ed3e830 100644
--- a/libs/utils/src/lock_file.rs
+++ b/libs/utils/src/lock_file.rs
@@ -100,7 +100,9 @@ pub enum LockFileRead {
 }
 
 /// Open & try to lock the lock file at the given `path`, returning a [handle][`LockFileRead`] to
-/// inspect its content. It is not an `Err(...)` if the file does not exist or is already locked.
+/// inspect its content.
+///
+/// It is not an `Err(...)` if the file does not exist or is already locked.
 /// Check the [`LockFileRead`] variants for details.
 pub fn read_and_hold_lock_file(path: &Utf8Path) -> anyhow::Result<LockFileRead> {
     let res = fs::OpenOptions::new().read(true).open(path);
diff --git a/libs/utils/src/pageserver_feedback.rs b/libs/utils/src/pageserver_feedback.rs
index 3ddfa44f41..dede65e699 100644
--- a/libs/utils/src/pageserver_feedback.rs
+++ b/libs/utils/src/pageserver_feedback.rs
@@ -8,6 +8,7 @@ use tracing::{trace, warn};
 use crate::lsn::Lsn;
 
 /// Feedback pageserver sends to safekeeper and safekeeper resends to compute.
+///
 /// Serialized in custom flexible key/value format. In replication protocol, it
 /// is marked with NEON_STATUS_UPDATE_TAG_BYTE to differentiate from postgres
 /// Standby status update / Hot standby feedback messages.
diff --git a/libs/utils/src/poison.rs b/libs/utils/src/poison.rs
index 27378c69fc..c3e2fba20c 100644
--- a/libs/utils/src/poison.rs
+++ b/libs/utils/src/poison.rs
@@ -65,6 +65,8 @@ impl<T> Poison<T> {
     }
 }
 
+/// Armed pointer to a [`Poison`].
+///
 /// Use [`Self::data`] and [`Self::data_mut`] to access the wrapped state.
 /// Once modifications are done, use [`Self::disarm`].
 /// If [`Guard`] gets dropped instead of calling [`Self::disarm`], the state is poisoned
diff --git a/libs/utils/src/shard.rs b/libs/utils/src/shard.rs
index f6b430657e..d146010b41 100644
--- a/libs/utils/src/shard.rs
+++ b/libs/utils/src/shard.rs
@@ -13,10 +13,11 @@ pub struct ShardNumber(pub u8);
 #[derive(Ord, PartialOrd, Eq, PartialEq, Clone, Copy, Serialize, Deserialize, Debug, Hash)]
 pub struct ShardCount(pub u8);
 
-/// Combination of ShardNumber and ShardCount.  For use within the context of a particular tenant,
-/// when we need to know which shard we're dealing with, but do not need to know the full
-/// ShardIdentity (because we won't be doing any page->shard mapping), and do not need to know
-/// the fully qualified TenantShardId.
+/// Combination of ShardNumber and ShardCount.
+///
+/// For use within the context of a particular tenant, when we need to know which shard we're
+/// dealing with, but do not need to know the full ShardIdentity (because we won't be doing
+/// any page->shard mapping), and do not need to know the fully qualified TenantShardId.
 #[derive(Eq, PartialEq, PartialOrd, Ord, Clone, Copy, Hash)]
 pub struct ShardIndex {
     pub shard_number: ShardNumber,
diff --git a/libs/utils/src/simple_rcu.rs b/libs/utils/src/simple_rcu.rs
index ecc5353be3..01750b2aef 100644
--- a/libs/utils/src/simple_rcu.rs
+++ b/libs/utils/src/simple_rcu.rs
@@ -49,12 +49,11 @@ use std::sync::{RwLock, RwLockWriteGuard};
 
 use tokio::sync::watch;
 
-///
 /// Rcu allows multiple readers to read and hold onto a value without blocking
-/// (for very long).  Storing to the Rcu updates the value, making new readers
-/// immediately see the new value, but it also waits for all current readers to
-/// finish.
+/// (for very long).
 ///
+/// Storing to the Rcu updates the value, making new readers immediately see
+/// the new value, but it also waits for all current readers to finish.
 pub struct Rcu<V> {
     inner: RwLock<RcuInner<V>>,
 }
diff --git a/libs/utils/src/sync/heavier_once_cell.rs b/libs/utils/src/sync/heavier_once_cell.rs
index 1abd3d9861..dc711fb028 100644
--- a/libs/utils/src/sync/heavier_once_cell.rs
+++ b/libs/utils/src/sync/heavier_once_cell.rs
@@ -5,7 +5,9 @@ use std::sync::{
 use tokio::sync::Semaphore;
 
 /// Custom design like [`tokio::sync::OnceCell`] but using [`OwnedSemaphorePermit`] instead of
-/// `SemaphorePermit`, allowing use of `take` which does not require holding an outer mutex guard
+/// `SemaphorePermit`.
+///
+/// Allows use of `take` which does not require holding an outer mutex guard
 /// for the duration of initialization.
 ///
 /// Has no unsafe, builds upon [`tokio::sync::Semaphore`] and [`std::sync::Mutex`].
diff --git a/libs/utils/src/vec_map.rs b/libs/utils/src/vec_map.rs
index 18b2af14f1..5f0028bacd 100644
--- a/libs/utils/src/vec_map.rs
+++ b/libs/utils/src/vec_map.rs
@@ -7,6 +7,7 @@ pub enum VecMapOrdering {
 }
 
 /// Ordered map datastructure implemented in a Vec.
+///
 /// Append only - can only add keys that are larger than the
 /// current max key.
 /// Ordering can be adjusted using [`VecMapOrdering`]
diff --git a/libs/utils/src/yielding_loop.rs b/libs/utils/src/yielding_loop.rs
index 41c4cee45d..68274f0631 100644
--- a/libs/utils/src/yielding_loop.rs
+++ b/libs/utils/src/yielding_loop.rs
@@ -6,9 +6,10 @@ pub enum YieldingLoopError {
     Cancelled,
 }
 
-/// Helper for long synchronous loops, e.g. over all tenants in the system.  Periodically
-/// yields to avoid blocking the executor, and after resuming checks the provided
-/// cancellation token to drop out promptly on shutdown.
+/// Helper for long synchronous loops, e.g. over all tenants in the system.
+///
+/// Periodically yields to avoid blocking the executor, and after resuming
+/// checks the provided cancellation token to drop out promptly on shutdown.
 #[inline(always)]
 pub async fn yielding_loop<I, T, F>(
     interval: usize,
diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs
index 4e68e276d3..29a98855d3 100644
--- a/pageserver/src/config.rs
+++ b/pageserver/src/config.rs
@@ -180,6 +180,8 @@ pub struct PageServerConf {
     pub io_buffer_alignment: usize,
 }
 
+/// Token for authentication to safekeepers
+///
 /// We do not want to store this in a PageServerConf because the latter may be logged
 /// and/or serialized at a whim, while the token is secret. Currently this token is the
 /// same for accessing all tenants/timelines, but may become per-tenant/per-timeline in
diff --git a/pageserver/src/context.rs b/pageserver/src/context.rs
index 012cb8d96f..7afcf52cf2 100644
--- a/pageserver/src/context.rs
+++ b/pageserver/src/context.rs
@@ -1,7 +1,9 @@
-//! This module defines `RequestContext`, a structure that we use throughout
-//! the pageserver to propagate high-level context from places
-//! that _originate_ activity down to the shared code paths at the
-//! heart of the pageserver. It's inspired by Golang's `context.Context`.
+//! Defines [`RequestContext`].
+//!
+//! It is a structure that we use throughout the pageserver to propagate
+//! high-level context from places that _originate_ activity down to the
+//! shared code paths at the heart of the pageserver. It's inspired by
+//! Golang's `context.Context`.
 //!
 //! For example, in `Timeline::get(page_nr, lsn)` we need to answer the following questions:
 //! 1. What high-level activity ([`TaskKind`]) needs this page?
diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs
index d28a214265..808d4b666e 100644
--- a/pageserver/src/pgdatadir_mapping.rs
+++ b/pageserver/src/pgdatadir_mapping.rs
@@ -1021,9 +1021,10 @@ impl Timeline {
 }
 
 /// DatadirModification represents an operation to ingest an atomic set of
-/// updates to the repository. It is created by the 'begin_record'
-/// function. It is called for each WAL record, so that all the modifications
-/// by a one WAL record appear atomic.
+/// updates to the repository.
+///
+/// It is created by the 'begin_record' function. It is called for each WAL
+/// record, so that all the modifications by one WAL record appear atomic.
 pub struct DatadirModification<'a> {
     /// The timeline this modification applies to. You can access this to
     /// read the state, but note that any pending updates are *not* reflected
@@ -2048,6 +2049,7 @@ impl<'a> DatadirModification<'a> {
 
 /// This struct facilitates accessing either a committed key from the timeline at a
 /// specific LSN, or the latest uncommitted key from a pending modification.
+///
 /// During WAL ingestion, the records from multiple LSNs may be batched in the same
 /// modification before being flushed to the timeline. Hence, the routines in WalIngest
 /// need to look up the keys in the modification first before looking them up in the
diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index fb30857ddf..fd2520a42e 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -1,8 +1,9 @@
+//! Timeline repository implementation that keeps old data in layer files, and
+//! the recent changes in ephemeral files.
 //!
-//! Timeline repository implementation that keeps old data in files on disk, and
-//! the recent changes in memory. See tenant/*_layer.rs files.
-//! The functions here are responsible for locating the correct layer for the
-//! get/put call, walking back the timeline branching history as needed.
+//! See tenant/*_layer.rs files. The functions here are responsible for locating
+//! the correct layer for the get/put call, walking back the timeline branching
+//! history as needed.
 //!
 //! The files are stored in the .neon/tenants/<tenant_id>/timelines/<timeline_id>
 //! directory. See docs/pageserver-storage.md for how the files are managed.
diff --git a/pageserver/src/tenant/metadata.rs b/pageserver/src/tenant/metadata.rs
index 190316df42..24440d4b35 100644
--- a/pageserver/src/tenant/metadata.rs
+++ b/pageserver/src/tenant/metadata.rs
@@ -1,7 +1,8 @@
-//! Describes the legacy now hopefully no longer modified per-timeline metadata stored in
-//! `index_part.json` managed by [`remote_timeline_client`]. For many tenants and their timelines,
-//! this struct and it's original serialization format is still needed because they were written a
-//! long time ago.
+//! Describes the legacy now hopefully no longer modified per-timeline metadata.
+//!
+//! It is stored in `index_part.json` managed by [`remote_timeline_client`]. For many tenants and
+//! their timelines, this struct and its original serialization format are still needed because
+//! they were written a long time ago.
 //!
 //! Instead of changing and adding versioning to this, just change [`IndexPart`] with soft json
 //! versioning.
diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs
index 4e6ea0c8f9..2104f41531 100644
--- a/pageserver/src/tenant/mgr.rs
+++ b/pageserver/src/tenant/mgr.rs
@@ -282,9 +282,10 @@ impl BackgroundPurges {
 static TENANTS: Lazy<std::sync::RwLock<TenantsMap>> =
     Lazy::new(|| std::sync::RwLock::new(TenantsMap::Initializing));
 
-/// The TenantManager is responsible for storing and mutating the collection of all tenants
-/// that this pageserver process has state for.  Every Tenant and SecondaryTenant instance
-/// lives inside the TenantManager.
+/// Responsible for storing and mutating the collection of all tenants
+/// that this pageserver has state for.
+///
+/// Every Tenant and SecondaryTenant instance lives inside the TenantManager.
 ///
 /// The most important role of the TenantManager is to prevent conflicts: e.g. trying to attach
 /// the same tenant twice concurrently, or trying to configure the same tenant into secondary
@@ -2346,8 +2347,9 @@ pub enum TenantMapError {
     ShuttingDown,
 }
 
-/// Guards a particular tenant_id's content in the TenantsMap.  While this
-/// structure exists, the TenantsMap will contain a [`TenantSlot::InProgress`]
+/// Guards a particular tenant_id's content in the TenantsMap.
+///
+/// While this structure exists, the TenantsMap will contain a [`TenantSlot::InProgress`]
 /// for this tenant, which acts as a marker for any operations targeting
 /// this tenant to retry later, or wait for the InProgress state to end.
 ///
diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs
index 71b766e4c7..1f9ae40af5 100644
--- a/pageserver/src/tenant/remote_timeline_client.rs
+++ b/pageserver/src/tenant/remote_timeline_client.rs
@@ -2184,6 +2184,8 @@ pub fn remote_timeline_path(
     remote_timelines_path(tenant_shard_id).join(Utf8Path::new(&timeline_id.to_string()))
 }
 
+/// Obtains the path of the given Layer in the remote storage.
+///
 /// Note that the shard component of a remote layer path is _not_ always the same
 /// as in the TenantShardId of the caller: tenants may reference layers from a different
 /// ShardIndex.  Use the ShardIndex from the layer's metadata.
diff --git a/pageserver/src/tenant/remote_timeline_client/index.rs b/pageserver/src/tenant/remote_timeline_client/index.rs
index 757fb9d032..c51ff54919 100644
--- a/pageserver/src/tenant/remote_timeline_client/index.rs
+++ b/pageserver/src/tenant/remote_timeline_client/index.rs
@@ -1,4 +1,5 @@
 //! In-memory index to track the tenant files on the remote storage.
+//!
 //! Able to restore itself from the storage index parts, that are located in every timeline's remote directory and contain all data about
 //! remote timeline layers and its metadata.
 
diff --git a/pageserver/src/tenant/storage_layer.rs b/pageserver/src/tenant/storage_layer.rs
index a1202ad507..dac6b2f893 100644
--- a/pageserver/src/tenant/storage_layer.rs
+++ b/pageserver/src/tenant/storage_layer.rs
@@ -434,10 +434,11 @@ impl ReadableLayer {
     }
 }
 
-/// Layers contain a hint indicating whether they are likely to be used for reads.  This is a hint rather
-/// than an authoritative value, so that we do not have to update it synchronously when changing the visibility
-/// of layers (for example when creating a branch that makes some previously covered layers visible).  It should
-/// be used for cache management but not for correctness-critical checks.
+/// Layers contain a hint indicating whether they are likely to be used for reads.
+///
+/// This is a hint rather than an authoritative value, so that we do not have to update it synchronously
+/// when changing the visibility of layers (for example when creating a branch that makes some previously
+/// covered layers visible).  It should be used for cache management but not for correctness-critical checks.
 #[derive(Debug, Clone, PartialEq, Eq)]
 pub enum LayerVisibilityHint {
     /// A Visible layer might be read while serving a read, because there is not an image layer between it
diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs
index 6a2cd94232..34f1b15138 100644
--- a/pageserver/src/tenant/storage_layer/delta_layer.rs
+++ b/pageserver/src/tenant/storage_layer/delta_layer.rs
@@ -136,10 +136,11 @@ impl Summary {
 // Flag indicating that this version initialize the page
 const WILL_INIT: u64 = 1;
 
-/// Struct representing reference to BLOB in layers. Reference contains BLOB
-/// offset, and for WAL records it also contains `will_init` flag. The flag
-/// helps to determine the range of records that needs to be applied, without
-/// reading/deserializing records themselves.
+/// Struct representing reference to BLOB in layers.
+///
+/// Reference contains BLOB offset, and for WAL records it also contains
+/// `will_init` flag. The flag helps to determine the range of records
+/// that needs to be applied, without reading/deserializing records themselves.
 #[derive(Debug, Serialize, Deserialize, Copy, Clone)]
 pub struct BlobRef(pub u64);
 
diff --git a/pageserver/src/tenant/storage_layer/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs
index 77ce1ae670..875e223c9c 100644
--- a/pageserver/src/tenant/storage_layer/image_layer.rs
+++ b/pageserver/src/tenant/storage_layer/image_layer.rs
@@ -1,7 +1,9 @@
 //! An ImageLayer represents an image or a snapshot of a key-range at
-//! one particular LSN. It contains an image of all key-value pairs
-//! in its key-range. Any key that falls into the image layer's range
-//! but does not exist in the layer, does not exist.
+//! one particular LSN.
+//!
+//! It contains an image of all key-value pairs in its key-range. Any key
+//! that falls into the image layer's range but does not exist in the layer,
+//! does not exist.
 //!
 //! An image layer is stored in a file on disk. The file is stored in
 //! timelines/<timeline_id> directory.  Currently, there are no
diff --git a/pageserver/src/tenant/storage_layer/layer_desc.rs b/pageserver/src/tenant/storage_layer/layer_desc.rs
index cbd18e650f..e90ff3c4b2 100644
--- a/pageserver/src/tenant/storage_layer/layer_desc.rs
+++ b/pageserver/src/tenant/storage_layer/layer_desc.rs
@@ -12,8 +12,10 @@ use serde::{Deserialize, Serialize};
 #[cfg(test)]
 use utils::id::TenantId;
 
-/// A unique identifier of a persistent layer. This is different from `LayerDescriptor`, which is only used in the
-/// benchmarks. This struct contains all necessary information to find the image / delta layer. It also provides
+/// A unique identifier of a persistent layer.
+///
+/// This is different from `LayerDescriptor`, which is only used in the benchmarks.
+/// This struct contains all necessary information to find the image / delta layer. It also provides
 /// a unified way to generate layer information like file name.
 #[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize, Hash)]
 pub struct PersistentLayerDesc {
diff --git a/pageserver/src/tenant/storage_layer/layer_name.rs b/pageserver/src/tenant/storage_layer/layer_name.rs
index 47ae556279..ffe7ca5f3e 100644
--- a/pageserver/src/tenant/storage_layer/layer_name.rs
+++ b/pageserver/src/tenant/storage_layer/layer_name.rs
@@ -217,8 +217,9 @@ impl fmt::Display for ImageLayerName {
     }
 }
 
-/// LayerName is the logical identity of a layer within a LayerMap at a moment in time.  The
-/// LayerName is not a unique filename, as the same LayerName may have multiple physical incarnations
+/// LayerName is the logical identity of a layer within a LayerMap at a moment in time.
+///
+/// The LayerName is not a unique filename, as the same LayerName may have multiple physical incarnations
 /// over time (e.g. across shard splits or compression). The physical filenames of layers in local
 /// storage and object names in remote storage consist of the LayerName plus some extra qualifiers
 /// that uniquely identify the physical incarnation of a layer (see [crate::tenant::remote_timeline_client::remote_layer_path])
diff --git a/pageserver/src/tenant/storage_layer/merge_iterator.rs b/pageserver/src/tenant/storage_layer/merge_iterator.rs
index d2c341e5ce..0831fd9530 100644
--- a/pageserver/src/tenant/storage_layer/merge_iterator.rs
+++ b/pageserver/src/tenant/storage_layer/merge_iterator.rs
@@ -226,9 +226,11 @@ impl<'a> IteratorWrapper<'a> {
     }
 }
 
-/// A merge iterator over delta/image layer iterators. When duplicated records are
-/// found, the iterator will not perform any deduplication, and the caller should handle
-/// these situation. By saying duplicated records, there are many possibilities:
+/// A merge iterator over delta/image layer iterators.
+///
+/// When duplicated records are found, the iterator will not perform any
+/// deduplication, and the caller should handle this situation. By saying
+/// duplicated records, there are many possibilities:
 ///
 /// * Two same delta at the same LSN.
 /// * Two same image at the same LSN.
diff --git a/pageserver/src/tenant/storage_layer/split_writer.rs b/pageserver/src/tenant/storage_layer/split_writer.rs
index e8deb0a1e5..7c1ac863bf 100644
--- a/pageserver/src/tenant/storage_layer/split_writer.rs
+++ b/pageserver/src/tenant/storage_layer/split_writer.rs
@@ -34,9 +34,10 @@ impl SplitWriterResult {
     }
 }
 
-/// An image writer that takes images and produces multiple image layers. The interface does not
-/// guarantee atomicity (i.e., if the image layer generation fails, there might be leftover files
-/// to be cleaned up)
+/// An image writer that takes images and produces multiple image layers.
+///
+/// The interface does not guarantee atomicity (i.e., if the image layer generation
+/// fails, there might be leftover files to be cleaned up)
 #[must_use]
 pub struct SplitImageLayerWriter {
     inner: ImageLayerWriter,
@@ -193,9 +194,10 @@ impl SplitImageLayerWriter {
     }
 }
 
-/// A delta writer that takes key-lsn-values and produces multiple delta layers. The interface does not
-/// guarantee atomicity (i.e., if the delta layer generation fails, there might be leftover files
-/// to be cleaned up).
+/// A delta writer that takes key-lsn-values and produces multiple delta layers.
+///
+/// The interface does not guarantee atomicity (i.e., if the delta layer generation fails,
+/// there might be leftover files to be cleaned up).
 ///
 /// Note that if updates of a single key exceed the target size limit, all of the updates will be batched
 /// into a single file. This behavior might change in the future. For reference, the legacy compaction algorithm
diff --git a/pageserver/src/tenant/vectored_blob_io.rs b/pageserver/src/tenant/vectored_blob_io.rs
index 4d51dc442d..553edf6d8b 100644
--- a/pageserver/src/tenant/vectored_blob_io.rs
+++ b/pageserver/src/tenant/vectored_blob_io.rs
@@ -593,8 +593,10 @@ impl<'a> VectoredBlobReader<'a> {
     }
 }
 
-/// Read planner used in [`crate::tenant::storage_layer::image_layer::ImageLayerIterator`]. It provides a streaming API for
-/// getting read blobs. It returns a batch when `handle` gets called and when the current key would just exceed the read_size and
+/// Read planner used in [`crate::tenant::storage_layer::image_layer::ImageLayerIterator`].
+///
+/// It provides a streaming API for getting read blobs. It returns a batch when
+/// `handle` gets called and when the current key would just exceed the read_size and
 /// max_cnt constraints.
 pub struct StreamingVectoredReadPlanner {
     read_builder: Option<VectoredReadBuilder>,
diff --git a/pageserver/src/virtual_file.rs b/pageserver/src/virtual_file.rs
index ed6ff86c10..57856eea80 100644
--- a/pageserver/src/virtual_file.rs
+++ b/pageserver/src/virtual_file.rs
@@ -1,6 +1,7 @@
-//!
 //! VirtualFile is like a normal File, but it's not bound directly to
-//! a file descriptor. Instead, the file is opened when it's read from,
+//! a file descriptor.
+//!
+//! Instead, the file is opened when it's read from,
 //! and if too many files are open globally in the system, least-recently
 //! used ones are closed.
 //!
diff --git a/pageserver/src/walredo.rs b/pageserver/src/walredo.rs
index 82585f9ed8..a36955fa21 100644
--- a/pageserver/src/walredo.rs
+++ b/pageserver/src/walredo.rs
@@ -43,13 +43,12 @@ use utils::lsn::Lsn;
 use utils::sync::gate::GateError;
 use utils::sync::heavier_once_cell;
 
+/// The real implementation that uses a Postgres process to
+/// perform WAL replay.
 ///
-/// This is the real implementation that uses a Postgres process to
-/// perform WAL replay. Only one thread can use the process at a time,
-/// that is controlled by the Mutex. In the future, we might want to
-/// launch a pool of processes to allow concurrent replay of multiple
-/// records.
-///
+/// Only one thread can use the process at a time, that is controlled by the
+/// Mutex. In the future, we might want to launch a pool of processes to allow
+/// concurrent replay of multiple records.
 pub struct PostgresRedoManager {
     tenant_shard_id: TenantShardId,
     conf: &'static PageServerConf,
diff --git a/proxy/src/stream.rs b/proxy/src/stream.rs
index 332dc27787..c14dd18afe 100644
--- a/proxy/src/stream.rs
+++ b/proxy/src/stream.rs
@@ -14,6 +14,7 @@ use tokio::io::{AsyncRead, AsyncWrite, ReadBuf};
 use tokio_rustls::server::TlsStream;
 
 /// Stream wrapper which implements libpq's protocol.
+///
 /// NOTE: This object deliberately doesn't implement [`AsyncRead`]
 /// or [`AsyncWrite`] to prevent subtle errors (e.g. trying
 /// to pass random malformed bytes through the connection).
diff --git a/safekeeper/src/pull_timeline.rs b/safekeeper/src/pull_timeline.rs
index 600a6bd8f0..64585f5edc 100644
--- a/safekeeper/src/pull_timeline.rs
+++ b/safekeeper/src/pull_timeline.rs
@@ -484,6 +484,7 @@ pub async fn validate_temp_timeline(
 }
 
 /// Move timeline from a temp directory to the main storage, and load it to the global map.
+///
 /// This operation is done under a lock to prevent bugs if several concurrent requests are
 /// trying to load the same timeline. Note that it doesn't guard against creating the
 /// timeline with the same ttid, but no one should be doing this anyway.
diff --git a/safekeeper/src/receive_wal.rs b/safekeeper/src/receive_wal.rs
index ab8c76dc17..e35f806e90 100644
--- a/safekeeper/src/receive_wal.rs
+++ b/safekeeper/src/receive_wal.rs
@@ -448,8 +448,10 @@ async fn network_write<IO: AsyncRead + AsyncWrite + Unpin>(
 const KEEPALIVE_INTERVAL: Duration = Duration::from_secs(1);
 
 /// Encapsulates a task which takes messages from msg_rx, processes and pushes
-/// replies to reply_tx; reading from socket and writing to disk in parallel is
-/// beneficial for performance, this struct provides writing to disk part.
+/// replies to reply_tx.
+///
+/// Reading from socket and writing to disk in parallel is beneficial for
+/// performance, this struct provides the writing to disk part.
 pub struct WalAcceptor {
     tli: WalResidentTimeline,
     msg_rx: Receiver<ProposerAcceptorMessage>,
diff --git a/safekeeper/src/state.rs b/safekeeper/src/state.rs
index dca6414082..97eeae3638 100644
--- a/safekeeper/src/state.rs
+++ b/safekeeper/src/state.rs
@@ -147,9 +147,11 @@ pub struct TimelineMemState {
     pub proposer_uuid: PgUuid,
 }
 
-/// Safekeeper persistent state plus in memory layer, to avoid frequent fsyncs
-/// when we update fields like commit_lsn which don't need immediate
-/// persistence. Provides transactional like API to atomically update the state.
+/// Safekeeper persistent state plus in memory layer.
+///
+/// Allows us to avoid frequent fsyncs when we update fields like commit_lsn
+/// which don't need immediate persistence. Provides transactional like API
+/// to atomically update the state.
 ///
 /// Implements Deref into *persistent* part.
 pub struct TimelineState<CTRL: control_file::Storage> {
diff --git a/safekeeper/src/timeline.rs b/safekeeper/src/timeline.rs
index 95ee925e1a..6fd5de0ad6 100644
--- a/safekeeper/src/timeline.rs
+++ b/safekeeper/src/timeline.rs
@@ -169,6 +169,7 @@ impl<'a> Drop for WriteGuardSharedState<'a> {
 }
 
 /// This structure is stored in shared state and represents the state of the timeline.
+///
 /// Usually it holds SafeKeeper, but it also supports offloaded timeline state. In this
 /// case, SafeKeeper is not available (because WAL is not present on disk) and all
 /// operations can be done only with control file.
diff --git a/safekeeper/src/timeline_eviction.rs b/safekeeper/src/timeline_eviction.rs
index 5d0567575c..5aa4921a92 100644
--- a/safekeeper/src/timeline_eviction.rs
+++ b/safekeeper/src/timeline_eviction.rs
@@ -1,6 +1,8 @@
-//! Code related to evicting WAL files to remote storage. The actual upload is done by the
-//! partial WAL backup code. This file has code to delete and re-download WAL files,
-//! cross-validate with partial WAL backup if local file is still present.
+//! Code related to evicting WAL files to remote storage.
+//!
+//! The actual upload is done by the partial WAL backup code. This file has
+//! code to delete and re-download WAL files, cross-validate with partial WAL
+//! backup if local file is still present.
 
 use anyhow::Context;
 use camino::Utf8PathBuf;
diff --git a/safekeeper/src/timeline_guard.rs b/safekeeper/src/timeline_guard.rs
index dbdf46412d..1ddac573d2 100644
--- a/safekeeper/src/timeline_guard.rs
+++ b/safekeeper/src/timeline_guard.rs
@@ -1,4 +1,6 @@
-//! Timeline residence guard is needed to ensure that WAL segments are present on disk,
+//! Timeline residence guard
+//!
+//! It is needed to ensure that WAL segments are present on disk,
 //! as long as the code is holding the guard. This file implements guard logic, to issue
 //! and drop guards, and to notify the manager when the guard is dropped.
 
diff --git a/safekeeper/src/timeline_manager.rs b/safekeeper/src/timeline_manager.rs
index f997f48454..6be75479db 100644
--- a/safekeeper/src/timeline_manager.rs
+++ b/safekeeper/src/timeline_manager.rs
@@ -1,4 +1,5 @@
 //! The timeline manager task is responsible for managing the timeline's background tasks.
+//!
 //! It is spawned alongside each timeline and exits when the timeline is deleted.
 //! It watches for changes in the timeline state and decides when to spawn or kill background tasks.
 //! It also can manage some reactive state, like should the timeline be active for broker pushes or not.
diff --git a/safekeeper/src/timelines_set.rs b/safekeeper/src/timelines_set.rs
index d6eea79f82..096e348295 100644
--- a/safekeeper/src/timelines_set.rs
+++ b/safekeeper/src/timelines_set.rs
@@ -60,7 +60,8 @@ impl TimelinesSet {
     }
 }
 
-/// Guard is used to add or remove timeline from the set.
+/// Guard is used to add or remove timelines from the set.
+///
 /// If the timeline present in set, it will be removed from it on drop.
 /// Note: do not use more than one guard for the same timeline, it caches the presence state.
 /// It is designed to be used in the manager task only.
diff --git a/safekeeper/src/wal_backup_partial.rs b/safekeeper/src/wal_backup_partial.rs
index 4050a82fff..bddfca50e4 100644
--- a/safekeeper/src/wal_backup_partial.rs
+++ b/safekeeper/src/wal_backup_partial.rs
@@ -1,6 +1,8 @@
 //! Safekeeper timeline has a background task which is subscribed to `commit_lsn`
-//! and `flush_lsn` updates. After the partial segment was updated (`flush_lsn`
-//! was changed), the segment will be uploaded to S3 in about 15 minutes.
+//! and `flush_lsn` updates.
+//!
+//! After the partial segment was updated (`flush_lsn` was changed), the segment
+//! will be uploaded to S3 within the configured `partial_backup_timeout`.
 //!
 //! The filename format for partial segments is
 //! `Segment_Term_Flush_Commit_skNN.partial`, where:
diff --git a/safekeeper/src/wal_service.rs b/safekeeper/src/wal_service.rs
index 16f7748eb4..1ab54d4cce 100644
--- a/safekeeper/src/wal_service.rs
+++ b/safekeeper/src/wal_service.rs
@@ -17,6 +17,7 @@ use crate::SafeKeeperConf;
 use postgres_backend::{AuthType, PostgresBackend};
 
 /// Accept incoming TCP connections and spawn them into a background thread.
+///
 /// allowed_auth_scope is either SafekeeperData (wide JWT tokens giving access
 /// to any tenant are allowed) or Tenant (only tokens giving access to specific
 /// tenant are allowed). Doesn't matter if auth is disabled in conf.
diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs
index 324f864291..e7eae647df 100644
--- a/storage_controller/src/service.rs
+++ b/storage_controller/src/service.rs
@@ -117,7 +117,9 @@ pub(crate) const STARTUP_RECONCILE_TIMEOUT: Duration = Duration::from_secs(30);
 pub const MAX_OFFLINE_INTERVAL_DEFAULT: Duration = Duration::from_secs(30);
 
 /// How long a node may be unresponsive to heartbeats during start up before we declare it
-/// offline. This is much more lenient than [`MAX_OFFLINE_INTERVAL_DEFAULT`] since the pageserver's
+/// offline.
+///
+/// This is much more lenient than [`MAX_OFFLINE_INTERVAL_DEFAULT`] since the pageserver's
 /// handling of the re-attach response may take a long time and blocks heartbeats from
 /// being handled on the pageserver side.
 pub const MAX_WARMING_UP_INTERVAL_DEFAULT: Duration = Duration::from_secs(300);
diff --git a/storage_scrubber/src/garbage.rs b/storage_scrubber/src/garbage.rs
index 3e22960f8d..d53611ed6e 100644
--- a/storage_scrubber/src/garbage.rs
+++ b/storage_scrubber/src/garbage.rs
@@ -1,6 +1,7 @@
-//! Functionality for finding and purging garbage, as in "garbage collection".  Garbage means
-//! S3 objects which are either not referenced by any metadata, or are referenced by a
-//! control plane tenant/timeline in a deleted state.
+//! Functionality for finding and purging garbage, as in "garbage collection".
+//!
+//! Garbage means S3 objects which are either not referenced by any metadata,
+//! or are referenced by a control plane tenant/timeline in a deleted state.
 
 use std::{
     collections::{HashMap, HashSet},
diff --git a/storage_scrubber/src/metadata_stream.rs b/storage_scrubber/src/metadata_stream.rs
index 10d77937f1..f896cff2d5 100644
--- a/storage_scrubber/src/metadata_stream.rs
+++ b/storage_scrubber/src/metadata_stream.rs
@@ -74,7 +74,9 @@ pub async fn stream_tenant_shards<'a>(
 }
 
 /// Given a `TenantShardId`, output a stream of the timelines within that tenant, discovered
-/// using a listing. The listing is done before the stream is built, so that this
+/// using a listing.
+///
+/// The listing is done before the stream is built, so that this
 /// function can be used to generate concurrency on a stream using buffer_unordered.
 pub async fn stream_tenant_timelines<'a>(
     remote_client: &'a GenericRemoteStorage,
diff --git a/storage_scrubber/src/pageserver_physical_gc.rs b/storage_scrubber/src/pageserver_physical_gc.rs
index 88681e38c2..c96d9cad3b 100644
--- a/storage_scrubber/src/pageserver_physical_gc.rs
+++ b/storage_scrubber/src/pageserver_physical_gc.rs
@@ -440,9 +440,10 @@ async fn gc_ancestor(
     Ok(())
 }
 
-/// Physical garbage collection: removing unused S3 objects.  This is distinct from the garbage collection
-/// done inside the pageserver, which operates at a higher level (keys, layers).  This type of garbage collection
-/// is about removing:
+/// Physical garbage collection: removing unused S3 objects.
+///
+/// This is distinct from the garbage collection done inside the pageserver, which operates at a higher level
+/// (keys, layers).  This type of garbage collection is about removing:
 /// - Objects that were uploaded but never referenced in the remote index (e.g. because of a shutdown between
 ///   uploading a layer and uploading an index)
 /// - Index objects from historic generations

From e287f36a058221b7c804b4b0f440933962eb3deb Mon Sep 17 00:00:00 2001
From: Arseny Sher <sher-ars@yandex.ru>
Date: Fri, 6 Sep 2024 15:23:57 +0300
Subject: [PATCH 025/142] safekeeper: fix endpoint restart immediately after
 xlog switch.

Check that truncation point is not from the future by comparing it with
write_record_lsn, not write_lsn, and explain that xlog switch changes
their normal order.

ref https://github.com/neondatabase/neon/issues/8911
---
 safekeeper/src/safekeeper.rs             |  3 ++-
 safekeeper/src/wal_storage.rs            | 23 ++++++++++++++++++-----
 test_runner/regress/test_wal_acceptor.py | 18 ++++++++++++++++++
 3 files changed, 38 insertions(+), 6 deletions(-)

diff --git a/safekeeper/src/safekeeper.rs b/safekeeper/src/safekeeper.rs
index dbe0034de2..b3e006ab05 100644
--- a/safekeeper/src/safekeeper.rs
+++ b/safekeeper/src/safekeeper.rs
@@ -938,8 +938,9 @@ where
         }
 
         trace!(
-            "processed AppendRequest of len {}, end_lsn={:?}, commit_lsn={:?}, truncate_lsn={:?}, flushed={:?}",
+            "processed AppendRequest of len {}, begin_lsn={}, end_lsn={:?}, commit_lsn={:?}, truncate_lsn={:?}, flushed={:?}",
             msg.wal_data.len(),
+            msg.h.begin_lsn,
             msg.h.end_lsn,
             msg.h.commit_lsn,
             msg.h.truncate_lsn,
diff --git a/safekeeper/src/wal_storage.rs b/safekeeper/src/wal_storage.rs
index 89c2e98a94..c477fe5c7b 100644
--- a/safekeeper/src/wal_storage.rs
+++ b/safekeeper/src/wal_storage.rs
@@ -98,7 +98,19 @@ pub struct PhysicalStorage {
     /// Also can be ahead of record_lsn, if happen to be in the middle of a WAL record.
     write_lsn: Lsn,
 
-    /// The LSN of the last WAL record written to disk. Still can be not fully flushed.
+    /// The LSN of the last WAL record written to disk. Still can be not fully
+    /// flushed.
+    ///
+    /// Note: Normally it (and flush_record_lsn) is <= write_lsn, but after xlog
+    /// switch ingest the reverse is true because we don't bump write_lsn up to
+    /// the next segment: WAL stream from the compute doesn't have the gap and
+    /// for simplicity / as a sanity check we disallow any non-sequential
+    /// writes, so write zeros as is.
+    ///
+    /// Similar effect is in theory possible due to LSN alignment: if record
+    /// ends at *2, decoder will report end lsn as *8 even though we haven't
+    /// written these zeros yet. In practice compute likely never sends
+    /// non-aligned chunks of data.
     write_record_lsn: Lsn,
 
     /// The LSN of the last WAL record flushed to disk.
@@ -440,11 +452,12 @@ impl Storage for PhysicalStorage {
             .with_label_values(&["truncate_wal"])
             .start_timer();
 
-        // Streaming must not create a hole, so truncate cannot be called on non-written lsn
-        if self.write_lsn != Lsn(0) && end_pos > self.write_lsn {
+        // Streaming must not create a hole, so truncate cannot be called on
+        // non-written lsn.
+        if self.write_record_lsn != Lsn(0) && end_pos > self.write_record_lsn {
             bail!(
-                "truncate_wal called on non-written WAL, write_lsn={}, end_pos={}",
-                self.write_lsn,
+                "truncate_wal called on non-written WAL, write_record_lsn={}, end_pos={}",
+                self.write_record_lsn,
                 end_pos
             );
         }
diff --git a/test_runner/regress/test_wal_acceptor.py b/test_runner/regress/test_wal_acceptor.py
index 3785651aed..5672e836ee 100644
--- a/test_runner/regress/test_wal_acceptor.py
+++ b/test_runner/regress/test_wal_acceptor.py
@@ -1057,6 +1057,24 @@ def test_restart_endpoint(neon_env_builder: NeonEnvBuilder):
         endpoint.start()
 
 
+# Try restarting endpoint immediately after xlog switch.
+# https://github.com/neondatabase/neon/issues/8911
+def test_restart_endpoint_after_switch_wal(neon_env_builder: NeonEnvBuilder):
+    env = neon_env_builder.init_start()
+
+    endpoint = env.endpoints.create_start("main")
+
+    endpoint.safe_psql("create table t (i int)")
+
+    endpoint.safe_psql("SELECT pg_switch_wal()")
+
+    # we want immediate shutdown to have endpoint restart on xlog switch record,
+    # so prevent shutdown checkpoint.
+    endpoint.stop(mode="immediate")
+    endpoint = env.endpoints.create_start("main")
+    endpoint.safe_psql("SELECT 'works'")
+
+
 # Context manager which logs passed time on exit.
 class DurationLogger:
     def __init__(self, desc):

From af6f63617e7421fca62ad2bf7ebfe2f0de66a793 Mon Sep 17 00:00:00 2001
From: Folke Behrens <folke@neon.tech>
Date: Fri, 6 Sep 2024 17:13:30 +0200
Subject: [PATCH 026/142] proxy: clean up code and lints for 1.81 and 1.82
 (#8945)

---
 proxy/src/cache/timed_lru.rs          |  2 +-
 proxy/src/lib.rs                      | 25 +++++++++++++++----------
 proxy/src/scram/exchange.rs           |  2 ++
 proxy/src/serverless/sql_over_http.rs |  9 +++------
 4 files changed, 21 insertions(+), 17 deletions(-)

diff --git a/proxy/src/cache/timed_lru.rs b/proxy/src/cache/timed_lru.rs
index 8bb482f7c6..5b08d74696 100644
--- a/proxy/src/cache/timed_lru.rs
+++ b/proxy/src/cache/timed_lru.rs
@@ -16,7 +16,7 @@ use tracing::debug;
 // On the other hand, `hashlink` has good download stats and appears to be maintained.
 use hashlink::{linked_hash_map::RawEntryMut, LruCache};
 
-use super::{common::Cached, *};
+use super::{common::Cached, timed_lru, Cache};
 
 /// An implementation of timed LRU cache with fixed capacity.
 /// Key properties:
diff --git a/proxy/src/lib.rs b/proxy/src/lib.rs
index 8d7e586b3d..923d6ae288 100644
--- a/proxy/src/lib.rs
+++ b/proxy/src/lib.rs
@@ -44,16 +44,14 @@
     clippy::items_after_statements,
 )]
 // List of temporarily allowed lints.
-// TODO: Switch to except() once stable with 1.81.
 // TODO: fix code and reduce list or move to permanent list above.
-#![allow(
+#![expect(
     clippy::cargo_common_metadata,
     clippy::cast_possible_truncation,
     clippy::cast_possible_wrap,
     clippy::cast_precision_loss,
     clippy::cast_sign_loss,
     clippy::doc_markdown,
-    clippy::implicit_hasher,
     clippy::inline_always,
     clippy::match_same_arms,
     clippy::match_wild_err_arm,
@@ -61,21 +59,28 @@
     clippy::missing_panics_doc,
     clippy::module_name_repetitions,
     clippy::needless_pass_by_value,
-    clippy::needless_raw_string_hashes,
     clippy::redundant_closure_for_method_calls,
-    clippy::return_self_not_must_use,
     clippy::similar_names,
     clippy::single_match_else,
     clippy::struct_excessive_bools,
     clippy::struct_field_names,
     clippy::too_many_lines,
-    clippy::unreadable_literal,
-    clippy::unused_async,
-    clippy::unused_self,
-    clippy::wildcard_imports
+    clippy::unused_self
+)]
+#![cfg_attr(
+    any(test, feature = "testing"),
+    allow(
+        clippy::needless_raw_string_hashes,
+        clippy::unreadable_literal,
+        clippy::unused_async,
+    )
 )]
 // List of temporarily allowed lints to unblock beta/nightly.
-#![allow(unknown_lints, clippy::manual_inspect)]
+#![allow(
+    unknown_lints,
+    // TODO: 1.82: Add `use<T>` where necessary and remove from this list.
+    impl_trait_overcaptures,
+)]
 
 use std::{convert::Infallible, future::Future};
 
diff --git a/proxy/src/scram/exchange.rs b/proxy/src/scram/exchange.rs
index 786cbcaa19..afb5604666 100644
--- a/proxy/src/scram/exchange.rs
+++ b/proxy/src/scram/exchange.rs
@@ -217,6 +217,7 @@ impl sasl::Mechanism for Exchange<'_> {
                         self.state = ExchangeState::SaltSent(sent);
                         Ok(Step::Continue(self, msg))
                     }
+                    #[allow(unreachable_patterns)] // TODO: 1.82: simply drop this match
                     Step::Success(x, _) => match x {},
                     Step::Failure(msg) => Ok(Step::Failure(msg)),
                 }
@@ -224,6 +225,7 @@ impl sasl::Mechanism for Exchange<'_> {
             ExchangeState::SaltSent(sent) => {
                 match sent.transition(self.secret, &self.tls_server_end_point, input)? {
                     Step::Success(keys, msg) => Ok(Step::Success(keys, msg)),
+                    #[allow(unreachable_patterns)] // TODO: 1.82: simply drop this match
                     Step::Continue(x, _) => match x {},
                     Step::Failure(msg) => Ok(Step::Failure(msg)),
                 }
diff --git a/proxy/src/serverless/sql_over_http.rs b/proxy/src/serverless/sql_over_http.rs
index 5b36f5e91d..2188edc8c5 100644
--- a/proxy/src/serverless/sql_over_http.rs
+++ b/proxy/src/serverless/sql_over_http.rs
@@ -745,22 +745,20 @@ impl BatchQueryData {
             builder = builder.deferrable(true);
         }
 
-        let transaction = builder.start().await.map_err(|e| {
+        let transaction = builder.start().await.inspect_err(|_| {
             // if we cannot start a transaction, we should return immediately
             // and not return to the pool. connection is clearly broken
             discard.discard();
-            e
         })?;
 
         let json_output =
             match query_batch(cancel.child_token(), &transaction, self, parsed_headers).await {
                 Ok(json_output) => {
                     info!("commit");
-                    let status = transaction.commit().await.map_err(|e| {
+                    let status = transaction.commit().await.inspect_err(|_| {
                         // if we cannot commit - for now don't return connection to pool
                         // TODO: get a query status from the error
                         discard.discard();
-                        e
                     })?;
                     discard.check_idle(status);
                     json_output
@@ -776,11 +774,10 @@ impl BatchQueryData {
                 }
                 Err(err) => {
                     info!("rollback");
-                    let status = transaction.rollback().await.map_err(|e| {
+                    let status = transaction.rollback().await.inspect_err(|_| {
                         // if we cannot rollback - for now don't return connection to pool
                         // TODO: get a query status from the error
                         discard.discard();
-                        e
                     })?;
                     discard.check_idle(status);
                     return Err(err);

From 11cf16e3f363ce027e53b1834a77858d50daee0d Mon Sep 17 00:00:00 2001
From: Arseny Sher <sher-ars@yandex.ru>
Date: Mon, 19 Aug 2024 14:42:07 +0300
Subject: [PATCH 027/142] safekeeper: add term_bump endpoint.

When walproposer now observes a higher term it restarts instead of
crashing the whole compute with PANIC; this avoids a compute crash after
a term_bump call. After a successful election we still check
last_log_term of the highest given vote to ensure the basebackup is good,
and PANIC otherwise.

It will be used for migration per
035-safekeeper-dynamic-membership-change.md
and
https://github.com/neondatabase/docs/pull/21

ref https://github.com/neondatabase/neon/issues/8700
---
 libs/safekeeper_api/src/models.rs        | 13 +++++++++
 pgxn/neon/walproposer.c                  | 24 ++++++++++-----
 safekeeper/src/auth.rs                   |  3 ++
 safekeeper/src/http/routes.rs            | 28 +++++++++++++++++-
 safekeeper/src/state.rs                  | 26 +++++++++++++++--
 safekeeper/src/timeline.rs               | 10 +++++++
 test_runner/fixtures/safekeeper/http.py  | 29 +++++++++++++++++++
 test_runner/regress/test_wal_acceptor.py | 37 ++++++++++++++++++++++++
 8 files changed, 159 insertions(+), 11 deletions(-)

diff --git a/libs/safekeeper_api/src/models.rs b/libs/safekeeper_api/src/models.rs
index 2fbc333075..28666d197a 100644
--- a/libs/safekeeper_api/src/models.rs
+++ b/libs/safekeeper_api/src/models.rs
@@ -60,3 +60,16 @@ pub struct TimelineCopyRequest {
     pub target_timeline_id: TimelineId,
     pub until_lsn: Lsn,
 }
+
+#[derive(Debug, Clone, Deserialize, Serialize)]
+pub struct TimelineTermBumpRequest {
+    /// Term to bump to; if None, the current term is incremented.
+    pub term: Option<u64>,
+}
+
+#[derive(Debug, Clone, Deserialize, Serialize)]
+pub struct TimelineTermBumpResponse {
+    /// Term before the request.
+    pub previous_term: u64,
+    pub current_term: u64,
+}
diff --git a/pgxn/neon/walproposer.c b/pgxn/neon/walproposer.c
index c53257923a..c1914421ec 100644
--- a/pgxn/neon/walproposer.c
+++ b/pgxn/neon/walproposer.c
@@ -1038,9 +1038,12 @@ DetermineEpochStartLsn(WalProposer *wp)
 		if (SkipXLogPageHeader(wp, wp->propEpochStartLsn) != wp->api.get_redo_start_lsn(wp))
 		{
 			/*
-			 * However, allow to proceed if previously elected leader was me;
-			 * plain restart of walproposer not intervened by concurrent
-			 * compute (who could generate WAL) is ok.
+			 * However, allow to proceed if last_log_term on the node which gave
+			 * the highest vote (i.e. point where we are going to start writing)
+			 * actually had been won by me; plain restart of walproposer not
+			 * intervened by concurrent compute which wrote WAL is ok.
+			 *
+			 * This avoids compute crash after manual term_bump.
 			 */
 			if (!((dth->n_entries >= 1) && (dth->entries[dth->n_entries - 1].term ==
 											pg_atomic_read_u64(&walprop_shared->mineLastElectedTerm))))
@@ -1442,12 +1445,17 @@ RecvAppendResponses(Safekeeper *sk)
 		if (sk->appendResponse.term > wp->propTerm)
 		{
 			/*
-			 * Another compute with higher term is running. Panic to restart
-			 * PG as we likely need to retake basebackup. However, don't dump
-			 * core as this is kinda expected scenario.
+			 *
+			 * Term has changed to higher one, probably another compute is
+			 * running. If this is the case we could PANIC as well because
+			 * likely it inserted some data and our basebackup is unsuitable
+			 * anymore. However, we also bump term manually (term_bump endpoint)
+			 * on safekeepers for migration purposes, in this case we do want
+			 * compute to stay alive. So restart walproposer with FATAL instead
+			 * of panicking; if basebackup is spoiled next election will notice
+			 * this.
 			 */
-			disable_core_dump();
-			wp_log(PANIC, "WAL acceptor %s:%s with term " INT64_FORMAT " rejected our request, our term " INT64_FORMAT ", meaning another compute is running at the same time, and it conflicts with us",
+			wp_log(FATAL, "WAL acceptor %s:%s with term " INT64_FORMAT " rejected our request, our term " INT64_FORMAT ", meaning another compute is running at the same time, and it conflicts with us",
 				   sk->host, sk->port,
 				   sk->appendResponse.term, wp->propTerm);
 		}
diff --git a/safekeeper/src/auth.rs b/safekeeper/src/auth.rs
index b8bc3f3e06..c5c9393c00 100644
--- a/safekeeper/src/auth.rs
+++ b/safekeeper/src/auth.rs
@@ -1,6 +1,9 @@
 use utils::auth::{AuthError, Claims, Scope};
 use utils::id::TenantId;
 
+/// If tenant_id is provided, allow if token (claims) is for this tenant or
+/// whole safekeeper scope (SafekeeperData). Else, allow only if token is
+/// SafekeeperData.
 pub fn check_permission(claims: &Claims, tenant_id: Option<TenantId>) -> Result<(), AuthError> {
     match (&claims.scope, tenant_id) {
         (Scope::Tenant, None) => Err(AuthError(
diff --git a/safekeeper/src/http/routes.rs b/safekeeper/src/http/routes.rs
index 9b7424a818..e482edea55 100644
--- a/safekeeper/src/http/routes.rs
+++ b/safekeeper/src/http/routes.rs
@@ -18,8 +18,8 @@ use utils::http::endpoint::{prometheus_metrics_handler, request_span, ChannelWri
 use utils::http::request::parse_query_param;
 
 use postgres_ffi::WAL_SEGMENT_SIZE;
-use safekeeper_api::models::TimelineCreateRequest;
 use safekeeper_api::models::{SkTimelineInfo, TimelineCopyRequest};
+use safekeeper_api::models::{TimelineCreateRequest, TimelineTermBumpRequest};
 use utils::{
     auth::SwappableJwtAuth,
     http::{
@@ -408,6 +408,28 @@ async fn timeline_backup_partial_reset(request: Request<Body>) -> Result<Respons
     json_response(StatusCode::OK, response)
 }
 
+/// Make term at least as high as one in request. If one in request is None,
+/// increment current one.
+async fn timeline_term_bump_handler(
+    mut request: Request<Body>,
+) -> Result<Response<Body>, ApiError> {
+    let ttid = TenantTimelineId::new(
+        parse_request_param(&request, "tenant_id")?,
+        parse_request_param(&request, "timeline_id")?,
+    );
+    check_permission(&request, Some(ttid.tenant_id))?;
+
+    let request_data: TimelineTermBumpRequest = json_request(&mut request).await?;
+
+    let tli = GlobalTimelines::get(ttid).map_err(ApiError::from)?;
+    let response = tli
+        .term_bump(request_data.term)
+        .await
+        .map_err(ApiError::InternalServerError)?;
+
+    json_response(StatusCode::OK, response)
+}
+
 /// Used only in tests to hand craft required data.
 async fn record_safekeeper_info(mut request: Request<Body>) -> Result<Response<Body>, ApiError> {
     let ttid = TenantTimelineId::new(
@@ -630,6 +652,10 @@ pub fn make_router(conf: SafeKeeperConf) -> RouterBuilder<hyper::Body, ApiError>
             "/v1/tenant/:tenant_id/timeline/:timeline_id/backup_partial_reset",
             |r| request_span(r, timeline_backup_partial_reset),
         )
+        .post(
+            "/v1/tenant/:tenant_id/timeline/:timeline_id/term_bump",
+            |r| request_span(r, timeline_term_bump_handler),
+        )
         .post("/v1/record_safekeeper_info/:tenant_id/:timeline_id", |r| {
             request_span(r, record_safekeeper_info)
         })
diff --git a/safekeeper/src/state.rs b/safekeeper/src/state.rs
index 97eeae3638..8ae749ded5 100644
--- a/safekeeper/src/state.rs
+++ b/safekeeper/src/state.rs
@@ -1,9 +1,10 @@
 //! Defines per timeline data stored persistently (SafeKeeperPersistentState)
 //! and its wrapper with in memory layer (SafekeeperState).
 
-use std::ops::Deref;
+use std::{cmp::max, ops::Deref};
 
 use anyhow::Result;
+use safekeeper_api::models::TimelineTermBumpResponse;
 use serde::{Deserialize, Serialize};
 use utils::{
     id::{NodeId, TenantId, TenantTimelineId, TimelineId},
@@ -12,7 +13,7 @@ use utils::{
 
 use crate::{
     control_file,
-    safekeeper::{AcceptorState, PersistedPeerInfo, PgUuid, ServerInfo, TermHistory},
+    safekeeper::{AcceptorState, PersistedPeerInfo, PgUuid, ServerInfo, Term, TermHistory},
     wal_backup_partial::{self},
 };
 
@@ -211,6 +212,27 @@ where
         let s = self.start_change();
         self.finish_change(&s).await
     }
+
+    /// Make term at least as high as `to`. If `to` is None, increment current one. This
+    /// is not in safekeeper.rs because we want to be able to do it even if
+    /// timeline is offloaded.
+    pub async fn term_bump(&mut self, to: Option<Term>) -> Result<TimelineTermBumpResponse> {
+        let before = self.acceptor_state.term;
+        let mut state = self.start_change();
+        let new = match to {
+            Some(to) => max(state.acceptor_state.term, to),
+            None => state.acceptor_state.term + 1,
+        };
+        if new > state.acceptor_state.term {
+            state.acceptor_state.term = new;
+            self.finish_change(&state).await?;
+        }
+        let after = self.acceptor_state.term;
+        Ok(TimelineTermBumpResponse {
+            previous_term: before,
+            current_term: after,
+        })
+    }
 }
 
 impl<CTRL> Deref for TimelineState<CTRL>
diff --git a/safekeeper/src/timeline.rs b/safekeeper/src/timeline.rs
index 6fd5de0ad6..fb98534768 100644
--- a/safekeeper/src/timeline.rs
+++ b/safekeeper/src/timeline.rs
@@ -4,6 +4,7 @@
 use anyhow::{anyhow, bail, Result};
 use camino::Utf8PathBuf;
 use remote_storage::RemotePath;
+use safekeeper_api::models::TimelineTermBumpResponse;
 use serde::{Deserialize, Serialize};
 use tokio::fs::{self};
 use tokio_util::sync::CancellationToken;
@@ -215,6 +216,10 @@ impl StateSK {
             .get_last_log_term(self.flush_lsn())
     }
 
+    pub async fn term_bump(&mut self, to: Option<Term>) -> Result<TimelineTermBumpResponse> {
+        self.state_mut().term_bump(to).await
+    }
+
     /// Close open WAL files to release FDs.
     fn close_wal_store(&mut self) {
         if let StateSK::Loaded(sk) = self {
@@ -854,6 +859,11 @@ impl Timeline {
         Ok(res)
     }
 
+    pub async fn term_bump(self: &Arc<Self>, to: Option<Term>) -> Result<TimelineTermBumpResponse> {
+        let mut state = self.write_shared_state().await;
+        state.sk.term_bump(to).await
+    }
+
     /// Get the timeline guard for reading/writing WAL files.
     /// If WAL files are not present on disk (evicted), they will be automatically
     /// downloaded from remote storage. This is done in the manager task, which is
diff --git a/test_runner/fixtures/safekeeper/http.py b/test_runner/fixtures/safekeeper/http.py
index 9bf03554e7..96c84d1616 100644
--- a/test_runner/fixtures/safekeeper/http.py
+++ b/test_runner/fixtures/safekeeper/http.py
@@ -50,6 +50,19 @@ class SafekeeperMetrics(Metrics):
         ).value
 
 
+@dataclass
+class TermBumpResponse:
+    previous_term: int
+    current_term: int
+
+    @classmethod
+    def from_json(cls, d: Dict[str, Any]) -> "TermBumpResponse":
+        return TermBumpResponse(
+            previous_term=d["previous_term"],
+            current_term=d["current_term"],
+        )
+
+
 class SafekeeperHttpClient(requests.Session, MetricsGetter):
     HTTPError = requests.HTTPError
 
@@ -252,6 +265,22 @@ class SafekeeperHttpClient(requests.Session, MetricsGetter):
         res.raise_for_status()
         return res.json()
 
+    def term_bump(
+        self,
+        tenant_id: TenantId,
+        timeline_id: TimelineId,
+        term: Optional[int],
+    ) -> TermBumpResponse:
+        body = {}
+        if term is not None:
+            body["term"] = term
+        res = self.post(
+            f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/term_bump",
+            json=body,
+        )
+        res.raise_for_status()
+        return TermBumpResponse.from_json(res.json())
+
     def record_safekeeper_info(self, tenant_id: TenantId, timeline_id: TimelineId, body):
         res = self.post(
             f"http://localhost:{self.port}/v1/record_safekeeper_info/{tenant_id}/{timeline_id}",
diff --git a/test_runner/regress/test_wal_acceptor.py b/test_runner/regress/test_wal_acceptor.py
index 5672e836ee..50fac441c0 100644
--- a/test_runner/regress/test_wal_acceptor.py
+++ b/test_runner/regress/test_wal_acceptor.py
@@ -2194,6 +2194,43 @@ def test_patch_control_file(neon_env_builder: NeonEnvBuilder):
     assert res["timelines"][0]["control_file"]["timeline_start_lsn"] == "0/1"
 
 
+def test_term_bump(neon_env_builder: NeonEnvBuilder):
+    neon_env_builder.num_safekeepers = 1
+    env = neon_env_builder.init_start()
+
+    tenant_id = env.initial_tenant
+    timeline_id = env.initial_timeline
+
+    endpoint = env.endpoints.create_start("main")
+    # initialize safekeeper
+    endpoint.safe_psql("create table t(key int, value text)")
+
+    http_cli = env.safekeepers[0].http_client()
+
+    # check that bump up to specific term works
+    curr_term = http_cli.timeline_status(tenant_id, timeline_id).term
+    bump_to = curr_term + 3
+    res = http_cli.term_bump(tenant_id, timeline_id, bump_to)
+    log.info(f"bump to {bump_to} res: {res}")
+    assert res.current_term >= bump_to
+
+    # check that bump to none increments current term
+    res = http_cli.term_bump(tenant_id, timeline_id, None)
+    log.info(f"bump to None res: {res}")
+    assert res.current_term > bump_to
+    assert res.current_term > res.previous_term
+
+    # check that bumping doesn't work downward
+    res = http_cli.term_bump(tenant_id, timeline_id, 2)
+    log.info(f"bump to 2 res: {res}")
+    assert res.current_term > bump_to
+    assert res.current_term == res.previous_term
+
+    # check that this doesn't kill endpoint because last WAL flush was his and
+    # thus its basebackup is still good
+    endpoint.safe_psql("insert into t values (1, 'payload')")
+
+
 # Test disables periodic pushes from safekeeper to the broker and checks that
 # pageserver can still discover safekeepers with discovery requests.
 def test_broker_discovery(neon_env_builder: NeonEnvBuilder):

From 8eab7009c11ebc03f09b2f3916e642664a4b9f88 Mon Sep 17 00:00:00 2001
From: Arseny Sher <sher-ars@yandex.ru>
Date: Fri, 6 Sep 2024 16:54:45 +0300
Subject: [PATCH 028/142] safekeeper: do pid file lock before id init

---
 safekeeper/src/bin/safekeeper.rs | 19 +++++++++----------
 1 file changed, 9 insertions(+), 10 deletions(-)

diff --git a/safekeeper/src/bin/safekeeper.rs b/safekeeper/src/bin/safekeeper.rs
index 41c2d3fe08..644d5e6eaf 100644
--- a/safekeeper/src/bin/safekeeper.rs
+++ b/safekeeper/src/bin/safekeeper.rs
@@ -261,6 +261,15 @@ async fn main() -> anyhow::Result<()> {
     // Change into the data directory.
     std::env::set_current_dir(&workdir)?;
 
+    // Prevent running multiple safekeepers on the same directory
+    let lock_file_path = workdir.join(PID_FILE_NAME);
+    let lock_file =
+        pid_file::claim_for_current_process(&lock_file_path).context("claim pid file")?;
+    info!("claimed pid file at {lock_file_path:?}");
+    // ensure that the lock file is held even if the main thread of the process panics;
+    // we need to release the lock file only when the current process is gone
+    std::mem::forget(lock_file);
+
     // Set or read our ID.
     let id = set_id(&workdir, args.id.map(NodeId))?;
     if args.init {
@@ -364,16 +373,6 @@ async fn main() -> anyhow::Result<()> {
 type JoinTaskRes = Result<anyhow::Result<()>, JoinError>;
 
 async fn start_safekeeper(conf: SafeKeeperConf) -> Result<()> {
-    // Prevent running multiple safekeepers on the same directory
-    let lock_file_path = conf.workdir.join(PID_FILE_NAME);
-    let lock_file =
-        pid_file::claim_for_current_process(&lock_file_path).context("claim pid file")?;
-    info!("claimed pid file at {lock_file_path:?}");
-
-    // ensure that the lock file is held even if the main thread of the process is panics
-    // we need to release the lock file only when the current process is gone
-    std::mem::forget(lock_file);
-
     info!("starting safekeeper WAL service on {}", conf.listen_pg_addr);
     let pg_listener = tcp_listener::bind(conf.listen_pg_addr.clone()).map_err(|e| {
         error!("failed to bind to address {}: {}", conf.listen_pg_addr, e);

From c1a51416dbfadbf05cc352168ebc6fc4a83c6f59 Mon Sep 17 00:00:00 2001
From: Arseny Sher <sher-ars@yandex.ru>
Date: Fri, 6 Sep 2024 17:40:21 +0300
Subject: [PATCH 029/142] safekeeper: fsync filesystem on start.

We can't really rely on file contents after boot without fsync'ing
them.
---
 libs/utils/src/crashsafe.rs      | 21 +++++++++++++++++++++
 pageserver/src/bin/pageserver.rs | 19 ++-----------------
 safekeeper/src/bin/safekeeper.rs | 12 +++++++++++-
 safekeeper/src/wal_storage.rs    |  3 +--
 4 files changed, 35 insertions(+), 20 deletions(-)

diff --git a/libs/utils/src/crashsafe.rs b/libs/utils/src/crashsafe.rs
index 756b19138c..946fedf6e5 100644
--- a/libs/utils/src/crashsafe.rs
+++ b/libs/utils/src/crashsafe.rs
@@ -1,9 +1,11 @@
+use std::os::fd::AsRawFd;
 use std::{
     borrow::Cow,
     fs::{self, File},
     io::{self, Write},
 };
 
+use anyhow::Context;
 use camino::{Utf8Path, Utf8PathBuf};
 
 /// Similar to [`std::fs::create_dir`], except we fsync the
@@ -203,6 +205,25 @@ pub fn overwrite(
     Ok(())
 }
 
+/// Syncs the filesystem for the given file descriptor.
+pub fn syncfs(fd: impl AsRawFd) -> anyhow::Result<()> {
+    // Linux guarantees durability for syncfs.
+    // POSIX doesn't have syncfs, and further does not actually guarantee durability of sync().
+    #[cfg(target_os = "linux")]
+    {
+        nix::unistd::syncfs(fd.as_raw_fd()).context("syncfs")?;
+    }
+    #[cfg(target_os = "macos")]
+    {
+        // macOS is not a production platform for Neon, don't even bother.
+    }
+    #[cfg(not(any(target_os = "linux", target_os = "macos")))]
+    {
+        compile_error!("Unsupported OS");
+    }
+    Ok(())
+}
+
 #[cfg(test)]
 mod tests {
 
diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs
index 59194ab4bd..d15a0e47a4 100644
--- a/pageserver/src/bin/pageserver.rs
+++ b/pageserver/src/bin/pageserver.rs
@@ -37,6 +37,7 @@ use pageserver::{
     virtual_file,
 };
 use postgres_backend::AuthType;
+use utils::crashsafe::syncfs;
 use utils::failpoint_support;
 use utils::logging::TracingErrorLayerEnablement;
 use utils::{
@@ -155,23 +156,7 @@ fn main() -> anyhow::Result<()> {
         };
 
         let started = Instant::now();
-        // Linux guarantees durability for syncfs.
-        // POSIX doesn't have syncfs, and further does not actually guarantee durability of sync().
-        #[cfg(target_os = "linux")]
-        {
-            use std::os::fd::AsRawFd;
-            nix::unistd::syncfs(dirfd.as_raw_fd()).context("syncfs")?;
-        }
-        #[cfg(target_os = "macos")]
-        {
-            // macOS is not a production platform for Neon, don't even bother.
-            drop(dirfd);
-        }
-        #[cfg(not(any(target_os = "linux", target_os = "macos")))]
-        {
-            compile_error!("Unsupported OS");
-        }
-
+        syncfs(dirfd)?;
         let elapsed = started.elapsed();
         info!(
             elapsed_ms = elapsed.as_millis(),
diff --git a/safekeeper/src/bin/safekeeper.rs b/safekeeper/src/bin/safekeeper.rs
index 644d5e6eaf..5270934f5e 100644
--- a/safekeeper/src/bin/safekeeper.rs
+++ b/safekeeper/src/bin/safekeeper.rs
@@ -19,7 +19,7 @@ use std::fs::{self, File};
 use std::io::{ErrorKind, Write};
 use std::str::FromStr;
 use std::sync::Arc;
-use std::time::Duration;
+use std::time::{Duration, Instant};
 use storage_broker::Uri;
 
 use tracing::*;
@@ -373,6 +373,16 @@ async fn main() -> anyhow::Result<()> {
 type JoinTaskRes = Result<anyhow::Result<()>, JoinError>;
 
 async fn start_safekeeper(conf: SafeKeeperConf) -> Result<()> {
+    // fsync the datadir to make sure we have a consistent state on disk.
+    let dfd = File::open(&conf.workdir).context("open datadir for syncfs")?;
+    let started = Instant::now();
+    utils::crashsafe::syncfs(dfd)?;
+    let elapsed = started.elapsed();
+    info!(
+        elapsed_ms = elapsed.as_millis(),
+        "syncfs data directory done"
+    );
+
     info!("starting safekeeper WAL service on {}", conf.listen_pg_addr);
     let pg_listener = tcp_listener::bind(conf.listen_pg_addr.clone()).map_err(|e| {
         error!("failed to bind to address {}: {}", conf.listen_pg_addr, e);
diff --git a/safekeeper/src/wal_storage.rs b/safekeeper/src/wal_storage.rs
index c477fe5c7b..46c260901d 100644
--- a/safekeeper/src/wal_storage.rs
+++ b/safekeeper/src/wal_storage.rs
@@ -179,8 +179,7 @@ impl PhysicalStorage {
             )
         };
 
-        // TODO: do we really know that write_lsn is fully flushed to disk?
-        //      If not, maybe it's better to call fsync() here to be sure?
+        // note: this assumes we fsync'ed whole datadir on start.
         let flush_lsn = write_lsn;
 
         debug!(

From 30583cb6264653175d659d0fcb636a42e21c5877 Mon Sep 17 00:00:00 2001
From: Alexander Bayandin <alexander@neon.tech>
Date: Fri, 6 Sep 2024 17:42:35 +0100
Subject: [PATCH 030/142] CI(label-for-external-users): add retry logic for
 unexpected errors (#8938)

## Problem

One of the PRs opened by a `neondatabase` org member got labelled as
`external` because the `gh api` call failed in the wrong way:
```
Get "https://api.github.com/orgs/neondatabase/members/<username>": dial tcp 140.82.114.5:443: i/o timeout
is-member=false
```

## Summary of changes
- Check that the error message is expected before labelling PRs
- Retry the `gh api` call up to 10 times in case of unexpected error messages
- Add `workflow_dispatch` trigger
---
 .../workflows/label-for-external-users.yml    | 34 ++++++++++++++++---
 1 file changed, 29 insertions(+), 5 deletions(-)

diff --git a/.github/workflows/label-for-external-users.yml b/.github/workflows/label-for-external-users.yml
index 585d118dfb..b7cbc06a73 100644
--- a/.github/workflows/label-for-external-users.yml
+++ b/.github/workflows/label-for-external-users.yml
@@ -7,6 +7,11 @@ on:
   pull_request_target:
     types:
       - opened
+  workflow_dispatch:
+    inputs:
+      github-actor:
+        description: 'GitHub username. If empty, the username of the current user will be used'
+        required: false
 
 # No permission for GITHUB_TOKEN by default; the **minimal required** set of permissions should be granted in each job.
 permissions: {}
@@ -26,12 +31,31 @@ jobs:
       id: check-user
       env:
         GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }}
+        ACTOR: ${{ inputs.github-actor || github.actor }}
       run: |
-        if gh api -H "Accept: application/vnd.github+json" -H "X-GitHub-Api-Version: 2022-11-28" "/orgs/${GITHUB_REPOSITORY_OWNER}/members/${GITHUB_ACTOR}"; then
-          is_member=true
-        else
-          is_member=false
-        fi
+        expected_error="User does not exist or is not a member of the organization"
+        output_file=output.txt
+
+        for i in $(seq 1 10); do
+          if gh api "/orgs/${GITHUB_REPOSITORY_OWNER}/members/${ACTOR}" \
+              -H "Accept: application/vnd.github+json" \
+              -H "X-GitHub-Api-Version: 2022-11-28" > ${output_file}; then
+
+            is_member=true
+            break
+          elif grep -q "${expected_error}" ${output_file}; then
+            is_member=false
+            break
+          elif [ $i -eq 10 ]; then
+            title="Failed to get membership status for ${ACTOR}"
+            message="The latest GitHub API error message: '$(cat ${output_file})'"
+            echo "::error file=.github/workflows/label-for-external-users.yml,title=${title}::${message}"
+
+            exit 1
+          fi
+
+          sleep 1
+        done
 
         echo "is-member=${is_member}" | tee -a ${GITHUB_OUTPUT}
 

From ac5815b5940c412a281c6bbab34809689a738da7 Mon Sep 17 00:00:00 2001
From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com>
Date: Sat, 7 Sep 2024 02:14:21 +0800
Subject: [PATCH 031/142] feat(storage-controller): add node shards api (#8896)

For control-plane managed tenants, we have the page in the admin console
that lists all tenants on a specific pageserver. But for
storage-controller managed ones, we don't have that functionality for
now.

## Summary of changes

Adds an API that lists all shards on a given node (intention + observed)

---------

Signed-off-by: Alex Chi Z <chi@neon.tech>
---
 control_plane/storcon_cli/src/main.rs         | 45 +++++++++++++++--
 libs/pageserver_api/src/controller_api.rs     | 15 ++++++
 storage_controller/src/http.rs                | 18 +++++++
 storage_controller/src/service.rs             | 49 +++++++++++++++++--
 test_runner/fixtures/common_types.py          |  8 +++
 test_runner/fixtures/neon_fixtures.py         | 26 +++++++++-
 .../regress/test_storage_controller.py        |  6 +++
 7 files changed, 157 insertions(+), 10 deletions(-)

diff --git a/control_plane/storcon_cli/src/main.rs b/control_plane/storcon_cli/src/main.rs
index 2a81a3d825..651fcda8db 100644
--- a/control_plane/storcon_cli/src/main.rs
+++ b/control_plane/storcon_cli/src/main.rs
@@ -4,8 +4,8 @@ use std::{str::FromStr, time::Duration};
 use clap::{Parser, Subcommand};
 use pageserver_api::{
     controller_api::{
-        NodeAvailabilityWrapper, NodeDescribeResponse, ShardSchedulingPolicy, TenantCreateRequest,
-        TenantDescribeResponse, TenantPolicyRequest,
+        NodeAvailabilityWrapper, NodeDescribeResponse, NodeShardResponse, ShardSchedulingPolicy,
+        TenantCreateRequest, TenantDescribeResponse, TenantPolicyRequest,
     },
     models::{
         EvictionPolicy, EvictionPolicyLayerAccessThreshold, LocationConfigSecondary,
@@ -80,7 +80,10 @@ enum Command {
     /// List nodes known to the storage controller
     Nodes {},
     /// List tenants known to the storage controller
-    Tenants {},
+    Tenants {
+        /// If this field is set, it will list the tenants on a specific node
+        node_id: Option<NodeId>,
+    },
     /// Create a new tenant in the storage controller, and by extension on pageservers.
     TenantCreate {
         #[arg(long)]
@@ -403,7 +406,41 @@ async fn main() -> anyhow::Result<()> {
                 )
                 .await?;
         }
-        Command::Tenants {} => {
+        Command::Tenants {
+            node_id: Some(node_id),
+        } => {
+            let describe_response = storcon_client
+                .dispatch::<(), NodeShardResponse>(
+                    Method::GET,
+                    format!("control/v1/node/{node_id}/shards"),
+                    None,
+                )
+                .await?;
+            let shards = describe_response.shards;
+            let mut table = comfy_table::Table::new();
+            table.set_header([
+                "Shard",
+                "Intended Primary/Secondary",
+                "Observed Primary/Secondary",
+            ]);
+            for shard in shards {
+                table.add_row([
+                    format!("{}", shard.tenant_shard_id),
+                    match shard.is_intended_secondary {
+                        None => "".to_string(),
+                        Some(true) => "Secondary".to_string(),
+                        Some(false) => "Primary".to_string(),
+                    },
+                    match shard.is_observed_secondary {
+                        None => "".to_string(),
+                        Some(true) => "Secondary".to_string(),
+                        Some(false) => "Primary".to_string(),
+                    },
+                ]);
+            }
+            println!("{table}");
+        }
+        Command::Tenants { node_id: None } => {
             let mut resp = storcon_client
                 .dispatch::<(), Vec<TenantDescribeResponse>>(
                     Method::GET,
diff --git a/libs/pageserver_api/src/controller_api.rs b/libs/pageserver_api/src/controller_api.rs
index 5c8dcbf571..40b7dbbbc2 100644
--- a/libs/pageserver_api/src/controller_api.rs
+++ b/libs/pageserver_api/src/controller_api.rs
@@ -112,6 +112,21 @@ pub struct TenantDescribeResponse {
     pub config: TenantConfig,
 }
 
+#[derive(Serialize, Deserialize, Debug)]
+pub struct NodeShardResponse {
+    pub node_id: NodeId,
+    pub shards: Vec<NodeShard>,
+}
+
+#[derive(Serialize, Deserialize, Debug)]
+pub struct NodeShard {
+    pub tenant_shard_id: TenantShardId,
+    /// Whether the shard is observed secondary on a specific node. True = yes, False = no, None = not on this node.
+    pub is_observed_secondary: Option<bool>,
+    /// Whether the shard is intended to be a secondary on a specific node. True = yes, False = no, None = not on this node.
+    pub is_intended_secondary: Option<bool>,
+}
+
 #[derive(Serialize, Deserialize)]
 pub struct NodeDescribeResponse {
     pub id: NodeId,
diff --git a/storage_controller/src/http.rs b/storage_controller/src/http.rs
index 5d4d0460be..96bdd5039d 100644
--- a/storage_controller/src/http.rs
+++ b/storage_controller/src/http.rs
@@ -539,6 +539,17 @@ async fn handle_node_status(req: Request<Body>) -> Result<Response<Body>, ApiErr
     json_response(StatusCode::OK, node_status)
 }
 
+async fn handle_node_shards(req: Request<Body>) -> Result<Response<Body>, ApiError> {
+    check_permissions(&req, Scope::Admin)?;
+
+    let state = get_state(&req);
+    let node_id: NodeId = parse_request_param(&req, "node_id")?;
+
+    let node_status = state.service.get_node_shards(node_id).await?;
+
+    json_response(StatusCode::OK, node_status)
+}
+
 async fn handle_get_leader(req: Request<Body>) -> Result<Response<Body>, ApiError> {
     check_permissions(&req, Scope::Admin)?;
 
@@ -1109,6 +1120,13 @@ pub fn make_router(
         .get("/control/v1/node/:node_id", |r| {
             named_request_span(r, handle_node_status, RequestName("control_v1_node_status"))
         })
+        .get("/control/v1/node/:node_id/shards", |r| {
+            named_request_span(
+                r,
+                handle_node_shards,
+                RequestName("control_v1_node_describe"),
+            )
+        })
         .get("/control/v1/leader", |r| {
             named_request_span(r, handle_get_leader, RequestName("control_v1_get_leader"))
         })
diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs
index e7eae647df..44fdb474b4 100644
--- a/storage_controller/src/service.rs
+++ b/storage_controller/src/service.rs
@@ -41,11 +41,11 @@ use itertools::Itertools;
 use pageserver_api::{
     controller_api::{
         MetadataHealthRecord, MetadataHealthUpdateRequest, NodeAvailability, NodeRegisterRequest,
-        NodeSchedulingPolicy, PlacementPolicy, ShardSchedulingPolicy, ShardsPreferredAzsRequest,
-        ShardsPreferredAzsResponse, TenantCreateRequest, TenantCreateResponse,
-        TenantCreateResponseShard, TenantDescribeResponse, TenantDescribeResponseShard,
-        TenantLocateResponse, TenantPolicyRequest, TenantShardMigrateRequest,
-        TenantShardMigrateResponse,
+        NodeSchedulingPolicy, NodeShard, NodeShardResponse, PlacementPolicy, ShardSchedulingPolicy,
+        ShardsPreferredAzsRequest, ShardsPreferredAzsResponse, TenantCreateRequest,
+        TenantCreateResponse, TenantCreateResponseShard, TenantDescribeResponse,
+        TenantDescribeResponseShard, TenantLocateResponse, TenantPolicyRequest,
+        TenantShardMigrateRequest, TenantShardMigrateResponse,
     },
     models::{
         SecondaryProgress, TenantConfigRequest, TimelineArchivalConfigRequest,
@@ -4924,6 +4924,45 @@ impl Service {
             ))
     }
 
+    pub(crate) async fn get_node_shards(
+        &self,
+        node_id: NodeId,
+    ) -> Result<NodeShardResponse, ApiError> {
+        let locked = self.inner.read().unwrap();
+        let mut shards = Vec::new();
+        for (tid, tenant) in locked.tenants.iter() {
+            let is_intended_secondary = match (
+                tenant.intent.get_attached() == &Some(node_id),
+                tenant.intent.get_secondary().contains(&node_id),
+            ) {
+                (true, true) => {
+                    return Err(ApiError::InternalServerError(anyhow::anyhow!(
+                        "{} attached as primary+secondary on the same node",
+                        tid
+                    )))
+                }
+                (true, false) => Some(false),
+                (false, true) => Some(true),
+                (false, false) => None,
+            };
+            let is_observed_secondary = if let Some(ObservedStateLocation { conf: Some(conf) }) =
+                tenant.observed.locations.get(&node_id)
+            {
+                Some(conf.secondary_conf.is_some())
+            } else {
+                None
+            };
+            if is_intended_secondary.is_some() || is_observed_secondary.is_some() {
+                shards.push(NodeShard {
+                    tenant_shard_id: *tid,
+                    is_intended_secondary,
+                    is_observed_secondary,
+                });
+            }
+        }
+        Ok(NodeShardResponse { node_id, shards })
+    }
+
     pub(crate) async fn get_leader(&self) -> DatabaseResult<Option<ControllerPersistence>> {
         self.persistence.get_leader().await
     }
diff --git a/test_runner/fixtures/common_types.py b/test_runner/fixtures/common_types.py
index 8eda19d1e2..064a678c96 100644
--- a/test_runner/fixtures/common_types.py
+++ b/test_runner/fixtures/common_types.py
@@ -140,6 +140,14 @@ class TenantId(Id):
         return self.id.hex()
 
 
+class NodeId(Id):
+    def __repr__(self) -> str:
+        return f'NodeId("{self.id.hex()}")'
+
+    def __str__(self) -> str:
+        return self.id.hex()
+
+
 class TimelineId(Id):
     def __repr__(self) -> str:
         return f'TimelineId("{self.id.hex()}")'
diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index 18fbbde637..5a600dd0a1 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -62,7 +62,7 @@ from urllib3.util.retry import Retry
 
 from fixtures import overlayfs
 from fixtures.broker import NeonBroker
-from fixtures.common_types import Lsn, TenantId, TenantShardId, TimelineId
+from fixtures.common_types import Lsn, NodeId, TenantId, TenantShardId, TimelineId
 from fixtures.endpoint.http import EndpointHttpClient
 from fixtures.log_helper import log
 from fixtures.metrics import Metrics, MetricsGetter, parse_metrics
@@ -2570,6 +2570,30 @@ class NeonStorageController(MetricsGetter, LogUtils):
         response.raise_for_status()
         return response.json()
 
+    def nodes(self):
+        """
+        :return: list of {"id": ""}
+        """
+        response = self.request(
+            "GET",
+            f"{self.api}/control/v1/node",
+            headers=self.headers(TokenScope.ADMIN),
+        )
+        response.raise_for_status()
+        return response.json()
+
+    def node_shards(self, node_id: NodeId):
+        """
+        :return: {"node_id": ..., "shards": [{"tenant_shard_id": "", ...}, ...]}
+        """
+        response = self.request(
+            "GET",
+            f"{self.api}/control/v1/node/{node_id}/shards",
+            headers=self.headers(TokenScope.ADMIN),
+        )
+        response.raise_for_status()
+        return response.json()
+
     def tenant_shard_split(
         self, tenant_id: TenantId, shard_count: int, shard_stripe_size: Optional[int] = None
     ) -> list[TenantShardId]:
diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py
index 92cd74eba5..eea05d7548 100644
--- a/test_runner/regress/test_storage_controller.py
+++ b/test_runner/regress/test_storage_controller.py
@@ -1552,6 +1552,12 @@ def test_tenant_import(neon_env_builder: NeonEnvBuilder, shard_count, remote_sto
     literal_shard_count = 1 if shard_count is None else shard_count
     assert len(describe["shards"]) == literal_shard_count
 
+    nodes = env.storage_controller.nodes()
+    assert len(nodes) == 2
+    describe1 = env.storage_controller.node_shards(nodes[0]["id"])
+    describe2 = env.storage_controller.node_shards(nodes[1]["id"])
+    assert len(describe1["shards"]) + len(describe2["shards"]) == literal_shard_count
+
     # Check the data is still there: this implicitly proves that we recovered generation numbers
     # properly, for the timeline which was written to after a generation bump.
     for timeline, branch, expect_rows in [

From fa3fc73c1b3366a3316456bcb8fdce1bed159200 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Arpad=20M=C3=BCller?= <arpad-m@users.noreply.github.com>
Date: Fri, 6 Sep 2024 21:05:18 +0200
Subject: [PATCH 032/142] Address 1.82 clippy lints (#8944)

Addresses the clippy lints of the beta 1.82 toolchain.

The `too_long_first_doc_paragraph` lint complained a lot and was
addressed separately: #8941
---
 Dockerfile.build-tools            |  2 +-
 libs/utils/src/logging.rs         |  2 +-
 rust-toolchain.toml               |  4 ++--
 scripts/coverage                  |  4 ++--
 storage_controller/src/service.rs | 10 +++++-----
 workspace_hack/Cargo.toml         |  2 ++
 6 files changed, 13 insertions(+), 11 deletions(-)

diff --git a/Dockerfile.build-tools b/Dockerfile.build-tools
index a9cbed85fb..c4209c7a12 100644
--- a/Dockerfile.build-tools
+++ b/Dockerfile.build-tools
@@ -207,7 +207,7 @@ RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux
     export PATH="$HOME/.cargo/bin:$PATH" && \
     . "$HOME/.cargo/env" && \
     cargo --version && rustup --version && \
-    rustup component add llvm-tools-preview rustfmt clippy && \
+    rustup component add llvm-tools rustfmt clippy && \
     cargo install rustfilt            --version ${RUSTFILT_VERSION} && \
     cargo install cargo-hakari        --version ${CARGO_HAKARI_VERSION} && \
     cargo install cargo-deny --locked --version ${CARGO_DENY_VERSION} && \
diff --git a/libs/utils/src/logging.rs b/libs/utils/src/logging.rs
index 71af43a4da..2ea0781667 100644
--- a/libs/utils/src/logging.rs
+++ b/libs/utils/src/logging.rs
@@ -190,7 +190,7 @@ impl Drop for TracingPanicHookGuard {
 }
 
 /// Named symbol for our panic hook, which logs the panic.
-fn tracing_panic_hook(info: &std::panic::PanicInfo) {
+fn tracing_panic_hook(info: &std::panic::PanicHookInfo) {
     // following rust 1.66.1 std implementation:
     // https://github.com/rust-lang/rust/blob/90743e7298aca107ddaa0c202a4d3604e29bfeb6/library/std/src/panicking.rs#L235-L288
     let location = info.location();
diff --git a/rust-toolchain.toml b/rust-toolchain.toml
index e78c4d6790..3c5d0b12a6 100644
--- a/rust-toolchain.toml
+++ b/rust-toolchain.toml
@@ -3,5 +3,5 @@ channel = "1.81.0"
 profile = "default"
 # The default profile includes rustc, rust-std, cargo, rust-docs, rustfmt and clippy.
 # https://rust-lang.github.io/rustup/concepts/profiles.html
-# but we also need `llvm-tools-preview` for coverage data merges on CI
-components = ["llvm-tools-preview", "rustfmt", "clippy"]
+# but we also need `llvm-tools` for coverage data merges on CI
+components = ["llvm-tools", "rustfmt", "clippy"]
diff --git a/scripts/coverage b/scripts/coverage
index 52a69c93b9..482dc58ff6 100755
--- a/scripts/coverage
+++ b/scripts/coverage
@@ -134,7 +134,7 @@ class LLVM:
             # Show a user-friendly warning
             raise Exception(' '.join([
                 f"It appears that you don't have `{name}` installed.",
-                "Please execute `rustup component add llvm-tools-preview`,",
+                "Please execute `rustup component add llvm-tools`,",
                 "or install it via your package manager of choice.",
                 "LLVM tools should be the same version as LLVM in `rustc --version --verbose`.",
             ]))
@@ -518,7 +518,7 @@ def main() -> None:
     example = f"""
 prerequisites:
     # alternatively, install a system package for `llvm-tools`
-    rustup component add llvm-tools-preview
+    rustup component add llvm-tools
 
 self-contained example:
     {app} run make
diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs
index 44fdb474b4..6365423e10 100644
--- a/storage_controller/src/service.rs
+++ b/storage_controller/src/service.rs
@@ -451,7 +451,7 @@ struct ShardSplitParams {
 // When preparing for a shard split, we may either choose to proceed with the split,
 // or find that the work is already done and return NoOp.
 enum ShardSplitAction {
-    Split(ShardSplitParams),
+    Split(Box<ShardSplitParams>),
     NoOp(TenantShardSplitResponse),
 }
 
@@ -4186,7 +4186,7 @@ impl Service {
         let policy = policy.unwrap();
         let config = config.unwrap();
 
-        Ok(ShardSplitAction::Split(ShardSplitParams {
+        Ok(ShardSplitAction::Split(Box::new(ShardSplitParams {
             old_shard_count,
             new_shard_count: ShardCount::new(split_req.new_shard_count),
             new_stripe_size: split_req.new_stripe_size,
@@ -4194,13 +4194,13 @@ impl Service {
             policy,
             config,
             shard_ident,
-        }))
+        })))
     }
 
     async fn do_tenant_shard_split(
         &self,
         tenant_id: TenantId,
-        params: ShardSplitParams,
+        params: Box<ShardSplitParams>,
     ) -> Result<(TenantShardSplitResponse, Vec<ReconcilerWaiter>), ApiError> {
         // FIXME: we have dropped self.inner lock, and not yet written anything to the database: another
         // request could occur here, deleting or mutating the tenant.  begin_shard_split checks that the
@@ -4216,7 +4216,7 @@ impl Service {
             policy,
             config,
             shard_ident,
-        } = params;
+        } = *params;
 
         // Drop any secondary locations: pageservers do not support splitting these, and in any case the
         // end-state for a split tenant will usually be to have secondary locations on different nodes.
diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml
index 3d2fa8c214..94f4c0f22f 100644
--- a/workspace_hack/Cargo.toml
+++ b/workspace_hack/Cargo.toml
@@ -8,6 +8,8 @@ version = "0.1.0"
 description = "workspace-hack package, managed by hakari"
 # You can choose to publish this crate: see https://docs.rs/cargo-hakari/latest/cargo_hakari/publishing.
 publish = false
+edition.workspace = true
+license.workspace = true
 # The parts of the file between the BEGIN HAKARI SECTION and END HAKARI SECTION comments
 # are managed by hakari.
 

From 3dbd34aa78258928344d4de80ddcdcf46b35dfbc Mon Sep 17 00:00:00 2001
From: Joonas Koivunen <joonas@neon.tech>
Date: Sat, 7 Sep 2024 00:42:55 +0300
Subject: [PATCH 033/142] feat(storcon): forward gc blocking and unblocking
 (#8956)

Currently, using gc blocking and unblocking with storage-controller-managed
pageservers is painful. Implement the API on the storage controller.

Fixes: #8893
---
 pageserver/client/src/lib.rs                  | 18 ++++
 pageserver/client/src/mgmt_api.rs             | 16 ++++
 storage_controller/src/http.rs                | 39 +++++++-
 storage_controller/src/pageserver_client.rs   | 23 ++++-
 storage_controller/src/service.rs             | 54 +++++++++++-
 .../regress/test_timeline_gc_blocking.py      | 88 +++++++++++++++----
 6 files changed, 220 insertions(+), 18 deletions(-)

diff --git a/pageserver/client/src/lib.rs b/pageserver/client/src/lib.rs
index 4a3f4dea47..cc8db37173 100644
--- a/pageserver/client/src/lib.rs
+++ b/pageserver/client/src/lib.rs
@@ -1,2 +1,20 @@
 pub mod mgmt_api;
 pub mod page_service;
+
+/// For timeline_block_unblock_gc, distinguish the two different operations. This could be a bool.
+// If file structure is per-kind not per-feature then where to put this?
+#[derive(Clone, Copy)]
+pub enum BlockUnblock {
+    Block,
+    Unblock,
+}
+
+impl std::fmt::Display for BlockUnblock {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        let s = match self {
+            BlockUnblock::Block => "block",
+            BlockUnblock::Unblock => "unblock",
+        };
+        f.write_str(s)
+    }
+}
diff --git a/pageserver/client/src/mgmt_api.rs b/pageserver/client/src/mgmt_api.rs
index 737cb00835..a68f45a6d9 100644
--- a/pageserver/client/src/mgmt_api.rs
+++ b/pageserver/client/src/mgmt_api.rs
@@ -12,6 +12,8 @@ use utils::{
 
 pub use reqwest::Body as ReqwestBody;
 
+use crate::BlockUnblock;
+
 pub mod util;
 
 #[derive(Debug, Clone)]
@@ -454,6 +456,20 @@ impl Client {
             .map_err(Error::ReceiveBody)
     }
 
+    pub async fn timeline_block_unblock_gc(
+        &self,
+        tenant_shard_id: TenantShardId,
+        timeline_id: TimelineId,
+        dir: BlockUnblock,
+    ) -> Result<()> {
+        let uri = format!(
+            "{}/v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/{dir}_gc",
+            self.mgmt_api_endpoint,
+        );
+
+        self.request(Method::POST, &uri, ()).await.map(|_| ())
+    }
+
     pub async fn tenant_reset(&self, tenant_shard_id: TenantShardId) -> Result<()> {
         let uri = format!(
             "{}/v1/tenant/{}/reset",
diff --git a/storage_controller/src/http.rs b/storage_controller/src/http.rs
index 96bdd5039d..a6638f5191 100644
--- a/storage_controller/src/http.rs
+++ b/storage_controller/src/http.rs
@@ -21,7 +21,7 @@ use pageserver_api::models::{
     TenantTimeTravelRequest, TimelineArchivalConfigRequest, TimelineCreateRequest,
 };
 use pageserver_api::shard::TenantShardId;
-use pageserver_client::mgmt_api;
+use pageserver_client::{mgmt_api, BlockUnblock};
 use std::sync::Arc;
 use std::time::{Duration, Instant};
 use tokio_util::sync::CancellationToken;
@@ -369,6 +369,23 @@ async fn handle_tenant_timeline_detach_ancestor(
     json_response(StatusCode::OK, res)
 }
 
+async fn handle_tenant_timeline_block_unblock_gc(
+    service: Arc<Service>,
+    req: Request<Body>,
+    dir: BlockUnblock,
+) -> Result<Response<Body>, ApiError> {
+    let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
+    check_permissions(&req, Scope::PageServerApi)?;
+
+    let timeline_id: TimelineId = parse_request_param(&req, "timeline_id")?;
+
+    service
+        .tenant_timeline_block_unblock_gc(tenant_id, timeline_id, dir)
+        .await?;
+
+    json_response(StatusCode::OK, ())
+}
+
 async fn handle_tenant_timeline_passthrough(
     service: Arc<Service>,
     req: Request<Body>,
@@ -1292,6 +1309,26 @@ pub fn make_router(
                 )
             },
         )
+        .post(
+            "/v1/tenant/:tenant_id/timeline/:timeline_id/block_gc",
+            |r| {
+                tenant_service_handler(
+                    r,
+                    |s, r| handle_tenant_timeline_block_unblock_gc(s, r, BlockUnblock::Block),
+                    RequestName("v1_tenant_timeline_block_unblock_gc"),
+                )
+            },
+        )
+        .post(
+            "/v1/tenant/:tenant_id/timeline/:timeline_id/unblock_gc",
+            |r| {
+                tenant_service_handler(
+                    r,
+                    |s, r| handle_tenant_timeline_block_unblock_gc(s, r, BlockUnblock::Unblock),
+                    RequestName("v1_tenant_timeline_block_unblock_gc"),
+                )
+            },
+        )
         // Tenant detail GET passthrough to shard zero:
         .get("/v1/tenant/:tenant_id", |r| {
             tenant_service_handler(
diff --git a/storage_controller/src/pageserver_client.rs b/storage_controller/src/pageserver_client.rs
index 20770ed703..961a1f78dd 100644
--- a/storage_controller/src/pageserver_client.rs
+++ b/storage_controller/src/pageserver_client.rs
@@ -7,7 +7,10 @@ use pageserver_api::{
     },
     shard::TenantShardId,
 };
-use pageserver_client::mgmt_api::{Client, Result};
+use pageserver_client::{
+    mgmt_api::{Client, Result},
+    BlockUnblock,
+};
 use reqwest::StatusCode;
 use utils::id::{NodeId, TenantId, TimelineId};
 
@@ -258,6 +261,24 @@ impl PageserverClient {
         )
     }
 
+    pub(crate) async fn timeline_block_unblock_gc(
+        &self,
+        tenant_shard_id: TenantShardId,
+        timeline_id: TimelineId,
+        dir: BlockUnblock,
+    ) -> Result<()> {
+        // measuring these makes no sense because we synchronize with the gc loop and remote
+        // storage on block_gc so there should be huge outliers
+        measured_request!(
+            "timeline_block_unblock_gc",
+            crate::metrics::Method::Post,
+            &self.node_id_label,
+            self.inner
+                .timeline_block_unblock_gc(tenant_shard_id, timeline_id, dir)
+                .await
+        )
+    }
+
     pub(crate) async fn get_utilization(&self) -> Result<PageserverUtilization> {
         measured_request!(
             "utilization",
diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs
index 6365423e10..be3efaf688 100644
--- a/storage_controller/src/service.rs
+++ b/storage_controller/src/service.rs
@@ -69,7 +69,7 @@ use pageserver_api::{
         ValidateResponse, ValidateResponseTenant,
     },
 };
-use pageserver_client::mgmt_api;
+use pageserver_client::{mgmt_api, BlockUnblock};
 use tokio::sync::mpsc::error::TrySendError;
 use tokio_util::sync::CancellationToken;
 use utils::{
@@ -142,6 +142,7 @@ enum TenantOperations {
     AttachHook,
     TimelineArchivalConfig,
     TimelineDetachAncestor,
+    TimelineGcBlockUnblock,
 }
 
 #[derive(Clone, strum_macros::Display)]
@@ -3197,6 +3198,57 @@ impl Service {
         }).await?
     }
 
+    pub(crate) async fn tenant_timeline_block_unblock_gc(
+        &self,
+        tenant_id: TenantId,
+        timeline_id: TimelineId,
+        dir: BlockUnblock,
+    ) -> Result<(), ApiError> {
+        let _tenant_lock = trace_shared_lock(
+            &self.tenant_op_locks,
+            tenant_id,
+            TenantOperations::TimelineGcBlockUnblock,
+        )
+        .await;
+
+        self.tenant_remote_mutation(tenant_id, move |targets| async move {
+            if targets.is_empty() {
+                return Err(ApiError::NotFound(
+                    anyhow::anyhow!("Tenant not found").into(),
+                ));
+            }
+
+            async fn do_one(
+                tenant_shard_id: TenantShardId,
+                timeline_id: TimelineId,
+                node: Node,
+                jwt: Option<String>,
+                dir: BlockUnblock,
+            ) -> Result<(), ApiError> {
+                let client = PageserverClient::new(node.get_id(), node.base_url(), jwt.as_deref());
+
+                client
+                    .timeline_block_unblock_gc(tenant_shard_id, timeline_id, dir)
+                    .await
+                    .map_err(|e| passthrough_api_error(&node, e))
+            }
+
+            // no shard needs to go first/last; the operation should be idempotent
+            self.tenant_for_shards(targets, |tenant_shard_id, node| {
+                futures::FutureExt::boxed(do_one(
+                    tenant_shard_id,
+                    timeline_id,
+                    node,
+                    self.config.jwt_token.clone(),
+                    dir,
+                ))
+            })
+            .await
+        })
+        .await??;
+        Ok(())
+    }
+
     /// Helper for concurrently calling a pageserver API on a number of shards, such as timeline creation.
     ///
     /// On success, the returned vector contains exactly the same number of elements as the input `locations`.
diff --git a/test_runner/regress/test_timeline_gc_blocking.py b/test_runner/regress/test_timeline_gc_blocking.py
index 24de894687..ddfe9b911f 100644
--- a/test_runner/regress/test_timeline_gc_blocking.py
+++ b/test_runner/regress/test_timeline_gc_blocking.py
@@ -1,17 +1,32 @@
 import time
+from concurrent.futures import ThreadPoolExecutor
+from dataclasses import dataclass
+from typing import List, Optional
 
+import pytest
+from fixtures.log_helper import log
 from fixtures.neon_fixtures import (
+    LogCursor,
     NeonEnvBuilder,
+    NeonPageserver,
 )
 from fixtures.pageserver.utils import wait_timeline_detail_404
 
 
-def test_gc_blocking_by_timeline(neon_env_builder: NeonEnvBuilder):
+@pytest.mark.parametrize("sharded", [True, False])
+def test_gc_blocking_by_timeline(neon_env_builder: NeonEnvBuilder, sharded: bool):
+    neon_env_builder.num_pageservers = 2 if sharded else 1
     env = neon_env_builder.init_start(
-        initial_tenant_conf={"gc_period": "1s", "lsn_lease_length": "0s"}
+        initial_tenant_conf={"gc_period": "1s", "lsn_lease_length": "0s"},
+        initial_tenant_shard_count=2 if sharded else None,
     )
-    ps = env.pageserver
-    http = ps.http_client()
+
+    if sharded:
+        http = env.storage_controller.pageserver_api()
+    else:
+        http = env.pageserver.http_client()
+
+    pss = ManyPageservers(list(map(lambda ps: ScrollableLog(ps, None), env.pageservers)))
 
     foo_branch = env.neon_cli.create_branch("foo", "main", env.initial_tenant)
 
@@ -22,9 +37,8 @@ def test_gc_blocking_by_timeline(neon_env_builder: NeonEnvBuilder):
     tenant_before = http.tenant_status(env.initial_tenant)
 
     wait_for_another_gc_round()
-    _, offset = ps.assert_log_contains(gc_active_line)
-
-    assert ps.log_contains(gc_skipped_line, offset) is None
+    pss.assert_log_contains(gc_active_line)
+    pss.assert_log_does_not_contain(gc_skipped_line)
 
     http.timeline_block_gc(env.initial_tenant, foo_branch)
 
@@ -34,34 +48,78 @@ def test_gc_blocking_by_timeline(neon_env_builder: NeonEnvBuilder):
     assert gc_blocking == "BlockingReasons { timelines: 1, reasons: EnumSet(Manual) }"
 
     wait_for_another_gc_round()
-    _, offset = ps.assert_log_contains(gc_skipped_line, offset)
+    pss.assert_log_contains(gc_skipped_line)
 
-    ps.restart()
-    ps.quiesce_tenants()
+    pss.restart()
+    pss.quiesce_tenants()
 
-    _, offset = env.pageserver.assert_log_contains(init_gc_skipped, offset)
+    pss.assert_log_contains(init_gc_skipped)
 
     wait_for_another_gc_round()
-    _, offset = ps.assert_log_contains(gc_skipped_line, offset)
+    pss.assert_log_contains(gc_skipped_line)
 
     # deletion unblocks gc
     http.timeline_delete(env.initial_tenant, foo_branch)
     wait_timeline_detail_404(http, env.initial_tenant, foo_branch, 10, 1.0)
 
     wait_for_another_gc_round()
-    _, offset = ps.assert_log_contains(gc_active_line, offset)
+    pss.assert_log_contains(gc_active_line)
 
     http.timeline_block_gc(env.initial_tenant, env.initial_timeline)
 
     wait_for_another_gc_round()
-    _, offset = ps.assert_log_contains(gc_skipped_line, offset)
+    pss.assert_log_contains(gc_skipped_line)
 
     # removing the manual block also unblocks gc
     http.timeline_unblock_gc(env.initial_tenant, env.initial_timeline)
 
     wait_for_another_gc_round()
-    _, offset = ps.assert_log_contains(gc_active_line, offset)
+    pss.assert_log_contains(gc_active_line)
 
 
 def wait_for_another_gc_round():
     time.sleep(2)
+
+
+@dataclass
+class ScrollableLog:
+    pageserver: NeonPageserver
+    offset: Optional[LogCursor]
+
+    def assert_log_contains(self, what: str):
+        msg, offset = self.pageserver.assert_log_contains(what, offset=self.offset)
+        old = self.offset
+        self.offset = offset
+        log.info(f"{old} -> {offset}: {msg}")
+
+    def assert_log_does_not_contain(self, what: str):
+        assert self.pageserver.log_contains(what) is None
+
+
+@dataclass(frozen=True)
+class ManyPageservers:
+    many: List[ScrollableLog]
+
+    def assert_log_contains(self, what: str):
+        for one in self.many:
+            one.assert_log_contains(what)
+
+    def assert_log_does_not_contain(self, what: str):
+        for one in self.many:
+            one.assert_log_does_not_contain(what)
+
+    def restart(self):
+        def do_restart(x: ScrollableLog):
+            x.pageserver.restart()
+
+        with ThreadPoolExecutor(max_workers=len(self.many)) as rt:
+            rt.map(do_restart, self.many)
+            rt.shutdown(wait=True)
+
+    def quiesce_tenants(self):
+        def do_quiesce(x: ScrollableLog):
+            x.pageserver.quiesce_tenants()
+
+        with ThreadPoolExecutor(max_workers=len(self.many)) as rt:
+            rt.map(do_quiesce, self.many)
+            rt.shutdown(wait=True)

From 16c200d6d9f0eaade2efe4ad0f649c3bbf23bf08 Mon Sep 17 00:00:00 2001
From: Cihan Demirci <128653800+fcdm@users.noreply.github.com>
Date: Sat, 7 Sep 2024 00:20:36 +0100
Subject: [PATCH 034/142] push images to prod ACR (#8940)

Used `vars` for storing the new non-sensitive information; changed the dev
secrets to vars as well, but didn't clean up any of the old secrets.

https://github.com/neondatabase/cloud/issues/16925

---------

Co-authored-by: Alexander Bayandin <alexander@neon.tech>
---
 .github/actionlint.yml               |  7 ++++
 .github/workflows/_push-to-acr.yml   | 56 ++++++++++++++++++++++++++++
 .github/workflows/build_and_test.yml | 53 +++++++++++++-------------
 3 files changed, 89 insertions(+), 27 deletions(-)
 create mode 100644 .github/workflows/_push-to-acr.yml

diff --git a/.github/actionlint.yml b/.github/actionlint.yml
index 4ad8a7b460..1b602883c5 100644
--- a/.github/actionlint.yml
+++ b/.github/actionlint.yml
@@ -7,6 +7,13 @@ self-hosted-runner:
     - small-arm64
     - us-east-2
 config-variables:
+  - AZURE_DEV_CLIENT_ID
+  - AZURE_DEV_REGISTRY_NAME
+  - AZURE_DEV_SUBSCRIPTION_ID
+  - AZURE_PROD_CLIENT_ID
+  - AZURE_PROD_REGISTRY_NAME
+  - AZURE_PROD_SUBSCRIPTION_ID
+  - AZURE_TENANT_ID
   - BENCHMARK_PROJECT_ID_PUB
   - BENCHMARK_PROJECT_ID_SUB
   - REMOTE_STORAGE_AZURE_CONTAINER
diff --git a/.github/workflows/_push-to-acr.yml b/.github/workflows/_push-to-acr.yml
new file mode 100644
index 0000000000..415b3d9cc6
--- /dev/null
+++ b/.github/workflows/_push-to-acr.yml
@@ -0,0 +1,56 @@
+name: Push images to ACR
+on:
+  workflow_call:
+    inputs:
+      client_id:
+        description: Client ID of Azure managed identity or Entra app
+        required: true
+        type: string
+      image_tag:
+        description: Tag for the container image
+        required: true
+        type: string
+      images:
+        description: Images to push
+        required: true
+        type: string
+      registry_name:
+        description: Name of the container registry
+        required: true
+        type: string
+      subscription_id:
+        description: Azure subscription ID
+        required: true
+        type: string
+      tenant_id:
+        description: Azure tenant ID
+        required: true
+        type: string
+
+jobs:
+  push-to-acr:
+    runs-on: ubuntu-22.04
+    permissions:
+      contents: read  # This is required for actions/checkout
+      id-token: write # This is required for Azure Login to work.
+
+    steps:
+      - name: Azure login
+        uses: azure/login@6c251865b4e6290e7b78be643ea2d005bc51f69a  # @v2.1.1
+        with:
+          client-id: ${{ inputs.client_id }}
+          subscription-id: ${{ inputs.subscription_id }}
+          tenant-id: ${{ inputs.tenant_id }}
+
+      - name: Login to ACR
+        run: |
+          az acr login --name=${{ inputs.registry_name }}
+
+      - name: Copy docker images to ACR ${{ inputs.registry_name }}
+        run: |
+          images='${{ inputs.images }}'
+          for image in ${images}; do
+            docker buildx imagetools create \
+              -t ${{ inputs.registry_name }}.azurecr.io/neondatabase/${image}:${{ inputs.image_tag }} \
+                                        neondatabase/${image}:${{ inputs.image_tag }}
+          done
diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml
index ee5fd1b0c6..4bb9e5cb66 100644
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -794,9 +794,6 @@ jobs:
           docker compose -f ./docker-compose/docker-compose.yml down
 
   promote-images:
-    permissions:
-      contents: read  # This is required for actions/checkout
-      id-token: write # This is required for Azure Login to work.
     needs: [ check-permissions, tag, test-images, vm-compute-node-image ]
     runs-on: ubuntu-22.04
 
@@ -823,28 +820,6 @@ jobs:
                                                neondatabase/vm-compute-node-${version}:${{ needs.tag.outputs.build-tag }}
           done
 
-      - name: Azure login
-        if: github.ref_name == 'main'
-        uses: azure/login@6c251865b4e6290e7b78be643ea2d005bc51f69a  # @v2.1.1
-        with:
-          client-id: ${{ secrets.AZURE_DEV_CLIENT_ID }}
-          tenant-id: ${{ secrets.AZURE_TENANT_ID }}
-          subscription-id: ${{ secrets.AZURE_DEV_SUBSCRIPTION_ID }}
-
-      - name: Login to ACR
-        if: github.ref_name == 'main'
-        run: |
-          az acr login --name=neoneastus2
-
-      - name: Copy docker images to ACR-dev
-        if: github.ref_name == 'main'
-        run: |
-          for image in neon compute-tools {vm-,}compute-node-{v14,v15,v16}; do
-            docker buildx imagetools create \
-              -t neoneastus2.azurecr.io/neondatabase/${image}:${{ needs.tag.outputs.build-tag }} \
-                                        neondatabase/${image}:${{ needs.tag.outputs.build-tag }}
-          done
-
       - name: Add latest tag to images
         if: github.ref_name == 'main'
         run: |
@@ -882,6 +857,30 @@ jobs:
                                                369495373322.dkr.ecr.eu-central-1.amazonaws.com/${image}:${{ needs.tag.outputs.build-tag }}
           done
 
+  push-to-acr-dev:
+    if: github.ref_name == 'main'
+    needs: [ tag, promote-images ]
+    uses: ./.github/workflows/_push-to-acr.yml
+    with:
+      client_id: ${{ vars.AZURE_DEV_CLIENT_ID }}
+      image_tag: ${{ needs.tag.outputs.build-tag }}
+      images: neon compute-tools vm-compute-node-v14 vm-compute-node-v15 vm-compute-node-v16 compute-node-v14 compute-node-v15 compute-node-v16
+      registry_name: ${{ vars.AZURE_DEV_REGISTRY_NAME }}
+      subscription_id: ${{ vars.AZURE_DEV_SUBSCRIPTION_ID }}
+      tenant_id: ${{ vars.AZURE_TENANT_ID }}
+
+  push-to-acr-prod:
+    if: github.ref_name == 'release'|| github.ref_name == 'release-proxy'
+    needs: [ tag, promote-images ]
+    uses: ./.github/workflows/_push-to-acr.yml
+    with:
+      client_id: ${{ vars.AZURE_PROD_CLIENT_ID }}
+      image_tag: ${{ needs.tag.outputs.build-tag }}
+      images: neon compute-tools vm-compute-node-v14 vm-compute-node-v15 vm-compute-node-v16 compute-node-v14 compute-node-v15 compute-node-v16
+      registry_name: ${{ vars.AZURE_PROD_REGISTRY_NAME }}
+      subscription_id: ${{ vars.AZURE_PROD_SUBSCRIPTION_ID }}
+      tenant_id: ${{ vars.AZURE_TENANT_ID }}
+
   trigger-custom-extensions-build-and-wait:
     needs: [ check-permissions, tag ]
     runs-on: ubuntu-22.04
@@ -957,8 +956,8 @@ jobs:
           exit 1
 
   deploy:
-    needs: [ check-permissions, promote-images, tag, build-and-test-locally, trigger-custom-extensions-build-and-wait ]
-    if: github.ref_name == 'main' || github.ref_name == 'release'|| github.ref_name == 'release-proxy'
+    needs: [ check-permissions, promote-images, tag, build-and-test-locally, trigger-custom-extensions-build-and-wait, push-to-acr-dev, push-to-acr-prod ]
+    if: (github.ref_name == 'main' || github.ref_name == 'release' || github.ref_name == 'release-proxy') && !failure() && !cancelled()
 
     runs-on: [ self-hosted, small ]
     container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:latest

From 7d7d1f354b127ae27ae2e76cb0b3e9a3c8f69d90 Mon Sep 17 00:00:00 2001
From: Alexander Bayandin <alexander@neon.tech>
Date: Sat, 7 Sep 2024 08:17:25 +0100
Subject: [PATCH 035/142] Fix rust warnings on macOS (#8955)

## Problem
```
error: unused import: `anyhow::Context`
 --> libs/utils/src/crashsafe.rs:8:5
  |
8 | use anyhow::Context;
  |     ^^^^^^^^^^^^^^^
  |
  = note: `-D unused-imports` implied by `-D warnings`
  = help: to override `-D warnings` add `#[allow(unused_imports)]`

error: unused variable: `fd`
   --> libs/utils/src/crashsafe.rs:209:15
    |
209 | pub fn syncfs(fd: impl AsRawFd) -> anyhow::Result<()> {
    |               ^^ help: if this is intentional, prefix it with an underscore: `_fd`
    |
    = note: `-D unused-variables` implied by `-D warnings`
    = help: to override `-D warnings` add `#[allow(unused_variables)]`
```

## Summary of changes
- Fix rust warnings on macOS
---
 libs/utils/src/crashsafe.rs | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/libs/utils/src/crashsafe.rs b/libs/utils/src/crashsafe.rs
index 946fedf6e5..b97c6c7a45 100644
--- a/libs/utils/src/crashsafe.rs
+++ b/libs/utils/src/crashsafe.rs
@@ -5,7 +5,6 @@ use std::{
     io::{self, Write},
 };
 
-use anyhow::Context;
 use camino::{Utf8Path, Utf8PathBuf};
 
 /// Similar to [`std::fs::create_dir`], except we fsync the
@@ -206,11 +205,13 @@ pub fn overwrite(
 }
 
 /// Syncs the filesystem for the given file descriptor.
+#[cfg_attr(target_os = "macos", allow(unused_variables))]
 pub fn syncfs(fd: impl AsRawFd) -> anyhow::Result<()> {
     // Linux guarantees durability for syncfs.
     // POSIX doesn't have syncfs, and further does not actually guarantee durability of sync().
     #[cfg(target_os = "linux")]
     {
+        use anyhow::Context;
         nix::unistd::syncfs(fd.as_raw_fd()).context("syncfs")?;
     }
     #[cfg(target_os = "macos")]

From 93ec7503e08f126e180579f02dcdb6e7a95724ba Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki@neon.tech>
Date: Sat, 7 Sep 2024 16:11:36 +0300
Subject: [PATCH 036/142] Lock the correct revision of rust-postgres crates
 (#8960)

We modified the crate in an incompatible way and upgraded to the new
version in PR #8076. However, it was reverted in #8654. The revert
reverted the Cargo.lock reference to it, but since Cargo.toml still
points to the (tip of the) 'neon' branch, every time you make any other
unrelated changes to Cargo.toml, it also tries to update the
rust-postgres crates to the tip of the 'neon' branch again, which
doesn't work.

To fix, lock the crates to the exact commit SHA that works.
---
 Cargo.lock |  8 ++++----
 Cargo.toml | 21 ++++++++++++++++-----
 2 files changed, 20 insertions(+), 9 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index 634af67198..cf3031c6d0 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -4121,7 +4121,7 @@ dependencies = [
 [[package]]
 name = "postgres"
 version = "0.19.4"
-source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#20031d7a9ee1addeae6e0968e3899ae6bf01cee2"
+source = "git+https://github.com/neondatabase/rust-postgres.git?rev=20031d7a9ee1addeae6e0968e3899ae6bf01cee2#20031d7a9ee1addeae6e0968e3899ae6bf01cee2"
 dependencies = [
  "bytes",
  "fallible-iterator",
@@ -4134,7 +4134,7 @@ dependencies = [
 [[package]]
 name = "postgres-protocol"
 version = "0.6.4"
-source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#20031d7a9ee1addeae6e0968e3899ae6bf01cee2"
+source = "git+https://github.com/neondatabase/rust-postgres.git?rev=20031d7a9ee1addeae6e0968e3899ae6bf01cee2#20031d7a9ee1addeae6e0968e3899ae6bf01cee2"
 dependencies = [
  "base64 0.20.0",
  "byteorder",
@@ -4153,7 +4153,7 @@ dependencies = [
 [[package]]
 name = "postgres-types"
 version = "0.2.4"
-source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#20031d7a9ee1addeae6e0968e3899ae6bf01cee2"
+source = "git+https://github.com/neondatabase/rust-postgres.git?rev=20031d7a9ee1addeae6e0968e3899ae6bf01cee2#20031d7a9ee1addeae6e0968e3899ae6bf01cee2"
 dependencies = [
  "bytes",
  "fallible-iterator",
@@ -6409,7 +6409,7 @@ dependencies = [
 [[package]]
 name = "tokio-postgres"
 version = "0.7.7"
-source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#20031d7a9ee1addeae6e0968e3899ae6bf01cee2"
+source = "git+https://github.com/neondatabase/rust-postgres.git?rev=20031d7a9ee1addeae6e0968e3899ae6bf01cee2#20031d7a9ee1addeae6e0968e3899ae6bf01cee2"
 dependencies = [
  "async-trait",
  "byteorder",
diff --git a/Cargo.toml b/Cargo.toml
index 5045ee0d4d..9203920971 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -201,10 +201,21 @@ env_logger = "0.10"
 log = "0.4"
 
 ## Libraries from neondatabase/ git forks, ideally with changes to be upstreamed
-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" }
-postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" }
-postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" }
-tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" }
+
+# We want to use the 'neon' branch for these, but there's currently one
+# incompatible change on the branch. See:
+#
+# - PR #8076 which contained changes that depended on the new changes in
+#   the rust-postgres crate, and
+# - PR #8654 which reverted those changes and made the code in proxy incompatible
+#   with the tip of the 'neon' branch again.
+#
+# When those proxy changes are re-applied (see PR #8747), we can switch using
+# the tip of the 'neon' branch again.
+postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev = "20031d7a9ee1addeae6e0968e3899ae6bf01cee2" }
+postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev = "20031d7a9ee1addeae6e0968e3899ae6bf01cee2" }
+postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", rev = "20031d7a9ee1addeae6e0968e3899ae6bf01cee2" }
+tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev = "20031d7a9ee1addeae6e0968e3899ae6bf01cee2" }
 
 ## Local libraries
 compute_api = { version = "0.1", path = "./libs/compute_api/" }
@@ -241,7 +252,7 @@ tonic-build = "0.9"
 [patch.crates-io]
 
 # Needed to get `tokio-postgres-rustls` to depend on our fork.
-tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" }
+tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev = "20031d7a9ee1addeae6e0968e3899ae6bf01cee2" }
 
 # bug fixes for UUID
 parquet = { git = "https://github.com/apache/arrow-rs", branch = "master" }

From 89c5e80b3ff55f0f316aebca0bba497eba7fbec8 Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki@neon.tech>
Date: Sun, 8 Sep 2024 21:47:23 +0300
Subject: [PATCH 037/142] Update toml and toml_edit crates (#8963)

Eliminates a few duplicate versions from the dependency tree.
---
 Cargo.lock                        | 57 +++++++------------------------
 Cargo.toml                        |  4 +--
 control_plane/src/pageserver.rs   | 10 +++---
 libs/remote_storage/src/config.rs |  2 +-
 libs/utils/src/toml_edit_ext.rs   |  2 +-
 pageserver/ctl/src/main.rs        |  2 +-
 pageserver/src/tenant/config.rs   |  3 +-
 workspace_hack/Cargo.toml         |  2 ++
 8 files changed, 26 insertions(+), 56 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index cf3031c6d0..30c9f7e080 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1246,7 +1246,7 @@ dependencies = [
  "tokio-postgres",
  "tokio-stream",
  "tokio-util",
- "toml_edit 0.19.10",
+ "toml_edit",
  "tracing",
  "tracing-opentelemetry",
  "tracing-subscriber",
@@ -1360,8 +1360,8 @@ dependencies = [
  "tokio",
  "tokio-postgres",
  "tokio-util",
- "toml 0.7.4",
- "toml_edit 0.19.10",
+ "toml",
+ "toml_edit",
  "tracing",
  "url",
  "utils",
@@ -3144,7 +3144,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "fd01039851e82f8799046eabbb354056283fb265c8ec0996af940f4e85a380ff"
 dependencies = [
  "serde",
- "toml 0.8.14",
+ "toml",
 ]
 
 [[package]]
@@ -3660,7 +3660,7 @@ dependencies = [
  "thiserror",
  "tokio",
  "tokio-util",
- "toml_edit 0.19.10",
+ "toml_edit",
  "utils",
  "workspace_hack",
 ]
@@ -3747,7 +3747,7 @@ dependencies = [
  "tokio-stream",
  "tokio-tar",
  "tokio-util",
- "toml_edit 0.19.10",
+ "toml_edit",
  "tracing",
  "twox-hash",
  "url",
@@ -4812,7 +4812,7 @@ dependencies = [
  "tokio",
  "tokio-stream",
  "tokio-util",
- "toml_edit 0.19.10",
+ "toml_edit",
  "tracing",
  "utils",
 ]
@@ -5322,7 +5322,7 @@ dependencies = [
  "tokio-stream",
  "tokio-tar",
  "tokio-util",
- "toml_edit 0.19.10",
+ "toml_edit",
  "tracing",
  "tracing-subscriber",
  "url",
@@ -6520,18 +6520,6 @@ dependencies = [
  "tracing",
 ]
 
-[[package]]
-name = "toml"
-version = "0.7.4"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d6135d499e69981f9ff0ef2167955a5333c35e36f6937d382974566b3d5b94ec"
-dependencies = [
- "serde",
- "serde_spanned",
- "toml_datetime",
- "toml_edit 0.19.10",
-]
-
 [[package]]
 name = "toml"
 version = "0.8.14"
@@ -6541,7 +6529,7 @@ dependencies = [
  "serde",
  "serde_spanned",
  "toml_datetime",
- "toml_edit 0.22.14",
+ "toml_edit",
 ]
 
 [[package]]
@@ -6553,19 +6541,6 @@ dependencies = [
  "serde",
 ]
 
-[[package]]
-name = "toml_edit"
-version = "0.19.10"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "2380d56e8670370eee6566b0bfd4265f65b3f432e8c6d85623f728d4fa31f739"
-dependencies = [
- "indexmap 1.9.3",
- "serde",
- "serde_spanned",
- "toml_datetime",
- "winnow 0.4.6",
-]
-
 [[package]]
 name = "toml_edit"
 version = "0.22.14"
@@ -6576,7 +6551,7 @@ dependencies = [
  "serde",
  "serde_spanned",
  "toml_datetime",
- "winnow 0.6.13",
+ "winnow",
 ]
 
 [[package]]
@@ -6989,7 +6964,7 @@ dependencies = [
  "tokio-stream",
  "tokio-tar",
  "tokio-util",
- "toml_edit 0.19.10",
+ "toml_edit",
  "tracing",
  "tracing-error",
  "tracing-subscriber",
@@ -7535,15 +7510,6 @@ version = "0.52.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "32b752e52a2da0ddfbdbcc6fceadfeede4c939ed16d13e648833a61dfb611ed8"
 
-[[package]]
-name = "winnow"
-version = "0.4.6"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "61de7bac303dc551fe038e2b3cef0f571087a47571ea6e79a87692ac99b99699"
-dependencies = [
- "memchr",
-]
-
 [[package]]
 name = "winnow"
 version = "0.6.13"
@@ -7651,6 +7617,7 @@ dependencies = [
  "tokio",
  "tokio-rustls 0.24.0",
  "tokio-util",
+ "toml_edit",
  "tonic",
  "tower",
  "tracing",
diff --git a/Cargo.toml b/Cargo.toml
index 9203920971..107cd6cd44 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -177,8 +177,8 @@ tokio-rustls = "0.25"
 tokio-stream = "0.1"
 tokio-tar = "0.3"
 tokio-util = { version = "0.7.10", features = ["io", "rt"] }
-toml = "0.7"
-toml_edit = "0.19"
+toml = "0.8"
+toml_edit = "0.22"
 tonic = {version = "0.9", features = ["tls", "tls-roots"]}
 tower-service = "0.3.2"
 tracing = "0.1"
diff --git a/control_plane/src/pageserver.rs b/control_plane/src/pageserver.rs
index 31777eb7a5..33ca70af96 100644
--- a/control_plane/src/pageserver.rs
+++ b/control_plane/src/pageserver.rs
@@ -75,14 +75,14 @@ impl PageServerNode {
         }
     }
 
-    fn pageserver_make_identity_toml(&self, node_id: NodeId) -> toml_edit::Document {
-        toml_edit::Document::from_str(&format!("id={node_id}")).unwrap()
+    fn pageserver_make_identity_toml(&self, node_id: NodeId) -> toml_edit::DocumentMut {
+        toml_edit::DocumentMut::from_str(&format!("id={node_id}")).unwrap()
     }
 
     fn pageserver_init_make_toml(
         &self,
         conf: NeonLocalInitPageserverConf,
-    ) -> anyhow::Result<toml_edit::Document> {
+    ) -> anyhow::Result<toml_edit::DocumentMut> {
         assert_eq!(&PageServerConf::from(&conf), &self.conf, "during neon_local init, we derive the runtime state of ps conf (self.conf) from the --config flag fully");
 
         // TODO(christian): instead of what we do here, create a pageserver_api::config::ConfigToml (PR #7656)
@@ -137,9 +137,9 @@ impl PageServerNode {
 
         // Turn `overrides` into a toml document.
         // TODO: above code is legacy code, it should be refactored to use toml_edit directly.
-        let mut config_toml = toml_edit::Document::new();
+        let mut config_toml = toml_edit::DocumentMut::new();
         for fragment_str in overrides {
-            let fragment = toml_edit::Document::from_str(&fragment_str)
+            let fragment = toml_edit::DocumentMut::from_str(&fragment_str)
                 .expect("all fragments in `overrides` are valid toml documents, this function controls that");
             for (key, item) in fragment.iter() {
                 config_toml.insert(key, item.clone());
diff --git a/libs/remote_storage/src/config.rs b/libs/remote_storage/src/config.rs
index f819a1572a..d0e92411da 100644
--- a/libs/remote_storage/src/config.rs
+++ b/libs/remote_storage/src/config.rs
@@ -185,7 +185,7 @@ mod tests {
     use super::*;
 
     fn parse(input: &str) -> anyhow::Result<RemoteStorageConfig> {
-        let toml = input.parse::<toml_edit::Document>().unwrap();
+        let toml = input.parse::<toml_edit::DocumentMut>().unwrap();
         RemoteStorageConfig::from_toml(toml.as_item())
     }
 
diff --git a/libs/utils/src/toml_edit_ext.rs b/libs/utils/src/toml_edit_ext.rs
index ab5f7bdd95..1359e27b77 100644
--- a/libs/utils/src/toml_edit_ext.rs
+++ b/libs/utils/src/toml_edit_ext.rs
@@ -10,7 +10,7 @@ pub fn deserialize_item<T>(item: &toml_edit::Item) -> Result<T, Error>
 where
     T: serde::de::DeserializeOwned,
 {
-    let document: toml_edit::Document = match item {
+    let document: toml_edit::DocumentMut = match item {
         toml_edit::Item::Table(toml) => toml.clone().into(),
         toml_edit::Item::Value(toml_edit::Value::InlineTable(toml)) => {
             toml.clone().into_table().into()
diff --git a/pageserver/ctl/src/main.rs b/pageserver/ctl/src/main.rs
index 3b66b0c4aa..cf001ef0d5 100644
--- a/pageserver/ctl/src/main.rs
+++ b/pageserver/ctl/src/main.rs
@@ -174,7 +174,7 @@ async fn main() -> anyhow::Result<()> {
                 println!("specified prefix '{}' failed validation", cmd.prefix);
                 return Ok(());
             };
-            let toml_document = toml_edit::Document::from_str(&cmd.config_toml_str)?;
+            let toml_document = toml_edit::DocumentMut::from_str(&cmd.config_toml_str)?;
             let toml_item = toml_document
                 .get("remote_storage")
                 .expect("need remote_storage");
diff --git a/pageserver/src/tenant/config.rs b/pageserver/src/tenant/config.rs
index 7e0344666b..547b43a399 100644
--- a/pageserver/src/tenant/config.rs
+++ b/pageserver/src/tenant/config.rs
@@ -452,7 +452,8 @@ impl TryFrom<toml_edit::Item> for TenantConfOpt {
                     .map_err(|e| anyhow::anyhow!("{}: {}", e.path(), e.inner().message()));
             }
             toml_edit::Item::Table(table) => {
-                let deserializer = toml_edit::de::Deserializer::new(table.into());
+                let deserializer =
+                    toml_edit::de::Deserializer::from(toml_edit::DocumentMut::from(table));
                 return serde_path_to_error::deserialize(deserializer)
                     .map_err(|e| anyhow::anyhow!("{}: {}", e.path(), e.inner().message()));
             }
diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml
index 94f4c0f22f..411ca81032 100644
--- a/workspace_hack/Cargo.toml
+++ b/workspace_hack/Cargo.toml
@@ -83,6 +83,7 @@ time = { version = "0.3", features = ["macros", "serde-well-known"] }
 tokio = { version = "1", features = ["fs", "io-std", "io-util", "macros", "net", "process", "rt-multi-thread", "signal", "test-util"] }
 tokio-rustls = { version = "0.24" }
 tokio-util = { version = "0.7", features = ["codec", "compat", "io", "rt"] }
+toml_edit = { version = "0.22", features = ["serde"] }
 tonic = { version = "0.9", features = ["tls-roots"] }
 tower = { version = "0.4", default-features = false, features = ["balance", "buffer", "limit", "log", "timeout", "util"] }
 tracing = { version = "0.1", features = ["log"] }
@@ -126,6 +127,7 @@ serde = { version = "1", features = ["alloc", "derive"] }
 syn-dff4ba8e3ae991db = { package = "syn", version = "1", features = ["extra-traits", "full", "visit"] }
 syn-f595c2ba2a3f28df = { package = "syn", version = "2", features = ["extra-traits", "fold", "full", "visit", "visit-mut"] }
 time-macros = { version = "0.2", default-features = false, features = ["formatting", "parsing", "serde"] }
+toml_edit = { version = "0.22", features = ["serde"] }
 zstd = { version = "0.13" }
 zstd-safe = { version = "7", default-features = false, features = ["arrays", "legacy", "std", "zdict_builder"] }
 zstd-sys = { version = "2", default-features = false, features = ["legacy", "std", "zdict_builder"] }

From 2d885ac07ae0207ab886fd4dda84701ae33893f1 Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki@neon.tech>
Date: Sun, 8 Sep 2024 21:47:57 +0300
Subject: [PATCH 038/142] Update strum (#8962)

I wanted to use some features from the newer version. The PR that needed
the new version is not ready yet (and might never be), but it seems nice to
stay up to date in any case.
---
 Cargo.lock                        | 40 ++++++++++---------------------
 Cargo.toml                        |  6 ++---
 libs/pageserver_api/src/models.rs |  2 +-
 libs/utils/src/logging.rs         |  6 ++---
 pageserver/src/metrics.rs         |  4 ++--
 5 files changed, 21 insertions(+), 37 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index 30c9f7e080..4fb3ac7223 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1189,9 +1189,9 @@ dependencies = [
 
 [[package]]
 name = "comfy-table"
-version = "6.1.4"
+version = "7.1.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "6e7b787b0dc42e8111badfdbe4c3059158ccb2db8780352fa1b01e8ccf45cc4d"
+checksum = "b34115915337defe99b2aff5c2ce6771e5fbc4079f4b506301f5cf394c8452f7"
 dependencies = [
  "crossterm",
  "strum",
@@ -1485,25 +1485,22 @@ checksum = "248e3bacc7dc6baa3b21e405ee045c3047101a49145e7e9eca583ab4c2ca5345"
 
 [[package]]
 name = "crossterm"
-version = "0.25.0"
+version = "0.27.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e64e6c0fbe2c17357405f7c758c1ef960fce08bdfb2c03d88d2a18d7e09c4b67"
+checksum = "f476fe445d41c9e991fd07515a6f463074b782242ccf4a5b7b1d1012e70824df"
 dependencies = [
- "bitflags 1.3.2",
+ "bitflags 2.4.1",
  "crossterm_winapi",
  "libc",
- "mio",
  "parking_lot 0.12.1",
- "signal-hook",
- "signal-hook-mio",
  "winapi",
 ]
 
 [[package]]
 name = "crossterm_winapi"
-version = "0.9.0"
+version = "0.9.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "2ae1b35a484aa10e07fe0638d02301c5ad24de82d310ccbd2f3693da5f09bf1c"
+checksum = "acdd7c62a3665c7f6830a51635d9ac9b23ed385797f70a83bb8bafe9c572ab2b"
 dependencies = [
  "winapi",
 ]
@@ -5731,17 +5728,6 @@ dependencies = [
  "signal-hook-registry",
 ]
 
-[[package]]
-name = "signal-hook-mio"
-version = "0.2.3"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "29ad2e15f37ec9a6cc544097b78a1ec90001e9f71b81338ca39f430adaca99af"
-dependencies = [
- "libc",
- "mio",
- "signal-hook",
-]
-
 [[package]]
 name = "signal-hook-registry"
 version = "1.4.1"
@@ -6054,21 +6040,21 @@ checksum = "73473c0e59e6d5812c5dfe2a064a6444949f089e20eec9a2e5506596494e4623"
 
 [[package]]
 name = "strum"
-version = "0.24.1"
+version = "0.26.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "063e6045c0e62079840579a7e47a355ae92f60eb74daaf156fb1e84ba164e63f"
+checksum = "8fec0f0aef304996cf250b31b5a10dee7980c85da9d759361292b8bca5a18f06"
 
 [[package]]
 name = "strum_macros"
-version = "0.24.3"
+version = "0.26.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1e385be0d24f186b4ce2f9982191e7101bb737312ad61c1f2f984f34bcf85d59"
+checksum = "4c6bee85a5a24955dc440386795aa378cd9cf82acd5f764469152d2270e581be"
 dependencies = [
- "heck 0.4.1",
+ "heck 0.5.0",
  "proc-macro2",
  "quote",
  "rustversion",
- "syn 1.0.109",
+ "syn 2.0.52",
 ]
 
 [[package]]
diff --git a/Cargo.toml b/Cargo.toml
index 107cd6cd44..40e399619d 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -73,7 +73,7 @@ camino = "1.1.6"
 cfg-if = "1.0.0"
 chrono = { version = "0.4", default-features = false, features = ["clock"] }
 clap = { version = "4.0", features = ["derive"] }
-comfy-table = "6.1"
+comfy-table = "7.1"
 const_format = "0.2"
 crc32c = "0.6"
 crossbeam-deque = "0.8.5"
@@ -158,8 +158,8 @@ signal-hook = "0.3"
 smallvec = "1.11"
 smol_str = { version = "0.2.0", features = ["serde"] }
 socket2 = "0.5"
-strum = "0.24"
-strum_macros = "0.24"
+strum = "0.26"
+strum_macros = "0.26"
 "subtle"  = "2.5.0"
 svg_fmt = "0.4.3"
 sync_wrapper = "0.1.2"
diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs
index ffe79c8350..45e84baa1f 100644
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -62,7 +62,7 @@ use bytes::{Buf, BufMut, Bytes, BytesMut};
     serde::Serialize,
     serde::Deserialize,
     strum_macros::Display,
-    strum_macros::EnumVariantNames,
+    strum_macros::VariantNames,
     strum_macros::AsRefStr,
     strum_macros::IntoStaticStr,
 )]
diff --git a/libs/utils/src/logging.rs b/libs/utils/src/logging.rs
index 2ea0781667..e205d60d74 100644
--- a/libs/utils/src/logging.rs
+++ b/libs/utils/src/logging.rs
@@ -3,11 +3,9 @@ use std::str::FromStr;
 use anyhow::Context;
 use metrics::{IntCounter, IntCounterVec};
 use once_cell::sync::Lazy;
-use strum_macros::{EnumString, EnumVariantNames};
+use strum_macros::{EnumString, VariantNames};
 
-#[derive(
-    EnumString, strum_macros::Display, EnumVariantNames, Eq, PartialEq, Debug, Clone, Copy,
-)]
+#[derive(EnumString, strum_macros::Display, VariantNames, Eq, PartialEq, Debug, Clone, Copy)]
 #[strum(serialize_all = "snake_case")]
 pub enum LogFormat {
     Plain,
diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs
index c4011d593c..9197505876 100644
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -9,7 +9,7 @@ use metrics::{
 use once_cell::sync::Lazy;
 use pageserver_api::shard::TenantShardId;
 use strum::{EnumCount, VariantNames};
-use strum_macros::{EnumVariantNames, IntoStaticStr};
+use strum_macros::{IntoStaticStr, VariantNames};
 use tracing::warn;
 use utils::id::TimelineId;
 
@@ -27,7 +27,7 @@ const CRITICAL_OP_BUCKETS: &[f64] = &[
 ];
 
 // Metrics collected on operations on the storage repository.
-#[derive(Debug, EnumVariantNames, IntoStaticStr)]
+#[derive(Debug, VariantNames, IntoStaticStr)]
 #[strum(serialize_all = "kebab_case")]
 pub(crate) enum StorageTimeOperation {
     #[strum(serialize = "layer flush")]

From c8f67eed8f0e3ed182ebe85753389ae5b1c161ea Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki@neon.tech>
Date: Mon, 9 Sep 2024 10:34:56 +0300
Subject: [PATCH 039/142] Remove TEST_SHARED_FIXTURES (#8965)

I wish it worked, but it's been broken for a long time, so let's admit
defeat and remove it.

The idea of sharing the same pageserver and safekeeper environment
between tests is still sound, and it could save a lot of time in our
CI. We should perhaps put some time into doing that, but we're better
off starting from scratch than trying to make TEST_SHARED_FIXTURES
work in its current form.
---
 test_runner/README.md                 | 12 ++---
 test_runner/fixtures/neon_fixtures.py | 68 +++------------------------
 2 files changed, 10 insertions(+), 70 deletions(-)

diff --git a/test_runner/README.md b/test_runner/README.md
index 73aa29d4bb..647b930b26 100644
--- a/test_runner/README.md
+++ b/test_runner/README.md
@@ -18,8 +18,7 @@ Prerequisites:
 
 Regression tests are in the 'regress' directory. They can be run in
 parallel to minimize total runtime. Most regression test sets up their
-environment with its own pageservers and safekeepers (but see
-`TEST_SHARED_FIXTURES`).
+environment with its own pageservers and safekeepers.
 
 'pg_clients' contains tests for connecting with various client
 libraries. Each client test uses a Dockerfile that pulls an image that
@@ -74,7 +73,6 @@ This is used to construct full path to the postgres binaries.
 Format is 2-digit major version nubmer, i.e. `DEFAULT_PG_VERSION=16`
 `TEST_OUTPUT`: Set the directory where test state and test output files
 should go.
-`TEST_SHARED_FIXTURES`: Try to re-use a single pageserver for all the tests.
 `RUST_LOG`: logging configuration to pass into Neon CLI
 
 Useful parameters and commands:
@@ -259,11 +257,9 @@ compute Postgres nodes. The connections between them can be configured to use JW
 authentication tokens, and some other configuration options can be tweaked too.
 
 The easiest way to get access to a Neon Environment is by using the `neon_simple_env`
-fixture. The 'simple' env may be shared across multiple tests, so don't shut down the nodes
-or make other destructive changes in that environment. Also don't assume that
-there are no tenants or branches or data in the cluster. For convenience, there is a
-branch called `empty`, though. The convention is to create a test-specific branch of
-that and load any test data there, instead of the 'main' branch.
+fixture. For convenience, there is a branch called `empty` in environments created with
+'neon_simple_env'. The convention is to create a test-specific branch of that and load any
+test data there, instead of the 'main' branch.
 
 For more complicated cases, you can build a custom Neon Environment, with the `neon_env`
 fixture:
diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index 5a600dd0a1..3047dcc4f7 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -221,33 +221,6 @@ def neon_api(neon_api_key: str, neon_api_base_url: str) -> NeonAPI:
     return NeonAPI(neon_api_key, neon_api_base_url)
 
 
-def shareable_scope(fixture_name: str, config: Config) -> Literal["session", "function"]:
-    """Return either session of function scope, depending on TEST_SHARED_FIXTURES envvar.
-
-    This function can be used as a scope like this:
-    @pytest.fixture(scope=shareable_scope)
-    def myfixture(...)
-       ...
-    """
-    scope: Literal["session", "function"]
-
-    if os.environ.get("TEST_SHARED_FIXTURES") is None:
-        # Create the environment in the per-test output directory
-        scope = "function"
-    elif (
-        os.environ.get("BUILD_TYPE") is not None
-        and os.environ.get("DEFAULT_PG_VERSION") is not None
-    ):
-        scope = "session"
-    else:
-        pytest.fail(
-            "Shared environment(TEST_SHARED_FIXTURES) requires BUILD_TYPE and DEFAULT_PG_VERSION to be set",
-            pytrace=False,
-        )
-
-    return scope
-
-
 @pytest.fixture(scope="session")
 def worker_port_num():
     return (32768 - BASE_PORT) // int(os.environ.get("PYTEST_XDIST_WORKER_COUNT", "1"))
@@ -1431,8 +1404,8 @@ class NeonEnv:
         return "ep-" + str(self.endpoint_counter)
 
 
-@pytest.fixture(scope=shareable_scope)
-def _shared_simple_env(
+@pytest.fixture(scope="function")
+def neon_simple_env(
     request: FixtureRequest,
     pytestconfig: Config,
     port_distributor: PortDistributor,
@@ -1450,19 +1423,13 @@ def _shared_simple_env(
     pageserver_io_buffer_alignment: Optional[int],
 ) -> Iterator[NeonEnv]:
     """
-    # Internal fixture backing the `neon_simple_env` fixture. If TEST_SHARED_FIXTURES
-     is set, this is shared by all tests using `neon_simple_env`.
+    Simple Neon environment, with no authentication and no safekeepers.
 
     This fixture will use RemoteStorageKind.LOCAL_FS with pageserver.
     """
 
-    if os.environ.get("TEST_SHARED_FIXTURES") is None:
-        # Create the environment in the per-test output directory
-        repo_dir = get_test_repo_dir(request, top_output_dir)
-    else:
-        # We're running shared fixtures. Share a single directory.
-        repo_dir = top_output_dir / "shared_repo"
-        shutil.rmtree(repo_dir, ignore_errors=True)
+    # Create the environment in the per-test output directory
+    repo_dir = get_test_repo_dir(request, top_output_dir)
 
     with NeonEnvBuilder(
         top_output_dir=top_output_dir,
@@ -1489,22 +1456,6 @@ def _shared_simple_env(
 
         yield env
 
-
-@pytest.fixture(scope="function")
-def neon_simple_env(_shared_simple_env: NeonEnv) -> Iterator[NeonEnv]:
-    """
-    Simple Neon environment, with no authentication and no safekeepers.
-
-    If TEST_SHARED_FIXTURES environment variable is set, we reuse the same
-    environment for all tests that use 'neon_simple_env', keeping the
-    page server and safekeepers running. Any compute nodes are stopped after
-    each the test, however.
-    """
-    yield _shared_simple_env
-
-    _shared_simple_env.endpoints.stop_all()
-
-
 @pytest.fixture(scope="function")
 def neon_env_builder(
     pytestconfig: Config,
@@ -4898,14 +4849,7 @@ SMALL_DB_FILE_NAME_REGEX: re.Pattern = re.compile(  # type: ignore[type-arg]
 
 
 # This is autouse, so the test output directory always gets created, even
-# if a test doesn't put anything there. It also solves a problem with the
-# neon_simple_env fixture: if TEST_SHARED_FIXTURES is not set, it
-# creates the repo in the test output directory. But it cannot depend on
-# 'test_output_dir' fixture, because when TEST_SHARED_FIXTURES is not set,
-# it has 'session' scope and cannot access fixtures with 'function'
-# scope. So it uses the get_test_output_dir() function to get the path, and
-# this fixture ensures that the directory exists.  That works because
-# 'autouse' fixtures are run before other fixtures.
+# if a test doesn't put anything there.
 #
 # NB: we request the overlay dir fixture so the fixture does its cleanups
 @pytest.fixture(scope="function", autouse=True)

From 723c0971e818848696984fd66c562c9d0cbff948 Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki@neon.tech>
Date: Mon, 9 Sep 2024 10:35:12 +0300
Subject: [PATCH 040/142] Don't create 'empty' branch in neon_simple_env
 (#8965)

Now that we've given up hope on sharing the neon_simple_env between
tests, there's no reason to not use the 'main' branch directly.
---
 test_runner/README.md                         |  5 ++--
 test_runner/fixtures/neon_fixtures.py         |  5 +---
 .../performance/test_logical_replication.py   |  4 +--
 test_runner/regress/test_basebackup_error.py  |  3 +--
 test_runner/regress/test_clog_truncate.py     |  5 ++--
 test_runner/regress/test_compute_catalog.py   |  3 +--
 test_runner/regress/test_config.py            |  3 +--
 test_runner/regress/test_createdropdb.py      | 17 ++++---------
 test_runner/regress/test_createuser.py        |  5 ++--
 test_runner/regress/test_ddl_forwarding.py    |  3 +--
 .../regress/test_explain_with_lfc_stats.py    |  6 ++---
 test_runner/regress/test_lfc_resize.py        |  3 +--
 .../test_lfc_working_set_approximation.py     |  6 ++---
 test_runner/regress/test_local_file_cache.py  |  7 ++----
 .../regress/test_logical_replication.py       | 11 +++-----
 test_runner/regress/test_migrations.py        |  3 +--
 test_runner/regress/test_multixact.py         |  7 +++---
 test_runner/regress/test_neon_superuser.py    |  2 +-
 test_runner/regress/test_parallel_copy.py     |  3 +--
 .../regress/test_pg_query_cancellation.py     |  4 +--
 test_runner/regress/test_pg_waldump.py        |  4 +--
 test_runner/regress/test_read_validation.py   | 12 ++-------
 test_runner/regress/test_readonly_node.py     | 25 ++++++++-----------
 test_runner/regress/test_subxacts.py          |  3 +--
 test_runner/regress/test_timeline_delete.py   |  7 ++++--
 test_runner/regress/test_timeline_size.py     |  4 +--
 test_runner/regress/test_twophase.py          |  7 ++----
 test_runner/regress/test_unlogged.py          |  5 ++--
 test_runner/regress/test_vm_bits.py           |  5 ++--
 test_runner/test_broken.py                    |  3 +--
 30 files changed, 65 insertions(+), 115 deletions(-)

diff --git a/test_runner/README.md b/test_runner/README.md
index 647b930b26..d754e60d17 100644
--- a/test_runner/README.md
+++ b/test_runner/README.md
@@ -257,9 +257,8 @@ compute Postgres nodes. The connections between them can be configured to use JW
 authentication tokens, and some other configuration options can be tweaked too.
 
 The easiest way to get access to a Neon Environment is by using the `neon_simple_env`
-fixture. For convenience, there is a branch called `empty` in environments created with
-'neon_simple_env'. The convention is to create a test-specific branch of that and load any
-test data there, instead of the 'main' branch.
+fixture. For convenience, there is a branch called `main` in environments created with
+'neon_simple_env', ready to be used in the test.
 
 For more complicated cases, you can build a custom Neon Environment, with the `neon_env`
 fixture:
diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index 3047dcc4f7..60887b9aed 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -57,7 +57,6 @@ from _pytest.fixtures import FixtureRequest
 from psycopg2.extensions import connection as PgConnection
 from psycopg2.extensions import cursor as PgCursor
 from psycopg2.extensions import make_dsn, parse_dsn
-from typing_extensions import Literal
 from urllib3.util.retry import Retry
 
 from fixtures import overlayfs
@@ -1451,11 +1450,9 @@ def neon_simple_env(
     ) as builder:
         env = builder.init_start()
 
-        # For convenience in tests, create a branch from the freshly-initialized cluster.
-        env.neon_cli.create_branch("empty", ancestor_branch_name=DEFAULT_BRANCH_NAME)
-
         yield env
 
+
 @pytest.fixture(scope="function")
 def neon_env_builder(
     pytestconfig: Config,
diff --git a/test_runner/performance/test_logical_replication.py b/test_runner/performance/test_logical_replication.py
index 077f73ac06..29a0380524 100644
--- a/test_runner/performance/test_logical_replication.py
+++ b/test_runner/performance/test_logical_replication.py
@@ -22,10 +22,8 @@ if TYPE_CHECKING:
 def test_logical_replication(neon_simple_env: NeonEnv, pg_bin: PgBin, vanilla_pg):
     env = neon_simple_env
 
-    env.neon_cli.create_branch("test_logical_replication", "empty")
-    endpoint = env.endpoints.create_start("test_logical_replication")
+    endpoint = env.endpoints.create_start("main")
 
-    log.info("postgres is running on 'test_logical_replication' branch")
     pg_bin.run_capture(["pgbench", "-i", "-s10", endpoint.connstr()])
 
     endpoint.safe_psql("create publication pub1 for table pgbench_accounts, pgbench_history")
diff --git a/test_runner/regress/test_basebackup_error.py b/test_runner/regress/test_basebackup_error.py
index 170b494884..13c080ea0e 100644
--- a/test_runner/regress/test_basebackup_error.py
+++ b/test_runner/regress/test_basebackup_error.py
@@ -8,11 +8,10 @@ from fixtures.neon_fixtures import NeonEnv
 #
 def test_basebackup_error(neon_simple_env: NeonEnv):
     env = neon_simple_env
-    env.neon_cli.create_branch("test_basebackup_error", "empty")
     pageserver_http = env.pageserver.http_client()
 
     # Introduce failpoint
     pageserver_http.configure_failpoints(("basebackup-before-control-file", "return"))
 
     with pytest.raises(Exception, match="basebackup-before-control-file"):
-        env.endpoints.create_start("test_basebackup_error")
+        env.endpoints.create_start("main")
diff --git a/test_runner/regress/test_clog_truncate.py b/test_runner/regress/test_clog_truncate.py
index 26e6e336b9..6e4880841a 100644
--- a/test_runner/regress/test_clog_truncate.py
+++ b/test_runner/regress/test_clog_truncate.py
@@ -11,7 +11,6 @@ from fixtures.utils import query_scalar
 #
 def test_clog_truncate(neon_simple_env: NeonEnv):
     env = neon_simple_env
-    env.neon_cli.create_branch("test_clog_truncate", "empty")
 
     # set aggressive autovacuum to make sure that truncation will happen
     config = [
@@ -24,7 +23,7 @@ def test_clog_truncate(neon_simple_env: NeonEnv):
         "autovacuum_freeze_max_age=100000",
     ]
 
-    endpoint = env.endpoints.create_start("test_clog_truncate", config_lines=config)
+    endpoint = env.endpoints.create_start("main", config_lines=config)
 
     # Install extension containing function needed for test
     endpoint.safe_psql("CREATE EXTENSION neon_test_utils")
@@ -58,7 +57,7 @@ def test_clog_truncate(neon_simple_env: NeonEnv):
     # create new branch after clog truncation and start a compute node on it
     log.info(f"create branch at lsn_after_truncation {lsn_after_truncation}")
     env.neon_cli.create_branch(
-        "test_clog_truncate_new", "test_clog_truncate", ancestor_start_lsn=lsn_after_truncation
+        "test_clog_truncate_new", "main", ancestor_start_lsn=lsn_after_truncation
     )
     endpoint2 = env.endpoints.create_start("test_clog_truncate_new")
 
diff --git a/test_runner/regress/test_compute_catalog.py b/test_runner/regress/test_compute_catalog.py
index dd36190fcd..8b8c970357 100644
--- a/test_runner/regress/test_compute_catalog.py
+++ b/test_runner/regress/test_compute_catalog.py
@@ -4,9 +4,8 @@ from fixtures.neon_fixtures import NeonEnv
 
 def test_compute_catalog(neon_simple_env: NeonEnv):
     env = neon_simple_env
-    env.neon_cli.create_branch("test_config", "empty")
 
-    endpoint = env.endpoints.create_start("test_config", config_lines=["log_min_messages=debug1"])
+    endpoint = env.endpoints.create_start("main", config_lines=["log_min_messages=debug1"])
     client = endpoint.http_client()
 
     objects = client.dbs_and_roles()
diff --git a/test_runner/regress/test_config.py b/test_runner/regress/test_config.py
index 2ef28eb94b..d8ef0b8dbd 100644
--- a/test_runner/regress/test_config.py
+++ b/test_runner/regress/test_config.py
@@ -9,10 +9,9 @@ from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder
 #
 def test_config(neon_simple_env: NeonEnv):
     env = neon_simple_env
-    env.neon_cli.create_branch("test_config", "empty")
 
     # change config
-    endpoint = env.endpoints.create_start("test_config", config_lines=["log_min_messages=debug1"])
+    endpoint = env.endpoints.create_start("main", config_lines=["log_min_messages=debug1"])
 
     with closing(endpoint.connect()) as conn:
         with conn.cursor() as cur:
diff --git a/test_runner/regress/test_createdropdb.py b/test_runner/regress/test_createdropdb.py
index f741a9fc87..af643f45d7 100644
--- a/test_runner/regress/test_createdropdb.py
+++ b/test_runner/regress/test_createdropdb.py
@@ -17,9 +17,7 @@ def test_createdb(neon_simple_env: NeonEnv, strategy: str):
     if env.pg_version == PgVersion.V14 and strategy == "wal_log":
         pytest.skip("wal_log strategy not supported on PostgreSQL 14")
 
-    env.neon_cli.create_branch("test_createdb", "empty")
-
-    endpoint = env.endpoints.create_start("test_createdb")
+    endpoint = env.endpoints.create_start("main")
 
     with endpoint.cursor() as cur:
         # Cause a 'relmapper' change in the original branch
@@ -33,7 +31,7 @@ def test_createdb(neon_simple_env: NeonEnv, strategy: str):
         lsn = query_scalar(cur, "SELECT pg_current_wal_insert_lsn()")
 
     # Create a branch
-    env.neon_cli.create_branch("test_createdb2", "test_createdb", ancestor_start_lsn=lsn)
+    env.neon_cli.create_branch("test_createdb2", "main", ancestor_start_lsn=lsn)
     endpoint2 = env.endpoints.create_start("test_createdb2")
 
     # Test that you can connect to the new database on both branches
@@ -62,8 +60,7 @@ def test_createdb(neon_simple_env: NeonEnv, strategy: str):
 #
 def test_dropdb(neon_simple_env: NeonEnv, test_output_dir):
     env = neon_simple_env
-    env.neon_cli.create_branch("test_dropdb", "empty")
-    endpoint = env.endpoints.create_start("test_dropdb")
+    endpoint = env.endpoints.create_start("main")
 
     with endpoint.cursor() as cur:
         cur.execute("CREATE DATABASE foodb")
@@ -80,14 +77,10 @@ def test_dropdb(neon_simple_env: NeonEnv, test_output_dir):
         lsn_after_drop = query_scalar(cur, "SELECT pg_current_wal_insert_lsn()")
 
     # Create two branches before and after database drop.
-    env.neon_cli.create_branch(
-        "test_before_dropdb", "test_dropdb", ancestor_start_lsn=lsn_before_drop
-    )
+    env.neon_cli.create_branch("test_before_dropdb", "main", ancestor_start_lsn=lsn_before_drop)
     endpoint_before = env.endpoints.create_start("test_before_dropdb")
 
-    env.neon_cli.create_branch(
-        "test_after_dropdb", "test_dropdb", ancestor_start_lsn=lsn_after_drop
-    )
+    env.neon_cli.create_branch("test_after_dropdb", "main", ancestor_start_lsn=lsn_after_drop)
     endpoint_after = env.endpoints.create_start("test_after_dropdb")
 
     # Test that database exists on the branch before drop
diff --git a/test_runner/regress/test_createuser.py b/test_runner/regress/test_createuser.py
index 17d9824f52..d6f138e126 100644
--- a/test_runner/regress/test_createuser.py
+++ b/test_runner/regress/test_createuser.py
@@ -7,8 +7,7 @@ from fixtures.utils import query_scalar
 #
 def test_createuser(neon_simple_env: NeonEnv):
     env = neon_simple_env
-    env.neon_cli.create_branch("test_createuser", "empty")
-    endpoint = env.endpoints.create_start("test_createuser")
+    endpoint = env.endpoints.create_start("main")
 
     with endpoint.cursor() as cur:
         # Cause a 'relmapper' change in the original branch
@@ -19,7 +18,7 @@ def test_createuser(neon_simple_env: NeonEnv):
         lsn = query_scalar(cur, "SELECT pg_current_wal_insert_lsn()")
 
     # Create a branch
-    env.neon_cli.create_branch("test_createuser2", "test_createuser", ancestor_start_lsn=lsn)
+    env.neon_cli.create_branch("test_createuser2", "main", ancestor_start_lsn=lsn)
     endpoint2 = env.endpoints.create_start("test_createuser2")
 
     # Test that you can connect to new branch as a new user
diff --git a/test_runner/regress/test_ddl_forwarding.py b/test_runner/regress/test_ddl_forwarding.py
index 50da673d87..65f310c27a 100644
--- a/test_runner/regress/test_ddl_forwarding.py
+++ b/test_runner/regress/test_ddl_forwarding.py
@@ -290,9 +290,8 @@ def assert_db_connlimit(endpoint: Any, db_name: str, connlimit: int, msg: str):
 # Here we test the latter. The first one is tested in test_ddl_forwarding
 def test_ddl_forwarding_invalid_db(neon_simple_env: NeonEnv):
     env = neon_simple_env
-    env.neon_cli.create_branch("test_ddl_forwarding_invalid_db", "empty")
     endpoint = env.endpoints.create_start(
-        "test_ddl_forwarding_invalid_db",
+        "main",
         # Some non-existent url
         config_lines=["neon.console_url=http://localhost:9999/unknown/api/v0/roles_and_databases"],
     )
diff --git a/test_runner/regress/test_explain_with_lfc_stats.py b/test_runner/regress/test_explain_with_lfc_stats.py
index 5231dedcda..0217c9ac7b 100644
--- a/test_runner/regress/test_explain_with_lfc_stats.py
+++ b/test_runner/regress/test_explain_with_lfc_stats.py
@@ -10,11 +10,9 @@ def test_explain_with_lfc_stats(neon_simple_env: NeonEnv):
     cache_dir = Path(env.repo_dir) / "file_cache"
     cache_dir.mkdir(exist_ok=True)
 
-    branchname = "test_explain_with_lfc_stats"
-    env.neon_cli.create_branch(branchname, "empty")
-    log.info(f"Creating endopint with 1MB shared_buffers and 64 MB LFC for branch {branchname}")
+    log.info("Creating endpoint with 1MB shared_buffers and 64 MB LFC")
     endpoint = env.endpoints.create_start(
-        branchname,
+        "main",
         config_lines=[
             "shared_buffers='1MB'",
             f"neon.file_cache_path='{cache_dir}/file.cache'",
diff --git a/test_runner/regress/test_lfc_resize.py b/test_runner/regress/test_lfc_resize.py
index 1b2c7f808f..cb0b30d9c6 100644
--- a/test_runner/regress/test_lfc_resize.py
+++ b/test_runner/regress/test_lfc_resize.py
@@ -16,9 +16,8 @@ from fixtures.neon_fixtures import NeonEnv, PgBin
 @pytest.mark.timeout(600)
 def test_lfc_resize(neon_simple_env: NeonEnv, pg_bin: PgBin):
     env = neon_simple_env
-    env.neon_cli.create_branch("test_lfc_resize", "empty")
     endpoint = env.endpoints.create_start(
-        "test_lfc_resize",
+        "main",
         config_lines=[
             "neon.file_cache_path='file.cache'",
             "neon.max_file_cache_size=512MB",
diff --git a/test_runner/regress/test_lfc_working_set_approximation.py b/test_runner/regress/test_lfc_working_set_approximation.py
index 4c53e4e2fd..4a3a949d1a 100644
--- a/test_runner/regress/test_lfc_working_set_approximation.py
+++ b/test_runner/regress/test_lfc_working_set_approximation.py
@@ -12,11 +12,9 @@ def test_lfc_working_set_approximation(neon_simple_env: NeonEnv):
     cache_dir = Path(env.repo_dir) / "file_cache"
     cache_dir.mkdir(exist_ok=True)
 
-    branchname = "test_approximate_working_set_size"
-    env.neon_cli.create_branch(branchname, "empty")
-    log.info(f"Creating endopint with 1MB shared_buffers and 64 MB LFC for branch {branchname}")
+    log.info("Creating endpoint with 1MB shared_buffers and 64 MB LFC")
     endpoint = env.endpoints.create_start(
-        branchname,
+        "main",
         config_lines=[
             "shared_buffers='1MB'",
             f"neon.file_cache_path='{cache_dir}/file.cache'",
diff --git a/test_runner/regress/test_local_file_cache.py b/test_runner/regress/test_local_file_cache.py
index 3c404c3b23..9c38200937 100644
--- a/test_runner/regress/test_local_file_cache.py
+++ b/test_runner/regress/test_local_file_cache.py
@@ -5,7 +5,7 @@ import threading
 import time
 from typing import List
 
-from fixtures.neon_fixtures import DEFAULT_BRANCH_NAME, NeonEnvBuilder
+from fixtures.neon_fixtures import NeonEnvBuilder
 from fixtures.utils import query_scalar
 
 
@@ -15,11 +15,8 @@ def test_local_file_cache_unlink(neon_env_builder: NeonEnvBuilder):
     cache_dir = os.path.join(env.repo_dir, "file_cache")
     os.mkdir(cache_dir)
 
-    env.neon_cli.create_branch("empty", ancestor_branch_name=DEFAULT_BRANCH_NAME)
-    env.neon_cli.create_branch("test_local_file_cache_unlink", "empty")
-
     endpoint = env.endpoints.create_start(
-        "test_local_file_cache_unlink",
+        "main",
         config_lines=[
             "shared_buffers='1MB'",
             f"neon.file_cache_path='{cache_dir}/file.cache'",
diff --git a/test_runner/regress/test_logical_replication.py b/test_runner/regress/test_logical_replication.py
index f83a833dda..15a3719e0b 100644
--- a/test_runner/regress/test_logical_replication.py
+++ b/test_runner/regress/test_logical_replication.py
@@ -36,10 +36,8 @@ def test_logical_replication(neon_simple_env: NeonEnv, vanilla_pg):
     env = neon_simple_env
 
     tenant_id = env.initial_tenant
-    timeline_id = env.neon_cli.create_branch("test_logical_replication", "empty")
-    endpoint = env.endpoints.create_start(
-        "test_logical_replication", config_lines=["log_statement=all"]
-    )
+    timeline_id = env.initial_timeline
+    endpoint = env.endpoints.create_start("main", config_lines=["log_statement=all"])
 
     pg_conn = endpoint.connect()
     cur = pg_conn.cursor()
@@ -185,10 +183,9 @@ def test_obsolete_slot_drop(neon_simple_env: NeonEnv, vanilla_pg):
 
     env = neon_simple_env
 
-    env.neon_cli.create_branch("test_logical_replication", "empty")
     # set low neon.logical_replication_max_snap_files
     endpoint = env.endpoints.create_start(
-        "test_logical_replication",
+        "main",
         config_lines=["log_statement=all", "neon.logical_replication_max_snap_files=1"],
     )
 
@@ -472,7 +469,7 @@ def test_slots_and_branching(neon_simple_env: NeonEnv):
 def test_replication_shutdown(neon_simple_env: NeonEnv):
     # Ensure Postgres can exit without stuck when a replication job is active + neon extension installed
     env = neon_simple_env
-    env.neon_cli.create_branch("test_replication_shutdown_publisher", "empty")
+    env.neon_cli.create_branch("test_replication_shutdown_publisher", "main")
     pub = env.endpoints.create("test_replication_shutdown_publisher")
 
     env.neon_cli.create_branch("test_replication_shutdown_subscriber")
diff --git a/test_runner/regress/test_migrations.py b/test_runner/regress/test_migrations.py
index bdc5ca907e..e88e56d030 100644
--- a/test_runner/regress/test_migrations.py
+++ b/test_runner/regress/test_migrations.py
@@ -9,9 +9,8 @@ if TYPE_CHECKING:
 
 def test_migrations(neon_simple_env: NeonEnv):
     env = neon_simple_env
-    env.neon_cli.create_branch("test_migrations", "empty")
 
-    endpoint = env.endpoints.create("test_migrations")
+    endpoint = env.endpoints.create("main")
     endpoint.respec(skip_pg_catalog_updates=False)
     endpoint.start()
 
diff --git a/test_runner/regress/test_multixact.py b/test_runner/regress/test_multixact.py
index 88f7a5db59..8a00f8835f 100644
--- a/test_runner/regress/test_multixact.py
+++ b/test_runner/regress/test_multixact.py
@@ -14,8 +14,7 @@ from fixtures.utils import query_scalar
 #
 def test_multixact(neon_simple_env: NeonEnv, test_output_dir):
     env = neon_simple_env
-    env.neon_cli.create_branch("test_multixact", "empty")
-    endpoint = env.endpoints.create_start("test_multixact")
+    endpoint = env.endpoints.create_start("main")
 
     cur = endpoint.connect().cursor()
     cur.execute(
@@ -73,7 +72,9 @@ def test_multixact(neon_simple_env: NeonEnv, test_output_dir):
     assert int(next_multixact_id) > int(next_multixact_id_old)
 
     # Branch at this point
-    env.neon_cli.create_branch("test_multixact_new", "test_multixact", ancestor_start_lsn=lsn)
+    env.neon_cli.create_branch(
+        "test_multixact_new", ancestor_branch_name="main", ancestor_start_lsn=lsn
+    )
     endpoint_new = env.endpoints.create_start("test_multixact_new")
 
     next_multixact_id_new = endpoint_new.safe_psql(
diff --git a/test_runner/regress/test_neon_superuser.py b/test_runner/regress/test_neon_superuser.py
index fd31df84da..7825ec772c 100644
--- a/test_runner/regress/test_neon_superuser.py
+++ b/test_runner/regress/test_neon_superuser.py
@@ -6,7 +6,7 @@ from fixtures.utils import wait_until
 
 def test_neon_superuser(neon_simple_env: NeonEnv, pg_version: PgVersion):
     env = neon_simple_env
-    env.neon_cli.create_branch("test_neon_superuser_publisher", "empty")
+    env.neon_cli.create_branch("test_neon_superuser_publisher", "main")
     pub = env.endpoints.create("test_neon_superuser_publisher")
 
     env.neon_cli.create_branch("test_neon_superuser_subscriber")
diff --git a/test_runner/regress/test_parallel_copy.py b/test_runner/regress/test_parallel_copy.py
index b33e387a66..a5037e8694 100644
--- a/test_runner/regress/test_parallel_copy.py
+++ b/test_runner/regress/test_parallel_copy.py
@@ -41,8 +41,7 @@ async def parallel_load_same_table(endpoint: Endpoint, n_parallel: int):
 # Load data into one table with COPY TO from 5 parallel connections
 def test_parallel_copy(neon_simple_env: NeonEnv, n_parallel=5):
     env = neon_simple_env
-    env.neon_cli.create_branch("test_parallel_copy", "empty")
-    endpoint = env.endpoints.create_start("test_parallel_copy")
+    endpoint = env.endpoints.create_start("main")
 
     # Create test table
     conn = endpoint.connect()
diff --git a/test_runner/regress/test_pg_query_cancellation.py b/test_runner/regress/test_pg_query_cancellation.py
index bad2e5865e..c6b4eff516 100644
--- a/test_runner/regress/test_pg_query_cancellation.py
+++ b/test_runner/regress/test_pg_query_cancellation.py
@@ -42,11 +42,9 @@ def test_cancellations(neon_simple_env: NeonEnv):
     ps_http = ps.http_client()
     ps_http.is_testing_enabled_or_skip()
 
-    env.neon_cli.create_branch("test_config", "empty")
-
     # We don't want to have any racy behaviour with autovacuum IOs
     ep = env.endpoints.create_start(
-        "test_config",
+        "main",
         config_lines=[
             "autovacuum = off",
             "shared_buffers = 128MB",
diff --git a/test_runner/regress/test_pg_waldump.py b/test_runner/regress/test_pg_waldump.py
index 8e80efd9ba..1990d69b6a 100644
--- a/test_runner/regress/test_pg_waldump.py
+++ b/test_runner/regress/test_pg_waldump.py
@@ -22,8 +22,8 @@ def check_wal_segment(pg_waldump_path: str, segment_path: str, test_output_dir):
 def test_pg_waldump(neon_simple_env: NeonEnv, test_output_dir, pg_bin: PgBin):
     env = neon_simple_env
     tenant_id = env.initial_tenant
-    timeline_id = env.neon_cli.create_branch("test_pg_waldump", "empty")
-    endpoint = env.endpoints.create_start("test_pg_waldump")
+    timeline_id = env.initial_timeline
+    endpoint = env.endpoints.create_start("main")
 
     cur = endpoint.connect().cursor()
     cur.execute(
diff --git a/test_runner/regress/test_read_validation.py b/test_runner/regress/test_read_validation.py
index 1ac881553f..78798c5abf 100644
--- a/test_runner/regress/test_read_validation.py
+++ b/test_runner/regress/test_read_validation.py
@@ -15,12 +15,8 @@ extensions = ["pageinspect", "neon_test_utils", "pg_buffercache"]
 #
 def test_read_validation(neon_simple_env: NeonEnv):
     env = neon_simple_env
-    env.neon_cli.create_branch("test_read_validation", "empty")
-
-    endpoint = env.endpoints.create_start(
-        "test_read_validation",
-    )
 
+    endpoint = env.endpoints.create_start("main")
     with closing(endpoint.connect()) as con:
         with con.cursor() as c:
             for e in extensions:
@@ -131,13 +127,9 @@ def test_read_validation(neon_simple_env: NeonEnv):
 
 def test_read_validation_neg(neon_simple_env: NeonEnv):
     env = neon_simple_env
-    env.neon_cli.create_branch("test_read_validation_neg", "empty")
-
     env.pageserver.allowed_errors.append(".*invalid LSN\\(0\\) in request.*")
 
-    endpoint = env.endpoints.create_start(
-        "test_read_validation_neg",
-    )
+    endpoint = env.endpoints.create_start("main")
 
     with closing(endpoint.connect()) as con:
         with con.cursor() as c:
diff --git a/test_runner/regress/test_readonly_node.py b/test_runner/regress/test_readonly_node.py
index 368f60127e..347fc3a04d 100644
--- a/test_runner/regress/test_readonly_node.py
+++ b/test_runner/regress/test_readonly_node.py
@@ -22,8 +22,7 @@ from fixtures.utils import query_scalar
 #
 def test_readonly_node(neon_simple_env: NeonEnv):
     env = neon_simple_env
-    env.neon_cli.create_branch("test_readonly_node", "empty")
-    endpoint_main = env.endpoints.create_start("test_readonly_node")
+    endpoint_main = env.endpoints.create_start("main")
 
     env.pageserver.allowed_errors.extend(
         [
@@ -74,12 +73,12 @@ def test_readonly_node(neon_simple_env: NeonEnv):
 
     # Create first read-only node at the point where only 100 rows were inserted
     endpoint_hundred = env.endpoints.create_start(
-        branch_name="test_readonly_node", endpoint_id="ep-readonly_node_hundred", lsn=lsn_a
+        branch_name="main", endpoint_id="ep-readonly_node_hundred", lsn=lsn_a
     )
 
     # And another at the point where 200100 rows were inserted
     endpoint_more = env.endpoints.create_start(
-        branch_name="test_readonly_node", endpoint_id="ep-readonly_node_more", lsn=lsn_b
+        branch_name="main", endpoint_id="ep-readonly_node_more", lsn=lsn_b
     )
 
     # On the 'hundred' node, we should see only 100 rows
@@ -100,7 +99,7 @@ def test_readonly_node(neon_simple_env: NeonEnv):
 
     # Check creating a node at segment boundary
     endpoint = env.endpoints.create_start(
-        branch_name="test_readonly_node",
+        branch_name="main",
         endpoint_id="ep-branch_segment_boundary",
         lsn=Lsn("0/3000000"),
     )
@@ -112,7 +111,7 @@ def test_readonly_node(neon_simple_env: NeonEnv):
     with pytest.raises(Exception, match="invalid basebackup lsn"):
         # compute node startup with invalid LSN should fail
         env.endpoints.create_start(
-            branch_name="test_readonly_node",
+            branch_name="main",
             endpoint_id="ep-readonly_node_preinitdb",
             lsn=Lsn("0/42"),
         )
@@ -218,14 +217,10 @@ def test_readonly_node_gc(neon_env_builder: NeonEnvBuilder):
 # Similar test, but with more data, and we force checkpoints
 def test_timetravel(neon_simple_env: NeonEnv):
     env = neon_simple_env
-    pageserver_http_client = env.pageserver.http_client()
-    env.neon_cli.create_branch("test_timetravel", "empty")
-    endpoint = env.endpoints.create_start("test_timetravel")
-
+    tenant_id = env.initial_tenant
+    timeline_id = env.initial_timeline
     client = env.pageserver.http_client()
-
-    tenant_id = endpoint.safe_psql("show neon.tenant_id")[0][0]
-    timeline_id = endpoint.safe_psql("show neon.timeline_id")[0][0]
+    endpoint = env.endpoints.create_start("main")
 
     lsns = []
 
@@ -249,7 +244,7 @@ def test_timetravel(neon_simple_env: NeonEnv):
         wait_for_last_record_lsn(client, tenant_id, timeline_id, current_lsn)
 
         # run checkpoint manually to force a new layer file
-        pageserver_http_client.timeline_checkpoint(tenant_id, timeline_id)
+        client.timeline_checkpoint(tenant_id, timeline_id)
 
     ##### Restart pageserver
     env.endpoints.stop_all()
@@ -258,7 +253,7 @@ def test_timetravel(neon_simple_env: NeonEnv):
 
     for i, lsn in lsns:
         endpoint_old = env.endpoints.create_start(
-            branch_name="test_timetravel", endpoint_id=f"ep-old_lsn_{i}", lsn=lsn
+            branch_name="main", endpoint_id=f"ep-old_lsn_{i}", lsn=lsn
         )
         with endpoint_old.cursor() as cur:
             assert query_scalar(cur, f"select count(*) from testtab where iteration={i}") == 100000
diff --git a/test_runner/regress/test_subxacts.py b/test_runner/regress/test_subxacts.py
index 10cb00c780..82075bd723 100644
--- a/test_runner/regress/test_subxacts.py
+++ b/test_runner/regress/test_subxacts.py
@@ -9,8 +9,7 @@ from fixtures.neon_fixtures import NeonEnv, check_restored_datadir_content
 # CLOG.
 def test_subxacts(neon_simple_env: NeonEnv, test_output_dir):
     env = neon_simple_env
-    env.neon_cli.create_branch("test_subxacts", "empty")
-    endpoint = env.endpoints.create_start("test_subxacts")
+    endpoint = env.endpoints.create_start("main")
 
     pg_conn = endpoint.connect()
     cur = pg_conn.cursor()
diff --git a/test_runner/regress/test_timeline_delete.py b/test_runner/regress/test_timeline_delete.py
index 328131cd08..711fcd5016 100644
--- a/test_runner/regress/test_timeline_delete.py
+++ b/test_runner/regress/test_timeline_delete.py
@@ -68,10 +68,13 @@ def test_timeline_delete(neon_simple_env: NeonEnv):
 
     # construct pair of branches to validate that pageserver prohibits
     # deletion of ancestor timelines when they have child branches
-    parent_timeline_id = env.neon_cli.create_branch("test_ancestor_branch_delete_parent", "empty")
+    parent_timeline_id = env.neon_cli.create_branch(
+        new_branch_name="test_ancestor_branch_delete_parent", ancestor_branch_name="main"
+    )
 
     leaf_timeline_id = env.neon_cli.create_branch(
-        "test_ancestor_branch_delete_branch1", "test_ancestor_branch_delete_parent"
+        new_branch_name="test_ancestor_branch_delete_branch1",
+        ancestor_branch_name="test_ancestor_branch_delete_parent",
     )
 
     timeline_path = env.pageserver.timeline_dir(env.initial_tenant, parent_timeline_id)
diff --git a/test_runner/regress/test_timeline_size.py b/test_runner/regress/test_timeline_size.py
index 9bf5f8680b..f2265dd3d9 100644
--- a/test_runner/regress/test_timeline_size.py
+++ b/test_runner/regress/test_timeline_size.py
@@ -36,7 +36,7 @@ from fixtures.utils import get_timeline_dir_size, wait_until
 
 def test_timeline_size(neon_simple_env: NeonEnv):
     env = neon_simple_env
-    new_timeline_id = env.neon_cli.create_branch("test_timeline_size", "empty")
+    new_timeline_id = env.neon_cli.create_branch("test_timeline_size", "main")
 
     client = env.pageserver.http_client()
     client.timeline_wait_logical_size(env.initial_tenant, new_timeline_id)
@@ -68,7 +68,7 @@ def test_timeline_size(neon_simple_env: NeonEnv):
 
 def test_timeline_size_createdropdb(neon_simple_env: NeonEnv):
     env = neon_simple_env
-    new_timeline_id = env.neon_cli.create_branch("test_timeline_size_createdropdb", "empty")
+    new_timeline_id = env.neon_cli.create_branch("test_timeline_size_createdropdb", "main")
 
     client = env.pageserver.http_client()
     client.timeline_wait_logical_size(env.initial_tenant, new_timeline_id)
diff --git a/test_runner/regress/test_twophase.py b/test_runner/regress/test_twophase.py
index dd76689008..ea900b07b8 100644
--- a/test_runner/regress/test_twophase.py
+++ b/test_runner/regress/test_twophase.py
@@ -9,10 +9,7 @@ from fixtures.neon_fixtures import NeonEnv, fork_at_current_lsn
 #
 def test_twophase(neon_simple_env: NeonEnv):
     env = neon_simple_env
-    env.neon_cli.create_branch("test_twophase", "empty")
-    endpoint = env.endpoints.create_start(
-        "test_twophase", config_lines=["max_prepared_transactions=5"]
-    )
+    endpoint = env.endpoints.create_start("main", config_lines=["max_prepared_transactions=5"])
 
     conn = endpoint.connect()
     cur = conn.cursor()
@@ -56,7 +53,7 @@ def test_twophase(neon_simple_env: NeonEnv):
     assert len(twophase_files) == 2
 
     # Create a branch with the transaction in prepared state
-    fork_at_current_lsn(env, endpoint, "test_twophase_prepared", "test_twophase")
+    fork_at_current_lsn(env, endpoint, "test_twophase_prepared", "main")
 
     # Start compute on the new branch
     endpoint2 = env.endpoints.create_start(
diff --git a/test_runner/regress/test_unlogged.py b/test_runner/regress/test_unlogged.py
index 137d28b9fa..deba29536c 100644
--- a/test_runner/regress/test_unlogged.py
+++ b/test_runner/regress/test_unlogged.py
@@ -9,8 +9,7 @@ from fixtures.pg_version import PgVersion
 #
 def test_unlogged(neon_simple_env: NeonEnv):
     env = neon_simple_env
-    env.neon_cli.create_branch("test_unlogged", "empty")
-    endpoint = env.endpoints.create_start("test_unlogged")
+    endpoint = env.endpoints.create_start("main")
 
     conn = endpoint.connect()
     cur = conn.cursor()
@@ -22,7 +21,7 @@ def test_unlogged(neon_simple_env: NeonEnv):
     cur.execute("INSERT INTO iut (id) values (42);")
 
     # create another compute to fetch inital empty contents from pageserver
-    fork_at_current_lsn(env, endpoint, "test_unlogged_basebackup", "test_unlogged")
+    fork_at_current_lsn(env, endpoint, "test_unlogged_basebackup", "main")
     endpoint2 = env.endpoints.create_start("test_unlogged_basebackup")
 
     conn2 = endpoint2.connect()
diff --git a/test_runner/regress/test_vm_bits.py b/test_runner/regress/test_vm_bits.py
index 7272979c4a..3075211ada 100644
--- a/test_runner/regress/test_vm_bits.py
+++ b/test_runner/regress/test_vm_bits.py
@@ -13,8 +13,7 @@ from fixtures.utils import query_scalar
 def test_vm_bit_clear(neon_simple_env: NeonEnv):
     env = neon_simple_env
 
-    env.neon_cli.create_branch("test_vm_bit_clear", "empty")
-    endpoint = env.endpoints.create_start("test_vm_bit_clear")
+    endpoint = env.endpoints.create_start("main")
 
     pg_conn = endpoint.connect()
     cur = pg_conn.cursor()
@@ -58,7 +57,7 @@ def test_vm_bit_clear(neon_simple_env: NeonEnv):
     cur.execute("UPDATE vmtest_cold_update2 SET id = 5000, filler=repeat('x', 200) WHERE id = 1")
 
     # Branch at this point, to test that later
-    fork_at_current_lsn(env, endpoint, "test_vm_bit_clear_new", "test_vm_bit_clear")
+    fork_at_current_lsn(env, endpoint, "test_vm_bit_clear_new", "main")
 
     # Clear the buffer cache, to force the VM page to be re-fetched from
     # the page server
diff --git a/test_runner/test_broken.py b/test_runner/test_broken.py
index 7e8aef5a5f..d710b53528 100644
--- a/test_runner/test_broken.py
+++ b/test_runner/test_broken.py
@@ -23,8 +23,7 @@ run_broken = pytest.mark.skipif(
 def test_broken(neon_simple_env: NeonEnv, pg_bin):
     env = neon_simple_env
 
-    env.neon_cli.create_branch("test_broken", "empty")
-    env.endpoints.create_start("test_broken")
+    env.endpoints.create_start("main")
     log.info("postgres is running")
 
     log.info("THIS NEXT COMMAND WILL FAIL:")

From e158df4e86318fa3fd5ee9516f3e7ac91dd14283 Mon Sep 17 00:00:00 2001
From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com>
Date: Tue, 10 Sep 2024 05:03:27 +0800
Subject: [PATCH 041/142] feat(pageserver): split delta writer automatically
 determines key range (#8850)

close https://github.com/neondatabase/neon/issues/8838

## Summary of changes

This patch modifies the split delta layer writer so that it no longer
takes start_key and end_key when creating/finishing the layer writer.
The start_key of each delta layer is the first key provided to the
layer writer, and the end_key is `last_key.next()`. This simplifies
the delta layer writer API.

As a result, the layer key hack is removed. Image layers now use the full
key range, and delta layers use the first/last key provided by the user.

---------

Signed-off-by: Alex Chi Z <chi@neon.tech>
---
 libs/pageserver_api/src/key.rs                |   9 -
 pageserver/src/tenant.rs                      |   6 +-
 .../src/tenant/storage_layer/split_writer.rs  | 158 ++++++++++++------
 pageserver/src/tenant/timeline/compaction.rs  |   7 +-
 4 files changed, 109 insertions(+), 71 deletions(-)

diff --git a/libs/pageserver_api/src/key.rs b/libs/pageserver_api/src/key.rs
index 77d744e4da..8929ccb41d 100644
--- a/libs/pageserver_api/src/key.rs
+++ b/libs/pageserver_api/src/key.rs
@@ -263,15 +263,6 @@ impl Key {
         field5: u8::MAX,
         field6: u32::MAX,
     };
-    /// A key slightly smaller than [`Key::MAX`] for use in layer key ranges to avoid them to be confused with L0 layers
-    pub const NON_L0_MAX: Key = Key {
-        field1: u8::MAX,
-        field2: u32::MAX,
-        field3: u32::MAX,
-        field4: u32::MAX,
-        field5: u8::MAX,
-        field6: u32::MAX - 1,
-    };
 
     pub fn from_hex(s: &str) -> Result<Self> {
         if s.len() != 36 {
diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index fd2520a42e..c6f0e48101 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -7091,13 +7091,13 @@ mod tests {
             vec![
                 // Image layer at GC horizon
                 PersistentLayerKey {
-                    key_range: Key::MIN..Key::NON_L0_MAX,
+                    key_range: Key::MIN..Key::MAX,
                     lsn_range: Lsn(0x30)..Lsn(0x31),
                     is_delta: false
                 },
-                // The delta layer covers the full range (with the layer key hack to avoid being recognized as L0)
+                // The delta layer below the horizon
                 PersistentLayerKey {
-                    key_range: Key::MIN..Key::NON_L0_MAX,
+                    key_range: get_key(3)..get_key(4),
                     lsn_range: Lsn(0x30)..Lsn(0x48),
                     is_delta: true
                 },
diff --git a/pageserver/src/tenant/storage_layer/split_writer.rs b/pageserver/src/tenant/storage_layer/split_writer.rs
index 7c1ac863bf..40a6a77a50 100644
--- a/pageserver/src/tenant/storage_layer/split_writer.rs
+++ b/pageserver/src/tenant/storage_layer/split_writer.rs
@@ -188,7 +188,7 @@ impl SplitImageLayerWriter {
             .await
     }
 
-    /// When split writer fails, the caller should call this function and handle partially generated layers.
+    /// This function will be deprecated with #8841.
     pub(crate) fn take(self) -> anyhow::Result<(Vec<SplitWriterResult>, ImageLayerWriter)> {
         Ok((self.generated_layers, self.inner))
     }
@@ -204,7 +204,7 @@ impl SplitImageLayerWriter {
 /// will split them into multiple files based on size.
 #[must_use]
 pub struct SplitDeltaLayerWriter {
-    inner: DeltaLayerWriter,
+    inner: Option<(Key, DeltaLayerWriter)>,
     target_layer_size: u64,
     generated_layers: Vec<SplitWriterResult>,
     conf: &'static PageServerConf,
@@ -212,7 +212,6 @@ pub struct SplitDeltaLayerWriter {
     tenant_shard_id: TenantShardId,
     lsn_range: Range<Lsn>,
     last_key_written: Key,
-    start_key: Key,
 }
 
 impl SplitDeltaLayerWriter {
@@ -220,29 +219,18 @@ impl SplitDeltaLayerWriter {
         conf: &'static PageServerConf,
         timeline_id: TimelineId,
         tenant_shard_id: TenantShardId,
-        start_key: Key,
         lsn_range: Range<Lsn>,
         target_layer_size: u64,
-        ctx: &RequestContext,
     ) -> anyhow::Result<Self> {
         Ok(Self {
             target_layer_size,
-            inner: DeltaLayerWriter::new(
-                conf,
-                timeline_id,
-                tenant_shard_id,
-                start_key,
-                lsn_range.clone(),
-                ctx,
-            )
-            .await?,
+            inner: None,
             generated_layers: Vec::new(),
             conf,
             timeline_id,
             tenant_shard_id,
             lsn_range,
             last_key_written: Key::MIN,
-            start_key,
         })
     }
 
@@ -265,9 +253,26 @@ impl SplitDeltaLayerWriter {
         //
         // Also, keep all updates of a single key in a single file. TODO: split them using the legacy compaction
         // strategy. https://github.com/neondatabase/neon/issues/8837
+
+        if self.inner.is_none() {
+            self.inner = Some((
+                key,
+                DeltaLayerWriter::new(
+                    self.conf,
+                    self.timeline_id,
+                    self.tenant_shard_id,
+                    key,
+                    self.lsn_range.clone(),
+                    ctx,
+                )
+                .await?,
+            ));
+        }
+        let (_, inner) = self.inner.as_mut().unwrap();
+
         let addition_size_estimation = KEY_SIZE as u64 + 8 /* LSN u64 size */ + 80 /* value size estimation */;
-        if self.inner.num_keys() >= 1
-            && self.inner.estimated_size() + addition_size_estimation >= self.target_layer_size
+        if inner.num_keys() >= 1
+            && inner.estimated_size() + addition_size_estimation >= self.target_layer_size
         {
             if key != self.last_key_written {
                 let next_delta_writer = DeltaLayerWriter::new(
@@ -279,13 +284,13 @@ impl SplitDeltaLayerWriter {
                     ctx,
                 )
                 .await?;
-                let prev_delta_writer = std::mem::replace(&mut self.inner, next_delta_writer);
+                let (start_key, prev_delta_writer) =
+                    std::mem::replace(&mut self.inner, Some((key, next_delta_writer))).unwrap();
                 let layer_key = PersistentLayerKey {
-                    key_range: self.start_key..key,
+                    key_range: start_key..key,
                     lsn_range: self.lsn_range.clone(),
                     is_delta: true,
                 };
-                self.start_key = key;
                 if discard(&layer_key).await {
                     drop(prev_delta_writer);
                     self.generated_layers
@@ -296,17 +301,18 @@ impl SplitDeltaLayerWriter {
                     self.generated_layers
                         .push(SplitWriterResult::Produced(delta_layer));
                 }
-            } else if self.inner.estimated_size() >= S3_UPLOAD_LIMIT {
+            } else if inner.estimated_size() >= S3_UPLOAD_LIMIT {
                 // We have to produce a very large file b/c a key is updated too often.
                 anyhow::bail!(
                     "a single key is updated too often: key={}, estimated_size={}, and the layer file cannot be produced",
                     key,
-                    self.inner.estimated_size()
+                    inner.estimated_size()
                 );
             }
         }
         self.last_key_written = key;
-        self.inner.put_value(key, lsn, val, ctx).await
+        let (_, inner) = self.inner.as_mut().unwrap();
+        inner.put_value(key, lsn, val, ctx).await
     }
 
     pub async fn put_value(
@@ -325,7 +331,6 @@ impl SplitDeltaLayerWriter {
         self,
         tline: &Arc<Timeline>,
         ctx: &RequestContext,
-        end_key: Key,
         discard: D,
     ) -> anyhow::Result<Vec<SplitWriterResult>>
     where
@@ -337,11 +342,15 @@ impl SplitDeltaLayerWriter {
             inner,
             ..
         } = self;
+        let Some((start_key, inner)) = inner else {
+            return Ok(generated_layers);
+        };
         if inner.num_keys() == 0 {
             return Ok(generated_layers);
         }
+        let end_key = self.last_key_written.next();
         let layer_key = PersistentLayerKey {
-            key_range: self.start_key..end_key,
+            key_range: start_key..end_key,
             lsn_range: self.lsn_range.clone(),
             is_delta: true,
         };
@@ -360,15 +369,14 @@ impl SplitDeltaLayerWriter {
         self,
         tline: &Arc<Timeline>,
         ctx: &RequestContext,
-        end_key: Key,
     ) -> anyhow::Result<Vec<SplitWriterResult>> {
-        self.finish_with_discard_fn(tline, ctx, end_key, |_| async { false })
+        self.finish_with_discard_fn(tline, ctx, |_| async { false })
             .await
     }
 
-    /// When split writer fails, the caller should call this function and handle partially generated layers.
-    pub(crate) fn take(self) -> anyhow::Result<(Vec<SplitWriterResult>, DeltaLayerWriter)> {
-        Ok((self.generated_layers, self.inner))
+    /// This function will be deprecated with #8841.
+    pub(crate) fn take(self) -> anyhow::Result<(Vec<SplitWriterResult>, Option<DeltaLayerWriter>)> {
+        Ok((self.generated_layers, self.inner.map(|x| x.1)))
     }
 }
 
@@ -432,10 +440,8 @@ mod tests {
             tenant.conf,
             tline.timeline_id,
             tenant.tenant_shard_id,
-            get_key(0),
             Lsn(0x18)..Lsn(0x20),
             4 * 1024 * 1024,
-            &ctx,
         )
         .await
         .unwrap();
@@ -460,11 +466,22 @@ mod tests {
             )
             .await
             .unwrap();
-        let layers = delta_writer
-            .finish(&tline, &ctx, get_key(10))
-            .await
-            .unwrap();
+        let layers = delta_writer.finish(&tline, &ctx).await.unwrap();
         assert_eq!(layers.len(), 1);
+        assert_eq!(
+            layers
+                .into_iter()
+                .next()
+                .unwrap()
+                .into_resident_layer()
+                .layer_desc()
+                .key(),
+            PersistentLayerKey {
+                key_range: get_key(0)..get_key(1),
+                lsn_range: Lsn(0x18)..Lsn(0x20),
+                is_delta: true
+            }
+        );
     }
 
     #[tokio::test]
@@ -501,10 +518,8 @@ mod tests {
             tenant.conf,
             tline.timeline_id,
             tenant.tenant_shard_id,
-            get_key(0),
             Lsn(0x18)..Lsn(0x20),
             4 * 1024 * 1024,
-            &ctx,
         )
         .await
         .unwrap();
@@ -533,10 +548,7 @@ mod tests {
             .finish(&tline, &ctx, get_key(N as u32))
             .await
             .unwrap();
-        let delta_layers = delta_writer
-            .finish(&tline, &ctx, get_key(N as u32))
-            .await
-            .unwrap();
+        let delta_layers = delta_writer.finish(&tline, &ctx).await.unwrap();
         if discard {
             for layer in image_layers {
                 layer.into_discarded_layer();
@@ -555,6 +567,14 @@ mod tests {
                 .collect_vec();
             assert_eq!(image_layers.len(), N / 512 + 1);
             assert_eq!(delta_layers.len(), N / 512 + 1);
+            assert_eq!(
+                delta_layers.first().unwrap().layer_desc().key_range.start,
+                get_key(0)
+            );
+            assert_eq!(
+                delta_layers.last().unwrap().layer_desc().key_range.end,
+                get_key(N as u32)
+            );
             for idx in 0..image_layers.len() {
                 assert_ne!(image_layers[idx].layer_desc().key_range.start, Key::MIN);
                 assert_ne!(image_layers[idx].layer_desc().key_range.end, Key::MAX);
@@ -602,10 +622,8 @@ mod tests {
             tenant.conf,
             tline.timeline_id,
             tenant.tenant_shard_id,
-            get_key(0),
             Lsn(0x18)..Lsn(0x20),
             4 * 1024,
-            &ctx,
         )
         .await
         .unwrap();
@@ -644,11 +662,35 @@ mod tests {
             )
             .await
             .unwrap();
-        let layers = delta_writer
-            .finish(&tline, &ctx, get_key(10))
-            .await
-            .unwrap();
+        let layers = delta_writer.finish(&tline, &ctx).await.unwrap();
         assert_eq!(layers.len(), 2);
+        let mut layers_iter = layers.into_iter();
+        assert_eq!(
+            layers_iter
+                .next()
+                .unwrap()
+                .into_resident_layer()
+                .layer_desc()
+                .key(),
+            PersistentLayerKey {
+                key_range: get_key(0)..get_key(1),
+                lsn_range: Lsn(0x18)..Lsn(0x20),
+                is_delta: true
+            }
+        );
+        assert_eq!(
+            layers_iter
+                .next()
+                .unwrap()
+                .into_resident_layer()
+                .layer_desc()
+                .key(),
+            PersistentLayerKey {
+                key_range: get_key(1)..get_key(2),
+                lsn_range: Lsn(0x18)..Lsn(0x20),
+                is_delta: true
+            }
+        );
     }
 
     #[tokio::test]
@@ -668,10 +710,8 @@ mod tests {
             tenant.conf,
             tline.timeline_id,
             tenant.tenant_shard_id,
-            get_key(0),
             Lsn(0x10)..Lsn(N as u64 * 16 + 0x10),
             4 * 1024 * 1024,
-            &ctx,
         )
         .await
         .unwrap();
@@ -689,10 +729,20 @@ mod tests {
                 .await
                 .unwrap();
         }
-        let delta_layers = delta_writer
-            .finish(&tline, &ctx, get_key(N as u32))
-            .await
-            .unwrap();
+        let delta_layers = delta_writer.finish(&tline, &ctx).await.unwrap();
         assert_eq!(delta_layers.len(), 1);
+        let delta_layer = delta_layers
+            .into_iter()
+            .next()
+            .unwrap()
+            .into_resident_layer();
+        assert_eq!(
+            delta_layer.layer_desc().key(),
+            PersistentLayerKey {
+                key_range: get_key(0)..get_key(1),
+                lsn_range: Lsn(0x10)..Lsn(N as u64 * 16 + 0x10),
+                is_delta: true
+            }
+        );
     }
 }
diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs
index a87b502cd6..0b5c520ba7 100644
--- a/pageserver/src/tenant/timeline/compaction.rs
+++ b/pageserver/src/tenant/timeline/compaction.rs
@@ -1809,7 +1809,6 @@ impl Timeline {
             .unwrap();
         // We don't want any of the produced layers to cover the full key range (i.e., MIN..MAX) b/c it will then be recognized
         // as an L0 layer.
-        let hack_end_key = Key::NON_L0_MAX;
         let mut delta_layers = Vec::new();
         let mut image_layers = Vec::new();
         let mut downloaded_layers = Vec::new();
@@ -1855,10 +1854,8 @@ impl Timeline {
             self.conf,
             self.timeline_id,
             self.tenant_shard_id,
-            Key::MIN,
             lowest_retain_lsn..end_lsn,
             self.get_compaction_target_size(),
-            ctx,
         )
         .await?;
 
@@ -1965,7 +1962,7 @@ impl Timeline {
         let produced_image_layers = if let Some(writer) = image_layer_writer {
             if !dry_run {
                 writer
-                    .finish_with_discard_fn(self, ctx, hack_end_key, discard)
+                    .finish_with_discard_fn(self, ctx, Key::MAX, discard)
                     .await?
             } else {
                 let (layers, _) = writer.take()?;
@@ -1978,7 +1975,7 @@ impl Timeline {
 
         let produced_delta_layers = if !dry_run {
             delta_layer_writer
-                .finish_with_discard_fn(self, ctx, hack_end_key, discard)
+                .finish_with_discard_fn(self, ctx, discard)
                 .await?
         } else {
             let (layers, _) = delta_layer_writer.take()?;

From 982b376ea2e42d45f70c625ec91ac513f9f3a661 Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki@neon.tech>
Date: Tue, 10 Sep 2024 00:04:00 +0300
Subject: [PATCH 042/142] Update parquet crate to a released version (#8961)

PR #7782 set the dependency in Cargo.toml to 'master', and locked the
version to a commit that contained a specific fix, because we needed the
fix before it was included in a versioned release. The fix was later
included in parquet crate version 52.0.0, so we can now switch back to
using a released version. The latest release is 53.0.0, so switch
straight to that.

---------

Co-authored-by: Conrad Ludgate <conradludgate@gmail.com>
---
 Cargo.lock                   | 10 +++++---
 Cargo.toml                   |  8 ++----
 proxy/src/context/parquet.rs | 48 ++++++++++++++++++------------------
 workspace_hack/Cargo.toml    |  4 +--
 4 files changed, 34 insertions(+), 36 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index 4fb3ac7223..3ca6acbc3e 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -3907,8 +3907,9 @@ dependencies = [
 
 [[package]]
 name = "parquet"
-version = "51.0.0"
-source = "git+https://github.com/apache/arrow-rs?branch=master#2534976a564be3d2d56312dc88fb1b6ed4cef829"
+version = "53.0.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f0fbf928021131daaa57d334ca8e3904fe9ae22f73c56244fc7db9b04eedc3d8"
 dependencies = [
  "ahash",
  "bytes",
@@ -3927,8 +3928,9 @@ dependencies = [
 
 [[package]]
 name = "parquet_derive"
-version = "51.0.0"
-source = "git+https://github.com/apache/arrow-rs?branch=master#2534976a564be3d2d56312dc88fb1b6ed4cef829"
+version = "53.0.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "86e9fcfae007533a06b580429a3f7e07cb833ec8aa37c041c16563e7918f057e"
 dependencies = [
  "parquet",
  "proc-macro2",
diff --git a/Cargo.toml b/Cargo.toml
index 40e399619d..fd1d4e016c 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -123,8 +123,8 @@ opentelemetry = "0.20.0"
 opentelemetry-otlp = { version = "0.13.0", default-features=false, features = ["http-proto", "trace", "http", "reqwest-client"] }
 opentelemetry-semantic-conventions = "0.12.0"
 parking_lot = "0.12"
-parquet = { version = "51.0.0", default-features = false, features = ["zstd"] }
-parquet_derive = "51.0.0"
+parquet = { version = "53", default-features = false, features = ["zstd"] }
+parquet_derive = "53"
 pbkdf2 = { version = "0.12.1", features = ["simple", "std"] }
 pin-project-lite = "0.2"
 procfs = "0.16"
@@ -254,10 +254,6 @@ tonic-build = "0.9"
 # Needed to get `tokio-postgres-rustls` to depend on our fork.
 tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev = "20031d7a9ee1addeae6e0968e3899ae6bf01cee2" }
 
-# bug fixes for UUID
-parquet = { git = "https://github.com/apache/arrow-rs", branch = "master" }
-parquet_derive = { git = "https://github.com/apache/arrow-rs", branch = "master" }
-
 ################# Binary contents sections
 
 [profile.release]
diff --git a/proxy/src/context/parquet.rs b/proxy/src/context/parquet.rs
index c6f83fd069..fafea2a08f 100644
--- a/proxy/src/context/parquet.rs
+++ b/proxy/src/context/parquet.rs
@@ -598,15 +598,15 @@ mod tests {
         assert_eq!(
             file_stats,
             [
-                (1315874, 3, 6000),
-                (1315867, 3, 6000),
-                (1315927, 3, 6000),
-                (1315884, 3, 6000),
-                (1316014, 3, 6000),
-                (1315856, 3, 6000),
-                (1315648, 3, 6000),
-                (1315884, 3, 6000),
-                (438913, 1, 2000)
+                (1312632, 3, 6000),
+                (1312621, 3, 6000),
+                (1312680, 3, 6000),
+                (1312637, 3, 6000),
+                (1312773, 3, 6000),
+                (1312610, 3, 6000),
+                (1312404, 3, 6000),
+                (1312639, 3, 6000),
+                (437848, 1, 2000)
             ]
         );
 
@@ -638,11 +638,11 @@ mod tests {
         assert_eq!(
             file_stats,
             [
-                (1208861, 5, 10000),
-                (1208592, 5, 10000),
-                (1208885, 5, 10000),
-                (1208873, 5, 10000),
-                (1209128, 5, 10000)
+                (1203465, 5, 10000),
+                (1203189, 5, 10000),
+                (1203490, 5, 10000),
+                (1203475, 5, 10000),
+                (1203729, 5, 10000)
             ]
         );
 
@@ -667,15 +667,15 @@ mod tests {
         assert_eq!(
             file_stats,
             [
-                (1315874, 3, 6000),
-                (1315867, 3, 6000),
-                (1315927, 3, 6000),
-                (1315884, 3, 6000),
-                (1316014, 3, 6000),
-                (1315856, 3, 6000),
-                (1315648, 3, 6000),
-                (1315884, 3, 6000),
-                (438913, 1, 2000)
+                (1312632, 3, 6000),
+                (1312621, 3, 6000),
+                (1312680, 3, 6000),
+                (1312637, 3, 6000),
+                (1312773, 3, 6000),
+                (1312610, 3, 6000),
+                (1312404, 3, 6000),
+                (1312639, 3, 6000),
+                (437848, 1, 2000)
             ]
         );
 
@@ -712,7 +712,7 @@ mod tests {
         // files are smaller than the size threshold, but they took too long to fill so were flushed early
         assert_eq!(
             file_stats,
-            [(659836, 2, 3001), (659550, 2, 3000), (659346, 2, 2999)]
+            [(657696, 2, 3001), (657410, 2, 3000), (657206, 2, 2999)]
         );
 
         tmpdir.close().unwrap();
diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml
index 411ca81032..140c43639e 100644
--- a/workspace_hack/Cargo.toml
+++ b/workspace_hack/Cargo.toml
@@ -60,7 +60,7 @@ num-bigint = { version = "0.4" }
 num-integer = { version = "0.1", features = ["i128"] }
 num-traits = { version = "0.2", features = ["i128", "libm"] }
 once_cell = { version = "1" }
-parquet = { git = "https://github.com/apache/arrow-rs", branch = "master", default-features = false, features = ["zstd"] }
+parquet = { version = "53", default-features = false, features = ["zstd"] }
 prost = { version = "0.11" }
 rand = { version = "0.8", features = ["small_rng"] }
 regex = { version = "1" }
@@ -116,7 +116,7 @@ num-bigint = { version = "0.4" }
 num-integer = { version = "0.1", features = ["i128"] }
 num-traits = { version = "0.2", features = ["i128", "libm"] }
 once_cell = { version = "1" }
-parquet = { git = "https://github.com/apache/arrow-rs", branch = "master", default-features = false, features = ["zstd"] }
+parquet = { version = "53", default-features = false, features = ["zstd"] }
 proc-macro2 = { version = "1" }
 prost = { version = "0.11" }
 quote = { version = "1" }

From 842be0ba74c4c6e4245c29c3fffae4401d282c4a Mon Sep 17 00:00:00 2001
From: Matthias van de Meent <matthias@neon.tech>
Date: Tue, 10 Sep 2024 00:01:52 +0200
Subject: [PATCH 043/142] Specialize WalIngest on PostgreSQL version (#8904)

The current code assumes that most of this functionality is
version-independent, which is only true up to v16 - PostgreSQL 17 has a
new field in CheckPoint that we need to keep track of.

This basically removes the file-level dependency on v14, and replaces it
with switches that load the correct version dependencies where required.
---
 libs/postgres_ffi/build.rs   |   1 +
 libs/postgres_ffi/src/lib.rs | 104 ++++++++++++++++
 pageserver/src/walingest.rs  | 235 +++++++++++++++++++++--------------
 3 files changed, 249 insertions(+), 91 deletions(-)

diff --git a/libs/postgres_ffi/build.rs b/libs/postgres_ffi/build.rs
index d3e3ce648f..a346390f3d 100644
--- a/libs/postgres_ffi/build.rs
+++ b/libs/postgres_ffi/build.rs
@@ -121,6 +121,7 @@ fn main() -> anyhow::Result<()> {
             .allowlist_type("XLogPageHeaderData")
             .allowlist_type("XLogLongPageHeaderData")
             .allowlist_var("XLOG_PAGE_MAGIC")
+            .allowlist_var("PG_MAJORVERSION_NUM")
             .allowlist_var("PG_CONTROL_FILE_SIZE")
             .allowlist_var("PG_CONTROLFILEDATA_OFFSETOF_CRC")
             .allowlist_type("PageHeaderData")
diff --git a/libs/postgres_ffi/src/lib.rs b/libs/postgres_ffi/src/lib.rs
index 9acb105e9b..f18e0c603b 100644
--- a/libs/postgres_ffi/src/lib.rs
+++ b/libs/postgres_ffi/src/lib.rs
@@ -44,6 +44,9 @@ macro_rules! postgres_ffi {
             // Re-export some symbols from bindings
             pub use bindings::DBState_DB_SHUTDOWNED;
             pub use bindings::{CheckPoint, ControlFileData, XLogRecord};
+
+            pub const ZERO_CHECKPOINT: bytes::Bytes =
+                bytes::Bytes::from_static(&[0u8; xlog_utils::SIZEOF_CHECKPOINT]);
         }
     };
 }
@@ -106,6 +109,107 @@ macro_rules! dispatch_pgversion {
     };
 }
 
+#[macro_export]
+macro_rules! enum_pgversion_dispatch {
+    ($name:expr, $typ:ident, $bind:ident, $code:block) => {
+        enum_pgversion_dispatch!(
+            name = $name,
+            bind = $bind,
+            typ = $typ,
+            code = $code,
+            pgversions = [
+                V14 : v14,
+                V15 : v15,
+                V16 : v16,
+            ]
+        )
+    };
+    (name = $name:expr,
+     bind = $bind:ident,
+     typ = $typ:ident,
+     code = $code:block,
+     pgversions = [$($variant:ident : $md:ident),+ $(,)?]) => {
+        match $name {
+            $(
+            self::$typ::$variant($bind) => {
+                use $crate::$md as pgv;
+                $code
+            }
+            ),+,
+        }
+    };
+}
+
+#[macro_export]
+macro_rules! enum_pgversion {
+    {$name:ident, pgv :: $t:ident} => {
+        enum_pgversion!{
+            name = $name,
+            typ = $t,
+            pgversions = [
+                V14 : v14,
+                V15 : v15,
+                V16 : v16,
+            ]
+        }
+    };
+    {$name:ident, pgv :: $p:ident :: $t:ident} => {
+        enum_pgversion!{
+            name = $name,
+            path = $p,
+            typ = $t,
+            pgversions = [
+                V14 : v14,
+                V15 : v15,
+                V16 : v16,
+            ]
+        }
+    };
+    {name = $name:ident,
+     typ = $t:ident,
+     pgversions = [$($variant:ident : $md:ident),+ $(,)?]} => {
+        pub enum $name {
+            $($variant ( $crate::$md::$t )),+
+        }
+        impl self::$name {
+            pub fn pg_version(&self) -> u32 {
+                enum_pgversion_dispatch!(self, $name, _ign, {
+                    pgv::bindings::PG_MAJORVERSION_NUM
+                })
+            }
+        }
+        $(
+        impl Into<self::$name> for $crate::$md::$t {
+            fn into(self) -> self::$name {
+                self::$name::$variant (self)
+            }
+        }
+        )+
+    };
+    {name = $name:ident,
+     path = $p:ident,
+     typ = $t:ident,
+     pgversions = [$($variant:ident : $md:ident),+ $(,)?]} => {
+        pub enum $name {
+            $($variant ($crate::$md::$p::$t)),+
+        }
+        impl $name {
+            pub fn pg_version(&self) -> u32 {
+                enum_pgversion_dispatch!(self, $name, _ign, {
+                    pgv::bindings::PG_MAJORVERSION_NUM
+                })
+            }
+        }
+        $(
+        impl Into<$name> for $crate::$md::$p::$t {
+            fn into(self) -> $name {
+                $name::$variant (self)
+            }
+        }
+        )+
+    };
+}
+
 pub mod pg_constants;
 pub mod relfile_utils;
 
diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs
index 2d3841881b..39bc9e385f 100644
--- a/pageserver/src/walingest.rs
+++ b/pageserver/src/walingest.rs
@@ -25,9 +25,7 @@ use std::time::Duration;
 use std::time::SystemTime;
 
 use pageserver_api::shard::ShardIdentity;
-use postgres_ffi::v14::nonrelfile_utils::clogpage_precedes;
-use postgres_ffi::v14::nonrelfile_utils::slru_may_delete_clogsegment;
-use postgres_ffi::TimestampTz;
+use postgres_ffi::{dispatch_pgversion, enum_pgversion, enum_pgversion_dispatch, TimestampTz};
 use postgres_ffi::{fsm_logical_to_physical, page_is_new, page_set_lsn};
 
 use anyhow::{bail, Context, Result};
@@ -48,16 +46,31 @@ use pageserver_api::key::rel_block_to_key;
 use pageserver_api::reltag::{BlockNumber, RelTag, SlruKind};
 use postgres_ffi::pg_constants;
 use postgres_ffi::relfile_utils::{FSM_FORKNUM, INIT_FORKNUM, MAIN_FORKNUM, VISIBILITYMAP_FORKNUM};
-use postgres_ffi::v14::nonrelfile_utils::mx_offset_to_member_segment;
-use postgres_ffi::v14::xlog_utils::*;
-use postgres_ffi::v14::CheckPoint;
 use postgres_ffi::TransactionId;
 use postgres_ffi::BLCKSZ;
+use utils::bin_ser::SerializeError;
 use utils::lsn::Lsn;
 
+enum_pgversion! {CheckPoint, pgv::CheckPoint}
+
+impl CheckPoint {
+    fn encode(&self) -> Result<Bytes, SerializeError> {
+        enum_pgversion_dispatch!(self, CheckPoint, cp, { cp.encode() })
+    }
+
+    fn update_next_xid(&mut self, xid: u32) -> bool {
+        enum_pgversion_dispatch!(self, CheckPoint, cp, { cp.update_next_xid(xid) })
+    }
+
+    pub fn update_next_multixid(&mut self, multi_xid: u32, multi_offset: u32) -> bool {
+        enum_pgversion_dispatch!(self, CheckPoint, cp, {
+            cp.update_next_multixid(multi_xid, multi_offset)
+        })
+    }
+}
+
 pub struct WalIngest {
     shard: ShardIdentity,
-    pg_version: u32,
     checkpoint: CheckPoint,
     checkpoint_modified: bool,
     warn_ingest_lag: WarnIngestLag,
@@ -78,12 +91,16 @@ impl WalIngest {
         // Fetch the latest checkpoint into memory, so that we can compare with it
         // quickly in `ingest_record` and update it when it changes.
         let checkpoint_bytes = timeline.get_checkpoint(startpoint, ctx).await?;
-        let checkpoint = CheckPoint::decode(&checkpoint_bytes)?;
-        trace!("CheckPoint.nextXid = {}", checkpoint.nextXid.value);
+        let pgversion = timeline.pg_version;
+
+        let checkpoint = dispatch_pgversion!(pgversion, {
+            let checkpoint = pgv::CheckPoint::decode(&checkpoint_bytes)?;
+            trace!("CheckPoint.nextXid = {}", checkpoint.nextXid.value);
+            <pgv::CheckPoint as Into<CheckPoint>>::into(checkpoint)
+        });
 
         Ok(WalIngest {
             shard: *timeline.get_shard_identity(),
-            pg_version: timeline.pg_version,
             checkpoint,
             checkpoint_modified: false,
             warn_ingest_lag: WarnIngestLag {
@@ -117,7 +134,7 @@ impl WalIngest {
 
         modification.set_lsn(lsn)?;
 
-        if decoded.is_dbase_create_copy(self.pg_version) {
+        if decoded.is_dbase_create_copy(pg_version) {
             // Records of this type should always be preceded by a commit(), as they
             // rely on reading data pages back from the Timeline.
             assert!(!modification.has_dirty_data_pages());
@@ -337,70 +354,67 @@ impl WalIngest {
             pg_constants::RM_XLOG_ID => {
                 let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK;
 
-                if info == pg_constants::XLOG_NEXTOID {
-                    let next_oid = buf.get_u32_le();
-                    if self.checkpoint.nextOid != next_oid {
-                        self.checkpoint.nextOid = next_oid;
+                enum_pgversion_dispatch!(&mut self.checkpoint, CheckPoint, cp, {
+                    if info == pg_constants::XLOG_NEXTOID {
+                        let next_oid = buf.get_u32_le();
+                        if cp.nextOid != next_oid {
+                            cp.nextOid = next_oid;
+                            self.checkpoint_modified = true;
+                        }
+                    } else if info == pg_constants::XLOG_CHECKPOINT_ONLINE
+                        || info == pg_constants::XLOG_CHECKPOINT_SHUTDOWN
+                    {
+                        let mut checkpoint_bytes = [0u8; pgv::xlog_utils::SIZEOF_CHECKPOINT];
+                        buf.copy_to_slice(&mut checkpoint_bytes);
+                        let xlog_checkpoint = pgv::CheckPoint::decode(&checkpoint_bytes)?;
+                        trace!(
+                            "xlog_checkpoint.oldestXid={}, checkpoint.oldestXid={}",
+                            xlog_checkpoint.oldestXid,
+                            cp.oldestXid
+                        );
+                        if (cp.oldestXid.wrapping_sub(xlog_checkpoint.oldestXid) as i32) < 0 {
+                            cp.oldestXid = xlog_checkpoint.oldestXid;
+                        }
+                        trace!(
+                            "xlog_checkpoint.oldestActiveXid={}, checkpoint.oldestActiveXid={}",
+                            xlog_checkpoint.oldestActiveXid,
+                            cp.oldestActiveXid
+                        );
+
+                        // A shutdown checkpoint has `oldestActiveXid == InvalidTransactionid`,
+                        // because at shutdown, all in-progress transactions will implicitly
+                        // end. Postgres startup code knows that, and allows hot standby to start
+                        // immediately from a shutdown checkpoint.
+                        //
+                        // In Neon, Postgres hot standby startup always behaves as if starting from
+                        // an online checkpoint. It needs a valid `oldestActiveXid` value, so
+                        // instead of overwriting self.checkpoint.oldestActiveXid with
+                        // InvalidTransactionid from the checkpoint WAL record, update it to a
+                        // proper value, knowing that there are no in-progress transactions at this
+                        // point, except for prepared transactions.
+                        //
+                        // See also the neon code changes in the InitWalRecovery() function.
+                        if xlog_checkpoint.oldestActiveXid == pg_constants::INVALID_TRANSACTION_ID
+                            && info == pg_constants::XLOG_CHECKPOINT_SHUTDOWN
+                        {
+                            let mut oldest_active_xid = cp.nextXid.value as u32;
+                            for xid in modification.tline.list_twophase_files(lsn, ctx).await? {
+                                if (xid.wrapping_sub(oldest_active_xid) as i32) < 0 {
+                                    oldest_active_xid = xid;
+                                }
+                            }
+                            cp.oldestActiveXid = oldest_active_xid;
+                        } else {
+                            cp.oldestActiveXid = xlog_checkpoint.oldestActiveXid;
+                        }
+
+                        // Write a new checkpoint key-value pair on every checkpoint record, even
+                        // if nothing really changed. Not strictly required, but it seems nice to
+                        // have some trace of the checkpoint records in the layer files at the same
+                        // LSNs.
                         self.checkpoint_modified = true;
                     }
-                } else if info == pg_constants::XLOG_CHECKPOINT_ONLINE
-                    || info == pg_constants::XLOG_CHECKPOINT_SHUTDOWN
-                {
-                    let mut checkpoint_bytes = [0u8; SIZEOF_CHECKPOINT];
-                    buf.copy_to_slice(&mut checkpoint_bytes);
-                    let xlog_checkpoint = CheckPoint::decode(&checkpoint_bytes)?;
-                    trace!(
-                        "xlog_checkpoint.oldestXid={}, checkpoint.oldestXid={}",
-                        xlog_checkpoint.oldestXid,
-                        self.checkpoint.oldestXid
-                    );
-                    if (self
-                        .checkpoint
-                        .oldestXid
-                        .wrapping_sub(xlog_checkpoint.oldestXid) as i32)
-                        < 0
-                    {
-                        self.checkpoint.oldestXid = xlog_checkpoint.oldestXid;
-                    }
-                    trace!(
-                        "xlog_checkpoint.oldestActiveXid={}, checkpoint.oldestActiveXid={}",
-                        xlog_checkpoint.oldestActiveXid,
-                        self.checkpoint.oldestActiveXid
-                    );
-
-                    // A shutdown checkpoint has `oldestActiveXid == InvalidTransactionid`,
-                    // because at shutdown, all in-progress transactions will implicitly
-                    // end. Postgres startup code knows that, and allows hot standby to start
-                    // immediately from a shutdown checkpoint.
-                    //
-                    // In Neon, Postgres hot standby startup always behaves as if starting from
-                    // an online checkpoint. It needs a valid `oldestActiveXid` value, so
-                    // instead of overwriting self.checkpoint.oldestActiveXid with
-                    // InvalidTransactionid from the checkpoint WAL record, update it to a
-                    // proper value, knowing that there are no in-progress transactions at this
-                    // point, except for prepared transactions.
-                    //
-                    // See also the neon code changes in the InitWalRecovery() function.
-                    if xlog_checkpoint.oldestActiveXid == pg_constants::INVALID_TRANSACTION_ID
-                        && info == pg_constants::XLOG_CHECKPOINT_SHUTDOWN
-                    {
-                        let mut oldest_active_xid = self.checkpoint.nextXid.value as u32;
-                        for xid in modification.tline.list_twophase_files(lsn, ctx).await? {
-                            if (xid.wrapping_sub(oldest_active_xid) as i32) < 0 {
-                                oldest_active_xid = xid;
-                            }
-                        }
-                        self.checkpoint.oldestActiveXid = oldest_active_xid;
-                    } else {
-                        self.checkpoint.oldestActiveXid = xlog_checkpoint.oldestActiveXid;
-                    }
-
-                    // Write a new checkpoint key-value pair on every checkpoint record, even
-                    // if nothing really changed. Not strictly required, but it seems nice to
-                    // have some trace of the checkpoint records in the layer files at the same
-                    // LSNs.
-                    self.checkpoint_modified = true;
-                }
+                });
             }
             pg_constants::RM_LOGICALMSG_ID => {
                 let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK;
@@ -424,7 +438,11 @@ impl WalIngest {
                 let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK;
                 if info == pg_constants::XLOG_RUNNING_XACTS {
                     let xlrec = crate::walrecord::XlRunningXacts::decode(&mut buf);
-                    self.checkpoint.oldestActiveXid = xlrec.oldest_running_xid;
+
+                    enum_pgversion_dispatch!(&mut self.checkpoint, CheckPoint, cp, {
+                        cp.oldestActiveXid = xlrec.oldest_running_xid;
+                    });
+
                     self.checkpoint_modified = true;
                 }
             }
@@ -539,7 +557,7 @@ impl WalIngest {
             && blk.has_image
             && decoded.xl_rmid == pg_constants::RM_XLOG_ID
             && (decoded.xl_info == pg_constants::XLOG_FPI
-                || decoded.xl_info == pg_constants::XLOG_FPI_FOR_HINT)
+            || decoded.xl_info == pg_constants::XLOG_FPI_FOR_HINT)
             // compression of WAL is not yet supported: fall back to storing the original WAL record
             && !postgres_ffi::bkpimage_is_compressed(blk.bimg_info, modification.tline.pg_version)
             // do not materialize null pages because them most likely be soon replaced with real data
@@ -1242,12 +1260,17 @@ impl WalIngest {
     fn warn_on_ingest_lag(
         &mut self,
         conf: &crate::config::PageServerConf,
-        wal_timestmap: TimestampTz,
+        wal_timestamp: TimestampTz,
     ) {
         debug_assert_current_span_has_tenant_and_timeline_id();
         let now = SystemTime::now();
         let rate_limits = &mut self.warn_ingest_lag;
-        match try_from_pg_timestamp(wal_timestmap) {
+
+        let ts = enum_pgversion_dispatch!(&self.checkpoint, CheckPoint, _cp, {
+            pgv::xlog_utils::try_from_pg_timestamp(wal_timestamp)
+        });
+
+        match ts {
             Ok(ts) => {
                 match now.duration_since(ts) {
                     Ok(lag) => {
@@ -1257,7 +1280,7 @@ impl WalIngest {
                                 warn!(%rate_limit_stats, %lag, "ingesting record with timestamp lagging more than wait_lsn_timeout");
                             })
                         }
-                    },
+                    }
                     Err(e) => {
                         let delta_t = e.duration();
                         // determined by prod victoriametrics query: 1000 * (timestamp(node_time_seconds{neon_service="pageserver"}) - node_time_seconds)
@@ -1271,7 +1294,6 @@ impl WalIngest {
                         }
                     }
                 };
-
             }
             Err(error) => {
                 rate_limits.timestamp_invalid_msg_ratelimit.call2(|rate_limit_stats| {
@@ -1379,14 +1401,17 @@ impl WalIngest {
         // truncated, but a checkpoint record with the updated values isn't written until
         // later. In Neon, a server can start at any LSN, not just on a checkpoint record,
         // so we keep the oldestXid and oldestXidDB up-to-date.
-        self.checkpoint.oldestXid = xlrec.oldest_xid;
-        self.checkpoint.oldestXidDB = xlrec.oldest_xid_db;
+        enum_pgversion_dispatch!(&mut self.checkpoint, CheckPoint, cp, {
+            cp.oldestXid = xlrec.oldest_xid;
+            cp.oldestXidDB = xlrec.oldest_xid_db;
+        });
         self.checkpoint_modified = true;
 
         // TODO Treat AdvanceOldestClogXid() or write a comment why we don't need it
 
         let latest_page_number =
-            self.checkpoint.nextXid.value as u32 / pg_constants::CLOG_XACTS_PER_PAGE;
+            enum_pgversion_dispatch!(self.checkpoint, CheckPoint, cp, { cp.nextXid.value }) as u32
+                / pg_constants::CLOG_XACTS_PER_PAGE;
 
         // Now delete all segments containing pages between xlrec.pageno
         // and latest_page_number.
@@ -1394,7 +1419,9 @@ impl WalIngest {
         // First, make an important safety check:
         // the current endpoint page must not be eligible for removal.
         // See SimpleLruTruncate() in slru.c
-        if clogpage_precedes(latest_page_number, xlrec.pageno) {
+        if dispatch_pgversion!(modification.tline.pg_version, {
+            pgv::nonrelfile_utils::clogpage_precedes(latest_page_number, xlrec.pageno)
+        }) {
             info!("could not truncate directory pg_xact apparent wraparound");
             return Ok(());
         }
@@ -1411,7 +1438,12 @@ impl WalIngest {
             .await?
         {
             let segpage = segno * pg_constants::SLRU_PAGES_PER_SEGMENT;
-            if slru_may_delete_clogsegment(segpage, xlrec.pageno) {
+
+            let may_delete = dispatch_pgversion!(modification.tline.pg_version, {
+                pgv::nonrelfile_utils::slru_may_delete_clogsegment(segpage, xlrec.pageno)
+            });
+
+            if may_delete {
                 modification
                     .drop_slru_segment(SlruKind::Clog, segno, ctx)
                     .await?;
@@ -1530,14 +1562,23 @@ impl WalIngest {
         xlrec: &XlMultiXactTruncate,
         ctx: &RequestContext,
     ) -> Result<()> {
-        self.checkpoint.oldestMulti = xlrec.end_trunc_off;
-        self.checkpoint.oldestMultiDB = xlrec.oldest_multi_db;
+        let (maxsegment, startsegment, endsegment) =
+            enum_pgversion_dispatch!(&mut self.checkpoint, CheckPoint, cp, {
+                cp.oldestMulti = xlrec.end_trunc_off;
+                cp.oldestMultiDB = xlrec.oldest_multi_db;
+                let maxsegment: i32 = pgv::nonrelfile_utils::mx_offset_to_member_segment(
+                    pg_constants::MAX_MULTIXACT_OFFSET,
+                );
+                let startsegment: i32 =
+                    pgv::nonrelfile_utils::mx_offset_to_member_segment(xlrec.start_trunc_memb);
+                let endsegment: i32 =
+                    pgv::nonrelfile_utils::mx_offset_to_member_segment(xlrec.end_trunc_memb);
+                (maxsegment, startsegment, endsegment)
+            });
+
         self.checkpoint_modified = true;
 
         // PerformMembersTruncation
-        let maxsegment: i32 = mx_offset_to_member_segment(pg_constants::MAX_MULTIXACT_OFFSET);
-        let startsegment: i32 = mx_offset_to_member_segment(xlrec.start_trunc_memb);
-        let endsegment: i32 = mx_offset_to_member_segment(xlrec.end_trunc_memb);
         let mut segment: i32 = startsegment;
 
         // Delete all the segments except the last one. The last segment can still
@@ -1811,11 +1852,23 @@ mod tests {
         // TODO
     }
 
-    static ZERO_CHECKPOINT: Bytes = Bytes::from_static(&[0u8; SIZEOF_CHECKPOINT]);
+    #[tokio::test]
+    async fn test_zeroed_checkpoint_decodes_correctly() -> Result<()> {
+        for i in 14..=16 {
+            dispatch_pgversion!(i, {
+                pgv::CheckPoint::decode(&pgv::ZERO_CHECKPOINT)?;
+            });
+        }
+
+        Ok(())
+    }
 
     async fn init_walingest_test(tline: &Timeline, ctx: &RequestContext) -> Result<WalIngest> {
         let mut m = tline.begin_modification(Lsn(0x10));
-        m.put_checkpoint(ZERO_CHECKPOINT.clone())?;
+        m.put_checkpoint(dispatch_pgversion!(
+            tline.pg_version,
+            pgv::ZERO_CHECKPOINT.clone()
+        ))?;
         m.put_relmap_file(0, 111, Bytes::from(""), ctx).await?; // dummy relmapper file
         m.commit(ctx).await?;
         let walingest = WalIngest::new(tline, Lsn(0x10), ctx).await?;

From 97582178cb576f8b68acc53535adf8918d7dbd94 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Arpad=20M=C3=BCller?= <arpad-m@users.noreply.github.com>
Date: Tue, 10 Sep 2024 02:40:00 +0200
Subject: [PATCH 044/142] Remove async_trait from the Handler trait (#8958)

Newest attempt to remove `async_trait` from the Handler trait.

Earlier attempts were in #7301 and #8296.
---
 libs/postgres_backend/src/lib.rs             |  5 +-
 libs/postgres_backend/tests/simple_select.rs |  1 -
 pageserver/src/page_service.rs               |  1 -
 proxy/src/console/mgmt.rs                    |  2 +-
 safekeeper/src/handler.rs                    | 80 ++++++++++----------
 5 files changed, 44 insertions(+), 45 deletions(-)

diff --git a/libs/postgres_backend/src/lib.rs b/libs/postgres_backend/src/lib.rs
index 600f1d728c..8ea4b93fb1 100644
--- a/libs/postgres_backend/src/lib.rs
+++ b/libs/postgres_backend/src/lib.rs
@@ -81,17 +81,16 @@ pub fn is_expected_io_error(e: &io::Error) -> bool {
     )
 }
 
-#[async_trait::async_trait]
 pub trait Handler<IO> {
     /// Handle single query.
     /// postgres_backend will issue ReadyForQuery after calling this (this
     /// might be not what we want after CopyData streaming, but currently we don't
     /// care). It will also flush out the output buffer.
-    async fn process_query(
+    fn process_query(
         &mut self,
         pgb: &mut PostgresBackend<IO>,
         query_string: &str,
-    ) -> Result<(), QueryError>;
+    ) -> impl Future<Output = Result<(), QueryError>>;
 
     /// Called on startup packet receival, allows to process params.
     ///
diff --git a/libs/postgres_backend/tests/simple_select.rs b/libs/postgres_backend/tests/simple_select.rs
index 7ec85f0dbe..900083ea7f 100644
--- a/libs/postgres_backend/tests/simple_select.rs
+++ b/libs/postgres_backend/tests/simple_select.rs
@@ -23,7 +23,6 @@ async fn make_tcp_pair() -> (TcpStream, TcpStream) {
 
 struct TestHandler {}
 
-#[async_trait::async_trait]
 impl<IO: AsyncRead + AsyncWrite + Unpin + Send> Handler<IO> for TestHandler {
     // return single col 'hey' for any query
     async fn process_query(
diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs
index 39c6a6fb74..9261b7481d 100644
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
@@ -1199,7 +1199,6 @@ impl PageServerHandler {
     }
 }
 
-#[async_trait::async_trait]
 impl<IO> postgres_backend::Handler<IO> for PageServerHandler
 where
     IO: AsyncRead + AsyncWrite + Send + Sync + Unpin,
diff --git a/proxy/src/console/mgmt.rs b/proxy/src/console/mgmt.rs
index 2ed4f5f206..ee5f83ee76 100644
--- a/proxy/src/console/mgmt.rs
+++ b/proxy/src/console/mgmt.rs
@@ -78,7 +78,7 @@ pub(crate) type ComputeReady = DatabaseInfo;
 
 // TODO: replace with an http-based protocol.
 struct MgmtHandler;
-#[async_trait::async_trait]
+
 impl postgres_backend::Handler<tokio::net::TcpStream> for MgmtHandler {
     async fn process_query(
         &mut self,
diff --git a/safekeeper/src/handler.rs b/safekeeper/src/handler.rs
index 2c519433ef..3f00b69cde 100644
--- a/safekeeper/src/handler.rs
+++ b/safekeeper/src/handler.rs
@@ -2,6 +2,7 @@
 //! protocol commands.
 
 use anyhow::Context;
+use std::future::Future;
 use std::str::{self, FromStr};
 use std::sync::Arc;
 use tokio::io::{AsyncRead, AsyncWrite};
@@ -95,7 +96,6 @@ fn cmd_to_string(cmd: &SafekeeperPostgresCommand) -> &str {
     }
 }
 
-#[async_trait::async_trait]
 impl<IO: AsyncRead + AsyncWrite + Unpin + Send> postgres_backend::Handler<IO>
     for SafekeeperPostgresHandler
 {
@@ -197,49 +197,51 @@ impl<IO: AsyncRead + AsyncWrite + Unpin + Send> postgres_backend::Handler<IO>
         Ok(())
     }
 
-    async fn process_query(
+    fn process_query(
         &mut self,
         pgb: &mut PostgresBackend<IO>,
         query_string: &str,
-    ) -> Result<(), QueryError> {
-        if query_string
-            .to_ascii_lowercase()
-            .starts_with("set datestyle to ")
-        {
-            // important for debug because psycopg2 executes "SET datestyle TO 'ISO'" on connect
-            pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
-            return Ok(());
-        }
-
-        let cmd = parse_cmd(query_string)?;
-        let cmd_str = cmd_to_string(&cmd);
-
-        let _guard = PG_QUERIES_GAUGE.with_label_values(&[cmd_str]).guard();
-
-        info!("got query {:?}", query_string);
-
-        let tenant_id = self.tenant_id.context("tenantid is required")?;
-        let timeline_id = self.timeline_id.context("timelineid is required")?;
-        self.check_permission(Some(tenant_id))?;
-        self.ttid = TenantTimelineId::new(tenant_id, timeline_id);
-
-        match cmd {
-            SafekeeperPostgresCommand::StartWalPush => {
-                self.handle_start_wal_push(pgb)
-                    .instrument(info_span!("WAL receiver"))
-                    .await
+    ) -> impl Future<Output = Result<(), QueryError>> {
+        Box::pin(async move {
+            if query_string
+                .to_ascii_lowercase()
+                .starts_with("set datestyle to ")
+            {
+                // important for debug because psycopg2 executes "SET datestyle TO 'ISO'" on connect
+                pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
+                return Ok(());
             }
-            SafekeeperPostgresCommand::StartReplication { start_lsn, term } => {
-                self.handle_start_replication(pgb, start_lsn, term)
-                    .instrument(info_span!("WAL sender"))
-                    .await
+
+            let cmd = parse_cmd(query_string)?;
+            let cmd_str = cmd_to_string(&cmd);
+
+            let _guard = PG_QUERIES_GAUGE.with_label_values(&[cmd_str]).guard();
+
+            info!("got query {:?}", query_string);
+
+            let tenant_id = self.tenant_id.context("tenantid is required")?;
+            let timeline_id = self.timeline_id.context("timelineid is required")?;
+            self.check_permission(Some(tenant_id))?;
+            self.ttid = TenantTimelineId::new(tenant_id, timeline_id);
+
+            match cmd {
+                SafekeeperPostgresCommand::StartWalPush => {
+                    self.handle_start_wal_push(pgb)
+                        .instrument(info_span!("WAL receiver"))
+                        .await
+                }
+                SafekeeperPostgresCommand::StartReplication { start_lsn, term } => {
+                    self.handle_start_replication(pgb, start_lsn, term)
+                        .instrument(info_span!("WAL sender"))
+                        .await
+                }
+                SafekeeperPostgresCommand::IdentifySystem => self.handle_identify_system(pgb).await,
+                SafekeeperPostgresCommand::TimelineStatus => self.handle_timeline_status(pgb).await,
+                SafekeeperPostgresCommand::JSONCtrl { ref cmd } => {
+                    handle_json_ctrl(self, pgb, cmd).await
+                }
             }
-            SafekeeperPostgresCommand::IdentifySystem => self.handle_identify_system(pgb).await,
-            SafekeeperPostgresCommand::TimelineStatus => self.handle_timeline_status(pgb).await,
-            SafekeeperPostgresCommand::JSONCtrl { ref cmd } => {
-                handle_json_ctrl(self, pgb, cmd).await
-            }
-        }
+        })
     }
 }
 

From 26b5fcdc5077e5f4051f27c2e2d8f82ac5038acb Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Tue, 10 Sep 2024 12:54:25 +0100
Subject: [PATCH 045/142] reinstate write-path key check (#8973)

## Problem

In https://github.com/neondatabase/neon/pull/8621, validation of keys
during ingest was removed because the places where we actually store
keys are now past the point where we have already converted them to
CompactKey (i128) representation.

## Summary of changes

Reinstate validation at an earlier stage in ingest. This doesn't cover
literally every place we write a key, but it covers most cases where
we're trusting postgres to give us a valid key (i.e. one that doesn't
try and use a custom spacenode).
---
 pageserver/src/pgdatadir_mapping.rs | 49 ++++++++++++++++++++++++-----
 pageserver/src/walingest.rs         |  8 ++---
 2 files changed, 46 insertions(+), 11 deletions(-)

diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs
index 808d4b666e..6dd8851b13 100644
--- a/pageserver/src/pgdatadir_mapping.rs
+++ b/pageserver/src/pgdatadir_mapping.rs
@@ -1205,6 +1205,13 @@ impl<'a> DatadirModification<'a> {
         img: Bytes,
     ) -> anyhow::Result<()> {
         anyhow::ensure!(rel.relnode != 0, RelationError::InvalidRelnode);
+        let key = rel_block_to_key(rel, blknum);
+        if !key.is_valid_key_on_write_path() {
+            anyhow::bail!(
+                "the request contains data not supported by pageserver at {}",
+                key
+            );
+        }
         self.put(rel_block_to_key(rel, blknum), Value::Image(img));
         Ok(())
     }
@@ -1216,14 +1223,34 @@ impl<'a> DatadirModification<'a> {
         blknum: BlockNumber,
         img: Bytes,
     ) -> anyhow::Result<()> {
-        self.put(slru_block_to_key(kind, segno, blknum), Value::Image(img));
+        let key = slru_block_to_key(kind, segno, blknum);
+        if !key.is_valid_key_on_write_path() {
+            anyhow::bail!(
+                "the request contains data not supported by pageserver at {}",
+                key
+            );
+        }
+        self.put(key, Value::Image(img));
         Ok(())
     }
 
-    pub(crate) fn put_rel_page_image_zero(&mut self, rel: RelTag, blknum: BlockNumber) {
-        self.pending_zero_data_pages
-            .insert(rel_block_to_key(rel, blknum).to_compact());
+    pub(crate) fn put_rel_page_image_zero(
+        &mut self,
+        rel: RelTag,
+        blknum: BlockNumber,
+    ) -> anyhow::Result<()> {
+        anyhow::ensure!(rel.relnode != 0, RelationError::InvalidRelnode);
+        let key = rel_block_to_key(rel, blknum);
+        if !key.is_valid_key_on_write_path() {
+            anyhow::bail!(
+                "the request contains data not supported by pageserver: {} @ {}",
+                key,
+                self.lsn
+            );
+        }
+        self.pending_zero_data_pages.insert(key.to_compact());
         self.pending_bytes += ZERO_PAGE.len();
+        Ok(())
     }
 
     pub(crate) fn put_slru_page_image_zero(
@@ -1231,10 +1258,18 @@ impl<'a> DatadirModification<'a> {
         kind: SlruKind,
         segno: u32,
         blknum: BlockNumber,
-    ) {
-        self.pending_zero_data_pages
-            .insert(slru_block_to_key(kind, segno, blknum).to_compact());
+    ) -> anyhow::Result<()> {
+        let key = slru_block_to_key(kind, segno, blknum);
+        if !key.is_valid_key_on_write_path() {
+            anyhow::bail!(
+                "the request contains data not supported by pageserver: {} @ {}",
+                key,
+                self.lsn
+            );
+        }
+        self.pending_zero_data_pages.insert(key.to_compact());
         self.pending_bytes += ZERO_PAGE.len();
+        Ok(())
     }
 
     /// Call this at the end of each WAL record.
diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs
index 39bc9e385f..6e15ad81c3 100644
--- a/pageserver/src/walingest.rs
+++ b/pageserver/src/walingest.rs
@@ -1222,7 +1222,7 @@ impl WalIngest {
             if rec.blkno % pg_constants::SLOTS_PER_FSM_PAGE != 0 {
                 // Tail of last remaining FSM page has to be zeroed.
                 // We are not precise here and instead of digging in FSM bitmap format just clear the whole page.
-                modification.put_rel_page_image_zero(rel, fsm_physical_page_no);
+                modification.put_rel_page_image_zero(rel, fsm_physical_page_no)?;
                 fsm_physical_page_no += 1;
             }
             let nblocks = get_relsize(modification, rel, ctx).await?;
@@ -1244,7 +1244,7 @@ impl WalIngest {
             if rec.blkno % pg_constants::VM_HEAPBLOCKS_PER_PAGE != 0 {
                 // Tail of last remaining vm page has to be zeroed.
                 // We are not precise here and instead of digging in VM bitmap format just clear the whole page.
-                modification.put_rel_page_image_zero(rel, vm_page_no);
+                modification.put_rel_page_image_zero(rel, vm_page_no)?;
                 vm_page_no += 1;
             }
             let nblocks = get_relsize(modification, rel, ctx).await?;
@@ -1737,7 +1737,7 @@ impl WalIngest {
                     continue;
                 }
 
-                modification.put_rel_page_image_zero(rel, gap_blknum);
+                modification.put_rel_page_image_zero(rel, gap_blknum)?;
             }
         }
         Ok(())
@@ -1803,7 +1803,7 @@ impl WalIngest {
 
             // fill the gap with zeros
             for gap_blknum in old_nblocks..blknum {
-                modification.put_slru_page_image_zero(kind, segno, gap_blknum);
+                modification.put_slru_page_image_zero(kind, segno, gap_blknum)?;
             }
         }
         Ok(())

From bae793ffcd90470b26380053fe931a91545798a5 Mon Sep 17 00:00:00 2001
From: Folke Behrens <folke@neon.tech>
Date: Tue, 10 Sep 2024 15:36:08 +0200
Subject: [PATCH 046/142] proxy: Handle all let underscore instances (#8898)

* Most can be simply replaced
* One instance renamed to _rtchk (return-type check)
---
 proxy/src/cache/endpoints.rs                     |  2 +-
 proxy/src/console/messages.rs                    | 16 ++++++++--------
 proxy/src/context.rs                             | 10 +++++++---
 proxy/src/context/parquet.rs                     |  2 +-
 proxy/src/lib.rs                                 |  2 +-
 proxy/src/proxy/tests.rs                         |  2 +-
 .../connection_with_credentials_provider.rs      |  7 +++++--
 proxy/src/stream.rs                              | 15 +++++++++------
 proxy/src/url.rs                                 |  2 +-
 9 files changed, 34 insertions(+), 24 deletions(-)

diff --git a/proxy/src/cache/endpoints.rs b/proxy/src/cache/endpoints.rs
index f4762232d8..27121ce89e 100644
--- a/proxy/src/cache/endpoints.rs
+++ b/proxy/src/cache/endpoints.rs
@@ -242,6 +242,6 @@ mod tests {
     #[test]
     fn test() {
         let s = "{\"branch_created\":null,\"endpoint_created\":{\"endpoint_id\":\"ep-rapid-thunder-w0qqw2q9\"},\"project_created\":null,\"type\":\"endpoint_created\"}";
-        let _: ControlPlaneEventKey = serde_json::from_str(s).unwrap();
+        serde_json::from_str::<ControlPlaneEventKey>(s).unwrap();
     }
 }
diff --git a/proxy/src/console/messages.rs b/proxy/src/console/messages.rs
index a48c7316f6..9b66333cd4 100644
--- a/proxy/src/console/messages.rs
+++ b/proxy/src/console/messages.rs
@@ -395,7 +395,7 @@ mod tests {
                 }
             }
         });
-        let _: KickSession<'_> = serde_json::from_str(&json.to_string())?;
+        serde_json::from_str::<KickSession<'_>>(&json.to_string())?;
 
         Ok(())
     }
@@ -403,7 +403,7 @@ mod tests {
     #[test]
     fn parse_db_info() -> anyhow::Result<()> {
         // with password
-        let _: DatabaseInfo = serde_json::from_value(json!({
+        serde_json::from_value::<DatabaseInfo>(json!({
             "host": "localhost",
             "port": 5432,
             "dbname": "postgres",
@@ -413,7 +413,7 @@ mod tests {
         }))?;
 
         // without password
-        let _: DatabaseInfo = serde_json::from_value(json!({
+        serde_json::from_value::<DatabaseInfo>(json!({
             "host": "localhost",
             "port": 5432,
             "dbname": "postgres",
@@ -422,7 +422,7 @@ mod tests {
         }))?;
 
         // new field (forward compatibility)
-        let _: DatabaseInfo = serde_json::from_value(json!({
+        serde_json::from_value::<DatabaseInfo>(json!({
             "host": "localhost",
             "port": 5432,
             "dbname": "postgres",
@@ -441,7 +441,7 @@ mod tests {
             "address": "0.0.0.0",
             "aux": dummy_aux(),
         });
-        let _: WakeCompute = serde_json::from_str(&json.to_string())?;
+        serde_json::from_str::<WakeCompute>(&json.to_string())?;
         Ok(())
     }
 
@@ -451,18 +451,18 @@ mod tests {
         let json = json!({
             "role_secret": "secret",
         });
-        let _: GetRoleSecret = serde_json::from_str(&json.to_string())?;
+        serde_json::from_str::<GetRoleSecret>(&json.to_string())?;
         let json = json!({
             "role_secret": "secret",
             "allowed_ips": ["8.8.8.8"],
         });
-        let _: GetRoleSecret = serde_json::from_str(&json.to_string())?;
+        serde_json::from_str::<GetRoleSecret>(&json.to_string())?;
         let json = json!({
             "role_secret": "secret",
             "allowed_ips": ["8.8.8.8"],
             "project_id": "project",
         });
-        let _: GetRoleSecret = serde_json::from_str(&json.to_string())?;
+        serde_json::from_str::<GetRoleSecret>(&json.to_string())?;
 
         Ok(())
     }
diff --git a/proxy/src/context.rs b/proxy/src/context.rs
index 72e1fa1cee..c013218ad9 100644
--- a/proxy/src/context.rs
+++ b/proxy/src/context.rs
@@ -6,7 +6,7 @@ use pq_proto::StartupMessageParams;
 use smol_str::SmolStr;
 use std::net::IpAddr;
 use tokio::sync::mpsc;
-use tracing::{field::display, info, info_span, Span};
+use tracing::{debug, field::display, info, info_span, Span};
 use try_lock::TryLock;
 use uuid::Uuid;
 
@@ -362,7 +362,9 @@ impl RequestMonitoringInner {
                 });
         }
         if let Some(tx) = self.sender.take() {
-            let _: Result<(), _> = tx.send(RequestData::from(&*self));
+            tx.send(RequestData::from(&*self))
+                .inspect_err(|e| debug!("tx send failed: {e}"))
+                .ok();
         }
     }
 
@@ -371,7 +373,9 @@ impl RequestMonitoringInner {
         // Here we log the length of the session.
         self.disconnect_timestamp = Some(Utc::now());
         if let Some(tx) = self.disconnect_sender.take() {
-            let _: Result<(), _> = tx.send(RequestData::from(&*self));
+            tx.send(RequestData::from(&*self))
+                .inspect_err(|e| debug!("tx send failed: {e}"))
+                .ok();
         }
     }
 }
diff --git a/proxy/src/context/parquet.rs b/proxy/src/context/parquet.rs
index fafea2a08f..9f6f83022e 100644
--- a/proxy/src/context/parquet.rs
+++ b/proxy/src/context/parquet.rs
@@ -290,7 +290,7 @@ async fn worker_inner(
     }
 
     if !w.flushed_row_groups().is_empty() {
-        let _: Writer<BytesMut> = upload_parquet(w, len, &storage).await?;
+        let _rtchk: Writer<BytesMut> = upload_parquet(w, len, &storage).await?;
     }
 
     Ok(())
diff --git a/proxy/src/lib.rs b/proxy/src/lib.rs
index 923d6ae288..0070839aa8 100644
--- a/proxy/src/lib.rs
+++ b/proxy/src/lib.rs
@@ -3,7 +3,7 @@
 #![deny(
     deprecated,
     future_incompatible,
-    // TODO: consider let_underscore
+    let_underscore,
     nonstandard_style,
     rust_2024_compatibility
 )]
diff --git a/proxy/src/proxy/tests.rs b/proxy/src/proxy/tests.rs
index 4264dbae0f..752d982726 100644
--- a/proxy/src/proxy/tests.rs
+++ b/proxy/src/proxy/tests.rs
@@ -268,7 +268,7 @@ async fn keepalive_is_inherited() -> anyhow::Result<()> {
         anyhow::Ok(keepalive)
     });
 
-    let _ = TcpStream::connect(("127.0.0.1", port)).await?;
+    TcpStream::connect(("127.0.0.1", port)).await?;
     assert!(t.await??, "keepalive should be inherited");
 
     Ok(())
diff --git a/proxy/src/redis/connection_with_credentials_provider.rs b/proxy/src/redis/connection_with_credentials_provider.rs
index 7d222e2dec..2de66b58b1 100644
--- a/proxy/src/redis/connection_with_credentials_provider.rs
+++ b/proxy/src/redis/connection_with_credentials_provider.rs
@@ -6,7 +6,7 @@ use redis::{
     ConnectionInfo, IntoConnectionInfo, RedisConnectionInfo, RedisResult,
 };
 use tokio::task::JoinHandle;
-use tracing::{error, info};
+use tracing::{debug, error, info};
 
 use super::elasticache::CredentialsProvider;
 
@@ -109,7 +109,10 @@ impl ConnectionWithCredentialsProvider {
             let credentials_provider = credentials_provider.clone();
             let con2 = con.clone();
             let f = tokio::spawn(async move {
-                let _ = Self::keep_connection(con2, credentials_provider).await;
+                Self::keep_connection(con2, credentials_provider)
+                    .await
+                    .inspect_err(|e| debug!("keep_connection failed: {e}"))
+                    .ok();
             });
             self.refresh_token_task = Some(f);
         }
diff --git a/proxy/src/stream.rs b/proxy/src/stream.rs
index c14dd18afe..e2fc73235e 100644
--- a/proxy/src/stream.rs
+++ b/proxy/src/stream.rs
@@ -12,6 +12,7 @@ use std::{io, task};
 use thiserror::Error;
 use tokio::io::{AsyncRead, AsyncWrite, ReadBuf};
 use tokio_rustls::server::TlsStream;
+use tracing::debug;
 
 /// Stream wrapper which implements libpq's protocol.
 ///
@@ -138,9 +139,10 @@ impl<S: AsyncWrite + Unpin> PqStream<S> {
         );
 
         // already error case, ignore client IO error
-        let _: Result<_, std::io::Error> = self
-            .write_message(&BeMessage::ErrorResponse(msg, None))
-            .await;
+        self.write_message(&BeMessage::ErrorResponse(msg, None))
+            .await
+            .inspect_err(|e| debug!("write_message failed: {e}"))
+            .ok();
 
         Err(ReportedError {
             source: anyhow::anyhow!(msg),
@@ -164,9 +166,10 @@ impl<S: AsyncWrite + Unpin> PqStream<S> {
         );
 
         // already error case, ignore client IO error
-        let _: Result<_, std::io::Error> = self
-            .write_message(&BeMessage::ErrorResponse(&msg, None))
-            .await;
+        self.write_message(&BeMessage::ErrorResponse(&msg, None))
+            .await
+            .inspect_err(|e| debug!("write_message failed: {e}"))
+            .ok();
 
         Err(ReportedError {
             source: anyhow::anyhow!(error),
diff --git a/proxy/src/url.rs b/proxy/src/url.rs
index 28ac7efdfc..270cd7c24d 100644
--- a/proxy/src/url.rs
+++ b/proxy/src/url.rs
@@ -57,7 +57,7 @@ mod tests {
     fn bad_url() {
         let url = "test:foobar";
         url.parse::<url::Url>().expect("unexpected parsing failure");
-        let _ = url.parse::<ApiUrl>().expect_err("should not parse");
+        url.parse::<ApiUrl>().expect_err("should not parse");
     }
 
     #[test]

From cb060548fb2115ca6a57a95c6c947c45fc2095a6 Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Wed, 11 Sep 2024 18:45:34 +0100
Subject: [PATCH 047/142] libs: tweak PageserverUtilization::is_overloaded
 (#8946)

## Problem

Having run in production for a while, we see that nodes are generally
safely oversubscribed by about a factor of 2.

## Summary of changes

Tweak the is_overloaded method to check for utilization over 200%
rather than over 100%
---
 libs/pageserver_api/src/models/utilization.rs | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

diff --git a/libs/pageserver_api/src/models/utilization.rs b/libs/pageserver_api/src/models/utilization.rs
index 844a0cda5d..641aa51989 100644
--- a/libs/pageserver_api/src/models/utilization.rs
+++ b/libs/pageserver_api/src/models/utilization.rs
@@ -89,8 +89,19 @@ impl PageserverUtilization {
 
     /// If a node is currently hosting more work than it can comfortably handle.  This does not indicate that
     /// it will fail, but it is a strong signal that more work should not be added unless there is no alternative.
+    ///
+    /// When a node is overloaded, we may override soft affinity preferences and do things like scheduling
+    /// into a node in a less desirable AZ, if all the nodes in the preferred AZ are overloaded.
     pub fn is_overloaded(score: RawScore) -> bool {
-        score >= Self::UTILIZATION_FULL
+        // Why the factor of two?  This is unscientific but reflects behavior of real systems:
+        // - In terms of shard counts, a node's preferred max count is a soft limit intended to keep
+        //   startup and housekeeping jobs nice and responsive.  We can go to double this limit if needed
+        //   until some more nodes are deployed.
+        // - In terms of disk space, the node's utilization heuristic assumes every tenant needs to
+        //   hold its biggest timeline fully on disk, which tends to be an overestimate when
+        //   some tenants are very idle and have dropped layers from disk.  In practice going up to
+        //   double is generally better than giving up and scheduling in a sub-optimal AZ.
+        score >= 2 * Self::UTILIZATION_FULL
     }
 
     pub fn adjust_shard_count_max(&mut self, shard_count: u32) {

From 43846b72fa488b96d37bbc40d691bbf4e4f8fdd3 Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki@neon.tech>
Date: Sun, 8 Sep 2024 21:40:30 +0300
Subject: [PATCH 048/142] Remove unused "neon_local init --pg-version" arg

It has been unused since commit 8712e1899e, when it stopped creating
the initial timeline.
---
 control_plane/src/bin/neon_local.rs | 1 -
 1 file changed, 1 deletion(-)

diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs
index 1d66532d49..af6545f8d2 100644
--- a/control_plane/src/bin/neon_local.rs
+++ b/control_plane/src/bin/neon_local.rs
@@ -1570,7 +1570,6 @@ fn cli() -> Command {
                         .value_parser(value_parser!(PathBuf))
                         .value_name("config")
                 )
-                .arg(pg_version_arg.clone())
                 .arg(force_arg)
         )
         .subcommand(

From aeca15008c15b211d74536439ff701e533a412ef Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki@neon.tech>
Date: Wed, 11 Sep 2024 10:55:41 +0300
Subject: [PATCH 049/142] Remove obsolete and misleading comment

The tenant ID was not actually generated here but in NeonEnvBuilder.
And the "neon_local init" command hasn't been able to generate the
initial tenant since 8712e1899e anyway.
---
 test_runner/fixtures/neon_fixtures.py | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index 60887b9aed..22472559f4 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -1073,9 +1073,6 @@ class NeonEnv:
         self.pg_distrib_dir = config.pg_distrib_dir
         self.endpoint_counter = 0
         self.storage_controller_config = config.storage_controller_config
-
-        # generate initial tenant ID here instead of letting 'neon init' generate it,
-        # so that we don't need to dig it out of the config file afterwards.
         self.initial_tenant = config.initial_tenant
         self.initial_timeline = config.initial_timeline
 

From 0a363c3dce2fbd76f6483f02c2273e3f7b205b3e Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki@neon.tech>
Date: Wed, 11 Sep 2024 12:38:31 +0300
Subject: [PATCH 050/142] Add --timeline-id option to "neon_local timeline
 branch" command

Makes it consistent with the "timeline create" and "timeline import"
commands, which allowed you to pass the timeline id as argument. This
also makes it unnecessary to parse the timeline ID from the output in
the python function that calls it.
---
 control_plane/src/bin/neon_local.rs   |  4 ++-
 test_runner/fixtures/neon_fixtures.py | 43 ++++++++++-----------------
 2 files changed, 18 insertions(+), 29 deletions(-)

diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs
index af6545f8d2..144cd647c9 100644
--- a/control_plane/src/bin/neon_local.rs
+++ b/control_plane/src/bin/neon_local.rs
@@ -640,6 +640,8 @@ async fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::Local
         }
         Some(("branch", branch_match)) => {
             let tenant_id = get_tenant_id(branch_match, env)?;
+            let new_timeline_id =
+                parse_timeline_id(branch_match)?.unwrap_or(TimelineId::generate());
             let new_branch_name = branch_match
                 .get_one::<String>("branch-name")
                 .ok_or_else(|| anyhow!("No branch name provided"))?;
@@ -658,7 +660,6 @@ async fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::Local
                 .map(|lsn_str| Lsn::from_str(lsn_str))
                 .transpose()
                 .context("Failed to parse ancestor start Lsn from the request")?;
-            let new_timeline_id = TimelineId::generate();
             let storage_controller = StorageController::from_env(env);
             let create_req = TimelineCreateRequest {
                 new_timeline_id,
@@ -1582,6 +1583,7 @@ fn cli() -> Command {
             .subcommand(Command::new("branch")
                 .about("Create a new timeline, using another timeline as a base, copying its data")
                 .arg(tenant_id_arg.clone())
+                .arg(timeline_id_arg.clone())
                 .arg(branch_name_arg.clone())
                 .arg(Arg::new("ancestor-branch-name").long("ancestor-branch-name")
                     .help("Use last Lsn of another timeline (and its data) as base when creating the new timeline. The timeline gets resolved by its branch name.").required(false))
diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index 22472559f4..1c33d14154 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -1518,14 +1518,6 @@ class PageserverPort:
     http: int
 
 
-CREATE_TIMELINE_ID_EXTRACTOR: re.Pattern = re.compile(  # type: ignore[type-arg]
-    r"^Created timeline '(?P<timeline_id>[^']+)'", re.MULTILINE
-)
-TIMELINE_DATA_EXTRACTOR: re.Pattern = re.compile(  # type: ignore[type-arg]
-    r"\s?(?P<branch_name>[^\s]+)\s\[(?P<timeline_id>[^\]]+)\]", re.MULTILINE
-)
-
-
 class AbstractNeonCli(abc.ABC):
     """
     A typed wrapper around an arbitrary Neon CLI tool.
@@ -1754,6 +1746,9 @@ class NeonCli(AbstractNeonCli):
         tenant_id: Optional[TenantId] = None,
         timeline_id: Optional[TimelineId] = None,
     ) -> TimelineId:
+        if timeline_id is None:
+            timeline_id = TimelineId.generate()
+
         cmd = [
             "timeline",
             "create",
@@ -1761,23 +1756,16 @@ class NeonCli(AbstractNeonCli):
             new_branch_name,
             "--tenant-id",
             str(tenant_id or self.env.initial_tenant),
+            "--timeline-id",
+            str(timeline_id),
             "--pg-version",
             self.env.pg_version,
         ]
 
-        if timeline_id is not None:
-            cmd.extend(["--timeline-id", str(timeline_id)])
-
         res = self.raw_cli(cmd)
         res.check_returncode()
 
-        matches = CREATE_TIMELINE_ID_EXTRACTOR.search(res.stdout)
-
-        created_timeline_id = None
-        if matches is not None:
-            created_timeline_id = matches.group("timeline_id")
-
-        return TimelineId(str(created_timeline_id))
+        return timeline_id
 
     def create_branch(
         self,
@@ -1785,12 +1773,17 @@ class NeonCli(AbstractNeonCli):
         ancestor_branch_name: Optional[str] = None,
         tenant_id: Optional[TenantId] = None,
         ancestor_start_lsn: Optional[Lsn] = None,
+        new_timeline_id: Optional[TimelineId] = None,
     ) -> TimelineId:
+        if new_timeline_id is None:
+            new_timeline_id = TimelineId.generate()
         cmd = [
             "timeline",
             "branch",
             "--branch-name",
             new_branch_name,
+            "--timeline-id",
+            str(new_timeline_id),
             "--tenant-id",
             str(tenant_id or self.env.initial_tenant),
         ]
@@ -1802,16 +1795,7 @@ class NeonCli(AbstractNeonCli):
         res = self.raw_cli(cmd)
         res.check_returncode()
 
-        matches = CREATE_TIMELINE_ID_EXTRACTOR.search(res.stdout)
-
-        created_timeline_id = None
-        if matches is not None:
-            created_timeline_id = matches.group("timeline_id")
-
-        if created_timeline_id is None:
-            raise Exception("could not find timeline id after `neon timeline create` invocation")
-        else:
-            return TimelineId(str(created_timeline_id))
+        return TimelineId(str(new_timeline_id))
 
     def list_timelines(self, tenant_id: Optional[TenantId] = None) -> List[Tuple[str, TimelineId]]:
         """
@@ -1820,6 +1804,9 @@ class NeonCli(AbstractNeonCli):
 
         # main [b49f7954224a0ad25cc0013ea107b54b]
         # ┣━ @0/16B5A50: test_cli_branch_list_main [20f98c79111b9015d84452258b7d5540]
+        TIMELINE_DATA_EXTRACTOR: re.Pattern = re.compile(  # type: ignore[type-arg]
+            r"\s?(?P<branch_name>[^\s]+)\s\[(?P<timeline_id>[^\]]+)\]", re.MULTILINE
+        )
         res = self.raw_cli(
             ["timeline", "list", "--tenant-id", str(tenant_id or self.env.initial_tenant)]
         )

From 8dc069037b003e63c77683670be4e965384e794b Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki@neon.tech>
Date: Wed, 11 Sep 2024 15:01:34 +0300
Subject: [PATCH 051/142] Remove NeonEnvBuilder.start() function

It feels wrong to me to start() from the builder object. Surely the
thing you start is the environment itself, not its configuration.
---
 test_runner/fixtures/neon_fixtures.py                    | 6 +-----
 test_runner/performance/test_storage_controller_scale.py | 2 +-
 test_runner/regress/test_compatibility.py                | 4 ++--
 test_runner/regress/test_sharding.py                     | 6 +++---
 4 files changed, 7 insertions(+), 11 deletions(-)

diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index 1c33d14154..ee62372871 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -553,10 +553,6 @@ class NeonEnvBuilder:
         self.env = NeonEnv(self)
         return self.env
 
-    def start(self):
-        assert self.env is not None, "environment is not already initialized, call init() first"
-        self.env.start()
-
     def init_start(
         self,
         initial_tenant_conf: Optional[Dict[str, Any]] = None,
@@ -572,7 +568,7 @@ class NeonEnvBuilder:
         Configuring pageserver with remote storage is now the default. There will be a warning if pageserver is created without one.
         """
         env = self.init_configs(default_remote_storage_if_missing=default_remote_storage_if_missing)
-        self.start()
+        env.start()
 
         # Prepare the default branch to start the postgres on later.
         # Pageserver itself does not create tenants and timelines, until started first and asked via HTTP API.
diff --git a/test_runner/performance/test_storage_controller_scale.py b/test_runner/performance/test_storage_controller_scale.py
index 297aedfbed..a186bbaceb 100644
--- a/test_runner/performance/test_storage_controller_scale.py
+++ b/test_runner/performance/test_storage_controller_scale.py
@@ -84,7 +84,7 @@ def test_storage_controller_many_tenants(
     compute_reconfigure_listener.register_on_notify(lambda body: time.sleep(0.01))
 
     env = neon_env_builder.init_configs()
-    neon_env_builder.start()
+    env.start()
 
     # We will intentionally stress reconciler concurrrency, which triggers a warning when lots
     # of shards are hitting the delayed path.
diff --git a/test_runner/regress/test_compatibility.py b/test_runner/regress/test_compatibility.py
index 467e5b1734..b559be5f18 100644
--- a/test_runner/regress/test_compatibility.py
+++ b/test_runner/regress/test_compatibility.py
@@ -178,7 +178,7 @@ def test_backward_compatibility(
         neon_env_builder.num_safekeepers = 3
         env = neon_env_builder.from_repo_dir(compatibility_snapshot_dir / "repo")
         env.pageserver.allowed_errors.append(ingest_lag_log_line)
-        neon_env_builder.start()
+        env.start()
 
         check_neon_works(
             env,
@@ -265,7 +265,7 @@ def test_forward_compatibility(
         # does not include logs from previous runs
         assert not env.pageserver.log_contains("git-env:" + prev_pageserver_version)
 
-        neon_env_builder.start()
+        env.start()
 
         # ensure the specified pageserver is running
         assert env.pageserver.log_contains("git-env:" + prev_pageserver_version)
diff --git a/test_runner/regress/test_sharding.py b/test_runner/regress/test_sharding.py
index bfd82242e9..4a84dca399 100644
--- a/test_runner/regress/test_sharding.py
+++ b/test_runner/regress/test_sharding.py
@@ -374,7 +374,7 @@ def test_sharding_split_smoke(
     non_default_tenant_config = {"gc_horizon": 77 * 1024 * 1024}
 
     env = neon_env_builder.init_configs(True)
-    neon_env_builder.start()
+    env.start()
     tenant_id = TenantId.generate()
     timeline_id = TimelineId.generate()
     env.neon_cli.create_tenant(
@@ -1436,7 +1436,7 @@ def test_sharding_unlogged_relation(neon_env_builder: NeonEnvBuilder):
 
     neon_env_builder.num_pageservers = 2
     env = neon_env_builder.init_configs()
-    neon_env_builder.start()
+    env.start()
 
     tenant_id = TenantId.generate()
     timeline_id = TimelineId.generate()
@@ -1475,7 +1475,7 @@ def test_top_tenants(neon_env_builder: NeonEnvBuilder):
     """
 
     env = neon_env_builder.init_configs()
-    neon_env_builder.start()
+    env.start()
 
     tenants = []
     n_tenants = 8

From 9e3ead3689b012d344afbac4fcbf000372bb9969 Mon Sep 17 00:00:00 2001
From: Tristan Partin <tristan@neon.tech>
Date: Wed, 11 Sep 2024 18:43:42 +0100
Subject: [PATCH 052/142] Collect the last of on-demand WAL download in
 CreateReplicationSlot reverts

Signed-off-by: Tristan Partin <tristan@neon.tech>
---
 vendor/postgres-v15   | 2 +-
 vendor/postgres-v16   | 2 +-
 vendor/revisions.json | 4 ++--
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/vendor/postgres-v15 b/vendor/postgres-v15
index 49d5e576a5..6f6d77fb59 160000
--- a/vendor/postgres-v15
+++ b/vendor/postgres-v15
@@ -1 +1 @@
-Subproject commit 49d5e576a56e4cc59cd6a6a0791b2324b9fa675e
+Subproject commit 6f6d77fb5960602fcd3fd130aca9f99ecb1619c9
diff --git a/vendor/postgres-v16 b/vendor/postgres-v16
index 6e9a4ff624..0baa7346df 160000
--- a/vendor/postgres-v16
+++ b/vendor/postgres-v16
@@ -1 +1 @@
-Subproject commit 6e9a4ff6249ac02b8175054b7b3f7dfb198be48b
+Subproject commit 0baa7346dfd42d61912eeca554c9bb0a190f0a1e
diff --git a/vendor/revisions.json b/vendor/revisions.json
index e52576e61f..3a65a507f3 100644
--- a/vendor/revisions.json
+++ b/vendor/revisions.json
@@ -1,11 +1,11 @@
 {
   "v16": [
     "16.4",
-    "6e9a4ff6249ac02b8175054b7b3f7dfb198be48b"
+    "0baa7346dfd42d61912eeca554c9bb0a190f0a1e"
   ],
   "v15": [
     "15.8",
-    "49d5e576a56e4cc59cd6a6a0791b2324b9fa675e"
+    "6f6d77fb5960602fcd3fd130aca9f99ecb1619c9"
   ],
   "v14": [
     "14.13",

From fcab61bdcd9e30f2e2f6ce5be59e34bb98068f2f Mon Sep 17 00:00:00 2001
From: Stefan Radig <stefan@neon.tech>
Date: Thu, 12 Sep 2024 15:55:12 +0100
Subject: [PATCH 053/142] Prototype implementation for private access poc
 (#8976)

## Problem
For the Private Access POC we want users to be able to disable access
from the public proxy. To limit the number of changes this can be done
by configuring an IP allowlist [ "255.255.255.255" ]. For the Private
Access proxy, a new command-line flag disables the IP allowlist check
completely.

See
https://www.notion.so/neondatabase/Neon-Private-Access-POC-Proposal-8f707754e1ab4190ad5709da7832f020?d=887495c15e884aa4973f973a8a0a582a#7ac6ec249b524a74adbeddc4b84b8f5f
for details about the POC.

## Summary of changes
- Adding the command-line flag is_private_access_proxy=true will disable
the IP allowlist check
---
 proxy/src/auth/backend.rs       |  5 ++++-
 proxy/src/auth/credentials.rs   | 13 +++++++++++++
 proxy/src/bin/local_proxy.rs    |  1 +
 proxy/src/bin/proxy.rs          |  5 +++++
 proxy/src/config.rs             |  1 +
 proxy/src/serverless/backend.rs |  4 +++-
 6 files changed, 27 insertions(+), 2 deletions(-)

diff --git a/proxy/src/auth/backend.rs b/proxy/src/auth/backend.rs
index 1d28c6df31..5561c9c56d 100644
--- a/proxy/src/auth/backend.rs
+++ b/proxy/src/auth/backend.rs
@@ -311,7 +311,9 @@ async fn auth_quirks(
     let (allowed_ips, maybe_secret) = api.get_allowed_ips_and_secret(ctx, &info).await?;
 
     // check allowed list
-    if !check_peer_addr_is_in_list(&ctx.peer_addr(), &allowed_ips) {
+    if config.ip_allowlist_check_enabled
+        && !check_peer_addr_is_in_list(&ctx.peer_addr(), &allowed_ips)
+    {
         return Err(auth::AuthError::ip_address_not_allowed(ctx.peer_addr()));
     }
 
@@ -603,6 +605,7 @@ mod tests {
         rate_limiter_enabled: true,
         rate_limiter: AuthRateLimiter::new(&RateBucketInfo::DEFAULT_AUTH_SET),
         rate_limit_ip_subnet: 64,
+        ip_allowlist_check_enabled: true,
     });
 
     async fn read_message(r: &mut (impl AsyncRead + Unpin), b: &mut BytesMut) -> PgMessage {
diff --git a/proxy/src/auth/credentials.rs b/proxy/src/auth/credentials.rs
index 0e91ae570a..cba8601d14 100644
--- a/proxy/src/auth/credentials.rs
+++ b/proxy/src/auth/credentials.rs
@@ -538,4 +538,17 @@ mod tests {
         ));
         Ok(())
     }
+
+    #[test]
+    fn test_connection_blocker() {
+        fn check(v: serde_json::Value) -> bool {
+            let peer_addr = IpAddr::from([127, 0, 0, 1]);
+            let ip_list: Vec<IpPattern> = serde_json::from_value(v).unwrap();
+            check_peer_addr_is_in_list(&peer_addr, &ip_list)
+        }
+
+        assert!(check(json!([])));
+        assert!(check(json!(["127.0.0.1"])));
+        assert!(!check(json!(["255.255.255.255"])));
+    }
 }
diff --git a/proxy/src/bin/local_proxy.rs b/proxy/src/bin/local_proxy.rs
index 08effeff99..6eba71df1b 100644
--- a/proxy/src/bin/local_proxy.rs
+++ b/proxy/src/bin/local_proxy.rs
@@ -224,6 +224,7 @@ fn build_config(args: &LocalProxyCliArgs) -> anyhow::Result<&'static ProxyConfig
             rate_limiter_enabled: false,
             rate_limiter: BucketRateLimiter::new(vec![]),
             rate_limit_ip_subnet: 64,
+            ip_allowlist_check_enabled: true,
         },
         require_client_ip: false,
         handshake_timeout: Duration::from_secs(10),
diff --git a/proxy/src/bin/proxy.rs b/proxy/src/bin/proxy.rs
index 7706a1f7cd..ca9aeb04d8 100644
--- a/proxy/src/bin/proxy.rs
+++ b/proxy/src/bin/proxy.rs
@@ -224,6 +224,10 @@ struct ProxyCliArgs {
     /// Whether to retry the wake_compute request
     #[clap(long, default_value = config::RetryConfig::WAKE_COMPUTE_DEFAULT_VALUES)]
     wake_compute_retry: String,
+
+    /// Configure if this is a private access proxy for the POC: In that case the proxy will ignore the IP allowlist
+    #[clap(long, default_value_t = false, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)]
+    is_private_access_proxy: bool,
 }
 
 #[derive(clap::Args, Clone, Copy, Debug)]
@@ -682,6 +686,7 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> {
         rate_limiter_enabled: args.auth_rate_limit_enabled,
         rate_limiter: AuthRateLimiter::new(args.auth_rate_limit.clone()),
         rate_limit_ip_subnet: args.auth_rate_limit_ip_subnet,
+        ip_allowlist_check_enabled: !args.is_private_access_proxy,
     };
 
     let config = Box::leak(Box::new(ProxyConfig {
diff --git a/proxy/src/config.rs b/proxy/src/config.rs
index d7fc6eee22..1cda6d200c 100644
--- a/proxy/src/config.rs
+++ b/proxy/src/config.rs
@@ -64,6 +64,7 @@ pub struct AuthenticationConfig {
     pub rate_limiter_enabled: bool,
     pub rate_limiter: AuthRateLimiter,
     pub rate_limit_ip_subnet: u8,
+    pub ip_allowlist_check_enabled: bool,
 }
 
 impl TlsConfig {
diff --git a/proxy/src/serverless/backend.rs b/proxy/src/serverless/backend.rs
index f24e0478be..d163878528 100644
--- a/proxy/src/serverless/backend.rs
+++ b/proxy/src/serverless/backend.rs
@@ -50,7 +50,9 @@ impl PoolingBackend {
             .as_ref()
             .map(|()| user_info.clone());
         let (allowed_ips, maybe_secret) = backend.get_allowed_ips_and_secret(ctx).await?;
-        if !check_peer_addr_is_in_list(&ctx.peer_addr(), &allowed_ips) {
+        if config.ip_allowlist_check_enabled
+            && !check_peer_addr_is_in_list(&ctx.peer_addr(), &allowed_ips)
+        {
             return Err(AuthError::ip_address_not_allowed(ctx.peer_addr()));
         }
         if !self

From 78938d1b591b33d23495a0edb8b123cc5cac6a27 Mon Sep 17 00:00:00 2001
From: Matthias van de Meent <matthias@neon.tech>
Date: Thu, 12 Sep 2024 23:18:41 +0100
Subject: [PATCH 054/142] [compute/postgres] feature: PostgreSQL 17 (#8573)

This adds preliminary PG17 support to Neon, based on RC1 / 2024-09-04
https://github.com/postgres/postgres/commit/07b828e9d4aa916f1763774787440d914eea69c4

NOTICE: The data produced by the included version of the PostgreSQL fork
may not be compatible with the future full release of PostgreSQL 17 due to
expected or unexpected future changes in magic numbers and internals.
DO NOT EXPECT DATA IN V17-TENANTS TO BE COMPATIBLE WITH THE 17.0
RELEASE!

Co-authored-by: Anastasia Lubennikova <anastasia@neon.tech>
Co-authored-by: Alexander Bayandin <alexander@neon.tech>
Co-authored-by: Konstantin Knizhnik <knizhnik@neon.tech>
Co-authored-by: Heikki Linnakangas <heikki@neon.tech>
---
 .github/workflows/_build-and-test-locally.yml |   19 +-
 .github/workflows/build_and_test.yml          |   27 +-
 .github/workflows/neon_extra_builds.yml       |   15 +
 .gitmodules                                   |    4 +
 Dockerfile                                    |   12 +-
 Dockerfile.compute-node                       |  289 +++-
 Makefile                                      |   56 +-
 compute_tools/src/compute.rs                  |   31 +-
 compute_tools/src/extension_server.rs         |    1 +
 control_plane/src/local_env.rs                |    2 +-
 control_plane/src/storage_controller.rs       |   40 +-
 libs/pageserver_api/src/key.rs                |   39 +-
 libs/postgres_ffi/build.rs                    |    2 +-
 libs/postgres_ffi/src/lib.rs                  |    5 +
 libs/postgres_ffi/src/pg_constants.rs         |   39 +-
 libs/postgres_ffi/src/pg_constants_v14.rs     |   27 +
 libs/postgres_ffi/src/pg_constants_v15.rs     |    2 +
 libs/postgres_ffi/src/pg_constants_v16.rs     |    2 +
 libs/postgres_ffi/src/pg_constants_v17.rs     |   55 +
 libs/postgres_ffi/wal_craft/src/lib.rs        |    2 +-
 libs/walproposer/build.rs                     |    9 +-
 pageserver/ctl/src/layer_map_analyzer.rs      |   16 +-
 pageserver/src/basebackup.rs                  |   16 +-
 pageserver/src/config.rs                      |    2 +-
 pageserver/src/import_datadir.rs              |    6 +-
 pageserver/src/pgdatadir_mapping.rs           |  111 +-
 pageserver/src/walingest.rs                   |  191 ++-
 pageserver/src/walrecord.rs                   |   73 +-
 pgxn/neon/bitmap.h                            |   12 +
 pgxn/neon/file_cache.c                        |  504 ++++--
 pgxn/neon/libpagestore.c                      |    4 +
 pgxn/neon/neon_pgversioncompat.h              |   14 +-
 pgxn/neon/pagestore_client.h                  |   54 +-
 pgxn/neon/pagestore_smgr.c                    | 1360 ++++++++++++-----
 pgxn/neon/walproposer_pg.c                    |   39 +-
 pgxn/neon_rmgr/neon_rmgr_decode.c             |  399 ++++-
 pgxn/neon_walredo/inmem_smgr.c                |   79 +-
 pgxn/neon_walredo/inmem_smgr.h                |    2 +-
 pgxn/neon_walredo/walredoproc.c               |   14 +-
 test_runner/fixtures/common_types.py          |    2 +-
 test_runner/fixtures/neon_fixtures.py         |    9 +-
 test_runner/fixtures/pg_version.py            |    1 +
 .../5670669815/v17/ext_index.json             |    7 +
 test_runner/regress/test_compatibility.py     |    8 +-
 .../regress/test_download_extensions.py       |    2 +
 test_runner/regress/test_postgres_version.py  |   17 +-
 .../regress/test_timeline_detach_ancestor.py  |    3 +
 test_runner/regress/test_twophase.py          |   70 +-
 vendor/postgres-v17                           |    1 +
 49 files changed, 2907 insertions(+), 787 deletions(-)
 create mode 100644 libs/postgres_ffi/src/pg_constants_v17.rs
 create mode 100644 pgxn/neon/bitmap.h
 create mode 100644 test_runner/regress/data/extension_test/5670669815/v17/ext_index.json
 create mode 160000 vendor/postgres-v17

diff --git a/.github/workflows/_build-and-test-locally.yml b/.github/workflows/_build-and-test-locally.yml
index e18e6a1201..67152b6991 100644
--- a/.github/workflows/_build-and-test-locally.yml
+++ b/.github/workflows/_build-and-test-locally.yml
@@ -62,7 +62,7 @@ jobs:
           #
           git config --global --add safe.directory ${{ github.workspace }}
           git config --global --add safe.directory ${GITHUB_WORKSPACE}
-          for r in 14 15 16; do
+          for r in 14 15 16 17; do
             git config --global --add safe.directory "${{ github.workspace }}/vendor/postgres-v$r"
             git config --global --add safe.directory "${GITHUB_WORKSPACE}/vendor/postgres-v$r"
           done
@@ -83,6 +83,10 @@ jobs:
         id: pg_v16_rev
         run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v16) >> $GITHUB_OUTPUT
 
+      - name: Set pg 17 revision for caching
+        id: pg_v17_rev
+        run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v17) >> $GITHUB_OUTPUT
+
       # Set some environment variables used by all the steps.
       #
       # CARGO_FLAGS is extra options to pass to "cargo build", "cargo test" etc.
@@ -136,6 +140,13 @@ jobs:
           path: pg_install/v16
           key: v1-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-pg-${{ steps.pg_v16_rev.outputs.pg_rev }}-${{ hashFiles('Makefile', 'Dockerfile.build-tools') }}
 
+      - name: Cache postgres v17 build
+        id: cache_pg_17
+        uses: actions/cache@v4
+        with:
+          path: pg_install/v17
+          key: v1-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-pg-${{ steps.pg_v17_rev.outputs.pg_rev }}-${{ hashFiles('Makefile', 'Dockerfile.build-tools') }}
+
       - name: Build postgres v14
         if: steps.cache_pg_14.outputs.cache-hit != 'true'
         run: mold -run make postgres-v14 -j$(nproc)
@@ -148,6 +159,10 @@ jobs:
         if: steps.cache_pg_16.outputs.cache-hit != 'true'
         run: mold -run make postgres-v16 -j$(nproc)
 
+      - name: Build postgres v17
+        if: steps.cache_pg_17.outputs.cache-hit != 'true'
+        run: mold -run make postgres-v17 -j$(nproc)
+
       - name: Build neon extensions
         run: mold -run make neon-pg-ext -j$(nproc)
 
@@ -210,7 +225,7 @@ jobs:
         run: |
           PQ_LIB_DIR=$(pwd)/pg_install/v16/lib
           export PQ_LIB_DIR
-          LD_LIBRARY_PATH=$(pwd)/pg_install/v16/lib
+          LD_LIBRARY_PATH=$(pwd)/pg_install/v17/lib
           export LD_LIBRARY_PATH
 
           #nextest does not yet support running doctests
diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml
index 4bb9e5cb66..7c06fd9ab8 100644
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -211,7 +211,7 @@ jobs:
       build-tag: ${{ needs.tag.outputs.build-tag }}
       build-type: ${{ matrix.build-type }}
       # Run tests on all Postgres versions in release builds and only on the latest version in debug builds
-      pg-versions: ${{ matrix.build-type == 'release' && '["v14", "v15", "v16"]' || '["v16"]' }}
+      pg-versions: ${{ matrix.build-type == 'release' && '["v14", "v15", "v16", "v17"]' || '["v17"]' }}
     secrets: inherit
 
   # Keep `benchmarks` job outside of `build-and-test-locally` workflow to make job failures non-blocking
@@ -548,7 +548,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        version: [ v14, v15, v16 ]
+        version: [ v14, v15, v16, v17 ]
         arch: [ x64, arm64 ]
 
     runs-on: ${{ fromJson(format('["self-hosted", "{0}"]', matrix.arch == 'arm64' && 'large-arm64' || 'large')) }}
@@ -627,7 +627,7 @@ jobs:
 
       - name: Build compute-tools image
         # compute-tools are Postgres independent, so build it only once
-        if: matrix.version == 'v16'
+        if: matrix.version == 'v17'
         uses: docker/build-push-action@v6
         with:
           target: compute-tools-image
@@ -649,7 +649,7 @@ jobs:
 
     strategy:
       matrix:
-        version: [ v14, v15, v16 ]
+        version: [ v14, v15, v16, v17 ]
 
     steps:
       - uses: docker/login-action@v3
@@ -671,7 +671,7 @@ jobs:
                                              neondatabase/neon-test-extensions-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}-arm64
 
       - name: Create multi-arch compute-tools image
-        if: matrix.version == 'v16'
+        if: matrix.version == 'v17'
         run: |
           docker buildx imagetools create -t neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }} \
                                              neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }}-x64 \
@@ -689,7 +689,7 @@ jobs:
                                                                                 neondatabase/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}
 
       - name: Push multi-arch compute-tools image to ECR
-        if: matrix.version == 'v16'
+        if: matrix.version == 'v17'
         run: |
           docker buildx imagetools create -t 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{ needs.tag.outputs.build-tag }} \
                                                                                 neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }}
@@ -700,7 +700,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        version: [ v14, v15, v16 ]
+        version: [ v14, v15, v16, v17 ]
     env:
       VM_BUILDER_VERSION: v0.29.3
 
@@ -798,7 +798,7 @@ jobs:
     runs-on: ubuntu-22.04
 
     env:
-      VERSIONS: v14 v15 v16
+      VERSIONS: v14 v15 v16 v17
 
     steps:
       - uses: docker/login-action@v3
@@ -839,7 +839,7 @@ jobs:
             done
           done
           docker buildx imagetools create -t neondatabase/neon-test-extensions-v16:latest \
-                                             neondatabase/neon-test-extensions-v16:${{ needs.tag.outputs.build-tag }}
+                                              neondatabase/neon-test-extensions-v16:${{ needs.tag.outputs.build-tag }}
 
       - name: Login to prod ECR
         uses: docker/login-action@v3
@@ -852,7 +852,7 @@ jobs:
       - name: Copy all images to prod ECR
         if: github.ref_name == 'release'|| github.ref_name == 'release-proxy'
         run: |
-          for image in neon compute-tools {vm-,}compute-node-{v14,v15,v16}; do
+          for image in neon compute-tools {vm-,}compute-node-{v14,v15,v16,v17}; do
             docker buildx imagetools create -t 093970136003.dkr.ecr.eu-central-1.amazonaws.com/${image}:${{ needs.tag.outputs.build-tag }} \
                                                369495373322.dkr.ecr.eu-central-1.amazonaws.com/${image}:${{ needs.tag.outputs.build-tag }}
           done
@@ -864,7 +864,7 @@ jobs:
     with:
       client_id: ${{ vars.AZURE_DEV_CLIENT_ID }}
       image_tag: ${{ needs.tag.outputs.build-tag }}
-      images: neon compute-tools vm-compute-node-v14 vm-compute-node-v15 vm-compute-node-v16 compute-node-v14 compute-node-v15 compute-node-v16
+      images: neon compute-tools vm-compute-node-v14 vm-compute-node-v15 vm-compute-node-v16 vm-compute-node-v17 compute-node-v14 compute-node-v15 compute-node-v16 compute-node-v17
       registry_name: ${{ vars.AZURE_DEV_REGISTRY_NAME }}
       subscription_id: ${{ vars.AZURE_DEV_SUBSCRIPTION_ID }}
       tenant_id: ${{ vars.AZURE_TENANT_ID }}
@@ -876,7 +876,7 @@ jobs:
     with:
       client_id: ${{ vars.AZURE_PROD_CLIENT_ID }}
       image_tag: ${{ needs.tag.outputs.build-tag }}
-      images: neon compute-tools vm-compute-node-v14 vm-compute-node-v15 vm-compute-node-v16 compute-node-v14 compute-node-v15 compute-node-v16
+      images: neon compute-tools vm-compute-node-v14 vm-compute-node-v15 vm-compute-node-v16 vm-compute-node-v17 compute-node-v14 compute-node-v15 compute-node-v16 compute-node-v17
       registry_name: ${{ vars.AZURE_PROD_REGISTRY_NAME }}
       subscription_id: ${{ vars.AZURE_PROD_SUBSCRIPTION_ID }}
       tenant_id: ${{ vars.AZURE_TENANT_ID }}
@@ -971,7 +971,7 @@ jobs:
           #
           git config --global --add safe.directory ${{ github.workspace }}
           git config --global --add safe.directory ${GITHUB_WORKSPACE}
-          for r in 14 15 16; do
+          for r in 14 15 16 17; do
             git config --global --add safe.directory "${{ github.workspace }}/vendor/postgres-v$r"
             git config --global --add safe.directory "${GITHUB_WORKSPACE}/vendor/postgres-v$r"
           done
@@ -1117,6 +1117,7 @@ jobs:
 
               files_to_promote+=("s3://${BUCKET}/${s3_key}")
 
+              # TODO: add v17 to the pg_version list below
               for pg_version in v14 v15 v16; do
                 # We run less tests for debug builds, so we don't need to promote them
                 if [ "${build_type}" == "debug" ] && { [ "${arch}" == "ARM64" ] || [ "${pg_version}" != "v16" ] ; }; then
diff --git a/.github/workflows/neon_extra_builds.yml b/.github/workflows/neon_extra_builds.yml
index 7fecdbde8c..41c9f5dee5 100644
--- a/.github/workflows/neon_extra_builds.yml
+++ b/.github/workflows/neon_extra_builds.yml
@@ -72,6 +72,10 @@ jobs:
         id: pg_v16_rev
         run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v16) >> $GITHUB_OUTPUT
 
+      - name: Set pg 17 revision for caching
+        id: pg_v17_rev
+        run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v17) >> $GITHUB_OUTPUT
+
       - name: Cache postgres v14 build
         id: cache_pg_14
         uses: actions/cache@v4
@@ -93,6 +97,13 @@ jobs:
           path: pg_install/v16
           key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v16_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
 
+      - name: Cache postgres v17 build
+        id: cache_pg_17
+        uses: actions/cache@v4
+        with:
+          path: pg_install/v17
+          key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v17_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
+
       - name: Set extra env for macOS
         run: |
           echo 'LDFLAGS=-L/usr/local/opt/openssl@3/lib' >> $GITHUB_ENV
@@ -120,6 +131,10 @@ jobs:
         if: steps.cache_pg_16.outputs.cache-hit != 'true'
         run: make postgres-v16 -j$(sysctl -n hw.ncpu)
 
+      - name: Build postgres v17
+        if: steps.cache_pg_17.outputs.cache-hit != 'true'
+        run: make postgres-v17 -j$(sysctl -n hw.ncpu)
+
       - name: Build neon extensions
         run: make neon-pg-ext -j$(sysctl -n hw.ncpu)
 
diff --git a/.gitmodules b/.gitmodules
index 1d925674a1..d1330bf28c 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -10,3 +10,7 @@
 	path = vendor/postgres-v16
 	url = https://github.com/neondatabase/postgres.git
 	branch = REL_16_STABLE_neon
+[submodule "vendor/postgres-v17"]
+	path = vendor/postgres-v17
+	url = https://github.com/neondatabase/postgres.git
+	branch = REL_17_STABLE_neon
diff --git a/Dockerfile b/Dockerfile
index 1efedfa9bc..bdb76a4f4f 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -5,6 +5,8 @@
 ARG REPOSITORY=neondatabase
 ARG IMAGE=build-tools
 ARG TAG=pinned
+ARG DEFAULT_PG_VERSION=17
+ARG STABLE_PG_VERSION=16
 
 # Build Postgres
 FROM $REPOSITORY/$IMAGE:$TAG AS pg-build
@@ -13,6 +15,7 @@ WORKDIR /home/nonroot
 COPY --chown=nonroot vendor/postgres-v14 vendor/postgres-v14
 COPY --chown=nonroot vendor/postgres-v15 vendor/postgres-v15
 COPY --chown=nonroot vendor/postgres-v16 vendor/postgres-v16
+COPY --chown=nonroot vendor/postgres-v17 vendor/postgres-v17
 COPY --chown=nonroot pgxn pgxn
 COPY --chown=nonroot Makefile Makefile
 COPY --chown=nonroot scripts/ninstall.sh scripts/ninstall.sh
@@ -28,16 +31,19 @@ FROM $REPOSITORY/$IMAGE:$TAG AS build
 WORKDIR /home/nonroot
 ARG GIT_VERSION=local
 ARG BUILD_TAG
+ARG STABLE_PG_VERSION
 
 COPY --from=pg-build /home/nonroot/pg_install/v14/include/postgresql/server pg_install/v14/include/postgresql/server
 COPY --from=pg-build /home/nonroot/pg_install/v15/include/postgresql/server pg_install/v15/include/postgresql/server
 COPY --from=pg-build /home/nonroot/pg_install/v16/include/postgresql/server pg_install/v16/include/postgresql/server
+COPY --from=pg-build /home/nonroot/pg_install/v17/include/postgresql/server pg_install/v17/include/postgresql/server
 COPY --from=pg-build /home/nonroot/pg_install/v16/lib                       pg_install/v16/lib
+COPY --from=pg-build /home/nonroot/pg_install/v17/lib                       pg_install/v17/lib
 COPY --chown=nonroot . .
 
 ARG ADDITIONAL_RUSTFLAGS
 RUN set -e \
-    && PQ_LIB_DIR=$(pwd)/pg_install/v16/lib RUSTFLAGS="-Clinker=clang -Clink-arg=-fuse-ld=mold -Clink-arg=-Wl,--no-rosegment ${ADDITIONAL_RUSTFLAGS}" cargo build \
+    && PQ_LIB_DIR=$(pwd)/pg_install/v${STABLE_PG_VERSION}/lib RUSTFLAGS="-Clinker=clang -Clink-arg=-fuse-ld=mold -Clink-arg=-Wl,--no-rosegment ${ADDITIONAL_RUSTFLAGS}" cargo build \
       --bin pg_sni_router  \
       --bin pageserver  \
       --bin pagectl  \
@@ -52,6 +58,7 @@ RUN set -e \
 # Build final image
 #
 FROM debian:bullseye-slim
+ARG DEFAULT_PG_VERSION
 WORKDIR /data
 
 RUN set -e \
@@ -77,6 +84,7 @@ COPY --from=build --chown=neon:neon /home/nonroot/target/release/storage_scrubbe
 COPY --from=pg-build /home/nonroot/pg_install/v14 /usr/local/v14/
 COPY --from=pg-build /home/nonroot/pg_install/v15 /usr/local/v15/
 COPY --from=pg-build /home/nonroot/pg_install/v16 /usr/local/v16/
+COPY --from=pg-build /home/nonroot/pg_install/v17 /usr/local/v17/
 COPY --from=pg-build /home/nonroot/postgres_install.tar.gz /data/
 
 # By default, pageserver uses `.neon/` working directory in WORKDIR, so create one and fill it with the dummy config.
@@ -93,7 +101,7 @@ RUN mkdir -p /data/.neon/ && \
 
 # When running a binary that links with libpq, default to using our most recent postgres version.  Binaries
 # that want a particular postgres version will select it explicitly: this is just a default.
-ENV LD_LIBRARY_PATH=/usr/local/v16/lib
+ENV LD_LIBRARY_PATH=/usr/local/v${DEFAULT_PG_VERSION}/lib
 
 
 VOLUME ["/data"]
diff --git a/Dockerfile.compute-node b/Dockerfile.compute-node
index b6c89cd71f..fe902eb978 100644
--- a/Dockerfile.compute-node
+++ b/Dockerfile.compute-node
@@ -79,15 +79,23 @@ RUN cd postgres && \
 #
 #########################################################################################
 FROM build-deps AS postgis-build
+ARG PG_VERSION
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
-RUN apt update && \
+RUN case "${PG_VERSION}" in "v17") \
+    echo "v17 extensions are not supported yet. Quit" && exit 0;; \
+    esac && \
+    apt update && \
     apt install -y cmake gdal-bin libboost-dev libboost-thread-dev libboost-filesystem-dev \
     libboost-system-dev libboost-iostreams-dev libboost-program-options-dev libboost-timer-dev \
     libcgal-dev libgdal-dev libgmp-dev libmpfr-dev libopenscenegraph-dev libprotobuf-c-dev \
     protobuf-c-compiler xsltproc
 
 # SFCGAL > 1.3 requires CGAL > 5.2, Bullseye's libcgal-dev is 5.2
-RUN wget https://gitlab.com/Oslandia/SFCGAL/-/archive/v1.3.10/SFCGAL-v1.3.10.tar.gz -O SFCGAL.tar.gz && \
+RUN case "${PG_VERSION}" in "v17") \
+        mkdir -p /sfcgal && \
+    echo "v17 extensions are not supported yet. Quit" && exit 0;; \
+    esac && \
+    wget https://gitlab.com/Oslandia/SFCGAL/-/archive/v1.3.10/SFCGAL-v1.3.10.tar.gz -O SFCGAL.tar.gz && \
     echo "4e39b3b2adada6254a7bdba6d297bb28e1a9835a9f879b74f37e2dab70203232 SFCGAL.tar.gz" | sha256sum --check && \
     mkdir sfcgal-src && cd sfcgal-src && tar xzf ../SFCGAL.tar.gz --strip-components=1 -C . && \
     cmake -DCMAKE_BUILD_TYPE=Release . && make -j $(getconf _NPROCESSORS_ONLN) && \
@@ -96,7 +104,10 @@ RUN wget https://gitlab.com/Oslandia/SFCGAL/-/archive/v1.3.10/SFCGAL-v1.3.10.tar
 
 ENV PATH="/usr/local/pgsql/bin:$PATH"
 
-RUN wget https://download.osgeo.org/postgis/source/postgis-3.3.3.tar.gz -O postgis.tar.gz && \
+RUN case "${PG_VERSION}" in "v17") \
+    echo "v17 extensions are not supported yet. Quit" && exit 0;; \
+    esac && \
+    wget https://download.osgeo.org/postgis/source/postgis-3.3.3.tar.gz -O postgis.tar.gz && \
     echo "74eb356e3f85f14233791013360881b6748f78081cc688ff9d6f0f673a762d13 postgis.tar.gz" | sha256sum --check && \
     mkdir postgis-src && cd postgis-src && tar xzf ../postgis.tar.gz --strip-components=1 -C . && \
     find /usr/local/pgsql -type f | sed 's|^/usr/local/pgsql/||' > /before.txt &&\
@@ -122,7 +133,10 @@ RUN wget https://download.osgeo.org/postgis/source/postgis-3.3.3.tar.gz -O postg
     cp /usr/local/pgsql/share/extension/address_standardizer.control /extensions/postgis && \
     cp /usr/local/pgsql/share/extension/address_standardizer_data_us.control /extensions/postgis
 
-RUN wget https://github.com/pgRouting/pgrouting/archive/v3.4.2.tar.gz -O pgrouting.tar.gz && \
+RUN case "${PG_VERSION}" in "v17") \
+    echo "v17 extensions are not supported yet. Quit" && exit 0;; \
+    esac && \
+    wget https://github.com/pgRouting/pgrouting/archive/v3.4.2.tar.gz -O pgrouting.tar.gz && \
     echo "cac297c07d34460887c4f3b522b35c470138760fe358e351ad1db4edb6ee306e pgrouting.tar.gz" | sha256sum --check && \
     mkdir pgrouting-src && cd pgrouting-src && tar xzf ../pgrouting.tar.gz --strip-components=1 -C . && \
     mkdir build && cd build && \
@@ -142,12 +156,19 @@ RUN wget https://github.com/pgRouting/pgrouting/archive/v3.4.2.tar.gz -O pgrouti
 #
 #########################################################################################
 FROM build-deps AS plv8-build
+ARG PG_VERSION
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
 
-RUN apt update && \
+RUN case "${PG_VERSION}" in "v17") \
+    echo "v17 extensions are not supported yet. Quit" && exit 0;; \
+    esac && \
+    apt update && \
     apt install -y ninja-build python3-dev libncurses5 binutils clang
 
-RUN wget https://github.com/plv8/plv8/archive/refs/tags/v3.1.10.tar.gz -O plv8.tar.gz && \
+RUN case "${PG_VERSION}" in "v17") \
+    echo "v17 extensions are not supported yet. Quit" && exit 0;; \
+    esac && \
+    wget https://github.com/plv8/plv8/archive/refs/tags/v3.1.10.tar.gz -O plv8.tar.gz && \
     echo "7096c3290928561f0d4901b7a52794295dc47f6303102fae3f8e42dd575ad97d plv8.tar.gz" | sha256sum --check && \
     mkdir plv8-src && cd plv8-src && tar xzf ../plv8.tar.gz --strip-components=1 -C . && \
     # generate and copy upgrade scripts
@@ -172,9 +193,13 @@ RUN wget https://github.com/plv8/plv8/archive/refs/tags/v3.1.10.tar.gz -O plv8.t
 #
 #########################################################################################
 FROM build-deps AS h3-pg-build
+ARG PG_VERSION
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
 
-RUN case "$(uname -m)" in \
+RUN case "${PG_VERSION}" in "v17") \
+    echo "v17 extensions are not supported yet. Quit" && exit 0;; \
+    esac && \
+    case "$(uname -m)" in \
       "x86_64") \
         export CMAKE_CHECKSUM=739d372726cb23129d57a539ce1432453448816e345e1545f6127296926b6754 \
         ;; \
@@ -192,7 +217,11 @@ RUN case "$(uname -m)" in \
       && /tmp/cmake-install.sh --skip-license --prefix=/usr/local/ \
       && rm /tmp/cmake-install.sh
 
-RUN wget https://github.com/uber/h3/archive/refs/tags/v4.1.0.tar.gz -O h3.tar.gz && \
+RUN case "${PG_VERSION}" in "v17") \
+        mkdir -p /h3/usr/ && \
+    echo "v17 extensions are not supported yet. Quit" && exit 0;; \
+    esac && \
+    wget https://github.com/uber/h3/archive/refs/tags/v4.1.0.tar.gz -O h3.tar.gz && \
     echo "ec99f1f5974846bde64f4513cf8d2ea1b8d172d2218ab41803bf6a63532272bc h3.tar.gz" | sha256sum --check && \
     mkdir h3-src && cd h3-src && tar xzf ../h3.tar.gz --strip-components=1 -C . && \
     mkdir build && cd build && \
@@ -202,7 +231,10 @@ RUN wget https://github.com/uber/h3/archive/refs/tags/v4.1.0.tar.gz -O h3.tar.gz
     cp -R /h3/usr / && \
     rm -rf build
 
-RUN wget https://github.com/zachasme/h3-pg/archive/refs/tags/v4.1.3.tar.gz -O h3-pg.tar.gz && \
+RUN case "${PG_VERSION}" in "v17") \
+    echo "v17 extensions are not supported yet. Quit" && exit 0;; \
+    esac && \
+    wget https://github.com/zachasme/h3-pg/archive/refs/tags/v4.1.3.tar.gz -O h3-pg.tar.gz && \
     echo "5c17f09a820859ffe949f847bebf1be98511fb8f1bd86f94932512c00479e324 h3-pg.tar.gz" | sha256sum --check && \
     mkdir h3-pg-src && cd h3-pg-src && tar xzf ../h3-pg.tar.gz --strip-components=1 -C . && \
     export PATH="/usr/local/pgsql/bin:$PATH" && \
@@ -218,9 +250,13 @@ RUN wget https://github.com/zachasme/h3-pg/archive/refs/tags/v4.1.3.tar.gz -O h3
 #
 #########################################################################################
 FROM build-deps AS unit-pg-build
+ARG PG_VERSION
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
 
-RUN wget https://github.com/df7cb/postgresql-unit/archive/refs/tags/7.7.tar.gz -O postgresql-unit.tar.gz && \
+RUN case "${PG_VERSION}" in "v17") \
+    echo "v17 extensions are not supported yet. Quit" && exit 0;; \
+    esac && \
+    wget https://github.com/df7cb/postgresql-unit/archive/refs/tags/7.7.tar.gz -O postgresql-unit.tar.gz && \
     echo "411d05beeb97e5a4abf17572bfcfbb5a68d98d1018918feff995f6ee3bb03e79 postgresql-unit.tar.gz" | sha256sum --check && \
     mkdir postgresql-unit-src && cd postgresql-unit-src && tar xzf ../postgresql-unit.tar.gz --strip-components=1 -C . && \
     make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
@@ -239,6 +275,7 @@ RUN wget https://github.com/df7cb/postgresql-unit/archive/refs/tags/7.7.tar.gz -
 #
 #########################################################################################
 FROM build-deps AS vector-pg-build
+ARG PG_VERSION
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
 
 COPY patches/pgvector.patch /pgvector.patch
@@ -246,7 +283,10 @@ COPY patches/pgvector.patch /pgvector.patch
 # By default, pgvector Makefile uses `-march=native`. We don't want that,
 # because we build the images on different machines than where we run them.
 # Pass OPTFLAGS="" to remove it.
-RUN wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.7.2.tar.gz -O pgvector.tar.gz && \
+RUN case "${PG_VERSION}" in "v17") \
+    echo "v17 extensions are not supported yet. Quit" && exit 0;; \
+    esac && \
+    wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.7.2.tar.gz -O pgvector.tar.gz && \
     echo "617fba855c9bcb41a2a9bc78a78567fd2e147c72afd5bf9d37b31b9591632b30 pgvector.tar.gz" | sha256sum --check && \
     mkdir pgvector-src && cd pgvector-src && tar xzf ../pgvector.tar.gz --strip-components=1 -C . && \
     patch -p1 < /pgvector.patch && \
@@ -261,10 +301,14 @@ RUN wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.7.2.tar.gz -O
 #
 #########################################################################################
 FROM build-deps AS pgjwt-pg-build
+ARG PG_VERSION
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
 
 # 9742dab1b2f297ad3811120db7b21451bca2d3c9 made on 13/11/2021
-RUN wget https://github.com/michelp/pgjwt/archive/9742dab1b2f297ad3811120db7b21451bca2d3c9.tar.gz -O pgjwt.tar.gz && \
+RUN case "${PG_VERSION}" in "v17") \
+    echo "v17 extensions are not supported yet. Quit" && exit 0;; \
+    esac && \
+    wget https://github.com/michelp/pgjwt/archive/9742dab1b2f297ad3811120db7b21451bca2d3c9.tar.gz -O pgjwt.tar.gz && \
     echo "cfdefb15007286f67d3d45510f04a6a7a495004be5b3aecb12cda667e774203f pgjwt.tar.gz" | sha256sum --check && \
     mkdir pgjwt-src && cd pgjwt-src && tar xzf ../pgjwt.tar.gz --strip-components=1 -C . && \
     make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
@@ -277,9 +321,13 @@ RUN wget https://github.com/michelp/pgjwt/archive/9742dab1b2f297ad3811120db7b214
 #
 #########################################################################################
 FROM build-deps AS hypopg-pg-build
+ARG PG_VERSION
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
 
-RUN wget https://github.com/HypoPG/hypopg/archive/refs/tags/1.4.0.tar.gz -O hypopg.tar.gz && \
+RUN case "${PG_VERSION}" in "v17") \
+    echo "v17 extensions are not supported yet. Quit" && exit 0;; \
+    esac && \
+    wget https://github.com/HypoPG/hypopg/archive/refs/tags/1.4.0.tar.gz -O hypopg.tar.gz && \
     echo "0821011743083226fc9b813c1f2ef5897a91901b57b6bea85a78e466187c6819 hypopg.tar.gz" | sha256sum --check && \
     mkdir hypopg-src && cd hypopg-src && tar xzf ../hypopg.tar.gz --strip-components=1 -C . && \
     make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
@@ -293,9 +341,13 @@ RUN wget https://github.com/HypoPG/hypopg/archive/refs/tags/1.4.0.tar.gz -O hypo
 #
 #########################################################################################
 FROM build-deps AS pg-hashids-pg-build
+ARG PG_VERSION
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
 
-RUN wget https://github.com/iCyberon/pg_hashids/archive/refs/tags/v1.2.1.tar.gz -O pg_hashids.tar.gz && \
+RUN case "${PG_VERSION}" in "v17") \
+    echo "v17 extensions are not supported yet. Quit" && exit 0;; \
+    esac && \
+    wget https://github.com/iCyberon/pg_hashids/archive/refs/tags/v1.2.1.tar.gz -O pg_hashids.tar.gz && \
     echo "74576b992d9277c92196dd8d816baa2cc2d8046fe102f3dcd7f3c3febed6822a pg_hashids.tar.gz" | sha256sum --check && \
     mkdir pg_hashids-src && cd pg_hashids-src && tar xzf ../pg_hashids.tar.gz --strip-components=1 -C . && \
     make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \
@@ -309,11 +361,15 @@ RUN wget https://github.com/iCyberon/pg_hashids/archive/refs/tags/v1.2.1.tar.gz
 #
 #########################################################################################
 FROM build-deps AS rum-pg-build
+ARG PG_VERSION
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
 
 COPY patches/rum.patch /rum.patch
 
-RUN wget https://github.com/postgrespro/rum/archive/refs/tags/1.3.13.tar.gz -O rum.tar.gz && \
+RUN case "${PG_VERSION}" in "v17") \
+    echo "v17 extensions are not supported yet. Quit" && exit 0;; \
+    esac && \
+    wget https://github.com/postgrespro/rum/archive/refs/tags/1.3.13.tar.gz -O rum.tar.gz && \
     echo "6ab370532c965568df6210bd844ac6ba649f53055e48243525b0b7e5c4d69a7d rum.tar.gz" | sha256sum --check && \
     mkdir rum-src && cd rum-src && tar xzf ../rum.tar.gz --strip-components=1 -C . && \
     patch -p1 < /rum.patch && \
@@ -328,9 +384,13 @@ RUN wget https://github.com/postgrespro/rum/archive/refs/tags/1.3.13.tar.gz -O r
 #
 #########################################################################################
 FROM build-deps AS pgtap-pg-build
+ARG PG_VERSION
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
 
-RUN wget https://github.com/theory/pgtap/archive/refs/tags/v1.2.0.tar.gz -O pgtap.tar.gz && \
+RUN case "${PG_VERSION}" in "v17") \
+    echo "v17 extensions are not supported yet. Quit" && exit 0;; \
+    esac && \
+    wget https://github.com/theory/pgtap/archive/refs/tags/v1.2.0.tar.gz -O pgtap.tar.gz && \
     echo "9c7c3de67ea41638e14f06da5da57bac6f5bd03fea05c165a0ec862205a5c052 pgtap.tar.gz" | sha256sum --check && \
     mkdir pgtap-src && cd pgtap-src && tar xzf ../pgtap.tar.gz --strip-components=1 -C . && \
     make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
@@ -344,9 +404,13 @@ RUN wget https://github.com/theory/pgtap/archive/refs/tags/v1.2.0.tar.gz -O pgta
 #
 #########################################################################################
 FROM build-deps AS ip4r-pg-build
+ARG PG_VERSION
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
 
-RUN wget https://github.com/RhodiumToad/ip4r/archive/refs/tags/2.4.2.tar.gz -O ip4r.tar.gz && \
+RUN case "${PG_VERSION}" in "v17") \
+    echo "v17 extensions are not supported yet. Quit" && exit 0;; \
+    esac && \
+    wget https://github.com/RhodiumToad/ip4r/archive/refs/tags/2.4.2.tar.gz -O ip4r.tar.gz && \
     echo "0f7b1f159974f49a47842a8ab6751aecca1ed1142b6d5e38d81b064b2ead1b4b ip4r.tar.gz" | sha256sum --check && \
     mkdir ip4r-src && cd ip4r-src && tar xzf ../ip4r.tar.gz --strip-components=1 -C . && \
     make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
@@ -360,9 +424,13 @@ RUN wget https://github.com/RhodiumToad/ip4r/archive/refs/tags/2.4.2.tar.gz -O i
 #
 #########################################################################################
 FROM build-deps AS prefix-pg-build
+ARG PG_VERSION
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
 
-RUN wget https://github.com/dimitri/prefix/archive/refs/tags/v1.2.10.tar.gz -O prefix.tar.gz && \
+RUN case "${PG_VERSION}" in "v17") \
+    echo "v17 extensions are not supported yet. Quit" && exit 0;; \
+    esac && \
+    wget https://github.com/dimitri/prefix/archive/refs/tags/v1.2.10.tar.gz -O prefix.tar.gz && \
     echo "4342f251432a5f6fb05b8597139d3ccde8dcf87e8ca1498e7ee931ca057a8575 prefix.tar.gz" | sha256sum --check && \
     mkdir prefix-src && cd prefix-src && tar xzf ../prefix.tar.gz --strip-components=1 -C . && \
     make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
@@ -376,9 +444,13 @@ RUN wget https://github.com/dimitri/prefix/archive/refs/tags/v1.2.10.tar.gz -O p
 #
 #########################################################################################
 FROM build-deps AS hll-pg-build
+ARG PG_VERSION
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
 
-RUN wget https://github.com/citusdata/postgresql-hll/archive/refs/tags/v2.18.tar.gz -O hll.tar.gz && \
+RUN case "${PG_VERSION}" in "v17") \
+    echo "v17 extensions are not supported yet. Quit" && exit 0;; \
+    esac && \
+    wget https://github.com/citusdata/postgresql-hll/archive/refs/tags/v2.18.tar.gz -O hll.tar.gz && \
     echo "e2f55a6f4c4ab95ee4f1b4a2b73280258c5136b161fe9d059559556079694f0e hll.tar.gz" | sha256sum --check && \
     mkdir hll-src && cd hll-src && tar xzf ../hll.tar.gz --strip-components=1 -C . && \
     make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
@@ -392,9 +464,13 @@ RUN wget https://github.com/citusdata/postgresql-hll/archive/refs/tags/v2.18.tar
 #
 #########################################################################################
 FROM build-deps AS plpgsql-check-pg-build
+ARG PG_VERSION
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
 
-RUN wget https://github.com/okbob/plpgsql_check/archive/refs/tags/v2.5.3.tar.gz -O plpgsql_check.tar.gz && \
+RUN case "${PG_VERSION}" in "v17") \
+    echo "v17 extensions are not supported yet. Quit" && exit 0;; \
+    esac && \
+    wget https://github.com/okbob/plpgsql_check/archive/refs/tags/v2.5.3.tar.gz -O plpgsql_check.tar.gz && \
     echo "6631ec3e7fb3769eaaf56e3dfedb829aa761abf163d13dba354b4c218508e1c0 plpgsql_check.tar.gz" | sha256sum --check && \
     mkdir plpgsql_check-src && cd plpgsql_check-src && tar xzf ../plpgsql_check.tar.gz --strip-components=1 -C . && \
     make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \
@@ -413,7 +489,10 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
 ARG PG_VERSION
 ENV PATH="/usr/local/pgsql/bin:$PATH"
 
-RUN case "${PG_VERSION}" in \
+RUN case "${PG_VERSION}" in "v17") \
+    echo "v17 extensions are not supported yet. Quit" && exit 0;; \
+    esac && \
+    case "${PG_VERSION}" in \
       "v14" | "v15") \
         export TIMESCALEDB_VERSION=2.10.1 \
         export TIMESCALEDB_CHECKSUM=6fca72a6ed0f6d32d2b3523951ede73dc5f9b0077b38450a029a5f411fdb8c73 \
@@ -446,7 +525,10 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
 ARG PG_VERSION
 ENV PATH="/usr/local/pgsql/bin:$PATH"
 
-RUN case "${PG_VERSION}" in \
+RUN case "${PG_VERSION}" in "v17") \
+    echo "v17 extensions are not supported yet. Quit" && exit 0;; \
+    esac && \
+    case "${PG_VERSION}" in \
       "v14") \
         export PG_HINT_PLAN_VERSION=14_1_4_1 \
         export PG_HINT_PLAN_CHECKSUM=c3501becf70ead27f70626bce80ea401ceac6a77e2083ee5f3ff1f1444ec1ad1 \
@@ -459,6 +541,9 @@ RUN case "${PG_VERSION}" in \
         export PG_HINT_PLAN_VERSION=16_1_6_0 \
         export PG_HINT_PLAN_CHECKSUM=fc85a9212e7d2819d4ae4ac75817481101833c3cfa9f0fe1f980984e12347d00 \
         ;; \
+      "v17") \
+        echo "TODO: PG17 pg_hint_plan support" && exit 0 \
+        ;; \
       *) \
         echo "Export the valid PG_HINT_PLAN_VERSION variable" && exit 1 \
         ;; \
@@ -478,10 +563,14 @@ RUN case "${PG_VERSION}" in \
 #
 #########################################################################################
 FROM build-deps AS pg-cron-pg-build
+ARG PG_VERSION
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
 
 ENV PATH="/usr/local/pgsql/bin/:$PATH"
-RUN wget https://github.com/citusdata/pg_cron/archive/refs/tags/v1.6.0.tar.gz -O pg_cron.tar.gz && \
+RUN case "${PG_VERSION}" in "v17") \
+    echo "v17 extensions are not supported yet. Quit" && exit 0;; \
+    esac && \
+    wget https://github.com/citusdata/pg_cron/archive/refs/tags/v1.6.0.tar.gz -O pg_cron.tar.gz && \
     echo "383a627867d730222c272bfd25cd5e151c578d73f696d32910c7db8c665cc7db pg_cron.tar.gz" | sha256sum --check && \
     mkdir pg_cron-src && cd pg_cron-src && tar xzf ../pg_cron.tar.gz --strip-components=1 -C . && \
     make -j $(getconf _NPROCESSORS_ONLN) && \
@@ -495,9 +584,13 @@ RUN wget https://github.com/citusdata/pg_cron/archive/refs/tags/v1.6.0.tar.gz -O
 #
 #########################################################################################
 FROM build-deps AS rdkit-pg-build
+ARG PG_VERSION
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
 
-RUN apt-get update && \
+RUN case "${PG_VERSION}" in "v17") \
+    echo "v17 extensions are not supported yet. Quit" && exit 0;; \
+    esac && \
+    apt-get update && \
     apt-get install -y \
         cmake \
         libboost-iostreams1.74-dev \
@@ -507,7 +600,10 @@ RUN apt-get update && \
         libeigen3-dev
 
 ENV PATH="/usr/local/pgsql/bin/:/usr/local/pgsql/:$PATH"
-RUN wget https://github.com/rdkit/rdkit/archive/refs/tags/Release_2023_03_3.tar.gz -O rdkit.tar.gz && \
+RUN case "${PG_VERSION}" in "v17") \
+    echo "v17 extensions are not supported yet. Quit" && exit 0;; \
+    esac && \
+    wget https://github.com/rdkit/rdkit/archive/refs/tags/Release_2023_03_3.tar.gz -O rdkit.tar.gz && \
     echo "bdbf9a2e6988526bfeb8c56ce3cdfe2998d60ac289078e2215374288185e8c8d rdkit.tar.gz" | sha256sum --check && \
     mkdir rdkit-src && cd rdkit-src && tar xzf ../rdkit.tar.gz --strip-components=1 -C . && \
     cmake \
@@ -544,10 +640,14 @@ RUN wget https://github.com/rdkit/rdkit/archive/refs/tags/Release_2023_03_3.tar.
 #
 #########################################################################################
 FROM build-deps AS pg-uuidv7-pg-build
+ARG PG_VERSION
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
 
 ENV PATH="/usr/local/pgsql/bin/:$PATH"
-RUN wget https://github.com/fboulnois/pg_uuidv7/archive/refs/tags/v1.0.1.tar.gz -O pg_uuidv7.tar.gz && \
+RUN case "${PG_VERSION}" in "v17") \
+    echo "v17 extensions are not supported yet. Quit" && exit 0;; \
+    esac && \
+    wget https://github.com/fboulnois/pg_uuidv7/archive/refs/tags/v1.0.1.tar.gz -O pg_uuidv7.tar.gz && \
     echo "0d0759ab01b7fb23851ecffb0bce27822e1868a4a5819bfd276101c716637a7a pg_uuidv7.tar.gz" | sha256sum --check && \
     mkdir pg_uuidv7-src && cd pg_uuidv7-src && tar xzf ../pg_uuidv7.tar.gz --strip-components=1 -C . && \
     make -j $(getconf _NPROCESSORS_ONLN) && \
@@ -561,10 +661,14 @@ RUN wget https://github.com/fboulnois/pg_uuidv7/archive/refs/tags/v1.0.1.tar.gz
 #
 #########################################################################################
 FROM build-deps AS pg-roaringbitmap-pg-build
+ARG PG_VERSION
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
 
 ENV PATH="/usr/local/pgsql/bin/:$PATH"
-RUN wget https://github.com/ChenHuajun/pg_roaringbitmap/archive/refs/tags/v0.5.4.tar.gz -O pg_roaringbitmap.tar.gz && \
+RUN case "${PG_VERSION}" in "v17") \
+    echo "v17 extensions are not supported yet. Quit" && exit 0;; \
+    esac && \
+    wget https://github.com/ChenHuajun/pg_roaringbitmap/archive/refs/tags/v0.5.4.tar.gz -O pg_roaringbitmap.tar.gz && \
     echo "b75201efcb1c2d1b014ec4ae6a22769cc7a224e6e406a587f5784a37b6b5a2aa pg_roaringbitmap.tar.gz" | sha256sum --check && \
     mkdir pg_roaringbitmap-src && cd pg_roaringbitmap-src && tar xzf ../pg_roaringbitmap.tar.gz --strip-components=1 -C . && \
     make -j $(getconf _NPROCESSORS_ONLN) && \
@@ -578,10 +682,14 @@ RUN wget https://github.com/ChenHuajun/pg_roaringbitmap/archive/refs/tags/v0.5.4
 #
 #########################################################################################
 FROM build-deps AS pg-semver-pg-build
+ARG PG_VERSION
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
 
 ENV PATH="/usr/local/pgsql/bin/:$PATH"
-RUN wget https://github.com/theory/pg-semver/archive/refs/tags/v0.32.1.tar.gz -O pg_semver.tar.gz && \
+RUN case "${PG_VERSION}" in "v17") \
+    echo "v17 extensions are not supported yet. Quit" && exit 0;; \
+    esac && \
+    wget https://github.com/theory/pg-semver/archive/refs/tags/v0.32.1.tar.gz -O pg_semver.tar.gz && \
     echo "fbdaf7512026d62eec03fad8687c15ed509b6ba395bff140acd63d2e4fbe25d7 pg_semver.tar.gz" | sha256sum --check && \
     mkdir pg_semver-src && cd pg_semver-src && tar xzf ../pg_semver.tar.gz --strip-components=1 -C . && \
     make -j $(getconf _NPROCESSORS_ONLN) && \
@@ -599,7 +707,10 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
 
 ARG PG_VERSION
 ENV PATH="/usr/local/pgsql/bin/:$PATH"
-RUN case "${PG_VERSION}" in \
+RUN case "${PG_VERSION}" in "v17") \
+    echo "v17 extensions are not supported yet. Quit" && exit 0;; \
+    esac && \
+    case "${PG_VERSION}" in \
       "v14" | "v15") \
         export PG_EMBEDDING_VERSION=0.3.5 \
         export PG_EMBEDDING_CHECKSUM=0e95b27b8b6196e2cf0a0c9ec143fe2219b82e54c5bb4ee064e76398cbe69ae9 \
@@ -620,10 +731,14 @@ RUN case "${PG_VERSION}" in \
 #
 #########################################################################################
 FROM build-deps AS pg-anon-pg-build
+ARG PG_VERSION
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
 
 ENV PATH="/usr/local/pgsql/bin/:$PATH"
-RUN wget  https://github.com/neondatabase/postgresql_anonymizer/archive/refs/tags/neon_1.1.1.tar.gz -O pg_anon.tar.gz && \
+RUN case "${PG_VERSION}" in "v17") \
+    echo "v17 extensions are not supported yet. Quit" && exit 0;; \
+    esac && \
+    wget  https://github.com/neondatabase/postgresql_anonymizer/archive/refs/tags/neon_1.1.1.tar.gz -O pg_anon.tar.gz && \
     echo "321ea8d5c1648880aafde850a2c576e4a9e7b9933a34ce272efc839328999fa9  pg_anon.tar.gz" | sha256sum --check && \
     mkdir pg_anon-src && cd pg_anon-src && tar xzf ../pg_anon.tar.gz --strip-components=1 -C . && \
     find /usr/local/pgsql -type f | sed 's|^/usr/local/pgsql/||' > /before.txt &&\
@@ -641,6 +756,7 @@ RUN wget  https://github.com/neondatabase/postgresql_anonymizer/archive/refs/tag
 #
 #########################################################################################
 FROM build-deps AS rust-extensions-build
+ARG PG_VERSION
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
 
 RUN apt-get update && \
@@ -651,9 +767,11 @@ ENV HOME=/home/nonroot
 ENV PATH="/home/nonroot/.cargo/bin:/usr/local/pgsql/bin/:$PATH"
 USER nonroot
 WORKDIR /home/nonroot
-ARG PG_VERSION
 
-RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux-gnu/rustup-init && \
+RUN case "${PG_VERSION}" in "v17") \
+    echo "v17 extensions are not supported yet. Quit" && exit 0;; \
+    esac && \
+    curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux-gnu/rustup-init && \
     chmod +x rustup-init && \
     ./rustup-init -y --no-modify-path --profile minimal --default-toolchain stable && \
     rm rustup-init && \
@@ -672,7 +790,10 @@ USER root
 FROM rust-extensions-build AS pg-jsonschema-pg-build
 ARG PG_VERSION
 
-RUN wget https://github.com/supabase/pg_jsonschema/archive/refs/tags/v0.3.1.tar.gz -O pg_jsonschema.tar.gz && \
+RUN case "${PG_VERSION}" in "v17") \
+    echo "v17 extensions are not supported yet. Quit" && exit 0;; \
+    esac && \
+    wget https://github.com/supabase/pg_jsonschema/archive/refs/tags/v0.3.1.tar.gz -O pg_jsonschema.tar.gz && \
     echo "61df3db1ed83cf24f6aa39c826f8818bfa4f0bd33b587fd6b2b1747985642297 pg_jsonschema.tar.gz" | sha256sum --check && \
     mkdir pg_jsonschema-src && cd pg_jsonschema-src && tar xzf ../pg_jsonschema.tar.gz --strip-components=1 -C . && \
     # see commit 252b3685a27a0f4c31a0f91e983c6314838e89e8
@@ -694,7 +815,10 @@ RUN wget https://github.com/supabase/pg_jsonschema/archive/refs/tags/v0.3.1.tar.
 FROM rust-extensions-build AS pg-graphql-pg-build
 ARG PG_VERSION
 
-RUN wget https://github.com/supabase/pg_graphql/archive/refs/tags/v1.5.7.tar.gz -O pg_graphql.tar.gz && \
+RUN case "${PG_VERSION}" in "v17") \
+    echo "v17 extensions are not supported yet. Quit" && exit 0;; \
+    esac && \
+    wget https://github.com/supabase/pg_graphql/archive/refs/tags/v1.5.7.tar.gz -O pg_graphql.tar.gz && \
     echo "2b3e567a5b31019cb97ae0e33263c1bcc28580be5a444ac4c8ece5c4be2aea41 pg_graphql.tar.gz" | sha256sum --check && \
     mkdir pg_graphql-src && cd pg_graphql-src && tar xzf ../pg_graphql.tar.gz --strip-components=1 -C . && \
     sed -i 's/pgrx = "=0.11.3"/pgrx = { version = "0.11.3", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
@@ -714,7 +838,10 @@ FROM rust-extensions-build AS pg-tiktoken-pg-build
 ARG PG_VERSION
 
 # 26806147b17b60763039c6a6878884c41a262318 made on 26/09/2023
-RUN wget https://github.com/kelvich/pg_tiktoken/archive/26806147b17b60763039c6a6878884c41a262318.tar.gz -O pg_tiktoken.tar.gz && \
+RUN case "${PG_VERSION}" in "v17") \
+    echo "v17 extensions are not supported yet. Quit" && exit 0;; \
+    esac && \
+    wget https://github.com/kelvich/pg_tiktoken/archive/26806147b17b60763039c6a6878884c41a262318.tar.gz -O pg_tiktoken.tar.gz && \
     echo "e64e55aaa38c259512d3e27c572da22c4637418cf124caba904cd50944e5004e pg_tiktoken.tar.gz" | sha256sum --check && \
     mkdir pg_tiktoken-src && cd pg_tiktoken-src && tar xzf ../pg_tiktoken.tar.gz --strip-components=1 -C . && \
     # TODO update pgrx version in the pg_tiktoken repo and remove this line
@@ -733,7 +860,10 @@ RUN wget https://github.com/kelvich/pg_tiktoken/archive/26806147b17b60763039c6a6
 FROM rust-extensions-build AS pg-pgx-ulid-build
 ARG PG_VERSION
 
-RUN wget https://github.com/pksunkara/pgx_ulid/archive/refs/tags/v0.1.5.tar.gz -O pgx_ulid.tar.gz && \
+RUN case "${PG_VERSION}" in "v17") \
+    echo "v17 extensions are not supported yet. Quit" && exit 0;; \
+    esac && \
+    wget https://github.com/pksunkara/pgx_ulid/archive/refs/tags/v0.1.5.tar.gz -O pgx_ulid.tar.gz && \
     echo "9d1659a2da65af0133d5451c454de31b37364e3502087dadf579f790bc8bef17 pgx_ulid.tar.gz" | sha256sum --check && \
     mkdir pgx_ulid-src && cd pgx_ulid-src && tar xzf ../pgx_ulid.tar.gz --strip-components=1 -C . && \
     sed -i 's/pgrx       = "^0.11.2"/pgrx = { version = "=0.11.3", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
@@ -748,10 +878,14 @@ RUN wget https://github.com/pksunkara/pgx_ulid/archive/refs/tags/v0.1.5.tar.gz -
 #########################################################################################
 
 FROM build-deps AS wal2json-pg-build
+ARG PG_VERSION
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
 
 ENV PATH="/usr/local/pgsql/bin/:$PATH"
-RUN wget https://github.com/eulerto/wal2json/archive/refs/tags/wal2json_2_5.tar.gz && \
+RUN case "${PG_VERSION}" in "v17") \
+    echo "v17 extensions are not supported yet. Quit" && exit 0;; \
+    esac && \
+    wget https://github.com/eulerto/wal2json/archive/refs/tags/wal2json_2_5.tar.gz && \
     echo "b516653575541cf221b99cf3f8be9b6821f6dbcfc125675c85f35090f824f00e wal2json_2_5.tar.gz" | sha256sum --check && \
     mkdir wal2json-src && cd wal2json-src && tar xzf ../wal2json_2_5.tar.gz --strip-components=1 -C . && \
     make -j $(getconf _NPROCESSORS_ONLN) && \
@@ -764,10 +898,14 @@ RUN wget https://github.com/eulerto/wal2json/archive/refs/tags/wal2json_2_5.tar.
 #
 #########################################################################################
 FROM build-deps AS pg-ivm-build
+ARG PG_VERSION
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
 
 ENV PATH="/usr/local/pgsql/bin/:$PATH"
-RUN wget https://github.com/sraoss/pg_ivm/archive/refs/tags/v1.7.tar.gz -O pg_ivm.tar.gz && \
+RUN case "${PG_VERSION}" in "v17") \
+    echo "v17 extensions are not supported yet. Quit" && exit 0;; \
+    esac && \
+    wget https://github.com/sraoss/pg_ivm/archive/refs/tags/v1.7.tar.gz -O pg_ivm.tar.gz && \
     echo "ebfde04f99203c7be4b0e873f91104090e2e83e5429c32ac242d00f334224d5e pg_ivm.tar.gz" | sha256sum --check && \
     mkdir pg_ivm-src && cd pg_ivm-src && tar xzf ../pg_ivm.tar.gz --strip-components=1 -C . && \
     make -j $(getconf _NPROCESSORS_ONLN) && \
@@ -781,10 +919,14 @@ RUN wget https://github.com/sraoss/pg_ivm/archive/refs/tags/v1.7.tar.gz -O pg_iv
 #
 #########################################################################################
 FROM build-deps AS pg-partman-build
+ARG PG_VERSION
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
 
 ENV PATH="/usr/local/pgsql/bin/:$PATH"
-RUN wget https://github.com/pgpartman/pg_partman/archive/refs/tags/v5.0.1.tar.gz -O pg_partman.tar.gz && \
+RUN case "${PG_VERSION}" in "v17") \
+    echo "v17 extensions are not supported yet. Quit" && exit 0;; \
+    esac && \
+    wget https://github.com/pgpartman/pg_partman/archive/refs/tags/v5.0.1.tar.gz -O pg_partman.tar.gz && \
     echo "75b541733a9659a6c90dbd40fccb904a630a32880a6e3044d0c4c5f4c8a65525 pg_partman.tar.gz" | sha256sum --check && \
     mkdir pg_partman-src && cd pg_partman-src && tar xzf ../pg_partman.tar.gz --strip-components=1 -C . && \
     make -j $(getconf _NPROCESSORS_ONLN) && \
@@ -835,7 +977,10 @@ COPY --from=pg-ivm-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY --from=pg-partman-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY pgxn/ pgxn/
 
-RUN make -j $(getconf _NPROCESSORS_ONLN) \
+RUN case "${PG_VERSION}" in "v17") \
+    echo "v17 extensions are not supported yet. Quit" && exit 0;; \
+    esac && \
+    make -j $(getconf _NPROCESSORS_ONLN) \
         PG_CONFIG=/usr/local/pgsql/bin/pg_config \
         -C pgxn/neon \
         -s install && \
@@ -854,8 +999,8 @@ RUN make -j $(getconf _NPROCESSORS_ONLN) \
     case "${PG_VERSION}" in \
         "v14" | "v15") \
         ;; \
-        "v16") \
-            echo "Skipping HNSW for PostgreSQL 16" && exit 0 \
+        "v16" | "v17") \
+            echo "Skipping HNSW for PostgreSQL ${PG_VERSION}" && exit 0 \
         ;; \
         *) \
             echo "unexpected PostgreSQL version" && exit 1 \
@@ -878,7 +1023,10 @@ ENV BUILD_TAG=$BUILD_TAG
 USER nonroot
 # Copy entire project to get Cargo.* files with proper dependencies for the whole project
 COPY --chown=nonroot . .
-RUN cd compute_tools && mold -run cargo build --locked --profile release-line-debug-size-lto
+RUN case "${PG_VERSION}" in "v17") \
+    echo "v17 extensions are not supported yet. Quit" && exit 0;; \
+    esac && \
+    cd compute_tools && mold -run cargo build --locked --profile release-line-debug-size-lto
 
 #########################################################################################
 #
@@ -899,15 +1047,24 @@ FROM neon-pg-ext-build AS postgres-cleanup-layer
 COPY --from=neon-pg-ext-build /usr/local/pgsql /usr/local/pgsql
 
 # Remove binaries from /bin/ that we won't use (or would manually copy & install otherwise)
-RUN cd /usr/local/pgsql/bin && rm ecpg raster2pgsql shp2pgsql pgtopo_export pgtopo_import pgsql2shp
+RUN case "${PG_VERSION}" in "v17") \
+    echo "v17 extensions are not supported yet. Quit" && exit 0;; \
+    esac && \
+    cd /usr/local/pgsql/bin && rm ecpg raster2pgsql shp2pgsql pgtopo_export pgtopo_import pgsql2shp
 
 # Remove headers that we won't need anymore - we've completed installation of all extensions
-RUN rm -r /usr/local/pgsql/include
+RUN case "${PG_VERSION}" in "v17") \
+    echo "v17 extensions are not supported yet. Quit" && exit 0;; \
+    esac && \
+    rm -r /usr/local/pgsql/include
 
 # Remove static postgresql libraries - all compilation is finished, so we
 # can now remove these files - they must be included in other binaries by now
 # if they were to be used by other libraries.
-RUN rm /usr/local/pgsql/lib/lib*.a
+RUN case "${PG_VERSION}" in "v17") \
+    echo "v17 extensions are not supported yet. Quit" && exit 0;; \
+    esac && \
+    rm /usr/local/pgsql/lib/lib*.a
 
 
 #########################################################################################
@@ -918,7 +1075,10 @@ RUN rm /usr/local/pgsql/lib/lib*.a
 
 FROM neon-pg-ext-build AS neon-pg-ext-test
 ARG PG_VERSION
-RUN mkdir /ext-src
+RUN case "${PG_VERSION}" in "v17") \
+    echo "v17 extensions are not supported yet. Quit" && exit 0;; \
+    esac && \
+    mkdir /ext-src
 
 #COPY --from=postgis-build /postgis.tar.gz /ext-src/
 #COPY --from=postgis-build /sfcgal/* /usr
@@ -956,18 +1116,39 @@ COPY --from=pg-anon-pg-build /pg_anon.tar.gz /ext-src
 COPY patches/pg_anon.patch /ext-src
 COPY --from=pg-ivm-build /pg_ivm.tar.gz /ext-src
 COPY --from=pg-partman-build /pg_partman.tar.gz /ext-src
-RUN cd /ext-src/ && for f in *.tar.gz; \
+RUN case "${PG_VERSION}" in "v17") \
+    echo "v17 extensions are not supported yet. Quit" && exit 0;; \
+    esac && \
+    cd /ext-src/ && for f in *.tar.gz; \
     do echo $f; dname=$(echo $f | sed 's/\.tar.*//')-src; \
     rm -rf $dname; mkdir $dname; tar xzf $f --strip-components=1 -C $dname \
     || exit 1; rm -f $f; done
-RUN cd /ext-src/pgvector-src && patch -p1 <../pgvector.patch
-RUN cd /ext-src/rum-src && patch -p1 <../rum.patch
+RUN case "${PG_VERSION}" in "v17") \
+    echo "v17 extensions are not supported yet. Quit" && exit 0;; \
+    esac && \
+    cd /ext-src/rum-src && patch -p1 <../rum.patch
+RUN case "${PG_VERSION}" in "v17") \
+    echo "v17 extensions are not supported yet. Quit" && exit 0;; \
+    esac && \
+    cd /ext-src/pgvector-src && patch -p1 <../pgvector.patch
 # cmake is required for the h3 test
-RUN apt-get update && apt-get install -y cmake
-RUN cd /ext-src/pg_hint_plan-src && patch -p1 < /ext-src/pg_hint_plan.patch
+RUN case "${PG_VERSION}" in "v17") \
+    echo "v17 extensions are not supported yet. Quit" && exit 0;; \
+    esac && \
+    apt-get update && apt-get install -y cmake
+RUN case "${PG_VERSION}" in "v17") \
+    echo "v17 extensions are not supported yet. Quit" && exit 0;; \
+    esac && \
+    cd /ext-src/pg_hint_plan-src && patch -p1 < /ext-src/pg_hint_plan.patch
 COPY --chmod=755 docker-compose/run-tests.sh /run-tests.sh
-RUN patch -p1 </ext-src/pg_anon.patch
-RUN patch -p1 </ext-src/pg_cron.patch
+RUN case "${PG_VERSION}" in "v17") \
+    echo "v17 extensions are not supported yet. Quit" && exit 0;; \
+    esac && \
+    patch -p1 </ext-src/pg_anon.patch
+RUN case "${PG_VERSION}" in "v17") \
+    echo "v17 extensions are not supported yet. Quit" && exit 0;; \
+    esac && \
+    patch -p1 </ext-src/pg_cron.patch
 ENV PATH=/usr/local/pgsql/bin:$PATH
 ENV PGHOST=compute
 ENV PGPORT=55433
diff --git a/Makefile b/Makefile
index de298303e3..b9bb1c147d 100644
--- a/Makefile
+++ b/Makefile
@@ -119,6 +119,8 @@ $(POSTGRES_INSTALL_DIR)/build/%/config.status:
 # I'm not sure why it wouldn't work, but this is the only place (apart from
 # the "build-all-versions" entry points) where direct mention of PostgreSQL
 # versions is used.
+.PHONY: postgres-configure-v17
+postgres-configure-v17: $(POSTGRES_INSTALL_DIR)/build/v17/config.status
 .PHONY: postgres-configure-v16
 postgres-configure-v16: $(POSTGRES_INSTALL_DIR)/build/v16/config.status
 .PHONY: postgres-configure-v15
@@ -215,29 +217,31 @@ neon-pg-clean-ext-%:
 # they depend on openssl and other libraries that are not included in our
 # Rust build.
 .PHONY: walproposer-lib
-walproposer-lib: neon-pg-ext-v16
+walproposer-lib: neon-pg-ext-v17
 	+@echo "Compiling walproposer-lib"
 	mkdir -p $(POSTGRES_INSTALL_DIR)/build/walproposer-lib
-	$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/v16/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \
+	$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/v17/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \
 		-C $(POSTGRES_INSTALL_DIR)/build/walproposer-lib \
 		-f $(ROOT_PROJECT_DIR)/pgxn/neon/Makefile walproposer-lib
-	cp $(POSTGRES_INSTALL_DIR)/v16/lib/libpgport.a $(POSTGRES_INSTALL_DIR)/build/walproposer-lib
-	cp $(POSTGRES_INSTALL_DIR)/v16/lib/libpgcommon.a $(POSTGRES_INSTALL_DIR)/build/walproposer-lib
-ifeq ($(UNAME_S),Linux)
+	cp $(POSTGRES_INSTALL_DIR)/v17/lib/libpgport.a $(POSTGRES_INSTALL_DIR)/build/walproposer-lib
+	cp $(POSTGRES_INSTALL_DIR)/v17/lib/libpgcommon.a $(POSTGRES_INSTALL_DIR)/build/walproposer-lib
 	$(AR) d $(POSTGRES_INSTALL_DIR)/build/walproposer-lib/libpgport.a \
 		pg_strong_random.o
 	$(AR) d $(POSTGRES_INSTALL_DIR)/build/walproposer-lib/libpgcommon.a \
-		pg_crc32c.o \
-		hmac_openssl.o \
+		checksum_helper.o \
 		cryptohash_openssl.o \
-		scram-common.o \
+		hmac_openssl.o \
 		md5_common.o \
-		checksum_helper.o
+		parse_manifest.o \
+		scram-common.o
+ifeq ($(UNAME_S),Linux)
+	$(AR) d $(POSTGRES_INSTALL_DIR)/build/walproposer-lib/libpgcommon.a \
+		pg_crc32c.o
 endif
 
 .PHONY: walproposer-lib-clean
 walproposer-lib-clean:
-	$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/v16/bin/pg_config \
+	$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/v17/bin/pg_config \
 		-C $(POSTGRES_INSTALL_DIR)/build/walproposer-lib \
 		-f $(ROOT_PROJECT_DIR)/pgxn/neon/Makefile clean
 
@@ -245,38 +249,44 @@ walproposer-lib-clean:
 neon-pg-ext: \
 	neon-pg-ext-v14 \
 	neon-pg-ext-v15 \
-	neon-pg-ext-v16
+	neon-pg-ext-v16 \
+	neon-pg-ext-v17
 
 .PHONY: neon-pg-clean-ext
 neon-pg-clean-ext: \
 	neon-pg-clean-ext-v14 \
 	neon-pg-clean-ext-v15 \
-	neon-pg-clean-ext-v16
+	neon-pg-clean-ext-v16 \
+	neon-pg-clean-ext-v17
 
 # shorthand to build all Postgres versions
 .PHONY: postgres
 postgres: \
 	postgres-v14 \
 	postgres-v15 \
-	postgres-v16
+	postgres-v16 \
+	postgres-v17
 
 .PHONY: postgres-headers
 postgres-headers: \
 	postgres-headers-v14 \
 	postgres-headers-v15 \
-	postgres-headers-v16
+	postgres-headers-v16 \
+	postgres-headers-v17
 
 .PHONY: postgres-clean
 postgres-clean: \
 	postgres-clean-v14 \
 	postgres-clean-v15 \
-	postgres-clean-v16
+	postgres-clean-v16 \
+	postgres-clean-v17
 
 .PHONY: postgres-check
 postgres-check: \
 	postgres-check-v14 \
 	postgres-check-v15 \
-	postgres-check-v16
+	postgres-check-v16 \
+	postgres-check-v17
 
 # This doesn't remove the effects of 'configure'.
 .PHONY: clean
@@ -321,13 +331,13 @@ postgres-%-pgindent: postgres-%-pg-bsd-indent postgres-%-typedefs.list
 	rm -f pg*.BAK
 
 # Indent pxgn/neon.
-.PHONY: pgindent
-neon-pgindent: postgres-v16-pg-bsd-indent neon-pg-ext-v16
-	$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/v16/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \
-		FIND_TYPEDEF=$(ROOT_PROJECT_DIR)/vendor/postgres-v16/src/tools/find_typedef \
-		INDENT=$(POSTGRES_INSTALL_DIR)/build/v16/src/tools/pg_bsd_indent/pg_bsd_indent \
-		PGINDENT_SCRIPT=$(ROOT_PROJECT_DIR)/vendor/postgres-v16/src/tools/pgindent/pgindent \
-		-C $(POSTGRES_INSTALL_DIR)/build/neon-v16 \
+.PHONY: neon-pgindent
+neon-pgindent: postgres-v17-pg-bsd-indent neon-pg-ext-v17
+	$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/v17/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \
+		FIND_TYPEDEF=$(ROOT_PROJECT_DIR)/vendor/postgres-v17/src/tools/find_typedef \
+		INDENT=$(POSTGRES_INSTALL_DIR)/build/v17/src/tools/pg_bsd_indent/pg_bsd_indent \
+		PGINDENT_SCRIPT=$(ROOT_PROJECT_DIR)/vendor/postgres-v17/src/tools/pgindent/pgindent \
+		-C $(POSTGRES_INSTALL_DIR)/build/neon-v17 \
 		-f $(ROOT_PROJECT_DIR)/pgxn/neon/Makefile pgindent
 
 
diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs
index 5bd6897fe3..1f47bb58a3 100644
--- a/compute_tools/src/compute.rs
+++ b/compute_tools/src/compute.rs
@@ -1052,26 +1052,19 @@ impl ComputeNode {
         let pg_process = self.start_postgres(pspec.storage_auth_token.clone())?;
 
         let config_time = Utc::now();
-        if pspec.spec.mode == ComputeMode::Primary {
-            if !pspec.spec.skip_pg_catalog_updates {
-                let pgdata_path = Path::new(&self.pgdata);
-                // temporarily reset max_cluster_size in config
-                // to avoid the possibility of hitting the limit, while we are applying config:
-                // creating new extensions, roles, etc...
-                config::with_compute_ctl_tmp_override(
-                    pgdata_path,
-                    "neon.max_cluster_size=-1",
-                    || {
-                        self.pg_reload_conf()?;
-
-                        self.apply_config(&compute_state)?;
-
-                        Ok(())
-                    },
-                )?;
+        if pspec.spec.mode == ComputeMode::Primary && !pspec.spec.skip_pg_catalog_updates {
+            let pgdata_path = Path::new(&self.pgdata);
+            // temporarily reset max_cluster_size in config
+            // to avoid the possibility of hitting the limit, while we are applying config:
+            // creating new extensions, roles, etc...
+            config::with_compute_ctl_tmp_override(pgdata_path, "neon.max_cluster_size=-1", || {
                 self.pg_reload_conf()?;
-            }
-            self.post_apply_config()?;
+
+                self.apply_config(&compute_state)?;
+
+                Ok(())
+            })?;
+            self.pg_reload_conf()?;
         }
 
         let startup_end_time = Utc::now();
diff --git a/compute_tools/src/extension_server.rs b/compute_tools/src/extension_server.rs
index ef1db73982..6ef7e0837f 100644
--- a/compute_tools/src/extension_server.rs
+++ b/compute_tools/src/extension_server.rs
@@ -124,6 +124,7 @@ fn parse_pg_version(human_version: &str) -> &str {
             "14" => return "v14",
             "15" => return "v15",
             "16" => return "v16",
+            "17" => return "v17",
             _ => {}
         },
         _ => {}
diff --git a/control_plane/src/local_env.rs b/control_plane/src/local_env.rs
index 5dbc3bcbbc..d616154af6 100644
--- a/control_plane/src/local_env.rs
+++ b/control_plane/src/local_env.rs
@@ -342,7 +342,7 @@ impl LocalEnv {
 
         #[allow(clippy::manual_range_patterns)]
         match pg_version {
-            14 | 15 | 16 => Ok(path.join(format!("v{pg_version}"))),
+            14 | 15 | 16 | 17 => Ok(path.join(format!("v{pg_version}"))),
             _ => bail!("Unsupported postgres version: {}", pg_version),
         }
     }
diff --git a/control_plane/src/storage_controller.rs b/control_plane/src/storage_controller.rs
index c715d6b789..2b714fbfbf 100644
--- a/control_plane/src/storage_controller.rs
+++ b/control_plane/src/storage_controller.rs
@@ -28,6 +28,7 @@ use utils::{
     auth::{encode_from_key_file, Claims, Scope},
     id::{NodeId, TenantId},
 };
+use whoami::username;
 
 pub struct StorageController {
     env: LocalEnv,
@@ -183,7 +184,7 @@ impl StorageController {
     /// to other versions if that one isn't found.  Some automated tests create circumstances
     /// where only one version is available in pg_distrib_dir, such as `test_remote_extensions`.
     async fn get_pg_dir(&self, dir_name: &str) -> anyhow::Result<Utf8PathBuf> {
-        let prefer_versions = [STORAGE_CONTROLLER_POSTGRES_VERSION, 15, 14];
+        let prefer_versions = [STORAGE_CONTROLLER_POSTGRES_VERSION, 16, 15, 14];
 
         for v in prefer_versions {
             let path = Utf8PathBuf::from_path_buf(self.env.pg_dir(v, dir_name)?).unwrap();
@@ -211,7 +212,16 @@ impl StorageController {
     /// Readiness check for our postgres process
     async fn pg_isready(&self, pg_bin_dir: &Utf8Path, postgres_port: u16) -> anyhow::Result<bool> {
         let bin_path = pg_bin_dir.join("pg_isready");
-        let args = ["-h", "localhost", "-p", &format!("{}", postgres_port)];
+        let args = [
+            "-h",
+            "localhost",
+            "-U",
+            &username(),
+            "-d",
+            DB_NAME,
+            "-p",
+            &format!("{}", postgres_port),
+        ];
         let exitcode = Command::new(bin_path).args(args).spawn()?.wait().await?;
 
         Ok(exitcode.success())
@@ -225,7 +235,11 @@ impl StorageController {
     ///
     /// Returns the database url
     pub async fn setup_database(&self, postgres_port: u16) -> anyhow::Result<String> {
-        let database_url = format!("postgresql://localhost:{}/{DB_NAME}", postgres_port);
+        let database_url = format!(
+            "postgresql://{}@localhost:{}/{DB_NAME}",
+            &username(),
+            postgres_port
+        );
 
         let pg_bin_dir = self.get_pg_bin_dir().await?;
         let createdb_path = pg_bin_dir.join("createdb");
@@ -235,6 +249,10 @@ impl StorageController {
                 "localhost",
                 "-p",
                 &format!("{}", postgres_port),
+                "-U",
+                &username(),
+                "-O",
+                &username(),
                 DB_NAME,
             ])
             .output()
@@ -271,7 +289,7 @@ impl StorageController {
             // But tokio-postgres fork doesn't have this upstream commit:
             // https://github.com/sfackler/rust-postgres/commit/cb609be758f3fb5af537f04b584a2ee0cebd5e79
             // => we should rebase our fork => TODO https://github.com/neondatabase/neon/issues/8399
-            .user(&whoami::username())
+            .user(&username())
             .dbname(DB_NAME)
             .connect(tokio_postgres::NoTls)
             .await
@@ -328,6 +346,12 @@ impl StorageController {
             let pg_log_path = pg_data_path.join("postgres.log");
 
             if !tokio::fs::try_exists(&pg_data_path).await? {
+                let initdb_args = ["-D", pg_data_path.as_ref(), "--username", &username()];
+                tracing::info!(
+                    "Initializing storage controller database with args: {:?}",
+                    initdb_args
+                );
+
                 // Initialize empty database
                 let initdb_path = pg_bin_dir.join("initdb");
                 let mut child = Command::new(&initdb_path)
@@ -335,7 +359,7 @@ impl StorageController {
                         ("LD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()),
                         ("DYLD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()),
                     ])
-                    .args(["-D", pg_data_path.as_ref()])
+                    .args(initdb_args)
                     .spawn()
                     .expect("Failed to spawn initdb");
                 let status = child.wait().await?;
@@ -364,8 +388,14 @@ impl StorageController {
                 pg_data_path.as_ref(),
                 "-l",
                 pg_log_path.as_ref(),
+                "-U",
+                &username(),
                 "start",
             ];
+            tracing::info!(
+                "Starting storage controller database with args: {:?}",
+                db_start_args
+            );
 
             background_process::start_process(
                 "storage_controller_db",
diff --git a/libs/pageserver_api/src/key.rs b/libs/pageserver_api/src/key.rs
index 8929ccb41d..4a776709c9 100644
--- a/libs/pageserver_api/src/key.rs
+++ b/libs/pageserver_api/src/key.rs
@@ -1,8 +1,8 @@
 use anyhow::{bail, Result};
 use byteorder::{ByteOrder, BE};
 use postgres_ffi::relfile_utils::{FSM_FORKNUM, VISIBILITYMAP_FORKNUM};
+use postgres_ffi::Oid;
 use postgres_ffi::RepOriginId;
-use postgres_ffi::{Oid, TransactionId};
 use serde::{Deserialize, Serialize};
 use std::{fmt, ops::Range};
 
@@ -350,7 +350,17 @@ impl Key {
 // 02 00000000 00000000 00000000 00   00000000
 //
 // TwoPhaseFile:
-// 02 00000000 00000000 00000000 00   XID
+//
+// 02 00000000 00000000 00XXXXXX XX   XXXXXXXX
+//
+//                        \______XID_________/
+//
+// The 64-bit XID is split across three fields: the low 32 bits go in
+// field6, the next 8 bits in field5, and the top 24 bits in field4.
+// PostgreSQL v16 and below only stored a 32-bit XID, which fit
+// completely in field6, but starting with PostgreSQL v17, a full
+// 64-bit XID is used. Most pageserver code that accesses TwoPhaseFiles
+// now deals with 64-bit XIDs even on v16; the high bits are just unused.
 //
 // ControlFile:
 // 03 00000000 00000000 00000000 00   00000000
@@ -582,35 +592,36 @@ pub const TWOPHASEDIR_KEY: Key = Key {
 };
 
 #[inline(always)]
-pub fn twophase_file_key(xid: TransactionId) -> Key {
+pub fn twophase_file_key(xid: u64) -> Key {
     Key {
         field1: 0x02,
         field2: 0,
         field3: 0,
-        field4: 0,
-        field5: 0,
-        field6: xid,
+        field4: ((xid & 0xFFFFFF0000000000) >> 40) as u32,
+        field5: ((xid & 0x000000FF00000000) >> 32) as u8,
+        field6: (xid & 0x00000000FFFFFFFF) as u32,
     }
 }
 
 #[inline(always)]
-pub fn twophase_key_range(xid: TransactionId) -> Range<Key> {
+pub fn twophase_key_range(xid: u64) -> Range<Key> {
+    // 64-bit XIDs should never overflow in practice; if the add wraps, the carry goes into field3
     let (next_xid, overflowed) = xid.overflowing_add(1);
 
     Key {
         field1: 0x02,
         field2: 0,
         field3: 0,
-        field4: 0,
-        field5: 0,
-        field6: xid,
+        field4: ((xid & 0xFFFFFF0000000000) >> 40) as u32,
+        field5: ((xid & 0x000000FF00000000) >> 32) as u8,
+        field6: (xid & 0x00000000FFFFFFFF) as u32,
     }..Key {
         field1: 0x02,
         field2: 0,
-        field3: 0,
-        field4: 0,
-        field5: u8::from(overflowed),
-        field6: next_xid,
+        field3: u32::from(overflowed),
+        field4: ((next_xid & 0xFFFFFF0000000000) >> 40) as u32,
+        field5: ((next_xid & 0x000000FF00000000) >> 32) as u8,
+        field6: (next_xid & 0x00000000FFFFFFFF) as u32,
     }
 }
 
diff --git a/libs/postgres_ffi/build.rs b/libs/postgres_ffi/build.rs
index a346390f3d..d3a85f2683 100644
--- a/libs/postgres_ffi/build.rs
+++ b/libs/postgres_ffi/build.rs
@@ -56,7 +56,7 @@ fn main() -> anyhow::Result<()> {
         PathBuf::from("pg_install")
     };
 
-    for pg_version in &["v14", "v15", "v16"] {
+    for pg_version in &["v14", "v15", "v16", "v17"] {
         let mut pg_install_dir_versioned = pg_install_dir.join(pg_version);
         if pg_install_dir_versioned.is_relative() {
             let cwd = env::current_dir().context("Failed to get current_dir")?;
diff --git a/libs/postgres_ffi/src/lib.rs b/libs/postgres_ffi/src/lib.rs
index f18e0c603b..0d46ed6aac 100644
--- a/libs/postgres_ffi/src/lib.rs
+++ b/libs/postgres_ffi/src/lib.rs
@@ -57,6 +57,7 @@ macro_rules! for_all_postgres_versions {
         $macro!(v14);
         $macro!(v15);
         $macro!(v16);
+        $macro!(v17);
     };
 }
 
@@ -91,6 +92,7 @@ macro_rules! dispatch_pgversion {
                 14 : v14,
                 15 : v15,
                 16 : v16,
+                17 : v17,
             ]
         )
     };
@@ -121,6 +123,7 @@ macro_rules! enum_pgversion_dispatch {
                 V14 : v14,
                 V15 : v15,
                 V16 : v16,
+                V17 : v17,
             ]
         )
     };
@@ -150,6 +153,7 @@ macro_rules! enum_pgversion {
                 V14 : v14,
                 V15 : v15,
                 V16 : v16,
+                V17 : v17,
             ]
         }
     };
@@ -162,6 +166,7 @@ macro_rules! enum_pgversion {
                 V14 : v14,
                 V15 : v15,
                 V16 : v16,
+                V17 : v17,
             ]
         }
     };
diff --git a/libs/postgres_ffi/src/pg_constants.rs b/libs/postgres_ffi/src/pg_constants.rs
index 6ce855c78e..61b49a634d 100644
--- a/libs/postgres_ffi/src/pg_constants.rs
+++ b/libs/postgres_ffi/src/pg_constants.rs
@@ -152,6 +152,9 @@ pub const XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED: u8 = (1 << 0) as u8;
 pub const XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED: u8 = (1 << 1) as u8;
 pub const XLH_DELETE_ALL_VISIBLE_CLEARED: u8 = (1 << 0) as u8;
 
+// From heapam_xlog.h
+pub const XLOG_HEAP2_REWRITE: u8 = 0x00;
+
 // From replication/message.h
 pub const XLOG_LOGICAL_MESSAGE: u8 = 0x00;
 
@@ -219,15 +222,20 @@ pub const INVALID_TRANSACTION_ID: u32 = 0;
 pub const FIRST_BOOTSTRAP_OBJECT_ID: u32 = 12000;
 pub const FIRST_NORMAL_OBJECT_ID: u32 = 16384;
 
+/* From catalog/pg_control.h */
 pub const XLOG_CHECKPOINT_SHUTDOWN: u8 = 0x00;
 pub const XLOG_CHECKPOINT_ONLINE: u8 = 0x10;
-pub const XLP_FIRST_IS_CONTRECORD: u16 = 0x0001;
-pub const XLP_LONG_HEADER: u16 = 0x0002;
+pub const XLOG_PARAMETER_CHANGE: u8 = 0x60;
+pub const XLOG_END_OF_RECOVERY: u8 = 0x90;
 
 /* From xlog.h */
 pub const XLOG_REPLORIGIN_SET: u8 = 0x00;
 pub const XLOG_REPLORIGIN_DROP: u8 = 0x10;
 
+/* From access/xlog_internal.h */
+pub const XLP_FIRST_IS_CONTRECORD: u16 = 0x0001;
+pub const XLP_LONG_HEADER: u16 = 0x0002;
+
 /* From replication/slot.h */
 pub const REPL_SLOT_ON_DISK_OFFSETOF_RESTART_LSN: usize = 4*4  /* offset of `slotdata` in ReplicationSlotOnDisk  */
    + 64 /* NameData */  + 4*4;
@@ -245,33 +253,6 @@ pub const VM_HEAPBLOCKS_PER_PAGE: u32 =
 /* From origin.c */
 pub const REPLICATION_STATE_MAGIC: u32 = 0x1257DADE;
 
-// List of subdirectories inside pgdata.
-// Copied from src/bin/initdb/initdb.c
-pub const PGDATA_SUBDIRS: [&str; 22] = [
-    "global",
-    "pg_wal/archive_status",
-    "pg_commit_ts",
-    "pg_dynshmem",
-    "pg_notify",
-    "pg_serial",
-    "pg_snapshots",
-    "pg_subtrans",
-    "pg_twophase",
-    "pg_multixact",
-    "pg_multixact/members",
-    "pg_multixact/offsets",
-    "base",
-    "base/1",
-    "pg_replslot",
-    "pg_tblspc",
-    "pg_stat",
-    "pg_stat_tmp",
-    "pg_xact",
-    "pg_logical",
-    "pg_logical/snapshots",
-    "pg_logical/mappings",
-];
-
 // Don't include postgresql.conf as it is inconvenient on node start:
 // we need postgresql.conf before basebackup to synchronize safekeepers
 // so no point in overwriting it during backup restore. Rest of the files
diff --git a/libs/postgres_ffi/src/pg_constants_v14.rs b/libs/postgres_ffi/src/pg_constants_v14.rs
index 32f8f51114..fe01a5df7c 100644
--- a/libs/postgres_ffi/src/pg_constants_v14.rs
+++ b/libs/postgres_ffi/src/pg_constants_v14.rs
@@ -5,6 +5,33 @@ pub const BKPIMAGE_IS_COMPRESSED: u8 = 0x02; /* page image is compressed */
 pub const BKPIMAGE_APPLY: u8 = 0x04; /* page image should be restored during replay */
 pub const SIZEOF_RELMAPFILE: usize = 512; /* sizeof(RelMapFile) in relmapper.c */
 
+// List of subdirectories inside pgdata.
+// Copied from src/bin/initdb/initdb.c
+pub const PGDATA_SUBDIRS: [&str; 22] = [
+    "global",
+    "pg_wal/archive_status",
+    "pg_commit_ts",
+    "pg_dynshmem",
+    "pg_notify",
+    "pg_serial",
+    "pg_snapshots",
+    "pg_subtrans",
+    "pg_twophase",
+    "pg_multixact",
+    "pg_multixact/members",
+    "pg_multixact/offsets",
+    "base",
+    "base/1",
+    "pg_replslot",
+    "pg_tblspc",
+    "pg_stat",
+    "pg_stat_tmp",
+    "pg_xact",
+    "pg_logical",
+    "pg_logical/snapshots",
+    "pg_logical/mappings",
+];
+
 pub fn bkpimg_is_compressed(bimg_info: u8) -> bool {
     (bimg_info & BKPIMAGE_IS_COMPRESSED) != 0
 }
diff --git a/libs/postgres_ffi/src/pg_constants_v15.rs b/libs/postgres_ffi/src/pg_constants_v15.rs
index 626a23c7ea..3cd1b7aec5 100644
--- a/libs/postgres_ffi/src/pg_constants_v15.rs
+++ b/libs/postgres_ffi/src/pg_constants_v15.rs
@@ -11,6 +11,8 @@ pub const BKPIMAGE_COMPRESS_ZSTD: u8 = 0x10; /* page image is compressed */
 
 pub const SIZEOF_RELMAPFILE: usize = 512; /* sizeof(RelMapFile) in relmapper.c */
 
+pub use super::super::v14::bindings::PGDATA_SUBDIRS;
+
 pub fn bkpimg_is_compressed(bimg_info: u8) -> bool {
     const ANY_COMPRESS_FLAG: u8 = BKPIMAGE_COMPRESS_PGLZ | BKPIMAGE_COMPRESS_LZ4 | BKPIMAGE_COMPRESS_ZSTD;
 
diff --git a/libs/postgres_ffi/src/pg_constants_v16.rs b/libs/postgres_ffi/src/pg_constants_v16.rs
index 587be71cb3..31bd5b68fd 100644
--- a/libs/postgres_ffi/src/pg_constants_v16.rs
+++ b/libs/postgres_ffi/src/pg_constants_v16.rs
@@ -11,6 +11,8 @@ pub const BKPIMAGE_COMPRESS_ZSTD: u8 = 0x10; /* page image is compressed */
 
 pub const SIZEOF_RELMAPFILE: usize = 524; /* sizeof(RelMapFile) in relmapper.c */
 
+pub use super::super::v14::bindings::PGDATA_SUBDIRS;
+
 pub fn bkpimg_is_compressed(bimg_info: u8) -> bool {
     const ANY_COMPRESS_FLAG: u8 = BKPIMAGE_COMPRESS_PGLZ | BKPIMAGE_COMPRESS_LZ4 | BKPIMAGE_COMPRESS_ZSTD;
 
diff --git a/libs/postgres_ffi/src/pg_constants_v17.rs b/libs/postgres_ffi/src/pg_constants_v17.rs
new file mode 100644
index 0000000000..2132938680
--- /dev/null
+++ b/libs/postgres_ffi/src/pg_constants_v17.rs
@@ -0,0 +1,55 @@
+pub const XACT_XINFO_HAS_DROPPED_STATS: u32 = 1u32 << 8;
+
+pub const XLOG_DBASE_CREATE_FILE_COPY: u8 = 0x00;
+pub const XLOG_DBASE_CREATE_WAL_LOG: u8 = 0x10;
+pub const XLOG_DBASE_DROP: u8 = 0x20;
+
+pub const BKPIMAGE_APPLY: u8 = 0x02; /* page image should be restored during replay */
+pub const BKPIMAGE_COMPRESS_PGLZ: u8 = 0x04; /* page image is compressed */
+pub const BKPIMAGE_COMPRESS_LZ4: u8 = 0x08; /* page image is compressed */
+pub const BKPIMAGE_COMPRESS_ZSTD: u8 = 0x10; /* page image is compressed */
+
+pub const SIZEOF_RELMAPFILE: usize = 524; /* sizeof(RelMapFile) in relmapper.c */
+
+// List of subdirectories inside pgdata.
+// Copied from src/bin/initdb/initdb.c
+pub const PGDATA_SUBDIRS: [&str; 23] = [
+    "global",
+    "pg_wal/archive_status",
+    "pg_wal/summaries",
+    "pg_commit_ts",
+    "pg_dynshmem",
+    "pg_notify",
+    "pg_serial",
+    "pg_snapshots",
+    "pg_subtrans",
+    "pg_twophase",
+    "pg_multixact",
+    "pg_multixact/members",
+    "pg_multixact/offsets",
+    "base",
+    "base/1",
+    "pg_replslot",
+    "pg_tblspc",
+    "pg_stat",
+    "pg_stat_tmp",
+    "pg_xact",
+    "pg_logical",
+    "pg_logical/snapshots",
+    "pg_logical/mappings",
+];
+
+pub fn bkpimg_is_compressed(bimg_info: u8) -> bool {
+    const ANY_COMPRESS_FLAG: u8 = BKPIMAGE_COMPRESS_PGLZ | BKPIMAGE_COMPRESS_LZ4 | BKPIMAGE_COMPRESS_ZSTD;
+
+    (bimg_info & ANY_COMPRESS_FLAG) != 0
+}
+
+
+pub const XLOG_HEAP2_PRUNE_ON_ACCESS: u8 = 0x10;
+pub const XLOG_HEAP2_PRUNE_VACUUM_SCAN: u8 = 0x20;
+pub const XLOG_HEAP2_PRUNE_VACUUM_CLEANUP: u8 = 0x30;
+
+
+pub const XLOG_OVERWRITE_CONTRECORD: u8 = 0xD0;
+pub const XLOG_CHECKPOINT_REDO: u8 = 0xE0;
diff --git a/libs/postgres_ffi/wal_craft/src/lib.rs b/libs/postgres_ffi/wal_craft/src/lib.rs
index 6052f04d11..949e3f4251 100644
--- a/libs/postgres_ffi/wal_craft/src/lib.rs
+++ b/libs/postgres_ffi/wal_craft/src/lib.rs
@@ -53,7 +53,7 @@ impl Conf {
 
         #[allow(clippy::manual_range_patterns)]
         match self.pg_version {
-            14 | 15 | 16 => Ok(path.join(format!("v{}", self.pg_version))),
+            14 | 15 | 16 | 17 => Ok(path.join(format!("v{}", self.pg_version))),
             _ => bail!("Unsupported postgres version: {}", self.pg_version),
         }
     }
diff --git a/libs/walproposer/build.rs b/libs/walproposer/build.rs
index 28547f52bf..3f549889b8 100644
--- a/libs/walproposer/build.rs
+++ b/libs/walproposer/build.rs
@@ -5,6 +5,8 @@ use std::{env, path::PathBuf, process::Command};
 
 use anyhow::{anyhow, Context};
 
+const WALPROPOSER_PG_VERSION: &str = "v17";
+
 fn main() -> anyhow::Result<()> {
     // Tell cargo to invalidate the built crate whenever the wrapper changes
     println!("cargo:rerun-if-changed=bindgen_deps.h");
@@ -36,7 +38,10 @@ fn main() -> anyhow::Result<()> {
     // Rebuild crate when libwalproposer.a changes
     println!("cargo:rerun-if-changed={walproposer_lib_search_str}/libwalproposer.a");
 
-    let pg_config_bin = pg_install_abs.join("v16").join("bin").join("pg_config");
+    let pg_config_bin = pg_install_abs
+        .join(WALPROPOSER_PG_VERSION)
+        .join("bin")
+        .join("pg_config");
     let inc_server_path: String = if pg_config_bin.exists() {
         let output = Command::new(pg_config_bin)
             .arg("--includedir-server")
@@ -53,7 +58,7 @@ fn main() -> anyhow::Result<()> {
             .into()
     } else {
         let server_path = pg_install_abs
-            .join("v16")
+            .join(WALPROPOSER_PG_VERSION)
             .join("include")
             .join("postgresql")
             .join("server")
diff --git a/pageserver/ctl/src/layer_map_analyzer.rs b/pageserver/ctl/src/layer_map_analyzer.rs
index a07107753e..adc090823d 100644
--- a/pageserver/ctl/src/layer_map_analyzer.rs
+++ b/pageserver/ctl/src/layer_map_analyzer.rs
@@ -79,16 +79,24 @@ pub(crate) fn parse_filename(name: &str) -> Option<LayerFile> {
         return None;
     }
     let keys: Vec<&str> = split[0].split('-').collect();
-    let mut lsns: Vec<&str> = split[1].split('-').collect();
-    let is_delta = if lsns.len() == 1 {
-        lsns.push(lsns[0]);
+    let lsn_and_opt_generation: Vec<&str> = split[1].split('v').collect();
+    let lsns: Vec<&str> = lsn_and_opt_generation[0].split('-').collect();
+    let the_lsns: [&str; 2];
+
+    /*
+     * Generations add a -vX-XXXXXX postfix, which causes issues when we try to
+     * parse 'vX' as an LSN.
+     */
+    let is_delta = if lsns.len() == 1 || lsns[1].is_empty() {
+        the_lsns = [lsns[0], lsns[0]];
         false
     } else {
+        the_lsns = [lsns[0], lsns[1]];
         true
     };
 
     let key_range = Key::from_hex(keys[0]).unwrap()..Key::from_hex(keys[1]).unwrap();
-    let lsn_range = Lsn::from_hex(lsns[0]).unwrap()..Lsn::from_hex(lsns[1]).unwrap();
+    let lsn_range = Lsn::from_hex(the_lsns[0]).unwrap()..Lsn::from_hex(the_lsns[1]).unwrap();
     let holes = Vec::new();
     Some(LayerFile {
         key_range,
diff --git a/pageserver/src/basebackup.rs b/pageserver/src/basebackup.rs
index 207f781e1b..a32d09f3b3 100644
--- a/pageserver/src/basebackup.rs
+++ b/pageserver/src/basebackup.rs
@@ -30,9 +30,8 @@ use pageserver_api::reltag::{RelTag, SlruKind};
 
 use postgres_ffi::dispatch_pgversion;
 use postgres_ffi::pg_constants::{DEFAULTTABLESPACE_OID, GLOBALTABLESPACE_OID};
-use postgres_ffi::pg_constants::{PGDATA_SPECIAL_FILES, PGDATA_SUBDIRS, PG_HBA};
+use postgres_ffi::pg_constants::{PGDATA_SPECIAL_FILES, PG_HBA};
 use postgres_ffi::relfile_utils::{INIT_FORKNUM, MAIN_FORKNUM};
-use postgres_ffi::TransactionId;
 use postgres_ffi::XLogFileName;
 use postgres_ffi::PG_TLI;
 use postgres_ffi::{BLCKSZ, RELSEG_SIZE, WAL_SEGMENT_SIZE};
@@ -255,8 +254,11 @@ where
 
         let lazy_slru_download = self.timeline.get_lazy_slru_download() && !self.full_backup;
 
+        let pgversion = self.timeline.pg_version;
+        let subdirs = dispatch_pgversion!(pgversion, &pgv::bindings::PGDATA_SUBDIRS[..]);
+
         // Create pgdata subdirs structure
-        for dir in PGDATA_SUBDIRS.iter() {
+        for dir in subdirs.iter() {
             let header = new_tar_header_dir(dir)?;
             self.ar
                 .append(&header, &mut io::empty())
@@ -606,7 +608,7 @@ where
     //
     // Extract twophase state files
     //
-    async fn add_twophase_file(&mut self, xid: TransactionId) -> Result<(), BasebackupError> {
+    async fn add_twophase_file(&mut self, xid: u64) -> Result<(), BasebackupError> {
         let img = self
             .timeline
             .get_twophase_file(xid, self.lsn, self.ctx)
@@ -617,7 +619,11 @@ where
         buf.extend_from_slice(&img[..]);
         let crc = crc32c::crc32c(&img[..]);
         buf.put_u32_le(crc);
-        let path = format!("pg_twophase/{:>08X}", xid);
+        let path = if self.timeline.pg_version < 17 {
+            format!("pg_twophase/{:>08X}", xid)
+        } else {
+            format!("pg_twophase/{:>016X}", xid)
+        };
         let header = new_tar_header(&path, buf.len() as u64)?;
         self.ar
             .append(&header, &buf[..])
diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs
index 29a98855d3..e9f197ec2d 100644
--- a/pageserver/src/config.rs
+++ b/pageserver/src/config.rs
@@ -281,7 +281,7 @@ impl PageServerConf {
 
         #[allow(clippy::manual_range_patterns)]
         match pg_version {
-            14 | 15 | 16 => Ok(path.join(format!("v{pg_version}"))),
+            14 | 15 | 16 | 17 => Ok(path.join(format!("v{pg_version}"))),
             _ => bail!("Unsupported postgres version: {}", pg_version),
         }
     }
diff --git a/pageserver/src/import_datadir.rs b/pageserver/src/import_datadir.rs
index 5a0894cd1b..ca87f1d080 100644
--- a/pageserver/src/import_datadir.rs
+++ b/pageserver/src/import_datadir.rs
@@ -580,9 +580,11 @@ async fn import_file(
         import_slru(modification, slru, file_path, reader, len, ctx).await?;
         debug!("imported multixact members slru");
     } else if file_path.starts_with("pg_twophase") {
-        let xid = u32::from_str_radix(file_name.as_ref(), 16)?;
-
         let bytes = read_all_bytes(reader).await?;
+
+        // In PostgreSQL v17, this is a 64-bit FullTransactionId. In previous versions,
+        // it's a 32-bit TransactionId, which fits in u64 anyway.
+        let xid = u64::from_str_radix(file_name.as_ref(), 16)?;
         modification
             .put_twophase_file(xid, Bytes::copy_from_slice(&bytes[..]), ctx)
             .await?;
diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs
index 6dd8851b13..5f8766ca2c 100644
--- a/pageserver/src/pgdatadir_mapping.rs
+++ b/pageserver/src/pgdatadir_mapping.rs
@@ -633,7 +633,7 @@ impl Timeline {
 
     pub(crate) async fn get_twophase_file(
         &self,
-        xid: TransactionId,
+        xid: u64,
         lsn: Lsn,
         ctx: &RequestContext,
     ) -> Result<Bytes, PageReconstructError> {
@@ -646,11 +646,19 @@ impl Timeline {
         &self,
         lsn: Lsn,
         ctx: &RequestContext,
-    ) -> Result<HashSet<TransactionId>, PageReconstructError> {
+    ) -> Result<HashSet<u64>, PageReconstructError> {
         // fetch directory entry
         let buf = self.get(TWOPHASEDIR_KEY, lsn, ctx).await?;
 
-        Ok(TwoPhaseDirectory::des(&buf)?.xids)
+        if self.pg_version >= 17 {
+            Ok(TwoPhaseDirectoryV17::des(&buf)?.xids)
+        } else {
+            Ok(TwoPhaseDirectory::des(&buf)?
+                .xids
+                .iter()
+                .map(|x| u64::from(*x))
+                .collect())
+        }
     }
 
     pub(crate) async fn get_control_file(
@@ -902,9 +910,13 @@ impl Timeline {
 
         // Then pg_twophase
         result.add_key(TWOPHASEDIR_KEY);
-        let buf = self.get(TWOPHASEDIR_KEY, lsn, ctx).await?;
-        let twophase_dir = TwoPhaseDirectory::des(&buf)?;
-        let mut xids: Vec<TransactionId> = twophase_dir.xids.iter().cloned().collect();
+
+        let mut xids: Vec<u64> = self
+            .list_twophase_files(lsn, ctx)
+            .await?
+            .iter()
+            .cloned()
+            .collect();
         xids.sort_unstable();
         for xid in xids {
             result.add_key(twophase_file_key(xid));
@@ -1127,9 +1139,15 @@ impl<'a> DatadirModification<'a> {
         // Create AuxFilesDirectory
         self.init_aux_dir()?;
 
-        let buf = TwoPhaseDirectory::ser(&TwoPhaseDirectory {
-            xids: HashSet::new(),
-        })?;
+        let buf = if self.tline.pg_version >= 17 {
+            TwoPhaseDirectoryV17::ser(&TwoPhaseDirectoryV17 {
+                xids: HashSet::new(),
+            })
+        } else {
+            TwoPhaseDirectory::ser(&TwoPhaseDirectory {
+                xids: HashSet::new(),
+            })
+        }?;
         self.pending_directory_entries
             .push((DirectoryKind::TwoPhase, 0));
         self.put(TWOPHASEDIR_KEY, Value::Image(buf.into()));
@@ -1321,22 +1339,31 @@ impl<'a> DatadirModification<'a> {
 
     pub async fn put_twophase_file(
         &mut self,
-        xid: TransactionId,
+        xid: u64,
         img: Bytes,
         ctx: &RequestContext,
     ) -> anyhow::Result<()> {
         // Add it to the directory entry
-        let buf = self.get(TWOPHASEDIR_KEY, ctx).await?;
-        let mut dir = TwoPhaseDirectory::des(&buf)?;
-        if !dir.xids.insert(xid) {
-            anyhow::bail!("twophase file for xid {} already exists", xid);
-        }
-        self.pending_directory_entries
-            .push((DirectoryKind::TwoPhase, dir.xids.len()));
-        self.put(
-            TWOPHASEDIR_KEY,
-            Value::Image(Bytes::from(TwoPhaseDirectory::ser(&dir)?)),
-        );
+        let dirbuf = self.get(TWOPHASEDIR_KEY, ctx).await?;
+        let newdirbuf = if self.tline.pg_version >= 17 {
+            let mut dir = TwoPhaseDirectoryV17::des(&dirbuf)?;
+            if !dir.xids.insert(xid) {
+                anyhow::bail!("twophase file for xid {} already exists", xid);
+            }
+            self.pending_directory_entries
+                .push((DirectoryKind::TwoPhase, dir.xids.len()));
+            Bytes::from(TwoPhaseDirectoryV17::ser(&dir)?)
+        } else {
+            let xid = xid as u32;
+            let mut dir = TwoPhaseDirectory::des(&dirbuf)?;
+            if !dir.xids.insert(xid) {
+                anyhow::bail!("twophase file for xid {} already exists", xid);
+            }
+            self.pending_directory_entries
+                .push((DirectoryKind::TwoPhase, dir.xids.len()));
+            Bytes::from(TwoPhaseDirectory::ser(&dir)?)
+        };
+        self.put(TWOPHASEDIR_KEY, Value::Image(newdirbuf));
 
         self.put(twophase_file_key(xid), Value::Image(img));
         Ok(())
@@ -1639,22 +1666,32 @@ impl<'a> DatadirModification<'a> {
     /// This method is used for marking truncated SLRU files
     pub async fn drop_twophase_file(
         &mut self,
-        xid: TransactionId,
+        xid: u64,
         ctx: &RequestContext,
     ) -> anyhow::Result<()> {
         // Remove it from the directory entry
         let buf = self.get(TWOPHASEDIR_KEY, ctx).await?;
-        let mut dir = TwoPhaseDirectory::des(&buf)?;
+        let newdirbuf = if self.tline.pg_version >= 17 {
+            let mut dir = TwoPhaseDirectoryV17::des(&buf)?;
 
-        if !dir.xids.remove(&xid) {
-            warn!("twophase file for xid {} does not exist", xid);
-        }
-        self.pending_directory_entries
-            .push((DirectoryKind::TwoPhase, dir.xids.len()));
-        self.put(
-            TWOPHASEDIR_KEY,
-            Value::Image(Bytes::from(TwoPhaseDirectory::ser(&dir)?)),
-        );
+            if !dir.xids.remove(&xid) {
+                warn!("twophase file for xid {} does not exist", xid);
+            }
+            self.pending_directory_entries
+                .push((DirectoryKind::TwoPhase, dir.xids.len()));
+            Bytes::from(TwoPhaseDirectoryV17::ser(&dir)?)
+        } else {
+            let xid: u32 = u32::try_from(xid)?;
+            let mut dir = TwoPhaseDirectory::des(&buf)?;
+
+            if !dir.xids.remove(&xid) {
+                warn!("twophase file for xid {} does not exist", xid);
+            }
+            self.pending_directory_entries
+                .push((DirectoryKind::TwoPhase, dir.xids.len()));
+            Bytes::from(TwoPhaseDirectory::ser(&dir)?)
+        };
+        self.put(TWOPHASEDIR_KEY, Value::Image(newdirbuf));
 
         // Delete it
         self.delete(twophase_key_range(xid));
@@ -2124,11 +2161,21 @@ struct DbDirectory {
     dbdirs: HashMap<(Oid, Oid), bool>,
 }
 
+// The format of TwoPhaseDirectory changed in PostgreSQL v17, because the filenames of
+// pg_twophase files were expanded from 32-bit XIDs to 64-bit XIDs.  Previously, the files
+// were named like "pg_twophase/000002E5", now they're like
+// "pg_twophase/0000000A000002E4".
+
 #[derive(Debug, Serialize, Deserialize)]
 struct TwoPhaseDirectory {
     xids: HashSet<TransactionId>,
 }
 
+#[derive(Debug, Serialize, Deserialize)]
+struct TwoPhaseDirectoryV17 {
+    xids: HashSet<u64>,
+}
+
 #[derive(Debug, Serialize, Deserialize, Default)]
 struct RelDirectory {
     // Set of relations that exist. (relfilenode, forknum)
diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs
index 6e15ad81c3..229c01a681 100644
--- a/pageserver/src/walingest.rs
+++ b/pageserver/src/walingest.rs
@@ -237,6 +237,26 @@ impl WalIngest {
                                 .await?;
                         }
                     }
+                } else if pg_version == 17 {
+                    if info == postgres_ffi::v17::bindings::XLOG_DBASE_CREATE_WAL_LOG {
+                        debug!("XLOG_DBASE_CREATE_WAL_LOG: noop");
+                    } else if info == postgres_ffi::v17::bindings::XLOG_DBASE_CREATE_FILE_COPY {
+                        // The XLOG record was renamed between v14 and v15,
+                        // but the record format is the same.
+                        // So we can reuse XlCreateDatabase here.
+                        debug!("XLOG_DBASE_CREATE_FILE_COPY");
+                        let createdb = XlCreateDatabase::decode(&mut buf);
+                        self.ingest_xlog_dbase_create(modification, &createdb, ctx)
+                            .await?;
+                    } else if info == postgres_ffi::v17::bindings::XLOG_DBASE_DROP {
+                        let dropdb = XlDropDatabase::decode(&mut buf);
+                        for tablespace_id in dropdb.tablespace_ids {
+                            trace!("Drop db {}, {}", tablespace_id, dropdb.db_id);
+                            modification
+                                .drop_dbdir(tablespace_id, dropdb.db_id, ctx)
+                                .await?;
+                        }
+                    }
                 }
             }
             pg_constants::RM_TBLSPC_ID => {
@@ -246,7 +266,11 @@ impl WalIngest {
                 let info = decoded.xl_info & !pg_constants::XLR_INFO_MASK;
 
                 if info == pg_constants::CLOG_ZEROPAGE {
-                    let pageno = buf.get_u32_le();
+                    let pageno = if pg_version < 17 {
+                        buf.get_u32_le()
+                    } else {
+                        buf.get_u64_le() as u32
+                    };
                     let segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT;
                     let rpageno = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT;
                     self.put_slru_page_image(
@@ -260,7 +284,7 @@ impl WalIngest {
                     .await?;
                 } else {
                     assert!(info == pg_constants::CLOG_TRUNCATE);
-                    let xlrec = XlClogTruncate::decode(&mut buf);
+                    let xlrec = XlClogTruncate::decode(&mut buf, pg_version);
                     self.ingest_clog_truncate_record(modification, &xlrec, ctx)
                         .await?;
                 }
@@ -299,12 +323,21 @@ impl WalIngest {
                         parsed_xact.xid,
                         lsn,
                     );
-                    modification
-                        .drop_twophase_file(parsed_xact.xid, ctx)
-                        .await?;
+
+                    let xid: u64 = if pg_version >= 17 {
+                        self.adjust_to_full_transaction_id(parsed_xact.xid)?
+                    } else {
+                        parsed_xact.xid as u64
+                    };
+                    modification.drop_twophase_file(xid, ctx).await?;
                 } else if info == pg_constants::XLOG_XACT_PREPARE {
+                    let xid: u64 = if pg_version >= 17 {
+                        self.adjust_to_full_transaction_id(decoded.xl_xid)?
+                    } else {
+                        decoded.xl_xid as u64
+                    };
                     modification
-                        .put_twophase_file(decoded.xl_xid, Bytes::copy_from_slice(&buf[..]), ctx)
+                        .put_twophase_file(xid, Bytes::copy_from_slice(&buf[..]), ctx)
                         .await?;
                 }
             }
@@ -312,7 +345,11 @@ impl WalIngest {
                 let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK;
 
                 if info == pg_constants::XLOG_MULTIXACT_ZERO_OFF_PAGE {
-                    let pageno = buf.get_u32_le();
+                    let pageno = if pg_version < 17 {
+                        buf.get_u32_le()
+                    } else {
+                        buf.get_u64_le() as u32
+                    };
                     let segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT;
                     let rpageno = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT;
                     self.put_slru_page_image(
@@ -325,7 +362,11 @@ impl WalIngest {
                     )
                     .await?;
                 } else if info == pg_constants::XLOG_MULTIXACT_ZERO_MEM_PAGE {
-                    let pageno = buf.get_u32_le();
+                    let pageno = if pg_version < 17 {
+                        buf.get_u32_le()
+                    } else {
+                        buf.get_u64_le() as u32
+                    };
                     let segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT;
                     let rpageno = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT;
                     self.put_slru_page_image(
@@ -354,6 +395,20 @@ impl WalIngest {
             pg_constants::RM_XLOG_ID => {
                 let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK;
 
+                if info == pg_constants::XLOG_PARAMETER_CHANGE {
+                    if let CheckPoint::V17(cp) = &mut self.checkpoint {
+                        let rec = v17::XlParameterChange::decode(&mut buf);
+                        cp.wal_level = rec.wal_level;
+                        self.checkpoint_modified = true;
+                    }
+                } else if info == pg_constants::XLOG_END_OF_RECOVERY {
+                    if let CheckPoint::V17(cp) = &mut self.checkpoint {
+                        let rec = v17::XlEndOfRecovery::decode(&mut buf);
+                        cp.wal_level = rec.wal_level;
+                        self.checkpoint_modified = true;
+                    }
+                }
+
                 enum_pgversion_dispatch!(&mut self.checkpoint, CheckPoint, cp, {
                     if info == pg_constants::XLOG_NEXTOID {
                         let next_oid = buf.get_u32_le();
@@ -397,12 +452,24 @@ impl WalIngest {
                         if xlog_checkpoint.oldestActiveXid == pg_constants::INVALID_TRANSACTION_ID
                             && info == pg_constants::XLOG_CHECKPOINT_SHUTDOWN
                         {
-                            let mut oldest_active_xid = cp.nextXid.value as u32;
-                            for xid in modification.tline.list_twophase_files(lsn, ctx).await? {
-                                if (xid.wrapping_sub(oldest_active_xid) as i32) < 0 {
-                                    oldest_active_xid = xid;
+                            let oldest_active_xid = if pg_version >= 17 {
+                                let mut oldest_active_full_xid = cp.nextXid.value;
+                                for xid in modification.tline.list_twophase_files(lsn, ctx).await? {
+                                    if xid < oldest_active_full_xid {
+                                        oldest_active_full_xid = xid;
+                                    }
                                 }
-                            }
+                                oldest_active_full_xid as u32
+                            } else {
+                                let mut oldest_active_xid = cp.nextXid.value as u32;
+                                for xid in modification.tline.list_twophase_files(lsn, ctx).await? {
+                                    let narrow_xid = xid as u32;
+                                    if (narrow_xid.wrapping_sub(oldest_active_xid) as i32) < 0 {
+                                        oldest_active_xid = narrow_xid;
+                                    }
+                                }
+                                oldest_active_xid
+                            };
                             cp.oldestActiveXid = oldest_active_xid;
                         } else {
                             cp.oldestActiveXid = xlog_checkpoint.oldestActiveXid;
@@ -515,6 +582,25 @@ impl WalIngest {
         Ok(modification.len() > prev_len)
     }
 
+    /// This is the same as AdjustToFullTransactionId(xid) in PostgreSQL
+    fn adjust_to_full_transaction_id(&self, xid: TransactionId) -> Result<u64> {
+        let next_full_xid =
+            enum_pgversion_dispatch!(&self.checkpoint, CheckPoint, cp, { cp.nextXid.value });
+
+        let next_xid = (next_full_xid) as u32;
+        let mut epoch = (next_full_xid >> 32) as u32;
+
+        if xid > next_xid {
+            // Wraparound occurred, must be from a prev epoch.
+            if epoch == 0 {
+                bail!("apparent XID wraparound with prepared transaction XID {xid}, nextXid is {next_full_xid}");
+            }
+            epoch -= 1;
+        }
+
+        Ok((epoch as u64) << 32 | xid as u64)
+    }
+
     /// Do not store this block, but observe it for the purposes of updating our relation size state.
     async fn observe_decoded_block(
         &mut self,
@@ -815,6 +901,73 @@ impl WalIngest {
                     bail!("Unknown RMGR {} for Heap decoding", decoded.xl_rmid);
                 }
             }
+            17 => {
+                if decoded.xl_rmid == pg_constants::RM_HEAP_ID {
+                    let info = decoded.xl_info & pg_constants::XLOG_HEAP_OPMASK;
+
+                    if info == pg_constants::XLOG_HEAP_INSERT {
+                        let xlrec = v17::XlHeapInsert::decode(buf);
+                        assert_eq!(0, buf.remaining());
+                        if (xlrec.flags & pg_constants::XLH_INSERT_ALL_VISIBLE_CLEARED) != 0 {
+                            new_heap_blkno = Some(decoded.blocks[0].blkno);
+                        }
+                    } else if info == pg_constants::XLOG_HEAP_DELETE {
+                        let xlrec = v17::XlHeapDelete::decode(buf);
+                        if (xlrec.flags & pg_constants::XLH_DELETE_ALL_VISIBLE_CLEARED) != 0 {
+                            new_heap_blkno = Some(decoded.blocks[0].blkno);
+                        }
+                    } else if info == pg_constants::XLOG_HEAP_UPDATE
+                        || info == pg_constants::XLOG_HEAP_HOT_UPDATE
+                    {
+                        let xlrec = v17::XlHeapUpdate::decode(buf);
+                        // the size of tuple data is inferred from the size of the record.
+                        // we can't validate the remaining number of bytes without parsing
+                        // the tuple data.
+                        if (xlrec.flags & pg_constants::XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED) != 0 {
+                            old_heap_blkno = Some(decoded.blocks.last().unwrap().blkno);
+                        }
+                        if (xlrec.flags & pg_constants::XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED) != 0 {
+                            // PostgreSQL only uses XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED on a
+                            // non-HOT update where the new tuple goes to different page than
+                            // the old one. Otherwise, only XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED is
+                            // set.
+                            new_heap_blkno = Some(decoded.blocks[0].blkno);
+                        }
+                    } else if info == pg_constants::XLOG_HEAP_LOCK {
+                        let xlrec = v17::XlHeapLock::decode(buf);
+                        if (xlrec.flags & pg_constants::XLH_LOCK_ALL_FROZEN_CLEARED) != 0 {
+                            old_heap_blkno = Some(decoded.blocks[0].blkno);
+                            flags = pg_constants::VISIBILITYMAP_ALL_FROZEN;
+                        }
+                    }
+                } else if decoded.xl_rmid == pg_constants::RM_HEAP2_ID {
+                    let info = decoded.xl_info & pg_constants::XLOG_HEAP_OPMASK;
+                    if info == pg_constants::XLOG_HEAP2_MULTI_INSERT {
+                        let xlrec = v17::XlHeapMultiInsert::decode(buf);
+
+                        let offset_array_len =
+                            if decoded.xl_info & pg_constants::XLOG_HEAP_INIT_PAGE > 0 {
+                                // the offsets array is omitted if XLOG_HEAP_INIT_PAGE is set
+                                0
+                            } else {
+                                size_of::<u16>() * xlrec.ntuples as usize
+                            };
+                        assert_eq!(offset_array_len, buf.remaining());
+
+                        if (xlrec.flags & pg_constants::XLH_INSERT_ALL_VISIBLE_CLEARED) != 0 {
+                            new_heap_blkno = Some(decoded.blocks[0].blkno);
+                        }
+                    } else if info == pg_constants::XLOG_HEAP2_LOCK_UPDATED {
+                        let xlrec = v17::XlHeapLockUpdated::decode(buf);
+                        if (xlrec.flags & pg_constants::XLH_LOCK_ALL_FROZEN_CLEARED) != 0 {
+                            old_heap_blkno = Some(decoded.blocks[0].blkno);
+                            flags = pg_constants::VISIBILITYMAP_ALL_FROZEN;
+                        }
+                    }
+                } else {
+                    bail!("Unknown RMGR {} for Heap decoding", decoded.xl_rmid);
+                }
+            }
             _ => {}
         }
 
@@ -923,26 +1076,26 @@ impl WalIngest {
         assert_eq!(decoded.xl_rmid, pg_constants::RM_NEON_ID);
 
         match pg_version {
-            16 => {
+            16 | 17 => {
                 let info = decoded.xl_info & pg_constants::XLOG_HEAP_OPMASK;
 
                 match info {
                     pg_constants::XLOG_NEON_HEAP_INSERT => {
-                        let xlrec = v16::rm_neon::XlNeonHeapInsert::decode(buf);
+                        let xlrec = v17::rm_neon::XlNeonHeapInsert::decode(buf);
                         assert_eq!(0, buf.remaining());
                         if (xlrec.flags & pg_constants::XLH_INSERT_ALL_VISIBLE_CLEARED) != 0 {
                             new_heap_blkno = Some(decoded.blocks[0].blkno);
                         }
                     }
                     pg_constants::XLOG_NEON_HEAP_DELETE => {
-                        let xlrec = v16::rm_neon::XlNeonHeapDelete::decode(buf);
+                        let xlrec = v17::rm_neon::XlNeonHeapDelete::decode(buf);
                         if (xlrec.flags & pg_constants::XLH_DELETE_ALL_VISIBLE_CLEARED) != 0 {
                             new_heap_blkno = Some(decoded.blocks[0].blkno);
                         }
                     }
                     pg_constants::XLOG_NEON_HEAP_UPDATE
                     | pg_constants::XLOG_NEON_HEAP_HOT_UPDATE => {
-                        let xlrec = v16::rm_neon::XlNeonHeapUpdate::decode(buf);
+                        let xlrec = v17::rm_neon::XlNeonHeapUpdate::decode(buf);
                         // the size of tuple data is inferred from the size of the record.
                         // we can't validate the remaining number of bytes without parsing
                         // the tuple data.
@@ -958,7 +1111,7 @@ impl WalIngest {
                         }
                     }
                     pg_constants::XLOG_NEON_HEAP_MULTI_INSERT => {
-                        let xlrec = v16::rm_neon::XlNeonHeapMultiInsert::decode(buf);
+                        let xlrec = v17::rm_neon::XlNeonHeapMultiInsert::decode(buf);
 
                         let offset_array_len =
                             if decoded.xl_info & pg_constants::XLOG_HEAP_INIT_PAGE > 0 {
@@ -974,7 +1127,7 @@ impl WalIngest {
                         }
                     }
                     pg_constants::XLOG_NEON_HEAP_LOCK => {
-                        let xlrec = v16::rm_neon::XlNeonHeapLock::decode(buf);
+                        let xlrec = v17::rm_neon::XlNeonHeapLock::decode(buf);
                         if (xlrec.flags & pg_constants::XLH_LOCK_ALL_FROZEN_CLEARED) != 0 {
                             old_heap_blkno = Some(decoded.blocks[0].blkno);
                             flags = pg_constants::VISIBILITYMAP_ALL_FROZEN;
diff --git a/pageserver/src/walrecord.rs b/pageserver/src/walrecord.rs
index 0c4d575de8..dd199e2c55 100644
--- a/pageserver/src/walrecord.rs
+++ b/pageserver/src/walrecord.rs
@@ -174,6 +174,7 @@ impl DecodedWALRecord {
                 }
                 15 => info == postgres_ffi::v15::bindings::XLOG_DBASE_CREATE_FILE_COPY,
                 16 => info == postgres_ffi::v16::bindings::XLOG_DBASE_CREATE_FILE_COPY,
+                17 => info == postgres_ffi::v17::bindings::XLOG_DBASE_CREATE_FILE_COPY,
                 _ => {
                     panic!("Unsupported postgres version {pg_version}")
                 }
@@ -341,16 +342,47 @@ pub mod v14 {
             }
         }
     }
+
+    #[repr(C)]
+    #[derive(Debug)]
+    pub struct XlParameterChange {
+        pub max_connections: i32,
+        pub max_worker_processes: i32,
+        pub max_wal_senders: i32,
+        pub max_prepared_xacts: i32,
+        pub max_locks_per_xact: i32,
+        pub wal_level: i32,
+        pub wal_log_hints: bool, // decoded from a single byte; non-zero means true
+        pub track_commit_timestamp: bool, // decoded from a single byte; non-zero means true
+        pub _padding: [u8; 2], // two trailing bytes read by decode(); presumably C struct padding
+    }
+
+    impl XlParameterChange {
+        pub fn decode(buf: &mut Bytes) -> XlParameterChange {
+            XlParameterChange {
+                // Fields are consumed from `buf` in declaration order: 28 bytes in total.
+                max_connections: buf.get_i32_le(),
+                max_worker_processes: buf.get_i32_le(),
+                max_wal_senders: buf.get_i32_le(),
+                max_prepared_xacts: buf.get_i32_le(),
+                max_locks_per_xact: buf.get_i32_le(),
+                wal_level: buf.get_i32_le(),
+                wal_log_hints: buf.get_u8() != 0,
+                track_commit_timestamp: buf.get_u8() != 0,
+                _padding: [buf.get_u8(), buf.get_u8()], // keep `buf` positioned after the record
+            }
+        }
+    }
 }
 
 pub mod v15 {
     pub use super::v14::{
         XlHeapDelete, XlHeapInsert, XlHeapLock, XlHeapLockUpdated, XlHeapMultiInsert, XlHeapUpdate,
+        XlParameterChange,
     };
 }
 
 pub mod v16 {
-    pub use super::v14::{XlHeapInsert, XlHeapLockUpdated, XlHeapMultiInsert};
+    pub use super::v14::{XlHeapInsert, XlHeapLockUpdated, XlHeapMultiInsert, XlParameterChange};
     use bytes::{Buf, Bytes};
     use postgres_ffi::{OffsetNumber, TransactionId};
 
@@ -529,6 +561,37 @@ pub mod v16 {
     }
 }
 
+pub mod v17 {
+    pub use super::v14::XlHeapLockUpdated;
+    use bytes::{Buf, Bytes};
+    pub use postgres_ffi::{TimeLineID, TimestampTz};
+
+    pub use super::v16::rm_neon; // v17 re-uses the v16 rm_neon record definitions unchanged
+    pub use super::v16::{
+        XlHeapDelete, XlHeapInsert, XlHeapLock, XlHeapMultiInsert, XlHeapUpdate, XlParameterChange,
+    };
+
+    #[repr(C)]
+    #[derive(Debug)]
+    pub struct XlEndOfRecovery {
+        pub end_time: TimestampTz,
+        pub this_time_line_id: TimeLineID,
+        pub prev_time_line_id: TimeLineID,
+        pub wal_level: i32,
+    }
+
+    impl XlEndOfRecovery {
+        pub fn decode(buf: &mut Bytes) -> XlEndOfRecovery {
+            XlEndOfRecovery {
+                end_time: buf.get_i64_le(), // 8 bytes, little-endian
+                this_time_line_id: buf.get_u32_le(), // 4 bytes, little-endian
+                prev_time_line_id: buf.get_u32_le(), // 4 bytes, little-endian
+                wal_level: buf.get_i32_le(), // 4 bytes, little-endian; 20 bytes consumed in total
+            }
+        }
+    }
+}
+
 #[repr(C)]
 #[derive(Debug)]
 pub struct XlSmgrCreate {
@@ -746,9 +809,13 @@ pub struct XlClogTruncate {
 }
 
 impl XlClogTruncate {
-    pub fn decode(buf: &mut Bytes) -> XlClogTruncate {
+    pub fn decode(buf: &mut Bytes, pg_version: u32) -> XlClogTruncate {
         XlClogTruncate {
-            pageno: buf.get_u32_le(),
+            pageno: if pg_version < 17 {
+                buf.get_u32_le()
+            } else {
+                buf.get_u64_le() as u32
+            },
             oldest_xid: buf.get_u32_le(),
             oldest_xid_db: buf.get_u32_le(),
         }
diff --git a/pgxn/neon/bitmap.h b/pgxn/neon/bitmap.h
new file mode 100644
index 0000000000..0a131816ef
--- /dev/null
+++ b/pgxn/neon/bitmap.h
@@ -0,0 +1,12 @@
+#ifndef NEON_BITMAP_H
+#define NEON_BITMAP_H
+
+/*
+ * Utilities for manipulating bits8* as bitmaps: bit N is stored in byte
+ * (N >> 3) at position (N & 7), least significant bit first. No bounds checks.
+ */
+#define BITMAP_ISSET(bm, bit) ((bm)[(bit) >> 3] & (1 << ((bit) & 7)))
+#define BITMAP_SET(bm, bit) ((bm)[(bit) >> 3] |= (1 << ((bit) & 7)))
+#define BITMAP_CLR(bm, bit) ((bm)[(bit) >> 3] &= ~(1 << ((bit) & 7)))
+
+#endif							/* NEON_BITMAP_H */
diff --git a/pgxn/neon/file_cache.c b/pgxn/neon/file_cache.c
index 479209a537..ab6739465b 100644
--- a/pgxn/neon/file_cache.c
+++ b/pgxn/neon/file_cache.c
@@ -27,6 +27,7 @@
 #include "pagestore_client.h"
 #include "common/hashfn.h"
 #include "pgstat.h"
+#include "port/pg_iovec.h"
 #include "postmaster/bgworker.h"
 #include RELFILEINFO_HDR
 #include "storage/buf_internals.h"
@@ -40,6 +41,7 @@
 #include "utils/guc.h"
 
 #include "hll.h"
+#include "bitmap.h"
 
 #define CriticalAssert(cond) do if (!(cond)) elog(PANIC, "Assertion %s failed at %s:%d: ", #cond, __FILE__, __LINE__); while (0)
 
@@ -469,6 +471,99 @@ lfc_cache_contains(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno)
 	return found;
 }
 
+/*
+ * Check which of the pages [blkno, blkno + nblocks) are in the local cache.
+ * Sets bit i of 'bitmap' for each cached block blkno + i (other bits are left
+ * untouched, so pass a zeroed bitmap) and returns the number of cached pages.
+ */
+int
+lfc_cache_containsv(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
+					int nblocks, bits8 *bitmap)
+{
+	BufferTag	tag;
+	FileCacheEntry *entry;
+	uint32		chunk_offs;
+	int			found = 0;
+	uint32		hash;
+	int			i = 0;
+
+	if (lfc_maybe_disabled())	/* fast exit if file cache is disabled */
+		return 0;
+
+	CopyNRelFileInfoToBufTag(tag, rinfo);
+	tag.forkNum = forkNum;
+
+	CriticalAssert(BufTagGetRelNumber(&tag) != InvalidRelFileNumber);
+
+	tag.blockNum = (blkno + i) & ~(BLOCKS_PER_CHUNK - 1);
+	hash = get_hash_value(lfc_hash, &tag);
+	chunk_offs = (blkno + i) & (BLOCKS_PER_CHUNK - 1);
+
+	LWLockAcquire(lfc_lock, LW_SHARED);
+
+	while (true)
+	{
+		int		this_chunk = Min(nblocks - i, BLOCKS_PER_CHUNK - chunk_offs);
+
+		if (!LFC_ENABLED())
+		{
+			LWLockRelease(lfc_lock);	/* don't leak the lock on early exit */
+			return found;
+		}
+
+		entry = hash_search_with_hash_value(lfc_hash, &tag, hash, HASH_FIND, NULL);
+
+		if (entry != NULL)
+		{
+			for (; chunk_offs < BLOCKS_PER_CHUNK && i < nblocks; chunk_offs++, i++)
+			{
+				if ((entry->bitmap[chunk_offs >> 5] & (1 << (chunk_offs & 31))) != 0)
+				{
+					BITMAP_SET(bitmap, i);
+					found++;
+				}
+			}
+		}
+		else
+		{
+			i += this_chunk;	/* chunk not cached at all: skip its blocks */
+		}
+
+		/*
+		 * Break out of the iteration before doing expensive stuff for
+		 * a next iteration. 'i' is the number of blocks handled so far.
+		 */
+		if (i >= nblocks)
+			break;
+
+		/*
+		 * Prepare for the next iteration. We don't unlock here, as that'd
+		 * probably be more expensive than the gains it'd get us.
+		 */
+		tag.blockNum = (blkno + i) & ~(BLOCKS_PER_CHUNK - 1);
+		hash = get_hash_value(lfc_hash, &tag);
+		chunk_offs = (blkno + i) & (BLOCKS_PER_CHUNK - 1);
+	}
+
+	LWLockRelease(lfc_lock);
+
+#if USE_ASSERT_CHECKING
+	do {
+		int count = 0;
+
+		for (int j = 0; j < nblocks; j++)
+		{
+			if (BITMAP_ISSET(bitmap, j))
+				count++;
+		}
+
+		Assert(count == found);
+	} while (false);
+#endif
+
+	return found;
+}
+
 /*
  * Evict a page (if present) from the local file cache
  */
@@ -548,91 +643,171 @@ lfc_evict(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno)
 }
 
 /*
- * Try to read page from local cache.
- * Returns true if page is found in local cache.
- * In case of error local file cache is disabled (lfc->limit is set to zero).
+ * Try to read pages from local cache.
+ * Returns the number of pages read from the local cache, and sets bits in
+ * 'mask' for the pages which were read. This may scribble over buffers not
+ * marked in 'mask', so be careful with operation ordering.
+ *
+ * In case of error local file cache is disabled (lfc->limit is set to zero),
+ * and -1 is returned. Note that 'mask' and the buffers may be touched and in
+ * an otherwise invalid state.
+ *
+ * 'mask' must not be NULL: bit i is set for every page blkno + i that was
+ * present in and read from the LFC; bits of other pages are left untouched.
  */
-bool
-lfc_read(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
-		 char *buffer)
+int
+lfc_readv_select(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
+				 void **buffers, BlockNumber nblocks, bits8 *mask)
 {
 	BufferTag	tag;
 	FileCacheEntry *entry;
 	ssize_t		rc;
-	int			chunk_offs = blkno & (BLOCKS_PER_CHUNK - 1);
 	bool		result = true;
 	uint32		hash;
 	uint64		generation;
 	uint32		entry_offset;
+	int			blocks_read = 0;
+	int			buf_offset = 0;
 
 	if (lfc_maybe_disabled())	/* fast exit if file cache is disabled */
-		return false;
+		return 0;
 
 	if (!lfc_ensure_opened())
-		return false;
+		return 0;
 
 	CopyNRelFileInfoToBufTag(tag, rinfo);
 	tag.forkNum = forkNum;
-	tag.blockNum = blkno & ~(BLOCKS_PER_CHUNK - 1);
 
 	CriticalAssert(BufTagGetRelNumber(&tag) != InvalidRelFileNumber);
-	hash = get_hash_value(lfc_hash, &tag);
 
-	LWLockAcquire(lfc_lock, LW_EXCLUSIVE);
-
-	if (!LFC_ENABLED())
+	/* 
+	 * For every chunk that has blocks we're interested in, we
+	 * 1. get the chunk header
+	 * 2. Check if the chunk actually has the blocks we're interested in
+	 * 3. Read the blocks we're looking for (in one preadv), assuming they exist
+	 * 4. Update the statistics for the read call.
+	 *
+	 * If there is an error, we do an early return.
+	 */
+	while (nblocks > 0)
 	{
+		struct iovec iov[PG_IOV_MAX];
+		int		chunk_offs = blkno & (BLOCKS_PER_CHUNK - 1);
+		int		blocks_in_chunk = Min(nblocks, BLOCKS_PER_CHUNK - (blkno % BLOCKS_PER_CHUNK));
+		int		iteration_hits = 0;
+		int		iteration_misses = 0;
+		Assert(blocks_in_chunk > 0);
+
+		for (int i = 0; i < blocks_in_chunk; i++)
+		{
+			iov[i].iov_base = buffers[buf_offset + i];
+			iov[i].iov_len = BLCKSZ;
+		}
+
+		tag.blockNum = blkno - chunk_offs;
+		hash = get_hash_value(lfc_hash, &tag);
+
+		LWLockAcquire(lfc_lock, LW_EXCLUSIVE);
+
+		/* We can return the blocks we've read before LFC got disabled;
+		 * assuming we read any. */
+		if (!LFC_ENABLED())
+		{
+			LWLockRelease(lfc_lock);
+			return blocks_read;
+		}
+
+		entry = hash_search_with_hash_value(lfc_hash, &tag, hash, HASH_FIND, NULL);
+
+		/* Approximate working set for the blocks assumed in this entry */
+		for (int i = 0; i < blocks_in_chunk; i++)
+		{
+			tag.blockNum = blkno + i;
+			addSHLL(&lfc_ctl->wss_estimation, hash_bytes((uint8_t const*)&tag, sizeof(tag)));
+		}
+
+		if (entry == NULL)
+		{
+			/* Pages are not cached */
+			lfc_ctl->misses += blocks_in_chunk;
+			pgBufferUsage.file_cache.misses += blocks_in_chunk;
+			LWLockRelease(lfc_lock);
+
+			buf_offset += blocks_in_chunk;
+			nblocks -= blocks_in_chunk;
+			blkno += blocks_in_chunk;
+
+			continue;
+		}
+
+		/* Unlink entry from LRU list to pin it for the duration of IO operation */
+		if (entry->access_count++ == 0)
+			dlist_delete(&entry->list_node);
+
+		generation = lfc_ctl->generation;
+		entry_offset = entry->offset;
+
 		LWLockRelease(lfc_lock);
-		return false;
-	}
 
-	entry = hash_search_with_hash_value(lfc_hash, &tag, hash, HASH_FIND, NULL);
+		for (int i = 0; i < blocks_in_chunk; i++)
+		{
+			/*
+			 * If the page is valid, we consider it "read".
+			 * All other pages will be fetched separately by the next cache
+			 */
+			if (entry->bitmap[(chunk_offs + i) / 32] & (1 << ((chunk_offs + i) % 32)))
+			{
+				BITMAP_SET(mask, buf_offset + i);
+				iteration_hits++;
+			}
+			else
+				iteration_misses++;
+		}
 
-	/* Approximate working set */
-	tag.blockNum = blkno;
-	addSHLL(&lfc_ctl->wss_estimation, hash_bytes((uint8_t const*)&tag, sizeof(tag)));
+		Assert(iteration_hits + iteration_misses > 0);
+
+		if (iteration_hits != 0)
+		{
+			rc = preadv(lfc_desc, iov, blocks_in_chunk,
+						((off_t) entry_offset * BLOCKS_PER_CHUNK + chunk_offs) * BLCKSZ);
+
+			if (rc != (BLCKSZ * blocks_in_chunk))
+			{
+				lfc_disable("read");
+				return -1;
+			}
+		}
+
+		/* Place entry to the head of LRU list */
+		LWLockAcquire(lfc_lock, LW_EXCLUSIVE);
+
+		if (lfc_ctl->generation == generation)
+		{
+			CriticalAssert(LFC_ENABLED());
+			lfc_ctl->hits += iteration_hits;
+			lfc_ctl->misses += iteration_misses;
+			pgBufferUsage.file_cache.hits += iteration_hits;
+			pgBufferUsage.file_cache.misses += iteration_misses;
+			CriticalAssert(entry->access_count > 0);
+			if (--entry->access_count == 0)
+				dlist_push_tail(&lfc_ctl->lru, &entry->list_node);
+		}
+		else
+		{
+			/* generation mismatch, assume error condition */
+			LWLockRelease(lfc_lock);
+			return -1;
+		}
 
-	if (entry == NULL || (entry->bitmap[chunk_offs >> 5] & (1 << (chunk_offs & 31))) == 0)
-	{
-		/* Page is not cached */
-		lfc_ctl->misses += 1;
-		pgBufferUsage.file_cache.misses += 1;
 		LWLockRelease(lfc_lock);
-		return false;
-	}
-	/* Unlink entry from LRU list to pin it for the duration of IO operation */
-	if (entry->access_count++ == 0)
-		dlist_delete(&entry->list_node);
-	generation = lfc_ctl->generation;
-	entry_offset = entry->offset;
 
-	LWLockRelease(lfc_lock);
-
-	rc = pread(lfc_desc, buffer, BLCKSZ, ((off_t) entry_offset * BLOCKS_PER_CHUNK + chunk_offs) * BLCKSZ);
-	if (rc != BLCKSZ)
-	{
-		lfc_disable("read");
-		return false;
+		buf_offset += blocks_in_chunk;
+		nblocks -= blocks_in_chunk;
+		blkno += blocks_in_chunk;
+		blocks_read += iteration_hits;
 	}
 
-	/* Place entry to the head of LRU list */
-	LWLockAcquire(lfc_lock, LW_EXCLUSIVE);
-
-	if (lfc_ctl->generation == generation)
-	{
-		CriticalAssert(LFC_ENABLED());
-		lfc_ctl->hits += 1;
-		pgBufferUsage.file_cache.hits += 1;
-		CriticalAssert(entry->access_count > 0);
-		if (--entry->access_count == 0)
-			dlist_push_tail(&lfc_ctl->lru, &entry->list_node);
-	}
-	else
-		result = false;
-
-	LWLockRelease(lfc_lock);
-
-	return result;
+	return blocks_read;
 }
 
 /*
@@ -640,20 +815,17 @@ lfc_read(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
  * If cache is full then evict some other page.
  */
 void
-#if PG_MAJORVERSION_NUM < 16
-lfc_write(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, char *buffer)
-#else
-lfc_write(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, const void *buffer)
-#endif
+lfc_writev(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
+		   const void *const *buffers, BlockNumber nblocks)
 {
 	BufferTag	tag;
 	FileCacheEntry *entry;
 	ssize_t		rc;
 	bool		found;
-	int			chunk_offs = blkno & (BLOCKS_PER_CHUNK - 1);
 	uint32		hash;
 	uint64		generation;
 	uint32		entry_offset;
+	int			buf_offset = 0;
 
 	if (lfc_maybe_disabled())	/* fast exit if file cache is disabled */
 		return;
@@ -661,110 +833,142 @@ lfc_write(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, const void
 	if (!lfc_ensure_opened())
 		return;
 
-	tag.forkNum = forkNum;
-	tag.blockNum = blkno & ~(BLOCKS_PER_CHUNK - 1);
 	CopyNRelFileInfoToBufTag(tag, rinfo);
+	tag.forkNum = forkNum;
 
 	CriticalAssert(BufTagGetRelNumber(&tag) != InvalidRelFileNumber);
-	hash = get_hash_value(lfc_hash, &tag);
 
-	LWLockAcquire(lfc_lock, LW_EXCLUSIVE);
-
-	if (!LFC_ENABLED())
+	/*
+	 * For every chunk that has blocks we're interested in, we
+	 * 1. get the chunk header, allocating a new chunk if it doesn't exist yet
+	 * 2. pin the chunk for the duration of the IO
+	 * 3. Write the blocks that fall into this chunk (in one pwritev)
+	 * 4. Mark the written blocks as valid in the chunk's bitmap.
+	 *
+	 * If the LFC is (or gets) disabled, we do an early return.
+	 */
+	while (nblocks > 0)
 	{
-		LWLockRelease(lfc_lock);
-		return;
-	}
+		struct iovec iov[PG_IOV_MAX];
+		int		chunk_offs = blkno & (BLOCKS_PER_CHUNK - 1);
+		int		blocks_in_chunk = Min(nblocks, BLOCKS_PER_CHUNK - (blkno % BLOCKS_PER_CHUNK));
+		Assert(blocks_in_chunk > 0);
 
-	entry = hash_search_with_hash_value(lfc_hash, &tag, hash, HASH_ENTER, &found);
-
-	if (found)
-	{
-		/*
-		 * Unlink entry from LRU list to pin it for the duration of IO
-		 * operation
-		 */
-		if (entry->access_count++ == 0)
-			dlist_delete(&entry->list_node);
-	}
-	else
-	{
-		/*
-		 * We have two choices if all cache pages are pinned (i.e. used in IO
-		 * operations):
-		 *
-		 * 1) Wait until some of this operation is completed and pages is
-		 * unpinned.
-		 *
-		 * 2) Allocate one more chunk, so that specified cache size is more
-		 * recommendation than hard limit.
-		 *
-		 * As far as probability of such event (that all pages are pinned) is
-		 * considered to be very very small: there are should be very large
-		 * number of concurrent IO operations and them are limited by
-		 * max_connections, we prefer not to complicate code and use second
-		 * approach.
-		 */
-		if (lfc_ctl->used >= lfc_ctl->limit && !dlist_is_empty(&lfc_ctl->lru))
+		for (int i = 0; i < blocks_in_chunk; i++)
 		{
-			/* Cache overflow: evict least recently used chunk */
-			FileCacheEntry *victim = dlist_container(FileCacheEntry, list_node, dlist_pop_head_node(&lfc_ctl->lru));
-
-			CriticalAssert(victim->access_count == 0);
-			entry->offset = victim->offset; /* grab victim's chunk */
-			hash_search_with_hash_value(lfc_hash, &victim->key, victim->hash, HASH_REMOVE, NULL);
-			neon_log(DEBUG2, "Swap file cache page");
+			iov[i].iov_base = unconstify(void *, buffers[buf_offset + i]);
+			iov[i].iov_len = BLCKSZ;
 		}
-		else if (!dlist_is_empty(&lfc_ctl->holes))
+
+		tag.blockNum = blkno & ~(BLOCKS_PER_CHUNK - 1);
+		hash = get_hash_value(lfc_hash, &tag);
+
+		LWLockAcquire(lfc_lock, LW_EXCLUSIVE);
+
+		if (!LFC_ENABLED())
 		{
-			/* We can reuse a hole that was left behind when the LFC was shrunk previously */
-			FileCacheEntry *hole = dlist_container(FileCacheEntry, list_node, dlist_pop_head_node(&lfc_ctl->holes));
-			uint32		offset = hole->offset;
-			bool		found;
+			LWLockRelease(lfc_lock);
+			return;
+		}
 
-			hash_search_with_hash_value(lfc_hash, &hole->key, hole->hash, HASH_REMOVE, &found);
-			CriticalAssert(found);
+		entry = hash_search_with_hash_value(lfc_hash, &tag, hash, HASH_ENTER, &found);
 
-			lfc_ctl->used += 1;
-			entry->offset = offset;	/* reuse the hole */
+		if (found)
+		{
+			/*
+			 * Unlink entry from LRU list to pin it for the duration of IO
+			 * operation
+			 */
+			if (entry->access_count++ == 0)
+				dlist_delete(&entry->list_node);
 		}
 		else
 		{
-			lfc_ctl->used += 1;
-			entry->offset = lfc_ctl->size++;	/* allocate new chunk at end
-												 * of file */
-		}
-		entry->access_count = 1;
-		entry->hash = hash;
-		memset(entry->bitmap, 0, sizeof entry->bitmap);
-	}
-
-	generation = lfc_ctl->generation;
-	entry_offset = entry->offset;
-	lfc_ctl->writes += 1;
-	LWLockRelease(lfc_lock);
-
-	rc = pwrite(lfc_desc, buffer, BLCKSZ, ((off_t) entry_offset * BLOCKS_PER_CHUNK + chunk_offs) * BLCKSZ);
-	if (rc != BLCKSZ)
-	{
-		lfc_disable("write");
-	}
-	else
-	{
-		LWLockAcquire(lfc_lock, LW_EXCLUSIVE);
-
-		if (lfc_ctl->generation == generation)
-		{
-			CriticalAssert(LFC_ENABLED());
-			/* Place entry to the head of LRU list */
-			CriticalAssert(entry->access_count > 0);
-			if (--entry->access_count == 0)
-				dlist_push_tail(&lfc_ctl->lru, &entry->list_node);
-
-			entry->bitmap[chunk_offs >> 5] |= (1 << (chunk_offs & 31));
+			/*
+			 * We have two choices if all cache pages are pinned (i.e. used in IO
+			 * operations):
+			 *
+			 * 1) Wait until some of this operation is completed and pages is
+			 * unpinned.
+			 *
+			 * 2) Allocate one more chunk, so that specified cache size is more
+			 * recommendation than hard limit.
+			 *
+			 * As far as probability of such event (that all pages are pinned) is
+			 * considered to be very very small: there are should be very large
+			 * number of concurrent IO operations and them are limited by
+			 * max_connections, we prefer not to complicate code and use second
+			 * approach.
+			 */
+			if (lfc_ctl->used >= lfc_ctl->limit && !dlist_is_empty(&lfc_ctl->lru))
+			{
+				/* Cache overflow: evict least recently used chunk */
+				FileCacheEntry *victim = dlist_container(FileCacheEntry, list_node, dlist_pop_head_node(&lfc_ctl->lru));
+	
+				CriticalAssert(victim->access_count == 0);
+				entry->offset = victim->offset; /* grab victim's chunk */
+				hash_search_with_hash_value(lfc_hash, &victim->key, victim->hash, HASH_REMOVE, NULL);
+				neon_log(DEBUG2, "Swap file cache page");
+			}
+			else if (!dlist_is_empty(&lfc_ctl->holes))
+			{
+				/* We can reuse a hole that was left behind when the LFC was shrunk previously */
+				FileCacheEntry *hole = dlist_container(FileCacheEntry, list_node, dlist_pop_head_node(&lfc_ctl->holes));
+				uint32		offset = hole->offset;
+				bool		found;
+	
+				hash_search_with_hash_value(lfc_hash, &hole->key, hole->hash, HASH_REMOVE, &found);
+				CriticalAssert(found);
+	
+				lfc_ctl->used += 1;
+				entry->offset = offset;	/* reuse the hole */
+			}
+			else
+			{
+				lfc_ctl->used += 1;
+				entry->offset = lfc_ctl->size++;	/* allocate new chunk at end
+													 * of file */
+			}
+			entry->access_count = 1;
+			entry->hash = hash;
+			memset(entry->bitmap, 0, sizeof entry->bitmap);
 		}
 
+		generation = lfc_ctl->generation;
+		entry_offset = entry->offset;
+		lfc_ctl->writes += blocks_in_chunk;
 		LWLockRelease(lfc_lock);
+
+		rc = pwritev(lfc_desc, iov, blocks_in_chunk,
+					 ((off_t) entry_offset * BLOCKS_PER_CHUNK + chunk_offs) * BLCKSZ);
+		if (rc != BLCKSZ * blocks_in_chunk)
+		{
+			lfc_disable("write");
+		}
+		else
+		{
+			LWLockAcquire(lfc_lock, LW_EXCLUSIVE);
+
+			if (lfc_ctl->generation == generation)
+			{
+				CriticalAssert(LFC_ENABLED());
+				/* Place entry to the head of LRU list */
+				CriticalAssert(entry->access_count > 0);
+				if (--entry->access_count == 0)
+					dlist_push_tail(&lfc_ctl->lru, &entry->list_node);
+
+				for (int i = 0; i < blocks_in_chunk; i++)
+				{
+					entry->bitmap[(chunk_offs + i) >> 5] |=
+						(1 << ((chunk_offs + i) & 31));
+				}
+			}
+
+			LWLockRelease(lfc_lock);
+		}
+		blkno += blocks_in_chunk;
+		buf_offset += blocks_in_chunk;
+		nblocks -= blocks_in_chunk;
 	}
 }
 
diff --git a/pgxn/neon/libpagestore.c b/pgxn/neon/libpagestore.c
index 5126c26c5d..df7000acc0 100644
--- a/pgxn/neon/libpagestore.c
+++ b/pgxn/neon/libpagestore.c
@@ -537,7 +537,11 @@ pageserver_connect(shardno_t shard_no, int elevel)
 		/* No more polling needed; connection succeeded */
 		shard->last_connect_time = GetCurrentTimestamp();
 
+#if PG_MAJORVERSION_NUM >= 17
+		shard->wes_read = CreateWaitEventSet(NULL, 3);
+#else
 		shard->wes_read = CreateWaitEventSet(TopMemoryContext, 3);
+#endif
 		AddWaitEventToSet(shard->wes_read, WL_LATCH_SET, PGINVALID_SOCKET,
 						  MyLatch, NULL);
 		AddWaitEventToSet(shard->wes_read, WL_EXIT_ON_PM_DEATH, PGINVALID_SOCKET,
diff --git a/pgxn/neon/neon_pgversioncompat.h b/pgxn/neon/neon_pgversioncompat.h
index addb6ccce6..59b97d64fe 100644
--- a/pgxn/neon/neon_pgversioncompat.h
+++ b/pgxn/neon/neon_pgversioncompat.h
@@ -6,7 +6,11 @@
 #ifndef NEON_PGVERSIONCOMPAT_H
 #define NEON_PGVERSIONCOMPAT_H
 
+#if PG_MAJORVERSION_NUM < 17
 #define NRelFileInfoBackendIsTemp(rinfo) (rinfo.backend != InvalidBackendId)
+#else
+#define NRelFileInfoBackendIsTemp(rinfo) (rinfo.backend != INVALID_PROC_NUMBER)
+#endif
 
 #define RelFileInfoEquals(a, b) ( \
 	NInfoGetSpcOid(a) == NInfoGetSpcOid(b) && \
@@ -50,7 +54,7 @@
 #define CopyNRelFileInfoToBufTag(tag, rinfo) \
 	do { \
 		(tag).rnode = (rinfo); \
-	} while (false);
+	} while (false)
 
 #define BufTagGetNRelFileInfo(tag) tag.rnode
 
@@ -98,7 +102,7 @@
 		(tag).spcOid = (rinfo).spcOid; \
 		(tag).dbOid = (rinfo).dbOid; \
 		(tag).relNumber = (rinfo).relNumber; \
-	} while (false);
+	} while (false)
 
 #define BufTagGetNRelFileInfo(tag) \
 	((RelFileLocator) { \
@@ -113,4 +117,10 @@
 #define DropRelationAllLocalBuffers DropRelationAllLocalBuffers
 #endif
 
+#if PG_MAJORVERSION_NUM < 17
+#define ProcNumber BackendId
+#define INVALID_PROC_NUMBER InvalidBackendId
+#define AmAutoVacuumWorkerProcess() (IsAutoVacuumWorkerProcess())
+#endif
+
 #endif							/* NEON_PGVERSIONCOMPAT_H */
diff --git a/pgxn/neon/pagestore_client.h b/pgxn/neon/pagestore_client.h
index 1f196d016c..4c9e40a063 100644
--- a/pgxn/neon/pagestore_client.h
+++ b/pgxn/neon/pagestore_client.h
@@ -6,8 +6,6 @@
  * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * contrib/neon/pagestore_client.h
- *
  *-------------------------------------------------------------------------
  */
 #ifndef pageserver_h
@@ -187,7 +185,7 @@ extern char *nm_to_string(NeonMessage *msg);
  * API
  */
 
-typedef unsigned shardno_t;
+typedef uint16 shardno_t;
 
 typedef struct
 {
@@ -211,7 +209,7 @@ extern int  neon_protocol_version;
 
 extern shardno_t get_shard_number(BufferTag* tag);
 
-extern const f_smgr *smgr_neon(BackendId backend, NRelFileInfo rinfo);
+extern const f_smgr *smgr_neon(ProcNumber backend, NRelFileInfo rinfo);
 extern void smgr_init_neon(void);
 extern void readahead_buffer_resize(int newsize, void *extra);
 
@@ -233,8 +231,13 @@ extern void neon_zeroextend(SMgrRelation reln, ForkNumber forknum,
 							BlockNumber blocknum, int nbuffers, bool skipFsync);
 #endif
 
+#if PG_MAJORVERSION_NUM >=17
+extern bool neon_prefetch(SMgrRelation reln, ForkNumber forknum,
+						  BlockNumber blocknum, int nblocks);
+#else
 extern bool neon_prefetch(SMgrRelation reln, ForkNumber forknum,
 						  BlockNumber blocknum);
+#endif
 
 /*
  * LSN values associated with each request to the pageserver
@@ -269,19 +272,11 @@ typedef struct
 } neon_request_lsns;
 
 #if PG_MAJORVERSION_NUM < 16
-extern void neon_read(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
-					  char *buffer);
 extern PGDLLEXPORT void neon_read_at_lsn(NRelFileInfo rnode, ForkNumber forkNum, BlockNumber blkno,
 										 neon_request_lsns request_lsns, char *buffer);
-extern void neon_write(SMgrRelation reln, ForkNumber forknum,
-					   BlockNumber blocknum, char *buffer, bool skipFsync);
 #else
-extern void neon_read(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
-					  void *buffer);
 extern PGDLLEXPORT void neon_read_at_lsn(NRelFileInfo rnode, ForkNumber forkNum, BlockNumber blkno,
 										 neon_request_lsns request_lsns, void *buffer);
-extern void neon_write(SMgrRelation reln, ForkNumber forknum,
-					   BlockNumber blocknum, const void *buffer, bool skipFsync);
 #endif
 extern void neon_writeback(SMgrRelation reln, ForkNumber forknum,
 						   BlockNumber blocknum, BlockNumber nblocks);
@@ -299,17 +294,34 @@ extern void update_cached_relsize(NRelFileInfo rinfo, ForkNumber forknum, BlockN
 extern void forget_cached_relsize(NRelFileInfo rinfo, ForkNumber forknum);
 
 /* functions for local file cache */
-#if PG_MAJORVERSION_NUM < 16
-extern void lfc_write(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
-					  char *buffer);
-#else
-extern void lfc_write(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
-					  const void *buffer);
-#endif
-extern bool lfc_read(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, char *buffer);
-extern bool lfc_cache_contains(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno);
+extern void lfc_writev(NRelFileInfo rinfo, ForkNumber forkNum,
+					   BlockNumber blkno, const void *const *buffers,
+					   BlockNumber nblocks);
+/* returns number of blocks read, with one bit set in *mask for each block read */
+extern int lfc_readv_select(NRelFileInfo rinfo, ForkNumber forkNum,
+							BlockNumber blkno, void **buffers,
+							BlockNumber nblocks, bits8 *mask);
+
+extern bool lfc_cache_contains(NRelFileInfo rinfo, ForkNumber forkNum,
+							   BlockNumber blkno);
+extern int lfc_cache_containsv(NRelFileInfo rinfo, ForkNumber forkNum,
+							   BlockNumber blkno, int nblocks, bits8 *bitmap);
 extern void lfc_evict(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno);
 extern void lfc_init(void);
 
+static inline bool
+lfc_read(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
+		 void *buffer)
+{
+	bits8		rv = 0;		/* scratch hit-mask for the single page; not inspected */
+	return lfc_readv_select(rinfo, forkNum, blkno, &buffer, 1, &rv) == 1;
+}
+
+static inline void
+lfc_write(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
+		  const void *buffer)
+{
+	lfc_writev(rinfo, forkNum, blkno, &buffer, 1);	/* single-page wrapper */
+}
 
 #endif
diff --git a/pgxn/neon/pagestore_smgr.c b/pgxn/neon/pagestore_smgr.c
index 7f39c7d026..36538ea5e2 100644
--- a/pgxn/neon/pagestore_smgr.c
+++ b/pgxn/neon/pagestore_smgr.c
@@ -58,6 +58,7 @@
 #include "pgstat.h"
 #include "postmaster/autovacuum.h"
 #include "postmaster/interrupt.h"
+#include "port/pg_iovec.h"
 #include "replication/walsender.h"
 #include "storage/bufmgr.h"
 #include "storage/buf_internals.h"
@@ -66,6 +67,7 @@
 #include "storage/smgr.h"
 
 #include "pagestore_client.h"
+#include "bitmap.h"
 
 #if PG_VERSION_NUM >= 150000
 #include "access/xlogrecovery.h"
@@ -170,16 +172,28 @@ typedef enum PrefetchStatus
 								 * valid */
 } PrefetchStatus;
 
+/* must fit in uint8; bits 0x1 are used */
+typedef enum {
+	PRFSF_NONE	= 0x0,
+	PRFSF_SEQ	= 0x1,
+} PrefetchRequestFlags;
+
 typedef struct PrefetchRequest
 {
 	BufferTag	buftag;			/* must be first entry in the struct */
+	shardno_t	shard_no;
+	uint8		status;		/* see PrefetchStatus for valid values */
+	uint8		flags;		/* see PrefetchRequestFlags */
 	neon_request_lsns request_lsns;
 	NeonResponse *response;		/* may be null */
-	PrefetchStatus status;
-	shardno_t   shard_no;
 	uint64		my_ring_index;
 } PrefetchRequest;
 
+StaticAssertDecl(sizeof(PrefetchRequest) == 64,
+				 "We prefer to have a power-of-2 size for this struct. Please"
+				 " try to find an alternative solution before reaching to"
+				 " increase the expected size here");
+
 /* prefetch buffer lookup hash table */
 
 typedef struct PrfHashEntry
@@ -251,17 +265,17 @@ typedef struct PrefetchState
 	PrefetchRequest prf_buffer[];	/* prefetch buffers */
 } PrefetchState;
 
-#define BITMAP_ISSET(bm, bit) ((bm)[(bit) >> 3] & (1 << ((bit) & 7)))
-#define BITMAP_SET(bm, bit) (bm)[(bit) >> 3] |= (1 << ((bit) & 7))
-#define BITMAP_CLR(bm, bit) (bm)[(bit) >> 3] &= ~(1 << ((bit) & 7))
-
 static PrefetchState *MyPState;
 
+#define GetPrfSlotNoCheck(ring_index) ( \
+	&MyPState->prf_buffer[((ring_index) % readahead_buffer_size)] \
+)
+
 #define GetPrfSlot(ring_index) ( \
 	( \
 		AssertMacro((ring_index) < MyPState->ring_unused && \
 					(ring_index) >= MyPState->ring_last), \
-		&MyPState->prf_buffer[((ring_index) % readahead_buffer_size)] \
+		GetPrfSlotNoCheck(ring_index) \
 	) \
 )
 
@@ -281,9 +295,17 @@ static void prefetch_do_request(PrefetchRequest *slot, neon_request_lsns *force_
 static bool prefetch_wait_for(uint64 ring_index);
 static void prefetch_cleanup_trailing_unused(void);
 static inline void prefetch_set_unused(uint64 ring_index);
+#if PG_MAJORVERSION_NUM < 17
+static void
+GetLastWrittenLSNv(NRelFileInfo relfilenode, ForkNumber forknum,
+				   BlockNumber blkno, int nblocks, XLogRecPtr *lsns);
+#endif
 
-static neon_request_lsns neon_get_request_lsns(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno);
-static bool neon_prefetch_response_usable(neon_request_lsns request_lsns,
+static void
+neon_get_request_lsns(NRelFileInfo rinfo, ForkNumber forknum,
+					  BlockNumber blkno, neon_request_lsns *output,
+					  BlockNumber nblocks, const bits8 *mask);
+static bool neon_prefetch_response_usable(neon_request_lsns *request_lsns,
 										  PrefetchRequest *slot);
 
 static bool
@@ -729,9 +751,9 @@ prefetch_do_request(PrefetchRequest *slot, neon_request_lsns *force_request_lsns
 	if (force_request_lsns)
 		slot->request_lsns = *force_request_lsns;
 	else
-		slot->request_lsns = neon_get_request_lsns(BufTagGetNRelFileInfo(slot->buftag),
-												   slot->buftag.forkNum,
-												   slot->buftag.blockNum);
+		neon_get_request_lsns(BufTagGetNRelFileInfo(slot->buftag),
+							  slot->buftag.forkNum, slot->buftag.blockNum,
+							  &slot->request_lsns, 1, NULL);
 	request.req.lsn = slot->request_lsns.request_lsn;
 	request.req.not_modified_since = slot->request_lsns.not_modified_since;
 
@@ -771,141 +793,194 @@ prefetch_do_request(PrefetchRequest *slot, neon_request_lsns *force_request_lsns
  */
 
 static uint64
-prefetch_register_buffer(BufferTag tag, neon_request_lsns *force_request_lsns)
+prefetch_register_bufferv(BufferTag tag, neon_request_lsns *frlsns,
+						  BlockNumber nblocks, const bits8 *mask)
 {
-	uint64		ring_index;
+	uint64		min_ring_index;
 	PrefetchRequest req;
-	PrefetchRequest *slot;
-	PrfHashEntry *entry;
+#if USE_ASSERT_CHECKING
+	bool		any_hits = false;
+#endif
+	/* We will never read further ahead than our buffer can store. */
+	nblocks = Max(1, Min(nblocks, readahead_buffer_size));
 
 	/* use an intermediate PrefetchRequest struct to ensure correct alignment */
 	req.buftag = tag;
+
 Retry:
-	entry = prfh_lookup(MyPState->prf_hash, (PrefetchRequest *) &req);
-
-	if (entry != NULL)
+	min_ring_index = UINT64_MAX;
+	for (int i = 0; i < nblocks; i++)
 	{
-		slot = entry->slot;
-		ring_index = slot->my_ring_index;
-		Assert(slot == GetPrfSlot(ring_index));
+		PrefetchRequest *slot = NULL;
+		PrfHashEntry *entry = NULL;
+		uint64		ring_index;
+		neon_request_lsns *lsns;
+		if (PointerIsValid(mask) && !BITMAP_ISSET(mask, i))
+			continue;
 
-		Assert(slot->status != PRFS_UNUSED);
-		Assert(MyPState->ring_last <= ring_index &&
-			   ring_index < MyPState->ring_unused);
-		Assert(BUFFERTAGS_EQUAL(slot->buftag, tag));
+		if (frlsns)
+			lsns = &frlsns[i];
+		else
+			lsns = NULL;
 
-		/*
-		 * If the caller specified a request LSN to use, only accept prefetch
-		 * responses that satisfy that request.
-		 */
-		if (force_request_lsns)
-		{
-			if (!neon_prefetch_response_usable(*force_request_lsns, slot))
-			{
-				/* Wait for the old request to finish and discard it */
-				if (!prefetch_wait_for(ring_index))
-					goto Retry;
-				prefetch_set_unused(ring_index);
-				entry = NULL;
-			}
-		}
+#if USE_ASSERT_CHECKING
+		any_hits = true;
+#endif
+
+		slot = NULL;
+		entry = NULL;
+
+		req.buftag.blockNum = tag.blockNum + i;
+		entry = prfh_lookup(MyPState->prf_hash, (PrefetchRequest *) &req);
 
 		if (entry != NULL)
 		{
+			slot = entry->slot;
+			ring_index = slot->my_ring_index;
+			Assert(slot == GetPrfSlot(ring_index));
+
+			Assert(slot->status != PRFS_UNUSED);
+			Assert(MyPState->ring_last <= ring_index &&
+				   ring_index < MyPState->ring_unused);
+			Assert(BUFFERTAGS_EQUAL(slot->buftag, req.buftag));
+
 			/*
-			 * We received a prefetch for a page that was recently read and
-			 * removed from the buffers. Remove that request from the buffers.
+			 * If the caller specified a request LSN to use, only accept
+			 * prefetch responses that satisfy that request.
 			 */
-			if (slot->status == PRFS_TAG_REMAINS)
+			if (lsns)
 			{
-				prefetch_set_unused(ring_index);
-				entry = NULL;
+				if (!neon_prefetch_response_usable(lsns, slot))
+				{
+					/* Wait for the old request to finish and discard it */
+					if (!prefetch_wait_for(ring_index))
+						goto Retry;
+					prefetch_set_unused(ring_index);
+					entry = NULL;
+					slot = NULL;
+				}
+			}
+
+			if (entry != NULL)
+			{
+				/*
+				 * We received a prefetch for a page that was recently read
+				 * and removed from the buffers. Remove that request from the
+				 * buffers.
+				 */
+				if (slot->status == PRFS_TAG_REMAINS)
+				{
+					prefetch_set_unused(ring_index);
+					entry = NULL;
+					slot = NULL;
+				}
+				else
+				{
+					min_ring_index = Min(min_ring_index, ring_index);
+					/* The buffered request is good enough, return that index */
+					pgBufferUsage.prefetch.duplicates++;
+					continue;
+				}
+			}
+		}
+
+		/*
+		 * We can only leave the block above by finding that there's
+		 * no entry that can satisfy this request, either because there
+		 * was no entry, or because the entry was invalid or didn't satisfy
+		 * the LSNs provided.
+		 *
+		 * The code should've made sure to clear up the data.
+		 */
+		Assert(entry == NULL);
+		Assert(slot == NULL);
+
+		/*
+		 * If the prefetch queue is full, we need to make room by clearing the
+		 * oldest slot. If the oldest slot holds a buffer that was already
+		 * received, we can just throw it away; we fetched the page
+		 * unnecessarily in that case. If the oldest slot holds a request that
+		 * we haven't received a response for yet, we have to wait for the
+		 * response to that before we can continue. We might not have even
+		 * flushed the request to the pageserver yet, it might be just sitting
+		 * in the output buffer. In that case, we flush it and wait for the
+		 * response. (We could decide not to send it, but it's hard to abort
+		 * when the request is already in the output buffer, and 'not sending'
+		 * a prefetch request kind of goes against the principles of
+		 * prefetching)
+		 */
+		if (MyPState->ring_last + readahead_buffer_size - 1 == MyPState->ring_unused)
+		{
+			uint64		cleanup_index = MyPState->ring_last;
+
+			slot = GetPrfSlot(cleanup_index);
+
+			Assert(slot->status != PRFS_UNUSED);
+
+			/*
+			 * If there is good reason to run compaction on the prefetch buffers,
+			 * try to do that.
+			 */
+			if (ReceiveBufferNeedsCompaction() && compact_prefetch_buffers())
+			{
+				Assert(slot->status == PRFS_UNUSED);
 			}
 			else
 			{
-				/* The buffered request is good enough, return that index */
-				pgBufferUsage.prefetch.duplicates++;
-				return ring_index;
+				/*
+				 * We have the slot for ring_last, so that must still be in
+				 * progress
+				 */
+				switch (slot->status)
+				{
+					case PRFS_REQUESTED:
+						Assert(MyPState->ring_receive == cleanup_index);
+						if (!prefetch_wait_for(cleanup_index))
+							goto Retry;
+						prefetch_set_unused(cleanup_index);
+						break;
+					case PRFS_RECEIVED:
+					case PRFS_TAG_REMAINS:
+						prefetch_set_unused(cleanup_index);
+						break;
+					default:
+						pg_unreachable();
+				}
 			}
 		}
-	}
-
-	/*
-	 * If the prefetch queue is full, we need to make room by clearing the
-	 * oldest slot. If the oldest slot holds a buffer that was already
-	 * received, we can just throw it away; we fetched the page unnecessarily
-	 * in that case. If the oldest slot holds a request that we haven't
-	 * received a response for yet, we have to wait for the response to that
-	 * before we can continue. We might not have even flushed the request to
-	 * the pageserver yet, it might be just sitting in the output buffer. In
-	 * that case, we flush it and wait for the response. (We could decide not
-	 * to send it, but it's hard to abort when the request is already in the
-	 * output buffer, and 'not sending' a prefetch request kind of goes
-	 * against the principles of prefetching)
-	 */
-	if (MyPState->ring_last + readahead_buffer_size - 1 == MyPState->ring_unused)
-	{
-		uint64		cleanup_index = MyPState->ring_last;
-
-		slot = GetPrfSlot(cleanup_index);
-
-		Assert(slot->status != PRFS_UNUSED);
 
 		/*
-		 * If there is good reason to run compaction on the prefetch buffers,
-		 * try to do that.
+		 * The next buffer pointed to by `ring_unused` is now definitely empty, so
+		 * we can insert the new request to it.
 		 */
-		if (ReceiveBufferNeedsCompaction() && compact_prefetch_buffers())
-		{
-			Assert(slot->status == PRFS_UNUSED);
-		}
-		else
-		{
-			/*
-			 * We have the slot for ring_last, so that must still be in
-			 * progress
-			 */
-			switch (slot->status)
-			{
-				case PRFS_REQUESTED:
-					Assert(MyPState->ring_receive == cleanup_index);
-					if (!prefetch_wait_for(cleanup_index))
-						goto Retry;
-					prefetch_set_unused(cleanup_index);
-					break;
-				case PRFS_RECEIVED:
-				case PRFS_TAG_REMAINS:
-					prefetch_set_unused(cleanup_index);
-					break;
-				default:
-					pg_unreachable();
-			}
-		}
+		ring_index = MyPState->ring_unused;
+
+		Assert(MyPState->ring_last <= ring_index &&
+			   ring_index <= MyPState->ring_unused);
+
+		slot = GetPrfSlotNoCheck(ring_index);
+
+		Assert(slot->status == PRFS_UNUSED);
+
+		/*
+		 * We must update the slot data before insertion, because the hash
+		 * function reads the buffer tag from the slot.
+		 */
+		slot->buftag = req.buftag;
+		slot->shard_no = get_shard_number(&tag);
+		slot->my_ring_index = ring_index;
+
+		min_ring_index = Min(min_ring_index, ring_index);
+
+		prefetch_do_request(slot, lsns);
 	}
 
-	/*
-	 * The next buffer pointed to by `ring_unused` is now definitely empty, so
-	 * we can insert the new request to it.
-	 */
-	ring_index = MyPState->ring_unused;
-	slot = &MyPState->prf_buffer[((ring_index) % readahead_buffer_size)];
+	Assert(any_hits);
 
-	Assert(MyPState->ring_last <= ring_index);
-
-	Assert(slot->status == PRFS_UNUSED);
-
-	/*
-	 * We must update the slot data before insertion, because the hash
-	 * function reads the buffer tag from the slot.
-	 */
-	slot->buftag = tag;
-	slot->shard_no = get_shard_number(&tag);
-	slot->my_ring_index = ring_index;
-
-	prefetch_do_request(slot, force_request_lsns);
-	Assert(slot->status == PRFS_REQUESTED);
-	Assert(MyPState->ring_last <= ring_index &&
-		   ring_index < MyPState->ring_unused);
+	Assert(GetPrfSlot(min_ring_index)->status == PRFS_REQUESTED ||
+		   GetPrfSlot(min_ring_index)->status == PRFS_RECEIVED);
+	Assert(MyPState->ring_last <= min_ring_index &&
+		   min_ring_index < MyPState->ring_unused);
 
 	if (flush_every_n_requests > 0 &&
 		MyPState->ring_unused - MyPState->ring_flush >= flush_every_n_requests)
@@ -921,9 +996,17 @@ Retry:
 		MyPState->ring_flush = MyPState->ring_unused;
 	}
 
-	return ring_index;
+	return min_ring_index;
 }
 
+/* Single-block form: registers one buffer via prefetch_register_bufferv() with no mask. */
+static uint64
+prefetch_register_buffer(BufferTag tag, neon_request_lsns *force_request_lsns)
+{
+	return prefetch_register_bufferv(tag, force_request_lsns, 1, NULL);
+}
+
+
 /*
  * Note: this function can get canceled and use a long jump to the next catch
  * context. Take care.
@@ -1348,6 +1431,50 @@ log_newpage_copy(NRelFileInfo * rinfo, ForkNumber forkNum, BlockNumber blkno,
 	return log_newpage(rinfo, forkNum, blkno, copied_buffer.data, page_std);
 }
 
+#if PG_MAJORVERSION_NUM >= 17
+/*
+ * Wrapper around log_newpages() that makes temporary copies of the blocks and
+ * WAL-logs those, in batches of at most XLR_MAX_BLOCK_ID pages. This makes it
+ * safe to use while holding only a shared lock on the pages, see
+ * XLogSaveBufferForHint. We don't use XLogSaveBufferForHint directly because
+ * it skips the logging if the LSN is new enough. Returns ProcLastRecPtr.
+ */
+static XLogRecPtr
+log_newpages_copy(NRelFileInfo * rinfo, ForkNumber forkNum, BlockNumber blkno,
+				  BlockNumber nblocks, Page *pages, bool page_std)
+{
+	PGAlignedBlock copied_buffer[XLR_MAX_BLOCK_ID];
+	BlockNumber	blknos[XLR_MAX_BLOCK_ID];
+	Page		pageptrs[XLR_MAX_BLOCK_ID];
+	int			nregistered = 0;
+
+	for (int i = 0; i < nblocks; i++)
+	{
+		Page	page = copied_buffer[nregistered].data;
+		memcpy(page, pages[i], BLCKSZ);
+		pageptrs[nregistered] = page;
+		blknos[nregistered] = blkno + i;
+
+		++nregistered;
+
+		if (nregistered >= XLR_MAX_BLOCK_ID)
+		{
+			log_newpages(rinfo, forkNum, nregistered, blknos, pageptrs,
+						 page_std);
+			nregistered = 0;
+		}
+	}
+
+	if (nregistered != 0)
+	{
+		log_newpages(rinfo, forkNum, nregistered, blknos, pageptrs,
+					 page_std);
+	}
+
+	return ProcLastRecPtr;
+}
+#endif /* PG_MAJORVERSION_NUM >= 17 */
+
 /*
  * Is 'buffer' identical to a freshly initialized empty heap page?
  */
@@ -1361,14 +1488,160 @@ PageIsEmptyHeapPage(char *buffer)
 	return memcmp(buffer, empty_page.data, BLCKSZ) == 0;
 }
 
+#if PG_MAJORVERSION_NUM >= 17
+/* Multi-block variant: WAL-log the given pages if needed, then record each page's LSN as last-written. */
+static void
+neon_wallog_pagev(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
+				  BlockNumber nblocks, const char **buffers, bool force)
+{
+#define BLOCK_BATCH_SIZE	16
+	bool		log_pages = false;
+	BlockNumber	batch_blockno = blocknum;
+	XLogRecPtr	lsns[BLOCK_BATCH_SIZE];
+	int			batch_size = 0;
+
+	/*
+	 * Whenever a VM or FSM page is evicted, WAL-log it. FSM and (some) VM
+	 * changes are not WAL-logged when the changes are made, so this is our
+	 * last chance to log them, otherwise they're lost. That's OK for
+	 * correctness, the non-logged updates are not critical. But we want to
+	 * have a reasonably up-to-date VM and FSM in the page server.
+	 */
+	if (force)
+	{
+		Assert(XLogInsertAllowed());
+		log_pages = true;
+	}
+	else if (XLogInsertAllowed() &&
+			 !ShutdownRequestPending &&
+			 (forknum == FSM_FORKNUM || forknum == VISIBILITYMAP_FORKNUM))
+	{
+		log_pages = true;
+	}
+
+	if (log_pages)
+	{
+		XLogRecPtr	recptr;
+		recptr = log_newpages_copy(&InfoFromSMgrRel(reln), forknum, blocknum,
+								   nblocks, (Page *) buffers, false);
+
+		for (int i = 0; i < nblocks; i++)
+			PageSetLSN(unconstify(char *, buffers[i]), recptr);
+
+		ereport(SmgrTrace,
+				(errmsg(NEON_TAG "Page %u through %u of relation %u/%u/%u.%u "
+								 "were force logged, lsn=%X/%X",
+						blocknum, blocknum + nblocks - 1,
+						RelFileInfoFmt(InfoFromSMgrRel(reln)),
+						forknum, LSN_FORMAT_ARGS(recptr))));
+	}
+
+	for (int i = 0; i < nblocks; i++)
+	{
+		Page		page = (Page) buffers[i];
+		BlockNumber blkno = blocknum + i;
+		XLogRecPtr	lsn = PageGetLSN(page);
+
+		if (lsn == InvalidXLogRecPtr)
+		{
+			/*
+			 * When PostgreSQL extends a relation, it calls smgrextend() with an
+			 * all-zeros pages, and we can just ignore that in Neon. We do need to
+			 * remember the new size, though, so that smgrnblocks() returns the
+			 * right answer after the rel has been extended. We rely on the
+			 * relsize cache for that.
+			 *
+			 * A completely empty heap page doesn't need to be WAL-logged, either.
+			 * The heapam can leave such a page behind, if e.g. an insert errors
+			 * out after initializing the page, but before it has inserted the
+			 * tuple and WAL-logged the change. When we read the page from the
+			 * page server, it will come back as all-zeros. That's OK, the heapam
+			 * will initialize an all-zeros page on first use.
+			 *
+			 * In other scenarios, evicting a dirty page with no LSN is a bad
+			 * sign: it implies that the page was not WAL-logged, and its contents
+			 * will be lost when it's evicted.
+			 */
+			if (PageIsNew(page))
+			{
+				ereport(SmgrTrace,
+						(errmsg(NEON_TAG "Page %u of relation %u/%u/%u.%u is all-zeros",
+								blkno,
+								RelFileInfoFmt(InfoFromSMgrRel(reln)),
+								forknum)));
+			}
+			else if (PageIsEmptyHeapPage(page))
+			{
+				ereport(SmgrTrace,
+						(errmsg(NEON_TAG "Page %u of relation %u/%u/%u.%u is an empty heap page with no LSN",
+								blkno,
+								RelFileInfoFmt(InfoFromSMgrRel(reln)),
+								forknum)));
+			}
+			else if (forknum != FSM_FORKNUM && forknum != VISIBILITYMAP_FORKNUM)
+			{
+				/*
+				 * It's a bad sign if there is a page with zero LSN in the buffer
+				 * cache in a standby, too. However, PANICing seems like a cure
+				 * worse than the disease, as the damage has likely already been
+				 * done in the primary. So in a standby, make this an assertion,
+				 * and in a release build just LOG the error and soldier on. We
+				 * update the last-written LSN of the page with a conservative
+				 * value in that case, which is the last replayed LSN.
+				 */
+				ereport(RecoveryInProgress() ? LOG : PANIC,
+						(errmsg(NEON_TAG "Page %u of relation %u/%u/%u.%u is evicted with zero LSN",
+								blkno,
+								RelFileInfoFmt(InfoFromSMgrRel(reln)),
+								forknum)));
+				Assert(false);
+
+				lsn = GetXLogReplayRecPtr(NULL); /* in standby mode, soldier on */
+			}
+		}
+		else
+		{
+			ereport(SmgrTrace,
+					(errmsg(NEON_TAG "Evicting page %u of relation %u/%u/%u.%u with lsn=%X/%X",
+							blkno,
+							RelFileInfoFmt(InfoFromSMgrRel(reln)),
+							forknum, LSN_FORMAT_ARGS(lsn))));
+		}
+
+		/*
+		 * Remember the LSN on this page. When we read the page again, we must
+		 * read the same or newer version of it.
+		 */
+		lsns[batch_size++] = lsn;
+
+		if (batch_size >= BLOCK_BATCH_SIZE)
+		{
+			SetLastWrittenLSNForBlockv(lsns, InfoFromSMgrRel(reln), forknum,
+									   batch_blockno,
+									   batch_size);
+			batch_blockno += batch_size;
+			batch_size = 0;
+		}
+	}
+
+	if (batch_size != 0)
+	{
+		SetLastWrittenLSNForBlockv(lsns, InfoFromSMgrRel(reln), forknum,
+								   batch_blockno,
+								   batch_size);
+	}
+}
+#endif
+
 /*
  * A page is being evicted from the shared buffer cache. Update the
  * last-written LSN of the page, and WAL-log it if needed.
  */
-static void
 #if PG_MAJORVERSION_NUM < 16
+static void
 neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, char *buffer, bool force)
 #else
+static void
 neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const char *buffer, bool force)
 #endif
 {
@@ -1548,18 +1821,39 @@ nm_adjust_lsn(XLogRecPtr lsn)
 	return lsn;
 }
 
+
+/*
+ * Pre-PG17 compatibility version of the vectorized lookup: one call per block.
+ */
+#if PG_MAJORVERSION_NUM < 17
+static void
+GetLastWrittenLSNv(NRelFileInfo relfilenode, ForkNumber forknum,
+				   BlockNumber blkno, int nblocks, XLogRecPtr *lsns)
+{
+	for (int i = 0; i < nblocks; i++)	/* fill every entry the caller will read */
+		lsns[i] = GetLastWrittenLSN(relfilenode, forknum, blkno + i);
+}
+#endif
+
 /*
  * Return LSN for requesting pages and number of blocks from page server
  */
-static neon_request_lsns
-neon_get_request_lsns(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno)
+static void
+neon_get_request_lsns(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno,
+					  neon_request_lsns *output, BlockNumber nblocks,
+					  const bits8 *mask)
 {
-	XLogRecPtr	last_written_lsn;
-	neon_request_lsns result;
+	XLogRecPtr	last_written_lsns[PG_IOV_MAX];
 
-	last_written_lsn = GetLastWrittenLSN(rinfo, forknum, blkno);
-	last_written_lsn = nm_adjust_lsn(last_written_lsn);
-	Assert(last_written_lsn != InvalidXLogRecPtr);
+	Assert(nblocks <= PG_IOV_MAX);
+
+	GetLastWrittenLSNv(rinfo, forknum, blkno, (int) nblocks, last_written_lsns);
+
+	for (int i = 0; i < nblocks; i++)
+	{
+		last_written_lsns[i] = nm_adjust_lsn(last_written_lsns[i]);
+		Assert(last_written_lsns[i] != InvalidXLogRecPtr);
+	}
 
 	if (RecoveryInProgress())
 	{
@@ -1630,95 +1924,111 @@ neon_get_request_lsns(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno)
 		/* Request the page at the end of the last fully replayed LSN. */
 		XLogRecPtr replay_lsn = GetXLogReplayRecPtr(NULL);
 
-		if (last_written_lsn > replay_lsn)
+		for (int i = 0; i < nblocks; i++)
 		{
-			/* GetCurrentReplayRecPtr was introduced in v15 */
+			neon_request_lsns *result = &output[i];
+			XLogRecPtr	last_written_lsn = last_written_lsns[i];
+
+			if (PointerIsValid(mask) && !BITMAP_ISSET(mask, i))
+				continue;
+
+			if (last_written_lsn > replay_lsn)
+			{
+				/* GetCurrentReplayRecPtr was introduced in v15 */
 #if PG_VERSION_NUM >= 150000
-			Assert(last_written_lsn == GetCurrentReplayRecPtr(NULL));
+				Assert(last_written_lsn == GetCurrentReplayRecPtr(NULL));
 #endif
 
-			/*
-			 * Cases 2 and 4. If this is a backend (case 4), the
-			 * neon_read_at_lsn() call later will wait for the WAL record to be
-			 * fully replayed.
-			 */
-			result.request_lsn = last_written_lsn;
-		}
-		else
-		{
-			/* cases 1 and 3 */
-			result.request_lsn = replay_lsn;
-		}
-		result.not_modified_since = last_written_lsn;
-		result.effective_request_lsn = result.request_lsn;
-		Assert(last_written_lsn <= result.request_lsn);
+				/*
+				 * Cases 2 and 4. If this is a backend (case 4), the
+				 * neon_read_at_lsn() call later will wait for the WAL record to be
+				 * fully replayed.
+				 */
+				result->request_lsn = last_written_lsn;
+			}
+			else
+			{
+				/* cases 1 and 3 */
+				result->request_lsn = replay_lsn;
+			}
 
-		neon_log(DEBUG1, "neon_get_request_lsns request lsn %X/%X, not_modified_since %X/%X",
-				 LSN_FORMAT_ARGS(result.request_lsn), LSN_FORMAT_ARGS(result.not_modified_since));
+			result->not_modified_since = last_written_lsn;
+			result->effective_request_lsn = result->request_lsn;
+			Assert(last_written_lsn <= result->request_lsn);
+
+			neon_log(DEBUG1, "neon_get_request_lsns request lsn %X/%X, not_modified_since %X/%X",
+					 LSN_FORMAT_ARGS(result->request_lsn), LSN_FORMAT_ARGS(result->not_modified_since));
+		}
 	}
 	else
 	{
 		XLogRecPtr	flushlsn;
-
-		/*
-		 * Use the latest LSN that was evicted from the buffer cache as the
-		 * 'not_modified_since' hint. Any pages modified by later WAL records
-		 * must still in the buffer cache, so our request cannot concern
-		 * those.
-		 */
-		neon_log(DEBUG1, "neon_get_request_lsns GetLastWrittenLSN lsn %X/%X",
-				 LSN_FORMAT_ARGS(last_written_lsn));
-
-		/*
-		 * Is it possible that the last-written LSN is ahead of last flush
-		 * LSN? Generally not, we shouldn't evict a page from the buffer cache
-		 * before all its modifications have been safely flushed. That's the
-		 * "WAL before data" rule. However, such case does exist at index
-		 * building, _bt_blwritepage logs the full page without flushing WAL
-		 * before smgrextend (files are fsynced before build ends).
-		 */
 #if PG_VERSION_NUM >= 150000
 		flushlsn = GetFlushRecPtr(NULL);
 #else
 		flushlsn = GetFlushRecPtr();
 #endif
-		if (last_written_lsn > flushlsn)
+
+		for (int i = 0; i < nblocks; i++)
 		{
-			neon_log(DEBUG5, "last-written LSN %X/%X is ahead of last flushed LSN %X/%X",
-					 LSN_FORMAT_ARGS(last_written_lsn),
-					 LSN_FORMAT_ARGS(flushlsn));
-			XLogFlush(last_written_lsn);
-			flushlsn = last_written_lsn;
+			neon_request_lsns *result = &output[i];
+			XLogRecPtr	last_written_lsn = last_written_lsns[i];
+
+			if (PointerIsValid(mask) && !BITMAP_ISSET(mask, i))
+				continue;
+			/*
+			 * Use the latest LSN that was evicted from the buffer cache as the
+			 * 'not_modified_since' hint. Any pages modified by later WAL records
+			 * must still be in the buffer cache, so our request cannot concern
+			 * those.
+			 */
+			neon_log(DEBUG1, "neon_get_request_lsns GetLastWrittenLSN lsn %X/%X",
+					 LSN_FORMAT_ARGS(last_written_lsn));
+
+			/*
+			 * Is it possible that the last-written LSN is ahead of last flush
+			 * LSN? Generally not, we shouldn't evict a page from the buffer cache
+			 * before all its modifications have been safely flushed. That's the
+			 * "WAL before data" rule. However, such case does exist at index
+			 * building, _bt_blwritepage logs the full page without flushing WAL
+			 * before smgrextend (files are fsynced before build ends).
+			 */
+			if (last_written_lsn > flushlsn)
+			{
+				neon_log(DEBUG5, "last-written LSN %X/%X is ahead of last flushed LSN %X/%X",
+						 LSN_FORMAT_ARGS(last_written_lsn),
+						 LSN_FORMAT_ARGS(flushlsn));
+				XLogFlush(last_written_lsn);
+				flushlsn = last_written_lsn;
+			}
+
+			/*
+			 * Request the very latest version of the page. In principle we
+			 * want to read the page at the current insert LSN, and we could
+			 * use that value in the request. However, there's a corner case
+			 * with pageserver's garbage collection. If the GC horizon is
+			 * set to a very small value, it's possible that by the time
+			 * that the pageserver processes our request, the GC horizon has
+			 * already moved past the LSN we calculate here. Standby servers
+			 * always have that problem as they can always lag behind the
+			 * primary, but for the primary we can avoid it by always
+			 * requesting the latest page, by setting request LSN to
+			 * UINT64_MAX.
+			 *
+			 * Remember the current LSN, however, so that we can later
+			 * correctly determine if the response to the request is still
+			 * valid. The most up-to-date LSN we could use for that purpose
+			 * would be the current insert LSN, but to avoid the overhead of
+			 * looking it up, use 'flushlsn' instead. This relies on the
+			 * assumption that if the page was modified since the last WAL
+			 * flush, it should still be in the buffer cache, and we
+			 * wouldn't be requesting it.
+			 */
+			result->request_lsn = UINT64_MAX;
+			result->not_modified_since = last_written_lsn;
+			result->effective_request_lsn = flushlsn;
 		}
-
-		/*
-		 * Request the very latest version of the page. In principle we
-		 * want to read the page at the current insert LSN, and we could
-		 * use that value in the request. However, there's a corner case
-		 * with pageserver's garbage collection. If the GC horizon is
-		 * set to a very small value, it's possible that by the time
-		 * that the pageserver processes our request, the GC horizon has
-		 * already moved past the LSN we calculate here. Standby servers
-		 * always have that problem as the can always lag behind the
-		 * primary, but for the primary we can avoid it by always
-		 * requesting the latest page, by setting request LSN to
-		 * UINT64_MAX.
-		 *
-		 * Remember the current LSN, however, so that we can later
-		 * correctly determine if the response to the request is still
-		 * valid. The most up-to-date LSN we could use for that purpose
-		 * would be the current insert LSN, but to avoid the overhead of
-		 * looking it up, use 'flushlsn' instead. This relies on the
-		 * assumption that if the page was modified since the last WAL
-		 * flush, it should still be in the buffer cache, and we
-		 * wouldn't be requesting it.
-		 */
-		result.request_lsn = UINT64_MAX;
-		result.not_modified_since = last_written_lsn;
-		result.effective_request_lsn = flushlsn;
 	}
-
-	return result;
 }
 
 /*
@@ -1728,13 +2038,13 @@ neon_get_request_lsns(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno)
  * satisfy a page read now.
  */
 static bool
-neon_prefetch_response_usable(neon_request_lsns request_lsns,
+neon_prefetch_response_usable(neon_request_lsns *request_lsns,
 							  PrefetchRequest *slot)
 {
 	/* sanity check the LSN's on the old and the new request */
-	Assert(request_lsns.request_lsn >= request_lsns.not_modified_since);
-	Assert(request_lsns.effective_request_lsn >= request_lsns.not_modified_since);
-	Assert(request_lsns.effective_request_lsn <= request_lsns.request_lsn);
+	Assert(request_lsns->request_lsn >= request_lsns->not_modified_since);
+	Assert(request_lsns->effective_request_lsn >= request_lsns->not_modified_since);
+	Assert(request_lsns->effective_request_lsn <= request_lsns->request_lsn);
 	Assert(slot->request_lsns.request_lsn >= slot->request_lsns.not_modified_since);
 	Assert(slot->request_lsns.effective_request_lsn >= slot->request_lsns.not_modified_since);
 	Assert(slot->request_lsns.effective_request_lsn <= slot->request_lsns.request_lsn);
@@ -1755,15 +2065,15 @@ neon_prefetch_response_usable(neon_request_lsns request_lsns,
 	 * calculate LSNs "out of order" with each other, but the prefetch queue
 	 * is backend-private at the moment.)
 	 */
-	if (request_lsns.effective_request_lsn < slot->request_lsns.effective_request_lsn ||
-		request_lsns.not_modified_since < slot->request_lsns.not_modified_since)
+	if (request_lsns->effective_request_lsn < slot->request_lsns.effective_request_lsn ||
+		request_lsns->not_modified_since < slot->request_lsns.not_modified_since)
 	{
 		ereport(LOG,
 				(errcode(ERRCODE_IO_ERROR),
 				 errmsg(NEON_TAG "request with unexpected LSN after prefetch"),
 				 errdetail("Request %X/%X not_modified_since %X/%X, prefetch %X/%X not_modified_since %X/%X)",
-						   LSN_FORMAT_ARGS(request_lsns.effective_request_lsn),
-						   LSN_FORMAT_ARGS(request_lsns.not_modified_since),
+						   LSN_FORMAT_ARGS(request_lsns->effective_request_lsn),
+						   LSN_FORMAT_ARGS(request_lsns->not_modified_since),
 						   LSN_FORMAT_ARGS(slot->request_lsns.effective_request_lsn),
 						   LSN_FORMAT_ARGS(slot->request_lsns.not_modified_since))));
 		return false;
@@ -1817,9 +2127,9 @@ neon_prefetch_response_usable(neon_request_lsns request_lsns,
 	 */
 
 	/* this follows from the checks above */
-	Assert(request_lsns.effective_request_lsn >= slot->request_lsns.not_modified_since);
+	Assert(request_lsns->effective_request_lsn >= slot->request_lsns.not_modified_since);
 
-	return request_lsns.not_modified_since <= slot->request_lsns.effective_request_lsn;
+	return request_lsns->not_modified_since <= slot->request_lsns.effective_request_lsn;
 }
 
 /*
@@ -1886,7 +2196,8 @@ neon_exists(SMgrRelation reln, ForkNumber forkNum)
 		return false;
 	}
 
-	request_lsns = neon_get_request_lsns(InfoFromSMgrRel(reln), forkNum, REL_METADATA_PSEUDO_BLOCKNO);
+	neon_get_request_lsns(InfoFromSMgrRel(reln), forkNum,
+						  REL_METADATA_PSEUDO_BLOCKNO, &request_lsns, 1, NULL);
 	{
 		NeonExistsRequest request = {
 			.req.tag = T_NeonExistsRequest,
@@ -2068,7 +2379,7 @@ neon_extend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno,
 	 */
 	if (max_cluster_size > 0 &&
 		reln->smgr_relpersistence == RELPERSISTENCE_PERMANENT &&
-		!IsAutoVacuumWorkerProcess())
+		!AmAutoVacuumWorkerProcess())
 	{
 		uint64		current_size = GetNeonCurrentClusterSize();
 
@@ -2149,7 +2460,7 @@ neon_zeroextend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blocknum,
 
 	if (max_cluster_size > 0 &&
 		reln->smgr_relpersistence == RELPERSISTENCE_PERMANENT &&
-		!IsAutoVacuumWorkerProcess())
+		!AmAutoVacuumWorkerProcess())
 	{
 		uint64		current_size = GetNeonCurrentClusterSize();
 
@@ -2247,6 +2558,73 @@ neon_close(SMgrRelation reln, ForkNumber forknum)
 }
 
 
+#if PG_MAJORVERSION_NUM >= 17
+/*
+ *	neon_prefetch() -- Initiate asynchronous read of the specified blocks of a relation
+ *	in batches of up to PG_IOV_MAX; batches that lfc_cache_containsv() reports
+ *	as fully present are not registered for prefetch.
+ */
+bool
+neon_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
+			  int nblocks)
+{
+	uint64		ring_index PG_USED_FOR_ASSERTS_ONLY;
+	BufferTag	tag;
+
+	switch (reln->smgr_relpersistence)
+	{
+		case 0:					/* probably shouldn't happen, but ignore it */
+		case RELPERSISTENCE_PERMANENT:
+			break;
+
+		case RELPERSISTENCE_TEMP:
+		case RELPERSISTENCE_UNLOGGED:
+			return mdprefetch(reln, forknum, blocknum, nblocks);
+
+		default:
+			neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
+	}
+
+	tag.spcOid = reln->smgr_rlocator.locator.spcOid;
+	tag.dbOid = reln->smgr_rlocator.locator.dbOid;
+	tag.relNumber = reln->smgr_rlocator.locator.relNumber;
+	tag.forkNum = forknum;
+
+	while (nblocks > 0)
+	{
+		int		iterblocks = Min(nblocks, PG_IOV_MAX);
+		bits8		lfc_present[PG_IOV_MAX / 8];
+
+		memset(lfc_present, 0, sizeof(lfc_present));
+
+		if (lfc_cache_containsv(InfoFromSMgrRel(reln), forknum, blocknum,
+								iterblocks, lfc_present) == iterblocks)
+		{
+			nblocks -= iterblocks;
+			blocknum += iterblocks;
+			continue;
+		}
+
+		tag.blockNum = blocknum;
+
+		/* invert the bitmap before handing it to prefetch_register_bufferv() */
+		for (int i = 0; i < PG_IOV_MAX / 8; i++)
+			lfc_present[i] = ~(lfc_present[i]);
+
+		ring_index = prefetch_register_bufferv(tag, NULL, iterblocks,
+											   lfc_present);
+		nblocks -= iterblocks;
+		blocknum += iterblocks;
+
+		Assert(ring_index < MyPState->ring_unused &&
+			   MyPState->ring_last <= ring_index);
+	}
+
+	return false;				/* same constant result as the pre-17 variant */
+}
+
+
+#else /* PG_MAJORVERSION_NUM >= 17 */
 /*
  *	neon_prefetch() -- Initiate asynchronous read of the specified block of a relation
  */
@@ -2285,6 +2663,8 @@ neon_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum)
 
 	return false;
 }
+#endif /* PG_MAJORVERSION_NUM < 17 */
+
 
 /*
  * neon_writeback() -- Tell the kernel to write pages back to storage.
@@ -2315,7 +2695,12 @@ neon_writeback(SMgrRelation reln, ForkNumber forknum,
 			neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
 	}
 
-	/* not implemented */
+	/*
+	 * TODO: WAL sync up to lwLsn for the indicated blocks
+	 * Without that sync, writeback doesn't actually guarantee the data is
+	 * persistently written, which does seem to be one of the assumed
+	 * properties of this smgr API call.
+	 */
 	neon_log(SmgrTrace, "writeback noop");
 
 #ifdef DEBUG_COMPARE_LOCAL
@@ -2324,30 +2709,27 @@ neon_writeback(SMgrRelation reln, ForkNumber forknum,
 #endif
 }
 
-/*
- * While function is defined in the neon extension it's used within neon_test_utils directly.
- * To avoid breaking tests in the runtime please keep function signature in sync.
- */
-void
+static void
 #if PG_MAJORVERSION_NUM < 16
-neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
-				 neon_request_lsns request_lsns, char *buffer)
+neon_read_at_lsnv(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber base_blockno, neon_request_lsns *request_lsns,
+				  char **buffers, BlockNumber nblocks, const bits8 *mask)
 #else
-neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
-				 neon_request_lsns request_lsns, void *buffer)
+neon_read_at_lsnv(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber base_blockno, neon_request_lsns *request_lsns,
+				  void **buffers, BlockNumber nblocks, const bits8 *mask)
 #endif
 {
 	NeonResponse *resp;
 	uint64		ring_index;
 	PrfHashEntry *entry;
 	PrefetchRequest *slot;
-	BufferTag	buftag =
-	{
-		.forkNum = forkNum,
-		.blockNum = blkno,
-	};
+	BufferTag	buftag = {0};
+
+	Assert(PointerIsValid(request_lsns));
+	Assert(nblocks >= 1);
 
 	CopyNRelFileInfoToBufTag(buftag, rinfo);
+	buftag.forkNum = forkNum;
+	buftag.blockNum = base_blockno;
 
 	/*
 	 * The redo process does not lock pages that it needs to replay but are
@@ -2365,115 +2747,147 @@ neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
 	 * weren't for the behaviour of the LwLsn cache that uses the highest
 	 * value of the LwLsn cache when the entry is not found.
 	 */
-	if (RecoveryInProgress() && !(MyBackendType == B_STARTUP))
-		XLogWaitForReplayOf(request_lsns.request_lsn);
+	prefetch_register_bufferv(buftag, request_lsns, nblocks, mask);
 
-	/*
-	 * Try to find prefetched page in the list of received pages.
-	 */
+	for (int i = 0; i < nblocks; i++)
+	{
+		void	   *buffer = buffers[i];
+		BlockNumber blockno = base_blockno + i;
+		neon_request_lsns *reqlsns = &request_lsns[i];
+
+		if (PointerIsValid(mask) && !BITMAP_ISSET(mask, i))
+			continue;
+
+		if (RecoveryInProgress() && MyBackendType != B_STARTUP)
+			XLogWaitForReplayOf(reqlsns[0].request_lsn);
+
+		/*
+		 * Try to find prefetched page in the list of received pages.
+		 */
 Retry:
-	entry = prfh_lookup(MyPState->prf_hash, (PrefetchRequest *) &buftag);
+		buftag.blockNum = blockno;
+		entry = prfh_lookup(MyPState->prf_hash, (PrefetchRequest *) &buftag);
 
-	if (entry != NULL)
-	{
-		slot = entry->slot;
-		if (neon_prefetch_response_usable(request_lsns, slot))
+		if (entry != NULL)
 		{
-			ring_index = slot->my_ring_index;
-			pgBufferUsage.prefetch.hits += 1;
-		}
-		else
-		{
-			/*
-			 * Cannot use this prefetch, discard it
-			 *
-			 * We can't drop cache for not-yet-received requested items. It is
-			 * unlikely this happens, but it can happen if prefetch distance
-			 * is large enough and a backend didn't consume all prefetch
-			 * requests.
-			 */
-			if (slot->status == PRFS_REQUESTED)
+			slot = entry->slot;
+			if (neon_prefetch_response_usable(reqlsns, slot))
 			{
-				if (!prefetch_wait_for(slot->my_ring_index))
-					goto Retry;
+				ring_index = slot->my_ring_index;
+				pgBufferUsage.prefetch.hits += 1;
+			}
+			else
+			{
+				/*
+				 * Cannot use this prefetch, discard it
+				 *
+				 * We can't drop cache for not-yet-received requested items. It is
+				 * unlikely this happens, but it can happen if prefetch distance
+				 * is large enough and a backend didn't consume all prefetch
+				 * requests.
+				 */
+				if (slot->status == PRFS_REQUESTED)
+				{
+					if (!prefetch_wait_for(slot->my_ring_index))
+						goto Retry;
+				}
+				/* drop caches */
+				prefetch_set_unused(slot->my_ring_index);
+				pgBufferUsage.prefetch.expired += 1;
+				/* make it look like a prefetch cache miss */
+				entry = NULL;
 			}
-			/* drop caches */
-			prefetch_set_unused(slot->my_ring_index);
-			pgBufferUsage.prefetch.expired += 1;
-			/* make it look like a prefetch cache miss */
-			entry = NULL;
 		}
-	}
 
-	do
-	{
-		if (entry == NULL)
+		do
 		{
-			pgBufferUsage.prefetch.misses += 1;
+			if (entry == NULL)
+			{
+				pgBufferUsage.prefetch.misses += 1;
 
-			ring_index = prefetch_register_buffer(buftag, &request_lsns);
-			slot = GetPrfSlot(ring_index);
-		}
-		else
+				ring_index = prefetch_register_bufferv(buftag, reqlsns, 1, NULL);
+				Assert(ring_index != UINT64_MAX);
+				slot = GetPrfSlot(ring_index);
+			}
+			else
+			{
+				/*
+				 * Empty our reference to the prefetch buffer's hash entry. When
+				 * we wait for prefetches, the entry reference is invalidated by
+				 * potential updates to the hash, and when we reconnect to the
+				 * pageserver the prefetch we're waiting for may be dropped, in
+				 * which case we need to retry and take the branch above.
+				 */
+				entry = NULL;
+			}
+
+			Assert(slot->my_ring_index == ring_index);
+			Assert(MyPState->ring_last <= ring_index &&
+				   MyPState->ring_unused > ring_index);
+			Assert(slot->status != PRFS_UNUSED);
+			Assert(GetPrfSlot(ring_index) == slot);
+
+		} while (!prefetch_wait_for(ring_index));
+
+		Assert(slot->status == PRFS_RECEIVED);
+		Assert(memcmp(&buftag, &slot->buftag, sizeof(BufferTag)) == 0);
+		Assert(buftag.blockNum == base_blockno + i);
+
+		resp = slot->response;
+
+		switch (resp->tag)
 		{
-			/*
-			 * Empty our reference to the prefetch buffer's hash entry. When
-			 * we wait for prefetches, the entry reference is invalidated by
-			 * potential updates to the hash, and when we reconnect to the
-			 * pageserver the prefetch we're waiting for may be dropped, in
-			 * which case we need to retry and take the branch above.
-			 */
-			entry = NULL;
+			case T_NeonGetPageResponse:
+				memcpy(buffer, ((NeonGetPageResponse *) resp)->page, BLCKSZ);
+				lfc_write(rinfo, forkNum, blockno, buffer);
+				break;
+
+			case T_NeonErrorResponse:
+				ereport(ERROR,
+						(errcode(ERRCODE_IO_ERROR),
+						 errmsg(NEON_TAG "[shard %d] could not read block %u in rel %u/%u/%u.%u from page server at lsn %X/%08X",
+								slot->shard_no, blockno, RelFileInfoFmt(rinfo),
+								forkNum, LSN_FORMAT_ARGS(reqlsns->effective_request_lsn)),
+						 errdetail("page server returned error: %s",
+								   ((NeonErrorResponse *) resp)->message)));
+				break;
+			default:
+				NEON_PANIC_CONNECTION_STATE(slot->shard_no, PANIC,
+											"Expected GetPage (0x%02x) or Error (0x%02x) response to GetPageRequest, but got 0x%02x",
+											T_NeonGetPageResponse, T_NeonErrorResponse, resp->tag);
 		}
 
-		Assert(slot->my_ring_index == ring_index);
-		Assert(MyPState->ring_last <= ring_index &&
-			   MyPState->ring_unused > ring_index);
-		Assert(slot->status != PRFS_UNUSED);
-		Assert(GetPrfSlot(ring_index) == slot);
-
-	} while (!prefetch_wait_for(ring_index));
-
-	Assert(slot->status == PRFS_RECEIVED);
-
-	resp = slot->response;
-
-	switch (resp->tag)
-	{
-		case T_NeonGetPageResponse:
-			memcpy(buffer, ((NeonGetPageResponse *) resp)->page, BLCKSZ);
-			lfc_write(rinfo, forkNum, blkno, buffer);
-			break;
-
-		case T_NeonErrorResponse:
-			ereport(ERROR,
-					(errcode(ERRCODE_IO_ERROR),
-					 errmsg(NEON_TAG "[shard %d] could not read block %u in rel %u/%u/%u.%u from page server at lsn %X/%08X",
-							slot->shard_no, blkno,
-							RelFileInfoFmt(rinfo),
-							forkNum,
-							LSN_FORMAT_ARGS(request_lsns.effective_request_lsn)),
-					 errdetail("page server returned error: %s",
-							   ((NeonErrorResponse *) resp)->message)));
-			break;
-		default:
-			NEON_PANIC_CONNECTION_STATE(slot->shard_no, PANIC,
-										"Expected GetPage (0x%02x) or Error (0x%02x) response to GetPageRequest, but got 0x%02x",
-										T_NeonGetPageResponse, T_NeonErrorResponse, resp->tag);
+		/* buffer was used, clean up for later reuse */
+		prefetch_set_unused(ring_index);
+		prefetch_cleanup_trailing_unused();
 	}
-
-	/* buffer was used, clean up for later reuse */
-	prefetch_set_unused(ring_index);
-	prefetch_cleanup_trailing_unused();
 }
 
 /*
- *	neon_read() -- Read the specified block from a relation.
+ * This function is defined in the neon extension but is also called directly by
+ * neon_test_utils; keep the signatures in sync to avoid breaking tests at runtime.
  */
 void
 #if PG_MAJORVERSION_NUM < 16
+neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
+				 neon_request_lsns request_lsns, char *buffer)
+#else
+neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
+				 neon_request_lsns request_lsns, void *buffer)
+#endif
+{
+	neon_read_at_lsnv(rinfo, forkNum, blkno, &request_lsns, &buffer, 1, NULL);
+}
+
+#if PG_MAJORVERSION_NUM < 17
+/*
+ *	neon_read() -- Read the specified block from a relation.
+ */
+#if PG_MAJORVERSION_NUM < 16
+void
 neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, char *buffer)
 #else
+void
 neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void *buffer)
 #endif
 {
@@ -2502,7 +2916,7 @@ neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void *buffer
 		return;
 	}
 
-	request_lsns = neon_get_request_lsns(InfoFromSMgrRel(reln), forkNum, blkno);
+	neon_get_request_lsns(InfoFromSMgrRel(reln), forkNum, blkno, &request_lsns, 1, NULL);
 	neon_read_at_lsn(InfoFromSMgrRel(reln), forkNum, blkno, request_lsns, buffer);
 
 #ifdef DEBUG_COMPARE_LOCAL
@@ -2578,6 +2992,148 @@ neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void *buffer
 	}
 #endif
 }
+#endif /* PG_MAJORVERSION_NUM <= 16 */
+
+#if PG_MAJORVERSION_NUM >= 17
+/*
+ *	neon_readv() -- Read nblocks blocks starting at blocknum into buffers[],
+ *	using the local file cache first and neon_read_at_lsnv() for the rest.
+ */
+void
+neon_readv(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
+		void **buffers, BlockNumber nblocks)
+{
+	bits8		read[PG_IOV_MAX / 8];
+	neon_request_lsns request_lsns[PG_IOV_MAX];
+	int			lfc_result;
+
+	switch (reln->smgr_relpersistence)
+	{
+		case 0:
+			neon_log(ERROR, "cannot call smgrread() on rel with unknown persistence");
+
+		case RELPERSISTENCE_PERMANENT:
+			break;
+
+		case RELPERSISTENCE_TEMP:
+		case RELPERSISTENCE_UNLOGGED:
+			mdreadv(reln, forknum, blocknum, buffers, nblocks);
+			return;
+
+		default:
+			neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
+	}
+
+	if (nblocks > PG_IOV_MAX)
+		neon_log(ERROR, "Read request too large: %d is larger than max %d",
+				 nblocks, PG_IOV_MAX);
+
+	memset(read, 0, sizeof(read));
+
+	/* Try to read from local file cache */
+	lfc_result = lfc_readv_select(InfoFromSMgrRel(reln), forknum, blocknum, buffers,
+								  nblocks, read);
+
+	/* Read all blocks from LFC, so we're done */
+	if (lfc_result == nblocks)
+		return;
+
+	if (lfc_result == -1)
+	{
+		/* can't use the LFC result, so read all blocks from PS */
+		for (int i = 0; i < PG_IOV_MAX / 8; i++)
+			read[i] = 0xFF;
+	}
+	else
+	{
+		/* invert the result: exclude blocks read from lfc */
+		for (int i = 0; i < PG_IOV_MAX / 8; i++)
+			read[i] = ~(read[i]);
+	}
+
+	neon_get_request_lsns(InfoFromSMgrRel(reln), forknum, blocknum,
+						  request_lsns, nblocks, read);
+
+	neon_read_at_lsnv(InfoFromSMgrRel(reln), forknum, blocknum, request_lsns,
+					  buffers, nblocks, read);
+
+#ifdef DEBUG_COMPARE_LOCAL
+	if (forknum == MAIN_FORKNUM && IS_LOCAL_REL(reln))
+	{
+		char		pageserver_masked[BLCKSZ];
+		char		mdbuf[BLCKSZ];
+		char		mdbuf_masked[BLCKSZ];
+
+		for (int i = 0; i < nblocks; i++)
+		{
+			BlockNumber blkno = blocknum + i;
+			void	   *buffer = buffers[i];
+			void	   *mdbufs[1] = {mdbuf};
+			XLogRecPtr	request_lsn;
+
+			if (!BITMAP_ISSET(read, i))
+				continue;		/* not fetched by neon_read_at_lsnv() above */
+			request_lsn = request_lsns[i].request_lsn;
+			mdreadv(reln, forknum, blkno, mdbufs, 1);
+
+			memcpy(pageserver_masked, buffer, BLCKSZ);
+			memcpy(mdbuf_masked, mdbuf, BLCKSZ);
+
+			if (PageIsNew((Page) mdbuf))
+			{
+				if (!PageIsNew((Page) pageserver_masked))
+				{
+					neon_log(PANIC, "page is new in MD but not in Page Server at blk %u in rel %u/%u/%u fork %u (request LSN %X/%08X):\n%s\n",
+						 blkno, RelFileInfoFmt(InfoFromSMgrRel(reln)), forknum,
+						 (uint32) (request_lsn >> 32), (uint32) request_lsn,
+						 hexdump_page(buffer));
+				}
+			}
+			else if (PageIsNew((Page) buffer))
+			{
+				neon_log(PANIC, "page is new in Page Server but not in MD at blk %u in rel %u/%u/%u fork %u (request LSN %X/%08X):\n%s\n",
+					 blkno, RelFileInfoFmt(InfoFromSMgrRel(reln)), forknum,
+					 (uint32) (request_lsn >> 32), (uint32) request_lsn,
+					 hexdump_page(mdbuf));
+			}
+			else if (PageGetSpecialSize(mdbuf) == 0)
+			{
+				/* assume heap */
+				RmgrTable[RM_HEAP_ID].rm_mask(mdbuf_masked, blkno);
+				RmgrTable[RM_HEAP_ID].rm_mask(pageserver_masked, blkno);
+
+				if (memcmp(mdbuf_masked, pageserver_masked, BLCKSZ) != 0)
+				{
+					neon_log(PANIC, "heap buffers differ at blk %u in rel %u/%u/%u fork %u (request LSN %X/%08X):\n------ MD ------\n%s\n------ Page Server ------\n%s\n",
+						 blkno, RelFileInfoFmt(InfoFromSMgrRel(reln)), forknum,
+						 (uint32) (request_lsn >> 32), (uint32) request_lsn,
+						 hexdump_page(mdbuf_masked),
+						 hexdump_page(pageserver_masked));
+				}
+			}
+			else if (PageGetSpecialSize(mdbuf) == MAXALIGN(sizeof(BTPageOpaqueData)))
+			{
+				if (((BTPageOpaqueData *) PageGetSpecialPointer(mdbuf))->btpo_cycleid < MAX_BT_CYCLE_ID)
+				{
+					/* assume btree */
+					RmgrTable[RM_BTREE_ID].rm_mask(mdbuf_masked, blkno);
+					RmgrTable[RM_BTREE_ID].rm_mask(pageserver_masked, blkno);
+
+					if (memcmp(mdbuf_masked, pageserver_masked, BLCKSZ) != 0)
+					{
+						neon_log(PANIC, "btree buffers differ at blk %u in rel %u/%u/%u fork %u (request LSN %X/%08X):\n------ MD ------\n%s\n------ Page Server ------\n%s\n",
+							 blkno, RelFileInfoFmt(InfoFromSMgrRel(reln)), forknum,
+							 (uint32) (request_lsn >> 32), (uint32) request_lsn,
+							 hexdump_page(mdbuf_masked),
+							 hexdump_page(pageserver_masked));
+					}
+				}
+			}
+		}
+	}
+#endif
+}
+#endif
 
 #ifdef DEBUG_COMPARE_LOCAL
 static char *
@@ -2623,7 +3179,72 @@ neon_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const vo
 			if (mdexists(reln, forknum))
 			{
 				/* It exists locally. Guess it's unlogged then. */
+#if PG_MAJORVERSION_NUM >= 17
+				mdwritev(reln, forknum, blocknum, &buffer, 1, skipFsync);
+#else
 				mdwrite(reln, forknum, blocknum, buffer, skipFsync);
+#endif
+				/*
+				 * We could set relpersistence now that we have determined
+				 * that it's local. But we don't dare to do it, because that
+				 * would immediately allow reads as well, which shouldn't
+				 * happen. We could cache it with a different 'relpersistence'
+				 * value, but this isn't performance critical.
+				 */
+				return;
+			}
+			break;
+
+		case RELPERSISTENCE_PERMANENT:
+			break;
+
+		case RELPERSISTENCE_TEMP:
+		case RELPERSISTENCE_UNLOGGED:
+			#if PG_MAJORVERSION_NUM >= 17
+			mdwritev(reln, forknum, blocknum, &buffer, 1, skipFsync);
+			#else
+			mdwrite(reln, forknum, blocknum, buffer, skipFsync);
+			#endif
+			return;
+		default:
+			neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
+	}
+
+	neon_wallog_page(reln, forknum, blocknum, buffer, false);
+
+	lsn = PageGetLSN((Page) buffer);
+	neon_log(SmgrTrace, "smgrwrite called for %u/%u/%u.%u blk %u, page LSN: %X/%08X",
+		 RelFileInfoFmt(InfoFromSMgrRel(reln)),
+		 forknum, blocknum,
+		 (uint32) (lsn >> 32), (uint32) lsn);
+
+	lfc_write(InfoFromSMgrRel(reln), forknum, blocknum, buffer);
+
+#ifdef DEBUG_COMPARE_LOCAL
+	if (IS_LOCAL_REL(reln))
+		#if PG_MAJORVERSION_NUM >= 17
+		mdwritev(reln, forknum, blocknum, &buffer, 1, skipFsync);
+		#else
+		mdwrite(reln, forknum, blocknum, buffer, skipFsync);
+		#endif
+#endif
+}
+
+
+
+#if PG_MAJORVERSION_NUM >= 17
+void
+neon_writev(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno,
+			 const void **buffers, BlockNumber nblocks, bool skipFsync)
+{
+	switch (reln->smgr_relpersistence)
+	{
+		case 0:
+			/* This is a bit tricky. Check if the relation exists locally */
+			if (mdexists(reln, forknum))
+			{
+				/* It exists locally. Guess it's unlogged then. */
+				mdwritev(reln, forknum, blkno, buffers, nblocks, skipFsync);
 
 				/*
 				 * We could set relpersistence now that we have determined
@@ -2641,29 +3262,24 @@ neon_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const vo
 
 		case RELPERSISTENCE_TEMP:
 		case RELPERSISTENCE_UNLOGGED:
-			mdwrite(reln, forknum, blocknum, buffer, skipFsync);
+			mdwritev(reln, forknum, blkno, buffers, nblocks, skipFsync);
 			return;
-
 		default:
 			neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
 	}
 
-	neon_wallog_page(reln, forknum, blocknum, buffer, false);
+	neon_wallog_pagev(reln, forknum, blkno, nblocks, (const char **) buffers, false);
 
-	lsn = PageGetLSN((Page) buffer);
-	neon_log(SmgrTrace, "smgrwrite called for %u/%u/%u.%u blk %u, page LSN: %X/%08X",
-		 RelFileInfoFmt(InfoFromSMgrRel(reln)),
-		 forknum, blocknum,
-		 (uint32) (lsn >> 32), (uint32) lsn);
-
-	lfc_write(InfoFromSMgrRel(reln), forknum, blocknum, buffer);
+	lfc_writev(InfoFromSMgrRel(reln), forknum, blkno, buffers, nblocks);
 
 #ifdef DEBUG_COMPARE_LOCAL
 	if (IS_LOCAL_REL(reln))
-		mdwrite(reln, forknum, blocknum, buffer, skipFsync);
+		mdwritev(reln, forknum, blocknum, &buffer, 1, skipFsync);
 #endif
 }
 
+#endif
+
 /*
  *	neon_nblocks() -- Get the number of blocks stored in a relation.
  */
@@ -2699,7 +3315,9 @@ neon_nblocks(SMgrRelation reln, ForkNumber forknum)
 		return n_blocks;
 	}
 
-	request_lsns = neon_get_request_lsns(InfoFromSMgrRel(reln), forknum, REL_METADATA_PSEUDO_BLOCKNO);
+	neon_get_request_lsns(InfoFromSMgrRel(reln), forknum,
+						  REL_METADATA_PSEUDO_BLOCKNO, &request_lsns, 1, NULL);
+
 	{
 		NeonNblocksRequest request = {
 			.req.tag = T_NeonNblocksRequest,
@@ -2757,7 +3375,9 @@ neon_dbsize(Oid dbNode)
 	neon_request_lsns request_lsns;
 	NRelFileInfo dummy_node = {0};
 
-	request_lsns = neon_get_request_lsns(dummy_node, MAIN_FORKNUM, REL_METADATA_PSEUDO_BLOCKNO);
+	neon_get_request_lsns(dummy_node, MAIN_FORKNUM,
+						  REL_METADATA_PSEUDO_BLOCKNO, &request_lsns, 1, NULL);
+
 	{
 		NeonDbSizeRequest request = {
 			.req.tag = T_NeonDbSizeRequest,
@@ -2898,6 +3518,38 @@ neon_immedsync(SMgrRelation reln, ForkNumber forknum)
 #endif
 }
 
+#if PG_MAJORVERSION_NUM >= 17
+/*
+ *	neon_regisersync() -- smgr registersync callback: forwarded to md for
+ *	temp and unlogged relations, a traced no-op for permanent ones.
+ */
+void
+neon_regisersync(SMgrRelation reln, ForkNumber forknum)
+{
+	char		persistence = reln->smgr_relpersistence;
+
+	if (persistence == RELPERSISTENCE_TEMP ||
+		persistence == RELPERSISTENCE_UNLOGGED)
+	{
+		mdregistersync(reln, forknum);
+		return;
+	}
+
+	if (persistence == 0)
+		neon_log(ERROR, "cannot call smgrregistersync() on rel with unknown persistence");
+	if (persistence != RELPERSISTENCE_PERMANENT)
+		neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
+
+	neon_log(SmgrTrace, "[NEON_SMGR] registersync noop");
+
+#ifdef DEBUG_COMPARE_LOCAL
+	if (IS_LOCAL_REL(reln))
+		mdimmedsync(reln, forknum);
+#endif
+}
+#endif
+
+
 /*
  * neon_start_unlogged_build() -- Starting build operation on a rel.
  *
@@ -3047,8 +3699,11 @@ neon_end_unlogged_build(SMgrRelation reln)
 static int
 neon_read_slru_segment(SMgrRelation reln, const char* path, int segno, void* buffer)
 {
-	XLogRecPtr request_lsn,
-		not_modified_since;
+	XLogRecPtr	request_lsn,
+				not_modified_since;
+	SlruKind	kind;
+	int			n_blocks;
+	shardno_t	shard_no = 0; /* All SLRUs are at shard 0 */
 
 	/*
 	 * Compute a request LSN to use, similar to neon_get_request_lsns() but the
@@ -3078,32 +3733,30 @@ neon_read_slru_segment(SMgrRelation reln, const char* path, int segno, void* buf
 	 */
 	not_modified_since = nm_adjust_lsn(GetRedoStartLsn());
 
-	SlruKind kind;
-
-    if (STRPREFIX(path, "pg_xact"))
-        kind = SLRU_CLOG;
-    else if (STRPREFIX(path, "pg_multixact/members"))
-        kind = SLRU_MULTIXACT_MEMBERS;
-    else if (STRPREFIX(path, "pg_multixact/offsets"))
-        kind = SLRU_MULTIXACT_OFFSETS;
-    else
-        return -1;
+	if (STRPREFIX(path, "pg_xact"))
+		kind = SLRU_CLOG;
+	else if (STRPREFIX(path, "pg_multixact/members"))
+		kind = SLRU_MULTIXACT_MEMBERS;
+	else if (STRPREFIX(path, "pg_multixact/offsets"))
+		kind = SLRU_MULTIXACT_OFFSETS;
+	else
+		return -1;
 
 	NeonResponse *resp;
 	NeonGetSlruSegmentRequest request = {
 		.req.tag = T_NeonGetSlruSegmentRequest,
 		.req.lsn = request_lsn,
 		.req.not_modified_since = not_modified_since,
-
 		.kind = kind,
 		.segno = segno
 	};
-	int n_blocks;
-	shardno_t shard_no = 0; /* All SLRUs are at shard 0 */
+
 	do
 	{
 		while (!page_server->send(shard_no, &request.req) || !page_server->flush(shard_no));
+
 		consume_prefetch_responses();
+
 		resp = page_server->receive(shard_no);
 	} while (resp == NULL);
 
@@ -3182,14 +3835,23 @@ static const struct f_smgr neon_smgr =
 #if PG_MAJORVERSION_NUM >= 16
 	.smgr_zeroextend = neon_zeroextend,
 #endif
+#if PG_MAJORVERSION_NUM >= 17
+	.smgr_prefetch = neon_prefetch,
+	.smgr_readv = neon_readv,
+	.smgr_writev = neon_writev,
+#else
 	.smgr_prefetch = neon_prefetch,
 	.smgr_read = neon_read,
 	.smgr_write = neon_write,
+#endif
+
 	.smgr_writeback = neon_writeback,
 	.smgr_nblocks = neon_nblocks,
 	.smgr_truncate = neon_truncate,
 	.smgr_immedsync = neon_immedsync,
-
+#if PG_MAJORVERSION_NUM >= 17
+	.smgr_registersync = neon_regisersync,
+#endif
 	.smgr_start_unlogged_build = neon_start_unlogged_build,
 	.smgr_finish_unlogged_build_phase_1 = neon_finish_unlogged_build_phase_1,
 	.smgr_end_unlogged_build = neon_end_unlogged_build,
@@ -3198,11 +3860,11 @@ static const struct f_smgr neon_smgr =
 };
 
 const f_smgr *
-smgr_neon(BackendId backend, NRelFileInfo rinfo)
+smgr_neon(ProcNumber backend, NRelFileInfo rinfo)
 {
 
 	/* Don't use page server for temp relations */
-	if (backend != InvalidBackendId)
+	if (backend != INVALID_PROC_NUMBER)
 		return smgr_standard(backend, rinfo);
 	else
 		return &neon_smgr;
diff --git a/pgxn/neon/walproposer_pg.c b/pgxn/neon/walproposer_pg.c
index 65ef588ba5..4d0d06e6de 100644
--- a/pgxn/neon/walproposer_pg.c
+++ b/pgxn/neon/walproposer_pg.c
@@ -81,6 +81,7 @@ static void nwp_register_gucs(void);
 static void assign_neon_safekeepers(const char *newval, void *extra);
 static void nwp_prepare_shmem(void);
 static uint64 backpressure_lag_impl(void);
+static uint64 startup_backpressure_wrap(void);
 static bool backpressure_throttling_impl(void);
 static void walprop_register_bgworker(void);
 
@@ -90,7 +91,7 @@ static void walprop_pg_init_bgworker(void);
 static TimestampTz walprop_pg_get_current_timestamp(WalProposer *wp);
 static void walprop_pg_load_libpqwalreceiver(void);
 
-static process_interrupts_callback_t PrevProcessInterruptsCallback;
+static process_interrupts_callback_t PrevProcessInterruptsCallback = NULL;
 static shmem_startup_hook_type prev_shmem_startup_hook_type;
 #if PG_VERSION_NUM >= 150000
 static shmem_request_hook_type prev_shmem_request_hook = NULL;
@@ -178,7 +179,7 @@ pg_init_walproposer(void)
 
 	nwp_prepare_shmem();
 
-	delay_backend_us = &backpressure_lag_impl;
+	delay_backend_us = &startup_backpressure_wrap;
 	PrevProcessInterruptsCallback = ProcessInterruptsCallback;
 	ProcessInterruptsCallback = backpressure_throttling_impl;
 
@@ -352,6 +353,22 @@ backpressure_lag_impl(void)
 	return 0;
 }
 
+/*
+ * Initial delay_backend_us callback. The postmaster cannot apply
+ * backpressure and the startup process cannot afford to be slowed down, so
+ * both report zero lag; any other process swaps in backpressure_lag_impl().
+ */
+static uint64
+startup_backpressure_wrap(void)
+{
+	if (!AmStartupProcess() && IsUnderPostmaster)
+	{
+		delay_backend_us = &backpressure_lag_impl;
+		return backpressure_lag_impl();
+	}
+	return 0;
+}
+
 /*
  * WalproposerShmemSize --- report amount of shared memory space needed
  */
@@ -401,12 +418,13 @@ WalproposerShmemInit_SyncSafekeeper(void)
 static bool
 backpressure_throttling_impl(void)
 {
-	int64		lag;
+	uint64		lag;
 	TimestampTz start,
 				stop;
-	bool		retry = PrevProcessInterruptsCallback
-		? PrevProcessInterruptsCallback()
-		: false;
+	bool		retry = false;
+
+	if (PointerIsValid(PrevProcessInterruptsCallback))
+		retry = PrevProcessInterruptsCallback();
 
 	/*
 	 * Don't throttle read only transactions or wal sender. Do throttle CREATE
@@ -602,7 +620,12 @@ walprop_pg_init_walsender(void)
 	/* Create replication slot for WAL proposer if not exists */
 	if (SearchNamedReplicationSlot(WAL_PROPOSER_SLOT_NAME, false) == NULL)
 	{
+#if PG_MAJORVERSION_NUM >= 17
+		ReplicationSlotCreate(WAL_PROPOSER_SLOT_NAME, false, RS_PERSISTENT,
+							  false, false, false);
+#else
 		ReplicationSlotCreate(WAL_PROPOSER_SLOT_NAME, false, RS_PERSISTENT, false);
+#endif
 		ReplicationSlotReserveWal();
 		/* Write this slot to disk */
 		ReplicationSlotMarkDirty();
@@ -1509,7 +1532,11 @@ walprop_pg_init_event_set(WalProposer *wp)
 		wpg_log(FATAL, "double-initialization of event set");
 
 	/* for each sk, we have socket plus potentially socket for neon walreader */
+#if PG_MAJORVERSION_NUM >= 17
+	waitEvents = CreateWaitEventSet(NULL, 2 + 2 * wp->n_safekeepers);
+#else
 	waitEvents = CreateWaitEventSet(TopMemoryContext, 2 + 2 * wp->n_safekeepers);
+#endif
 	AddWaitEventToSet(waitEvents, WL_LATCH_SET, PGINVALID_SOCKET,
 					  MyLatch, NULL);
 	AddWaitEventToSet(waitEvents, WL_EXIT_ON_PM_DEATH, PGINVALID_SOCKET,
diff --git a/pgxn/neon_rmgr/neon_rmgr_decode.c b/pgxn/neon_rmgr/neon_rmgr_decode.c
index f327e132e9..66032c88f6 100644
--- a/pgxn/neon_rmgr/neon_rmgr_decode.c
+++ b/pgxn/neon_rmgr/neon_rmgr_decode.c
@@ -1,6 +1,7 @@
 #include "postgres.h"
 
 #if PG_MAJORVERSION_NUM >= 16
+
 #include "access/heapam_xlog.h"
 #include "access/neon_xlog.h"
 #include "replication/decode.h"
@@ -9,6 +10,10 @@
 
 #include "neon_rmgr.h"
 
+#endif /* PG >= 16 */
+
+#if PG_MAJORVERSION_NUM == 16
+
 /* individual record(group)'s handlers */
 static void DecodeNeonInsert(LogicalDecodingContext *ctx, XLogRecordBuffer *buf);
 static void DecodeNeonUpdate(LogicalDecodingContext *ctx, XLogRecordBuffer *buf);
@@ -399,6 +404,398 @@ DecodeXLogTuple(char *data, Size len, ReorderBufferTupleBuf *tuple)
 	header->t_infomask2 = xlhdr.t_infomask2;
 	header->t_hoff = xlhdr.t_hoff;
 }
+#endif
+
+#if PG_MAJORVERSION_NUM == 17
+
+/* individual record(group)'s handlers */
+static void DecodeNeonInsert(LogicalDecodingContext *ctx, XLogRecordBuffer *buf);
+static void DecodeNeonUpdate(LogicalDecodingContext *ctx, XLogRecordBuffer *buf);
+static void DecodeNeonDelete(LogicalDecodingContext *ctx, XLogRecordBuffer *buf);
+static void DecodeNeonMultiInsert(LogicalDecodingContext *ctx, XLogRecordBuffer *buf);
+
+/* common function to decode tuples */
+static void DecodeXLogTuple(char *data, Size len, HeapTuple tuple);
 
 
-#endif
\ No newline at end of file
+void
+neon_rm_decode(LogicalDecodingContext *ctx, XLogRecordBuffer *buf)
+{
+	uint8		info = XLogRecGetInfo(buf->record) & XLOG_NEON_OPMASK;
+	TransactionId xid = XLogRecGetXid(buf->record);
+	SnapBuild  *builder = ctx->snapshot_builder;
+
+	ReorderBufferProcessXid(ctx->reorder, xid, buf->origptr);
+
+	/*
+	 * If we don't have snapshot or we are just fast-forwarding, there is no
+	 * point in decoding data changes.
+	 */
+	if (SnapBuildCurrentState(builder) < SNAPBUILD_FULL_SNAPSHOT ||
+		ctx->fast_forward)
+		return;
+
+	switch (info)
+	{
+		case XLOG_NEON_HEAP_INSERT:
+			if (SnapBuildProcessChange(builder, xid, buf->origptr))
+				DecodeNeonInsert(ctx, buf);
+			break;
+		case XLOG_NEON_HEAP_DELETE:
+			if (SnapBuildProcessChange(builder, xid, buf->origptr))
+				DecodeNeonDelete(ctx, buf);
+			break;
+		case XLOG_NEON_HEAP_UPDATE:
+		case XLOG_NEON_HEAP_HOT_UPDATE:
+			if (SnapBuildProcessChange(builder, xid, buf->origptr))
+				DecodeNeonUpdate(ctx, buf);
+			break;
+		case XLOG_NEON_HEAP_LOCK:
+			break;
+		case XLOG_NEON_HEAP_MULTI_INSERT:
+			if (SnapBuildProcessChange(builder, xid, buf->origptr))
+				DecodeNeonMultiInsert(ctx, buf);
+			break;
+		default:
+			elog(ERROR, "unexpected RM_HEAP_ID record type: %u", info);
+			break;
+	}
+}
+
+static inline bool
+FilterByOrigin(LogicalDecodingContext *ctx, RepOriginId origin_id)
+{
+	if (ctx->callbacks.filter_by_origin_cb == NULL)
+		return false;
+
+	return filter_by_origin_cb_wrapper(ctx, origin_id);
+}
+
+/*
+ * Parse XLOG_NEON_HEAP_INSERT (not MULTI_INSERT!) records into tuplebufs.
+ *
+ * Inserts can contain the new tuple.
+ */
+static void
+DecodeNeonInsert(LogicalDecodingContext *ctx, XLogRecordBuffer *buf)
+{
+	Size		datalen;
+	char	   *tupledata;
+	Size		tuplelen;
+	XLogReaderState *r = buf->record;
+	xl_neon_heap_insert *xlrec;
+	ReorderBufferChange *change;
+	RelFileLocator target_locator;
+
+	xlrec = (xl_neon_heap_insert *) XLogRecGetData(r);
+
+	/*
+	 * Ignore insert records without new tuples (this does happen when
+	 * raw_heap_insert marks the TOAST record as HEAP_INSERT_NO_LOGICAL).
+	 */
+	if (!(xlrec->flags & XLH_INSERT_CONTAINS_NEW_TUPLE))
+		return;
+
+	/* only interested in our database */
+	XLogRecGetBlockTag(r, 0, &target_locator, NULL, NULL);
+	if (target_locator.dbOid != ctx->slot->data.database)
+		return;
+
+	/* output plugin doesn't look for this origin, no need to queue */
+	if (FilterByOrigin(ctx, XLogRecGetOrigin(r)))
+		return;
+
+	change = ReorderBufferGetChange(ctx->reorder);
+	if (!(xlrec->flags & XLH_INSERT_IS_SPECULATIVE))
+		change->action = REORDER_BUFFER_CHANGE_INSERT;
+	else
+		change->action = REORDER_BUFFER_CHANGE_INTERNAL_SPEC_INSERT;
+	change->origin_id = XLogRecGetOrigin(r);
+
+	memcpy(&change->data.tp.rlocator, &target_locator, sizeof(RelFileLocator));
+
+	tupledata = XLogRecGetBlockData(r, 0, &datalen);
+	tuplelen = datalen - SizeOfNeonHeapHeader;
+
+	change->data.tp.newtuple =
+		ReorderBufferGetTupleBuf(ctx->reorder, tuplelen);
+
+	DecodeXLogTuple(tupledata, datalen, change->data.tp.newtuple);
+
+	change->data.tp.clear_toast_afterwards = true;
+
+	ReorderBufferQueueChange(ctx->reorder, XLogRecGetXid(r), buf->origptr,
+							 change,
+							 xlrec->flags & XLH_INSERT_ON_TOAST_RELATION);
+}
+
+/*
+ * Parse XLOG_HEAP_DELETE from wal into proper tuplebufs.
+ *
+ * Deletes can possibly contain the old primary key.
+ */
+static void
+DecodeNeonDelete(LogicalDecodingContext *ctx, XLogRecordBuffer *buf)
+{
+	XLogReaderState *r = buf->record;
+	xl_neon_heap_delete *xlrec;
+	ReorderBufferChange *change;
+	RelFileLocator target_locator;
+
+	xlrec = (xl_neon_heap_delete *) XLogRecGetData(r);
+
+	/* only interested in our database */
+	XLogRecGetBlockTag(r, 0, &target_locator, NULL, NULL);
+	if (target_locator.dbOid != ctx->slot->data.database)
+		return;
+
+	/* output plugin doesn't look for this origin, no need to queue */
+	if (FilterByOrigin(ctx, XLogRecGetOrigin(r)))
+		return;
+
+	change = ReorderBufferGetChange(ctx->reorder);
+
+	if (xlrec->flags & XLH_DELETE_IS_SUPER)
+		change->action = REORDER_BUFFER_CHANGE_INTERNAL_SPEC_ABORT;
+	else
+		change->action = REORDER_BUFFER_CHANGE_DELETE;
+
+	change->origin_id = XLogRecGetOrigin(r);
+
+	memcpy(&change->data.tp.rlocator, &target_locator, sizeof(RelFileLocator));
+
+	/* old primary key stored right after the fixed-size delete record */
+	if (xlrec->flags & XLH_DELETE_CONTAINS_OLD)
+	{
+		Size		datalen = XLogRecGetDataLen(r) - SizeOfNeonHeapDelete;
+		Size		tuplelen = datalen - SizeOfNeonHeapHeader;
+
+		Assert(XLogRecGetDataLen(r) > (SizeOfNeonHeapDelete + SizeOfNeonHeapHeader));
+
+		change->data.tp.oldtuple =
+			ReorderBufferGetTupleBuf(ctx->reorder, tuplelen);
+
+		DecodeXLogTuple((char *) xlrec + SizeOfNeonHeapDelete,
+						datalen, change->data.tp.oldtuple);
+	}
+
+	change->data.tp.clear_toast_afterwards = true;
+
+	ReorderBufferQueueChange(ctx->reorder, XLogRecGetXid(r), buf->origptr,
+							 change, false);
+}
+
+/*
+ * Parse XLOG_HEAP_UPDATE and XLOG_HEAP_HOT_UPDATE, which have the same layout
+ * in the record, from wal into proper tuplebufs.
+ *
+ * Updates can possibly contain a new tuple and the old primary key.
+ */
+static void
+DecodeNeonUpdate(LogicalDecodingContext *ctx, XLogRecordBuffer *buf)
+{
+	XLogReaderState *r = buf->record;
+	xl_neon_heap_update *xlrec;
+	ReorderBufferChange *change;
+	char	   *data;
+	RelFileLocator target_locator;
+
+	xlrec = (xl_neon_heap_update *) XLogRecGetData(r);
+
+	/* only interested in our database */
+	XLogRecGetBlockTag(r, 0, &target_locator, NULL, NULL);
+	if (target_locator.dbOid != ctx->slot->data.database)
+		return;
+
+	/* output plugin doesn't look for this origin, no need to queue */
+	if (FilterByOrigin(ctx, XLogRecGetOrigin(r)))
+		return;
+
+	change = ReorderBufferGetChange(ctx->reorder);
+	change->action = REORDER_BUFFER_CHANGE_UPDATE;
+	change->origin_id = XLogRecGetOrigin(r);
+	memcpy(&change->data.tp.rlocator, &target_locator, sizeof(RelFileLocator));
+
+	if (xlrec->flags & XLH_UPDATE_CONTAINS_NEW_TUPLE)
+	{
+		Size		datalen;
+		Size		tuplelen;
+
+		data = XLogRecGetBlockData(r, 0, &datalen);
+
+		tuplelen = datalen - SizeOfNeonHeapHeader;
+
+		change->data.tp.newtuple =
+			ReorderBufferGetTupleBuf(ctx->reorder, tuplelen);
+
+		DecodeXLogTuple(data, datalen, change->data.tp.newtuple);
+	}
+
+	if (xlrec->flags & XLH_UPDATE_CONTAINS_OLD)
+	{
+		Size		datalen;
+		Size		tuplelen;
+
+		/* caution, remaining data in record is not aligned */
+		data = XLogRecGetData(r) + SizeOfNeonHeapUpdate;
+		datalen = XLogRecGetDataLen(r) - SizeOfNeonHeapUpdate;
+		tuplelen = datalen - SizeOfNeonHeapHeader;
+
+		change->data.tp.oldtuple =
+			ReorderBufferGetTupleBuf(ctx->reorder, tuplelen);
+
+		DecodeXLogTuple(data, datalen, change->data.tp.oldtuple);
+	}
+
+	change->data.tp.clear_toast_afterwards = true;
+
+	ReorderBufferQueueChange(ctx->reorder, XLogRecGetXid(r), buf->origptr,
+							 change, false);
+}
+
+/*
+ * Decode an XLOG_NEON_HEAP_MULTI_INSERT record into multiple tuplebufs.
+ *
+ * Currently MULTI_INSERT will always contain the full tuples.
+ */
+static void
+DecodeNeonMultiInsert(LogicalDecodingContext *ctx, XLogRecordBuffer *buf)
+{
+	XLogReaderState *r = buf->record;
+	xl_neon_heap_multi_insert *xlrec;
+	int			i;
+	char	   *data;
+	char	   *tupledata;
+	Size		tuplelen;
+	RelFileLocator rlocator;
+
+	xlrec = (xl_neon_heap_multi_insert *) XLogRecGetData(r);
+
+	/*
+	 * Ignore insert records without new tuples.  This happens when a
+	 * multi_insert is done on a catalog or on a non-persistent relation.
+	 */
+	if (!(xlrec->flags & XLH_INSERT_CONTAINS_NEW_TUPLE))
+		return;
+
+	/* only interested in our database */
+	XLogRecGetBlockTag(r, 0, &rlocator, NULL, NULL);
+	if (rlocator.dbOid != ctx->slot->data.database)
+		return;
+
+	/* output plugin doesn't look for this origin, no need to queue */
+	if (FilterByOrigin(ctx, XLogRecGetOrigin(r)))
+		return;
+
+	/*
+	 * We know that this multi_insert isn't for a catalog, so the block should
+	 * always have data even if a full-page write of it is taken.
+	 */
+	tupledata = XLogRecGetBlockData(r, 0, &tuplelen);
+	Assert(tupledata != NULL);
+
+	data = tupledata;
+	for (i = 0; i < xlrec->ntuples; i++)
+	{
+		ReorderBufferChange *change;
+		xl_neon_multi_insert_tuple *xlhdr;
+		int			datalen;
+		HeapTuple	tuple;
+		HeapTupleHeader header;
+
+		change = ReorderBufferGetChange(ctx->reorder);
+		change->action = REORDER_BUFFER_CHANGE_INSERT;
+		change->origin_id = XLogRecGetOrigin(r);
+
+		memcpy(&change->data.tp.rlocator, &rlocator, sizeof(RelFileLocator));
+
+		xlhdr = (xl_neon_multi_insert_tuple *) SHORTALIGN(data);
+		data = ((char *) xlhdr) + SizeOfNeonMultiInsertTuple;
+		datalen = xlhdr->datalen;
+
+		change->data.tp.newtuple =
+			ReorderBufferGetTupleBuf(ctx->reorder, datalen);
+
+		tuple = change->data.tp.newtuple;
+		header = tuple->t_data;
+
+		/* not a disk based tuple */
+		ItemPointerSetInvalid(&tuple->t_self);
+
+		/*
+		 * We can only figure this out after reassembling the transactions.
+		 */
+		tuple->t_tableOid = InvalidOid;
+
+		tuple->t_len = datalen + SizeofHeapTupleHeader;
+
+		memset(header, 0, SizeofHeapTupleHeader);
+
+		memcpy((char *) tuple->t_data + SizeofHeapTupleHeader,
+			   (char *) data,
+			   datalen);
+		header->t_infomask = xlhdr->t_infomask;
+		header->t_infomask2 = xlhdr->t_infomask2;
+		header->t_hoff = xlhdr->t_hoff;
+
+		/*
+		 * Reset toast reassembly state only after the last row in the last
+		 * xl_multi_insert_tuple record emitted by one heap_multi_insert()
+		 * call.
+		 */
+		if (xlrec->flags & XLH_INSERT_LAST_IN_MULTI &&
+			(i + 1) == xlrec->ntuples)
+			change->data.tp.clear_toast_afterwards = true;
+		else
+			change->data.tp.clear_toast_afterwards = false;
+
+		ReorderBufferQueueChange(ctx->reorder, XLogRecGetXid(r),
+								 buf->origptr, change, false);
+
+		/* move to the next xl_neon_multi_insert_tuple entry */
+		data += datalen;
+	}
+	Assert(data == tupledata + tuplelen);
+}
+
+/*
+ * Read a HeapTuple as WAL logged by heap_insert, heap_update and heap_delete
+ * (but not by heap_multi_insert) into a tuplebuf.
+ *
+ * The size 'len' and the pointer 'data' in the record need to be
+ * computed outside as they are record specific.
+ */
+static void
+DecodeXLogTuple(char *data, Size len, HeapTuple tuple)
+{
+	xl_neon_heap_header xlhdr;
+	int			datalen = len - SizeOfNeonHeapHeader;
+	HeapTupleHeader header;
+
+	Assert(datalen >= 0);
+
+	tuple->t_len = datalen + SizeofHeapTupleHeader;
+	header = tuple->t_data;
+
+	/* not a disk based tuple */
+	ItemPointerSetInvalid(&tuple->t_self);
+
+	/* we can only figure this out after reassembling the transactions */
+	tuple->t_tableOid = InvalidOid;
+
+	/* data is not stored aligned, copy to aligned storage */
+	memcpy((char *) &xlhdr,
+		   data,
+		   SizeOfNeonHeapHeader);
+
+	memset(header, 0, SizeofHeapTupleHeader);
+
+	memcpy(((char *) tuple->t_data) + SizeofHeapTupleHeader,
+		   data + SizeOfNeonHeapHeader,
+		   datalen);
+
+	header->t_infomask = xlhdr.t_infomask;
+	header->t_infomask2 = xlhdr.t_infomask2;
+	header->t_hoff = xlhdr.t_hoff;
+}
+#endif
diff --git a/pgxn/neon_walredo/inmem_smgr.c b/pgxn/neon_walredo/inmem_smgr.c
index 4e604a710c..a45e8f5c4a 100644
--- a/pgxn/neon_walredo/inmem_smgr.c
+++ b/pgxn/neon_walredo/inmem_smgr.c
@@ -68,8 +68,13 @@ static void inmem_close(SMgrRelation reln, ForkNumber forknum);
 static void inmem_create(SMgrRelation reln, ForkNumber forknum, bool isRedo);
 static bool inmem_exists(SMgrRelation reln, ForkNumber forknum);
 static void inmem_unlink(NRelFileInfoBackend rinfo, ForkNumber forknum, bool isRedo);
+#if PG_MAJORVERSION_NUM >= 17
+static bool inmem_prefetch(SMgrRelation reln, ForkNumber forknum,
+						   BlockNumber blocknum, int nblocks);
+#else
 static bool inmem_prefetch(SMgrRelation reln, ForkNumber forknum,
 						   BlockNumber blocknum);
+#endif
 #if PG_MAJORVERSION_NUM < 16
 static void inmem_extend(SMgrRelation reln, ForkNumber forknum,
 						 BlockNumber blocknum, char *buffer, bool skipFsync);
@@ -93,7 +98,9 @@ static BlockNumber inmem_nblocks(SMgrRelation reln, ForkNumber forknum);
 static void inmem_truncate(SMgrRelation reln, ForkNumber forknum,
 						   BlockNumber nblocks);
 static void inmem_immedsync(SMgrRelation reln, ForkNumber forknum);
-
+#if PG_MAJORVERSION_NUM >= 17
+static void inmem_registersync(SMgrRelation reln, ForkNumber forknum);
+#endif
 
 /*
  *	inmem_init() -- Initialize private state
@@ -190,6 +197,14 @@ inmem_close(SMgrRelation reln, ForkNumber forknum)
 {
 }
 
+#if PG_MAJORVERSION_NUM >= 17
+static bool
+inmem_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
+			   int nblocks)
+{
+	return true;
+}
+#else
 /*
  *	inmem_prefetch() -- Initiate asynchronous read of the specified block of a relation
  */
@@ -198,6 +213,7 @@ inmem_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum)
 {
 	return true;
 }
+#endif
 
 /*
  * inmem_writeback() -- Tell the kernel to write pages back to storage.
@@ -211,11 +227,13 @@ inmem_writeback(SMgrRelation reln, ForkNumber forknum,
 /*
  *	inmem_read() -- Read the specified block from a relation.
  */
+#if PG_MAJORVERSION_NUM < 16
 static void
 inmem_read(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno,
-#if PG_MAJORVERSION_NUM < 16
 		   char *buffer)
 #else
+static void
+inmem_read(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno,
 		   void *buffer)
 #endif
 {
@@ -228,6 +246,18 @@ inmem_read(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno,
 		memcpy(buffer, page_body[pg], BLCKSZ);
 }
 
+#if PG_MAJORVERSION_NUM >= 17
+static void
+inmem_readv(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno,
+			void **buffers, BlockNumber nblocks)
+{
+	/* Read nblocks consecutive blocks starting at blkno; buffers[i]
+	 * receives block blkno + i. */
+	for (BlockNumber i = 0; i < nblocks; i++)
+		inmem_read(reln, forknum, blkno + i, buffers[i]);
+}
+#endif
+
 /*
  *	inmem_write() -- Write the supplied block at the appropriate location.
  *
@@ -280,6 +310,18 @@ inmem_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
 	memcpy(page_body[pg], buffer, BLCKSZ);
 }
 
+#if PG_MAJORVERSION_NUM >= 17
+static void
+inmem_writev(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno,
+			 const void **buffers, BlockNumber nblocks, bool skipFsync)
+{
+	/* Write nblocks consecutive blocks starting at blkno; buffers[i]
+	 * holds the contents of block blkno + i. */
+	for (BlockNumber i = 0; i < nblocks; i++)
+		inmem_write(reln, forknum, blkno + i, buffers[i], skipFsync);
+}
+#endif
+
 /*
  *	inmem_nblocks() -- Get the number of blocks stored in a relation.
  */
@@ -315,6 +357,13 @@ inmem_immedsync(SMgrRelation reln, ForkNumber forknum)
 {
 }
 
+#if PG_MAJORVERSION_NUM >= 17
+static void
+inmem_registersync(SMgrRelation reln, ForkNumber forknum)
+{
+}
+#endif
+
 static const struct f_smgr inmem_smgr =
 {
 	.smgr_init = inmem_init,
@@ -328,23 +377,39 @@ static const struct f_smgr inmem_smgr =
 #if PG_MAJORVERSION_NUM >= 16
 	.smgr_zeroextend = inmem_zeroextend,
 #endif
+#if PG_MAJORVERSION_NUM >= 17
+	.smgr_prefetch = inmem_prefetch,
+	.smgr_readv = inmem_readv,
+	.smgr_writev = inmem_writev,
+#else
 	.smgr_prefetch = inmem_prefetch,
 	.smgr_read = inmem_read,
 	.smgr_write = inmem_write,
+#endif
 	.smgr_writeback = inmem_writeback,
 	.smgr_nblocks = inmem_nblocks,
 	.smgr_truncate = inmem_truncate,
 	.smgr_immedsync = inmem_immedsync,
+
+#if PG_MAJORVERSION_NUM >= 17
+	.smgr_registersync = inmem_registersync,
+#endif
+
+	.smgr_start_unlogged_build = NULL,
+	.smgr_finish_unlogged_build_phase_1 = NULL,
+	.smgr_end_unlogged_build = NULL,
+	.smgr_read_slru_segment = NULL,
 };
 
 const f_smgr *
-smgr_inmem(BackendId backend, NRelFileInfo rinfo)
+smgr_inmem(ProcNumber backend, NRelFileInfo rinfo)
 {
 	Assert(InRecovery);
-	if (backend != InvalidBackendId)
-		return smgr_standard(backend, rinfo);
-	else
-		return &inmem_smgr;
+	/*
+	 * Always hand out the in-memory smgr. The previous fallback to
+	 * smgr_standard() when backend != INVALID_PROC_NUMBER is disabled.
+	 */
+	return &inmem_smgr;
 }
 
 void
diff --git a/pgxn/neon_walredo/inmem_smgr.h b/pgxn/neon_walredo/inmem_smgr.h
index 58b98b8e6a..91f1c80965 100644
--- a/pgxn/neon_walredo/inmem_smgr.h
+++ b/pgxn/neon_walredo/inmem_smgr.h
@@ -11,7 +11,7 @@
 #ifndef INMEM_SMGR_H
 #define INMEM_SMGR_H
 
-extern const f_smgr *smgr_inmem(BackendId backend, NRelFileInfo rinfo);
+extern const f_smgr *smgr_inmem(ProcNumber backend, NRelFileInfo rinfo);
 extern void smgr_init_inmem(void);
 
 #endif /* INMEM_SMGR_H */
diff --git a/pgxn/neon_walredo/walredoproc.c b/pgxn/neon_walredo/walredoproc.c
index cc545393f5..219ca85207 100644
--- a/pgxn/neon_walredo/walredoproc.c
+++ b/pgxn/neon_walredo/walredoproc.c
@@ -100,6 +100,9 @@
 #include "storage/buf_internals.h"
 #include "storage/bufmgr.h"
 #include "storage/dsm.h"
+#if PG_MAJORVERSION_NUM >= 17
+#include "storage/dsm_registry.h"
+#endif
 #include "storage/ipc.h"
 #include "storage/pg_shmem.h"
 #include "storage/pmsignal.h"
@@ -137,7 +140,7 @@ static BufferTag target_redo_tag;
 
 static XLogReaderState *reader_state;
 
-#define TRACE DEBUG5
+#define TRACE LOG
 
 #ifdef HAVE_LIBSECCOMP
 
@@ -517,6 +520,10 @@ CreateFakeSharedMemoryAndSemaphores()
 	/*
 	 * Set up xlog, clog, and buffers
 	 */
+#if PG_MAJORVERSION_NUM >= 17
+	DSMRegistryShmemInit();
+	VarsupShmemInit();
+#endif
 	XLOGShmemInit();
 	CLOGShmemInit();
 	CommitTsShmemInit();
@@ -566,7 +573,10 @@ CreateFakeSharedMemoryAndSemaphores()
 	/*
 	 * Set up other modules that need some shared memory space
 	 */
+#if PG_MAJORVERSION_NUM < 17
+	/* "snapshot too old" was removed in PG17, and with it the SnapMgr */
 	SnapMgrInit();
+#endif
 	BTreeShmemInit();
 	SyncScanShmemInit();
 	/* Skip due to the 'pg_notify' directory check */
@@ -742,7 +752,7 @@ BeginRedoForBlock(StringInfo input_message)
 		 target_redo_tag.forkNum,
 		 target_redo_tag.blockNum);
 
-	reln = smgropen(rinfo, InvalidBackendId, RELPERSISTENCE_PERMANENT);
+	reln = smgropen(rinfo, INVALID_PROC_NUMBER, RELPERSISTENCE_PERMANENT);
 	if (reln->smgr_cached_nblocks[forknum] == InvalidBlockNumber ||
 		reln->smgr_cached_nblocks[forknum] < blknum + 1)
 	{
diff --git a/test_runner/fixtures/common_types.py b/test_runner/fixtures/common_types.py
index 064a678c96..d8390138c9 100644
--- a/test_runner/fixtures/common_types.py
+++ b/test_runner/fixtures/common_types.py
@@ -13,7 +13,7 @@ DEFAULT_WAL_SEG_SIZE = 16 * 1024 * 1024
 class Lsn:
     """
     Datatype for an LSN. Internally it is a 64-bit integer, but the string
-    representation is like "1/123abcd". See also pg_lsn datatype in Postgres
+    representation is like "1/0123abcd". See also pg_lsn datatype in Postgres
     """
 
     def __init__(self, x: Union[int, str]):
diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index ee62372871..50284a3f5a 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -933,8 +933,11 @@ class NeonEnvBuilder:
 
         for directory_to_clean in reversed(directories_to_clean):
             if not os.listdir(directory_to_clean):
-                log.debug(f"Removing empty directory {directory_to_clean}")
-                directory_to_clean.rmdir()
+                log.info(f"Removing empty directory {directory_to_clean}")
+                try:
+                    directory_to_clean.rmdir()
+                except Exception as e:
+                    log.error(f"Error removing empty directory {directory_to_clean}: {e}")
 
     def cleanup_remote_storage(self):
         for x in [self.pageserver_remote_storage, self.safekeepers_remote_storage]:
@@ -3423,6 +3426,7 @@ class VanillaPostgres(PgProtocol):
         assert not self.running
         with open(os.path.join(self.pgdatadir, "postgresql.conf"), "a") as conf_file:
             conf_file.write("\n".join(options))
+            conf_file.write("\n")
 
     def edit_hba(self, hba: List[str]):
         """Prepend hba lines into pg_hba.conf file."""
@@ -3476,6 +3480,7 @@ def vanilla_pg(
     pg_bin = PgBin(test_output_dir, pg_distrib_dir, pg_version)
     port = port_distributor.get_port()
     with VanillaPostgres(pgdatadir, pg_bin, port) as vanilla_pg:
+        vanilla_pg.configure(["shared_preload_libraries='neon_rmgr'"])
         yield vanilla_pg
 
 
diff --git a/test_runner/fixtures/pg_version.py b/test_runner/fixtures/pg_version.py
index e12c8e5f4a..258935959b 100644
--- a/test_runner/fixtures/pg_version.py
+++ b/test_runner/fixtures/pg_version.py
@@ -16,6 +16,7 @@ class PgVersion(str, enum.Enum):
     V14 = "14"
     V15 = "15"
     V16 = "16"
+    V17 = "17"
     # Instead of making version an optional parameter in methods, we can use this fake entry
     # to explicitly rely on the default server version (could be different from pg_version fixture value)
     NOT_SET = "<-POSTRGRES VERSION IS NOT SET->"
diff --git a/test_runner/regress/data/extension_test/5670669815/v17/ext_index.json b/test_runner/regress/data/extension_test/5670669815/v17/ext_index.json
new file mode 100644
index 0000000000..7990b2c3a2
--- /dev/null
+++ b/test_runner/regress/data/extension_test/5670669815/v17/ext_index.json
@@ -0,0 +1,7 @@
+{
+    "public_extensions": [],
+    "library_index": {
+        "TODO": "We still need PG17 extensions"
+    },
+    "extension_data": {}
+}
\ No newline at end of file
diff --git a/test_runner/regress/test_compatibility.py b/test_runner/regress/test_compatibility.py
index b559be5f18..fb5c1d3115 100644
--- a/test_runner/regress/test_compatibility.py
+++ b/test_runner/regress/test_compatibility.py
@@ -21,7 +21,7 @@ from fixtures.pageserver.http import PageserverApiException
 from fixtures.pageserver.utils import (
     timeline_delete_wait_completed,
 )
-from fixtures.pg_version import PgVersion
+from fixtures.pg_version import PgVersion, skip_on_postgres
 from fixtures.remote_storage import RemoteStorageKind, S3Storage, s3_storage
 from fixtures.workload import Workload
 
@@ -156,6 +156,9 @@ ingest_lag_log_line = ".*ingesting record with timestamp lagging more than wait_
 @check_ondisk_data_compatibility_if_enabled
 @pytest.mark.xdist_group("compatibility")
 @pytest.mark.order(after="test_create_snapshot")
+@skip_on_postgres(
+    PgVersion.V17, "There are no snapshots yet"
+)  # TODO: revert this once we have snapshots
 def test_backward_compatibility(
     neon_env_builder: NeonEnvBuilder,
     test_output_dir: Path,
@@ -203,6 +206,9 @@ def test_backward_compatibility(
 @check_ondisk_data_compatibility_if_enabled
 @pytest.mark.xdist_group("compatibility")
 @pytest.mark.order(after="test_create_snapshot")
+@skip_on_postgres(
+    PgVersion.V17, "There are no snapshots yet"
+)  # TODO: revert this once we have snapshots
 def test_forward_compatibility(
     neon_env_builder: NeonEnvBuilder,
     test_output_dir: Path,
diff --git a/test_runner/regress/test_download_extensions.py b/test_runner/regress/test_download_extensions.py
index 27eb05ac09..7370eb1456 100644
--- a/test_runner/regress/test_download_extensions.py
+++ b/test_runner/regress/test_download_extensions.py
@@ -44,6 +44,8 @@ def test_remote_extensions(
 ):
     if pg_version == PgVersion.V16:
         pytest.skip("TODO: PG16 extension building")
+    if pg_version == PgVersion.V17:
+        pytest.skip("TODO: PG17 extension building")
 
     # setup mock http server
     # that expects request for anon.tar.zst
diff --git a/test_runner/regress/test_postgres_version.py b/test_runner/regress/test_postgres_version.py
index 03e8c7c0df..4145a303c6 100644
--- a/test_runner/regress/test_postgres_version.py
+++ b/test_runner/regress/test_postgres_version.py
@@ -20,16 +20,19 @@ def test_postgres_version(base_dir: Path, pg_bin: PgBin, pg_version: PgVersion):
         output = f.read().strip()
 
     # `postgres --version` prints something like "postgres (PostgreSQL) 15.6 (85d809c124a898847a97d66a211f7d5ef4f8e0cb)".
-    pattern = r"postgres \(PostgreSQL\) (?P<version>\d+\.\d+) \((?P<commit>[0-9a-f]{40})\)"
+    # beta- and release candidate releases would use '17beta1' and '18rc2' instead of .-separated numbers.
+    pattern = (
+        r"postgres \(PostgreSQL\) (?P<version>\d+(?:beta|rc|\.)\d+) \((?P<commit>[0-9a-f]{40})\)"
+    )
     match = re.search(pattern, output, re.IGNORECASE)
     assert match is not None, f"Can't parse {output} with {pattern}"
 
     version = match.group("version")
     commit = match.group("commit")
 
-    assert (
-        pg_version.v_prefixed in expected_revisions
-    ), f"Version `{pg_version.v_prefixed}` doesn't exist in `vendor/revisions.json`, please update it if these changes are intentional"
-
-    msg = f"Unexpected Postgres {pg_version} version: `{output}`, please update `vendor/revisions.json` if these changes are intentional"
-    assert [version, commit] == expected_revisions[pg_version.v_prefixed], msg
+    if "." in version:
+        assert (
+            pg_version.v_prefixed in expected_revisions
+        ), f"Released PostgreSQL version `{pg_version.v_prefixed}` doesn't exist in `vendor/revisions.json`, please update it if these changes are intentional"
+        msg = f"Unexpected Postgres {pg_version} version: `{output}`, please update `vendor/revisions.json` if these changes are intentional"
+        assert [version, commit] == expected_revisions[pg_version.v_prefixed], msg
diff --git a/test_runner/regress/test_timeline_detach_ancestor.py b/test_runner/regress/test_timeline_detach_ancestor.py
index d152d0f41f..f98b53d966 100644
--- a/test_runner/regress/test_timeline_detach_ancestor.py
+++ b/test_runner/regress/test_timeline_detach_ancestor.py
@@ -118,6 +118,9 @@ def test_ancestor_detach_branched_from(
         truncated_layers = 0
     elif branchpoint == Branchpoint.AFTER_L0:
         branch_at = Lsn(last_lsn + 8)
+        # make sure the branch point is not on a page header
+        if 0 < (branch_at.lsn_int % 8192) < 40:
+            branch_at += 40
         rows = 8192
         # as there is no 8 byte walrecord, nothing should get copied from the straddling layer
         truncated_layers = 0
diff --git a/test_runner/regress/test_twophase.py b/test_runner/regress/test_twophase.py
index ea900b07b8..ebe65e7c29 100644
--- a/test_runner/regress/test_twophase.py
+++ b/test_runner/regress/test_twophase.py
@@ -1,19 +1,32 @@
 import os
+from pathlib import Path
 
+from fixtures.common_types import TimelineId
 from fixtures.log_helper import log
-from fixtures.neon_fixtures import NeonEnv, fork_at_current_lsn
+from fixtures.neon_fixtures import (
+    NeonEnv,
+    PgBin,
+    fork_at_current_lsn,
+    import_timeline_from_vanilla_postgres,
+)
 
 
 #
 # Test branching, when a transaction is in prepared state
 #
-def test_twophase(neon_simple_env: NeonEnv):
-    env = neon_simple_env
-    endpoint = env.endpoints.create_start("main", config_lines=["max_prepared_transactions=5"])
+def twophase_test_on_timeline(env: NeonEnv):
+    endpoint = env.endpoints.create_start(
+        "test_twophase", config_lines=["max_prepared_transactions=5"]
+    )
 
     conn = endpoint.connect()
     cur = conn.cursor()
 
+    # FIXME: Switch to the next WAL segment, to work around the bug fixed in
+    # https://github.com/neondatabase/neon/pull/8914.  When that is merged, this can be
+    # removed.
+    cur.execute("select pg_switch_wal()")
+
     cur.execute("CREATE TABLE foo (t text)")
 
     # Prepare a transaction that will insert a row
@@ -53,7 +66,7 @@ def test_twophase(neon_simple_env: NeonEnv):
     assert len(twophase_files) == 2
 
     # Create a branch with the transaction in prepared state
-    fork_at_current_lsn(env, endpoint, "test_twophase_prepared", "main")
+    fork_at_current_lsn(env, endpoint, "test_twophase_prepared", "test_twophase")
 
     # Start compute on the new branch
     endpoint2 = env.endpoints.create_start(
@@ -80,3 +93,50 @@ def test_twophase(neon_simple_env: NeonEnv):
     # Only one committed insert is visible on the original branch
     cur.execute("SELECT * FROM foo")
     assert cur.fetchall() == [("three",)]
+
+
+def test_twophase(neon_simple_env: NeonEnv):
+    """
+    Test branching, when a transaction is in prepared state
+    """
+    env = neon_simple_env
+    env.neon_cli.create_branch("test_twophase")
+
+    twophase_test_on_timeline(env)
+
+
+def test_twophase_nonzero_epoch(
+    neon_simple_env: NeonEnv,
+    test_output_dir: Path,
+    pg_bin: PgBin,
+    vanilla_pg,
+):
+    """
+    Same as 'test_twophase' test, but with a non-zero XID epoch, i.e. after 4 billion XIDs
+    have been consumed. (This is to ensure that we correctly use the full 64-bit XIDs in
+    pg_twophase filenames with PostgreSQL v17.)
+    """
+    env = neon_simple_env
+
+    # Reset the vanilla Postgres instance with a higher XID epoch
+    pg_resetwal_path = os.path.join(pg_bin.pg_bin_path, "pg_resetwal")
+    cmd = [pg_resetwal_path, "--epoch=1000000000", "-D", str(vanilla_pg.pgdatadir)]
+    pg_bin.run_capture(cmd)
+
+    timeline_id = TimelineId.generate()
+
+    # Import the cluster to Neon
+    vanilla_pg.start()
+    vanilla_pg.safe_psql("create user cloud_admin with password 'postgres' superuser")
+    import_timeline_from_vanilla_postgres(
+        test_output_dir,
+        env,
+        pg_bin,
+        env.initial_tenant,
+        timeline_id,
+        "test_twophase",
+        vanilla_pg.connstr(),
+    )
+    vanilla_pg.stop()  # don't need the original server anymore
+
+    twophase_test_on_timeline(env)
diff --git a/vendor/postgres-v17 b/vendor/postgres-v17
new file mode 160000
index 0000000000..9156d63ce2
--- /dev/null
+++ b/vendor/postgres-v17
@@ -0,0 +1 @@
+Subproject commit 9156d63ce253bed9d1f76355ceec610e444eaffa

From 0a8c5e1214fcd3f59767a6ca4adeb68612977e51 Mon Sep 17 00:00:00 2001
From: Matthias van de Meent <matthias@neon.tech>
Date: Fri, 13 Sep 2024 15:10:52 +0100
Subject: [PATCH 055/142] Fix broken image for PG17 (#8998)

Most extensions are not required to run Neon-based PostgreSQL, but the
Neon extension is _quite_ critical, so let's make sure we include it.

## Problem

Staging doesn't have working compute images for PG17

## Summary of changes

Disable some PG17 filters so that we get the critical components into the PG17 image
---
 Dockerfile.compute-node | 63 ++++++++++++++---------------------------
 1 file changed, 21 insertions(+), 42 deletions(-)

diff --git a/Dockerfile.compute-node b/Dockerfile.compute-node
index fe902eb978..6e2510fe60 100644
--- a/Dockerfile.compute-node
+++ b/Dockerfile.compute-node
@@ -81,10 +81,7 @@ RUN cd postgres && \
 FROM build-deps AS postgis-build
 ARG PG_VERSION
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
-RUN case "${PG_VERSION}" in "v17") \
-    echo "v17 extensions are not supported yet. Quit" && exit 0;; \
-    esac && \
-    apt update && \
+RUN apt update && \
     apt install -y cmake gdal-bin libboost-dev libboost-thread-dev libboost-filesystem-dev \
     libboost-system-dev libboost-iostreams-dev libboost-program-options-dev libboost-timer-dev \
     libcgal-dev libgdal-dev libgmp-dev libmpfr-dev libopenscenegraph-dev libprotobuf-c-dev \
@@ -92,8 +89,8 @@ RUN case "${PG_VERSION}" in "v17") \
 
 # SFCGAL > 1.3 requires CGAL > 5.2, Bullseye's libcgal-dev is 5.2
 RUN case "${PG_VERSION}" in "v17") \
-        mkdir -p /sfcgal && \
-    echo "v17 extensions are not supported yet. Quit" && exit 0;; \
+    mkdir -p /sfcgal && \
+    echo "Postgis doensn't yet support PG17 (needs 3.4.3, if not higher)" && exit 0;; \
     esac && \
     wget https://gitlab.com/Oslandia/SFCGAL/-/archive/v1.3.10/SFCGAL-v1.3.10.tar.gz -O SFCGAL.tar.gz && \
     echo "4e39b3b2adada6254a7bdba6d297bb28e1a9835a9f879b74f37e2dab70203232 SFCGAL.tar.gz" | sha256sum --check && \
@@ -105,7 +102,7 @@ RUN case "${PG_VERSION}" in "v17") \
 ENV PATH="/usr/local/pgsql/bin:$PATH"
 
 RUN case "${PG_VERSION}" in "v17") \
-    echo "v17 extensions are not supported yet. Quit" && exit 0;; \
+    echo "Postgis doensn't yet support PG17 (needs 3.4.3, if not higher)" && exit 0;; \
     esac && \
     wget https://download.osgeo.org/postgis/source/postgis-3.3.3.tar.gz -O postgis.tar.gz && \
     echo "74eb356e3f85f14233791013360881b6748f78081cc688ff9d6f0f673a762d13 postgis.tar.gz" | sha256sum --check && \
@@ -666,7 +663,7 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
 
 ENV PATH="/usr/local/pgsql/bin/:$PATH"
 RUN case "${PG_VERSION}" in "v17") \
-    echo "v17 extensions are not supported yet. Quit" && exit 0;; \
+    echo "v17 extensions is not supported yet by pg_roaringbitmap. Quit" && exit 0;; \
     esac && \
     wget https://github.com/ChenHuajun/pg_roaringbitmap/archive/refs/tags/v0.5.4.tar.gz -O pg_roaringbitmap.tar.gz && \
     echo "b75201efcb1c2d1b014ec4ae6a22769cc7a224e6e406a587f5784a37b6b5a2aa pg_roaringbitmap.tar.gz" | sha256sum --check && \
@@ -687,7 +684,7 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
 
 ENV PATH="/usr/local/pgsql/bin/:$PATH"
 RUN case "${PG_VERSION}" in "v17") \
-    echo "v17 extensions are not supported yet. Quit" && exit 0;; \
+    echo "v17 is not supported yet by pg_semver. Quit" && exit 0;; \
     esac && \
     wget https://github.com/theory/pg-semver/archive/refs/tags/v0.32.1.tar.gz -O pg_semver.tar.gz && \
     echo "fbdaf7512026d62eec03fad8687c15ed509b6ba395bff140acd63d2e4fbe25d7 pg_semver.tar.gz" | sha256sum --check && \
@@ -707,10 +704,7 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
 
 ARG PG_VERSION
 ENV PATH="/usr/local/pgsql/bin/:$PATH"
-RUN case "${PG_VERSION}" in "v17") \
-    echo "v17 extensions are not supported yet. Quit" && exit 0;; \
-    esac && \
-    case "${PG_VERSION}" in \
+RUN case "${PG_VERSION}" in \
       "v14" | "v15") \
         export PG_EMBEDDING_VERSION=0.3.5 \
         export PG_EMBEDDING_CHECKSUM=0e95b27b8b6196e2cf0a0c9ec143fe2219b82e54c5bb4ee064e76398cbe69ae9 \
@@ -736,7 +730,7 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
 
 ENV PATH="/usr/local/pgsql/bin/:$PATH"
 RUN case "${PG_VERSION}" in "v17") \
-    echo "v17 extensions are not supported yet. Quit" && exit 0;; \
+    echo "postgresql_anonymizer does not yet support PG17" && exit 0;; \
     esac && \
     wget  https://github.com/neondatabase/postgresql_anonymizer/archive/refs/tags/neon_1.1.1.tar.gz -O pg_anon.tar.gz && \
     echo "321ea8d5c1648880aafde850a2c576e4a9e7b9933a34ce272efc839328999fa9  pg_anon.tar.gz" | sha256sum --check && \
@@ -769,7 +763,7 @@ USER nonroot
 WORKDIR /home/nonroot
 
 RUN case "${PG_VERSION}" in "v17") \
-    echo "v17 extensions are not supported yet. Quit" && exit 0;; \
+    echo "v17 is not supported yet by pgrx. Quit" && exit 0;; \
     esac && \
     curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux-gnu/rustup-init && \
     chmod +x rustup-init && \
@@ -791,7 +785,7 @@ FROM rust-extensions-build AS pg-jsonschema-pg-build
 ARG PG_VERSION
 
 RUN case "${PG_VERSION}" in "v17") \
-    echo "v17 extensions are not supported yet. Quit" && exit 0;; \
+    echo "pg_jsonschema does not yet have a release that supports pg17" && exit 0;; \
     esac && \
     wget https://github.com/supabase/pg_jsonschema/archive/refs/tags/v0.3.1.tar.gz -O pg_jsonschema.tar.gz && \
     echo "61df3db1ed83cf24f6aa39c826f8818bfa4f0bd33b587fd6b2b1747985642297 pg_jsonschema.tar.gz" | sha256sum --check && \
@@ -816,7 +810,7 @@ FROM rust-extensions-build AS pg-graphql-pg-build
 ARG PG_VERSION
 
 RUN case "${PG_VERSION}" in "v17") \
-    echo "v17 extensions are not supported yet. Quit" && exit 0;; \
+    echo "pg_graphql does not yet have a release that supports pg17 as of now" && exit 0;; \
     esac && \
     wget https://github.com/supabase/pg_graphql/archive/refs/tags/v1.5.7.tar.gz -O pg_graphql.tar.gz && \
     echo "2b3e567a5b31019cb97ae0e33263c1bcc28580be5a444ac4c8ece5c4be2aea41 pg_graphql.tar.gz" | sha256sum --check && \
@@ -839,7 +833,7 @@ ARG PG_VERSION
 
 # 26806147b17b60763039c6a6878884c41a262318 made on 26/09/2023
 RUN case "${PG_VERSION}" in "v17") \
-    echo "v17 extensions are not supported yet. Quit" && exit 0;; \
+    echo "pg_tiktoken does not have versions, nor support for pg17" && exit 0;; \
     esac && \
     wget https://github.com/kelvich/pg_tiktoken/archive/26806147b17b60763039c6a6878884c41a262318.tar.gz -O pg_tiktoken.tar.gz && \
     echo "e64e55aaa38c259512d3e27c572da22c4637418cf124caba904cd50944e5004e pg_tiktoken.tar.gz" | sha256sum --check && \
@@ -861,7 +855,7 @@ FROM rust-extensions-build AS pg-pgx-ulid-build
 ARG PG_VERSION
 
 RUN case "${PG_VERSION}" in "v17") \
-    echo "v17 extensions are not supported yet. Quit" && exit 0;; \
+    echo "pgx_ulid does not support pg17 as of the latest version (0.1.5)" && exit 0;; \
     esac && \
     wget https://github.com/pksunkara/pgx_ulid/archive/refs/tags/v0.1.5.tar.gz -O pgx_ulid.tar.gz && \
     echo "9d1659a2da65af0133d5451c454de31b37364e3502087dadf579f790bc8bef17 pgx_ulid.tar.gz" | sha256sum --check && \
@@ -883,7 +877,7 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
 
 ENV PATH="/usr/local/pgsql/bin/:$PATH"
 RUN case "${PG_VERSION}" in "v17") \
-    echo "v17 extensions are not supported yet. Quit" && exit 0;; \
+    echo "We'll need to update wal2json to 2.6+ for pg17 support" && exit 0;; \
     esac && \
     wget https://github.com/eulerto/wal2json/archive/refs/tags/wal2json_2_5.tar.gz && \
     echo "b516653575541cf221b99cf3f8be9b6821f6dbcfc125675c85f35090f824f00e wal2json_2_5.tar.gz" | sha256sum --check && \
@@ -903,7 +897,7 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
 
 ENV PATH="/usr/local/pgsql/bin/:$PATH"
 RUN case "${PG_VERSION}" in "v17") \
-    echo "v17 extensions are not supported yet. Quit" && exit 0;; \
+    echo "We'll need to update pg_ivm to 1.9+ for pg17 support" && exit 0;; \
     esac && \
     wget https://github.com/sraoss/pg_ivm/archive/refs/tags/v1.7.tar.gz -O pg_ivm.tar.gz && \
     echo "ebfde04f99203c7be4b0e873f91104090e2e83e5429c32ac242d00f334224d5e pg_ivm.tar.gz" | sha256sum --check && \
@@ -924,7 +918,7 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
 
 ENV PATH="/usr/local/pgsql/bin/:$PATH"
 RUN case "${PG_VERSION}" in "v17") \
-    echo "v17 extensions are not supported yet. Quit" && exit 0;; \
+    echo "pg_partman doesn't support PG17 yet" && exit 0;; \
     esac && \
     wget https://github.com/pgpartman/pg_partman/archive/refs/tags/v5.0.1.tar.gz -O pg_partman.tar.gz && \
     echo "75b541733a9659a6c90dbd40fccb904a630a32880a6e3044d0c4c5f4c8a65525 pg_partman.tar.gz" | sha256sum --check && \
@@ -977,10 +971,7 @@ COPY --from=pg-ivm-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY --from=pg-partman-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY pgxn/ pgxn/
 
-RUN case "${PG_VERSION}" in "v17") \
-    echo "v17 extensions are not supported yet. Quit" && exit 0;; \
-    esac && \
-    make -j $(getconf _NPROCESSORS_ONLN) \
+RUN make -j $(getconf _NPROCESSORS_ONLN) \
         PG_CONFIG=/usr/local/pgsql/bin/pg_config \
         -C pgxn/neon \
         -s install && \
@@ -1023,10 +1014,7 @@ ENV BUILD_TAG=$BUILD_TAG
 USER nonroot
 # Copy entire project to get Cargo.* files with proper dependencies for the whole project
 COPY --chown=nonroot . .
-RUN case "${PG_VERSION}" in "v17") \
-    echo "v17 extensions are not supported yet. Quit" && exit 0;; \
-    esac && \
-    cd compute_tools && mold -run cargo build --locked --profile release-line-debug-size-lto
+RUN cd compute_tools && mold -run cargo build --locked --profile release-line-debug-size-lto
 
 #########################################################################################
 #
@@ -1047,24 +1035,15 @@ FROM neon-pg-ext-build AS postgres-cleanup-layer
 COPY --from=neon-pg-ext-build /usr/local/pgsql /usr/local/pgsql
 
 # Remove binaries from /bin/ that we won't use (or would manually copy & install otherwise)
-RUN case "${PG_VERSION}" in "v17") \
-    echo "v17 extensions are not supported yet. Quit" && exit 0;; \
-    esac && \
-    cd /usr/local/pgsql/bin && rm ecpg raster2pgsql shp2pgsql pgtopo_export pgtopo_import pgsql2shp
+RUN cd /usr/local/pgsql/bin && rm -f ecpg raster2pgsql shp2pgsql pgtopo_export pgtopo_import pgsql2shp
 
 # Remove headers that we won't need anymore - we've completed installation of all extensions
-RUN case "${PG_VERSION}" in "v17") \
-    echo "v17 extensions are not supported yet. Quit" && exit 0;; \
-    esac && \
-    rm -r /usr/local/pgsql/include
+RUN rm -r /usr/local/pgsql/include
 
 # Remove static postgresql libraries - all compilation is finished, so we
 # can now remove these files - they must be included in other binaries by now
 # if they were to be used by other libraries.
-RUN case "${PG_VERSION}" in "v17") \
-    echo "v17 extensions are not supported yet. Quit" && exit 0;; \
-    esac && \
-    rm /usr/local/pgsql/lib/lib*.a
+RUN rm /usr/local/pgsql/lib/lib*.a
 
 
 #########################################################################################

From b2c83db54d58d46e8ca11d5f1b4a38471322f713 Mon Sep 17 00:00:00 2001
From: Alexander Bayandin <alexander@neon.tech>
Date: Mon, 16 Sep 2024 12:44:26 +0100
Subject: [PATCH 056/142] CI(gather-rust-build-stats): set PQ_LIB_DIR to
 Postgres 17 (#9001)

## Problem

`gather-rust-build-stats` extra CI job fails with
```
"PQ_LIB_DIR" doesn't exist in the configured path: "/__w/neon/neon/pg_install/v16/lib"
```

## Summary of changes
- Use the path to Postgres 17 for the `gather-rust-build-stats` job.
The job uses Postgres built by `make walproposer-lib`
---
 .github/workflows/neon_extra_builds.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/neon_extra_builds.yml b/.github/workflows/neon_extra_builds.yml
index 41c9f5dee5..140aac032a 100644
--- a/.github/workflows/neon_extra_builds.yml
+++ b/.github/workflows/neon_extra_builds.yml
@@ -181,7 +181,7 @@ jobs:
         run: make walproposer-lib -j$(nproc)
 
       - name: Produce the build stats
-        run: PQ_LIB_DIR=$(pwd)/pg_install/v16/lib cargo build --all --release --timings -j$(nproc)
+        run: PQ_LIB_DIR=$(pwd)/pg_install/v17/lib cargo build --all --release --timings -j$(nproc)
 
       - name: Upload the build stats
         id: upload-stats

From 5876c441abc973acca60882192ad46333c075abd Mon Sep 17 00:00:00 2001
From: Tristan Partin <tristan@neon.tech>
Date: Tue, 10 Sep 2024 16:46:32 +0100
Subject: [PATCH 057/142] Grant access to pg_show_replication_origin_status for
 neon_superuser

Signed-off-by: Tristan Partin <tristan@neon.tech>
---
 ...ant_pg_show_replication_origin_status_to_neon_superuser.sql | 1 +
 compute_tools/src/spec.rs                                      | 3 +++
 test_runner/regress/test_migrations.py                         | 2 +-
 3 files changed, 5 insertions(+), 1 deletion(-)
 create mode 100644 compute_tools/src/migrations/0011-grant_pg_show_replication_origin_status_to_neon_superuser.sql

diff --git a/compute_tools/src/migrations/0011-grant_pg_show_replication_origin_status_to_neon_superuser.sql b/compute_tools/src/migrations/0011-grant_pg_show_replication_origin_status_to_neon_superuser.sql
new file mode 100644
index 0000000000..425ed8cd3d
--- /dev/null
+++ b/compute_tools/src/migrations/0011-grant_pg_show_replication_origin_status_to_neon_superuser.sql
@@ -0,0 +1 @@
+GRANT EXECUTE ON FUNCTION pg_show_replication_origin_status TO neon_superuser;
diff --git a/compute_tools/src/spec.rs b/compute_tools/src/spec.rs
index 6a87263821..aa9405d28d 100644
--- a/compute_tools/src/spec.rs
+++ b/compute_tools/src/spec.rs
@@ -793,6 +793,9 @@ pub fn handle_migrations(client: &mut Client) -> Result<()> {
         include_str!(
             "./migrations/0010-grant_snapshot_synchronization_funcs_to_neon_superuser.sql"
         ),
+        include_str!(
+            "./migrations/0011-grant_pg_show_replication_origin_status_to_neon_superuser.sql"
+        ),
     ];
 
     MigrationRunner::new(client, &migrations).run_migrations()?;
diff --git a/test_runner/regress/test_migrations.py b/test_runner/regress/test_migrations.py
index e88e56d030..7211619a99 100644
--- a/test_runner/regress/test_migrations.py
+++ b/test_runner/regress/test_migrations.py
@@ -14,7 +14,7 @@ def test_migrations(neon_simple_env: NeonEnv):
     endpoint.respec(skip_pg_catalog_updates=False)
     endpoint.start()
 
-    num_migrations = 10
+    num_migrations = 11
     endpoint.wait_for_migrations(num_migrations=num_migrations)
 
     with endpoint.cursor() as cur:

From c8bedca5821694633d140a1da2baa4fa9c7dea0c Mon Sep 17 00:00:00 2001
From: Matthias van de Meent <matthias@neon.tech>
Date: Mon, 16 Sep 2024 18:06:31 +0200
Subject: [PATCH 058/142] Fix PG17's extension modifications (#9010)

This also reduces the GRANT statements to one per created _reset
function
---
 Dockerfile.compute-node | 29 +++++++++++++++++------------
 1 file changed, 17 insertions(+), 12 deletions(-)

diff --git a/Dockerfile.compute-node b/Dockerfile.compute-node
index 6e2510fe60..6bf6fb650f 100644
--- a/Dockerfile.compute-node
+++ b/Dockerfile.compute-node
@@ -55,22 +55,27 @@ RUN cd postgres && \
     # We could add the additional grant statements to the postgres repository but it would be hard to maintain,
     # whenever we need to pick up a new postgres version and we want to limit the changes in our postgres fork,
     # so we do it here.
-    old_list="pg_stat_statements--1.0--1.1.sql pg_stat_statements--1.1--1.2.sql pg_stat_statements--1.2--1.3.sql pg_stat_statements--1.3--1.4.sql pg_stat_statements--1.4--1.5.sql pg_stat_statements--1.4.sql pg_stat_statements--1.5--1.6.sql"; \
-    # the first loop is for pg_stat_statement extension version <= 1.6
     for file in /usr/local/pgsql/share/extension/pg_stat_statements--*.sql; do \
         filename=$(basename "$file"); \
-        if echo "$old_list" | grep -q -F "$filename"; then \
+        # Note that there are no downgrade scripts for pg_stat_statements, so we \
+        # don't have to modify any downgrade paths or (much) older versions: we only \
+        # have to make sure every creation of the pg_stat_statements_reset function \
+        # also adds execute permissions to the neon_superuser.
+        case $filename in \
+          pg_stat_statements--1.4.sql) \
+            # pg_stat_statements_reset is first created with 1.4
             echo 'GRANT EXECUTE ON FUNCTION pg_stat_statements_reset() TO neon_superuser;' >> $file; \
-        fi; \
-    done; \
-    # the second loop is for pg_stat_statement extension versions >= 1.7,
-    # where pg_stat_statement_reset() got 3 additional arguments
-    for file in /usr/local/pgsql/share/extension/pg_stat_statements--*.sql; do \
-        filename=$(basename "$file"); \
-        if ! echo "$old_list" | grep -q -F "$filename"; then \
+            ;; \
+          pg_stat_statements--1.6--1.7.sql) \
+            # Then with the 1.6-1.7 migration it is re-created with a new signature, thus add the permissions back
             echo 'GRANT EXECUTE ON FUNCTION pg_stat_statements_reset(Oid, Oid, bigint) TO neon_superuser;' >> $file; \
-        fi; \
-    done
+            ;; \
+          pg_stat_statements--1.10--1.11.sql) \
+            # Then with the 1.10-1.11 migration it is re-created with a new signature again, thus add the permissions back
+            echo 'GRANT EXECUTE ON FUNCTION pg_stat_statements_reset(Oid, Oid, bigint, boolean) TO neon_superuser;' >> $file; \
+            ;; \
+        esac; \
+    done;
 
 #########################################################################################
 #

From 2bbb4d3e1c4d70d5bcdc972eb2b9863d1073338a Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki@neon.tech>
Date: Mon, 16 Sep 2024 21:45:19 +0300
Subject: [PATCH 059/142] Remove misc unused code (#9014)

---
 libs/postgres_ffi/src/xlog_utils.rs |  9 +-------
 libs/utils/src/accum.rs             | 33 -----------------------------
 libs/utils/src/id.rs                | 10 ---------
 libs/utils/src/lib.rs               |  7 ------
 libs/utils/src/lsn.rs               |  9 --------
 libs/utils/src/nonblock.rs          | 17 ---------------
 libs/utils/src/shutdown.rs          |  7 ------
 7 files changed, 1 insertion(+), 91 deletions(-)
 delete mode 100644 libs/utils/src/accum.rs
 delete mode 100644 libs/utils/src/nonblock.rs
 delete mode 100644 libs/utils/src/shutdown.rs

diff --git a/libs/postgres_ffi/src/xlog_utils.rs b/libs/postgres_ffi/src/xlog_utils.rs
index 0cfd56962e..1873734753 100644
--- a/libs/postgres_ffi/src/xlog_utils.rs
+++ b/libs/postgres_ffi/src/xlog_utils.rs
@@ -30,7 +30,7 @@ use std::fs::File;
 use std::io::prelude::*;
 use std::io::ErrorKind;
 use std::io::SeekFrom;
-use std::path::{Path, PathBuf};
+use std::path::Path;
 use std::time::SystemTime;
 use utils::bin_ser::DeserializeError;
 use utils::bin_ser::SerializeError;
@@ -260,13 +260,6 @@ fn open_wal_segment(seg_file_path: &Path) -> anyhow::Result<Option<File>> {
     }
 }
 
-pub fn main() {
-    let mut data_dir = PathBuf::new();
-    data_dir.push(".");
-    let wal_end = find_end_of_wal(&data_dir, WAL_SEGMENT_SIZE, Lsn(0)).unwrap();
-    println!("wal_end={:?}", wal_end);
-}
-
 impl XLogRecord {
     pub fn from_slice(buf: &[u8]) -> Result<XLogRecord, DeserializeError> {
         use utils::bin_ser::LeSer;
diff --git a/libs/utils/src/accum.rs b/libs/utils/src/accum.rs
deleted file mode 100644
index 0fb0190a92..0000000000
--- a/libs/utils/src/accum.rs
+++ /dev/null
@@ -1,33 +0,0 @@
-/// A helper to "accumulate" a value similar to `Iterator::reduce`, but lets you
-/// feed the accumulated values by calling the 'accum' function, instead of having an
-/// iterator.
-///
-/// For example, to calculate the smallest value among some integers:
-///
-/// ```
-/// use utils::accum::Accum;
-///
-/// let values = [1, 2, 3];
-///
-/// let mut min_value: Accum<u32> = Accum(None);
-/// for new_value in &values {
-///     min_value.accum(std::cmp::min, *new_value);
-/// }
-///
-/// assert_eq!(min_value.0.unwrap(), 1);
-/// ```
-pub struct Accum<T>(pub Option<T>);
-impl<T: Copy> Accum<T> {
-    pub fn accum<F>(&mut self, func: F, new_value: T)
-    where
-        F: FnOnce(T, T) -> T,
-    {
-        // If there is no previous value, just store the new value.
-        // Otherwise call the function to decide which one to keep.
-        self.0 = Some(if let Some(accum) = self.0 {
-            func(accum, new_value)
-        } else {
-            new_value
-        });
-    }
-}
diff --git a/libs/utils/src/id.rs b/libs/utils/src/id.rs
index 2cda899b15..eb91839504 100644
--- a/libs/utils/src/id.rs
+++ b/libs/utils/src/id.rs
@@ -88,12 +88,6 @@ impl<'de> Deserialize<'de> for Id {
 }
 
 impl Id {
-    pub fn get_from_buf(buf: &mut impl bytes::Buf) -> Id {
-        let mut arr = [0u8; 16];
-        buf.copy_to_slice(&mut arr);
-        Id::from(arr)
-    }
-
     pub fn from_slice(src: &[u8]) -> Result<Id, IdError> {
         if src.len() != 16 {
             return Err(IdError::SliceParseError(src.len()));
@@ -179,10 +173,6 @@ impl fmt::Debug for Id {
 macro_rules! id_newtype {
     ($t:ident) => {
         impl $t {
-            pub fn get_from_buf(buf: &mut impl bytes::Buf) -> $t {
-                $t(Id::get_from_buf(buf))
-            }
-
             pub fn from_slice(src: &[u8]) -> Result<$t, IdError> {
                 Ok($t(Id::from_slice(src)?))
             }
diff --git a/libs/utils/src/lib.rs b/libs/utils/src/lib.rs
index 218dd468b1..03fb36caf8 100644
--- a/libs/utils/src/lib.rs
+++ b/libs/utils/src/lib.rs
@@ -43,16 +43,9 @@ pub mod logging;
 pub mod lock_file;
 pub mod pid_file;
 
-// Misc
-pub mod accum;
-pub mod shutdown;
-
 // Utility for binding TcpListeners with proper socket options.
 pub mod tcp_listener;
 
-// Utility for putting a raw file descriptor into non-blocking mode
-pub mod nonblock;
-
 // Default signal handling
 pub mod sentry_init;
 pub mod signals;
diff --git a/libs/utils/src/lsn.rs b/libs/utils/src/lsn.rs
index 1aebe91428..06d5c27ebf 100644
--- a/libs/utils/src/lsn.rs
+++ b/libs/utils/src/lsn.rs
@@ -1,6 +1,5 @@
 #![warn(missing_docs)]
 
-use camino::Utf8Path;
 use serde::{de::Visitor, Deserialize, Serialize};
 use std::fmt;
 use std::ops::{Add, AddAssign};
@@ -145,14 +144,6 @@ impl Lsn {
         i128::from(self.0) - i128::from(other)
     }
 
-    /// Parse an LSN from a filename in the form `0000000000000000`
-    pub fn from_filename<F>(filename: F) -> Result<Self, LsnParseError>
-    where
-        F: AsRef<Utf8Path>,
-    {
-        Lsn::from_hex(filename.as_ref().as_str())
-    }
-
     /// Parse an LSN from a string in the form `0000000000000000`
     pub fn from_hex<S>(s: S) -> Result<Self, LsnParseError>
     where
diff --git a/libs/utils/src/nonblock.rs b/libs/utils/src/nonblock.rs
deleted file mode 100644
index 05e2e3af4c..0000000000
--- a/libs/utils/src/nonblock.rs
+++ /dev/null
@@ -1,17 +0,0 @@
-use nix::fcntl::{fcntl, OFlag, F_GETFL, F_SETFL};
-use std::os::unix::io::RawFd;
-
-/// Put a file descriptor into non-blocking mode
-pub fn set_nonblock(fd: RawFd) -> Result<(), std::io::Error> {
-    let bits = fcntl(fd, F_GETFL)?;
-
-    // If F_GETFL returns some unknown bits, they should be valid
-    // for passing back to F_SETFL, too. If we left them out, the F_SETFL
-    // would effectively clear them, which is not what we want.
-    let mut flags = OFlag::from_bits_retain(bits);
-    flags |= OFlag::O_NONBLOCK;
-
-    fcntl(fd, F_SETFL(flags))?;
-
-    Ok(())
-}
diff --git a/libs/utils/src/shutdown.rs b/libs/utils/src/shutdown.rs
deleted file mode 100644
index cb5a44d664..0000000000
--- a/libs/utils/src/shutdown.rs
+++ /dev/null
@@ -1,7 +0,0 @@
-/// Immediately terminate the calling process without calling
-/// atexit callbacks, C runtime destructors etc. We mainly use
-/// this to protect coverage data from concurrent writes.
-pub fn exit_now(code: u8) -> ! {
-    // SAFETY: exiting is safe, the ffi is not safe
-    unsafe { nix::libc::_exit(code as _) };
-}

From 5e16c7bb0b484cc760f4ddb3c0a866157760825e Mon Sep 17 00:00:00 2001
From: Tristan Partin <tristan@neon.tech>
Date: Tue, 10 Sep 2024 15:37:08 +0100
Subject: [PATCH 060/142] Generate pgbench data on the server for most tests

This should generally be faster when running tests, especially those
that run with higher scales.

Ignoring test_lfc_resize since it seems like we are hitting a query
timeout for some reason that I have yet to investigate. A little bit of
improvement is better than none.

Signed-off-by: Tristan Partin <tristan@neon.tech>
---
 test_runner/performance/test_branch_creation.py    |  2 +-
 test_runner/performance/test_branching.py          |  2 +-
 .../performance/test_logical_replication.py        | 14 +++++++-------
 .../performance/test_physical_replication.py       |  4 ++--
 test_runner/regress/test_branching.py              |  2 +-
 test_runner/regress/test_disk_usage_eviction.py    |  2 +-
 test_runner/regress/test_hot_standby.py            |  2 +-
 test_runner/regress/test_pageserver_reconnect.py   |  2 +-
 .../test_pageserver_restarts_under_workload.py     |  2 +-
 .../regress/test_threshold_based_eviction.py       |  2 +-
 10 files changed, 17 insertions(+), 17 deletions(-)

diff --git a/test_runner/performance/test_branch_creation.py b/test_runner/performance/test_branch_creation.py
index b3866f1813..f1ab7876f9 100644
--- a/test_runner/performance/test_branch_creation.py
+++ b/test_runner/performance/test_branch_creation.py
@@ -107,7 +107,7 @@ def test_branch_creation_many(neon_compare: NeonCompare, n_branches: int, shape:
     env.neon_cli.create_branch("b0")
 
     endpoint = env.endpoints.create_start("b0")
-    neon_compare.pg_bin.run_capture(["pgbench", "-i", "-s10", endpoint.connstr()])
+    neon_compare.pg_bin.run_capture(["pgbench", "-i", "-I", "dtGvp", "-s10", endpoint.connstr()])
 
     branch_creation_durations = []
 
diff --git a/test_runner/performance/test_branching.py b/test_runner/performance/test_branching.py
index 667d1a4c4a..f8d39487f2 100644
--- a/test_runner/performance/test_branching.py
+++ b/test_runner/performance/test_branching.py
@@ -43,7 +43,7 @@ def test_compare_child_and_root_pgbench_perf(neon_compare: NeonCompare):
 
     env.neon_cli.create_branch("root")
     endpoint_root = env.endpoints.create_start("root")
-    pg_bin.run_capture(["pgbench", "-i", endpoint_root.connstr(), "-s10"])
+    pg_bin.run_capture(["pgbench", "-i", "-I", "dtGvp", endpoint_root.connstr(), "-s10"])
 
     fork_at_current_lsn(env, endpoint_root, "child", "root")
 
diff --git a/test_runner/performance/test_logical_replication.py b/test_runner/performance/test_logical_replication.py
index 29a0380524..dbf94a2cf5 100644
--- a/test_runner/performance/test_logical_replication.py
+++ b/test_runner/performance/test_logical_replication.py
@@ -24,13 +24,13 @@ def test_logical_replication(neon_simple_env: NeonEnv, pg_bin: PgBin, vanilla_pg
 
     endpoint = env.endpoints.create_start("main")
 
-    pg_bin.run_capture(["pgbench", "-i", "-s10", endpoint.connstr()])
+    pg_bin.run_capture(["pgbench", "-i", "-I", "dtGvp", "-s10", endpoint.connstr()])
 
     endpoint.safe_psql("create publication pub1 for table pgbench_accounts, pgbench_history")
 
     # now start subscriber
     vanilla_pg.start()
-    pg_bin.run_capture(["pgbench", "-i", "-s10", vanilla_pg.connstr()])
+    pg_bin.run_capture(["pgbench", "-i", "-I", "dtGvp", "-s10", vanilla_pg.connstr()])
 
     vanilla_pg.safe_psql("truncate table pgbench_accounts")
     vanilla_pg.safe_psql("truncate table pgbench_history")
@@ -99,9 +99,9 @@ def test_subscriber_lag(
     sub_connstr = benchmark_project_sub.connstr
 
     if benchmark_project_pub.is_new:
-        pg_bin.run_capture(["pgbench", "-i", "-s100"], env=pub_env)
+        pg_bin.run_capture(["pgbench", "-i", "-I", "dtGvp", "-s100"], env=pub_env)
     if benchmark_project_sub.is_new:
-        pg_bin.run_capture(["pgbench", "-i", "-s100"], env=sub_env)
+        pg_bin.run_capture(["pgbench", "-i", "-I", "dtGvp", "-s100"], env=sub_env)
 
     pub_conn = psycopg2.connect(pub_connstr)
     sub_conn = psycopg2.connect(sub_connstr)
@@ -193,8 +193,8 @@ def test_publisher_restart(
     pub_connstr = benchmark_project_pub.connstr
     sub_connstr = benchmark_project_sub.connstr
 
-    pg_bin.run_capture(["pgbench", "-i", "-s100"], env=pub_env)
-    pg_bin.run_capture(["pgbench", "-i", "-s100"], env=sub_env)
+    pg_bin.run_capture(["pgbench", "-i", "-I", "dtGvp", "-s100"], env=pub_env)
+    pg_bin.run_capture(["pgbench", "-i", "-I", "dtGvp", "-s100"], env=sub_env)
 
     pub_conn = psycopg2.connect(pub_connstr)
     sub_conn = psycopg2.connect(sub_connstr)
@@ -288,7 +288,7 @@ def test_snap_files(
             is_super = cur.fetchall()[0][0]
             assert is_super, "This benchmark won't work if we don't have superuser"
 
-    pg_bin.run_capture(["pgbench", "-i", "-s100"], env=env)
+    pg_bin.run_capture(["pgbench", "-i", "-I", "dtGvp", "-s100"], env=env)
 
     conn = psycopg2.connect(connstr)
     conn.autocommit = True
diff --git a/test_runner/performance/test_physical_replication.py b/test_runner/performance/test_physical_replication.py
index 7e16197211..49b1176d34 100644
--- a/test_runner/performance/test_physical_replication.py
+++ b/test_runner/performance/test_physical_replication.py
@@ -85,7 +85,7 @@ def test_ro_replica_lag(
             endpoint_id=replica["endpoint"]["id"],
         )["uri"]
 
-        pg_bin.run_capture(["pgbench", "-i", "-s100"], env=master_env)
+        pg_bin.run_capture(["pgbench", "-i", "-I", "dtGvp", "-s100"], env=master_env)
 
         master_workload = pg_bin.run_nonblocking(
             ["pgbench", "-c10", pgbench_duration, "-Mprepared"],
@@ -212,7 +212,7 @@ def test_replication_start_stop(
         for i in range(num_replicas):
             replica_env[i]["PGHOST"] = replicas[i]["endpoint"]["host"]
 
-        pg_bin.run_capture(["pgbench", "-i", "-s10"], env=master_env)
+        pg_bin.run_capture(["pgbench", "-i", "-I", "dtGvp", "-s10"], env=master_env)
 
         # Sync replicas
         with psycopg2.connect(master_connstr) as conn_master:
diff --git a/test_runner/regress/test_branching.py b/test_runner/regress/test_branching.py
index fc74707639..1729e2fc98 100644
--- a/test_runner/regress/test_branching.py
+++ b/test_runner/regress/test_branching.py
@@ -52,7 +52,7 @@ def test_branching_with_pgbench(
     def run_pgbench(connstr: str):
         log.info(f"Start a pgbench workload on pg {connstr}")
 
-        pg_bin.run_capture(["pgbench", "-i", f"-s{scale}", connstr])
+        pg_bin.run_capture(["pgbench", "-i", "-I", "dtGvp", f"-s{scale}", connstr])
         pg_bin.run_capture(["pgbench", "-T15", connstr])
 
     env.neon_cli.create_branch("b0", tenant_id=tenant)
diff --git a/test_runner/regress/test_disk_usage_eviction.py b/test_runner/regress/test_disk_usage_eviction.py
index 85616c3fe2..1fec8b3f18 100644
--- a/test_runner/regress/test_disk_usage_eviction.py
+++ b/test_runner/regress/test_disk_usage_eviction.py
@@ -291,7 +291,7 @@ def pgbench_init_tenant(
     )
 
     with env.endpoints.create_start("main", tenant_id=tenant_id) as endpoint:
-        pg_bin.run(["pgbench", "-i", f"-s{scale}", endpoint.connstr()])
+        pg_bin.run(["pgbench", "-i", "-I", "dtGvp", f"-s{scale}", endpoint.connstr()])
         wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id)
 
     return (tenant_id, timeline_id)
diff --git a/test_runner/regress/test_hot_standby.py b/test_runner/regress/test_hot_standby.py
index ae63136abb..d94704012f 100644
--- a/test_runner/regress/test_hot_standby.py
+++ b/test_runner/regress/test_hot_standby.py
@@ -199,7 +199,7 @@ def test_hot_standby_gc(neon_env_builder: NeonEnvBuilder, pause_apply: bool):
 def run_pgbench(connstr: str, pg_bin: PgBin):
     log.info(f"Start a pgbench workload on pg {connstr}")
     # s10 is about 150MB of data. In debug mode init takes about 15s on SSD.
-    pg_bin.run_capture(["pgbench", "-i", "-s10", connstr])
+    pg_bin.run_capture(["pgbench", "-i", "-I", "dtGvp", "-s10", connstr])
     log.info("pgbench init done")
     pg_bin.run_capture(["pgbench", "-T60", connstr])
 
diff --git a/test_runner/regress/test_pageserver_reconnect.py b/test_runner/regress/test_pageserver_reconnect.py
index 37ff923632..ada6da98ff 100644
--- a/test_runner/regress/test_pageserver_reconnect.py
+++ b/test_runner/regress/test_pageserver_reconnect.py
@@ -22,7 +22,7 @@ def test_pageserver_reconnect(neon_simple_env: NeonEnv, pg_bin: PgBin):
 
     def run_pgbench(connstr: str):
         log.info(f"Start a pgbench workload on pg {connstr}")
-        pg_bin.run_capture(["pgbench", "-i", f"-s{scale}", connstr])
+        pg_bin.run_capture(["pgbench", "-i", "-I", "dtGvp", f"-s{scale}", connstr])
         pg_bin.run_capture(["pgbench", f"-T{int(n_reconnects*timeout)}", connstr])
 
     thread = threading.Thread(target=run_pgbench, args=(endpoint.connstr(),), daemon=True)
diff --git a/test_runner/regress/test_pageserver_restarts_under_workload.py b/test_runner/regress/test_pageserver_restarts_under_workload.py
index 65569f3bac..9bb9b373ad 100644
--- a/test_runner/regress/test_pageserver_restarts_under_workload.py
+++ b/test_runner/regress/test_pageserver_restarts_under_workload.py
@@ -19,7 +19,7 @@ def test_pageserver_restarts_under_worload(neon_simple_env: NeonEnv, pg_bin: PgB
 
     def run_pgbench(connstr: str):
         log.info(f"Start a pgbench workload on pg {connstr}")
-        pg_bin.run_capture(["pgbench", "-i", f"-s{scale}", connstr])
+        pg_bin.run_capture(["pgbench", "-i", "-I", "dtGvp", f"-s{scale}", connstr])
         pg_bin.run_capture(["pgbench", f"-T{n_restarts}", connstr])
 
     thread = threading.Thread(target=run_pgbench, args=(endpoint.connstr(),), daemon=True)
diff --git a/test_runner/regress/test_threshold_based_eviction.py b/test_runner/regress/test_threshold_based_eviction.py
index 840c7159ad..094dd20529 100644
--- a/test_runner/regress/test_threshold_based_eviction.py
+++ b/test_runner/regress/test_threshold_based_eviction.py
@@ -106,7 +106,7 @@ def test_threshold_based_eviction(
 
     # create a bunch of layers
     with env.endpoints.create_start("main", tenant_id=tenant_id) as pg:
-        pg_bin.run(["pgbench", "-i", "-s", "3", pg.connstr()])
+        pg_bin.run(["pgbench", "-i", "-I", "dtGvp", "-s", "3", pg.connstr()])
         last_flush_lsn_upload(env, pg, tenant_id, timeline_id)
     # wrap up and shutdown safekeepers so that no more layers will be created after the final checkpoint
     for sk in env.safekeepers:

From 3a52e356c18440cb3b685d2cf1848fd3a3c86c67 Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki@neon.tech>
Date: Tue, 17 Sep 2024 01:46:58 +0300
Subject: [PATCH 061/142] Remove unused function (#9018)

---
 test_runner/fixtures/neon_fixtures.py | 19 -------------------
 1 file changed, 19 deletions(-)

diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index 50284a3f5a..5f7a32782c 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -182,25 +182,6 @@ def top_output_dir(base_dir: Path) -> Iterator[Path]:
     yield output_dir
 
 
-@pytest.fixture(scope="function")
-def versioned_pg_distrib_dir(pg_distrib_dir: Path, pg_version: PgVersion) -> Iterator[Path]:
-    versioned_dir = pg_distrib_dir / pg_version.v_prefixed
-
-    psql_bin_path = versioned_dir / "bin/psql"
-    postgres_bin_path = versioned_dir / "bin/postgres"
-
-    if os.getenv("REMOTE_ENV"):
-        # When testing against a remote server, we only need the client binary.
-        if not psql_bin_path.exists():
-            raise Exception(f"psql not found at '{psql_bin_path}'")
-    else:
-        if not postgres_bin_path.exists():
-            raise Exception(f"postgres not found at '{postgres_bin_path}'")
-
-    log.info(f"versioned_pg_distrib_dir is {versioned_dir}")
-    yield versioned_dir
-
-
 @pytest.fixture(scope="session")
 def neon_api_key() -> str:
     api_key = os.getenv("NEON_API_KEY")

From fec9321fc04434f84f86d166a6d89c4421110c77 Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki@neon.tech>
Date: Tue, 17 Sep 2024 02:23:49 +0300
Subject: [PATCH 062/142] Use Path type in a few more places in
 neon_fixtures.py (#9018)

This is in preparation for replacing neon_fixtures.get_dir_size with
fixtures.utils.get_dir_size() in the next commit.
---
 test_runner/fixtures/compare_fixtures.py |  5 ++--
 test_runner/fixtures/neon_fixtures.py    | 32 ++++++++++++------------
 2 files changed, 19 insertions(+), 18 deletions(-)

diff --git a/test_runner/fixtures/compare_fixtures.py b/test_runner/fixtures/compare_fixtures.py
index 7c4a8db36f..770b32b11e 100644
--- a/test_runner/fixtures/compare_fixtures.py
+++ b/test_runner/fixtures/compare_fixtures.py
@@ -4,6 +4,7 @@ from abc import ABC, abstractmethod
 from contextlib import _GeneratorContextManager, contextmanager
 
 # Type-related stuff
+from pathlib import Path
 from typing import Dict, Iterator, List
 
 import pytest
@@ -229,11 +230,11 @@ class VanillaCompare(PgCompare):
         pass  # TODO find something
 
     def report_size(self):
-        data_size = self.pg.get_subdir_size("base")
+        data_size = self.pg.get_subdir_size(Path("base"))
         self.zenbenchmark.record(
             "data_size", data_size / (1024 * 1024), "MB", report=MetricReport.LOWER_IS_BETTER
         )
-        wal_size = self.pg.get_subdir_size("pg_wal")
+        wal_size = self.pg.get_subdir_size(Path("pg_wal"))
         self.zenbenchmark.record(
             "wal_size", wal_size / (1024 * 1024), "MB", report=MetricReport.LOWER_IS_BETTER
         )
diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index 5f7a32782c..90a351cdb3 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -224,7 +224,7 @@ def worker_base_port(worker_seq_no: int, worker_port_num: int) -> int:
     return BASE_PORT + worker_seq_no * worker_port_num
 
 
-def get_dir_size(path: str) -> int:
+def get_dir_size(path: Path) -> int:
     """Return size in bytes."""
     totalbytes = 0
     for root, _dirs, files in os.walk(path):
@@ -3319,12 +3319,12 @@ class PgBin:
         )
         return base_path
 
-    def get_pg_controldata_checkpoint_lsn(self, pgdata: str) -> Lsn:
+    def get_pg_controldata_checkpoint_lsn(self, pgdata: Path) -> Lsn:
         """
         Run pg_controldata on given datadir and extract checkpoint lsn.
         """
 
-        pg_controldata_path = os.path.join(self.pg_bin_path, "pg_controldata")
+        pg_controldata_path = self.pg_bin_path / "pg_controldata"
         cmd = f"{pg_controldata_path} -D {pgdata}"
         result = subprocess.run(cmd, capture_output=True, text=True, shell=True)
         checkpoint_lsn = re.findall(
@@ -3433,9 +3433,9 @@ class VanillaPostgres(PgProtocol):
         self.running = False
         self.pg_bin.run_capture(["pg_ctl", "-w", "-D", str(self.pgdatadir), "stop"])
 
-    def get_subdir_size(self, subdir) -> int:
+    def get_subdir_size(self, subdir: Path) -> int:
         """Return size of pgdatadir subdirectory in bytes."""
-        return get_dir_size(os.path.join(self.pgdatadir, subdir))
+        return get_dir_size(self.pgdatadir / subdir)
 
     def __enter__(self) -> "VanillaPostgres":
         return self
@@ -3962,7 +3962,7 @@ class Endpoint(PgProtocol, LogUtils):
         self.env = env
         self.branch_name: Optional[str] = None  # dubious
         self.endpoint_id: Optional[str] = None  # dubious, see asserts below
-        self.pgdata_dir: Optional[str] = None  # Path to computenode PGDATA
+        self.pgdata_dir: Optional[Path] = None  # Path to computenode PGDATA
         self.tenant_id = tenant_id
         self.pg_port = pg_port
         self.http_port = http_port
@@ -4019,7 +4019,7 @@ class Endpoint(PgProtocol, LogUtils):
             allow_multiple=allow_multiple,
         )
         path = Path("endpoints") / self.endpoint_id / "pgdata"
-        self.pgdata_dir = os.path.join(self.env.repo_dir, path)
+        self.pgdata_dir = self.env.repo_dir / path
         self.logfile = self.endpoint_path() / "compute.log"
 
         config_lines = config_lines or []
@@ -4072,21 +4072,21 @@ class Endpoint(PgProtocol, LogUtils):
         path = Path("endpoints") / self.endpoint_id
         return self.env.repo_dir / path
 
-    def pg_data_dir_path(self) -> str:
+    def pg_data_dir_path(self) -> Path:
         """Path to Postgres data directory"""
-        return os.path.join(self.endpoint_path(), "pgdata")
+        return self.endpoint_path() / "pgdata"
 
-    def pg_xact_dir_path(self) -> str:
+    def pg_xact_dir_path(self) -> Path:
         """Path to pg_xact dir"""
-        return os.path.join(self.pg_data_dir_path(), "pg_xact")
+        return self.pg_data_dir_path() / "pg_xact"
 
-    def pg_twophase_dir_path(self) -> str:
+    def pg_twophase_dir_path(self) -> Path:
         """Path to pg_twophase dir"""
-        return os.path.join(self.pg_data_dir_path(), "pg_twophase")
+        return self.pg_data_dir_path() / "pg_twophase"
 
-    def config_file_path(self) -> str:
+    def config_file_path(self) -> Path:
         """Path to the postgresql.conf in the endpoint directory (not the one in pgdata)"""
-        return os.path.join(self.endpoint_path(), "postgresql.conf")
+        return self.endpoint_path() / "postgresql.conf"
 
     def config(self, lines: List[str]) -> "Endpoint":
         """
@@ -4251,7 +4251,7 @@ class Endpoint(PgProtocol, LogUtils):
         log.info(f'checkpointing at LSN {self.safe_psql("select pg_current_wal_lsn()")[0][0]}')
         self.safe_psql("checkpoint")
         assert self.pgdata_dir is not None  # please mypy
-        return get_dir_size(os.path.join(self.pgdata_dir, "pg_wal")) / 1024 / 1024
+        return get_dir_size(self.pgdata_dir / "pg_wal") / 1024 / 1024
 
     def clear_shared_buffers(self, cursor: Optional[Any] = None):
         """

From c6f56b8462e16284b77a3176801c9ed7364df04b Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki@neon.tech>
Date: Tue, 17 Sep 2024 01:47:15 +0300
Subject: [PATCH 063/142] Remove redundant get_dir_size() function (#9018)

There was another copy of it in utils.py. The only difference is that
the version in utils.py tolerates files that are concurrently
removed. That seems fine for the few callers in neon_fixtures.py too.
---
 test_runner/fixtures/neon_fixtures.py | 11 +----------
 1 file changed, 1 insertion(+), 10 deletions(-)

diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index 90a351cdb3..92dcd1e3cd 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -93,6 +93,7 @@ from fixtures.utils import (
     allure_add_grafana_links,
     allure_attach_from_dir,
     assert_no_errors,
+    get_dir_size,
     get_self_dir,
     print_gc_result,
     subprocess_capture,
@@ -224,16 +225,6 @@ def worker_base_port(worker_seq_no: int, worker_port_num: int) -> int:
     return BASE_PORT + worker_seq_no * worker_port_num
 
 
-def get_dir_size(path: Path) -> int:
-    """Return size in bytes."""
-    totalbytes = 0
-    for root, _dirs, files in os.walk(path):
-        for name in files:
-            totalbytes += os.path.getsize(os.path.join(root, name))
-
-    return totalbytes
-
-
 @pytest.fixture(scope="session")
 def port_distributor(worker_base_port: int, worker_port_num: int) -> PortDistributor:
     return PortDistributor(base_port=worker_base_port, port_number=worker_port_num)

From 4295ff0f071789c22d0a838f5512e5caba57d54a Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki@neon.tech>
Date: Tue, 17 Sep 2024 01:47:21 +0300
Subject: [PATCH 064/142] Mark a couple of test fixtures as session-scoped
 (#9018)

pg_distrib_dir doesn't include the Postgres version and only depends
on env variables which cannot change during a test run, so it can be
marked as session-scoped. Similarly, the platform cannot change during
a test run.
---
 test_runner/fixtures/neon_fixtures.py | 2 +-
 test_runner/fixtures/parametrize.py   | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index 92dcd1e3cd..d2caee5992 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -159,7 +159,7 @@ def neon_binpath(base_dir: Path, build_type: str) -> Iterator[Path]:
     yield binpath
 
 
-@pytest.fixture(scope="function")
+@pytest.fixture(scope="session")
 def pg_distrib_dir(base_dir: Path) -> Iterator[Path]:
     if env_postgres_bin := os.environ.get("POSTGRES_DISTRIB_DIR"):
         distrib_dir = Path(env_postgres_bin).resolve()
diff --git a/test_runner/fixtures/parametrize.py b/test_runner/fixtures/parametrize.py
index e2dd51802c..2c8e71526c 100644
--- a/test_runner/fixtures/parametrize.py
+++ b/test_runner/fixtures/parametrize.py
@@ -24,7 +24,7 @@ def build_type() -> Optional[str]:
     return None
 
 
-@pytest.fixture(scope="function", autouse=True)
+@pytest.fixture(scope="session", autouse=True)
 def platform() -> Optional[str]:
     return None
 

From 2db840d8b8736530a9653719ed2560e3647539a0 Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki@neon.tech>
Date: Tue, 17 Sep 2024 01:47:26 +0300
Subject: [PATCH 065/142] Move a few test functions related to auth tokens to
 separate file (#9018)

For readability. neon_fixtures.py is huge.
---
 test_runner/fixtures/auth_tokens.py           | 47 +++++++++++++++++++
 test_runner/fixtures/neon_fixtures.py         | 40 +---------------
 .../regress/test_storage_controller.py        |  2 +-
 3 files changed, 49 insertions(+), 40 deletions(-)
 create mode 100644 test_runner/fixtures/auth_tokens.py

diff --git a/test_runner/fixtures/auth_tokens.py b/test_runner/fixtures/auth_tokens.py
new file mode 100644
index 0000000000..8ebaf61e5e
--- /dev/null
+++ b/test_runner/fixtures/auth_tokens.py
@@ -0,0 +1,47 @@
+from __future__ import annotations
+
+from dataclasses import dataclass
+from enum import Enum
+from typing import Any
+
+import jwt
+
+from fixtures.common_types import TenantId
+
+
+@dataclass
+class AuthKeys:
+    priv: str
+
+    def generate_token(self, *, scope: TokenScope, **token_data: Any) -> str:
+        token_data = {key: str(val) for key, val in token_data.items()}
+        token = jwt.encode({"scope": scope, **token_data}, self.priv, algorithm="EdDSA")
+        # cast(Any, self.priv)
+
+        # jwt.encode can return 'bytes' or 'str', depending on Python version or type
+        # hinting or something (not sure what). If it returned 'bytes', convert it to 'str'
+        # explicitly.
+        if isinstance(token, bytes):
+            token = token.decode()
+
+        return token
+
+    def generate_pageserver_token(self) -> str:
+        return self.generate_token(scope=TokenScope.PAGE_SERVER_API)
+
+    def generate_safekeeper_token(self) -> str:
+        return self.generate_token(scope=TokenScope.SAFEKEEPER_DATA)
+
+    # generate token giving access to only one tenant
+    def generate_tenant_token(self, tenant_id: TenantId) -> str:
+        return self.generate_token(scope=TokenScope.TENANT, tenant_id=str(tenant_id))
+
+
+# TODO: Replace with `StrEnum` when we upgrade to python 3.11
+class TokenScope(str, Enum):
+    ADMIN = "admin"
+    PAGE_SERVER_API = "pageserverapi"
+    GENERATIONS_API = "generations_api"
+    SAFEKEEPER_DATA = "safekeeperdata"
+    TENANT = "tenant"
+    SCRUBBER = "scrubber"
diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index d2caee5992..93b93ff019 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -43,7 +43,6 @@ from urllib.parse import quote, urlparse
 import asyncpg
 import backoff
 import httpx
-import jwt
 import psycopg2
 import psycopg2.sql
 import pytest
@@ -60,6 +59,7 @@ from psycopg2.extensions import make_dsn, parse_dsn
 from urllib3.util.retry import Retry
 
 from fixtures import overlayfs
+from fixtures.auth_tokens import AuthKeys, TokenScope
 from fixtures.broker import NeonBroker
 from fixtures.common_types import Lsn, NodeId, TenantId, TenantShardId, TimelineId
 from fixtures.endpoint.http import EndpointHttpClient
@@ -373,44 +373,6 @@ class PgProtocol:
         return self.safe_psql(query, log_query=log_query)[0][0]
 
 
-@dataclass
-class AuthKeys:
-    priv: str
-
-    def generate_token(self, *, scope: TokenScope, **token_data: Any) -> str:
-        token_data = {key: str(val) for key, val in token_data.items()}
-        token = jwt.encode({"scope": scope, **token_data}, self.priv, algorithm="EdDSA")
-        # cast(Any, self.priv)
-
-        # jwt.encode can return 'bytes' or 'str', depending on Python version or type
-        # hinting or something (not sure what). If it returned 'bytes', convert it to 'str'
-        # explicitly.
-        if isinstance(token, bytes):
-            token = token.decode()
-
-        return token
-
-    def generate_pageserver_token(self) -> str:
-        return self.generate_token(scope=TokenScope.PAGE_SERVER_API)
-
-    def generate_safekeeper_token(self) -> str:
-        return self.generate_token(scope=TokenScope.SAFEKEEPER_DATA)
-
-    # generate token giving access to only one tenant
-    def generate_tenant_token(self, tenant_id: TenantId) -> str:
-        return self.generate_token(scope=TokenScope.TENANT, tenant_id=str(tenant_id))
-
-
-# TODO: Replace with `StrEnum` when we upgrade to python 3.11
-class TokenScope(str, Enum):
-    ADMIN = "admin"
-    PAGE_SERVER_API = "pageserverapi"
-    GENERATIONS_API = "generations_api"
-    SAFEKEEPER_DATA = "safekeeperdata"
-    TENANT = "tenant"
-    SCRUBBER = "scrubber"
-
-
 class NeonEnvBuilder:
     """
     Builder object to create a Neon runtime environment
diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py
index eea05d7548..2e21f8fb46 100644
--- a/test_runner/regress/test_storage_controller.py
+++ b/test_runner/regress/test_storage_controller.py
@@ -7,6 +7,7 @@ from datetime import datetime, timezone
 from typing import Any, Dict, List, Optional, Set, Tuple, Union
 
 import pytest
+from fixtures.auth_tokens import TokenScope
 from fixtures.common_types import TenantId, TenantShardId, TimelineId
 from fixtures.compute_reconfigure import ComputeReconfigure
 from fixtures.log_helper import log
@@ -18,7 +19,6 @@ from fixtures.neon_fixtures import (
     PgBin,
     StorageControllerApiException,
     StorageControllerLeadershipStatus,
-    TokenScope,
     last_flush_lsn_upload,
 )
 from fixtures.pageserver.http import PageserverApiException, PageserverHttpClient

From b719d58863f90beb46218d1167e4236b05910924 Mon Sep 17 00:00:00 2001
From: Vlad Lazar <vlad@neon.tech>
Date: Tue, 17 Sep 2024 09:25:42 +0100
Subject: [PATCH 066/142] storcon: forward requests from stepped down instance
 to the current leader (#8954)

## Problem
It turns out that we can't rely on external orchestration to promptly
route traffic to the new leader. This is downtime-inducing.
Forwarding provides a safe way out.

## Safety
We forward when:
1. Request is not one of ["/control/v1/step_down", "/status", "/ready",
"/metrics"]
2. Current instance is in [`LeadershipStatus::SteppedDown`] state
3. There is a leader in the database to forward to
4. Leader from step (3) is not the current instance

If a storcon instance is persisted in the database, then we know that it
is the current leader.
There's one exception: the time between handling the step-down request
and the new leader updating the database.

Let's treat the happy case first. The stepped down node does not produce
any side effects,
since all request handling happens on the leader.

As for the edge case, we are guaranteed to always have a maximum of two
running instances.
Hence, if we are in the edge case scenario the leader persisted in the
database is the
stepped down instance that received the request. Condition (4) above
covers this scenario.

## Summary of changes
* Conversion utilities for reqwest <-> hyper. I'm not happy with these,
but I don't see a better way. Open to suggestions.
* Add request forwarding logic
* Update each request handler. Again, not happy with this. If anyone
knows a nice way to wrap the handlers, lmk. Me and Joonas tried :/
* Update each handler to maybe forward
* Tweak tests to showcase new behaviour
---
 storage_controller/src/http.rs                | 620 +++++++++++++++++-
 .../regress/test_storage_controller.py        |  22 +-
 2 files changed, 607 insertions(+), 35 deletions(-)

diff --git a/storage_controller/src/http.rs b/storage_controller/src/http.rs
index a6638f5191..1745bf5575 100644
--- a/storage_controller/src/http.rs
+++ b/storage_controller/src/http.rs
@@ -1,10 +1,11 @@
+use crate::http;
 use crate::metrics::{
     HttpRequestLatencyLabelGroup, HttpRequestStatusLabelGroup, PageserverRequestLabelGroup,
     METRICS_REGISTRY,
 };
 use crate::persistence::SafekeeperPersistence;
 use crate::reconciler::ReconcileError;
-use crate::service::{LeadershipStatus, Service, STARTUP_RECONCILE_TIMEOUT};
+use crate::service::{LeadershipStatus, Service, RECONCILE_TIMEOUT, STARTUP_RECONCILE_TIMEOUT};
 use anyhow::Context;
 use futures::Future;
 use hyper::header::CONTENT_TYPE;
@@ -22,6 +23,7 @@ use pageserver_api::models::{
 };
 use pageserver_api::shard::TenantShardId;
 use pageserver_client::{mgmt_api, BlockUnblock};
+use std::str::FromStr;
 use std::sync::Arc;
 use std::time::{Duration, Instant};
 use tokio_util::sync::CancellationToken;
@@ -87,9 +89,16 @@ fn get_state(request: &Request<Body>) -> &HttpState {
 }
 
 /// Pageserver calls into this on startup, to learn which tenants it should attach
-async fn handle_re_attach(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
+async fn handle_re_attach(req: Request<Body>) -> Result<Response<Body>, ApiError> {
     check_permissions(&req, Scope::GenerationsApi)?;
 
+    let mut req = match maybe_forward(req).await {
+        ForwardOutcome::Forwarded(res) => {
+            return res;
+        }
+        ForwardOutcome::NotForwarded(req) => req,
+    };
+
     let reattach_req = json_request::<ReAttachRequest>(&mut req).await?;
     let state = get_state(&req);
     json_response(StatusCode::OK, state.service.re_attach(reattach_req).await?)
@@ -97,9 +106,16 @@ async fn handle_re_attach(mut req: Request<Body>) -> Result<Response<Body>, ApiE
 
 /// Pageserver calls into this before doing deletions, to confirm that it still
 /// holds the latest generation for the tenants with deletions enqueued
-async fn handle_validate(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
+async fn handle_validate(req: Request<Body>) -> Result<Response<Body>, ApiError> {
     check_permissions(&req, Scope::GenerationsApi)?;
 
+    let mut req = match maybe_forward(req).await {
+        ForwardOutcome::Forwarded(res) => {
+            return res;
+        }
+        ForwardOutcome::NotForwarded(req) => req,
+    };
+
     let validate_req = json_request::<ValidateRequest>(&mut req).await?;
     let state = get_state(&req);
     json_response(StatusCode::OK, state.service.validate(validate_req).await?)
@@ -108,9 +124,16 @@ async fn handle_validate(mut req: Request<Body>) -> Result<Response<Body>, ApiEr
 /// Call into this before attaching a tenant to a pageserver, to acquire a generation number
 /// (in the real control plane this is unnecessary, because the same program is managing
 ///  generation numbers and doing attachments).
-async fn handle_attach_hook(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
+async fn handle_attach_hook(req: Request<Body>) -> Result<Response<Body>, ApiError> {
     check_permissions(&req, Scope::Admin)?;
 
+    let mut req = match maybe_forward(req).await {
+        ForwardOutcome::Forwarded(res) => {
+            return res;
+        }
+        ForwardOutcome::NotForwarded(req) => req,
+    };
+
     let attach_req = json_request::<AttachHookRequest>(&mut req).await?;
     let state = get_state(&req);
 
@@ -124,9 +147,16 @@ async fn handle_attach_hook(mut req: Request<Body>) -> Result<Response<Body>, Ap
     )
 }
 
-async fn handle_inspect(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
+async fn handle_inspect(req: Request<Body>) -> Result<Response<Body>, ApiError> {
     check_permissions(&req, Scope::Admin)?;
 
+    let mut req = match maybe_forward(req).await {
+        ForwardOutcome::Forwarded(res) => {
+            return res;
+        }
+        ForwardOutcome::NotForwarded(req) => req,
+    };
+
     let inspect_req = json_request::<InspectRequest>(&mut req).await?;
 
     let state = get_state(&req);
@@ -136,10 +166,17 @@ async fn handle_inspect(mut req: Request<Body>) -> Result<Response<Body>, ApiErr
 
 async fn handle_tenant_create(
     service: Arc<Service>,
-    mut req: Request<Body>,
+    req: Request<Body>,
 ) -> Result<Response<Body>, ApiError> {
     check_permissions(&req, Scope::PageServerApi)?;
 
+    let mut req = match maybe_forward(req).await {
+        ForwardOutcome::Forwarded(res) => {
+            return res;
+        }
+        ForwardOutcome::NotForwarded(req) => req,
+    };
+
     let create_req = json_request::<TenantCreateRequest>(&mut req).await?;
 
     json_response(
@@ -150,11 +187,18 @@ async fn handle_tenant_create(
 
 async fn handle_tenant_location_config(
     service: Arc<Service>,
-    mut req: Request<Body>,
+    req: Request<Body>,
 ) -> Result<Response<Body>, ApiError> {
     let tenant_shard_id: TenantShardId = parse_request_param(&req, "tenant_shard_id")?;
     check_permissions(&req, Scope::PageServerApi)?;
 
+    let mut req = match maybe_forward(req).await {
+        ForwardOutcome::Forwarded(res) => {
+            return res;
+        }
+        ForwardOutcome::NotForwarded(req) => req,
+    };
+
     let config_req = json_request::<TenantLocationConfigRequest>(&mut req).await?;
     json_response(
         StatusCode::OK,
@@ -166,10 +210,17 @@ async fn handle_tenant_location_config(
 
 async fn handle_tenant_config_set(
     service: Arc<Service>,
-    mut req: Request<Body>,
+    req: Request<Body>,
 ) -> Result<Response<Body>, ApiError> {
     check_permissions(&req, Scope::PageServerApi)?;
 
+    let mut req = match maybe_forward(req).await {
+        ForwardOutcome::Forwarded(res) => {
+            return res;
+        }
+        ForwardOutcome::NotForwarded(req) => req,
+    };
+
     let config_req = json_request::<TenantConfigRequest>(&mut req).await?;
 
     json_response(StatusCode::OK, service.tenant_config_set(config_req).await?)
@@ -182,16 +233,30 @@ async fn handle_tenant_config_get(
     let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
     check_permissions(&req, Scope::PageServerApi)?;
 
+    match maybe_forward(req).await {
+        ForwardOutcome::Forwarded(res) => {
+            return res;
+        }
+        ForwardOutcome::NotForwarded(_req) => {}
+    };
+
     json_response(StatusCode::OK, service.tenant_config_get(tenant_id)?)
 }
 
 async fn handle_tenant_time_travel_remote_storage(
     service: Arc<Service>,
-    mut req: Request<Body>,
+    req: Request<Body>,
 ) -> Result<Response<Body>, ApiError> {
     let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
     check_permissions(&req, Scope::PageServerApi)?;
 
+    let mut req = match maybe_forward(req).await {
+        ForwardOutcome::Forwarded(res) => {
+            return res;
+        }
+        ForwardOutcome::NotForwarded(req) => req,
+    };
+
     let time_travel_req = json_request::<TenantTimeTravelRequest>(&mut req).await?;
 
     let timestamp_raw = must_get_query_param(&req, "travel_to")?;
@@ -232,6 +297,13 @@ async fn handle_tenant_secondary_download(
     let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
     let wait = parse_query_param(&req, "wait_ms")?.map(Duration::from_millis);
 
+    match maybe_forward(req).await {
+        ForwardOutcome::Forwarded(res) => {
+            return res;
+        }
+        ForwardOutcome::NotForwarded(_req) => {}
+    };
+
     let (status, progress) = service.tenant_secondary_download(tenant_id, wait).await?;
     json_response(map_reqwest_hyper_status(status)?, progress)
 }
@@ -243,6 +315,13 @@ async fn handle_tenant_delete(
     let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
     check_permissions(&req, Scope::PageServerApi)?;
 
+    match maybe_forward(req).await {
+        ForwardOutcome::Forwarded(res) => {
+            return res;
+        }
+        ForwardOutcome::NotForwarded(_req) => {}
+    };
+
     let status_code = service
         .tenant_delete(tenant_id)
         .await
@@ -258,11 +337,18 @@ async fn handle_tenant_delete(
 
 async fn handle_tenant_timeline_create(
     service: Arc<Service>,
-    mut req: Request<Body>,
+    req: Request<Body>,
 ) -> Result<Response<Body>, ApiError> {
     let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
     check_permissions(&req, Scope::PageServerApi)?;
 
+    let mut req = match maybe_forward(req).await {
+        ForwardOutcome::Forwarded(res) => {
+            return res;
+        }
+        ForwardOutcome::NotForwarded(req) => req,
+    };
+
     let create_req = json_request::<TimelineCreateRequest>(&mut req).await?;
     json_response(
         StatusCode::CREATED,
@@ -277,9 +363,16 @@ async fn handle_tenant_timeline_delete(
     req: Request<Body>,
 ) -> Result<Response<Body>, ApiError> {
     let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
+    let timeline_id: TimelineId = parse_request_param(&req, "timeline_id")?;
+
     check_permissions(&req, Scope::PageServerApi)?;
 
-    let timeline_id: TimelineId = parse_request_param(&req, "timeline_id")?;
+    match maybe_forward(req).await {
+        ForwardOutcome::Forwarded(res) => {
+            return res;
+        }
+        ForwardOutcome::NotForwarded(_req) => {}
+    };
 
     // For timeline deletions, which both implement an "initially return 202, then 404 once
     // we're done" semantic, we wrap with a retry loop to expose a simpler API upstream.
@@ -337,12 +430,19 @@ async fn handle_tenant_timeline_delete(
 
 async fn handle_tenant_timeline_archival_config(
     service: Arc<Service>,
-    mut req: Request<Body>,
+    req: Request<Body>,
 ) -> Result<Response<Body>, ApiError> {
     let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
+    let timeline_id: TimelineId = parse_request_param(&req, "timeline_id")?;
+
     check_permissions(&req, Scope::PageServerApi)?;
 
-    let timeline_id: TimelineId = parse_request_param(&req, "timeline_id")?;
+    let mut req = match maybe_forward(req).await {
+        ForwardOutcome::Forwarded(res) => {
+            return res;
+        }
+        ForwardOutcome::NotForwarded(req) => req,
+    };
 
     let create_req = json_request::<TimelineArchivalConfigRequest>(&mut req).await?;
 
@@ -358,9 +458,16 @@ async fn handle_tenant_timeline_detach_ancestor(
     req: Request<Body>,
 ) -> Result<Response<Body>, ApiError> {
     let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
+    let timeline_id: TimelineId = parse_request_param(&req, "timeline_id")?;
+
     check_permissions(&req, Scope::PageServerApi)?;
 
-    let timeline_id: TimelineId = parse_request_param(&req, "timeline_id")?;
+    match maybe_forward(req).await {
+        ForwardOutcome::Forwarded(res) => {
+            return res;
+        }
+        ForwardOutcome::NotForwarded(_req) => {}
+    };
 
     let res = service
         .tenant_timeline_detach_ancestor(tenant_id, timeline_id)
@@ -393,6 +500,13 @@ async fn handle_tenant_timeline_passthrough(
     let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
     check_permissions(&req, Scope::PageServerApi)?;
 
+    let req = match maybe_forward(req).await {
+        ForwardOutcome::Forwarded(res) => {
+            return res;
+        }
+        ForwardOutcome::NotForwarded(req) => req,
+    };
+
     let Some(path) = req.uri().path_and_query() else {
         // This should never happen, our request router only calls us if there is a path
         return Err(ApiError::BadRequest(anyhow::anyhow!("Missing path")));
@@ -460,9 +574,17 @@ async fn handle_tenant_locate(
     service: Arc<Service>,
     req: Request<Body>,
 ) -> Result<Response<Body>, ApiError> {
+    let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
+
     check_permissions(&req, Scope::Admin)?;
 
-    let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
+    match maybe_forward(req).await {
+        ForwardOutcome::Forwarded(res) => {
+            return res;
+        }
+        ForwardOutcome::NotForwarded(_req) => {}
+    };
+
     json_response(StatusCode::OK, service.tenant_locate(tenant_id)?)
 }
 
@@ -473,6 +595,14 @@ async fn handle_tenant_describe(
     check_permissions(&req, Scope::Scrubber)?;
 
     let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
+
+    match maybe_forward(req).await {
+        ForwardOutcome::Forwarded(res) => {
+            return res;
+        }
+        ForwardOutcome::NotForwarded(_req) => {}
+    };
+
     json_response(StatusCode::OK, service.tenant_describe(tenant_id)?)
 }
 
@@ -482,12 +612,26 @@ async fn handle_tenant_list(
 ) -> Result<Response<Body>, ApiError> {
     check_permissions(&req, Scope::Admin)?;
 
+    match maybe_forward(req).await {
+        ForwardOutcome::Forwarded(res) => {
+            return res;
+        }
+        ForwardOutcome::NotForwarded(_req) => {}
+    };
+
     json_response(StatusCode::OK, service.tenant_list())
 }
 
-async fn handle_node_register(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
+async fn handle_node_register(req: Request<Body>) -> Result<Response<Body>, ApiError> {
     check_permissions(&req, Scope::Admin)?;
 
+    let mut req = match maybe_forward(req).await {
+        ForwardOutcome::Forwarded(res) => {
+            return res;
+        }
+        ForwardOutcome::NotForwarded(req) => req,
+    };
+
     let register_req = json_request::<NodeRegisterRequest>(&mut req).await?;
     let state = get_state(&req);
     state.service.node_register(register_req).await?;
@@ -497,6 +641,13 @@ async fn handle_node_register(mut req: Request<Body>) -> Result<Response<Body>,
 async fn handle_node_list(req: Request<Body>) -> Result<Response<Body>, ApiError> {
     check_permissions(&req, Scope::Admin)?;
 
+    let req = match maybe_forward(req).await {
+        ForwardOutcome::Forwarded(res) => {
+            return res;
+        }
+        ForwardOutcome::NotForwarded(req) => req,
+    };
+
     let state = get_state(&req);
     let nodes = state.service.node_list().await?;
     let api_nodes = nodes.into_iter().map(|n| n.describe()).collect::<Vec<_>>();
@@ -507,6 +658,13 @@ async fn handle_node_list(req: Request<Body>) -> Result<Response<Body>, ApiError
 async fn handle_node_drop(req: Request<Body>) -> Result<Response<Body>, ApiError> {
     check_permissions(&req, Scope::Admin)?;
 
+    let req = match maybe_forward(req).await {
+        ForwardOutcome::Forwarded(res) => {
+            return res;
+        }
+        ForwardOutcome::NotForwarded(req) => req,
+    };
+
     let state = get_state(&req);
     let node_id: NodeId = parse_request_param(&req, "node_id")?;
     json_response(StatusCode::OK, state.service.node_drop(node_id).await?)
@@ -515,14 +673,28 @@ async fn handle_node_drop(req: Request<Body>) -> Result<Response<Body>, ApiError
 async fn handle_node_delete(req: Request<Body>) -> Result<Response<Body>, ApiError> {
     check_permissions(&req, Scope::Admin)?;
 
+    let req = match maybe_forward(req).await {
+        ForwardOutcome::Forwarded(res) => {
+            return res;
+        }
+        ForwardOutcome::NotForwarded(req) => req,
+    };
+
     let state = get_state(&req);
     let node_id: NodeId = parse_request_param(&req, "node_id")?;
     json_response(StatusCode::OK, state.service.node_delete(node_id).await?)
 }
 
-async fn handle_node_configure(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
+async fn handle_node_configure(req: Request<Body>) -> Result<Response<Body>, ApiError> {
     check_permissions(&req, Scope::Admin)?;
 
+    let mut req = match maybe_forward(req).await {
+        ForwardOutcome::Forwarded(res) => {
+            return res;
+        }
+        ForwardOutcome::NotForwarded(req) => req,
+    };
+
     let node_id: NodeId = parse_request_param(&req, "node_id")?;
     let config_req = json_request::<NodeConfigureRequest>(&mut req).await?;
     if node_id != config_req.node_id {
@@ -548,6 +720,13 @@ async fn handle_node_configure(mut req: Request<Body>) -> Result<Response<Body>,
 async fn handle_node_status(req: Request<Body>) -> Result<Response<Body>, ApiError> {
     check_permissions(&req, Scope::Admin)?;
 
+    let req = match maybe_forward(req).await {
+        ForwardOutcome::Forwarded(res) => {
+            return res;
+        }
+        ForwardOutcome::NotForwarded(req) => req,
+    };
+
     let state = get_state(&req);
     let node_id: NodeId = parse_request_param(&req, "node_id")?;
 
@@ -570,6 +749,13 @@ async fn handle_node_shards(req: Request<Body>) -> Result<Response<Body>, ApiErr
 async fn handle_get_leader(req: Request<Body>) -> Result<Response<Body>, ApiError> {
     check_permissions(&req, Scope::Admin)?;
 
+    let req = match maybe_forward(req).await {
+        ForwardOutcome::Forwarded(res) => {
+            return res;
+        }
+        ForwardOutcome::NotForwarded(req) => req,
+    };
+
     let state = get_state(&req);
     let leader = state.service.get_leader().await.map_err(|err| {
         ApiError::InternalServerError(anyhow::anyhow!(
@@ -583,6 +769,13 @@ async fn handle_get_leader(req: Request<Body>) -> Result<Response<Body>, ApiErro
 async fn handle_node_drain(req: Request<Body>) -> Result<Response<Body>, ApiError> {
     check_permissions(&req, Scope::Admin)?;
 
+    let req = match maybe_forward(req).await {
+        ForwardOutcome::Forwarded(res) => {
+            return res;
+        }
+        ForwardOutcome::NotForwarded(req) => req,
+    };
+
     let state = get_state(&req);
     let node_id: NodeId = parse_request_param(&req, "node_id")?;
 
@@ -594,6 +787,13 @@ async fn handle_node_drain(req: Request<Body>) -> Result<Response<Body>, ApiErro
 async fn handle_cancel_node_drain(req: Request<Body>) -> Result<Response<Body>, ApiError> {
     check_permissions(&req, Scope::Admin)?;
 
+    let req = match maybe_forward(req).await {
+        ForwardOutcome::Forwarded(res) => {
+            return res;
+        }
+        ForwardOutcome::NotForwarded(req) => req,
+    };
+
     let state = get_state(&req);
     let node_id: NodeId = parse_request_param(&req, "node_id")?;
 
@@ -605,6 +805,13 @@ async fn handle_cancel_node_drain(req: Request<Body>) -> Result<Response<Body>,
 async fn handle_node_fill(req: Request<Body>) -> Result<Response<Body>, ApiError> {
     check_permissions(&req, Scope::Admin)?;
 
+    let req = match maybe_forward(req).await {
+        ForwardOutcome::Forwarded(res) => {
+            return res;
+        }
+        ForwardOutcome::NotForwarded(req) => req,
+    };
+
     let state = get_state(&req);
     let node_id: NodeId = parse_request_param(&req, "node_id")?;
 
@@ -616,6 +823,13 @@ async fn handle_node_fill(req: Request<Body>) -> Result<Response<Body>, ApiError
 async fn handle_cancel_node_fill(req: Request<Body>) -> Result<Response<Body>, ApiError> {
     check_permissions(&req, Scope::Admin)?;
 
+    let req = match maybe_forward(req).await {
+        ForwardOutcome::Forwarded(res) => {
+            return res;
+        }
+        ForwardOutcome::NotForwarded(req) => req,
+    };
+
     let state = get_state(&req);
     let node_id: NodeId = parse_request_param(&req, "node_id")?;
 
@@ -624,9 +838,16 @@ async fn handle_cancel_node_fill(req: Request<Body>) -> Result<Response<Body>, A
     json_response(StatusCode::ACCEPTED, ())
 }
 
-async fn handle_metadata_health_update(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
+async fn handle_metadata_health_update(req: Request<Body>) -> Result<Response<Body>, ApiError> {
     check_permissions(&req, Scope::Scrubber)?;
 
+    let mut req = match maybe_forward(req).await {
+        ForwardOutcome::Forwarded(res) => {
+            return res;
+        }
+        ForwardOutcome::NotForwarded(req) => req,
+    };
+
     let update_req = json_request::<MetadataHealthUpdateRequest>(&mut req).await?;
     let state = get_state(&req);
 
@@ -640,6 +861,13 @@ async fn handle_metadata_health_list_unhealthy(
 ) -> Result<Response<Body>, ApiError> {
     check_permissions(&req, Scope::Admin)?;
 
+    let req = match maybe_forward(req).await {
+        ForwardOutcome::Forwarded(res) => {
+            return res;
+        }
+        ForwardOutcome::NotForwarded(req) => req,
+    };
+
     let state = get_state(&req);
     let unhealthy_tenant_shards = state.service.metadata_health_list_unhealthy().await?;
 
@@ -652,10 +880,17 @@ async fn handle_metadata_health_list_unhealthy(
 }
 
 async fn handle_metadata_health_list_outdated(
-    mut req: Request<Body>,
+    req: Request<Body>,
 ) -> Result<Response<Body>, ApiError> {
     check_permissions(&req, Scope::Admin)?;
 
+    let mut req = match maybe_forward(req).await {
+        ForwardOutcome::Forwarded(res) => {
+            return res;
+        }
+        ForwardOutcome::NotForwarded(req) => req,
+    };
+
     let list_outdated_req = json_request::<MetadataHealthListOutdatedRequest>(&mut req).await?;
     let state = get_state(&req);
     let health_records = state
@@ -671,10 +906,17 @@ async fn handle_metadata_health_list_outdated(
 
 async fn handle_tenant_shard_split(
     service: Arc<Service>,
-    mut req: Request<Body>,
+    req: Request<Body>,
 ) -> Result<Response<Body>, ApiError> {
     check_permissions(&req, Scope::Admin)?;
 
+    let mut req = match maybe_forward(req).await {
+        ForwardOutcome::Forwarded(res) => {
+            return res;
+        }
+        ForwardOutcome::NotForwarded(req) => req,
+    };
+
     let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
     let split_req = json_request::<TenantShardSplitRequest>(&mut req).await?;
 
@@ -686,10 +928,17 @@ async fn handle_tenant_shard_split(
 
 async fn handle_tenant_shard_migrate(
     service: Arc<Service>,
-    mut req: Request<Body>,
+    req: Request<Body>,
 ) -> Result<Response<Body>, ApiError> {
     check_permissions(&req, Scope::Admin)?;
 
+    let mut req = match maybe_forward(req).await {
+        ForwardOutcome::Forwarded(res) => {
+            return res;
+        }
+        ForwardOutcome::NotForwarded(req) => req,
+    };
+
     let tenant_shard_id: TenantShardId = parse_request_param(&req, "tenant_shard_id")?;
     let migrate_req = json_request::<TenantShardMigrateRequest>(&mut req).await?;
     json_response(
@@ -700,9 +949,16 @@ async fn handle_tenant_shard_migrate(
     )
 }
 
-async fn handle_tenant_update_policy(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
+async fn handle_tenant_update_policy(req: Request<Body>) -> Result<Response<Body>, ApiError> {
     check_permissions(&req, Scope::Admin)?;
 
+    let mut req = match maybe_forward(req).await {
+        ForwardOutcome::Forwarded(res) => {
+            return res;
+        }
+        ForwardOutcome::NotForwarded(req) => req,
+    };
+
     let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
     let update_req = json_request::<TenantPolicyRequest>(&mut req).await?;
     let state = get_state(&req);
@@ -716,9 +972,16 @@ async fn handle_tenant_update_policy(mut req: Request<Body>) -> Result<Response<
     )
 }
 
-async fn handle_update_preferred_azs(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
+async fn handle_update_preferred_azs(req: Request<Body>) -> Result<Response<Body>, ApiError> {
     check_permissions(&req, Scope::Admin)?;
 
+    let mut req = match maybe_forward(req).await {
+        ForwardOutcome::Forwarded(res) => {
+            return res;
+        }
+        ForwardOutcome::NotForwarded(req) => req,
+    };
+
     let azs_req = json_request::<ShardsPreferredAzsRequest>(&mut req).await?;
     let state = get_state(&req);
 
@@ -731,23 +994,46 @@ async fn handle_update_preferred_azs(mut req: Request<Body>) -> Result<Response<
 async fn handle_step_down(req: Request<Body>) -> Result<Response<Body>, ApiError> {
     check_permissions(&req, Scope::Admin)?;
 
+    let req = match maybe_forward(req).await {
+        ForwardOutcome::Forwarded(res) => {
+            return res;
+        }
+        ForwardOutcome::NotForwarded(req) => req,
+    };
+
     let state = get_state(&req);
     json_response(StatusCode::OK, state.service.step_down().await)
 }
 
 async fn handle_tenant_drop(req: Request<Body>) -> Result<Response<Body>, ApiError> {
-    let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
     check_permissions(&req, Scope::PageServerApi)?;
 
+    let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
+
+    let req = match maybe_forward(req).await {
+        ForwardOutcome::Forwarded(res) => {
+            return res;
+        }
+        ForwardOutcome::NotForwarded(req) => req,
+    };
+
     let state = get_state(&req);
 
     json_response(StatusCode::OK, state.service.tenant_drop(tenant_id).await?)
 }
 
 async fn handle_tenant_import(req: Request<Body>) -> Result<Response<Body>, ApiError> {
-    let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
     check_permissions(&req, Scope::PageServerApi)?;
 
+    let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
+
+    let req = match maybe_forward(req).await {
+        ForwardOutcome::Forwarded(res) => {
+            return res;
+        }
+        ForwardOutcome::NotForwarded(req) => req,
+    };
+
     let state = get_state(&req);
 
     json_response(
@@ -759,6 +1045,13 @@ async fn handle_tenant_import(req: Request<Body>) -> Result<Response<Body>, ApiE
 async fn handle_tenants_dump(req: Request<Body>) -> Result<Response<Body>, ApiError> {
     check_permissions(&req, Scope::Admin)?;
 
+    let req = match maybe_forward(req).await {
+        ForwardOutcome::Forwarded(res) => {
+            return res;
+        }
+        ForwardOutcome::NotForwarded(req) => req,
+    };
+
     let state = get_state(&req);
     state.service.tenants_dump()
 }
@@ -766,6 +1059,13 @@ async fn handle_tenants_dump(req: Request<Body>) -> Result<Response<Body>, ApiEr
 async fn handle_scheduler_dump(req: Request<Body>) -> Result<Response<Body>, ApiError> {
     check_permissions(&req, Scope::Admin)?;
 
+    let req = match maybe_forward(req).await {
+        ForwardOutcome::Forwarded(res) => {
+            return res;
+        }
+        ForwardOutcome::NotForwarded(req) => req,
+    };
+
     let state = get_state(&req);
     state.service.scheduler_dump()
 }
@@ -773,6 +1073,13 @@ async fn handle_scheduler_dump(req: Request<Body>) -> Result<Response<Body>, Api
 async fn handle_consistency_check(req: Request<Body>) -> Result<Response<Body>, ApiError> {
     check_permissions(&req, Scope::Admin)?;
 
+    let req = match maybe_forward(req).await {
+        ForwardOutcome::Forwarded(res) => {
+            return res;
+        }
+        ForwardOutcome::NotForwarded(req) => req,
+    };
+
     let state = get_state(&req);
 
     json_response(StatusCode::OK, state.service.consistency_check().await?)
@@ -781,19 +1088,40 @@ async fn handle_consistency_check(req: Request<Body>) -> Result<Response<Body>,
 async fn handle_reconcile_all(req: Request<Body>) -> Result<Response<Body>, ApiError> {
     check_permissions(&req, Scope::Admin)?;
 
+    let req = match maybe_forward(req).await {
+        ForwardOutcome::Forwarded(res) => {
+            return res;
+        }
+        ForwardOutcome::NotForwarded(req) => req,
+    };
+
     let state = get_state(&req);
 
     json_response(StatusCode::OK, state.service.reconcile_all_now().await?)
 }
 
 /// Status endpoint is just used for checking that our HTTP listener is up
-async fn handle_status(_req: Request<Body>) -> Result<Response<Body>, ApiError> {
+async fn handle_status(req: Request<Body>) -> Result<Response<Body>, ApiError> {
+    match maybe_forward(req).await {
+        ForwardOutcome::Forwarded(res) => {
+            return res;
+        }
+        ForwardOutcome::NotForwarded(_req) => {}
+    };
+
     json_response(StatusCode::OK, ())
 }
 
 /// Readiness endpoint indicates when we're done doing startup I/O (e.g. reconciling
 /// with remote pageserver nodes).  This is intended for use as a kubernetes readiness probe.
 async fn handle_ready(req: Request<Body>) -> Result<Response<Body>, ApiError> {
+    let req = match maybe_forward(req).await {
+        ForwardOutcome::Forwarded(res) => {
+            return res;
+        }
+        ForwardOutcome::NotForwarded(req) => req,
+    };
+
     let state = get_state(&req);
     if state.service.startup_complete.is_ready() {
         json_response(StatusCode::OK, ())
@@ -816,6 +1144,13 @@ async fn handle_get_safekeeper(req: Request<Body>) -> Result<Response<Body>, Api
 
     let id = parse_request_param::<i64>(&req, "id")?;
 
+    let req = match maybe_forward(req).await {
+        ForwardOutcome::Forwarded(res) => {
+            return res;
+        }
+        ForwardOutcome::NotForwarded(req) => req,
+    };
+
     let state = get_state(&req);
 
     let res = state.service.get_safekeeper(id).await;
@@ -847,6 +1182,13 @@ async fn handle_upsert_safekeeper(mut req: Request<Body>) -> Result<Response<Bod
         )));
     }
 
+    let req = match maybe_forward(req).await {
+        ForwardOutcome::Forwarded(res) => {
+            return res;
+        }
+        ForwardOutcome::NotForwarded(req) => req,
+    };
+
     let state = get_state(&req);
 
     state.service.upsert_safekeeper(body).await?;
@@ -925,10 +1267,7 @@ pub fn prologue_leadership_status_check_middleware<
 
         let allowed_routes = match leadership_status {
             LeadershipStatus::Leader => AllowedRoutes::All,
-            LeadershipStatus::SteppedDown => {
-                // TODO: does it make sense to allow /status here?
-                AllowedRoutes::Some(["/control/v1/step_down", "/status", "/metrics"].to_vec())
-            }
+            LeadershipStatus::SteppedDown => AllowedRoutes::All,
             LeadershipStatus::Candidate => {
                 AllowedRoutes::Some(["/ready", "/status", "/metrics"].to_vec())
             }
@@ -1005,6 +1344,13 @@ fn epilogue_metrics_middleware<B: hyper::body::HttpBody + Send + Sync + 'static>
 pub async fn measured_metrics_handler(req: Request<Body>) -> Result<Response<Body>, ApiError> {
     pub const TEXT_FORMAT: &str = "text/plain; version=0.0.4";
 
+    let req = match maybe_forward(req).await {
+        ForwardOutcome::Forwarded(res) => {
+            return res;
+        }
+        ForwardOutcome::NotForwarded(req) => req,
+    };
+
     let state = get_state(&req);
     let payload = crate::metrics::METRICS_REGISTRY.encode(&state.neon_metrics);
     let response = Response::builder()
@@ -1032,6 +1378,220 @@ where
     request_span(request, handler).await
 }
 
+enum ForwardOutcome {
+    Forwarded(Result<Response<Body>, ApiError>),
+    NotForwarded(Request<Body>),
+}
+
+/// Potentially forward the request to the current storage controller leader.
+/// More specifically we forward when:
+/// 1. Request is not one of ["/control/v1/step_down", "/status", "/ready", "/metrics"]
+/// 2. Current instance is in [`LeadershipStatus::SteppedDown`] state
+/// 3. There is a leader in the database to forward to
+/// 4. Leader from step (3) is not the current instance
+///
+/// Why forward?
+/// It turns out that we can't rely on external orchestration to promptly route traffic to the
+/// new leader, which induces downtime. Forwarding provides a safe way out.
+///
+/// Why is it safe?
+/// If a storcon instance is persisted in the database, then we know that it is the current leader.
+/// There's one exception: time between handling step-down request and the new leader updating the
+/// database.
+///
+/// Let's treat the happy case first. The stepped down node does not produce any side effects,
+/// since all request handling happens on the leader.
+///
+/// As for the edge case, we are guaranteed to always have a maximum of two running instances.
+/// Hence, if we are in the edge case scenario the leader persisted in the database is the
+/// stepped down instance that received the request. Condition (4) above covers this scenario.
+async fn maybe_forward(req: Request<Body>) -> ForwardOutcome {
+    const NOT_FOR_FORWARD: [&str; 4] = ["/control/v1/step_down", "/status", "/ready", "/metrics"];
+
+    let uri = req.uri().to_string();
+    let uri_for_forward = !NOT_FOR_FORWARD.contains(&uri.as_str());
+
+    let state = get_state(&req);
+    let leadership_status = state.service.get_leadership_status();
+
+    if leadership_status != LeadershipStatus::SteppedDown || !uri_for_forward {
+        return ForwardOutcome::NotForwarded(req);
+    }
+
+    let leader = state.service.get_leader().await;
+    let leader = {
+        match leader {
+            Ok(Some(leader)) => leader,
+            Ok(None) => {
+                return ForwardOutcome::Forwarded(Err(ApiError::ResourceUnavailable(
+                    "No leader to forward to while in stepped down state".into(),
+                )));
+            }
+            Err(err) => {
+                return ForwardOutcome::Forwarded(Err(ApiError::InternalServerError(
+                    anyhow::anyhow!(
+                        "Failed to get leader for forwarding while in stepped down state: {err}"
+                    ),
+                )));
+            }
+        }
+    };
+
+    let cfg = state.service.get_config();
+    if let Some(ref self_addr) = cfg.address_for_peers {
+        let leader_addr = match Uri::from_str(leader.address.as_str()) {
+            Ok(uri) => uri,
+            Err(err) => {
+                return ForwardOutcome::Forwarded(Err(ApiError::InternalServerError(
+                    anyhow::anyhow!(
+                    "Failed to parse leader uri for forwarding while in stepped down state: {err}"
+                ),
+                )));
+            }
+        };
+
+        if *self_addr == leader_addr {
+            return ForwardOutcome::Forwarded(Err(ApiError::InternalServerError(anyhow::anyhow!(
+                "Leader is stepped down instance"
+            ))));
+        }
+    }
+
+    tracing::info!("Forwarding {} to leader at {}", uri, leader.address);
+
+    // Use [`RECONCILE_TIMEOUT`] as the max amount of time a request should block for and
+    // include some leeway to get the timeout for proxied requests.
+    const PROXIED_REQUEST_TIMEOUT: Duration = Duration::from_secs(RECONCILE_TIMEOUT.as_secs() + 10);
+    let client = reqwest::ClientBuilder::new()
+        .timeout(PROXIED_REQUEST_TIMEOUT)
+        .build();
+    let client = match client {
+        Ok(client) => client,
+        Err(err) => {
+            return ForwardOutcome::Forwarded(Err(ApiError::InternalServerError(anyhow::anyhow!(
+                "Failed to build leader client for forwarding while in stepped down state: {err}"
+            ))));
+        }
+    };
+
+    let request: reqwest::Request = match convert_request(req, &client, leader.address).await {
+        Ok(r) => r,
+        Err(err) => {
+            return ForwardOutcome::Forwarded(Err(ApiError::InternalServerError(anyhow::anyhow!(
+                "Failed to convert request for forwarding while in stepped down state: {err}"
+            ))));
+        }
+    };
+
+    let response = match client.execute(request).await {
+        Ok(r) => r,
+        Err(err) => {
+            return ForwardOutcome::Forwarded(Err(ApiError::InternalServerError(anyhow::anyhow!(
+                "Failed to forward while in stepped down state: {err}"
+            ))));
+        }
+    };
+
+    ForwardOutcome::Forwarded(convert_response(response).await)
+}
+
+/// Convert a [`reqwest::Response`] to a [`hyper::Response`] by passing through
+/// a stable representation (string, bytes or integer)
+///
+/// Ideally, we would not have to do this since both types use the http crate
+/// under the hood. However, they use different versions of the crate and keeping
+/// second order dependencies in sync is difficult.
+async fn convert_response(resp: reqwest::Response) -> Result<hyper::Response<Body>, ApiError> {
+    use std::str::FromStr;
+
+    let mut builder = hyper::Response::builder().status(resp.status().as_u16());
+    for (key, value) in resp.headers().into_iter() {
+        let key = hyper::header::HeaderName::from_str(key.as_str()).map_err(|err| {
+            ApiError::InternalServerError(anyhow::anyhow!("Response conversion failed: {err}"))
+        })?;
+
+        let value = hyper::header::HeaderValue::from_bytes(value.as_bytes()).map_err(|err| {
+            ApiError::InternalServerError(anyhow::anyhow!("Response conversion failed: {err}"))
+        })?;
+
+        builder = builder.header(key, value);
+    }
+
+    let body = http::Body::wrap_stream(resp.bytes_stream());
+
+    builder.body(body).map_err(|err| {
+        ApiError::InternalServerError(anyhow::anyhow!("Response conversion failed: {err}"))
+    })
+}
+
+/// Convert a [`hyper::Request`] to a [`reqwest::Request`] by passing through
+/// a stable representation (string, bytes or integer)
+///
+/// See [`convert_response`] for why we are doing it this way.
+async fn convert_request(
+    req: hyper::Request<Body>,
+    client: &reqwest::Client,
+    to_address: String,
+) -> Result<reqwest::Request, ApiError> {
+    use std::str::FromStr;
+
+    let (parts, body) = req.into_parts();
+    let method = reqwest::Method::from_str(parts.method.as_str()).map_err(|err| {
+        ApiError::InternalServerError(anyhow::anyhow!("Request conversion failed: {err}"))
+    })?;
+
+    let path_and_query = parts.uri.path_and_query().ok_or_else(|| {
+        ApiError::InternalServerError(anyhow::anyhow!(
+            "Request conversion failed: no path and query"
+        ))
+    })?;
+
+    let uri = reqwest::Url::from_str(
+        format!(
+            "{}{}",
+            to_address.trim_end_matches("/"),
+            path_and_query.as_str()
+        )
+        .as_str(),
+    )
+    .map_err(|err| {
+        ApiError::InternalServerError(anyhow::anyhow!("Request conversion failed: {err}"))
+    })?;
+
+    let mut headers = reqwest::header::HeaderMap::new();
+    for (key, value) in parts.headers.into_iter() {
+        let key = match key {
+            Some(k) => k,
+            None => {
+                continue;
+            }
+        };
+
+        let key = reqwest::header::HeaderName::from_str(key.as_str()).map_err(|err| {
+            ApiError::InternalServerError(anyhow::anyhow!("Request conversion failed: {err}"))
+        })?;
+
+        let value = reqwest::header::HeaderValue::from_bytes(value.as_bytes()).map_err(|err| {
+            ApiError::InternalServerError(anyhow::anyhow!("Request conversion failed: {err}"))
+        })?;
+
+        headers.insert(key, value);
+    }
+
+    let body = hyper::body::to_bytes(body).await.map_err(|err| {
+        ApiError::InternalServerError(anyhow::anyhow!("Request conversion failed: {err}"))
+    })?;
+
+    client
+        .request(method, uri)
+        .headers(headers)
+        .body(body)
+        .build()
+        .map_err(|err| {
+            ApiError::InternalServerError(anyhow::anyhow!("Request conversion failed: {err}"))
+        })
+}
+
 pub fn make_router(
     service: Arc<Service>,
     auth: Option<Arc<SwappableJwtAuth>>,
diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py
index 2e21f8fb46..2d72dbb2df 100644
--- a/test_runner/regress/test_storage_controller.py
+++ b/test_runner/regress/test_storage_controller.py
@@ -2048,8 +2048,11 @@ def test_storage_controller_step_down(neon_env_builder: NeonEnvBuilder):
     # Make a change to the tenant config to trigger a slow reconcile
     virtual_ps_http = PageserverHttpClient(env.storage_controller_port, lambda: True)
     virtual_ps_http.patch_tenant_config_client_side(tid, {"compaction_threshold": 5}, None)
-    env.storage_controller.allowed_errors.append(
-        ".*Accepted configuration update but reconciliation failed.*"
+    env.storage_controller.allowed_errors.extend(
+        [
+            ".*Accepted configuration update but reconciliation failed.*",
+            ".*Leader is stepped down instance",
+        ]
     )
 
     observed_state = env.storage_controller.step_down()
@@ -2072,9 +2075,9 @@ def test_storage_controller_step_down(neon_env_builder: NeonEnvBuilder):
     assert "compaction_threshold" in ps_tenant_conf.effective_config
     assert ps_tenant_conf.effective_config["compaction_threshold"] == 5
 
-    # Validate that the storcon is not replying to the usual requests
-    # once it has stepped down.
-    with pytest.raises(StorageControllerApiException, match="stepped_down"):
+    # Validate that the storcon attempts to forward the request, but stops
+    # when it realises it is still the current leader.
+    with pytest.raises(StorageControllerApiException, match="Leader is stepped down instance"):
         env.storage_controller.tenant_list()
 
     # Validate that we can step down multiple times and the observed state
@@ -2221,6 +2224,15 @@ def test_storage_controller_leadership_transfer(
     env.storage_controller.wait_until_ready()
     env.storage_controller.consistency_check()
 
+    if not step_down_times_out:
+        # Check that the stepped down instance forwards requests
+        # to the new leader while it's still running.
+        storage_controller_proxy.route_to(f"http://127.0.0.1:{storage_controller_1_port}")
+        env.storage_controller.tenant_list()
+        env.storage_controller.node_configure(env.pageservers[0].id, {"scheduling": "Pause"})
+        status = env.storage_controller.node_status(env.pageservers[0].id)
+        assert status["scheduling"] == "Pause"
+
     if step_down_times_out:
         env.storage_controller.allowed_errors.extend(
             [

From cd4276fd656b9e917ee315240d957d344fc2cfe6 Mon Sep 17 00:00:00 2001
From: Alexander Bayandin <alexander@neon.tech>
Date: Tue, 17 Sep 2024 10:17:48 +0100
Subject: [PATCH 067/142] CI: fix release pipeline (#9017)

## Problem

We've got 2 non-blocking failures on the release pipeline:
- `promote-compatibility-data` job got skipped _presumably_ because one
of the dependencies of `deploy` job (`push-to-acr-dev`) got skipped
(https://github.com/neondatabase/neon/pull/8940)
- `coverage-report` job fails because we don't build debug artifacts in
the release branch (https://github.com/neondatabase/neon/pull/8561)

## Summary of changes
- Always run `push-to-acr-dev` / `push-to-acr-prod` jobs, but add
`skip_if` parameter to the reusable workflow, which can skip the job
internally, without skipping externally
- Do not run `coverage-report` on release branches
---
 .github/workflows/_push-to-acr.yml   | 8 +++++++-
 .github/workflows/build_and_test.yml | 7 ++++---
 2 files changed, 11 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/_push-to-acr.yml b/.github/workflows/_push-to-acr.yml
index 415b3d9cc6..7b6eba2c06 100644
--- a/.github/workflows/_push-to-acr.yml
+++ b/.github/workflows/_push-to-acr.yml
@@ -26,9 +26,15 @@ on:
         description: Azure tenant ID
         required: true
         type: string
+      skip_if:
+        description: Skip the job if this expression is true
+        required: true
+        type: boolean
 
 jobs:
   push-to-acr:
+    if: ${{ !inputs.skip_if }}
+
     runs-on: ubuntu-22.04
     permissions:
       contents: read  # This is required for actions/checkout
@@ -52,5 +58,5 @@ jobs:
           for image in ${images}; do
             docker buildx imagetools create \
               -t ${{ inputs.registry_name }}.azurecr.io/neondatabase/${image}:${{ inputs.image_tag }} \
-                                        neondatabase/${image}:${{ inputs.image_tag }}
+                                                        neondatabase/${image}:${{ inputs.image_tag }}
           done
diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml
index 7c06fd9ab8..7ddd624dd5 100644
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -357,6 +357,7 @@ jobs:
             })
 
   coverage-report:
+    if: ${{ !startsWith(github.ref_name, 'release') }}
     needs: [ check-permissions, build-build-tools-image, build-and-test-locally ]
     runs-on: [ self-hosted, small ]
     container:
@@ -858,7 +859,6 @@ jobs:
           done
 
   push-to-acr-dev:
-    if: github.ref_name == 'main'
     needs: [ tag, promote-images ]
     uses: ./.github/workflows/_push-to-acr.yml
     with:
@@ -868,9 +868,9 @@ jobs:
       registry_name: ${{ vars.AZURE_DEV_REGISTRY_NAME }}
       subscription_id: ${{ vars.AZURE_DEV_SUBSCRIPTION_ID }}
       tenant_id: ${{ vars.AZURE_TENANT_ID }}
+      skip_if: ${{ github.ref_name != 'main' }}
 
   push-to-acr-prod:
-    if: github.ref_name == 'release'|| github.ref_name == 'release-proxy'
     needs: [ tag, promote-images ]
     uses: ./.github/workflows/_push-to-acr.yml
     with:
@@ -880,6 +880,7 @@ jobs:
       registry_name: ${{ vars.AZURE_PROD_REGISTRY_NAME }}
       subscription_id: ${{ vars.AZURE_PROD_SUBSCRIPTION_ID }}
       tenant_id: ${{ vars.AZURE_TENANT_ID }}
+      skip_if: ${{ !startsWith(github.ref_name, 'release') }}
 
   trigger-custom-extensions-build-and-wait:
     needs: [ check-permissions, tag ]
@@ -957,7 +958,7 @@ jobs:
 
   deploy:
     needs: [ check-permissions, promote-images, tag, build-and-test-locally, trigger-custom-extensions-build-and-wait, push-to-acr-dev, push-to-acr-prod ]
-    if: (github.ref_name == 'main' || github.ref_name == 'release' || github.ref_name == 'release-proxy') && !failure() && !cancelled()
+    if: github.ref_name == 'main' || github.ref_name == 'release' || github.ref_name == 'release-proxy'
 
     runs-on: [ self-hosted, small ]
     container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:latest

From d211f00f054b80515684ff0887b76ecec60cc796 Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki@neon.tech>
Date: Tue, 17 Sep 2024 17:55:45 +0300
Subject: [PATCH 068/142] Remove unnecessary dependencies (#9000)

Found by "cargo machete"
---
 Cargo.lock                             | 144 ++-----------------------
 compute_tools/Cargo.toml               |   3 -
 control_plane/Cargo.toml               |   5 -
 control_plane/storcon_cli/Cargo.toml   |   3 -
 libs/compute_api/Cargo.toml            |   1 -
 libs/consumption_metrics/Cargo.toml    |   3 -
 libs/desim/Cargo.toml                  |   1 -
 libs/postgres_backend/Cargo.toml       |   2 -
 libs/postgres_ffi/Cargo.toml           |   3 -
 libs/postgres_ffi/wal_craft/Cargo.toml |   1 -
 libs/pq_proto/Cargo.toml               |   2 -
 libs/remote_storage/Cargo.toml         |   3 -
 libs/safekeeper_api/Cargo.toml         |   1 -
 libs/tracing-utils/Cargo.toml          |   5 +-
 libs/utils/Cargo.toml                  |   1 -
 libs/vm_monitor/Cargo.toml             |   2 -
 pageserver/Cargo.toml                  |   9 --
 pageserver/compaction/Cargo.toml       |  22 ----
 pageserver/ctl/Cargo.toml              |   2 -
 proxy/Cargo.toml                       |   7 --
 safekeeper/Cargo.toml                  |   5 -
 storage_broker/Cargo.toml              |   2 -
 storage_controller/Cargo.toml          |   2 -
 storage_controller/client/Cargo.toml   |  11 --
 storage_scrubber/Cargo.toml            |   8 --
 workspace_hack/Cargo.toml              |   3 +
 26 files changed, 15 insertions(+), 236 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index 3ca6acbc3e..136f07956f 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1209,7 +1209,6 @@ dependencies = [
  "remote_storage",
  "serde",
  "serde_json",
- "serde_with",
  "utils",
 ]
 
@@ -1218,7 +1217,6 @@ name = "compute_tools"
 version = "0.1.0"
 dependencies = [
  "anyhow",
- "async-compression",
  "bytes",
  "cfg-if",
  "chrono",
@@ -1237,7 +1235,6 @@ dependencies = [
  "reqwest 0.12.4",
  "rlimit",
  "rust-ini",
- "serde",
  "serde_json",
  "signal-hook",
  "tar",
@@ -1246,7 +1243,6 @@ dependencies = [
  "tokio-postgres",
  "tokio-stream",
  "tokio-util",
- "toml_edit",
  "tracing",
  "tracing-opentelemetry",
  "tracing-subscriber",
@@ -1317,12 +1313,9 @@ dependencies = [
 name = "consumption_metrics"
 version = "0.1.0"
 dependencies = [
- "anyhow",
  "chrono",
  "rand 0.8.5",
  "serde",
- "serde_with",
- "utils",
 ]
 
 [[package]]
@@ -1334,9 +1327,7 @@ dependencies = [
  "clap",
  "comfy-table",
  "compute_api",
- "futures",
  "git-version",
- "hex",
  "humantime",
  "humantime-serde",
  "hyper 0.14.26",
@@ -1344,7 +1335,6 @@ dependencies = [
  "once_cell",
  "pageserver_api",
  "pageserver_client",
- "postgres",
  "postgres_backend",
  "postgres_connection",
  "regex",
@@ -1353,9 +1343,7 @@ dependencies = [
  "scopeguard",
  "serde",
  "serde_json",
- "serde_with",
  "storage_broker",
- "tar",
  "thiserror",
  "tokio",
  "tokio-postgres",
@@ -1663,7 +1651,6 @@ dependencies = [
  "hex",
  "parking_lot 0.12.1",
  "rand 0.8.5",
- "scopeguard",
  "smallvec",
  "tracing",
  "utils",
@@ -2233,24 +2220,22 @@ checksum = "4271d37baee1b8c7e4b708028c57d816cf9d2434acb33a549475f78c181f6253"
 
 [[package]]
 name = "git-version"
-version = "0.3.5"
+version = "0.3.9"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f6b0decc02f4636b9ccad390dcbe77b722a77efedfa393caf8379a51d5c61899"
+checksum = "1ad568aa3db0fcbc81f2f116137f263d7304f512a1209b35b85150d3ef88ad19"
 dependencies = [
  "git-version-macro",
- "proc-macro-hack",
 ]
 
 [[package]]
 name = "git-version-macro"
-version = "0.3.5"
+version = "0.3.9"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "fe69f1cbdb6e28af2bac214e943b99ce8a0a06b447d15d3e61161b0423139f3f"
+checksum = "53010ccb100b96a67bc32c0175f0ed1426b31b655d562898e57325f81c023ac0"
 dependencies = [
- "proc-macro-hack",
  "proc-macro2",
  "quote",
- "syn 1.0.109",
+ "syn 2.0.52",
 ]
 
 [[package]]
@@ -2744,19 +2729,6 @@ dependencies = [
  "libc",
 ]
 
-[[package]]
-name = "inotify"
-version = "0.10.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "fdd168d97690d0b8c412d6b6c10360277f4d7ee495c5d0d5d5fe0854923255cc"
-dependencies = [
- "bitflags 1.3.2",
- "futures-core",
- "inotify-sys",
- "libc",
- "tokio",
-]
-
 [[package]]
 name = "inotify-sys"
 version = "0.1.5"
@@ -3251,7 +3223,7 @@ dependencies = [
  "crossbeam-channel",
  "filetime",
  "fsevent-sys",
- "inotify 0.9.6",
+ "inotify",
  "kqueue",
  "libc",
  "log",
@@ -3642,7 +3614,6 @@ name = "pagectl"
 version = "0.1.0"
 dependencies = [
  "anyhow",
- "bytes",
  "camino",
  "clap",
  "git-version",
@@ -3651,7 +3622,6 @@ dependencies = [
  "pageserver_api",
  "postgres_ffi",
  "remote_storage",
- "serde",
  "serde_json",
  "svg_fmt",
  "thiserror",
@@ -3670,7 +3640,6 @@ dependencies = [
  "arc-swap",
  "async-compression",
  "async-stream",
- "async-trait",
  "bit_field",
  "byteorder",
  "bytes",
@@ -3678,16 +3647,13 @@ dependencies = [
  "camino-tempfile",
  "chrono",
  "clap",
- "const_format",
  "consumption_metrics",
  "crc32c",
  "criterion",
- "crossbeam-utils",
  "either",
  "enum-map",
  "enumset",
  "fail",
- "flate2",
  "futures",
  "git-version",
  "hex",
@@ -3726,13 +3692,9 @@ dependencies = [
  "serde_json",
  "serde_path_to_error",
  "serde_with",
- "signal-hook",
- "smallvec",
  "storage_broker",
  "strum",
  "strum_macros",
- "svg_fmt",
- "sync_wrapper",
  "sysinfo",
  "tenant_size_model",
  "thiserror",
@@ -3746,7 +3708,6 @@ dependencies = [
  "tokio-util",
  "toml_edit",
  "tracing",
- "twox-hash",
  "url",
  "utils",
  "walkdir",
@@ -3810,44 +3771,22 @@ name = "pageserver_compaction"
 version = "0.1.0"
 dependencies = [
  "anyhow",
- "async-compression",
  "async-stream",
- "byteorder",
- "bytes",
- "chrono",
  "clap",
- "const_format",
- "consumption_metrics",
  "criterion",
- "crossbeam-utils",
- "either",
- "fail",
- "flate2",
  "futures",
  "git-version",
- "hex",
  "hex-literal",
- "humantime",
- "humantime-serde",
  "itertools 0.10.5",
- "metrics",
  "once_cell",
  "pageserver_api",
  "pin-project-lite",
  "rand 0.8.5",
- "smallvec",
  "svg_fmt",
- "sync_wrapper",
- "thiserror",
  "tokio",
- "tokio-io-timeout",
- "tokio-util",
  "tracing",
- "tracing-error",
  "tracing-subscriber",
- "url",
  "utils",
- "walkdir",
  "workspace_hack",
 ]
 
@@ -4164,9 +4103,7 @@ name = "postgres_backend"
 version = "0.1.0"
 dependencies = [
  "anyhow",
- "async-trait",
  "bytes",
- "futures",
  "once_cell",
  "pq_proto",
  "rustls 0.22.4",
@@ -4199,16 +4136,13 @@ version = "0.1.0"
 dependencies = [
  "anyhow",
  "bindgen",
- "byteorder",
  "bytes",
  "crc32c",
  "env_logger",
- "hex",
  "log",
  "memoffset 0.8.0",
  "once_cell",
  "postgres",
- "rand 0.8.5",
  "regex",
  "serde",
  "thiserror",
@@ -4243,13 +4177,11 @@ dependencies = [
  "byteorder",
  "bytes",
  "itertools 0.10.5",
- "pin-project-lite",
  "postgres-protocol",
  "rand 0.8.5",
  "serde",
  "thiserror",
  "tokio",
- "tracing",
 ]
 
 [[package]]
@@ -4281,12 +4213,6 @@ dependencies = [
  "elliptic-curve 0.13.8",
 ]
 
-[[package]]
-name = "proc-macro-hack"
-version = "0.5.20+deprecated"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "dc375e1527247fe1a97d8b7156678dfe7c1af2fc075c9a4db3690ecd2a148068"
-
 [[package]]
 name = "proc-macro2"
 version = "1.0.78"
@@ -4405,7 +4331,6 @@ dependencies = [
  "aws-config",
  "aws-sdk-iam",
  "aws-sigv4",
- "aws-types",
  "base64 0.13.1",
  "bstr",
  "bytes",
@@ -4414,7 +4339,6 @@ dependencies = [
  "chrono",
  "clap",
  "consumption_metrics",
- "crossbeam-deque",
  "dashmap",
  "ecdsa 0.16.9",
  "env_logger",
@@ -4440,11 +4364,9 @@ dependencies = [
  "jose-jwa",
  "jose-jwk",
  "lasso",
- "md5",
  "measured",
  "metrics",
  "once_cell",
- "opentelemetry",
  "p256 0.13.2",
  "parking_lot 0.12.1",
  "parquet",
@@ -4465,7 +4387,6 @@ dependencies = [
  "reqwest-middleware",
  "reqwest-retry",
  "reqwest-tracing",
- "routerify",
  "rsa",
  "rstest",
  "rustc-hash",
@@ -4481,7 +4402,6 @@ dependencies = [
  "smol_str",
  "socket2 0.5.5",
  "subtle",
- "task-local-extensions",
  "thiserror",
  "tikv-jemalloc-ctl",
  "tikv-jemallocator",
@@ -4491,7 +4411,6 @@ dependencies = [
  "tokio-rustls 0.25.0",
  "tokio-tungstenite",
  "tokio-util",
- "tower-service",
  "tracing",
  "tracing-opentelemetry",
  "tracing-subscriber",
@@ -4781,7 +4700,6 @@ dependencies = [
  "async-stream",
  "async-trait",
  "aws-config",
- "aws-credential-types",
  "aws-sdk-s3",
  "aws-smithy-async",
  "aws-smithy-types",
@@ -4795,7 +4713,6 @@ dependencies = [
  "futures",
  "futures-util",
  "http-types",
- "humantime",
  "humantime-serde",
  "hyper 0.14.26",
  "itertools 0.10.5",
@@ -5275,14 +5192,12 @@ version = "0.1.0"
 dependencies = [
  "anyhow",
  "async-stream",
- "async-trait",
  "byteorder",
  "bytes",
  "camino",
  "camino-tempfile",
  "chrono",
  "clap",
- "const_format",
  "crc32c",
  "desim",
  "fail",
@@ -5308,9 +5223,7 @@ dependencies = [
  "sd-notify",
  "serde",
  "serde_json",
- "serde_with",
  "sha2",
- "signal-hook",
  "storage_broker",
  "strum",
  "strum_macros",
@@ -5321,7 +5234,6 @@ dependencies = [
  "tokio-stream",
  "tokio-tar",
  "tokio-util",
- "toml_edit",
  "tracing",
  "tracing-subscriber",
  "url",
@@ -5336,7 +5248,6 @@ version = "0.1.0"
 dependencies = [
  "const_format",
  "serde",
- "serde_with",
  "utils",
 ]
 
@@ -5865,7 +5776,6 @@ version = "0.1.0"
 dependencies = [
  "anyhow",
  "async-stream",
- "bytes",
  "clap",
  "const_format",
  "futures",
@@ -5879,7 +5789,6 @@ dependencies = [
  "parking_lot 0.12.1",
  "prost",
  "tokio",
- "tokio-stream",
  "tonic",
  "tonic-build",
  "tracing",
@@ -5892,9 +5801,7 @@ name = "storage_controller"
 version = "0.1.0"
 dependencies = [
  "anyhow",
- "aws-config",
  "bytes",
- "camino",
  "chrono",
  "clap",
  "control_plane",
@@ -5935,20 +5842,9 @@ dependencies = [
 name = "storage_controller_client"
 version = "0.1.0"
 dependencies = [
- "anyhow",
- "bytes",
- "futures",
- "pageserver_api",
  "pageserver_client",
- "postgres",
  "reqwest 0.12.4",
  "serde",
- "thiserror",
- "tokio",
- "tokio-postgres",
- "tokio-stream",
- "tokio-util",
- "utils",
  "workspace_hack",
 ]
 
@@ -5960,13 +5856,9 @@ dependencies = [
  "async-stream",
  "aws-config",
  "aws-sdk-s3",
- "aws-smithy-async",
- "bincode",
- "bytes",
  "camino",
  "chrono",
  "clap",
- "crc32c",
  "either",
  "futures",
  "futures-util",
@@ -5978,20 +5870,16 @@ dependencies = [
  "pageserver",
  "pageserver_api",
  "postgres_ffi",
- "rand 0.8.5",
  "remote_storage",
  "reqwest 0.12.4",
  "rustls 0.22.4",
  "rustls-native-certs 0.7.0",
  "serde",
  "serde_json",
- "serde_with",
  "storage_controller_client",
- "thiserror",
  "tokio",
  "tokio-postgres",
  "tokio-postgres-rustls",
- "tokio-rustls 0.25.0",
  "tokio-stream",
  "tokio-util",
  "tracing",
@@ -6010,14 +5898,11 @@ dependencies = [
  "comfy-table",
  "futures",
  "humantime",
- "hyper 0.14.26",
  "pageserver_api",
  "pageserver_client",
  "reqwest 0.12.4",
- "serde",
  "serde_json",
  "storage_controller_client",
- "thiserror",
  "tokio",
  "tracing",
  "utils",
@@ -6140,15 +6025,6 @@ dependencies = [
  "xattr",
 ]
 
-[[package]]
-name = "task-local-extensions"
-version = "0.1.4"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ba323866e5d033818e3240feeb9f7db2c4296674e4d9e16b97b7bf8f490434e8"
-dependencies = [
- "pin-utils",
-]
-
 [[package]]
 name = "tempfile"
 version = "3.9.0"
@@ -6739,7 +6615,6 @@ dependencies = [
  "opentelemetry",
  "opentelemetry-otlp",
  "opentelemetry-semantic-conventions",
- "reqwest 0.12.4",
  "tokio",
  "tracing",
  "tracing-opentelemetry",
@@ -6943,7 +6818,6 @@ dependencies = [
  "serde_assert",
  "serde_json",
  "serde_path_to_error",
- "serde_with",
  "signal-hook",
  "strum",
  "strum_macros",
@@ -6999,13 +6873,11 @@ dependencies = [
  "cgroups-rs",
  "clap",
  "futures",
- "inotify 0.10.2",
  "serde",
  "serde_json",
  "sysinfo",
  "tokio",
  "tokio-postgres",
- "tokio-stream",
  "tokio-util",
  "tracing",
  "tracing-subscriber",
@@ -7032,7 +6904,6 @@ dependencies = [
  "clap",
  "env_logger",
  "log",
- "once_cell",
  "postgres",
  "postgres_ffi",
  "regex",
@@ -7555,6 +7426,7 @@ dependencies = [
  "digest",
  "either",
  "fail",
+ "futures",
  "futures-channel",
  "futures-executor",
  "futures-io",
@@ -7610,6 +7482,8 @@ dependencies = [
  "tower",
  "tracing",
  "tracing-core",
+ "tracing-log",
+ "tracing-subscriber",
  "url",
  "uuid",
  "zeroize",
diff --git a/compute_tools/Cargo.toml b/compute_tools/Cargo.toml
index 8af0ed43ce..00a82e4be6 100644
--- a/compute_tools/Cargo.toml
+++ b/compute_tools/Cargo.toml
@@ -11,7 +11,6 @@ testing = []
 
 [dependencies]
 anyhow.workspace = true
-async-compression.workspace = true
 chrono.workspace = true
 cfg-if.workspace = true
 clap.workspace = true
@@ -24,7 +23,6 @@ num_cpus.workspace = true
 opentelemetry.workspace = true
 postgres.workspace = true
 regex.workspace = true
-serde.workspace = true
 serde_json.workspace = true
 signal-hook.workspace = true
 tar.workspace = true
@@ -43,7 +41,6 @@ url.workspace = true
 compute_api.workspace = true
 utils.workspace = true
 workspace_hack.workspace = true
-toml_edit.workspace = true
 remote_storage = { version = "0.1", path = "../libs/remote_storage/" }
 vm_monitor = { version = "0.1", path = "../libs/vm_monitor/" }
 zstd = "0.13"
diff --git a/control_plane/Cargo.toml b/control_plane/Cargo.toml
index 6fca59b368..c185d20484 100644
--- a/control_plane/Cargo.toml
+++ b/control_plane/Cargo.toml
@@ -9,13 +9,10 @@ anyhow.workspace = true
 camino.workspace = true
 clap.workspace = true
 comfy-table.workspace = true
-futures.workspace = true
 git-version.workspace = true
 humantime.workspace = true
 nix.workspace = true
 once_cell.workspace = true
-postgres.workspace = true
-hex.workspace = true
 humantime-serde.workspace = true
 hyper.workspace = true
 regex.workspace = true
@@ -23,8 +20,6 @@ reqwest = { workspace = true, features = ["blocking", "json"] }
 scopeguard.workspace = true
 serde.workspace = true
 serde_json.workspace = true
-serde_with.workspace = true
-tar.workspace = true
 thiserror.workspace = true
 toml.workspace = true
 toml_edit.workspace = true
diff --git a/control_plane/storcon_cli/Cargo.toml b/control_plane/storcon_cli/Cargo.toml
index be69208d0d..ce89116691 100644
--- a/control_plane/storcon_cli/Cargo.toml
+++ b/control_plane/storcon_cli/Cargo.toml
@@ -11,14 +11,11 @@ clap.workspace = true
 comfy-table.workspace = true
 futures.workspace = true
 humantime.workspace = true
-hyper.workspace = true
 pageserver_api.workspace = true
 pageserver_client.workspace = true
 reqwest.workspace = true
-serde.workspace = true
 serde_json = { workspace = true, features = ["raw_value"] }
 storage_controller_client.workspace = true
-thiserror.workspace = true
 tokio.workspace = true
 tracing.workspace = true
 utils.workspace = true
diff --git a/libs/compute_api/Cargo.toml b/libs/compute_api/Cargo.toml
index 8aaa481f8c..c0ec40a6c2 100644
--- a/libs/compute_api/Cargo.toml
+++ b/libs/compute_api/Cargo.toml
@@ -8,7 +8,6 @@ license.workspace = true
 anyhow.workspace = true
 chrono.workspace = true
 serde.workspace = true
-serde_with.workspace = true
 serde_json.workspace = true
 regex.workspace = true
 
diff --git a/libs/consumption_metrics/Cargo.toml b/libs/consumption_metrics/Cargo.toml
index a40b74b952..0e517e3856 100644
--- a/libs/consumption_metrics/Cargo.toml
+++ b/libs/consumption_metrics/Cargo.toml
@@ -5,9 +5,6 @@ edition = "2021"
 license = "Apache-2.0"
 
 [dependencies]
-anyhow.workspace = true
 chrono = { workspace = true, features = ["serde"] }
 rand.workspace = true
 serde.workspace = true
-serde_with.workspace = true
-utils.workspace = true
diff --git a/libs/desim/Cargo.toml b/libs/desim/Cargo.toml
index 0c4be90267..473f3a2a13 100644
--- a/libs/desim/Cargo.toml
+++ b/libs/desim/Cargo.toml
@@ -12,5 +12,4 @@ bytes.workspace = true
 utils.workspace = true
 parking_lot.workspace = true
 hex.workspace = true
-scopeguard.workspace = true
 smallvec = { workspace = true, features = ["write"] }
diff --git a/libs/postgres_backend/Cargo.toml b/libs/postgres_backend/Cargo.toml
index f6854328fc..a0c87263ed 100644
--- a/libs/postgres_backend/Cargo.toml
+++ b/libs/postgres_backend/Cargo.toml
@@ -5,10 +5,8 @@ edition.workspace = true
 license.workspace = true
 
 [dependencies]
-async-trait.workspace = true
 anyhow.workspace = true
 bytes.workspace = true
-futures.workspace = true
 rustls.workspace = true
 serde.workspace = true
 thiserror.workspace = true
diff --git a/libs/postgres_ffi/Cargo.toml b/libs/postgres_ffi/Cargo.toml
index ee69878f69..ef17833a48 100644
--- a/libs/postgres_ffi/Cargo.toml
+++ b/libs/postgres_ffi/Cargo.toml
@@ -5,13 +5,10 @@ edition.workspace = true
 license.workspace = true
 
 [dependencies]
-rand.workspace = true
 regex.workspace = true
 bytes.workspace = true
-byteorder.workspace = true
 anyhow.workspace = true
 crc32c.workspace = true
-hex.workspace = true
 once_cell.workspace = true
 log.workspace = true
 memoffset.workspace = true
diff --git a/libs/postgres_ffi/wal_craft/Cargo.toml b/libs/postgres_ffi/wal_craft/Cargo.toml
index 29dd01a936..14c7d2e340 100644
--- a/libs/postgres_ffi/wal_craft/Cargo.toml
+++ b/libs/postgres_ffi/wal_craft/Cargo.toml
@@ -9,7 +9,6 @@ anyhow.workspace = true
 clap.workspace = true
 env_logger.workspace = true
 log.workspace = true
-once_cell.workspace = true
 postgres.workspace = true
 postgres_ffi.workspace = true
 camino-tempfile.workspace = true
diff --git a/libs/pq_proto/Cargo.toml b/libs/pq_proto/Cargo.toml
index 66bbe03ebc..9524a1490d 100644
--- a/libs/pq_proto/Cargo.toml
+++ b/libs/pq_proto/Cargo.toml
@@ -8,10 +8,8 @@ license.workspace = true
 bytes.workspace = true
 byteorder.workspace = true
 itertools.workspace = true
-pin-project-lite.workspace = true
 postgres-protocol.workspace = true
 rand.workspace = true
 tokio = { workspace = true, features = ["io-util"] }
-tracing.workspace = true
 thiserror.workspace = true
 serde.workspace = true
diff --git a/libs/remote_storage/Cargo.toml b/libs/remote_storage/Cargo.toml
index 02adee058f..f48f1801a4 100644
--- a/libs/remote_storage/Cargo.toml
+++ b/libs/remote_storage/Cargo.toml
@@ -13,14 +13,11 @@ aws-smithy-async.workspace = true
 aws-smithy-types.workspace = true
 aws-config.workspace = true
 aws-sdk-s3.workspace = true
-aws-credential-types.workspace = true
 bytes.workspace = true
 camino = { workspace = true, features = ["serde1"] }
-humantime.workspace = true
 humantime-serde.workspace = true
 hyper = { workspace = true, features = ["stream"] }
 futures.workspace = true
-rand.workspace = true
 serde.workspace = true
 serde_json.workspace = true
 tokio = { workspace = true, features = ["sync", "fs", "io-util"] }
diff --git a/libs/safekeeper_api/Cargo.toml b/libs/safekeeper_api/Cargo.toml
index e1f4bcca46..14811232d3 100644
--- a/libs/safekeeper_api/Cargo.toml
+++ b/libs/safekeeper_api/Cargo.toml
@@ -6,6 +6,5 @@ license.workspace = true
 
 [dependencies]
 serde.workspace = true
-serde_with.workspace = true
 const_format.workspace = true
 utils.workspace = true
diff --git a/libs/tracing-utils/Cargo.toml b/libs/tracing-utils/Cargo.toml
index 5ea8db6b42..05eb538d42 100644
--- a/libs/tracing-utils/Cargo.toml
+++ b/libs/tracing-utils/Cargo.toml
@@ -9,8 +9,9 @@ hyper.workspace = true
 opentelemetry = { workspace = true, features=["rt-tokio"] }
 opentelemetry-otlp = { workspace = true, default-features=false, features = ["http-proto", "trace", "http", "reqwest-client"] }
 opentelemetry-semantic-conventions.workspace = true
-reqwest = { workspace = true, default-features = false, features = ["rustls-tls"] }
 tokio = { workspace = true, features = ["rt", "rt-multi-thread"] }
 tracing.workspace = true
 tracing-opentelemetry.workspace = true
-tracing-subscriber.workspace = true
+
+[dev-dependencies]
+tracing-subscriber.workspace = true    # For examples in docs
diff --git a/libs/utils/Cargo.toml b/libs/utils/Cargo.toml
index 19deaab63f..f199b15554 100644
--- a/libs/utils/Cargo.toml
+++ b/libs/utils/Cargo.toml
@@ -42,7 +42,6 @@ tracing.workspace = true
 tracing-error.workspace = true
 tracing-subscriber = { workspace = true, features = ["json", "registry"] }
 rand.workspace = true
-serde_with.workspace = true
 strum.workspace = true
 strum_macros.workspace = true
 url.workspace = true
diff --git a/libs/vm_monitor/Cargo.toml b/libs/vm_monitor/Cargo.toml
index 46e9f880a1..ba73902d38 100644
--- a/libs/vm_monitor/Cargo.toml
+++ b/libs/vm_monitor/Cargo.toml
@@ -15,13 +15,11 @@ anyhow.workspace = true
 axum.workspace = true
 clap.workspace = true
 futures.workspace = true
-inotify.workspace = true
 serde.workspace = true
 serde_json.workspace = true
 sysinfo.workspace = true
 tokio = { workspace = true, features = ["rt-multi-thread"] }
 tokio-postgres.workspace = true
-tokio-stream.workspace = true
 tokio-util.workspace = true
 tracing.workspace = true
 tracing-subscriber.workspace = true
diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml
index 24373afca3..0eb48d6823 100644
--- a/pageserver/Cargo.toml
+++ b/pageserver/Cargo.toml
@@ -15,7 +15,6 @@ anyhow.workspace = true
 arc-swap.workspace = true
 async-compression.workspace = true
 async-stream.workspace = true
-async-trait.workspace = true
 bit_field.workspace = true
 byteorder.workspace = true
 bytes.workspace = true
@@ -23,12 +22,9 @@ camino.workspace = true
 camino-tempfile.workspace = true
 chrono = { workspace = true, features = ["serde"] }
 clap = { workspace = true, features = ["string"] }
-const_format.workspace = true
 consumption_metrics.workspace = true
 crc32c.workspace = true
-crossbeam-utils.workspace = true
 either.workspace = true
-flate2.workspace = true
 fail.workspace = true
 futures.workspace = true
 git-version.workspace = true
@@ -57,10 +53,6 @@ serde.workspace = true
 serde_json = { workspace = true, features = ["raw_value"] }
 serde_path_to_error.workspace = true
 serde_with.workspace = true
-signal-hook.workspace = true
-smallvec = { workspace = true, features = ["write"] }
-svg_fmt.workspace = true
-sync_wrapper.workspace = true
 sysinfo.workspace = true
 tokio-tar.workspace = true
 thiserror.workspace = true
@@ -73,7 +65,6 @@ tokio-stream.workspace = true
 tokio-util.workspace = true
 toml_edit = { workspace = true, features = [ "serde" ] }
 tracing.workspace = true
-twox-hash.workspace = true
 url.workspace = true
 walkdir.workspace = true
 metrics.workspace = true
diff --git a/pageserver/compaction/Cargo.toml b/pageserver/compaction/Cargo.toml
index 0fd1d81845..52b58fc298 100644
--- a/pageserver/compaction/Cargo.toml
+++ b/pageserver/compaction/Cargo.toml
@@ -9,41 +9,19 @@ default = []
 
 [dependencies]
 anyhow.workspace = true
-async-compression.workspace = true
 async-stream.workspace = true
-byteorder.workspace = true
-bytes.workspace = true
-chrono = { workspace = true, features = ["serde"] }
 clap = { workspace = true, features = ["string"] }
-const_format.workspace = true
-consumption_metrics.workspace = true
-crossbeam-utils.workspace = true
-either.workspace = true
-flate2.workspace = true
-fail.workspace = true
 futures.workspace = true
 git-version.workspace = true
-hex.workspace = true
-humantime.workspace = true
-humantime-serde.workspace = true
 itertools.workspace = true
 once_cell.workspace = true
 pageserver_api.workspace = true
 pin-project-lite.workspace = true
 rand.workspace = true
-smallvec = { workspace = true, features = ["write"] }
 svg_fmt.workspace = true
-sync_wrapper.workspace = true
-thiserror.workspace = true
 tokio = { workspace = true, features = ["process", "sync", "fs", "rt", "io-util", "time"] }
-tokio-io-timeout.workspace = true
-tokio-util.workspace = true
 tracing.workspace = true
-tracing-error.workspace = true
 tracing-subscriber.workspace = true
-url.workspace = true
-walkdir.workspace = true
-metrics.workspace = true
 utils.workspace = true
 workspace_hack.workspace = true
 
diff --git a/pageserver/ctl/Cargo.toml b/pageserver/ctl/Cargo.toml
index be5626040b..9592002de1 100644
--- a/pageserver/ctl/Cargo.toml
+++ b/pageserver/ctl/Cargo.toml
@@ -8,7 +8,6 @@ license.workspace = true
 
 [dependencies]
 anyhow.workspace = true
-bytes.workspace = true
 camino.workspace = true
 clap = { workspace = true, features = ["string"] }
 git-version.workspace = true
@@ -24,5 +23,4 @@ toml_edit.workspace = true
 utils.workspace = true
 svg_fmt.workspace = true
 workspace_hack.workspace = true
-serde.workspace = true
 serde_json.workspace = true
diff --git a/proxy/Cargo.toml b/proxy/Cargo.toml
index 21d92abb20..6703eb06eb 100644
--- a/proxy/Cargo.toml
+++ b/proxy/Cargo.toml
@@ -18,7 +18,6 @@ atomic-take.workspace = true
 aws-config.workspace = true
 aws-sdk-iam.workspace = true
 aws-sigv4.workspace = true
-aws-types.workspace = true
 base64.workspace = true
 bstr.workspace = true
 bytes = { workspace = true, features = ["serde"] }
@@ -26,7 +25,6 @@ camino.workspace = true
 chrono.workspace = true
 clap.workspace = true
 consumption_metrics.workspace = true
-crossbeam-deque.workspace = true
 dashmap.workspace = true
 env_logger.workspace = true
 framed-websockets.workspace = true
@@ -48,11 +46,9 @@ indexmap.workspace = true
 ipnet.workspace = true
 itertools.workspace = true
 lasso = { workspace = true, features = ["multi-threaded"] }
-md5.workspace = true
 measured = { workspace = true, features = ["lasso"] }
 metrics.workspace = true
 once_cell.workspace = true
-opentelemetry.workspace = true
 parking_lot.workspace = true
 parquet.workspace = true
 parquet_derive.workspace = true
@@ -67,7 +63,6 @@ reqwest.workspace = true
 reqwest-middleware = { workspace = true, features = ["json"] }
 reqwest-retry.workspace = true
 reqwest-tracing.workspace = true
-routerify.workspace = true
 rustc-hash.workspace = true
 rustls-pemfile.workspace = true
 rustls.workspace = true
@@ -79,7 +74,6 @@ smol_str.workspace = true
 smallvec.workspace = true
 socket2.workspace = true
 subtle.workspace = true
-task-local-extensions.workspace = true
 thiserror.workspace = true
 tikv-jemallocator.workspace = true
 tikv-jemalloc-ctl = { workspace = true, features = ["use_std"] }
@@ -88,7 +82,6 @@ tokio-postgres-rustls.workspace = true
 tokio-rustls.workspace = true
 tokio-util.workspace = true
 tokio = { workspace = true, features = ["signal"] }
-tower-service.workspace = true
 tracing-opentelemetry.workspace = true
 tracing-subscriber.workspace = true
 tracing-utils.workspace = true
diff --git a/safekeeper/Cargo.toml b/safekeeper/Cargo.toml
index 0fdb3147bf..daf21c70b0 100644
--- a/safekeeper/Cargo.toml
+++ b/safekeeper/Cargo.toml
@@ -13,14 +13,12 @@ testing = ["fail/failpoints"]
 [dependencies]
 async-stream.workspace = true
 anyhow.workspace = true
-async-trait.workspace = true
 byteorder.workspace = true
 bytes.workspace = true
 camino.workspace = true
 camino-tempfile.workspace = true
 chrono.workspace = true
 clap = { workspace = true, features = ["derive"] }
-const_format.workspace = true
 crc32c.workspace = true
 fail.workspace = true
 git-version.workspace = true
@@ -38,8 +36,6 @@ scopeguard.workspace = true
 reqwest = { workspace = true, features = ["json"] }
 serde.workspace = true
 serde_json.workspace = true
-serde_with.workspace = true
-signal-hook.workspace = true
 strum.workspace = true
 strum_macros.workspace = true
 thiserror.workspace = true
@@ -48,7 +44,6 @@ tokio-util = { workspace = true }
 tokio-io-timeout.workspace = true
 tokio-postgres.workspace = true
 tokio-tar.workspace = true
-toml_edit.workspace = true
 tracing.workspace = true
 url.workspace = true
 metrics.workspace = true
diff --git a/storage_broker/Cargo.toml b/storage_broker/Cargo.toml
index ac4b00669e..82ec0aa272 100644
--- a/storage_broker/Cargo.toml
+++ b/storage_broker/Cargo.toml
@@ -10,7 +10,6 @@ bench = []
 [dependencies]
 anyhow.workspace = true
 async-stream.workspace = true
-bytes.workspace = true
 clap = { workspace = true, features = ["derive"] }
 const_format.workspace = true
 futures.workspace = true
@@ -24,7 +23,6 @@ parking_lot.workspace = true
 prost.workspace = true
 tonic.workspace = true
 tokio = { workspace = true, features = ["rt-multi-thread"] }
-tokio-stream.workspace = true
 tracing.workspace = true
 metrics.workspace = true
 utils.workspace = true
diff --git a/storage_controller/Cargo.toml b/storage_controller/Cargo.toml
index ecaac04915..a96d64e096 100644
--- a/storage_controller/Cargo.toml
+++ b/storage_controller/Cargo.toml
@@ -15,9 +15,7 @@ testing = []
 
 [dependencies]
 anyhow.workspace = true
-aws-config.workspace = true
 bytes.workspace = true
-camino.workspace = true
 chrono.workspace = true
 clap.workspace = true
 fail.workspace = true
diff --git a/storage_controller/client/Cargo.toml b/storage_controller/client/Cargo.toml
index e7a4264fd0..9fa89176af 100644
--- a/storage_controller/client/Cargo.toml
+++ b/storage_controller/client/Cargo.toml
@@ -5,18 +5,7 @@ edition.workspace = true
 license.workspace = true
 
 [dependencies]
-pageserver_api.workspace = true
 pageserver_client.workspace = true
-thiserror.workspace = true
 reqwest.workspace = true
-utils.workspace = true
 serde.workspace = true
 workspace_hack = { version = "0.1", path = "../../workspace_hack" }
-tokio-postgres.workspace = true
-tokio-stream.workspace = true
-tokio.workspace = true
-futures.workspace = true
-tokio-util.workspace = true
-anyhow.workspace = true
-postgres.workspace = true
-bytes.workspace = true
diff --git a/storage_scrubber/Cargo.toml b/storage_scrubber/Cargo.toml
index d19119990b..f9987662b9 100644
--- a/storage_scrubber/Cargo.toml
+++ b/storage_scrubber/Cargo.toml
@@ -6,21 +6,13 @@ license.workspace = true
 
 [dependencies]
 aws-sdk-s3.workspace = true
-aws-smithy-async.workspace = true
 either.workspace = true
-tokio-rustls.workspace = true
 anyhow.workspace = true
 git-version.workspace = true
 hex.workspace = true
 humantime.workspace = true
-thiserror.workspace = true
-rand.workspace = true
-bytes.workspace = true
-bincode.workspace = true
-crc32c.workspace = true
 serde.workspace = true
 serde_json.workspace = true
-serde_with.workspace = true
 workspace_hack.workspace = true
 utils.workspace = true
 async-stream.workspace = true
diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml
index 140c43639e..662916d42c 100644
--- a/workspace_hack/Cargo.toml
+++ b/workspace_hack/Cargo.toml
@@ -38,6 +38,7 @@ deranged = { version = "0.3", default-features = false, features = ["powerfmt",
 digest = { version = "0.10", features = ["mac", "oid", "std"] }
 either = { version = "1" }
 fail = { version = "0.5", default-features = false, features = ["failpoints"] }
+futures = { version = "0.3" }
 futures-channel = { version = "0.3", features = ["sink"] }
 futures-executor = { version = "0.3" }
 futures-io = { version = "0.3" }
@@ -88,6 +89,8 @@ tonic = { version = "0.9", features = ["tls-roots"] }
 tower = { version = "0.4", default-features = false, features = ["balance", "buffer", "limit", "log", "timeout", "util"] }
 tracing = { version = "0.1", features = ["log"] }
 tracing-core = { version = "0.1" }
+tracing-log = { version = "0.1", default-features = false, features = ["log-tracer", "std"] }
+tracing-subscriber = { version = "0.3", default-features = false, features = ["env-filter", "fmt", "json", "smallvec", "tracing-log"] }
 url = { version = "2", features = ["serde"] }
 uuid = { version = "1", features = ["serde", "v4", "v7"] }
 zeroize = { version = "1", features = ["derive", "serde"] }

From 6138eb50e9fc5fb6bc7af57cb95cbad8aaedc186 Mon Sep 17 00:00:00 2001
From: Tristan Partin <tristan@neon.tech>
Date: Mon, 16 Sep 2024 17:53:57 -0500
Subject: [PATCH 069/142] Fix test code related to migrations

We added another migration in 5876c441abc973acca60882192ad46333c075abd,
but didn't bump this value. This had no effect, but best to fix it
anyway.

Signed-off-by: Tristan Partin <tristan@neon.tech>
---
 test_runner/fixtures/neon_fixtures.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index 93b93ff019..69284bfdfc 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -4094,7 +4094,7 @@ class Endpoint(PgProtocol, LogUtils):
             json.dump(dict(data_dict, **kwargs), file, indent=4)
 
     # Please note: Migrations only run if pg_skip_catalog_updates is false
-    def wait_for_migrations(self, num_migrations: int = 10):
+    def wait_for_migrations(self, num_migrations: int = 11):
         with self.cursor() as cur:
 
             def check_migrations_done():

From a1b71b73fe8f1193ef841604af962053c0bf1061 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Arpad=20M=C3=BCller?= <arpad-m@users.noreply.github.com>
Date: Tue, 17 Sep 2024 19:15:01 +0200
Subject: [PATCH 070/142] Rename some S3 usages to "remote storage" in exposed
 messages (#8999)

In exposed messages like log messages we mentioned "S3", which is not
entirely accurate as we support Azure blob storage now as well.
---
 pageserver/src/consumption_metrics.rs                    | 2 +-
 pageserver/src/metrics.rs                                | 2 +-
 test_runner/fixtures/pageserver/many_tenants.py          | 2 +-
 test_runner/regress/test_pageserver_metric_collection.py | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/pageserver/src/consumption_metrics.rs b/pageserver/src/consumption_metrics.rs
index 64a267e0e4..0c7630edca 100644
--- a/pageserver/src/consumption_metrics.rs
+++ b/pageserver/src/consumption_metrics.rs
@@ -178,7 +178,7 @@ async fn collect_metrics(
                 )
                 .await;
                 if let Err(e) = res {
-                    tracing::error!("failed to upload to S3: {e:#}");
+                    tracing::error!("failed to upload to remote storage: {e:#}");
                 }
             }
         };
diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs
index 9197505876..72229d80be 100644
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -1777,7 +1777,7 @@ pub(crate) static SECONDARY_MODE: Lazy<SecondaryModeMetrics> = Lazy::new(|| {
     .expect("failed to define a metric"),
     upload_heatmap_duration: register_histogram!(
         "pageserver_secondary_upload_heatmap_duration",
-        "Time to build and upload a heatmap, including any waiting inside the S3 client"
+        "Time to build and upload a heatmap, including any waiting inside the remote storage client"
     )
     .expect("failed to define a metric"),
     download_heatmap: register_int_counter!(
diff --git a/test_runner/fixtures/pageserver/many_tenants.py b/test_runner/fixtures/pageserver/many_tenants.py
index 3e0ffabf74..97e63ed4ba 100644
--- a/test_runner/fixtures/pageserver/many_tenants.py
+++ b/test_runner/fixtures/pageserver/many_tenants.py
@@ -39,7 +39,7 @@ def single_timeline(
     log.info("detach template tenant form pageserver")
     env.pageserver.tenant_detach(template_tenant)
 
-    log.info(f"duplicating template tenant {ncopies} times in S3")
+    log.info(f"duplicating template tenant {ncopies} times in remote storage")
     tenants = fixtures.pageserver.remote_storage.duplicate_tenant(env, template_tenant, ncopies)
 
     # In theory we could just attach all the tenants, force on-demand downloads via mgmt API, and be done.
diff --git a/test_runner/regress/test_pageserver_metric_collection.py b/test_runner/regress/test_pageserver_metric_collection.py
index 24a37b04ec..37ab51f9fb 100644
--- a/test_runner/regress/test_pageserver_metric_collection.py
+++ b/test_runner/regress/test_pageserver_metric_collection.py
@@ -74,7 +74,7 @@ def test_metric_collection(
     env.pageserver.allowed_errors.extend(
         [
             ".*metrics endpoint refused the sent metrics*",
-            ".*metrics_collection: failed to upload to S3: Failed to upload data of length .* to storage path.*",
+            ".*metrics_collection: failed to upload to remote storage: Failed to upload data of length .* to storage path.*",
         ]
     )
 

From d78f5ce6da2866cb982203d1bab33c49a990cd5f Mon Sep 17 00:00:00 2001
From: Alexander Bayandin <alexander@neon.tech>
Date: Tue, 17 Sep 2024 18:40:05 +0100
Subject: [PATCH 071/142] CI: don't fetch the whole git history if it's not
 required (#9021)

## Problem
We use `actions/checkout` with `fetch-depth: 0` even in jobs where the
full git history is not required.

## Summary of changes
- Remove unneeded `fetch-depth: 0`
- Add a comment if `fetch-depth: 0` is required
---
 .github/workflows/build_and_test.yml    | 31 +++++++------------------
 .github/workflows/trigger-e2e-tests.yml |  4 ++--
 2 files changed, 11 insertions(+), 24 deletions(-)

diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml
index 7ddd624dd5..d46b8dc1f5 100644
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -54,8 +54,8 @@ jobs:
       build-tag: ${{steps.build-tag.outputs.tag}}
 
     steps:
-      - name: Checkout
-        uses: actions/checkout@v4
+      # Need `fetch-depth: 0` to count the number of commits in the branch
+      - uses: actions/checkout@v4
         with:
           fetch-depth: 0
 
@@ -374,8 +374,8 @@ jobs:
         coverage-html: ${{ steps.upload-coverage-report-new.outputs.report-url }}
         coverage-json: ${{ steps.upload-coverage-report-new.outputs.summary-json }}
     steps:
-      - name: Checkout
-        uses: actions/checkout@v4
+      # Need `fetch-depth: 0` for differential coverage (to get diff between two commits)
+      - uses: actions/checkout@v4
         with:
           submodules: true
           fetch-depth: 0
@@ -476,11 +476,9 @@ jobs:
     runs-on: ${{ fromJson(format('["self-hosted", "{0}"]', matrix.arch == 'arm64' && 'large-arm64' || 'large')) }}
 
     steps:
-      - name: Checkout
-        uses: actions/checkout@v4
+      - uses: actions/checkout@v4
         with:
           submodules: true
-          fetch-depth: 0
 
       - uses: ./.github/actions/set-docker-config-dir
       - uses: docker/setup-buildx-action@v3
@@ -555,11 +553,9 @@ jobs:
     runs-on: ${{ fromJson(format('["self-hosted", "{0}"]', matrix.arch == 'arm64' && 'large-arm64' || 'large')) }}
 
     steps:
-      - name: Checkout
-        uses: actions/checkout@v4
+      - uses: actions/checkout@v4
         with:
           submodules: true
-          fetch-depth: 0
 
       - uses: ./.github/actions/set-docker-config-dir
       - uses: docker/setup-buildx-action@v3
@@ -706,10 +702,7 @@ jobs:
       VM_BUILDER_VERSION: v0.29.3
 
     steps:
-      - name: Checkout
-        uses: actions/checkout@v4
-        with:
-          fetch-depth: 0
+      - uses: actions/checkout@v4
 
       - name: Downloading vm-builder
         run: |
@@ -749,10 +742,7 @@ jobs:
     runs-on: ${{ fromJson(format('["self-hosted", "{0}"]', matrix.arch == 'arm64' && 'small-arm64' || 'small')) }}
 
     steps:
-      - name: Checkout
-        uses: actions/checkout@v4
-        with:
-          fetch-depth: 0
+      - uses: actions/checkout@v4
 
       - uses: ./.github/actions/set-docker-config-dir
       - uses: docker/login-action@v3
@@ -977,10 +967,7 @@ jobs:
             git config --global --add safe.directory "${GITHUB_WORKSPACE}/vendor/postgres-v$r"
           done
 
-      - name: Checkout
-        uses: actions/checkout@v4
-        with:
-          fetch-depth: 0
+      - uses: actions/checkout@v4
 
       - name: Trigger deploy workflow
         env:
diff --git a/.github/workflows/trigger-e2e-tests.yml b/.github/workflows/trigger-e2e-tests.yml
index 6fbe785c56..b299cf9b99 100644
--- a/.github/workflows/trigger-e2e-tests.yml
+++ b/.github/workflows/trigger-e2e-tests.yml
@@ -34,8 +34,8 @@ jobs:
       build-tag: ${{ steps.build-tag.outputs.tag }}
 
     steps:
-      - name: Checkout
-        uses: actions/checkout@v4
+      # Need `fetch-depth: 0` to count the number of commits in the branch
+      - uses: actions/checkout@v4
         with:
           fetch-depth: 0
 

From 3cd2a3f9317f54e75c00a813680060dcaa612a36 Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Tue, 17 Sep 2024 20:16:33 +0100
Subject: [PATCH 072/142] refactor(walredo): process launch & kill-on-error
 machinery (#8951)

Immediate benefit: easier to spot what's going on.

Later benefit: use the extracted method in PR

- https://github.com/neondatabase/neon/pull/8952

which adds a `ping` command to walredo.

Found this useful during the investigation of
https://github.com/neondatabase/cloud/issues/16886.
---
 pageserver/src/walredo.rs | 255 +++++++++++++++++++++-----------------
 1 file changed, 140 insertions(+), 115 deletions(-)

diff --git a/pageserver/src/walredo.rs b/pageserver/src/walredo.rs
index a36955fa21..0fe7def8b0 100644
--- a/pageserver/src/walredo.rs
+++ b/pageserver/src/walredo.rs
@@ -35,6 +35,7 @@ use anyhow::Context;
 use bytes::{Bytes, BytesMut};
 use pageserver_api::models::{WalRedoManagerProcessStatus, WalRedoManagerStatus};
 use pageserver_api::shard::TenantShardId;
+use std::future::Future;
 use std::sync::Arc;
 use std::time::Duration;
 use std::time::Instant;
@@ -296,6 +297,97 @@ impl PostgresRedoManager {
         }
     }
 
+    async fn do_with_walredo_process<
+        F: FnOnce(Arc<Process>) -> Fut,
+        Fut: Future<Output = Result<O, Error>>,
+        O,
+    >(
+        &self,
+        pg_version: u32,
+        closure: F,
+    ) -> Result<O, Error> {
+        let proc: Arc<Process> = match self.redo_process.get_or_init_detached().await {
+            Ok(guard) => match &*guard {
+                ProcessOnceCell::Spawned(proc) => Arc::clone(proc),
+                ProcessOnceCell::ManagerShutDown => {
+                    return Err(Error::Cancelled);
+                }
+            },
+            Err(permit) => {
+                let start = Instant::now();
+                // acquire guard before spawning process, so that we don't spawn new processes
+                // if the gate is already closed.
+                let _launched_processes_guard = match self.launched_processes.enter() {
+                    Ok(guard) => guard,
+                    Err(GateError::GateClosed) => unreachable!(
+                        "shutdown sets the once cell to `ManagerShutDown` state before closing the gate"
+                    ),
+                };
+                let proc = Arc::new(Process {
+                    process: process::WalRedoProcess::launch(
+                        self.conf,
+                        self.tenant_shard_id,
+                        pg_version,
+                    )
+                    .context("launch walredo process")?,
+                    _launched_processes_guard,
+                });
+                let duration = start.elapsed();
+                WAL_REDO_PROCESS_LAUNCH_DURATION_HISTOGRAM.observe(duration.as_secs_f64());
+                info!(
+                    elapsed_ms = duration.as_millis(),
+                    pid = proc.id(),
+                    "launched walredo process"
+                );
+                self.redo_process
+                    .set(ProcessOnceCell::Spawned(Arc::clone(&proc)), permit);
+                proc
+            }
+        };
+
+        // async closures are unstable, would support &Process
+        let result = closure(proc.clone()).await;
+
+        if result.is_err() {
+            // Avoid concurrent callers hitting the same issue by taking `proc` out of the rotation.
+            // Note that there may be other tasks concurrent with us that also hold `proc`.
+            // We have to deal with that here.
+            // Also read the doc comment on field `self.redo_process`.
+            //
+            // NB: there may still be other concurrent threads using `proc`.
+            // The last one will send SIGKILL when the underlying Arc reaches refcount 0.
+            //
+            // NB: the drop impl blocks the dropping thread with a wait() system call for
+            // the child process. In some ways the blocking is actually good: if we
+            // deferred the waiting into the background / to tokio if we used `tokio::process`,
+            // it could happen that if walredo always fails immediately, we spawn processes faster
+            // than we can SIGKILL & `wait` for them to exit. By doing it the way we do here,
+            // we limit this risk of run-away to at most $num_runtimes * $num_executor_threads.
+            // This probably needs revisiting at some later point.
+            match self.redo_process.get() {
+                None => (),
+                Some(guard) => {
+                    match &*guard {
+                        ProcessOnceCell::ManagerShutDown => {}
+                        ProcessOnceCell::Spawned(guard_proc) => {
+                            if Arc::ptr_eq(&proc, guard_proc) {
+                                // We're the first to observe an error from `proc`, it's our job to take it out of rotation.
+                                guard.take_and_deinit();
+                            } else {
+                                // Another task already spawned another redo process (further up in this method)
+                                // and put it into `redo_process`. Do nothing, our view of the world is behind.
+                            }
+                        }
+                    }
+                }
+            }
+            // The last task that does this `drop()` of `proc` will do a blocking `wait()` syscall.
+            drop(proc);
+        }
+
+        result
+    }
+
     ///
     /// Process one request for WAL redo using wal-redo postgres
     ///
@@ -319,130 +411,63 @@ impl PostgresRedoManager {
         const MAX_RETRY_ATTEMPTS: u32 = 1;
         let mut n_attempts = 0u32;
         loop {
-            let proc: Arc<Process> = match self.redo_process.get_or_init_detached().await {
-                Ok(guard) => match &*guard {
-                    ProcessOnceCell::Spawned(proc) => Arc::clone(proc),
-                    ProcessOnceCell::ManagerShutDown => {
-                        return Err(Error::Cancelled);
-                    }
-                },
-                Err(permit) => {
-                    let start = Instant::now();
-                    // acquire guard before spawning process, so that we don't spawn new processes
-                    // if the gate is already closed.
-                    let _launched_processes_guard = match self.launched_processes.enter() {
-                                Ok(guard) => guard,
-                                Err(GateError::GateClosed) => unreachable!(
-                                    "shutdown sets the once cell to `ManagerShutDown` state before closing the gate"
-                                ),
-                            };
-                    let proc = Arc::new(Process {
-                        process: process::WalRedoProcess::launch(
-                            self.conf,
-                            self.tenant_shard_id,
-                            pg_version,
-                        )
-                        .context("launch walredo process")?,
-                        _launched_processes_guard,
-                    });
-                    let duration = start.elapsed();
-                    WAL_REDO_PROCESS_LAUNCH_DURATION_HISTOGRAM.observe(duration.as_secs_f64());
-                    info!(
-                        duration_ms = duration.as_millis(),
-                        pid = proc.id(),
-                        "launched walredo process"
-                    );
-                    self.redo_process
-                        .set(ProcessOnceCell::Spawned(Arc::clone(&proc)), permit);
-                    proc
-                }
-            };
+            let base_img = &base_img;
+            let closure = |proc: Arc<Process>| async move {
+                let started_at = std::time::Instant::now();
 
-            let started_at = std::time::Instant::now();
+                // Relational WAL records are applied using wal-redo-postgres
+                let result = proc
+                    .apply_wal_records(rel, blknum, base_img, records, wal_redo_timeout)
+                    .await
+                    .context("apply_wal_records");
 
-            // Relational WAL records are applied using wal-redo-postgres
-            let result = proc
-                .apply_wal_records(rel, blknum, &base_img, records, wal_redo_timeout)
-                .await
-                .context("apply_wal_records");
+                let duration = started_at.elapsed();
 
-            let duration = started_at.elapsed();
-
-            let len = records.len();
-            let nbytes = records.iter().fold(0, |acumulator, record| {
-                acumulator
-                    + match &record.1 {
-                        NeonWalRecord::Postgres { rec, .. } => rec.len(),
-                        _ => unreachable!("Only PostgreSQL records are accepted in this batch"),
-                    }
-            });
-
-            WAL_REDO_TIME.observe(duration.as_secs_f64());
-            WAL_REDO_RECORDS_HISTOGRAM.observe(len as f64);
-            WAL_REDO_BYTES_HISTOGRAM.observe(nbytes as f64);
-
-            debug!(
-                "postgres applied {} WAL records ({} bytes) in {} us to reconstruct page image at LSN {}",
-                len,
-                nbytes,
-                duration.as_micros(),
-                lsn
-            );
-
-            // If something went wrong, don't try to reuse the process. Kill it, and
-            // next request will launch a new one.
-            if let Err(e) = result.as_ref() {
-                error!(
-                    "error applying {} WAL records {}..{} ({} bytes) to key {key}, from base image with LSN {} to reconstruct page image at LSN {} n_attempts={}: {:?}",
-                    records.len(),
-                    records.first().map(|p| p.0).unwrap_or(Lsn(0)),
-                    records.last().map(|p| p.0).unwrap_or(Lsn(0)),
-                    nbytes,
-                    base_img_lsn,
-                    lsn,
-                    n_attempts,
-                    e,
-                );
-                // Avoid concurrent callers hitting the same issue by taking `proc` out of the rotation.
-                // Note that there may be other tasks concurrent with us that also hold `proc`.
-                // We have to deal with that here.
-                // Also read the doc comment on field `self.redo_process`.
-                //
-                // NB: there may still be other concurrent threads using `proc`.
-                // The last one will send SIGKILL when the underlying Arc reaches refcount 0.
-                //
-                // NB: the drop impl blocks the dropping thread with a wait() system call for
-                // the child process. In some ways the blocking is actually good: if we
-                // deferred the waiting into the background / to tokio if we used `tokio::process`,
-                // it could happen that if walredo always fails immediately, we spawn processes faster
-                // than we can SIGKILL & `wait` for them to exit. By doing it the way we do here,
-                // we limit this risk of run-away to at most $num_runtimes * $num_executor_threads.
-                // This probably needs revisiting at some later point.
-                match self.redo_process.get() {
-                    None => (),
-                    Some(guard) => {
-                        match &*guard {
-                            ProcessOnceCell::ManagerShutDown => {}
-                            ProcessOnceCell::Spawned(guard_proc) => {
-                                if Arc::ptr_eq(&proc, guard_proc) {
-                                    // We're the first to observe an error from `proc`, it's our job to take it out of rotation.
-                                    guard.take_and_deinit();
-                                } else {
-                                    // Another task already spawned another redo process (further up in this method)
-                                    // and put it into `redo_process`. Do nothing, our view of the world is behind.
-                                }
-                            }
+                let len = records.len();
+                let nbytes = records.iter().fold(0, |acumulator, record| {
+                    acumulator
+                        + match &record.1 {
+                            NeonWalRecord::Postgres { rec, .. } => rec.len(),
+                            _ => unreachable!("Only PostgreSQL records are accepted in this batch"),
                         }
-                    }
+                });
+
+                WAL_REDO_TIME.observe(duration.as_secs_f64());
+                WAL_REDO_RECORDS_HISTOGRAM.observe(len as f64);
+                WAL_REDO_BYTES_HISTOGRAM.observe(nbytes as f64);
+
+                debug!(
+                    "postgres applied {} WAL records ({} bytes) in {} us to reconstruct page image at LSN {}",
+                    len,
+                    nbytes,
+                    duration.as_micros(),
+                    lsn
+                );
+
+                if let Err(e) = result.as_ref() {
+                    error!(
+                        "error applying {} WAL records {}..{} ({} bytes) to key {key}, from base image with LSN {} to reconstruct page image at LSN {} n_attempts={}: {:?}",
+                        records.len(),
+                        records.first().map(|p| p.0).unwrap_or(Lsn(0)),
+                        records.last().map(|p| p.0).unwrap_or(Lsn(0)),
+                        nbytes,
+                        base_img_lsn,
+                        lsn,
+                        n_attempts,
+                        e,
+                    );
                 }
-                // The last task that does this `drop()` of `proc` will do a blocking `wait()` syscall.
-                drop(proc);
-            } else if n_attempts != 0 {
+
+                result.map_err(Error::Other)
+            };
+            let result = self.do_with_walredo_process(pg_version, closure).await;
+
+            if result.is_ok() && n_attempts != 0 {
                 info!(n_attempts, "retried walredo succeeded");
             }
             n_attempts += 1;
             if n_attempts > MAX_RETRY_ATTEMPTS || result.is_ok() {
-                return result.map_err(Error::Other);
+                return result;
             }
         }
     }

From 135e7e4306978d79a133086abb81707d8d9fb8a9 Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Wed, 18 Sep 2024 09:10:27 +0200
Subject: [PATCH 073/142] add `neon_local` subcommand for the broker & use that
 from regression tests (#8948)

There's currently no way to just start/stop broker from `neon_local`.

This PR
* adds a sub-command
* uses that sub-command from the test suite instead of the pre-existing
Python `subprocess` based approach.

Found this useful during the investigation of
https://github.com/neondatabase/cloud/issues/16886.
---
 control_plane/src/bin/neon_local.rs           | 40 ++++++++++
 test_runner/fixtures/broker.py                | 63 ---------------
 test_runner/fixtures/neon_fixtures.py         | 76 +++++++++++++------
 test_runner/regress/test_neon_cli.py          |  2 +
 .../regress/test_pageserver_generations.py    |  2 +-
 .../regress/test_storage_controller.py        |  6 +-
 test_runner/regress/test_wal_acceptor.py      |  9 +--
 7 files changed, 99 insertions(+), 99 deletions(-)
 delete mode 100644 test_runner/fixtures/broker.py

diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs
index 144cd647c9..1c94c42865 100644
--- a/control_plane/src/bin/neon_local.rs
+++ b/control_plane/src/bin/neon_local.rs
@@ -106,6 +106,7 @@ fn main() -> Result<()> {
             "stop" => rt.block_on(handle_stop_all(sub_args, &env)),
             "pageserver" => rt.block_on(handle_pageserver(sub_args, &env)),
             "storage_controller" => rt.block_on(handle_storage_controller(sub_args, &env)),
+            "storage_broker" => rt.block_on(handle_storage_broker(sub_args, &env)),
             "safekeeper" => rt.block_on(handle_safekeeper(sub_args, &env)),
             "endpoint" => rt.block_on(handle_endpoint(sub_args, &env)),
             "mappings" => handle_mappings(sub_args, &mut env),
@@ -1245,6 +1246,32 @@ async fn handle_safekeeper(sub_match: &ArgMatches, env: &local_env::LocalEnv) ->
     Ok(())
 }
 
+async fn handle_storage_broker(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
+    let (sub_name, sub_args) = match sub_match.subcommand() {
+        Some(broker_command_data) => broker_command_data,
+        None => bail!("no broker subcommand provided"),
+    };
+
+    match sub_name {
+        "start" => {
+            if let Err(e) = broker::start_broker_process(env, get_start_timeout(sub_args)).await {
+                eprintln!("broker start failed: {e}");
+                exit(1);
+            }
+        }
+
+        "stop" => {
+            if let Err(e) = broker::stop_broker_process(env) {
+                eprintln!("broker stop failed: {e}");
+                exit(1);
+            }
+        }
+
+        _ => bail!("Unexpected broker subcommand '{}'", sub_name),
+    }
+    Ok(())
+}
+
 async fn handle_start_all(
     env: &local_env::LocalEnv,
     retry_timeout: &Duration,
@@ -1672,6 +1699,19 @@ fn cli() -> Command {
                             .arg(stop_mode_arg.clone())
                             .arg(instance_id))
         )
+        .subcommand(
+            Command::new("storage_broker")
+                .arg_required_else_help(true)
+                .about("Manage broker")
+                .subcommand(Command::new("start")
+                            .about("Start broker")
+                            .arg(timeout_arg.clone())
+                )
+                .subcommand(Command::new("stop")
+                            .about("Stop broker")
+                            .arg(stop_mode_arg.clone())
+                )
+        )
         .subcommand(
             Command::new("safekeeper")
                 .arg_required_else_help(true)
diff --git a/test_runner/fixtures/broker.py b/test_runner/fixtures/broker.py
deleted file mode 100644
index 8aca90a097..0000000000
--- a/test_runner/fixtures/broker.py
+++ /dev/null
@@ -1,63 +0,0 @@
-import subprocess
-import time
-from dataclasses import dataclass
-from pathlib import Path
-from typing import Any, Optional
-
-from fixtures.log_helper import log
-
-
-@dataclass
-class NeonBroker:
-    """An object managing storage_broker instance"""
-
-    logfile: Path
-    port: int
-    neon_binpath: Path
-    handle: Optional[subprocess.Popen[Any]] = None  # handle of running daemon
-
-    def listen_addr(self):
-        return f"127.0.0.1:{self.port}"
-
-    def client_url(self):
-        return f"http://{self.listen_addr()}"
-
-    def check_status(self):
-        return True  # TODO
-
-    def try_start(self):
-        if self.handle is not None:
-            log.debug(f"storage_broker is already running on port {self.port}")
-            return
-
-        listen_addr = self.listen_addr()
-        log.info(f'starting storage_broker to listen incoming connections at "{listen_addr}"')
-        with open(self.logfile, "wb") as logfile:
-            args = [
-                str(self.neon_binpath / "storage_broker"),
-                f"--listen-addr={listen_addr}",
-            ]
-            self.handle = subprocess.Popen(args, stdout=logfile, stderr=logfile)
-
-        # wait for start
-        started_at = time.time()
-        while True:
-            try:
-                self.check_status()
-            except Exception as e:
-                elapsed = time.time() - started_at
-                if elapsed > 5:
-                    raise RuntimeError(
-                        f"timed out waiting {elapsed:.0f}s for storage_broker start: {e}"
-                    ) from e
-                time.sleep(0.5)
-            else:
-                break  # success
-
-    def stop(self, immediate: bool = False):
-        if self.handle is not None:
-            if immediate:
-                self.handle.kill()
-            else:
-                self.handle.terminate()
-            self.handle.wait()
diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index 69284bfdfc..cbbb162cc6 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -60,7 +60,6 @@ from urllib3.util.retry import Retry
 
 from fixtures import overlayfs
 from fixtures.auth_tokens import AuthKeys, TokenScope
-from fixtures.broker import NeonBroker
 from fixtures.common_types import Lsn, NodeId, TenantId, TenantShardId, TimelineId
 from fixtures.endpoint.http import EndpointHttpClient
 from fixtures.log_helper import log
@@ -230,21 +229,6 @@ def port_distributor(worker_base_port: int, worker_port_num: int) -> PortDistrib
     return PortDistributor(base_port=worker_base_port, port_number=worker_port_num)
 
 
-@pytest.fixture(scope="function")
-def default_broker(
-    port_distributor: PortDistributor,
-    test_output_dir: Path,
-    neon_binpath: Path,
-) -> Iterator[NeonBroker]:
-    # multiple pytest sessions could get launched in parallel, get them different ports/datadirs
-    client_port = port_distributor.get_port()
-    broker_logfile = test_output_dir / "repo" / "storage_broker.log"
-
-    broker = NeonBroker(logfile=broker_logfile, port=client_port, neon_binpath=neon_binpath)
-    yield broker
-    broker.stop()
-
-
 @pytest.fixture(scope="session")
 def run_id() -> Iterator[uuid.UUID]:
     yield uuid.uuid4()
@@ -387,7 +371,6 @@ class NeonEnvBuilder:
         self,
         repo_dir: Path,
         port_distributor: PortDistributor,
-        broker: NeonBroker,
         run_id: uuid.UUID,
         mock_s3_server: MockS3Server,
         neon_binpath: Path,
@@ -428,7 +411,6 @@ class NeonEnvBuilder:
         # Safekeepers remote storage
         self.safekeepers_remote_storage: Optional[RemoteStorage] = None
 
-        self.broker = broker
         self.run_id = run_id
         self.mock_s3_server: MockS3Server = mock_s3_server
         self.pageserver_config_override = pageserver_config_override
@@ -940,6 +922,8 @@ class NeonEnvBuilder:
 
             self.env.storage_controller.assert_no_errors()
 
+            self.env.broker.assert_no_errors()
+
         try:
             self.overlay_cleanup_teardown()
         except Exception as e:
@@ -993,7 +977,7 @@ class NeonEnv:
         self.endpoints = EndpointFactory(self)
         self.safekeepers: List[Safekeeper] = []
         self.pageservers: List[NeonPageserver] = []
-        self.broker = config.broker
+        self.broker = NeonBroker(self)
         self.pageserver_remote_storage = config.pageserver_remote_storage
         self.safekeepers_remote_storage = config.safekeepers_remote_storage
         self.pg_version = config.pg_version
@@ -1168,7 +1152,7 @@ class NeonEnv:
             max_workers=2 + len(self.pageservers) + len(self.safekeepers)
         ) as executor:
             futs.append(
-                executor.submit(lambda: self.broker.try_start() or None)
+                executor.submit(lambda: self.broker.start() or None)
             )  # The `or None` is for the linter
 
             for pageserver in self.pageservers:
@@ -1225,7 +1209,7 @@ class NeonEnv:
                 pageserver.stop(immediate=immediate)
             except RuntimeError:
                 stop_later.append(pageserver)
-        self.broker.stop(immediate=immediate)
+        self.broker.stop()
 
         # TODO: for nice logging we need python 3.11 ExceptionGroup
         for ps in stop_later:
@@ -1339,7 +1323,6 @@ def neon_simple_env(
     pytestconfig: Config,
     port_distributor: PortDistributor,
     mock_s3_server: MockS3Server,
-    default_broker: NeonBroker,
     run_id: uuid.UUID,
     top_output_dir: Path,
     test_output_dir: Path,
@@ -1364,7 +1347,6 @@ def neon_simple_env(
         top_output_dir=top_output_dir,
         repo_dir=repo_dir,
         port_distributor=port_distributor,
-        broker=default_broker,
         mock_s3_server=mock_s3_server,
         neon_binpath=neon_binpath,
         pg_distrib_dir=pg_distrib_dir,
@@ -1392,7 +1374,6 @@ def neon_env_builder(
     neon_binpath: Path,
     pg_distrib_dir: Path,
     pg_version: PgVersion,
-    default_broker: NeonBroker,
     run_id: uuid.UUID,
     request: FixtureRequest,
     test_overlay_dir: Path,
@@ -1428,7 +1409,6 @@ def neon_env_builder(
         neon_binpath=neon_binpath,
         pg_distrib_dir=pg_distrib_dir,
         pg_version=pg_version,
-        broker=default_broker,
         run_id=run_id,
         preserve_database_files=cast(bool, pytestconfig.getoption("--preserve-database-files")),
         pageserver_virtual_file_io_engine=pageserver_virtual_file_io_engine,
@@ -1850,6 +1830,18 @@ class NeonCli(AbstractNeonCli):
             args.extend(["-m", "immediate"])
         return self.raw_cli(args)
 
+    def broker_start(
+        self, timeout_in_seconds: Optional[int] = None
+    ) -> "subprocess.CompletedProcess[str]":
+        cmd = ["storage_broker", "start"]
+        if timeout_in_seconds is not None:
+            cmd.append(f"--start-timeout={timeout_in_seconds}s")
+        return self.raw_cli(cmd)
+
+    def broker_stop(self) -> "subprocess.CompletedProcess[str]":
+        cmd = ["storage_broker", "stop"]
+        return self.raw_cli(cmd)
+
     def endpoint_create(
         self,
         branch_name: str,
@@ -4573,6 +4565,40 @@ class Safekeeper(LogUtils):
         wait_until(20, 0.5, paused)
 
 
+class NeonBroker(LogUtils):
+    """An object managing storage_broker instance"""
+
+    def __init__(self, env: NeonEnv):
+        super().__init__(logfile=env.repo_dir / "storage_broker.log")
+        self.env = env
+        self.port: int = self.env.port_distributor.get_port()
+        self.running = False
+
+    def start(
+        self,
+        timeout_in_seconds: Optional[int] = None,
+    ):
+        assert not self.running
+        self.env.neon_cli.broker_start(timeout_in_seconds)
+        self.running = True
+        return self
+
+    def stop(self):
+        if self.running:
+            self.env.neon_cli.broker_stop()
+            self.running = False
+        return self
+
+    def listen_addr(self):
+        return f"127.0.0.1:{self.port}"
+
+    def client_url(self):
+        return f"http://{self.listen_addr()}"
+
+    def assert_no_errors(self):
+        assert_no_errors(self.logfile, "storage_controller", [])
+
+
 # TODO: Replace with `StrEnum` when we upgrade to python 3.11
 class NodeKind(str, Enum):
     PAGESERVER = "pageserver"
diff --git a/test_runner/regress/test_neon_cli.py b/test_runner/regress/test_neon_cli.py
index ba170cfb4c..b65430ff49 100644
--- a/test_runner/regress/test_neon_cli.py
+++ b/test_runner/regress/test_neon_cli.py
@@ -134,6 +134,7 @@ def test_cli_start_stop(neon_env_builder: NeonEnvBuilder):
     env.neon_cli.pageserver_stop(env.pageserver.id)
     env.neon_cli.safekeeper_stop()
     env.neon_cli.storage_controller_stop(False)
+    env.neon_cli.broker_stop()
 
     # Keep NeonEnv state up to date, it usually owns starting/stopping services
     env.pageserver.running = False
@@ -176,6 +177,7 @@ def test_cli_start_stop_multi(neon_env_builder: NeonEnvBuilder):
 
     # Stop this to get out of the way of the following `start`
     env.neon_cli.storage_controller_stop(False)
+    env.neon_cli.broker_stop()
 
     # Default start
     res = env.neon_cli.raw_cli(["start"])
diff --git a/test_runner/regress/test_pageserver_generations.py b/test_runner/regress/test_pageserver_generations.py
index ebf58d2bd1..c923713432 100644
--- a/test_runner/regress/test_pageserver_generations.py
+++ b/test_runner/regress/test_pageserver_generations.py
@@ -134,7 +134,7 @@ def test_generations_upgrade(neon_env_builder: NeonEnvBuilder):
     )
 
     env = neon_env_builder.init_configs()
-    env.broker.try_start()
+    env.broker.start()
     for sk in env.safekeepers:
         sk.start()
     env.storage_controller.start()
diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py
index 2d72dbb2df..dc90a6e9a0 100644
--- a/test_runner/regress/test_storage_controller.py
+++ b/test_runner/regress/test_storage_controller.py
@@ -69,7 +69,7 @@ def test_storage_controller_smoke(
     env = neon_env_builder.init_configs()
 
     # Start services by hand so that we can skip a pageserver (this will start + register later)
-    env.broker.try_start()
+    env.broker.start()
     env.storage_controller.start()
     env.pageservers[0].start()
     env.pageservers[1].start()
@@ -292,7 +292,7 @@ def test_storage_controller_onboarding(neon_env_builder: NeonEnvBuilder, warm_up
 
     # Start services by hand so that we can skip registration on one of the pageservers
     env = neon_env_builder.init_configs()
-    env.broker.try_start()
+    env.broker.start()
     env.storage_controller.start()
 
     # This is the pageserver where we'll initially create the tenant.  Run it in emergency
@@ -2126,7 +2126,7 @@ def start_env(env: NeonEnv, storage_controller_port: int):
         max_workers=2 + len(env.pageservers) + len(env.safekeepers)
     ) as executor:
         futs.append(
-            executor.submit(lambda: env.broker.try_start() or None)
+            executor.submit(lambda: env.broker.start() or None)
         )  # The `or None` is for the linter
 
         for pageserver in env.pageservers:
diff --git a/test_runner/regress/test_wal_acceptor.py b/test_runner/regress/test_wal_acceptor.py
index 50fac441c0..4bf8cfe88f 100644
--- a/test_runner/regress/test_wal_acceptor.py
+++ b/test_runner/regress/test_wal_acceptor.py
@@ -19,7 +19,6 @@ import psycopg2.errors
 import psycopg2.extras
 import pytest
 import requests
-from fixtures.broker import NeonBroker
 from fixtures.common_types import Lsn, TenantId, TimelineId
 from fixtures.log_helper import log
 from fixtures.metrics import parse_metrics
@@ -1439,11 +1438,7 @@ class SafekeeperEnv:
     ):
         self.repo_dir = repo_dir
         self.port_distributor = port_distributor
-        self.broker = NeonBroker(
-            logfile=Path(self.repo_dir) / "storage_broker.log",
-            port=self.port_distributor.get_port(),
-            neon_binpath=neon_binpath,
-        )
+        self.fake_broker_endpoint = f"http://127.0.0.1:{port_distributor.get_port()}"
         self.pg_bin = pg_bin
         self.num_safekeepers = num_safekeepers
         self.bin_safekeeper = str(neon_binpath / "safekeeper")
@@ -1492,7 +1487,7 @@ class SafekeeperEnv:
             "--id",
             str(i),
             "--broker-endpoint",
-            self.broker.client_url(),
+            self.fake_broker_endpoint,
         ]
         log.info(f'Running command "{" ".join(cmd)}"')
 

From 3454ef7507000f05707418324dad392b366f8de3 Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki@neon.tech>
Date: Wed, 18 Sep 2024 13:16:51 +0300
Subject: [PATCH 074/142] Refactor ImageLayerWriter to avoid passing a Timeline
 to finish() (#9028)

Commit ca5390a89d made a similar change to DeltaLayerWriter.

We bumped into this with Stas in our hackathon project, which is a
standalone program that creates image layers directly from a Postgres data
directory. It needs to create image layers without having a Timeline and
other pageserver machinery.

This downgrades the "created image layer {}" message from INFO to TRACE
level. TRACE is used for the corresponding message on delta layer
creation too. The path logged in the message is now the temporary path,
before the file is renamed to its final name. Again commit ca5390a89d
made the same change for the message on delta layer creation.
---
 .../src/tenant/storage_layer/image_layer.rs   | 43 ++++++++-----------
 .../src/tenant/storage_layer/split_writer.rs  | 16 +++----
 pageserver/src/tenant/timeline.rs             |  9 ++--
 pageserver/src/tenant/timeline/compaction.rs  |  6 ++-
 4 files changed, 35 insertions(+), 39 deletions(-)

diff --git a/pageserver/src/tenant/storage_layer/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs
index 875e223c9c..5de2582ab7 100644
--- a/pageserver/src/tenant/storage_layer/image_layer.rs
+++ b/pageserver/src/tenant/storage_layer/image_layer.rs
@@ -38,7 +38,7 @@ use crate::tenant::timeline::GetVectoredError;
 use crate::tenant::vectored_blob_io::{
     BlobFlag, StreamingVectoredReadPlanner, VectoredBlobReader, VectoredRead, VectoredReadPlanner,
 };
-use crate::tenant::{PageReconstructError, Timeline};
+use crate::tenant::PageReconstructError;
 use crate::virtual_file::owned_buffers_io::io_buf_ext::IoBufExt;
 use crate::virtual_file::{self, VirtualFile};
 use crate::{IMAGE_FILE_MAGIC, STORAGE_FORMAT_VERSION, TEMP_FILE_SUFFIX};
@@ -58,7 +58,6 @@ use std::io::SeekFrom;
 use std::ops::Range;
 use std::os::unix::prelude::FileExt;
 use std::str::FromStr;
-use std::sync::Arc;
 use tokio::sync::OnceCell;
 use tokio_stream::StreamExt;
 use tracing::*;
@@ -70,9 +69,7 @@ use utils::{
 };
 
 use super::layer_name::ImageLayerName;
-use super::{
-    AsLayerDesc, Layer, LayerName, PersistentLayerDesc, ResidentLayer, ValuesReconstructState,
-};
+use super::{AsLayerDesc, LayerName, PersistentLayerDesc, ValuesReconstructState};
 
 ///
 /// Header stored in the beginning of the file
@@ -800,10 +797,9 @@ impl ImageLayerWriterInner {
     ///
     async fn finish(
         self,
-        timeline: &Arc<Timeline>,
         ctx: &RequestContext,
         end_key: Option<Key>,
-    ) -> anyhow::Result<ResidentLayer> {
+    ) -> anyhow::Result<(PersistentLayerDesc, Utf8PathBuf)> {
         let index_start_blk =
             ((self.blob_writer.size() + PAGE_SZ as u64 - 1) / PAGE_SZ as u64) as u32;
 
@@ -879,12 +875,9 @@ impl ImageLayerWriterInner {
         // fsync the file
         file.sync_all().await?;
 
-        // FIXME: why not carry the virtualfile here, it supports renaming?
-        let layer = Layer::finish_creating(self.conf, timeline, desc, &self.path)?;
+        trace!("created image layer {}", self.path);
 
-        info!("created image layer {}", layer.local_path());
-
-        Ok(layer)
+        Ok((desc, self.path))
     }
 }
 
@@ -963,24 +956,18 @@ impl ImageLayerWriter {
     ///
     pub(crate) async fn finish(
         mut self,
-        timeline: &Arc<Timeline>,
         ctx: &RequestContext,
-    ) -> anyhow::Result<super::ResidentLayer> {
-        self.inner.take().unwrap().finish(timeline, ctx, None).await
+    ) -> anyhow::Result<(PersistentLayerDesc, Utf8PathBuf)> {
+        self.inner.take().unwrap().finish(ctx, None).await
     }
 
     /// Finish writing the image layer with an end key, used in [`super::split_writer::SplitImageLayerWriter`]. The end key determines the end of the image layer's covered range and is exclusive.
     pub(super) async fn finish_with_end_key(
         mut self,
-        timeline: &Arc<Timeline>,
         end_key: Key,
         ctx: &RequestContext,
-    ) -> anyhow::Result<super::ResidentLayer> {
-        self.inner
-            .take()
-            .unwrap()
-            .finish(timeline, ctx, Some(end_key))
-            .await
+    ) -> anyhow::Result<(PersistentLayerDesc, Utf8PathBuf)> {
+        self.inner.take().unwrap().finish(ctx, Some(end_key)).await
     }
 }
 
@@ -1084,7 +1071,7 @@ mod test {
         tenant::{
             config::TenantConf,
             harness::{TenantHarness, TIMELINE_ID},
-            storage_layer::ResidentLayer,
+            storage_layer::{Layer, ResidentLayer},
             vectored_blob_io::StreamingVectoredReadPlanner,
             Tenant, Timeline,
         },
@@ -1155,7 +1142,8 @@ mod test {
 
                 key = key.next();
             }
-            writer.finish(&timeline, &ctx).await.unwrap()
+            let (desc, path) = writer.finish(&ctx).await.unwrap();
+            Layer::finish_creating(tenant.conf, &timeline, desc, &path).unwrap()
         };
         let original_size = resident.metadata().file_size;
 
@@ -1217,7 +1205,9 @@ mod test {
                 .await
                 .unwrap();
             let replacement = if wrote_keys > 0 {
-                Some(filtered_writer.finish(&timeline, &ctx).await.unwrap())
+                let (desc, path) = filtered_writer.finish(&ctx).await.unwrap();
+                let resident = Layer::finish_creating(tenant.conf, &timeline, desc, &path).unwrap();
+                Some(resident)
             } else {
                 None
             };
@@ -1290,7 +1280,8 @@ mod test {
         for (key, img) in images {
             writer.put_image(key, img, ctx).await?;
         }
-        let img_layer = writer.finish(tline, ctx).await?;
+        let (desc, path) = writer.finish(ctx).await?;
+        let img_layer = Layer::finish_creating(tenant.conf, tline, desc, &path)?;
 
         Ok::<_, anyhow::Error>(img_layer)
     }
diff --git a/pageserver/src/tenant/storage_layer/split_writer.rs b/pageserver/src/tenant/storage_layer/split_writer.rs
index 40a6a77a50..b499a0eef4 100644
--- a/pageserver/src/tenant/storage_layer/split_writer.rs
+++ b/pageserver/src/tenant/storage_layer/split_writer.rs
@@ -121,11 +121,11 @@ impl SplitImageLayerWriter {
                 self.generated_layers
                     .push(SplitWriterResult::Discarded(layer_key));
             } else {
-                self.generated_layers.push(SplitWriterResult::Produced(
-                    prev_image_writer
-                        .finish_with_end_key(tline, key, ctx)
-                        .await?,
-                ));
+                let (desc, path) = prev_image_writer.finish_with_end_key(key, ctx).await?;
+
+                let layer = Layer::finish_creating(self.conf, tline, desc, &path)?;
+                self.generated_layers
+                    .push(SplitWriterResult::Produced(layer));
             }
         }
         self.inner.put_image(key, img, ctx).await
@@ -170,9 +170,9 @@ impl SplitImageLayerWriter {
         if discard(&layer_key).await {
             generated_layers.push(SplitWriterResult::Discarded(layer_key));
         } else {
-            generated_layers.push(SplitWriterResult::Produced(
-                inner.finish_with_end_key(tline, end_key, ctx).await?,
-            ));
+            let (desc, path) = inner.finish_with_end_key(end_key, ctx).await?;
+            let layer = Layer::finish_creating(self.conf, tline, desc, &path)?;
+            generated_layers.push(SplitWriterResult::Produced(layer));
         }
         Ok(generated_layers)
     }
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index 262dccac7d..f66491d962 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -4013,7 +4013,8 @@ impl Timeline {
         if wrote_keys {
             // Normal path: we have written some data into the new image layer for this
             // partition, so flush it to disk.
-            let image_layer = image_layer_writer.finish(self, ctx).await?;
+            let (desc, path) = image_layer_writer.finish(ctx).await?;
+            let image_layer = Layer::finish_creating(self.conf, self, desc, &path)?;
             Ok(ImageLayerCreationOutcome {
                 image: Some(image_layer),
                 next_start_key: img_range.end,
@@ -4101,7 +4102,8 @@ impl Timeline {
         if wrote_any_image {
             // Normal path: we have written some data into the new image layer for this
             // partition, so flush it to disk.
-            let image_layer = image_layer_writer.finish(self, ctx).await?;
+            let (desc, path) = image_layer_writer.finish(ctx).await?;
+            let image_layer = Layer::finish_creating(self.conf, self, desc, &path)?;
             Ok(ImageLayerCreationOutcome {
                 image: Some(image_layer),
                 next_start_key: img_range.end,
@@ -5403,7 +5405,8 @@ impl Timeline {
         for (key, img) in images {
             image_layer_writer.put_image(key, img, ctx).await?;
         }
-        let image_layer = image_layer_writer.finish(self, ctx).await?;
+        let (desc, path) = image_layer_writer.finish(ctx).await?;
+        let image_layer = Layer::finish_creating(self.conf, self, desc, &path)?;
 
         {
             let mut guard = self.layers.write().await;
diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs
index 0b5c520ba7..d1f06e3480 100644
--- a/pageserver/src/tenant/timeline/compaction.rs
+++ b/pageserver/src/tenant/timeline/compaction.rs
@@ -563,10 +563,12 @@ impl Timeline {
                 .await?;
 
             if keys_written > 0 {
-                let new_layer = image_layer_writer
-                    .finish(self, ctx)
+                let (desc, path) = image_layer_writer
+                    .finish(ctx)
                     .await
                     .map_err(CompactionError::Other)?;
+                let new_layer = Layer::finish_creating(self.conf, self, desc, &path)
+                    .map_err(CompactionError::Other)?;
                 tracing::info!(layer=%new_layer, "Rewrote layer, {} -> {} bytes",
                     layer.metadata().file_size,
                     new_layer.metadata().file_size);

From c5cd8577ff6d96e8153dd22af17373c4351e52e4 Mon Sep 17 00:00:00 2001
From: Folke Behrens <folke@neon.tech>
Date: Wed, 18 Sep 2024 13:58:51 +0200
Subject: [PATCH 075/142] proxy: make sql-over-http max request/response sizes
 configurable (#9029)

---
 proxy/src/bin/local_proxy.rs          |   8 ++
 proxy/src/bin/proxy.rs                |   8 ++
 proxy/src/config.rs                   |   2 +
 proxy/src/serverless/conn_pool.rs     |   2 +
 proxy/src/serverless/sql_over_http.rs | 114 +++++++++++++++-----------
 5 files changed, 85 insertions(+), 49 deletions(-)

diff --git a/proxy/src/bin/local_proxy.rs b/proxy/src/bin/local_proxy.rs
index 6eba71df1b..94365ddf05 100644
--- a/proxy/src/bin/local_proxy.rs
+++ b/proxy/src/bin/local_proxy.rs
@@ -92,6 +92,12 @@ struct SqlOverHttpArgs {
 
     #[clap(long, default_value_t = 16)]
     sql_over_http_cancel_set_shards: usize,
+
+    #[clap(long, default_value_t = 10 * 1024 * 1024)] // 10 MiB
+    sql_over_http_max_request_size_bytes: u64,
+
+    #[clap(long, default_value_t = 10 * 1024 * 1024)] // 10 MiB
+    sql_over_http_max_response_size_bytes: usize,
 }
 
 #[tokio::main]
@@ -208,6 +214,8 @@ fn build_config(args: &LocalProxyCliArgs) -> anyhow::Result<&'static ProxyConfig
         },
         cancel_set: CancelSet::new(args.sql_over_http.sql_over_http_cancel_set_shards),
         client_conn_threshold: args.sql_over_http.sql_over_http_client_conn_threshold,
+        max_request_size_bytes: args.sql_over_http.sql_over_http_max_request_size_bytes,
+        max_response_size_bytes: args.sql_over_http.sql_over_http_max_response_size_bytes,
     };
 
     Ok(Box::leak(Box::new(ProxyConfig {
diff --git a/proxy/src/bin/proxy.rs b/proxy/src/bin/proxy.rs
index ca9aeb04d8..e5c5b47795 100644
--- a/proxy/src/bin/proxy.rs
+++ b/proxy/src/bin/proxy.rs
@@ -268,6 +268,12 @@ struct SqlOverHttpArgs {
 
     #[clap(long, default_value_t = 64)]
     sql_over_http_cancel_set_shards: usize,
+
+    #[clap(long, default_value_t = 10 * 1024 * 1024)] // 10 MiB
+    sql_over_http_max_request_size_bytes: u64,
+
+    #[clap(long, default_value_t = 10 * 1024 * 1024)] // 10 MiB
+    sql_over_http_max_response_size_bytes: usize,
 }
 
 #[tokio::main]
@@ -679,6 +685,8 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> {
         },
         cancel_set: CancelSet::new(args.sql_over_http.sql_over_http_cancel_set_shards),
         client_conn_threshold: args.sql_over_http.sql_over_http_client_conn_threshold,
+        max_request_size_bytes: args.sql_over_http.sql_over_http_max_request_size_bytes,
+        max_response_size_bytes: args.sql_over_http.sql_over_http_max_response_size_bytes,
     };
     let authentication_config = AuthenticationConfig {
         thread_pool,
diff --git a/proxy/src/config.rs b/proxy/src/config.rs
index 1cda6d200c..373e4cf650 100644
--- a/proxy/src/config.rs
+++ b/proxy/src/config.rs
@@ -56,6 +56,8 @@ pub struct HttpConfig {
     pub pool_options: GlobalConnPoolOptions,
     pub cancel_set: CancelSet,
     pub client_conn_threshold: u64,
+    pub max_request_size_bytes: u64,
+    pub max_response_size_bytes: usize,
 }
 
 pub struct AuthenticationConfig {
diff --git a/proxy/src/serverless/conn_pool.rs b/proxy/src/serverless/conn_pool.rs
index bea599e9b9..6c32d5df0e 100644
--- a/proxy/src/serverless/conn_pool.rs
+++ b/proxy/src/serverless/conn_pool.rs
@@ -776,6 +776,8 @@ mod tests {
             },
             cancel_set: CancelSet::new(0),
             client_conn_threshold: u64::MAX,
+            max_request_size_bytes: u64::MAX,
+            max_response_size_bytes: usize::MAX,
         }));
         let pool = GlobalConnPool::new(config);
         let conn_info = ConnInfo {
diff --git a/proxy/src/serverless/sql_over_http.rs b/proxy/src/serverless/sql_over_http.rs
index 2188edc8c5..06e540d149 100644
--- a/proxy/src/serverless/sql_over_http.rs
+++ b/proxy/src/serverless/sql_over_http.rs
@@ -87,9 +87,6 @@ enum Payload {
     Batch(BatchQueryData),
 }
 
-const MAX_RESPONSE_SIZE: usize = 10 * 1024 * 1024; // 10 MiB
-const MAX_REQUEST_SIZE: u64 = 10 * 1024 * 1024; // 10 MiB
-
 static CONN_STRING: HeaderName = HeaderName::from_static("neon-connection-string");
 static RAW_TEXT_OUTPUT: HeaderName = HeaderName::from_static("neon-raw-text-output");
 static ARRAY_MODE: HeaderName = HeaderName::from_static("neon-array-mode");
@@ -366,10 +363,10 @@ pub(crate) enum SqlOverHttpError {
     ConnectCompute(#[from] HttpConnError),
     #[error("{0}")]
     ConnInfo(#[from] ConnInfoError),
-    #[error("request is too large (max is {MAX_REQUEST_SIZE} bytes)")]
-    RequestTooLarge,
-    #[error("response is too large (max is {MAX_RESPONSE_SIZE} bytes)")]
-    ResponseTooLarge,
+    #[error("request is too large (max is {0} bytes)")]
+    RequestTooLarge(u64),
+    #[error("response is too large (max is {0} bytes)")]
+    ResponseTooLarge(usize),
     #[error("invalid isolation level")]
     InvalidIsolationLevel,
     #[error("{0}")]
@@ -386,8 +383,8 @@ impl ReportableError for SqlOverHttpError {
             SqlOverHttpError::ReadPayload(e) => e.get_error_kind(),
             SqlOverHttpError::ConnectCompute(e) => e.get_error_kind(),
             SqlOverHttpError::ConnInfo(e) => e.get_error_kind(),
-            SqlOverHttpError::RequestTooLarge => ErrorKind::User,
-            SqlOverHttpError::ResponseTooLarge => ErrorKind::User,
+            SqlOverHttpError::RequestTooLarge(_) => ErrorKind::User,
+            SqlOverHttpError::ResponseTooLarge(_) => ErrorKind::User,
             SqlOverHttpError::InvalidIsolationLevel => ErrorKind::User,
             SqlOverHttpError::Postgres(p) => p.get_error_kind(),
             SqlOverHttpError::JsonConversion(_) => ErrorKind::Postgres,
@@ -402,8 +399,8 @@ impl UserFacingError for SqlOverHttpError {
             SqlOverHttpError::ReadPayload(p) => p.to_string(),
             SqlOverHttpError::ConnectCompute(c) => c.to_string_client(),
             SqlOverHttpError::ConnInfo(c) => c.to_string_client(),
-            SqlOverHttpError::RequestTooLarge => self.to_string(),
-            SqlOverHttpError::ResponseTooLarge => self.to_string(),
+            SqlOverHttpError::RequestTooLarge(_) => self.to_string(),
+            SqlOverHttpError::ResponseTooLarge(_) => self.to_string(),
             SqlOverHttpError::InvalidIsolationLevel => self.to_string(),
             SqlOverHttpError::Postgres(p) => p.to_string(),
             SqlOverHttpError::JsonConversion(_) => "could not parse postgres response".to_string(),
@@ -537,7 +534,7 @@ async fn handle_inner(
 
     let request_content_length = match request.body().size_hint().upper() {
         Some(v) => v,
-        None => MAX_REQUEST_SIZE + 1,
+        None => config.http_config.max_request_size_bytes + 1,
     };
     info!(request_content_length, "request size in bytes");
     Metrics::get()
@@ -547,8 +544,10 @@ async fn handle_inner(
 
     // we don't have a streaming request support yet so this is to prevent OOM
     // from a malicious user sending an extremely large request body
-    if request_content_length > MAX_REQUEST_SIZE {
-        return Err(SqlOverHttpError::RequestTooLarge);
+    if request_content_length > config.http_config.max_request_size_bytes {
+        return Err(SqlOverHttpError::RequestTooLarge(
+            config.http_config.max_request_size_bytes,
+        ));
     }
 
     let fetch_and_process_request = Box::pin(
@@ -612,7 +611,10 @@ async fn handle_inner(
 
     // Now execute the query and return the result.
     let json_output = match payload {
-        Payload::Single(stmt) => stmt.process(cancel, &mut client, parsed_headers).await?,
+        Payload::Single(stmt) => {
+            stmt.process(config, cancel, &mut client, parsed_headers)
+                .await?
+        }
         Payload::Batch(statements) => {
             if parsed_headers.txn_read_only {
                 response = response.header(TXN_READ_ONLY.clone(), &HEADER_VALUE_TRUE);
@@ -628,7 +630,7 @@ async fn handle_inner(
             }
 
             statements
-                .process(cancel, &mut client, parsed_headers)
+                .process(config, cancel, &mut client, parsed_headers)
                 .await?
         }
     };
@@ -656,6 +658,7 @@ async fn handle_inner(
 impl QueryData {
     async fn process(
         self,
+        config: &'static ProxyConfig,
         cancel: CancellationToken,
         client: &mut Client<tokio_postgres::Client>,
         parsed_headers: HttpHeaders,
@@ -664,7 +667,7 @@ impl QueryData {
         let cancel_token = inner.cancel_token();
 
         let res = match select(
-            pin!(query_to_json(&*inner, self, &mut 0, parsed_headers)),
+            pin!(query_to_json(config, &*inner, self, &mut 0, parsed_headers)),
             pin!(cancel.cancelled()),
         )
         .await
@@ -727,6 +730,7 @@ impl QueryData {
 impl BatchQueryData {
     async fn process(
         self,
+        config: &'static ProxyConfig,
         cancel: CancellationToken,
         client: &mut Client<tokio_postgres::Client>,
         parsed_headers: HttpHeaders,
@@ -751,44 +755,52 @@ impl BatchQueryData {
             discard.discard();
         })?;
 
-        let json_output =
-            match query_batch(cancel.child_token(), &transaction, self, parsed_headers).await {
-                Ok(json_output) => {
-                    info!("commit");
-                    let status = transaction.commit().await.inspect_err(|_| {
-                        // if we cannot commit - for now don't return connection to pool
-                        // TODO: get a query status from the error
-                        discard.discard();
-                    })?;
-                    discard.check_idle(status);
-                    json_output
-                }
-                Err(SqlOverHttpError::Cancelled(_)) => {
-                    if let Err(err) = cancel_token.cancel_query(NoTls).await {
-                        tracing::error!(?err, "could not cancel query");
-                    }
-                    // TODO: after cancelling, wait to see if we can get a status. maybe the connection is still safe.
+        let json_output = match query_batch(
+            config,
+            cancel.child_token(),
+            &transaction,
+            self,
+            parsed_headers,
+        )
+        .await
+        {
+            Ok(json_output) => {
+                info!("commit");
+                let status = transaction.commit().await.inspect_err(|_| {
+                    // if we cannot commit - for now don't return connection to pool
+                    // TODO: get a query status from the error
                     discard.discard();
+                })?;
+                discard.check_idle(status);
+                json_output
+            }
+            Err(SqlOverHttpError::Cancelled(_)) => {
+                if let Err(err) = cancel_token.cancel_query(NoTls).await {
+                    tracing::error!(?err, "could not cancel query");
+                }
+                // TODO: after cancelling, wait to see if we can get a status. maybe the connection is still safe.
+                discard.discard();
 
-                    return Err(SqlOverHttpError::Cancelled(SqlOverHttpCancel::Postgres));
-                }
-                Err(err) => {
-                    info!("rollback");
-                    let status = transaction.rollback().await.inspect_err(|_| {
-                        // if we cannot rollback - for now don't return connection to pool
-                        // TODO: get a query status from the error
-                        discard.discard();
-                    })?;
-                    discard.check_idle(status);
-                    return Err(err);
-                }
-            };
+                return Err(SqlOverHttpError::Cancelled(SqlOverHttpCancel::Postgres));
+            }
+            Err(err) => {
+                info!("rollback");
+                let status = transaction.rollback().await.inspect_err(|_| {
+                    // if we cannot rollback - for now don't return connection to pool
+                    // TODO: get a query status from the error
+                    discard.discard();
+                })?;
+                discard.check_idle(status);
+                return Err(err);
+            }
+        };
 
         Ok(json_output)
     }
 }
 
 async fn query_batch(
+    config: &'static ProxyConfig,
     cancel: CancellationToken,
     transaction: &Transaction<'_>,
     queries: BatchQueryData,
@@ -798,6 +810,7 @@ async fn query_batch(
     let mut current_size = 0;
     for stmt in queries.queries {
         let query = pin!(query_to_json(
+            config,
             transaction,
             stmt,
             &mut current_size,
@@ -826,6 +839,7 @@ async fn query_batch(
 }
 
 async fn query_to_json<T: GenericClient>(
+    config: &'static ProxyConfig,
     client: &T,
     data: QueryData,
     current_size: &mut usize,
@@ -846,8 +860,10 @@ async fn query_to_json<T: GenericClient>(
         rows.push(row);
         // we don't have a streaming response support yet so this is to prevent OOM
         // from a malicious query (eg a cross join)
-        if *current_size > MAX_RESPONSE_SIZE {
-            return Err(SqlOverHttpError::ResponseTooLarge);
+        if *current_size > config.http_config.max_response_size_bytes {
+            return Err(SqlOverHttpError::ResponseTooLarge(
+                config.http_config.max_response_size_bytes,
+            ));
         }
     }
 

From e161a2fa424e392c74a5f76e8964227e11f72e9c Mon Sep 17 00:00:00 2001
From: Alexander Bayandin <alexander@neon.tech>
Date: Wed, 18 Sep 2024 14:26:47 +0100
Subject: [PATCH 076/142] CI(deploy): fix deploy to staging and prod (#9030)

## Problem

It turns out the previous approach (with `skip_if` input) doesn't work
(from https://github.com/neondatabase/neon/pull/9017).
Revert it and use more straightforward if-conditions

## Summary of changes
- Revert efbe8db7f1bd7775f7dff276ee95fd187cd33c2c
- Add if-condition to `promote-compatibility-data` job and relevant
comments
---
 .github/workflows/_push-to-acr.yml   |  6 ------
 .github/workflows/build_and_test.yml | 10 ++++++----
 2 files changed, 6 insertions(+), 10 deletions(-)

diff --git a/.github/workflows/_push-to-acr.yml b/.github/workflows/_push-to-acr.yml
index 7b6eba2c06..c304172ff7 100644
--- a/.github/workflows/_push-to-acr.yml
+++ b/.github/workflows/_push-to-acr.yml
@@ -26,15 +26,9 @@ on:
         description: Azure tenant ID
         required: true
         type: string
-      skip_if:
-        description: Skip the job if this expression is true
-        required: true
-        type: boolean
 
 jobs:
   push-to-acr:
-    if: ${{ !inputs.skip_if }}
-
     runs-on: ubuntu-22.04
     permissions:
       contents: read  # This is required for actions/checkout
diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml
index d46b8dc1f5..a210c962cb 100644
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -849,6 +849,7 @@ jobs:
           done
 
   push-to-acr-dev:
+    if: github.ref_name == 'main'
     needs: [ tag, promote-images ]
     uses: ./.github/workflows/_push-to-acr.yml
     with:
@@ -858,9 +859,9 @@ jobs:
       registry_name: ${{ vars.AZURE_DEV_REGISTRY_NAME }}
       subscription_id: ${{ vars.AZURE_DEV_SUBSCRIPTION_ID }}
       tenant_id: ${{ vars.AZURE_TENANT_ID }}
-      skip_if: ${{ github.ref_name != 'main' }}
 
   push-to-acr-prod:
+    if: github.ref_name == 'release'|| github.ref_name == 'release-proxy'
     needs: [ tag, promote-images ]
     uses: ./.github/workflows/_push-to-acr.yml
     with:
@@ -870,7 +871,6 @@ jobs:
       registry_name: ${{ vars.AZURE_PROD_REGISTRY_NAME }}
       subscription_id: ${{ vars.AZURE_PROD_SUBSCRIPTION_ID }}
       tenant_id: ${{ vars.AZURE_TENANT_ID }}
-      skip_if: ${{ !startsWith(github.ref_name, 'release') }}
 
   trigger-custom-extensions-build-and-wait:
     needs: [ check-permissions, tag ]
@@ -948,7 +948,8 @@ jobs:
 
   deploy:
     needs: [ check-permissions, promote-images, tag, build-and-test-locally, trigger-custom-extensions-build-and-wait, push-to-acr-dev, push-to-acr-prod ]
-    if: github.ref_name == 'main' || github.ref_name == 'release' || github.ref_name == 'release-proxy'
+    # `!failure() && !cancelled()` is required because the workflow depends on the job that can be skipped: `push-to-acr-dev` and `push-to-acr-prod`
+    if: (github.ref_name == 'main' || github.ref_name == 'release' || github.ref_name == 'release-proxy') && !failure() && !cancelled()
 
     runs-on: [ self-hosted, small ]
     container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:latest
@@ -1046,7 +1047,8 @@ jobs:
   # The job runs on `release` branch and copies compatibility data and Neon artifact from the last *release PR* to the latest directory
   promote-compatibility-data:
     needs: [ deploy ]
-    if: github.ref_name == 'release'
+    # `!failure() && !cancelled()` is required because the workflow transitively depends on the job that can be skipped: `push-to-acr-dev` and `push-to-acr-prod`
+    if: github.ref_name == 'release' && !failure() && !cancelled()
 
     runs-on: ubuntu-22.04
     steps:

From 2f37f0384c1d122fe23512be3b9518288f69b5fe Mon Sep 17 00:00:00 2001
From: Tristan Partin <tristan@neon.tech>
Date: Tue, 17 Sep 2024 12:28:29 -0500
Subject: [PATCH 077/142] Add v17 to revisions.json

Signed-off-by: Tristan Partin <tristan@neon.tech>
---
 vendor/revisions.json | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/vendor/revisions.json b/vendor/revisions.json
index 3a65a507f3..c2c34962bb 100644
--- a/vendor/revisions.json
+++ b/vendor/revisions.json
@@ -1,4 +1,8 @@
 {
+  "v17": [
+    "17rc1",
+    "9156d63ce253bed9d1f76355ceec610e444eaffa"
+  ],
   "v16": [
     "16.4",
     "0baa7346dfd42d61912eeca554c9bb0a190f0a1e"

From ac6a1151aeaf8fcc007b833a97dca57328c44d05 Mon Sep 17 00:00:00 2001
From: Alexander Bayandin <alexander@neon.tech>
Date: Tue, 17 Sep 2024 19:08:47 +0100
Subject: [PATCH 078/142] test_postgres_version: reenable version check for
 prereleased versions

---
 test_runner/regress/test_postgres_version.py | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/test_runner/regress/test_postgres_version.py b/test_runner/regress/test_postgres_version.py
index 4145a303c6..d8626c15a5 100644
--- a/test_runner/regress/test_postgres_version.py
+++ b/test_runner/regress/test_postgres_version.py
@@ -30,9 +30,8 @@ def test_postgres_version(base_dir: Path, pg_bin: PgBin, pg_version: PgVersion):
     version = match.group("version")
     commit = match.group("commit")
 
-    if "." in version:
-        assert (
-            pg_version.v_prefixed in expected_revisions
-        ), f"Released PostgreSQL version `{pg_version.v_prefixed}` doesn't exist in `vendor/revisions.json`, please update it if these changes are intentional"
-        msg = f"Unexpected Postgres {pg_version} version: `{output}`, please update `vendor/revisions.json` if these changes are intentional"
-        assert [version, commit] == expected_revisions[pg_version.v_prefixed], msg
+    assert (
+        pg_version.v_prefixed in expected_revisions
+    ), f"Released PostgreSQL version `{pg_version.v_prefixed}` doesn't exist in `vendor/revisions.json`, please update it if these changes are intentional"
+    msg = f"Unexpected Postgres {pg_version} version: `{output}`, please update `vendor/revisions.json` if these changes are intentional"
+    assert [version, commit] == expected_revisions[pg_version.v_prefixed], msg

From 794bd4b866a385a104e257deae438581b0fd250d Mon Sep 17 00:00:00 2001
From: Folke Behrens <folke@neon.tech>
Date: Wed, 18 Sep 2024 17:14:53 +0200
Subject: [PATCH 079/142] proxy: mock cplane usable without allowed-ips table
 (#9046)

---
 proxy/src/bin/proxy.rs             | 21 ++++++++-------
 proxy/src/console/provider/mock.rs | 42 ++++++++++++++++++------------
 2 files changed, 38 insertions(+), 25 deletions(-)

diff --git a/proxy/src/bin/proxy.rs b/proxy/src/bin/proxy.rs
index e5c5b47795..2ac66ffe8c 100644
--- a/proxy/src/bin/proxy.rs
+++ b/proxy/src/bin/proxy.rs
@@ -62,12 +62,13 @@ static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc;
 #[derive(Clone, Debug, ValueEnum)]
 enum AuthBackendType {
     Console,
-    #[cfg(feature = "testing")]
-    Postgres,
     // clap only shows the name, not the alias, in usage text.
     // TODO: swap name/alias and deprecate "link"
     #[value(name("link"), alias("web"))]
     Web,
+
+    #[cfg(feature = "testing")]
+    Postgres,
 }
 
 /// Neon proxy/router
@@ -639,17 +640,19 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> {
             let api = console::provider::ConsoleBackend::Console(api);
             auth::Backend::Console(MaybeOwned::Owned(api), ())
         }
-        #[cfg(feature = "testing")]
-        AuthBackendType::Postgres => {
-            let url = args.auth_endpoint.parse()?;
-            let api = console::provider::mock::Api::new(url);
-            let api = console::provider::ConsoleBackend::Postgres(api);
-            auth::Backend::Console(MaybeOwned::Owned(api), ())
-        }
+
         AuthBackendType::Web => {
             let url = args.uri.parse()?;
             auth::Backend::Web(MaybeOwned::Owned(url), ())
         }
+
+        #[cfg(feature = "testing")]
+        AuthBackendType::Postgres => {
+            let url = args.auth_endpoint.parse()?;
+            let api = console::provider::mock::Api::new(url, !args.is_private_access_proxy);
+            let api = console::provider::ConsoleBackend::Postgres(api);
+            auth::Backend::Console(MaybeOwned::Owned(api), ())
+        }
     };
 
     let config::ConcurrencyLockOptions {
diff --git a/proxy/src/console/provider/mock.rs b/proxy/src/console/provider/mock.rs
index 08b87cd87a..1b77418de6 100644
--- a/proxy/src/console/provider/mock.rs
+++ b/proxy/src/console/provider/mock.rs
@@ -41,11 +41,15 @@ impl From<tokio_postgres::Error> for ApiError {
 #[derive(Clone)]
 pub struct Api {
     endpoint: ApiUrl,
+    ip_allowlist_check_enabled: bool,
 }
 
 impl Api {
-    pub fn new(endpoint: ApiUrl) -> Self {
-        Self { endpoint }
+    pub fn new(endpoint: ApiUrl, ip_allowlist_check_enabled: bool) -> Self {
+        Self {
+            endpoint,
+            ip_allowlist_check_enabled,
+        }
     }
 
     pub(crate) fn url(&self) -> &str {
@@ -64,6 +68,7 @@ impl Api {
                 tokio_postgres::connect(self.endpoint.as_str(), tokio_postgres::NoTls).await?;
 
             tokio::spawn(connection);
+
             let secret = if let Some(entry) = get_execute_postgres_query(
                 &client,
                 "select rolpassword from pg_catalog.pg_authid where rolname = $1",
@@ -79,21 +84,26 @@ impl Api {
                 warn!("user '{}' does not exist", user_info.user);
                 None
             };
-            let allowed_ips = match get_execute_postgres_query(
-                &client,
-                "select allowed_ips from neon_control_plane.endpoints where endpoint_id = $1",
-                &[&user_info.endpoint.as_str()],
-                "allowed_ips",
-            )
-            .await?
-            {
-                Some(s) => {
-                    info!("got allowed_ips: {s}");
-                    s.split(',')
-                        .map(|s| IpPattern::from_str(s).unwrap())
-                        .collect()
+
+            let allowed_ips = if self.ip_allowlist_check_enabled {
+                match get_execute_postgres_query(
+                    &client,
+                    "select allowed_ips from neon_control_plane.endpoints where endpoint_id = $1",
+                    &[&user_info.endpoint.as_str()],
+                    "allowed_ips",
+                )
+                .await?
+                {
+                    Some(s) => {
+                        info!("got allowed_ips: {s}");
+                        s.split(',')
+                            .map(|s| IpPattern::from_str(s).unwrap())
+                            .collect()
+                    }
+                    None => vec![],
                 }
-                None => vec![],
+            } else {
+                vec![]
             };
 
             Ok((secret, allowed_ips))

From 035a49a6b2e9ffa35dbf94d78042c147e83e896f Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Wed, 18 Sep 2024 18:17:55 +0200
Subject: [PATCH 080/142] `neon_local start`: parallel startup to break cyclic
 dependency (#8950)

(Found this useful during investigation of
https://github.com/neondatabase/cloud/issues/16886.)

Problem
-------

Before this PR, `neon_local` sequentially does the following:
1. launch storcon process
2. wait for storcon to signal readiness
[here](https://github.com/neondatabase/neon/blob/75310fe441b87d399213e365f1364aa9f08aa40d/control_plane/src/storage_controller.rs#L804-L808)
3. start pageserver
4. wait for pageserver to become ready
[here](https://github.com/neondatabase/neon/blob/c43e664ff577d4568722e4e7a2b2c6267b609607/control_plane/src/pageserver.rs#L343-L346)
5. etc

The problem is that storcon's readiness waits for the
[`startup_reconcile`](https://github.com/neondatabase/neon/blob/cbcd4058edb7a2c2bb3bfe1a6fc1ffb0d820b870/storage_controller/src/service.rs#L520-L523)
to complete.

But pageservers aren't started at this point.

So, worst case we wait for `STARTUP_RECONCILE_TIMEOUT/2`, i.e., 15s.

This is more than the 10s default timeout allowed by neon_local.

So, the result is that `neon_local start` fails to start storcon and
stops everything.

Solution
--------

In this PR I chose the radical solution of starting everything in
parallel.

It junks up the output because we do stuff like `print!(".")` to
indicate progress.
We should just abandon that and switch to `utils::logging` + `tracing`
with separate spans for each component.
I can do that in this PR or we leave it as a follow-up.

Alternatives Considered
-----------------------

The Pageserver's `/v1/status` or in fact any endpoint of the mgmt API
will not `accept()` on the mgmt API socket until after the `re-attach`
call to storcon returned success.

So, it's insufficient to change the startup order to start Pageservers
first.

We cannot easily change Pageserver startup order because
`init_tenant_mgr` must complete before we start serving the mgmt API.
Otherwise tenant detach calls et al can race with `init_tenant_mgr`.

We'd have to add a "loading" state to tenant mgr and make all API
endpoints except `/v1/status` wait for _that_ to complete.


Related
-------

- https://github.com/neondatabase/neon/pull/6475
---
 control_plane/src/background_process.rs |   2 +-
 control_plane/src/bin/neon_local.rs     | 145 ++++++++++++++++--------
 control_plane/src/endpoint.rs           |   2 +-
 3 files changed, 99 insertions(+), 50 deletions(-)

diff --git a/control_plane/src/background_process.rs b/control_plane/src/background_process.rs
index 619c5bce3e..94a072e394 100644
--- a/control_plane/src/background_process.rs
+++ b/control_plane/src/background_process.rs
@@ -151,7 +151,7 @@ where
                     print!(".");
                     io::stdout().flush().unwrap();
                 }
-                thread::sleep(RETRY_INTERVAL);
+                tokio::time::sleep(RETRY_INTERVAL).await;
             }
             Err(e) => {
                 println!("error starting process {process_name:?}: {e:#}");
diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs
index 1c94c42865..92f609761a 100644
--- a/control_plane/src/bin/neon_local.rs
+++ b/control_plane/src/bin/neon_local.rs
@@ -34,12 +34,14 @@ use safekeeper_api::{
     DEFAULT_HTTP_LISTEN_PORT as DEFAULT_SAFEKEEPER_HTTP_PORT,
     DEFAULT_PG_LISTEN_PORT as DEFAULT_SAFEKEEPER_PG_PORT,
 };
+use std::borrow::Cow;
 use std::collections::{BTreeSet, HashMap};
 use std::path::PathBuf;
 use std::process::exit;
 use std::str::FromStr;
 use std::time::Duration;
 use storage_broker::DEFAULT_LISTEN_ADDR as DEFAULT_BROKER_ADDR;
+use tokio::task::JoinSet;
 use url::Host;
 use utils::{
     auth::{Claims, Scope},
@@ -87,35 +89,35 @@ fn main() -> Result<()> {
 
     // Check for 'neon init' command first.
     let subcommand_result = if sub_name == "init" {
-        handle_init(sub_args).map(Some)
+        handle_init(sub_args).map(|env| Some(Cow::Owned(env)))
     } else {
         // all other commands need an existing config
-        let mut env =
-            LocalEnv::load_config(&local_env::base_path()).context("Error loading config")?;
-        let original_env = env.clone();
 
+        let env = LocalEnv::load_config(&local_env::base_path()).context("Error loading config")?;
+        let original_env = env.clone();
+        let env = Box::leak(Box::new(env));
         let rt = tokio::runtime::Builder::new_current_thread()
             .enable_all()
             .build()
             .unwrap();
 
         let subcommand_result = match sub_name {
-            "tenant" => rt.block_on(handle_tenant(sub_args, &mut env)),
-            "timeline" => rt.block_on(handle_timeline(sub_args, &mut env)),
-            "start" => rt.block_on(handle_start_all(&env, get_start_timeout(sub_args))),
-            "stop" => rt.block_on(handle_stop_all(sub_args, &env)),
-            "pageserver" => rt.block_on(handle_pageserver(sub_args, &env)),
-            "storage_controller" => rt.block_on(handle_storage_controller(sub_args, &env)),
-            "storage_broker" => rt.block_on(handle_storage_broker(sub_args, &env)),
-            "safekeeper" => rt.block_on(handle_safekeeper(sub_args, &env)),
-            "endpoint" => rt.block_on(handle_endpoint(sub_args, &env)),
-            "mappings" => handle_mappings(sub_args, &mut env),
+            "tenant" => rt.block_on(handle_tenant(sub_args, env)),
+            "timeline" => rt.block_on(handle_timeline(sub_args, env)),
+            "start" => rt.block_on(handle_start_all(env, get_start_timeout(sub_args))),
+            "stop" => rt.block_on(handle_stop_all(sub_args, env)),
+            "pageserver" => rt.block_on(handle_pageserver(sub_args, env)),
+            "storage_controller" => rt.block_on(handle_storage_controller(sub_args, env)),
+            "storage_broker" => rt.block_on(handle_storage_broker(sub_args, env)),
+            "safekeeper" => rt.block_on(handle_safekeeper(sub_args, env)),
+            "endpoint" => rt.block_on(handle_endpoint(sub_args, env)),
+            "mappings" => handle_mappings(sub_args, env),
             "pg" => bail!("'pg' subcommand has been renamed to 'endpoint'"),
             _ => bail!("unexpected subcommand {sub_name}"),
         };
 
-        if original_env != env {
-            subcommand_result.map(|()| Some(env))
+        if &original_env != env {
+            subcommand_result.map(|()| Some(Cow::Borrowed(env)))
         } else {
             subcommand_result.map(|()| None)
         }
@@ -1273,48 +1275,95 @@ async fn handle_storage_broker(sub_match: &ArgMatches, env: &local_env::LocalEnv
 }
 
 async fn handle_start_all(
-    env: &local_env::LocalEnv,
+    env: &'static local_env::LocalEnv,
     retry_timeout: &Duration,
 ) -> anyhow::Result<()> {
+    let Err(errors) = handle_start_all_impl(env, *retry_timeout).await else {
+        neon_start_status_check(env, retry_timeout)
+            .await
+            .context("status check after successful startup of all services")?;
+        return Ok(());
+    };
+
+    eprintln!("startup failed because one or more services could not be started");
+
+    for e in errors {
+        eprintln!("{e}");
+        let debug_repr = format!("{e:?}");
+        for line in debug_repr.lines() {
+            eprintln!("  {line}");
+        }
+    }
+
+    try_stop_all(env, true).await;
+
+    exit(2);
+}
+
+/// Returns Ok() if and only if all services could be started successfully.
+/// Otherwise, returns the list of errors that occurred during startup.
+async fn handle_start_all_impl(
+    env: &'static local_env::LocalEnv,
+    retry_timeout: Duration,
+) -> Result<(), Vec<anyhow::Error>> {
     // Endpoints are not started automatically
 
-    broker::start_broker_process(env, retry_timeout).await?;
+    let mut js = JoinSet::new();
 
-    // Only start the storage controller if the pageserver is configured to need it
-    if env.control_plane_api.is_some() {
-        let storage_controller = StorageController::from_env(env);
-        if let Err(e) = storage_controller
-            .start(NeonStorageControllerStartArgs::with_default_instance_id(
-                (*retry_timeout).into(),
-            ))
-            .await
-        {
-            eprintln!("storage_controller start failed: {:#}", e);
-            try_stop_all(env, true).await;
-            exit(1);
+    // force infalliblity through closure
+    #[allow(clippy::redundant_closure_call)]
+    (|| {
+        js.spawn(async move {
+            let retry_timeout = retry_timeout;
+            broker::start_broker_process(env, &retry_timeout).await
+        });
+
+        // Only start the storage controller if the pageserver is configured to need it
+        if env.control_plane_api.is_some() {
+            js.spawn(async move {
+                let storage_controller = StorageController::from_env(env);
+                storage_controller
+                    .start(NeonStorageControllerStartArgs::with_default_instance_id(
+                        retry_timeout.into(),
+                    ))
+                    .await
+                    .map_err(|e| e.context("start storage_controller"))
+            });
+        }
+
+        for ps_conf in &env.pageservers {
+            js.spawn(async move {
+                let pageserver = PageServerNode::from_env(env, ps_conf);
+                pageserver
+                    .start(&retry_timeout)
+                    .await
+                    .map_err(|e| e.context(format!("start pageserver {}", ps_conf.id)))
+            });
+        }
+
+        for node in env.safekeepers.iter() {
+            js.spawn(async move {
+                let safekeeper = SafekeeperNode::from_env(env, node);
+                safekeeper
+                    .start(vec![], &retry_timeout)
+                    .await
+                    .map_err(|e| e.context(format!("start safekeeper {}", safekeeper.id)))
+            });
+        }
+    })();
+
+    let mut errors = Vec::new();
+    while let Some(result) = js.join_next().await {
+        let result = result.expect("we don't panic or cancel the tasks");
+        if let Err(e) = result {
+            errors.push(e);
         }
     }
 
-    for ps_conf in &env.pageservers {
-        let pageserver = PageServerNode::from_env(env, ps_conf);
-        if let Err(e) = pageserver.start(retry_timeout).await {
-            eprintln!("pageserver {} start failed: {:#}", ps_conf.id, e);
-            try_stop_all(env, true).await;
-            exit(1);
-        }
+    if !errors.is_empty() {
+        return Err(errors);
     }
 
-    for node in env.safekeepers.iter() {
-        let safekeeper = SafekeeperNode::from_env(env, node);
-        if let Err(e) = safekeeper.start(vec![], retry_timeout).await {
-            eprintln!("safekeeper {} start failed: {:#}", safekeeper.id, e);
-            try_stop_all(env, false).await;
-            exit(1);
-        }
-    }
-
-    neon_start_status_check(env, retry_timeout).await?;
-
     Ok(())
 }
 
diff --git a/control_plane/src/endpoint.rs b/control_plane/src/endpoint.rs
index 9f879c4b08..7554a03a68 100644
--- a/control_plane/src/endpoint.rs
+++ b/control_plane/src/endpoint.rs
@@ -702,7 +702,7 @@ impl Endpoint {
                     }
                 }
             }
-            std::thread::sleep(ATTEMPT_INTERVAL);
+            tokio::time::sleep(ATTEMPT_INTERVAL).await;
         }
 
         // disarm the scopeguard, let the child outlive this function (and neon_local invoction)

From 9d1c6f23d312f43ecd0d7134d7566020626548b2 Mon Sep 17 00:00:00 2001
From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com>
Date: Wed, 18 Sep 2024 14:13:57 -0400
Subject: [PATCH 081/142] fix(storage-scrubber): log version after initialize
 the logger (#9049)

When I checked the log in Grafana I couldn't find the scrubber version.
Then I realized that it should be logged after the logger gets
initialized.

## Summary of changes

Log after initializing the logger for the scrubber.

Signed-off-by: Alex Chi Z <chi@neon.tech>
---
 storage_scrubber/src/main.rs | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/storage_scrubber/src/main.rs b/storage_scrubber/src/main.rs
index c5961753c5..ee133e2e58 100644
--- a/storage_scrubber/src/main.rs
+++ b/storage_scrubber/src/main.rs
@@ -121,8 +121,6 @@ enum Command {
 async fn main() -> anyhow::Result<()> {
     let cli = Cli::parse();
 
-    tracing::info!("version: {}, build_tag {}", GIT_VERSION, BUILD_TAG);
-
     let bucket_config = BucketConfig::from_env()?;
 
     let command_log_name = match &cli.command {
@@ -142,6 +140,8 @@ async fn main() -> anyhow::Result<()> {
         chrono::Utc::now().format("%Y_%m_%d__%H_%M_%S")
     ));
 
+    tracing::info!("version: {}, build_tag {}", GIT_VERSION, BUILD_TAG);
+
     let controller_client = cli.controller_api.map(|controller_api| {
         ControllerClientConfig {
             controller_api,

From 728b79b9ddc9987315e56085b12c5a4c46a7e0f4 Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki@neon.tech>
Date: Wed, 18 Sep 2024 00:12:13 +0300
Subject: [PATCH 082/142] Remove some unnecessary derives

---
 libs/consumption_metrics/src/lib.rs    | 4 ++--
 libs/pageserver_api/src/models.rs      | 2 +-
 libs/postgres_ffi/wal_craft/src/lib.rs | 1 -
 safekeeper/src/pull_timeline.rs        | 4 ++--
 4 files changed, 5 insertions(+), 6 deletions(-)

diff --git a/libs/consumption_metrics/src/lib.rs b/libs/consumption_metrics/src/lib.rs
index 810196aff6..fbe2e6830f 100644
--- a/libs/consumption_metrics/src/lib.rs
+++ b/libs/consumption_metrics/src/lib.rs
@@ -5,7 +5,7 @@ use chrono::{DateTime, Utc};
 use rand::Rng;
 use serde::{Deserialize, Serialize};
 
-#[derive(Serialize, serde::Deserialize, Debug, Clone, Copy, Eq, PartialEq, Ord, PartialOrd)]
+#[derive(Serialize, Deserialize, Debug, Clone, Copy, Eq, PartialEq, Ord, PartialOrd)]
 #[serde(tag = "type")]
 pub enum EventType {
     #[serde(rename = "absolute")]
@@ -107,7 +107,7 @@ pub const CHUNK_SIZE: usize = 1000;
 
 // Just a wrapper around a slice of events
 // to serialize it as `{"events" : [ ] }
-#[derive(serde::Serialize, serde::Deserialize)]
+#[derive(serde::Serialize, Deserialize)]
 pub struct EventChunk<'a, T: Clone> {
     pub events: std::borrow::Cow<'a, [T]>,
 }
diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs
index 45e84baa1f..c9be53f0b0 100644
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -495,7 +495,7 @@ pub struct CompactionAlgorithmSettings {
     pub kind: CompactionAlgorithm,
 }
 
-#[derive(Debug, PartialEq, Eq, Clone, serde::Deserialize, serde::Serialize)]
+#[derive(Debug, PartialEq, Eq, Clone, Deserialize, Serialize)]
 #[serde(tag = "mode", rename_all = "kebab-case", deny_unknown_fields)]
 pub enum L0FlushConfig {
     #[serde(rename_all = "snake_case")]
diff --git a/libs/postgres_ffi/wal_craft/src/lib.rs b/libs/postgres_ffi/wal_craft/src/lib.rs
index 949e3f4251..ddaafe65f1 100644
--- a/libs/postgres_ffi/wal_craft/src/lib.rs
+++ b/libs/postgres_ffi/wal_craft/src/lib.rs
@@ -26,7 +26,6 @@ macro_rules! xlog_utils_test {
 
 postgres_ffi::for_all_postgres_versions! { xlog_utils_test }
 
-#[derive(Debug, Clone, PartialEq, Eq)]
 pub struct Conf {
     pub pg_version: u32,
     pub pg_distrib_dir: PathBuf,
diff --git a/safekeeper/src/pull_timeline.rs b/safekeeper/src/pull_timeline.rs
index 64585f5edc..c772ae6de7 100644
--- a/safekeeper/src/pull_timeline.rs
+++ b/safekeeper/src/pull_timeline.rs
@@ -278,7 +278,7 @@ impl WalResidentTimeline {
 }
 
 /// pull_timeline request body.
-#[derive(Debug, Serialize, Deserialize)]
+#[derive(Debug, Deserialize)]
 pub struct Request {
     pub tenant_id: TenantId,
     pub timeline_id: TimelineId,
@@ -293,7 +293,7 @@ pub struct Response {
 }
 
 /// Response for debug dump request.
-#[derive(Debug, Serialize, Deserialize)]
+#[derive(Debug, Deserialize)]
 pub struct DebugDumpResponse {
     pub start_time: DateTime<Utc>,
     pub finish_time: DateTime<Utc>,

From 15ae1fc3df41ba9f1e5093eaf6b8e7b91e32700f Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki@neon.tech>
Date: Thu, 19 Sep 2024 00:38:26 +0300
Subject: [PATCH 083/142] Remove a few postgres constants that were not used

Dead code is generally useless, but with Postgres constants in
particular, I'm also worried that if they're not used anywhere, we
might fail to update them at a Postgres version update, and get very
confused later when they have wrong values.
---
 libs/postgres_ffi/src/pg_constants.rs | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/libs/postgres_ffi/src/pg_constants.rs b/libs/postgres_ffi/src/pg_constants.rs
index 61b49a634d..497d011d7a 100644
--- a/libs/postgres_ffi/src/pg_constants.rs
+++ b/libs/postgres_ffi/src/pg_constants.rs
@@ -9,8 +9,8 @@
 //! comments on them.
 //!
 
+use crate::PageHeaderData;
 use crate::BLCKSZ;
-use crate::{PageHeaderData, XLogRecord};
 
 //
 // From pg_tablespace_d.h
@@ -194,8 +194,6 @@ pub const XLR_RMGR_INFO_MASK: u8 = 0xF0;
 pub const XLOG_TBLSPC_CREATE: u8 = 0x00;
 pub const XLOG_TBLSPC_DROP: u8 = 0x10;
 
-pub const SIZEOF_XLOGRECORD: u32 = size_of::<XLogRecord>() as u32;
-
 //
 // from xlogrecord.h
 //
@@ -219,8 +217,6 @@ pub const BKPIMAGE_HAS_HOLE: u8 = 0x01; /* page image has "hole" */
 /* From transam.h */
 pub const FIRST_NORMAL_TRANSACTION_ID: u32 = 3;
 pub const INVALID_TRANSACTION_ID: u32 = 0;
-pub const FIRST_BOOTSTRAP_OBJECT_ID: u32 = 12000;
-pub const FIRST_NORMAL_OBJECT_ID: u32 = 16384;
 
 /* pg_control.h */
 pub const XLOG_CHECKPOINT_SHUTDOWN: u8 = 0x00;

From 7b34c2d7af5c67f413cc93b16ca6dbe6932072f2 Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki@neon.tech>
Date: Wed, 18 Sep 2024 23:39:28 +0300
Subject: [PATCH 084/142] Remove misc dead code in libs/

---
 libs/postgres_backend/src/lib.rs | 10 ----------
 libs/remote_storage/src/lib.rs   |  4 ----
 libs/utils/src/vec_map.rs        | 26 --------------------------
 3 files changed, 40 deletions(-)

diff --git a/libs/postgres_backend/src/lib.rs b/libs/postgres_backend/src/lib.rs
index 8ea4b93fb1..e274d24585 100644
--- a/libs/postgres_backend/src/lib.rs
+++ b/libs/postgres_backend/src/lib.rs
@@ -280,16 +280,6 @@ pub struct PostgresBackend<IO> {
 
 pub type PostgresBackendTCP = PostgresBackend<tokio::net::TcpStream>;
 
-pub fn query_from_cstring(query_string: Bytes) -> Vec<u8> {
-    let mut query_string = query_string.to_vec();
-    if let Some(ch) = query_string.last() {
-        if *ch == 0 {
-            query_string.pop();
-        }
-    }
-    query_string
-}
-
 /// Cast a byte slice to a string slice, dropping null terminator if there's one.
 fn cstr_to_str(bytes: &[u8]) -> anyhow::Result<&str> {
     let without_null = bytes.strip_suffix(&[0]).unwrap_or(bytes);
diff --git a/libs/remote_storage/src/lib.rs b/libs/remote_storage/src/lib.rs
index b5b69c9faf..45267ccda9 100644
--- a/libs/remote_storage/src/lib.rs
+++ b/libs/remote_storage/src/lib.rs
@@ -127,10 +127,6 @@ impl RemotePath {
         &self.0
     }
 
-    pub fn extension(&self) -> Option<&str> {
-        self.0.extension()
-    }
-
     pub fn strip_prefix(&self, p: &RemotePath) -> Result<&Utf8Path, std::path::StripPrefixError> {
         self.0.strip_prefix(&p.0)
     }
diff --git a/libs/utils/src/vec_map.rs b/libs/utils/src/vec_map.rs
index 5f0028bacd..1fe048c6f0 100644
--- a/libs/utils/src/vec_map.rs
+++ b/libs/utils/src/vec_map.rs
@@ -120,32 +120,6 @@ impl<K: Ord, V> VecMap<K, V> {
         Ok((None, delta_size))
     }
 
-    /// Split the map into two.
-    ///
-    /// The left map contains everything before `cutoff` (exclusive).
-    /// Right map contains `cutoff` and everything after (inclusive).
-    pub fn split_at(&self, cutoff: &K) -> (Self, Self)
-    where
-        K: Clone,
-        V: Clone,
-    {
-        let split_idx = self
-            .data
-            .binary_search_by_key(&cutoff, extract_key)
-            .unwrap_or_else(std::convert::identity);
-
-        (
-            VecMap {
-                data: self.data[..split_idx].to_vec(),
-                ordering: self.ordering,
-            },
-            VecMap {
-                data: self.data[split_idx..].to_vec(),
-                ordering: self.ordering,
-            },
-        )
-    }
-
     /// Move items from `other` to the end of `self`, leaving `other` empty.
     /// If the `other` ordering is different from `self` ordering
     /// `ExtendOrderingError` error will be returned.

From 5da2340e740c96ac3f0da110c52c62f184a5c92c Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki@neon.tech>
Date: Thu, 19 Sep 2024 01:05:19 +0300
Subject: [PATCH 085/142] Remove misc dead code in control_plane/

---
 control_plane/src/pageserver.rs | 43 +--------------------------------
 1 file changed, 1 insertion(+), 42 deletions(-)

diff --git a/control_plane/src/pageserver.rs b/control_plane/src/pageserver.rs
index 33ca70af96..cae9416af6 100644
--- a/control_plane/src/pageserver.rs
+++ b/control_plane/src/pageserver.rs
@@ -17,9 +17,7 @@ use std::time::Duration;
 
 use anyhow::{bail, Context};
 use camino::Utf8PathBuf;
-use pageserver_api::models::{
-    self, AuxFilePolicy, LocationConfig, TenantHistorySize, TenantInfo, TimelineInfo,
-};
+use pageserver_api::models::{self, AuxFilePolicy, TenantInfo, TimelineInfo};
 use pageserver_api::shard::TenantShardId;
 use pageserver_client::mgmt_api;
 use postgres_backend::AuthType;
@@ -324,22 +322,6 @@ impl PageServerNode {
         background_process::stop_process(immediate, "pageserver", &self.pid_file())
     }
 
-    pub async fn page_server_psql_client(
-        &self,
-    ) -> anyhow::Result<(
-        tokio_postgres::Client,
-        tokio_postgres::Connection<tokio_postgres::Socket, tokio_postgres::tls::NoTlsStream>,
-    )> {
-        let mut config = self.pg_connection_config.clone();
-        if self.conf.pg_auth_type == AuthType::NeonJWT {
-            let token = self
-                .env
-                .generate_auth_token(&Claims::new(None, Scope::PageServerApi))?;
-            config = config.set_password(Some(token));
-        }
-        Ok(config.connect_no_tls().await?)
-    }
-
     pub async fn check_status(&self) -> mgmt_api::Result<()> {
         self.http_client.status().await
     }
@@ -540,19 +522,6 @@ impl PageServerNode {
         Ok(())
     }
 
-    pub async fn location_config(
-        &self,
-        tenant_shard_id: TenantShardId,
-        config: LocationConfig,
-        flush_ms: Option<Duration>,
-        lazy: bool,
-    ) -> anyhow::Result<()> {
-        Ok(self
-            .http_client
-            .location_config(tenant_shard_id, config, flush_ms, lazy)
-            .await?)
-    }
-
     pub async fn timeline_list(
         &self,
         tenant_shard_id: &TenantShardId,
@@ -636,14 +605,4 @@ impl PageServerNode {
 
         Ok(())
     }
-
-    pub async fn tenant_synthetic_size(
-        &self,
-        tenant_shard_id: TenantShardId,
-    ) -> anyhow::Result<TenantHistorySize> {
-        Ok(self
-            .http_client
-            .tenant_synthetic_size(tenant_shard_id)
-            .await?)
-    }
 }

From 2d4e5af18be812523e24133656523bb3e8ee9ecb Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki@neon.tech>
Date: Thu, 19 Sep 2024 01:04:57 +0300
Subject: [PATCH 086/142] Remove unused code for parsing a postgresql.conf file

---
 control_plane/src/postgresql_conf.rs | 121 +--------------------------
 1 file changed, 1 insertion(+), 120 deletions(-)

diff --git a/control_plane/src/postgresql_conf.rs b/control_plane/src/postgresql_conf.rs
index 638575eb82..5aee12dc97 100644
--- a/control_plane/src/postgresql_conf.rs
+++ b/control_plane/src/postgresql_conf.rs
@@ -4,13 +4,10 @@
 /// NOTE: This doesn't implement the full, correct postgresql.conf syntax. Just
 /// enough to extract a few settings we need in Neon, assuming you don't do
 /// funny stuff like include-directives or funny escaping.
-use anyhow::{bail, Context, Result};
 use once_cell::sync::Lazy;
 use regex::Regex;
 use std::collections::HashMap;
 use std::fmt;
-use std::io::BufRead;
-use std::str::FromStr;
 
 /// In-memory representation of a postgresql.conf file
 #[derive(Default, Debug)]
@@ -19,84 +16,16 @@ pub struct PostgresConf {
     hash: HashMap<String, String>,
 }
 
-static CONF_LINE_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"^((?:\w|\.)+)\s*=\s*(\S+)$").unwrap());
-
 impl PostgresConf {
     pub fn new() -> PostgresConf {
         PostgresConf::default()
     }
 
-    /// Read file into memory
-    pub fn read(read: impl std::io::Read) -> Result<PostgresConf> {
-        let mut result = Self::new();
-
-        for line in std::io::BufReader::new(read).lines() {
-            let line = line?;
-
-            // Store each line in a vector, in original format
-            result.lines.push(line.clone());
-
-            // Also parse each line and insert key=value lines into a hash map.
-            //
-            // FIXME: This doesn't match exactly the flex/bison grammar in PostgreSQL.
-            // But it's close enough for our usage.
-            let line = line.trim();
-            if line.starts_with('#') {
-                // comment, ignore
-                continue;
-            } else if let Some(caps) = CONF_LINE_RE.captures(line) {
-                let name = caps.get(1).unwrap().as_str();
-                let raw_val = caps.get(2).unwrap().as_str();
-
-                if let Ok(val) = deescape_str(raw_val) {
-                    // Note: if there's already an entry in the hash map for
-                    // this key, this will replace it. That's the behavior what
-                    // we want; when PostgreSQL reads the file, each line
-                    // overrides any previous value for the same setting.
-                    result.hash.insert(name.to_string(), val.to_string());
-                }
-            }
-        }
-        Ok(result)
-    }
-
     /// Return the current value of 'option'
     pub fn get(&self, option: &str) -> Option<&str> {
         self.hash.get(option).map(|x| x.as_ref())
     }
 
-    /// Return the current value of a field, parsed to the right datatype.
-    ///
-    /// This calls the FromStr::parse() function on the value of the field. If
-    /// the field does not exist, or parsing fails, returns an error.
-    ///
-    pub fn parse_field<T>(&self, field_name: &str, context: &str) -> Result<T>
-    where
-        T: FromStr,
-        <T as FromStr>::Err: std::error::Error + Send + Sync + 'static,
-    {
-        self.get(field_name)
-            .with_context(|| format!("could not find '{}' option {}", field_name, context))?
-            .parse::<T>()
-            .with_context(|| format!("could not parse '{}' option {}", field_name, context))
-    }
-
-    pub fn parse_field_optional<T>(&self, field_name: &str, context: &str) -> Result<Option<T>>
-    where
-        T: FromStr,
-        <T as FromStr>::Err: std::error::Error + Send + Sync + 'static,
-    {
-        if let Some(val) = self.get(field_name) {
-            let result = val
-                .parse::<T>()
-                .with_context(|| format!("could not parse '{}' option {}", field_name, context))?;
-
-            Ok(Some(result))
-        } else {
-            Ok(None)
-        }
-    }
-
     ///
     /// Note: if you call this multiple times for the same option, the config
     /// file will a line for each call. It would be nice to have a function
@@ -154,48 +83,8 @@ fn escape_str(s: &str) -> String {
     }
 }
 
-/// De-escape a possibly-quoted value.
-///
-/// See `DeescapeQuotedString` function in PostgreSQL sources for how PostgreSQL
-/// does this.
-fn deescape_str(s: &str) -> Result<String> {
-    // If the string has a quote at the beginning and end, strip them out.
-    if s.len() >= 2 && s.starts_with('\'') && s.ends_with('\'') {
-        let mut result = String::new();
-
-        let mut iter = s[1..(s.len() - 1)].chars().peekable();
-        while let Some(c) = iter.next() {
-            let newc = if c == '\\' {
-                match iter.next() {
-                    Some('b') => '\x08',
-                    Some('f') => '\x0c',
-                    Some('n') => '\n',
-                    Some('r') => '\r',
-                    Some('t') => '\t',
-                    Some('0'..='7') => {
-                        // TODO
-                        bail!("octal escapes not supported");
-                    }
-                    Some(n) => n,
-                    None => break,
-                }
-            } else if c == '\'' && iter.peek() == Some(&'\'') {
-                // doubled quote becomes just one quote
-                iter.next().unwrap()
-            } else {
-                c
-            };
-
-            result.push(newc);
-        }
-        Ok(result)
-    } else {
-        Ok(s.to_string())
-    }
-}
-
 #[test]
-fn test_postgresql_conf_escapes() -> Result<()> {
+fn test_postgresql_conf_escapes() -> anyhow::Result<()> {
     assert_eq!(escape_str("foo bar"), "'foo bar'");
     // these don't need to be quoted
     assert_eq!(escape_str("foo"), "foo");
@@ -214,13 +103,5 @@ fn test_postgresql_conf_escapes() -> Result<()> {
     assert_eq!(escape_str("fo\\o"), "'fo\\\\o'");
     assert_eq!(escape_str("10 cats"), "'10 cats'");
 
-    // Test de-escaping
-    assert_eq!(deescape_str(&escape_str("foo"))?, "foo");
-    assert_eq!(deescape_str(&escape_str("fo'o\nba\\r"))?, "fo'o\nba\\r");
-    assert_eq!(deescape_str("'\\b\\f\\n\\r\\t'")?, "\x08\x0c\n\r\t");
-
-    // octal-escapes are currently not supported
-    assert!(deescape_str("'foo\\7\\07\\007'").is_err());
-
     Ok(())
 }

From a523548ed1791c683efe76cf5a2f42443846e358 Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki@neon.tech>
Date: Thu, 19 Sep 2024 00:16:15 +0300
Subject: [PATCH 087/142] Remove unused cleanup_remaining_timeline_fs_traces
 function

There's some more code that still checks for uninit and delete
markers, see callers of is_delete_mark and is_uninit_mark, and github
issue #5718. But these functions were outright dead.
---
 pageserver/src/config.rs                 | 14 +---------
 pageserver/src/tenant/timeline/delete.rs | 35 +-----------------------
 2 files changed, 2 insertions(+), 47 deletions(-)

diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs
index e9f197ec2d..525d9afebc 100644
--- a/pageserver/src/config.rs
+++ b/pageserver/src/config.rs
@@ -13,7 +13,6 @@ use pageserver_api::{
 use remote_storage::{RemotePath, RemoteStorageConfig};
 use std::env;
 use storage_broker::Uri;
-use utils::crashsafe::path_with_suffix_extension;
 use utils::logging::SecretString;
 
 use once_cell::sync::OnceCell;
@@ -33,7 +32,7 @@ use crate::tenant::storage_layer::inmemory_layer::IndexEntry;
 use crate::tenant::{TENANTS_SEGMENT_NAME, TIMELINES_SEGMENT_NAME};
 use crate::virtual_file;
 use crate::virtual_file::io_engine;
-use crate::{TENANT_HEATMAP_BASENAME, TENANT_LOCATION_CONFIG_NAME, TIMELINE_DELETE_MARK_SUFFIX};
+use crate::{TENANT_HEATMAP_BASENAME, TENANT_LOCATION_CONFIG_NAME};
 
 /// Global state of pageserver.
 ///
@@ -257,17 +256,6 @@ impl PageServerConf {
             .join(timeline_id.to_string())
     }
 
-    pub(crate) fn timeline_delete_mark_file_path(
-        &self,
-        tenant_shard_id: TenantShardId,
-        timeline_id: TimelineId,
-    ) -> Utf8PathBuf {
-        path_with_suffix_extension(
-            self.timeline_path(&tenant_shard_id, &timeline_id),
-            TIMELINE_DELETE_MARK_SUFFIX,
-        )
-    }
-
     /// Turns storage remote path of a file into its local path.
     pub fn local_path(&self, remote_path: &RemotePath) -> Utf8PathBuf {
         remote_path.with_base(&self.workdir)
diff --git a/pageserver/src/tenant/timeline/delete.rs b/pageserver/src/tenant/timeline/delete.rs
index dc4118bb4a..90db08ea81 100644
--- a/pageserver/src/tenant/timeline/delete.rs
+++ b/pageserver/src/tenant/timeline/delete.rs
@@ -135,25 +135,6 @@ async fn delete_remote_layers_and_index(timeline: &Timeline) -> anyhow::Result<(
         .context("delete_all")
 }
 
-// This function removs remaining traces of a timeline on disk.
-// Namely: metadata file, timeline directory, delete mark.
-// Note: io::ErrorKind::NotFound are ignored for metadata and timeline dir.
-// delete mark should be present because it is the last step during deletion.
-// (nothing can fail after its deletion)
-async fn cleanup_remaining_timeline_fs_traces(
-    conf: &PageServerConf,
-    tenant_shard_id: TenantShardId,
-    timeline_id: TimelineId,
-) -> anyhow::Result<()> {
-    // Remove delete mark
-    // TODO: once we are confident that no more exist in the field, remove this
-    // line.  It cleans up a legacy marker file that might in rare cases be present.
-    tokio::fs::remove_file(conf.timeline_delete_mark_file_path(tenant_shard_id, timeline_id))
-        .await
-        .or_else(fs_ext::ignore_not_found)
-        .context("remove delete mark")
-}
-
 /// It is important that this gets called when DeletionGuard is being held.
 /// For more context see comments in [`DeleteTimelineFlow::prepare`]
 async fn remove_timeline_from_tenant(
@@ -194,12 +175,10 @@ async fn remove_timeline_from_tenant(
 /// 7. Delete mark file
 ///
 /// It is resumable from any step in case a crash/restart occurs.
-/// There are three entrypoints to the process:
+/// There are two entrypoints to the process:
 /// 1. [`DeleteTimelineFlow::run`] this is the main one called by a management api handler.
 /// 2. [`DeleteTimelineFlow::resume_deletion`] is called during restarts when local metadata is still present
 ///    and we possibly neeed to continue deletion of remote files.
-/// 3. [`DeleteTimelineFlow::cleanup_remaining_timeline_fs_traces`] is used when we deleted remote
-///    index but still have local metadata, timeline directory and delete mark.
 ///
 /// Note the only other place that messes around timeline delete mark is the logic that scans directory with timelines during tenant load.
 #[derive(Default)]
@@ -311,18 +290,6 @@ impl DeleteTimelineFlow {
         Ok(())
     }
 
-    #[instrument(skip_all, fields(%timeline_id))]
-    pub async fn cleanup_remaining_timeline_fs_traces(
-        tenant: &Tenant,
-        timeline_id: TimelineId,
-    ) -> anyhow::Result<()> {
-        let r =
-            cleanup_remaining_timeline_fs_traces(tenant.conf, tenant.tenant_shard_id, timeline_id)
-                .await;
-        info!("Done");
-        r
-    }
-
     fn prepare(
         tenant: &Tenant,
         timeline_id: TimelineId,

From 2753abc0d88cad4ac0e9d96f95fddb7515ff7204 Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki@neon.tech>
Date: Thu, 19 Sep 2024 01:10:09 +0300
Subject: [PATCH 088/142] Remove leftover enums for configuring vectored get
 implementation

The settings were removed in commit b9d2c7bdd5.
---
 libs/pageserver_api/src/config.rs | 34 -------------------------------
 1 file changed, 34 deletions(-)

diff --git a/libs/pageserver_api/src/config.rs b/libs/pageserver_api/src/config.rs
index 1194ee93ef..fa6f594ea5 100644
--- a/libs/pageserver_api/src/config.rs
+++ b/libs/pageserver_api/src/config.rs
@@ -173,40 +173,6 @@ impl Default for EvictionOrder {
     }
 }
 
-#[derive(
-    Eq,
-    PartialEq,
-    Debug,
-    Copy,
-    Clone,
-    strum_macros::EnumString,
-    strum_macros::Display,
-    serde_with::DeserializeFromStr,
-    serde_with::SerializeDisplay,
-)]
-#[strum(serialize_all = "kebab-case")]
-pub enum GetVectoredImpl {
-    Sequential,
-    Vectored,
-}
-
-#[derive(
-    Eq,
-    PartialEq,
-    Debug,
-    Copy,
-    Clone,
-    strum_macros::EnumString,
-    strum_macros::Display,
-    serde_with::DeserializeFromStr,
-    serde_with::SerializeDisplay,
-)]
-#[strum(serialize_all = "kebab-case")]
-pub enum GetImpl {
-    Legacy,
-    Vectored,
-}
-
 #[derive(Copy, Clone, Debug, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
 #[serde(transparent)]
 pub struct MaxVectoredReadBytes(pub NonZeroUsize);

From 5c68e6a1724361824e64dc84b9523809a1581e7b Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki@neon.tech>
Date: Wed, 18 Sep 2024 23:43:06 +0300
Subject: [PATCH 089/142] Remove unused constant

The code that used it was removed in commit b9d2c7bdd5
---
 libs/pageserver_api/src/config.rs | 2 --
 1 file changed, 2 deletions(-)

diff --git a/libs/pageserver_api/src/config.rs b/libs/pageserver_api/src/config.rs
index fa6f594ea5..425e710372 100644
--- a/libs/pageserver_api/src/config.rs
+++ b/libs/pageserver_api/src/config.rs
@@ -304,8 +304,6 @@ pub mod defaults {
     pub const DEFAULT_IMAGE_COMPRESSION: ImageCompressionAlgorithm =
         ImageCompressionAlgorithm::Zstd { level: Some(1) };
 
-    pub const DEFAULT_VALIDATE_VECTORED_GET: bool = false;
-
     pub const DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB: usize = 0;
 
     pub const DEFAULT_IO_BUFFER_ALIGNMENT: usize = 512;

From 06d55a3b12b551902d9a1484459a2da71082a0ca Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki@neon.tech>
Date: Thu, 19 Sep 2024 00:38:42 +0300
Subject: [PATCH 090/142] Clean up concurrent logical size calc semaphore
 initialization

The DEFAULT_CONCURRENT_TENANT_SIZE_LOGICAL_SIZE_QUERIES constant was
unused, because we had just hardcoded it to 1 where the constant
should've been used.

Remove the Default implementation for ConfigurableSemaphore, along with
its DEFAULT_INITIAL constant, since both were unused.
---
 libs/pageserver_api/src/config.rs |  5 ++++-
 pageserver/src/config.rs          | 11 -----------
 2 files changed, 4 insertions(+), 12 deletions(-)

diff --git a/libs/pageserver_api/src/config.rs b/libs/pageserver_api/src/config.rs
index 425e710372..1eb0757592 100644
--- a/libs/pageserver_api/src/config.rs
+++ b/libs/pageserver_api/src/config.rs
@@ -340,7 +340,10 @@ impl Default for ConfigToml {
 
             concurrent_tenant_warmup: (NonZeroUsize::new(DEFAULT_CONCURRENT_TENANT_WARMUP)
                 .expect("Invalid default constant")),
-            concurrent_tenant_size_logical_size_queries: NonZeroUsize::new(1).unwrap(),
+            concurrent_tenant_size_logical_size_queries: NonZeroUsize::new(
+                DEFAULT_CONCURRENT_TENANT_SIZE_LOGICAL_SIZE_QUERIES,
+            )
+            .unwrap(),
             metric_collection_interval: (humantime::parse_duration(
                 DEFAULT_METRIC_COLLECTION_INTERVAL,
             )
diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs
index 525d9afebc..8567c6aa52 100644
--- a/pageserver/src/config.rs
+++ b/pageserver/src/config.rs
@@ -479,11 +479,6 @@ pub struct ConfigurableSemaphore {
 }
 
 impl ConfigurableSemaphore {
-    pub const DEFAULT_INITIAL: NonZeroUsize = match NonZeroUsize::new(1) {
-        Some(x) => x,
-        None => panic!("const unwrap is not yet stable"),
-    };
-
     /// Initializse using a non-zero amount of permits.
     ///
     /// Require a non-zero initial permits, because using permits == 0 is a crude way to disable a
@@ -504,12 +499,6 @@ impl ConfigurableSemaphore {
     }
 }
 
-impl Default for ConfigurableSemaphore {
-    fn default() -> Self {
-        Self::new(Self::DEFAULT_INITIAL)
-    }
-}
-
 impl PartialEq for ConfigurableSemaphore {
     fn eq(&self, other: &Self) -> bool {
         // the number of permits can be increased at runtime, so we cannot really fulfill the

From 7c489092b7c63c0b3e42597b976522face0baaf8 Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki@neon.tech>
Date: Wed, 18 Sep 2024 23:46:09 +0300
Subject: [PATCH 091/142] Remove unused duplicate DEFAULT_INGEST_BATCH_SIZE
 constant

This constant in 'tenant_conf_defaults' was unused, but there's
another constant with the same name in the global 'defaults'. I wish
the setting was configurable per-tenant, but it isn't, so let's remove
the confusing duplicate.
---
 libs/pageserver_api/src/config.rs | 2 --
 1 file changed, 2 deletions(-)

diff --git a/libs/pageserver_api/src/config.rs b/libs/pageserver_api/src/config.rs
index 1eb0757592..61e32bc9ab 100644
--- a/libs/pageserver_api/src/config.rs
+++ b/libs/pageserver_api/src/config.rs
@@ -434,8 +434,6 @@ pub mod tenant_conf_defaults {
     // By default ingest enough WAL for two new L0 layers before checking if new image
     // image layers should be created.
     pub const DEFAULT_IMAGE_LAYER_CREATION_CHECK_THRESHOLD: u8 = 2;
-
-    pub const DEFAULT_INGEST_BATCH_SIZE: u64 = 100;
 }
 
 impl Default for TenantConfigToml {

From 32a0e759bd57f40af6c168f3122a761fb154b5b1 Mon Sep 17 00:00:00 2001
From: Arseny Sher <sher-ars@yandex.ru>
Date: Fri, 13 Sep 2024 01:28:12 +0300
Subject: [PATCH 092/142] safekeeper: add wal_last_modified to debug_dump.

Adds an option to debug_dump to include the highest modified time among all
WAL segments. In passing, replace some str with OsStr to have fewer unwraps.
---
 libs/postgres_ffi/src/xlog_utils.rs           | 34 ++++++++++++++-----
 libs/postgres_ffi/wal_craft/src/lib.rs        |  5 +--
 .../wal_craft/src/xlog_utils_test.rs          | 19 ++++++-----
 safekeeper/src/debug_dump.rs                  | 33 ++++++++++++++++++
 safekeeper/src/http/routes.rs                 |  4 +++
 safekeeper/src/wal_storage.rs                 | 25 ++++++--------
 test_runner/regress/test_wal_acceptor.py      |  1 +
 7 files changed, 88 insertions(+), 33 deletions(-)

diff --git a/libs/postgres_ffi/src/xlog_utils.rs b/libs/postgres_ffi/src/xlog_utils.rs
index 1873734753..a636bd2a97 100644
--- a/libs/postgres_ffi/src/xlog_utils.rs
+++ b/libs/postgres_ffi/src/xlog_utils.rs
@@ -26,6 +26,7 @@ use bytes::{Buf, Bytes};
 use log::*;
 
 use serde::Serialize;
+use std::ffi::OsStr;
 use std::fs::File;
 use std::io::prelude::*;
 use std::io::ErrorKind;
@@ -78,19 +79,34 @@ pub fn XLogFileName(tli: TimeLineID, logSegNo: XLogSegNo, wal_segsz_bytes: usize
     )
 }
 
-pub fn XLogFromFileName(fname: &str, wal_seg_size: usize) -> (XLogSegNo, TimeLineID) {
-    let tli = u32::from_str_radix(&fname[0..8], 16).unwrap();
-    let log = u32::from_str_radix(&fname[8..16], 16).unwrap() as XLogSegNo;
-    let seg = u32::from_str_radix(&fname[16..24], 16).unwrap() as XLogSegNo;
-    (log * XLogSegmentsPerXLogId(wal_seg_size) + seg, tli)
+pub fn XLogFromFileName(
+    fname: &OsStr,
+    wal_seg_size: usize,
+) -> anyhow::Result<(XLogSegNo, TimeLineID)> {
+    if let Some(fname_str) = fname.to_str() {
+        let tli = u32::from_str_radix(&fname_str[0..8], 16)?;
+        let log = u32::from_str_radix(&fname_str[8..16], 16)? as XLogSegNo;
+        let seg = u32::from_str_radix(&fname_str[16..24], 16)? as XLogSegNo;
+        Ok((log * XLogSegmentsPerXLogId(wal_seg_size) + seg, tli))
+    } else {
+        anyhow::bail!("non-ut8 filename: {:?}", fname);
+    }
 }
 
-pub fn IsXLogFileName(fname: &str) -> bool {
-    return fname.len() == XLOG_FNAME_LEN && fname.chars().all(|c| c.is_ascii_hexdigit());
+pub fn IsXLogFileName(fname: &OsStr) -> bool {
+    if let Some(fname) = fname.to_str() {
+        fname.len() == XLOG_FNAME_LEN && fname.chars().all(|c| c.is_ascii_hexdigit())
+    } else {
+        false
+    }
 }
 
-pub fn IsPartialXLogFileName(fname: &str) -> bool {
-    fname.ends_with(".partial") && IsXLogFileName(&fname[0..fname.len() - 8])
+pub fn IsPartialXLogFileName(fname: &OsStr) -> bool {
+    if let Some(fname) = fname.to_str() {
+        fname.ends_with(".partial") && IsXLogFileName(OsStr::new(&fname[0..fname.len() - 8]))
+    } else {
+        false
+    }
 }
 
 /// If LSN points to the beginning of the page, then shift it to first record,
diff --git a/libs/postgres_ffi/wal_craft/src/lib.rs b/libs/postgres_ffi/wal_craft/src/lib.rs
index ddaafe65f1..5c0abda522 100644
--- a/libs/postgres_ffi/wal_craft/src/lib.rs
+++ b/libs/postgres_ffi/wal_craft/src/lib.rs
@@ -7,6 +7,7 @@ use postgres_ffi::{WAL_SEGMENT_SIZE, XLOG_BLCKSZ};
 use postgres_ffi::{
     XLOG_SIZE_OF_XLOG_LONG_PHD, XLOG_SIZE_OF_XLOG_RECORD, XLOG_SIZE_OF_XLOG_SHORT_PHD,
 };
+use std::ffi::OsStr;
 use std::path::{Path, PathBuf};
 use std::process::Command;
 use std::time::{Duration, Instant};
@@ -135,8 +136,8 @@ impl Conf {
 
     pub fn pg_waldump(
         &self,
-        first_segment_name: &str,
-        last_segment_name: &str,
+        first_segment_name: &OsStr,
+        last_segment_name: &OsStr,
     ) -> anyhow::Result<std::process::Output> {
         let first_segment_file = self.datadir.join(first_segment_name);
         let last_segment_file = self.datadir.join(last_segment_name);
diff --git a/libs/postgres_ffi/wal_craft/src/xlog_utils_test.rs b/libs/postgres_ffi/wal_craft/src/xlog_utils_test.rs
index 79d45de67a..9eb3f0e95a 100644
--- a/libs/postgres_ffi/wal_craft/src/xlog_utils_test.rs
+++ b/libs/postgres_ffi/wal_craft/src/xlog_utils_test.rs
@@ -4,6 +4,7 @@ use super::*;
 use crate::{error, info};
 use regex::Regex;
 use std::cmp::min;
+use std::ffi::OsStr;
 use std::fs::{self, File};
 use std::io::Write;
 use std::{env, str::FromStr};
@@ -54,7 +55,7 @@ fn test_end_of_wal<C: crate::Crafter>(test_name: &str) {
         .wal_dir()
         .read_dir()
         .unwrap()
-        .map(|f| f.unwrap().file_name().into_string().unwrap())
+        .map(|f| f.unwrap().file_name())
         .filter(|fname| IsXLogFileName(fname))
         .max()
         .unwrap();
@@ -70,11 +71,11 @@ fn test_end_of_wal<C: crate::Crafter>(test_name: &str) {
             start_lsn
         );
         for file in fs::read_dir(cfg.wal_dir()).unwrap().flatten() {
-            let fname = file.file_name().into_string().unwrap();
+            let fname = file.file_name();
             if !IsXLogFileName(&fname) {
                 continue;
             }
-            let (segno, _) = XLogFromFileName(&fname, WAL_SEGMENT_SIZE);
+            let (segno, _) = XLogFromFileName(&fname, WAL_SEGMENT_SIZE).unwrap();
             let seg_start_lsn = XLogSegNoOffsetToRecPtr(segno, 0, WAL_SEGMENT_SIZE);
             if seg_start_lsn > u64::from(*start_lsn) {
                 continue;
@@ -93,10 +94,10 @@ fn test_end_of_wal<C: crate::Crafter>(test_name: &str) {
     }
 }
 
-fn find_pg_waldump_end_of_wal(cfg: &crate::Conf, last_segment: &str) -> Lsn {
+fn find_pg_waldump_end_of_wal(cfg: &crate::Conf, last_segment: &OsStr) -> Lsn {
     // Get the actual end of WAL by pg_waldump
     let waldump_output = cfg
-        .pg_waldump("000000010000000000000001", last_segment)
+        .pg_waldump(OsStr::new("000000010000000000000001"), last_segment)
         .unwrap()
         .stderr;
     let waldump_output = std::str::from_utf8(&waldump_output).unwrap();
@@ -117,7 +118,7 @@ fn find_pg_waldump_end_of_wal(cfg: &crate::Conf, last_segment: &str) -> Lsn {
 
 fn check_end_of_wal(
     cfg: &crate::Conf,
-    last_segment: &str,
+    last_segment: &OsStr,
     start_lsn: Lsn,
     expected_end_of_wal: Lsn,
 ) {
@@ -132,7 +133,8 @@ fn check_end_of_wal(
     // Rename file to partial to actually find last valid lsn, then rename it back.
     fs::rename(
         cfg.wal_dir().join(last_segment),
-        cfg.wal_dir().join(format!("{}.partial", last_segment)),
+        cfg.wal_dir()
+            .join(format!("{}.partial", last_segment.to_str().unwrap())),
     )
     .unwrap();
     let wal_end = find_end_of_wal(&cfg.wal_dir(), WAL_SEGMENT_SIZE, start_lsn).unwrap();
@@ -142,7 +144,8 @@ fn check_end_of_wal(
     );
     assert_eq!(wal_end, expected_end_of_wal);
     fs::rename(
-        cfg.wal_dir().join(format!("{}.partial", last_segment)),
+        cfg.wal_dir()
+            .join(format!("{}.partial", last_segment.to_str().unwrap())),
         cfg.wal_dir().join(last_segment),
     )
     .unwrap();
diff --git a/safekeeper/src/debug_dump.rs b/safekeeper/src/debug_dump.rs
index 15b0272cd9..589536c7a8 100644
--- a/safekeeper/src/debug_dump.rs
+++ b/safekeeper/src/debug_dump.rs
@@ -17,6 +17,7 @@ use postgres_ffi::MAX_SEND_SIZE;
 use serde::Deserialize;
 use serde::Serialize;
 
+use postgres_ffi::v14::xlog_utils::{IsPartialXLogFileName, IsXLogFileName};
 use sha2::{Digest, Sha256};
 use utils::id::NodeId;
 use utils::id::TenantTimelineId;
@@ -51,6 +52,9 @@ pub struct Args {
     /// Dump full term history. True by default.
     pub dump_term_history: bool,
 
+    /// Dump last modified time of WAL segments. Uses value of `dump_all` by default.
+    pub dump_wal_last_modified: bool,
+
     /// Filter timelines by tenant_id.
     pub tenant_id: Option<TenantId>,
 
@@ -128,12 +132,19 @@ async fn build_from_tli_dump(
         None
     };
 
+    let wal_last_modified = if args.dump_wal_last_modified {
+        get_wal_last_modified(timeline_dir).ok().flatten()
+    } else {
+        None
+    };
+
     Timeline {
         tenant_id: timeline.ttid.tenant_id,
         timeline_id: timeline.ttid.timeline_id,
         control_file,
         memory,
         disk_content,
+        wal_last_modified,
     }
 }
 
@@ -156,6 +167,7 @@ pub struct Timeline {
     pub control_file: Option<TimelinePersistentState>,
     pub memory: Option<Memory>,
     pub disk_content: Option<DiskContent>,
+    pub wal_last_modified: Option<DateTime<Utc>>,
 }
 
 #[derive(Debug, Serialize, Deserialize)]
@@ -302,6 +314,27 @@ fn build_file_info(entry: DirEntry) -> Result<FileInfo> {
     })
 }
 
+/// Get highest modified time of WAL segments in the directory.
+fn get_wal_last_modified(path: &Utf8Path) -> Result<Option<DateTime<Utc>>> {
+    let mut res = None;
+    for entry in fs::read_dir(path)? {
+        if entry.is_err() {
+            continue;
+        }
+        let entry = entry?;
+        /* Ignore files that are not XLOG segments */
+        let fname = entry.file_name();
+        if !IsXLogFileName(&fname) && !IsPartialXLogFileName(&fname) {
+            continue;
+        }
+
+        let metadata = entry.metadata()?;
+        let modified: DateTime<Utc> = DateTime::from(metadata.modified()?);
+        res = std::cmp::max(res, Some(modified));
+    }
+    Ok(res)
+}
+
 /// Converts SafeKeeperConf to Config, filtering out the fields that are not
 /// supposed to be exposed.
 fn build_config(config: SafeKeeperConf) -> Config {
diff --git a/safekeeper/src/http/routes.rs b/safekeeper/src/http/routes.rs
index e482edea55..b4590fe3e5 100644
--- a/safekeeper/src/http/routes.rs
+++ b/safekeeper/src/http/routes.rs
@@ -481,6 +481,7 @@ async fn dump_debug_handler(mut request: Request<Body>) -> Result<Response<Body>
     let mut dump_memory: Option<bool> = None;
     let mut dump_disk_content: Option<bool> = None;
     let mut dump_term_history: Option<bool> = None;
+    let mut dump_wal_last_modified: Option<bool> = None;
     let mut tenant_id: Option<TenantId> = None;
     let mut timeline_id: Option<TimelineId> = None;
 
@@ -494,6 +495,7 @@ async fn dump_debug_handler(mut request: Request<Body>) -> Result<Response<Body>
             "dump_memory" => dump_memory = Some(parse_kv_str(&k, &v)?),
             "dump_disk_content" => dump_disk_content = Some(parse_kv_str(&k, &v)?),
             "dump_term_history" => dump_term_history = Some(parse_kv_str(&k, &v)?),
+            "dump_wal_last_modified" => dump_wal_last_modified = Some(parse_kv_str(&k, &v)?),
             "tenant_id" => tenant_id = Some(parse_kv_str(&k, &v)?),
             "timeline_id" => timeline_id = Some(parse_kv_str(&k, &v)?),
             _ => Err(ApiError::BadRequest(anyhow::anyhow!(
@@ -508,6 +510,7 @@ async fn dump_debug_handler(mut request: Request<Body>) -> Result<Response<Body>
     let dump_memory = dump_memory.unwrap_or(dump_all);
     let dump_disk_content = dump_disk_content.unwrap_or(dump_all);
     let dump_term_history = dump_term_history.unwrap_or(true);
+    let dump_wal_last_modified = dump_wal_last_modified.unwrap_or(dump_all);
 
     let args = debug_dump::Args {
         dump_all,
@@ -515,6 +518,7 @@ async fn dump_debug_handler(mut request: Request<Body>) -> Result<Response<Body>
         dump_memory,
         dump_disk_content,
         dump_term_history,
+        dump_wal_last_modified,
         tenant_id,
         timeline_id,
     };
diff --git a/safekeeper/src/wal_storage.rs b/safekeeper/src/wal_storage.rs
index 46c260901d..6e7da94973 100644
--- a/safekeeper/src/wal_storage.rs
+++ b/safekeeper/src/wal_storage.rs
@@ -539,20 +539,17 @@ async fn remove_segments_from_disk(
     while let Some(entry) = entries.next_entry().await? {
         let entry_path = entry.path();
         let fname = entry_path.file_name().unwrap();
-
-        if let Some(fname_str) = fname.to_str() {
-            /* Ignore files that are not XLOG segments */
-            if !IsXLogFileName(fname_str) && !IsPartialXLogFileName(fname_str) {
-                continue;
-            }
-            let (segno, _) = XLogFromFileName(fname_str, wal_seg_size);
-            if remove_predicate(segno) {
-                remove_file(entry_path).await?;
-                n_removed += 1;
-                min_removed = min(min_removed, segno);
-                max_removed = max(max_removed, segno);
-                REMOVED_WAL_SEGMENTS.inc();
-            }
+        /* Ignore files that are not XLOG segments */
+        if !IsXLogFileName(fname) && !IsPartialXLogFileName(fname) {
+            continue;
+        }
+        let (segno, _) = XLogFromFileName(fname, wal_seg_size)?;
+        if remove_predicate(segno) {
+            remove_file(entry_path).await?;
+            n_removed += 1;
+            min_removed = min(min_removed, segno);
+            max_removed = max(max_removed, segno);
+            REMOVED_WAL_SEGMENTS.inc();
         }
     }
 
diff --git a/test_runner/regress/test_wal_acceptor.py b/test_runner/regress/test_wal_acceptor.py
index 4bf8cfe88f..8ee548bdb0 100644
--- a/test_runner/regress/test_wal_acceptor.py
+++ b/test_runner/regress/test_wal_acceptor.py
@@ -892,6 +892,7 @@ def test_timeline_status(neon_env_builder: NeonEnvBuilder, auth_enabled: bool):
     log.info(f"debug_dump before reboot {debug_dump_0}")
     assert debug_dump_0["timelines_count"] == 1
     assert debug_dump_0["timelines"][0]["timeline_id"] == str(timeline_id)
+    assert debug_dump_0["timelines"][0]["wal_last_modified"] != ""
 
     endpoint.safe_psql("create table t(i int)")
 

From 21eeafaaa58326bea8411b5b28db58fa0755e47e Mon Sep 17 00:00:00 2001
From: Vlad Lazar <vlad@neon.tech>
Date: Thu, 19 Sep 2024 14:51:00 +0100
Subject: [PATCH 093/142] pageserver: simple fix for vectored read image layer
 skip (#9026)

## Problem

Different keyspaces may require different floor LSNs in vectored
delta layer visits. This patch adds support for such cases.

## Summary of changes

Different keyspaces wishing to read the same layer might
require different stop LSNs (or LSN floors). The start LSN
of the read (or the LSN ceiling) will always be the same.

With this observation, we fix skipping of image layers by
indexing the fringe by layer id plus lsn floor.

This is very simple, but means that we can visit delta layers twice
in certain cases. Still, I think it's very unlikely for any extra
merging to have taken place in this case, so perhaps it makes sense to go
with the simpler patch.

Fixes https://github.com/neondatabase/neon/issues/9012
Alternative to https://github.com/neondatabase/neon/pull/9025
---
 pageserver/src/tenant.rs               | 146 ++++++++++++++++++++++++-
 pageserver/src/tenant/storage_layer.rs |  52 +++++----
 2 files changed, 176 insertions(+), 22 deletions(-)

diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index c6f0e48101..14cb6f508d 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -4164,9 +4164,18 @@ pub(crate) mod harness {
             let records_neon = records.iter().all(|r| apply_neon::can_apply_in_neon(&r.1));
             if records_neon {
                 // For Neon wal records, we can decode without spawning postgres, so do so.
-                let base_img = base_img.expect("Neon WAL redo requires base image").1;
-                let mut page = BytesMut::new();
-                page.extend_from_slice(&base_img);
+                let mut page = match (base_img, records.first()) {
+                    (Some((_lsn, img)), _) => {
+                        let mut page = BytesMut::new();
+                        page.extend_from_slice(&img);
+                        page
+                    }
+                    (_, Some((_lsn, rec))) if rec.will_init() => BytesMut::new(),
+                    _ => {
+                        panic!("Neon WAL redo requires base image or will init record");
+                    }
+                };
+
                 for (record_lsn, record) in records {
                     apply_neon::apply_in_neon(&record, record_lsn, key, &mut page)?;
                 }
@@ -8470,4 +8479,135 @@ mod tests {
 
         Ok(())
     }
+
+    // Regression test for https://github.com/neondatabase/neon/issues/9012
+    // Create an image arrangement where we have to read at different LSN ranges
+    // from a delta layer. This is achieved by overlapping an image layer on top of
+    // a delta layer. Like so:
+    //
+    //     A      B
+    // +----------------+ -> delta_layer
+    // |                |                           ^ lsn
+    // |       =========|-> nested_image_layer      |
+    // |       C        |                           |
+    // +----------------+                           |
+    // ======== -> baseline_image_layer             +-------> key
+    //
+    //
+    // When querying the key range [A, B) we need to read at different LSN ranges
+    // for [A, C) and [C, B). This test checks that the described edge case is handled correctly.
+    #[tokio::test]
+    async fn test_vectored_read_with_nested_image_layer() -> anyhow::Result<()> {
+        let harness = TenantHarness::create("test_vectored_read_with_nested_image_layer").await?;
+        let (tenant, ctx) = harness.load().await;
+
+        let will_init_keys = [2, 6];
+        fn get_key(id: u32) -> Key {
+            let mut key = Key::from_hex("110000000033333333444444445500000000").unwrap();
+            key.field6 = id;
+            key
+        }
+
+        let mut expected_key_values = HashMap::new();
+
+        let baseline_image_layer_lsn = Lsn(0x10);
+        let mut baseline_img_layer = Vec::new();
+        for i in 0..5 {
+            let key = get_key(i);
+            let value = format!("value {i}@{baseline_image_layer_lsn}");
+
+            let removed = expected_key_values.insert(key, value.clone());
+            assert!(removed.is_none());
+
+            baseline_img_layer.push((key, Bytes::from(value)));
+        }
+
+        let nested_image_layer_lsn = Lsn(0x50);
+        let mut nested_img_layer = Vec::new();
+        for i in 5..10 {
+            let key = get_key(i);
+            let value = format!("value {i}@{nested_image_layer_lsn}");
+
+            let removed = expected_key_values.insert(key, value.clone());
+            assert!(removed.is_none());
+
+            nested_img_layer.push((key, Bytes::from(value)));
+        }
+
+        let mut delta_layer_spec = Vec::default();
+        let delta_layer_start_lsn = Lsn(0x20);
+        let mut delta_layer_end_lsn = delta_layer_start_lsn;
+
+        for i in 0..10 {
+            let key = get_key(i);
+            let key_in_nested = nested_img_layer
+                .iter()
+                .any(|(key_with_img, _)| *key_with_img == key);
+            let lsn = {
+                if key_in_nested {
+                    Lsn(nested_image_layer_lsn.0 + 0x10)
+                } else {
+                    delta_layer_start_lsn
+                }
+            };
+
+            let will_init = will_init_keys.contains(&i);
+            if will_init {
+                delta_layer_spec.push((key, lsn, Value::WalRecord(NeonWalRecord::wal_init())));
+
+                expected_key_values.insert(key, "".to_string());
+            } else {
+                let delta = format!("@{lsn}");
+                delta_layer_spec.push((
+                    key,
+                    lsn,
+                    Value::WalRecord(NeonWalRecord::wal_append(&delta)),
+                ));
+
+                expected_key_values
+                    .get_mut(&key)
+                    .expect("An image exists for each key")
+                    .push_str(delta.as_str());
+            }
+            delta_layer_end_lsn = std::cmp::max(delta_layer_start_lsn, lsn);
+        }
+
+        delta_layer_end_lsn = Lsn(delta_layer_end_lsn.0 + 1);
+
+        assert!(
+            nested_image_layer_lsn > delta_layer_start_lsn
+                && nested_image_layer_lsn < delta_layer_end_lsn
+        );
+
+        let tline = tenant
+            .create_test_timeline_with_layers(
+                TIMELINE_ID,
+                baseline_image_layer_lsn,
+                DEFAULT_PG_VERSION,
+                &ctx,
+                vec![DeltaLayerTestDesc::new_with_inferred_key_range(
+                    delta_layer_start_lsn..delta_layer_end_lsn,
+                    delta_layer_spec,
+                )], // delta layers
+                vec![
+                    (baseline_image_layer_lsn, baseline_img_layer),
+                    (nested_image_layer_lsn, nested_img_layer),
+                ], // image layers
+                delta_layer_end_lsn,
+            )
+            .await?;
+
+        let keyspace = KeySpace::single(get_key(0)..get_key(10));
+        let results = tline
+            .get_vectored(keyspace, delta_layer_end_lsn, &ctx)
+            .await
+            .expect("No vectored errors");
+        for (key, res) in results {
+            let value = res.expect("No key errors");
+            let expected_value = expected_key_values.remove(&key).expect("No unknown keys");
+            assert_eq!(value, Bytes::from(expected_value));
+        }
+
+        Ok(())
+    }
 }
diff --git a/pageserver/src/tenant/storage_layer.rs b/pageserver/src/tenant/storage_layer.rs
index dac6b2f893..cd252aa371 100644
--- a/pageserver/src/tenant/storage_layer.rs
+++ b/pageserver/src/tenant/storage_layer.rs
@@ -276,6 +276,16 @@ pub(crate) enum LayerId {
     InMemoryLayerId(InMemoryLayerFileId),
 }
 
+/// Uniquely identify a layer visit by the layer
+/// and LSN floor (or start LSN) of the reads.
+/// The layer itself is not enough since we may
+/// have different LSN lower bounds for delta layer reads.
+#[derive(Debug, PartialEq, Eq, Clone, Hash)]
+struct LayerToVisitId {
+    layer_id: LayerId,
+    lsn_floor: Lsn,
+}
+
 /// Layer wrapper for the read path. Note that it is valid
 /// to use these layers even after external operations have
 /// been performed on them (compaction, freeze, etc.).
@@ -287,9 +297,9 @@ pub(crate) enum ReadableLayer {
 
 /// A partial description of a read to be done.
 #[derive(Debug, Clone)]
-struct ReadDesc {
+struct LayerVisit {
     /// An id used to resolve the readable layer within the fringe
-    layer_id: LayerId,
+    layer_to_visit_id: LayerToVisitId,
     /// Lsn range for the read, used for selecting the next read
     lsn_range: Range<Lsn>,
 }
@@ -303,12 +313,12 @@ struct ReadDesc {
 /// a two layer indexing scheme.
 #[derive(Debug)]
 pub(crate) struct LayerFringe {
-    planned_reads_by_lsn: BinaryHeap<ReadDesc>,
-    layers: HashMap<LayerId, LayerKeyspace>,
+    planned_visits_by_lsn: BinaryHeap<LayerVisit>,
+    visit_reads: HashMap<LayerToVisitId, LayerVisitReads>,
 }
 
 #[derive(Debug)]
-struct LayerKeyspace {
+struct LayerVisitReads {
     layer: ReadableLayer,
     target_keyspace: KeySpaceRandomAccum,
 }
@@ -316,23 +326,23 @@ struct LayerKeyspace {
 impl LayerFringe {
     pub(crate) fn new() -> Self {
         LayerFringe {
-            planned_reads_by_lsn: BinaryHeap::new(),
-            layers: HashMap::new(),
+            planned_visits_by_lsn: BinaryHeap::new(),
+            visit_reads: HashMap::new(),
         }
     }
 
     pub(crate) fn next_layer(&mut self) -> Option<(ReadableLayer, KeySpace, Range<Lsn>)> {
-        let read_desc = match self.planned_reads_by_lsn.pop() {
+        let read_desc = match self.planned_visits_by_lsn.pop() {
             Some(desc) => desc,
             None => return None,
         };
 
-        let removed = self.layers.remove_entry(&read_desc.layer_id);
+        let removed = self.visit_reads.remove_entry(&read_desc.layer_to_visit_id);
 
         match removed {
             Some((
                 _,
-                LayerKeyspace {
+                LayerVisitReads {
                     layer,
                     mut target_keyspace,
                 },
@@ -351,20 +361,24 @@ impl LayerFringe {
         keyspace: KeySpace,
         lsn_range: Range<Lsn>,
     ) {
-        let layer_id = layer.id();
-        let entry = self.layers.entry(layer_id.clone());
+        let layer_to_visit_id = LayerToVisitId {
+            layer_id: layer.id(),
+            lsn_floor: lsn_range.start,
+        };
+
+        let entry = self.visit_reads.entry(layer_to_visit_id.clone());
         match entry {
             Entry::Occupied(mut entry) => {
                 entry.get_mut().target_keyspace.add_keyspace(keyspace);
             }
             Entry::Vacant(entry) => {
-                self.planned_reads_by_lsn.push(ReadDesc {
+                self.planned_visits_by_lsn.push(LayerVisit {
                     lsn_range,
-                    layer_id: layer_id.clone(),
+                    layer_to_visit_id: layer_to_visit_id.clone(),
                 });
                 let mut accum = KeySpaceRandomAccum::new();
                 accum.add_keyspace(keyspace);
-                entry.insert(LayerKeyspace {
+                entry.insert(LayerVisitReads {
                     layer,
                     target_keyspace: accum,
                 });
@@ -379,7 +393,7 @@ impl Default for LayerFringe {
     }
 }
 
-impl Ord for ReadDesc {
+impl Ord for LayerVisit {
     fn cmp(&self, other: &Self) -> Ordering {
         let ord = self.lsn_range.end.cmp(&other.lsn_range.end);
         if ord == std::cmp::Ordering::Equal {
@@ -390,19 +404,19 @@ impl Ord for ReadDesc {
     }
 }
 
-impl PartialOrd for ReadDesc {
+impl PartialOrd for LayerVisit {
     fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
         Some(self.cmp(other))
     }
 }
 
-impl PartialEq for ReadDesc {
+impl PartialEq for LayerVisit {
     fn eq(&self, other: &Self) -> bool {
         self.lsn_range == other.lsn_range
     }
 }
 
-impl Eq for ReadDesc {}
+impl Eq for LayerVisit {}
 
 impl ReadableLayer {
     pub(crate) fn id(&self) -> LayerId {

From ff9f065c4386496193d62f1ff1fadd28cce92910 Mon Sep 17 00:00:00 2001
From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com>
Date: Thu, 19 Sep 2024 10:43:12 -0400
Subject: [PATCH 094/142] impr(pageserver): log image layer creation (#9050)

https://github.com/neondatabase/neon/pull/9028 changed the image layer
creation log to trace level. However, I personally find logging image
layer creation useful when reading the logs -- it makes it clear that
image layer creation is happening and gives a clear idea of the
progress. Therefore, I propose to continue logging it at info level for
the create_image_layers set of functions.

## Summary of changes

* Add info logging for all image layers created in legacy compaction.
* Add info logging for all layer creation in testing functions.

Signed-off-by: Alex Chi Z <chi@neon.tech>
---
 pageserver/src/tenant/timeline.rs | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index f66491d962..a06cea2c66 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -4015,6 +4015,7 @@ impl Timeline {
             // partition, so flush it to disk.
             let (desc, path) = image_layer_writer.finish(ctx).await?;
             let image_layer = Layer::finish_creating(self.conf, self, desc, &path)?;
+            info!("created image layer for rel {}", image_layer.local_path());
             Ok(ImageLayerCreationOutcome {
                 image: Some(image_layer),
                 next_start_key: img_range.end,
@@ -4104,6 +4105,10 @@ impl Timeline {
             // partition, so flush it to disk.
             let (desc, path) = image_layer_writer.finish(ctx).await?;
             let image_layer = Layer::finish_creating(self.conf, self, desc, &path)?;
+            info!(
+                "created image layer for metadata {}",
+                image_layer.local_path()
+            );
             Ok(ImageLayerCreationOutcome {
                 image: Some(image_layer),
                 next_start_key: img_range.end,
@@ -5407,7 +5412,7 @@ impl Timeline {
         }
         let (desc, path) = image_layer_writer.finish(ctx).await?;
         let image_layer = Layer::finish_creating(self.conf, self, desc, &path)?;
-
+        info!("force created image layer {}", image_layer.local_path());
         {
             let mut guard = self.layers.write().await;
             guard.open_mut().unwrap().force_insert_layer(image_layer);
@@ -5486,7 +5491,7 @@ impl Timeline {
         }
         let (desc, path) = delta_layer_writer.finish(deltas.key_range.end, ctx).await?;
         let delta_layer = Layer::finish_creating(self.conf, self, desc, &path)?;
-
+        info!("force created delta layer {}", delta_layer.local_path());
         {
             let mut guard = self.layers.write().await;
             guard.open_mut().unwrap().force_insert_layer(delta_layer);

From 0a1ca7670cbaddb2e83b4e41142b8a7f5fcf0aef Mon Sep 17 00:00:00 2001
From: Conrad Ludgate <conrad@neon.tech>
Date: Thu, 19 Sep 2024 16:09:30 +0100
Subject: [PATCH 095/142] proxy: remove auth info from http conn info & fixup
 jwt api trait (#9047)

misc changes split out from #8855

- **allow cloning the request context in a read-only fashion for
background tasks**
- **propagate endpoint and request context through the jwk cache**
- **only allow password based auth for md5 during testing**
- **remove auth info from conn info**
---
 proxy/src/auth/backend.rs             | 15 ++++------
 proxy/src/auth/backend/hacks.rs       | 14 ++++-----
 proxy/src/auth/backend/jwt.rs         | 43 ++++++++++++++++++++++-----
 proxy/src/auth/backend/local.rs       | 10 +++++--
 proxy/src/console/provider.rs         |  1 +
 proxy/src/context.rs                  | 34 +++++++++++++++++++++
 proxy/src/metrics.rs                  | 20 +++++++++++++
 proxy/src/serverless/backend.rs       |  9 +-----
 proxy/src/serverless/conn_pool.rs     |  9 ++++--
 proxy/src/serverless/sql_over_http.rs | 21 ++++++-------
 test_runner/fixtures/neon_fixtures.py |  3 --
 11 files changed, 127 insertions(+), 52 deletions(-)

diff --git a/proxy/src/auth/backend.rs b/proxy/src/auth/backend.rs
index 5561c9c56d..5bc2f2ff65 100644
--- a/proxy/src/auth/backend.rs
+++ b/proxy/src/auth/backend.rs
@@ -163,6 +163,7 @@ impl ComputeUserInfo {
 }
 
 pub(crate) enum ComputeCredentialKeys {
+    #[cfg(any(test, feature = "testing"))]
     Password(Vec<u8>),
     AuthKeys(AuthKeys),
     None,
@@ -293,16 +294,10 @@ async fn auth_quirks(
     // We now expect to see a very specific payload in the place of password.
     let (info, unauthenticated_password) = match user_info.try_into() {
         Err(info) => {
-            let res = hacks::password_hack_no_authentication(ctx, info, client).await?;
-
-            ctx.set_endpoint_id(res.info.endpoint.clone());
-            let password = match res.keys {
-                ComputeCredentialKeys::Password(p) => p,
-                ComputeCredentialKeys::AuthKeys(_) | ComputeCredentialKeys::None => {
-                    unreachable!("password hack should return a password")
-                }
-            };
-            (res.info, Some(password))
+            let (info, password) =
+                hacks::password_hack_no_authentication(ctx, info, client).await?;
+            ctx.set_endpoint_id(info.endpoint.clone());
+            (info, Some(password))
         }
         Ok(info) => (info, None),
     };
diff --git a/proxy/src/auth/backend/hacks.rs b/proxy/src/auth/backend/hacks.rs
index e9019ce2cf..15123a2623 100644
--- a/proxy/src/auth/backend/hacks.rs
+++ b/proxy/src/auth/backend/hacks.rs
@@ -1,6 +1,4 @@
-use super::{
-    ComputeCredentialKeys, ComputeCredentials, ComputeUserInfo, ComputeUserInfoNoEndpoint,
-};
+use super::{ComputeCredentials, ComputeUserInfo, ComputeUserInfoNoEndpoint};
 use crate::{
     auth::{self, AuthFlow},
     config::AuthenticationConfig,
@@ -63,7 +61,7 @@ pub(crate) async fn password_hack_no_authentication(
     ctx: &RequestMonitoring,
     info: ComputeUserInfoNoEndpoint,
     client: &mut stream::PqStream<Stream<impl AsyncRead + AsyncWrite + Unpin>>,
-) -> auth::Result<ComputeCredentials> {
+) -> auth::Result<(ComputeUserInfo, Vec<u8>)> {
     warn!("project not specified, resorting to the password hack auth flow");
     ctx.set_auth_method(crate::context::AuthMethod::Cleartext);
 
@@ -79,12 +77,12 @@ pub(crate) async fn password_hack_no_authentication(
     info!(project = &*payload.endpoint, "received missing parameter");
 
     // Report tentative success; compute node will check the password anyway.
-    Ok(ComputeCredentials {
-        info: ComputeUserInfo {
+    Ok((
+        ComputeUserInfo {
             user: info.user,
             options: info.options,
             endpoint: payload.endpoint,
         },
-        keys: ComputeCredentialKeys::Password(payload.password),
-    })
+        payload.password,
+    ))
 }
diff --git a/proxy/src/auth/backend/jwt.rs b/proxy/src/auth/backend/jwt.rs
index 1f44e4af5d..94e5999a5f 100644
--- a/proxy/src/auth/backend/jwt.rs
+++ b/proxy/src/auth/backend/jwt.rs
@@ -25,6 +25,8 @@ const MAX_JWK_BODY_SIZE: usize = 64 * 1024;
 pub(crate) trait FetchAuthRules: Clone + Send + Sync + 'static {
     fn fetch_auth_rules(
         &self,
+        ctx: &RequestMonitoring,
+        endpoint: EndpointId,
         role_name: RoleName,
     ) -> impl Future<Output = anyhow::Result<Vec<AuthRule>>> + Send;
 }
@@ -101,7 +103,9 @@ impl JwkCacheEntryLock {
     async fn renew_jwks<F: FetchAuthRules>(
         &self,
         _permit: JwkRenewalPermit<'_>,
+        ctx: &RequestMonitoring,
         client: &reqwest::Client,
+        endpoint: EndpointId,
         role_name: RoleName,
         auth_rules: &F,
     ) -> anyhow::Result<Arc<JwkCacheEntry>> {
@@ -115,7 +119,9 @@ impl JwkCacheEntryLock {
             }
         }
 
-        let rules = auth_rules.fetch_auth_rules(role_name).await?;
+        let rules = auth_rules
+            .fetch_auth_rules(ctx, endpoint, role_name)
+            .await?;
         let mut key_sets =
             ahash::HashMap::with_capacity_and_hasher(rules.len(), ahash::RandomState::new());
         // TODO(conrad): run concurrently
@@ -166,6 +172,7 @@ impl JwkCacheEntryLock {
         self: &Arc<Self>,
         ctx: &RequestMonitoring,
         client: &reqwest::Client,
+        endpoint: EndpointId,
         role_name: RoleName,
         fetch: &F,
     ) -> Result<Arc<JwkCacheEntry>, anyhow::Error> {
@@ -176,7 +183,9 @@ impl JwkCacheEntryLock {
         let Some(cached) = guard else {
             let _paused = ctx.latency_timer_pause(crate::metrics::Waiting::Compute);
             let permit = self.acquire_permit().await;
-            return self.renew_jwks(permit, client, role_name, fetch).await;
+            return self
+                .renew_jwks(permit, ctx, client, endpoint, role_name, fetch)
+                .await;
         };
 
         let last_update = now.duration_since(cached.last_retrieved);
@@ -187,7 +196,9 @@ impl JwkCacheEntryLock {
             let permit = self.acquire_permit().await;
 
             // it's been too long since we checked the keys. wait for them to update.
-            return self.renew_jwks(permit, client, role_name, fetch).await;
+            return self
+                .renew_jwks(permit, ctx, client, endpoint, role_name, fetch)
+                .await;
         }
 
         // every 5 minutes we should spawn a job to eagerly update the token.
@@ -198,8 +209,12 @@ impl JwkCacheEntryLock {
                 let entry = self.clone();
                 let client = client.clone();
                 let fetch = fetch.clone();
+                let ctx = ctx.clone();
                 tokio::spawn(async move {
-                    if let Err(e) = entry.renew_jwks(permit, &client, role_name, &fetch).await {
+                    if let Err(e) = entry
+                        .renew_jwks(permit, &ctx, &client, endpoint, role_name, &fetch)
+                        .await
+                    {
                         tracing::warn!(error=?e, "could not fetch JWKs in background job");
                     }
                 });
@@ -216,6 +231,7 @@ impl JwkCacheEntryLock {
         ctx: &RequestMonitoring,
         jwt: &str,
         client: &reqwest::Client,
+        endpoint: EndpointId,
         role_name: RoleName,
         fetch: &F,
     ) -> Result<(), anyhow::Error> {
@@ -242,7 +258,7 @@ impl JwkCacheEntryLock {
         let kid = header.key_id.context("missing key id")?;
 
         let mut guard = self
-            .get_or_update_jwk_cache(ctx, client, role_name.clone(), fetch)
+            .get_or_update_jwk_cache(ctx, client, endpoint.clone(), role_name.clone(), fetch)
             .await?;
 
         // get the key from the JWKs if possible. If not, wait for the keys to update.
@@ -254,7 +270,14 @@ impl JwkCacheEntryLock {
 
                     let permit = self.acquire_permit().await;
                     guard = self
-                        .renew_jwks(permit, client, role_name.clone(), fetch)
+                        .renew_jwks(
+                            permit,
+                            ctx,
+                            client,
+                            endpoint.clone(),
+                            role_name.clone(),
+                            fetch,
+                        )
                         .await?;
                 }
                 _ => {
@@ -318,7 +341,7 @@ impl JwkCache {
         jwt: &str,
     ) -> Result<(), anyhow::Error> {
         // try with just a read lock first
-        let key = (endpoint, role_name.clone());
+        let key = (endpoint.clone(), role_name.clone());
         let entry = self.map.get(&key).as_deref().map(Arc::clone);
         let entry = entry.unwrap_or_else(|| {
             // acquire a write lock after to insert.
@@ -327,7 +350,7 @@ impl JwkCache {
         });
 
         entry
-            .check_jwt(ctx, jwt, &self.client, role_name, fetch)
+            .check_jwt(ctx, jwt, &self.client, endpoint, role_name, fetch)
             .await
     }
 }
@@ -688,6 +711,8 @@ X0n5X2/pBLJzxZc62ccvZYVnctBiFs6HbSnxpuMQCfkt/BcR/ttIepBQQIW86wHL
         impl FetchAuthRules for Fetch {
             async fn fetch_auth_rules(
                 &self,
+                _ctx: &RequestMonitoring,
+                _endpoint: EndpointId,
                 _role_name: RoleName,
             ) -> anyhow::Result<Vec<AuthRule>> {
                 Ok(vec![
@@ -706,6 +731,7 @@ X0n5X2/pBLJzxZc62ccvZYVnctBiFs6HbSnxpuMQCfkt/BcR/ttIepBQQIW86wHL
         }
 
         let role_name = RoleName::from("user");
+        let endpoint = EndpointId::from("ep");
 
         let jwk_cache = Arc::new(JwkCacheEntryLock::default());
 
@@ -715,6 +741,7 @@ X0n5X2/pBLJzxZc62ccvZYVnctBiFs6HbSnxpuMQCfkt/BcR/ttIepBQQIW86wHL
                     &RequestMonitoring::test(),
                     &token,
                     &client,
+                    endpoint.clone(),
                     role_name.clone(),
                     &Fetch(addr),
                 )
diff --git a/proxy/src/auth/backend/local.rs b/proxy/src/auth/backend/local.rs
index 8124f568cf..2ff2ca00f0 100644
--- a/proxy/src/auth/backend/local.rs
+++ b/proxy/src/auth/backend/local.rs
@@ -9,8 +9,9 @@ use crate::{
         messages::{ColdStartInfo, EndpointJwksResponse, MetricsAuxInfo},
         NodeInfo,
     },
+    context::RequestMonitoring,
     intern::{BranchIdInt, BranchIdTag, EndpointIdTag, InternId, ProjectIdInt, ProjectIdTag},
-    RoleName,
+    EndpointId, RoleName,
 };
 
 use super::jwt::{AuthRule, FetchAuthRules, JwkCache};
@@ -57,7 +58,12 @@ pub struct JwksRoleSettings {
 }
 
 impl FetchAuthRules for StaticAuthRules {
-    async fn fetch_auth_rules(&self, role_name: RoleName) -> anyhow::Result<Vec<AuthRule>> {
+    async fn fetch_auth_rules(
+        &self,
+        _ctx: &RequestMonitoring,
+        _endpoint: EndpointId,
+        role_name: RoleName,
+    ) -> anyhow::Result<Vec<AuthRule>> {
         let mappings = JWKS_ROLE_MAP.load();
         let role_mappings = mappings
             .as_deref()
diff --git a/proxy/src/console/provider.rs b/proxy/src/console/provider.rs
index 12a6e2f12a..16e8da605b 100644
--- a/proxy/src/console/provider.rs
+++ b/proxy/src/console/provider.rs
@@ -303,6 +303,7 @@ impl NodeInfo {
 
     pub(crate) fn set_keys(&mut self, keys: &ComputeCredentialKeys) {
         match keys {
+            #[cfg(any(test, feature = "testing"))]
             ComputeCredentialKeys::Password(password) => self.config.password(password),
             ComputeCredentialKeys::AuthKeys(auth_keys) => self.config.auth_keys(*auth_keys),
             ComputeCredentialKeys::None => &mut self.config,
diff --git a/proxy/src/context.rs b/proxy/src/context.rs
index c013218ad9..021659e175 100644
--- a/proxy/src/context.rs
+++ b/proxy/src/context.rs
@@ -79,6 +79,40 @@ pub(crate) enum AuthMethod {
     Cleartext,
 }
 
+impl Clone for RequestMonitoring {
+    fn clone(&self) -> Self {
+        let inner = self.0.try_lock().expect("should not deadlock");
+        let new = RequestMonitoringInner {
+            peer_addr: inner.peer_addr,
+            session_id: inner.session_id,
+            protocol: inner.protocol,
+            first_packet: inner.first_packet,
+            region: inner.region,
+            span: info_span!("background_task"),
+
+            project: inner.project,
+            branch: inner.branch,
+            endpoint_id: inner.endpoint_id.clone(),
+            dbname: inner.dbname.clone(),
+            user: inner.user.clone(),
+            application: inner.application.clone(),
+            error_kind: inner.error_kind,
+            auth_method: inner.auth_method.clone(),
+            success: inner.success,
+            rejected: inner.rejected,
+            cold_start_info: inner.cold_start_info,
+            pg_options: inner.pg_options.clone(),
+
+            sender: None,
+            disconnect_sender: None,
+            latency_timer: LatencyTimer::noop(inner.protocol),
+            disconnect_timestamp: inner.disconnect_timestamp,
+        };
+
+        Self(TryLock::new(new))
+    }
+}
+
 impl RequestMonitoring {
     pub fn new(
         session_id: Uuid,
diff --git a/proxy/src/metrics.rs b/proxy/src/metrics.rs
index 2da7eac580..c2567e083a 100644
--- a/proxy/src/metrics.rs
+++ b/proxy/src/metrics.rs
@@ -397,6 +397,8 @@ pub struct LatencyTimer {
     protocol: Protocol,
     cold_start_info: ColdStartInfo,
     outcome: ConnectOutcome,
+
+    skip_reporting: bool,
 }
 
 impl LatencyTimer {
@@ -409,6 +411,20 @@ impl LatencyTimer {
             cold_start_info: ColdStartInfo::Unknown,
             // assume failed unless otherwise specified
             outcome: ConnectOutcome::Failed,
+            skip_reporting: false,
+        }
+    }
+
+    pub(crate) fn noop(protocol: Protocol) -> Self {
+        Self {
+            start: time::Instant::now(),
+            stop: None,
+            accumulated: Accumulated::default(),
+            protocol,
+            cold_start_info: ColdStartInfo::Unknown,
+            // assume failed unless otherwise specified
+            outcome: ConnectOutcome::Failed,
+            skip_reporting: true,
         }
     }
 
@@ -443,6 +459,10 @@ pub enum ConnectOutcome {
 
 impl Drop for LatencyTimer {
     fn drop(&mut self) {
+        if self.skip_reporting {
+            return;
+        }
+
         let duration = self
             .stop
             .unwrap_or_else(time::Instant::now)
diff --git a/proxy/src/serverless/backend.rs b/proxy/src/serverless/backend.rs
index d163878528..aa236907db 100644
--- a/proxy/src/serverless/backend.rs
+++ b/proxy/src/serverless/backend.rs
@@ -27,7 +27,7 @@ use crate::{
     Host,
 };
 
-use super::conn_pool::{poll_client, AuthData, Client, ConnInfo, GlobalConnPool};
+use super::conn_pool::{poll_client, Client, ConnInfo, GlobalConnPool};
 
 pub(crate) struct PoolingBackend {
     pub(crate) pool: Arc<GlobalConnPool<tokio_postgres::Client>>,
@@ -274,13 +274,6 @@ impl ConnectMechanism for TokioMechanism {
             .dbname(&self.conn_info.dbname)
             .connect_timeout(timeout);
 
-        match &self.conn_info.auth {
-            AuthData::Jwt(_) => {}
-            AuthData::Password(pw) => {
-                config.password(pw);
-            }
-        }
-
         let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Compute);
         let res = config.connect(tokio_postgres::NoTls).await;
         drop(pause);
diff --git a/proxy/src/serverless/conn_pool.rs b/proxy/src/serverless/conn_pool.rs
index 6c32d5df0e..a850ecd2be 100644
--- a/proxy/src/serverless/conn_pool.rs
+++ b/proxy/src/serverless/conn_pool.rs
@@ -29,11 +29,16 @@ use tracing::{info, info_span, Instrument};
 
 use super::backend::HttpConnError;
 
+#[derive(Debug, Clone)]
+pub(crate) struct ConnInfoWithAuth {
+    pub(crate) conn_info: ConnInfo,
+    pub(crate) auth: AuthData,
+}
+
 #[derive(Debug, Clone)]
 pub(crate) struct ConnInfo {
     pub(crate) user_info: ComputeUserInfo,
     pub(crate) dbname: DbName,
-    pub(crate) auth: AuthData,
 }
 
 #[derive(Debug, Clone)]
@@ -787,7 +792,6 @@ mod tests {
                 options: NeonOptions::default(),
             },
             dbname: "dbname".into(),
-            auth: AuthData::Password("password".as_bytes().into()),
         };
         let ep_pool = Arc::downgrade(
             &pool.get_or_create_endpoint_pool(&conn_info.endpoint_cache_key().unwrap()),
@@ -845,7 +849,6 @@ mod tests {
                 options: NeonOptions::default(),
             },
             dbname: "dbname".into(),
-            auth: AuthData::Password("password".as_bytes().into()),
         };
         let ep_pool = Arc::downgrade(
             &pool.get_or_create_endpoint_pool(&conn_info.endpoint_cache_key().unwrap()),
diff --git a/proxy/src/serverless/sql_over_http.rs b/proxy/src/serverless/sql_over_http.rs
index 06e540d149..7c78439a0a 100644
--- a/proxy/src/serverless/sql_over_http.rs
+++ b/proxy/src/serverless/sql_over_http.rs
@@ -60,6 +60,7 @@ use super::backend::PoolingBackend;
 use super::conn_pool::AuthData;
 use super::conn_pool::Client;
 use super::conn_pool::ConnInfo;
+use super::conn_pool::ConnInfoWithAuth;
 use super::http_util::json_response;
 use super::json::json_to_pg_text;
 use super::json::pg_text_row_to_json;
@@ -148,7 +149,7 @@ fn get_conn_info(
     ctx: &RequestMonitoring,
     headers: &HeaderMap,
     tls: Option<&TlsConfig>,
-) -> Result<ConnInfo, ConnInfoError> {
+) -> Result<ConnInfoWithAuth, ConnInfoError> {
     // HTTP only uses cleartext (for now and likely always)
     ctx.set_auth_method(crate::context::AuthMethod::Cleartext);
 
@@ -235,11 +236,8 @@ fn get_conn_info(
         options: options.unwrap_or_default(),
     };
 
-    Ok(ConnInfo {
-        user_info,
-        dbname,
-        auth,
-    })
+    let conn_info = ConnInfo { user_info, dbname };
+    Ok(ConnInfoWithAuth { conn_info, auth })
 }
 
 // TODO: return different http error codes
@@ -523,7 +521,10 @@ async fn handle_inner(
 
     // TLS config should be there.
     let conn_info = get_conn_info(ctx, headers, config.tls_config.as_ref())?;
-    info!(user = conn_info.user_info.user.as_str(), "credentials");
+    info!(
+        user = conn_info.conn_info.user_info.user.as_str(),
+        "credentials"
+    );
 
     // Allow connection pooling only if explicitly requested
     // or if we have decided that http pool is no longer opt-in
@@ -568,20 +569,20 @@ async fn handle_inner(
                         .authenticate_with_password(
                             ctx,
                             &config.authentication_config,
-                            &conn_info.user_info,
+                            &conn_info.conn_info.user_info,
                             pw,
                         )
                         .await?
                 }
                 AuthData::Jwt(jwt) => {
                     backend
-                        .authenticate_with_jwt(ctx, &conn_info.user_info, jwt)
+                        .authenticate_with_jwt(ctx, &conn_info.conn_info.user_info, jwt)
                         .await?
                 }
             };
 
             let client = backend
-                .connect_to_compute(ctx, conn_info, keys, !allow_pool)
+                .connect_to_compute(ctx, conn_info.conn_info, keys, !allow_pool)
                 .await?;
             // not strictly necessary to mark success here,
             // but it's just insurance for if we forget it somewhere else
diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index cbbb162cc6..fc83cf3f7c 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -3863,9 +3863,6 @@ def static_proxy(
     dbname = vanilla_pg.default_options["dbname"]
     auth_endpoint = f"postgres://proxy:password@{host}:{port}/{dbname}"
 
-    # require password for 'http_auth' user
-    vanilla_pg.edit_hba([f"host {dbname} http_auth {host} password"])
-
     # For simplicity, we use the same user for both `--auth-endpoint` and `safe_psql`
     vanilla_pg.start()
     vanilla_pg.safe_psql("create user proxy with login superuser password 'password'")

From 1708743e786cde41eb1bba51d4e5267895d8227d Mon Sep 17 00:00:00 2001
From: Yuchen Liang <70461588+yliang412@users.noreply.github.com>
Date: Thu, 19 Sep 2024 12:27:10 -0400
Subject: [PATCH 096/142] pageserver: wait for lsn lease duration after
 transition into AttachedSingle (#9024)

Part of #7497, closes https://github.com/neondatabase/neon/issues/8890.

## Problem

Since leases are in-memory objects, we need to take special care of them
after pageserver restarts and while doing a live migration. The approach
we took for pageserver restart is to wait for at least the lease duration
before doing the first GC. We want to do the same for live migration. Since
we do not do any GC when a tenant is in `AttachedStale` or
`AttachedMulti` mode, only the transition from `AttachedMulti` to
`AttachedSingle` requires this treatment.

## Summary of changes

- Added `lsn_lease_deadline` field in `GcBlock::reasons`: the tenant is
temporarily blocked from GC until we reach the deadline. This
information does not persist to S3.
- In `GcBlock::start`, skip the GC iteration if we are blocked by the
lsn lease deadline.
- In `TenantManager::upsert_location`, set the lsn_lease_deadline to
`Instant::now() + lsn_lease_length` so the granted leases have a chance
to be renewed before we run GC for the first time after transitioning
from AttachedMulti to AttachedSingle.

Signed-off-by: Yuchen Liang <yuchen@neon.tech>
Co-authored-by: Joonas Koivunen <joonas@neon.tech>
---
 pageserver/src/tenant/gc_block.rs             | 83 ++++++++++++++-----
 pageserver/src/tenant/mgr.rs                  |  6 ++
 pageserver/src/tenant/tasks.rs                | 18 +---
 test_runner/regress/test_branch_and_gc.py     |  1 +
 test_runner/regress/test_branch_behind.py     |  4 +-
 test_runner/regress/test_branching.py         |  2 +-
 test_runner/regress/test_compaction.py        |  1 +
 test_runner/regress/test_hot_standby.py       |  2 +-
 test_runner/regress/test_layer_eviction.py    |  1 +
 .../regress/test_pageserver_generations.py    |  1 +
 test_runner/regress/test_remote_storage.py    |  2 +
 test_runner/regress/test_sharding.py          |  1 +
 .../regress/test_storage_controller.py        |  2 +-
 test_runner/regress/test_storage_scrubber.py  |  1 +
 test_runner/regress/test_tenant_detach.py     |  4 +-
 .../regress/test_timeline_gc_blocking.py      |  5 +-
 16 files changed, 91 insertions(+), 43 deletions(-)

diff --git a/pageserver/src/tenant/gc_block.rs b/pageserver/src/tenant/gc_block.rs
index 8b41ba1746..1271d25b76 100644
--- a/pageserver/src/tenant/gc_block.rs
+++ b/pageserver/src/tenant/gc_block.rs
@@ -1,11 +1,29 @@
-use std::collections::HashMap;
-
-use utils::id::TimelineId;
+use std::{collections::HashMap, time::Duration};
 
 use super::remote_timeline_client::index::GcBlockingReason;
+use tokio::time::Instant;
+use utils::id::TimelineId;
 
-type Storage = HashMap<TimelineId, enumset::EnumSet<GcBlockingReason>>;
+type TimelinesBlocked = HashMap<TimelineId, enumset::EnumSet<GcBlockingReason>>;
 
+#[derive(Default)]
+struct Storage {
+    timelines_blocked: TimelinesBlocked,
+    /// The deadline before which we are blocked from GC so that
+    /// leases have a chance to be renewed.
+    lsn_lease_deadline: Option<Instant>,
+}
+
+impl Storage {
+    fn is_blocked_by_lsn_lease_deadline(&self) -> bool {
+        self.lsn_lease_deadline
+            .map(|d| Instant::now() < d)
+            .unwrap_or(false)
+    }
+}
+
+/// GcBlock provides persistent (per-timeline) gc blocking and facilitates transient time-based gc
+/// blocking.
 #[derive(Default)]
 pub(crate) struct GcBlock {
     /// The timelines which have current reasons to block gc.
@@ -13,6 +31,12 @@ pub(crate) struct GcBlock {
     /// LOCK ORDER: this is held locked while scheduling the next index_part update. This is done
     /// to keep the this field up to date with RemoteTimelineClient `upload_queue.dirty`.
     reasons: std::sync::Mutex<Storage>,
+
+    /// GC background task or manually run `Tenant::gc_iteration` holds a lock on this.
+    ///
+    /// Do not add any more features taking and forbidding taking this lock. It should be
+    /// `tokio::sync::Notify`, but that is rarely used. On the other side, [`GcBlock::insert`]
+    /// synchronizes with gc attempts by locking and unlocking this mutex.
     blocking: tokio::sync::Mutex<()>,
 }
 
@@ -42,6 +66,20 @@ impl GcBlock {
         }
     }
 
+    /// Sets a deadline before which we cannot proceed to GC due to lsn lease.
+    ///
+    /// We do this as the lease mappings are not persisted to disk. By delaying GC by lease
+    /// length, we guarantee that all the leases we granted before will have a chance to renew
+    /// when we run GC for the first time after restart / transition from AttachedMulti to AttachedSingle.
+    pub(super) fn set_lsn_lease_deadline(&self, lsn_lease_length: Duration) {
+        let deadline = Instant::now() + lsn_lease_length;
+        let mut g = self.reasons.lock().unwrap();
+        g.lsn_lease_deadline = Some(deadline);
+    }
+
+    /// Describe the current gc blocking reasons.
+    ///
+    /// TODO: make this json serializable.
     pub(crate) fn summary(&self) -> Option<BlockingReasons> {
         let g = self.reasons.lock().unwrap();
 
@@ -64,7 +102,7 @@ impl GcBlock {
     ) -> anyhow::Result<bool> {
         let (added, uploaded) = {
             let mut g = self.reasons.lock().unwrap();
-            let set = g.entry(timeline.timeline_id).or_default();
+            let set = g.timelines_blocked.entry(timeline.timeline_id).or_default();
             let added = set.insert(reason);
 
             // LOCK ORDER: intentionally hold the lock, see self.reasons.
@@ -95,7 +133,7 @@ impl GcBlock {
 
         let (remaining_blocks, uploaded) = {
             let mut g = self.reasons.lock().unwrap();
-            match g.entry(timeline.timeline_id) {
+            match g.timelines_blocked.entry(timeline.timeline_id) {
                 Entry::Occupied(mut oe) => {
                     let set = oe.get_mut();
                     set.remove(reason);
@@ -109,7 +147,7 @@ impl GcBlock {
                 }
             }
 
-            let remaining_blocks = g.len();
+            let remaining_blocks = g.timelines_blocked.len();
 
             // LOCK ORDER: intentionally hold the lock while scheduling; see self.reasons
             let uploaded = timeline
@@ -134,11 +172,11 @@ impl GcBlock {
     pub(crate) fn before_delete(&self, timeline: &super::Timeline) {
         let unblocked = {
             let mut g = self.reasons.lock().unwrap();
-            if g.is_empty() {
+            if g.timelines_blocked.is_empty() {
                 return;
             }
 
-            g.remove(&timeline.timeline_id);
+            g.timelines_blocked.remove(&timeline.timeline_id);
 
             BlockingReasons::clean_and_summarize(g).is_none()
         };
@@ -149,10 +187,11 @@ impl GcBlock {
     }
 
     /// Initialize with the non-deleted timelines of this tenant.
-    pub(crate) fn set_scanned(&self, scanned: Storage) {
+    pub(crate) fn set_scanned(&self, scanned: TimelinesBlocked) {
         let mut g = self.reasons.lock().unwrap();
-        assert!(g.is_empty());
-        g.extend(scanned.into_iter().filter(|(_, v)| !v.is_empty()));
+        assert!(g.timelines_blocked.is_empty());
+        g.timelines_blocked
+            .extend(scanned.into_iter().filter(|(_, v)| !v.is_empty()));
 
         if let Some(reasons) = BlockingReasons::clean_and_summarize(g) {
             tracing::info!(summary=?reasons, "initialized with gc blocked");
@@ -166,6 +205,7 @@ pub(super) struct Guard<'a> {
 
 #[derive(Debug)]
 pub(crate) struct BlockingReasons {
+    tenant_blocked_by_lsn_lease_deadline: bool,
     timelines: usize,
     reasons: enumset::EnumSet<GcBlockingReason>,
 }
@@ -174,8 +214,8 @@ impl std::fmt::Display for BlockingReasons {
     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
         write!(
             f,
-            "{} timelines block for {:?}",
-            self.timelines, self.reasons
+            "tenant_blocked_by_lsn_lease_deadline: {}, {} timelines block for {:?}",
+            self.tenant_blocked_by_lsn_lease_deadline, self.timelines, self.reasons
         )
     }
 }
@@ -183,13 +223,15 @@ impl std::fmt::Display for BlockingReasons {
 impl BlockingReasons {
     fn clean_and_summarize(mut g: std::sync::MutexGuard<'_, Storage>) -> Option<Self> {
         let mut reasons = enumset::EnumSet::empty();
-        g.retain(|_key, value| {
+        g.timelines_blocked.retain(|_key, value| {
             reasons = reasons.union(*value);
             !value.is_empty()
         });
-        if !g.is_empty() {
+        let blocked_by_lsn_lease_deadline = g.is_blocked_by_lsn_lease_deadline();
+        if !g.timelines_blocked.is_empty() || blocked_by_lsn_lease_deadline {
             Some(BlockingReasons {
-                timelines: g.len(),
+                tenant_blocked_by_lsn_lease_deadline: blocked_by_lsn_lease_deadline,
+                timelines: g.timelines_blocked.len(),
                 reasons,
             })
         } else {
@@ -198,14 +240,17 @@ impl BlockingReasons {
     }
 
     fn summarize(g: &std::sync::MutexGuard<'_, Storage>) -> Option<Self> {
-        if g.is_empty() {
+        let blocked_by_lsn_lease_deadline = g.is_blocked_by_lsn_lease_deadline();
+        if g.timelines_blocked.is_empty() && !blocked_by_lsn_lease_deadline {
             None
         } else {
             let reasons = g
+                .timelines_blocked
                 .values()
                 .fold(enumset::EnumSet::empty(), |acc, next| acc.union(*next));
             Some(BlockingReasons {
-                timelines: g.len(),
+                tenant_blocked_by_lsn_lease_deadline: blocked_by_lsn_lease_deadline,
+                timelines: g.timelines_blocked.len(),
                 reasons,
             })
         }
diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs
index 2104f41531..1e7c1e10a5 100644
--- a/pageserver/src/tenant/mgr.rs
+++ b/pageserver/src/tenant/mgr.rs
@@ -949,6 +949,12 @@ impl TenantManager {
                 (LocationMode::Attached(attach_conf), Some(TenantSlot::Attached(tenant))) => {
                     match attach_conf.generation.cmp(&tenant.generation) {
                         Ordering::Equal => {
+                            if attach_conf.attach_mode == AttachmentMode::Single {
+                                tenant
+                                    .gc_block
+                                    .set_lsn_lease_deadline(tenant.get_lsn_lease_length());
+                            }
+
                             // A transition from Attached to Attached in the same generation, we may
                             // take our fast path and just provide the updated configuration
                             // to the tenant.
diff --git a/pageserver/src/tenant/tasks.rs b/pageserver/src/tenant/tasks.rs
index 478e9bb4f0..57f0123d8f 100644
--- a/pageserver/src/tenant/tasks.rs
+++ b/pageserver/src/tenant/tasks.rs
@@ -346,6 +346,7 @@ async fn gc_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
             RequestContext::todo_child(TaskKind::GarbageCollector, DownloadBehavior::Download);
 
         let mut first = true;
+        tenant.gc_block.set_lsn_lease_deadline(tenant.get_lsn_lease_length());
         loop {
             tokio::select! {
                 _ = cancel.cancelled() => {
@@ -363,7 +364,6 @@ async fn gc_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
                 first = false;
 
                 let delays = async {
-                    delay_by_lease_length(tenant.get_lsn_lease_length(), &cancel).await?;
                     random_init_delay(period, &cancel).await?;
                     Ok::<_, Cancelled>(())
                 };
@@ -538,28 +538,12 @@ pub(crate) async fn random_init_delay(
         let mut rng = rand::thread_rng();
         rng.gen_range(Duration::ZERO..=period)
     };
-
     match tokio::time::timeout(d, cancel.cancelled()).await {
         Ok(_) => Err(Cancelled),
         Err(_) => Ok(()),
     }
 }
 
-/// Delays GC by defaul lease length at restart.
-///
-/// We do this as the leases mapping are not persisted to disk. By delaying GC by default
-/// length, we gurantees that all the leases we granted before the restart will expire
-/// when we run GC for the first time after the restart.
-pub(crate) async fn delay_by_lease_length(
-    length: Duration,
-    cancel: &CancellationToken,
-) -> Result<(), Cancelled> {
-    match tokio::time::timeout(length, cancel.cancelled()).await {
-        Ok(_) => Err(Cancelled),
-        Err(_) => Ok(()),
-    }
-}
-
 struct Iteration {
     started_at: Instant,
     period: Duration,
diff --git a/test_runner/regress/test_branch_and_gc.py b/test_runner/regress/test_branch_and_gc.py
index f2e3855c12..d7c4cf059a 100644
--- a/test_runner/regress/test_branch_and_gc.py
+++ b/test_runner/regress/test_branch_and_gc.py
@@ -142,6 +142,7 @@ def test_branch_creation_before_gc(neon_simple_env: NeonEnv):
             "image_creation_threshold": "1",
             # set PITR interval to be small, so we can do GC
             "pitr_interval": "0 s",
+            "lsn_lease_length": "0s",
         }
     )
 
diff --git a/test_runner/regress/test_branch_behind.py b/test_runner/regress/test_branch_behind.py
index 0a5336f5a2..2bf7041cf1 100644
--- a/test_runner/regress/test_branch_behind.py
+++ b/test_runner/regress/test_branch_behind.py
@@ -11,7 +11,9 @@ from fixtures.utils import print_gc_result, query_scalar
 #
 def test_branch_behind(neon_env_builder: NeonEnvBuilder):
     # Disable pitr, because here we want to test branch creation after GC
-    env = neon_env_builder.init_start(initial_tenant_conf={"pitr_interval": "0 sec"})
+    env = neon_env_builder.init_start(
+        initial_tenant_conf={"pitr_interval": "0 sec", "lsn_lease_length": "0s"}
+    )
 
     error_regexes = [
         ".*invalid branch start lsn.*",
diff --git a/test_runner/regress/test_branching.py b/test_runner/regress/test_branching.py
index 1729e2fc98..3d5c34a595 100644
--- a/test_runner/regress/test_branching.py
+++ b/test_runner/regress/test_branching.py
@@ -419,7 +419,7 @@ def test_duplicate_creation(neon_env_builder: NeonEnvBuilder):
 
 
 def test_branching_while_stuck_find_gc_cutoffs(neon_env_builder: NeonEnvBuilder):
-    env = neon_env_builder.init_start()
+    env = neon_env_builder.init_start(initial_tenant_conf={"lsn_lease_length": "0s"})
 
     client = env.pageserver.http_client()
 
diff --git a/test_runner/regress/test_compaction.py b/test_runner/regress/test_compaction.py
index be787e0642..cb34551b53 100644
--- a/test_runner/regress/test_compaction.py
+++ b/test_runner/regress/test_compaction.py
@@ -240,6 +240,7 @@ def test_uploads_and_deletions(
         "image_creation_threshold": "1",
         "image_layer_creation_check_threshold": "0",
         "compaction_algorithm": json.dumps({"kind": compaction_algorithm.value}),
+        "lsn_lease_length": "0s",
     }
     env = neon_env_builder.init_start(initial_tenant_conf=tenant_conf)
 
diff --git a/test_runner/regress/test_hot_standby.py b/test_runner/regress/test_hot_standby.py
index d94704012f..35e0c0decb 100644
--- a/test_runner/regress/test_hot_standby.py
+++ b/test_runner/regress/test_hot_standby.py
@@ -222,7 +222,7 @@ def pgbench_accounts_initialized(ep):
 # Without hs feedback enabled we'd see 'User query might have needed to see row
 # versions that must be removed.' errors.
 def test_hot_standby_feedback(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
-    env = neon_env_builder.init_start()
+    env = neon_env_builder.init_start(initial_tenant_conf={"lsn_lease_length": "0s"})
     agressive_vacuum_conf = [
         "log_autovacuum_min_duration = 0",
         "autovacuum_naptime = 10s",
diff --git a/test_runner/regress/test_layer_eviction.py b/test_runner/regress/test_layer_eviction.py
index 193149ea03..97093ea535 100644
--- a/test_runner/regress/test_layer_eviction.py
+++ b/test_runner/regress/test_layer_eviction.py
@@ -173,6 +173,7 @@ def test_gc_of_remote_layers(neon_env_builder: NeonEnvBuilder):
         # "image_creation_threshold": set at runtime
         "compaction_target_size": f"{128 * (1024**2)}",  # make it so that we only have 1 partition => image coverage for delta layers => enables gc of delta layers
         "image_layer_creation_check_threshold": "0",  # always check if a new image layer can be created
+        "lsn_lease_length": "0s",
     }
 
     def tenant_update_config(changes):
diff --git a/test_runner/regress/test_pageserver_generations.py b/test_runner/regress/test_pageserver_generations.py
index c923713432..519994f774 100644
--- a/test_runner/regress/test_pageserver_generations.py
+++ b/test_runner/regress/test_pageserver_generations.py
@@ -53,6 +53,7 @@ TENANT_CONF = {
     # create image layers eagerly, so that GC can remove some layers
     "image_creation_threshold": "1",
     "image_layer_creation_check_threshold": "0",
+    "lsn_lease_length": "0s",
 }
 
 
diff --git a/test_runner/regress/test_remote_storage.py b/test_runner/regress/test_remote_storage.py
index 2e5260ca78..0a57fc9605 100644
--- a/test_runner/regress/test_remote_storage.py
+++ b/test_runner/regress/test_remote_storage.py
@@ -244,6 +244,7 @@ def test_remote_storage_upload_queue_retries(
             # create image layers eagerly, so that GC can remove some layers
             "image_creation_threshold": "1",
             "image_layer_creation_check_threshold": "0",
+            "lsn_lease_length": "0s",
         }
     )
 
@@ -391,6 +392,7 @@ def test_remote_timeline_client_calls_started_metric(
             # disable background compaction and GC. We invoke it manually when we want it to happen.
             "gc_period": "0s",
             "compaction_period": "0s",
+            "lsn_lease_length": "0s",
         }
     )
 
diff --git a/test_runner/regress/test_sharding.py b/test_runner/regress/test_sharding.py
index 4a84dca399..1eb33b2d39 100644
--- a/test_runner/regress/test_sharding.py
+++ b/test_runner/regress/test_sharding.py
@@ -200,6 +200,7 @@ def test_sharding_split_compaction(neon_env_builder: NeonEnvBuilder, failpoint:
         # Disable automatic creation of image layers, as we will create them explicitly when we want them
         "image_creation_threshold": 9999,
         "image_layer_creation_check_threshold": 0,
+        "lsn_lease_length": "0s",
     }
 
     neon_env_builder.storage_controller_config = {
diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py
index dc90a6e9a0..4106efd4f9 100644
--- a/test_runner/regress/test_storage_controller.py
+++ b/test_runner/regress/test_storage_controller.py
@@ -485,7 +485,7 @@ def test_storage_controller_compute_hook(
     httpserver.expect_request("/notify", method="PUT").respond_with_handler(handler)
 
     # Start running
-    env = neon_env_builder.init_start()
+    env = neon_env_builder.init_start(initial_tenant_conf={"lsn_lease_length": "0s"})
 
     # Initial notification from tenant creation
     assert len(notifications) == 1
diff --git a/test_runner/regress/test_storage_scrubber.py b/test_runner/regress/test_storage_scrubber.py
index 848e214c5e..b6c19f03f6 100644
--- a/test_runner/regress/test_storage_scrubber.py
+++ b/test_runner/regress/test_storage_scrubber.py
@@ -204,6 +204,7 @@ def test_scrubber_physical_gc_ancestors(
             # No PITR, so that as soon as child shards generate an image layer, it covers ancestor deltas
             # and makes them GC'able
             "pitr_interval": "0s",
+            "lsn_lease_length": "0s",
         },
     )
 
diff --git a/test_runner/regress/test_tenant_detach.py b/test_runner/regress/test_tenant_detach.py
index b165588636..e7c6d5a4c3 100644
--- a/test_runner/regress/test_tenant_detach.py
+++ b/test_runner/regress/test_tenant_detach.py
@@ -266,13 +266,13 @@ def test_tenant_reattach_while_busy(
 
 
 def test_tenant_detach_smoke(neon_env_builder: NeonEnvBuilder):
-    env = neon_env_builder.init_start()
+    env = neon_env_builder.init_start(initial_tenant_conf={"lsn_lease_length": "0s"})
     pageserver_http = env.pageserver.http_client()
 
     env.pageserver.allowed_errors.extend(PERMIT_PAGE_SERVICE_ERRORS)
 
     # create new nenant
-    tenant_id, timeline_id = env.neon_cli.create_tenant()
+    tenant_id, timeline_id = env.initial_tenant, env.initial_timeline
 
     # assert tenant exists on disk
     assert env.pageserver.tenant_dir(tenant_id).exists()
diff --git a/test_runner/regress/test_timeline_gc_blocking.py b/test_runner/regress/test_timeline_gc_blocking.py
index ddfe9b911f..765c72cf2a 100644
--- a/test_runner/regress/test_timeline_gc_blocking.py
+++ b/test_runner/regress/test_timeline_gc_blocking.py
@@ -45,7 +45,10 @@ def test_gc_blocking_by_timeline(neon_env_builder: NeonEnvBuilder, sharded: bool
     tenant_after = http.tenant_status(env.initial_tenant)
     assert tenant_before != tenant_after
     gc_blocking = tenant_after["gc_blocking"]
-    assert gc_blocking == "BlockingReasons { timelines: 1, reasons: EnumSet(Manual) }"
+    assert (
+        gc_blocking
+        == "BlockingReasons { tenant_blocked_by_lsn_lease_deadline: false, timelines: 1, reasons: EnumSet(Manual) }"
+    )
 
     wait_for_another_gc_round()
     pss.assert_log_contains(gc_skipped_line)

From d0cbfda15c916ee75066919940d0c3da714c5b95 Mon Sep 17 00:00:00 2001
From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com>
Date: Thu, 19 Sep 2024 16:29:28 -0400
Subject: [PATCH 097/142] refactor(pageserver): check layer map valid in one
 place (#9051)

We have 3 places where we implement layer map checks.

## Summary of changes

Now we have a single check function being called in all places.

---------

Signed-off-by: Alex Chi Z <chi@neon.tech>
---
 pageserver/src/tenant.rs                     | 31 +++++++++++
 pageserver/src/tenant/checks.rs              | 55 +++++++++++++++++++
 pageserver/src/tenant/timeline.rs            | 33 ++----------
 pageserver/src/tenant/timeline/compaction.rs | 21 +++-----
 storage_scrubber/src/checks.rs               | 56 ++------------------
 5 files changed, 101 insertions(+), 95 deletions(-)
 create mode 100644 pageserver/src/tenant/checks.rs

diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index 14cb6f508d..d699d56075 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -140,6 +140,7 @@ pub mod metadata;
 pub mod remote_timeline_client;
 pub mod storage_layer;
 
+pub mod checks;
 pub mod config;
 pub mod mgr;
 pub mod secondary;
@@ -1573,6 +1574,9 @@ impl Tenant {
         image_layer_desc: Vec<(Lsn, Vec<(pageserver_api::key::Key, bytes::Bytes)>)>,
         end_lsn: Lsn,
     ) -> anyhow::Result<Arc<Timeline>> {
+        use checks::check_valid_layermap;
+        use itertools::Itertools;
+
         let tline = self
             .create_test_timeline(new_timeline_id, initdb_lsn, pg_version, ctx)
             .await?;
@@ -1587,6 +1591,18 @@ impl Tenant {
                 .force_create_image_layer(lsn, images, Some(initdb_lsn), ctx)
                 .await?;
         }
+        let layer_names = tline
+            .layers
+            .read()
+            .await
+            .layer_map()
+            .unwrap()
+            .iter_historic_layers()
+            .map(|layer| layer.layer_name())
+            .collect_vec();
+        if let Some(err) = check_valid_layermap(&layer_names) {
+            bail!("invalid layermap: {err}");
+        }
         Ok(tline)
     }
 
@@ -3197,6 +3213,9 @@ impl Tenant {
         image_layer_desc: Vec<(Lsn, Vec<(pageserver_api::key::Key, bytes::Bytes)>)>,
         end_lsn: Lsn,
     ) -> anyhow::Result<Arc<Timeline>> {
+        use checks::check_valid_layermap;
+        use itertools::Itertools;
+
         let tline = self
             .branch_timeline_test(src_timeline, dst_id, ancestor_lsn, ctx)
             .await?;
@@ -3217,6 +3236,18 @@ impl Tenant {
                 .force_create_image_layer(lsn, images, Some(ancestor_lsn), ctx)
                 .await?;
         }
+        let layer_names = tline
+            .layers
+            .read()
+            .await
+            .layer_map()
+            .unwrap()
+            .iter_historic_layers()
+            .map(|layer| layer.layer_name())
+            .collect_vec();
+        if let Some(err) = check_valid_layermap(&layer_names) {
+            bail!("invalid layermap: {err}");
+        }
         Ok(tline)
     }
 
diff --git a/pageserver/src/tenant/checks.rs b/pageserver/src/tenant/checks.rs
new file mode 100644
index 0000000000..8eaa8a001c
--- /dev/null
+++ b/pageserver/src/tenant/checks.rs
@@ -0,0 +1,55 @@
+use std::collections::BTreeSet;
+
+use itertools::Itertools;
+
+use super::storage_layer::LayerName;
+
+/// Checks whether a layer map is valid (i.e., is a valid result of the current compaction algorithm if nothing goes wrong).
+/// The function checks if we can split the LSN range of a delta layer only at the LSNs of the delta layers. For example,
+///
+/// ```plain
+/// |       |                 |       |
+/// |   1   |    |   2   |    |   3   |
+/// |       |    |       |    |       |
+/// ```
+///
+/// This is not a valid layer map because the LSN range of layer 1 intersects with the LSN range of layer 2. 1 and 2 should have
+/// the same LSN range.
+///
+/// The exception is that when layer 2 only contains a single key, it could be split over the LSN range. For example,
+///
+/// ```plain
+/// |       |    |   2   |    |       |
+/// |   1   |    |-------|    |   3   |
+/// |       |    |   4   |    |       |
+/// ```
+/// If layer 2 and 4 contain the same single key, this is also a valid layer map.
+pub fn check_valid_layermap(metadata: &[LayerName]) -> Option<String> {
+    let mut lsn_split_point = BTreeSet::new(); // TODO: use a better data structure (range tree / range set?)
+    let mut all_delta_layers = Vec::new();
+    for name in metadata {
+        if let LayerName::Delta(layer) = name {
+            if layer.key_range.start.next() != layer.key_range.end {
+                all_delta_layers.push(layer.clone());
+            }
+        }
+    }
+    for layer in &all_delta_layers {
+        let lsn_range = &layer.lsn_range;
+        lsn_split_point.insert(lsn_range.start);
+        lsn_split_point.insert(lsn_range.end);
+    }
+    for layer in &all_delta_layers {
+        let lsn_range = layer.lsn_range.clone();
+        let intersects = lsn_split_point.range(lsn_range).collect_vec();
+        if intersects.len() > 1 {
+            let err = format!(
+                "layer violates the layer map LSN split assumption: layer {} intersects with LSN [{}]",
+                layer,
+                intersects.into_iter().map(|lsn| lsn.to_string()).join(", ")
+            );
+            return Some(err);
+        }
+    }
+    None
+}
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index a06cea2c66..f08f5caf95 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -5378,7 +5378,8 @@ impl Timeline {
     /// Force create an image layer and place it into the layer map.
     ///
     /// DO NOT use this function directly. Use [`Tenant::branch_timeline_test_with_layers`]
-    /// or [`Tenant::create_test_timeline_with_layers`] to ensure all these layers are placed into the layer map in one run.
+    /// or [`Tenant::create_test_timeline_with_layers`] to ensure all these layers are
+    /// placed into the layer map in one run AND be validated.
     #[cfg(test)]
     pub(super) async fn force_create_image_layer(
         self: &Arc<Timeline>,
@@ -5424,7 +5425,8 @@ impl Timeline {
     /// Force create a delta layer and place it into the layer map.
     ///
     /// DO NOT use this function directly. Use [`Tenant::branch_timeline_test_with_layers`]
-    /// or [`Tenant::create_test_timeline_with_layers`] to ensure all these layers are placed into the layer map in one run.
+    /// or [`Tenant::create_test_timeline_with_layers`] to ensure all these layers are
+    /// placed into the layer map in one run AND are validated.
     #[cfg(test)]
     pub(super) async fn force_create_delta_layer(
         self: &Arc<Timeline>,
@@ -5450,33 +5452,6 @@ impl Timeline {
         if let Some(check_start_lsn) = check_start_lsn {
             assert!(deltas.lsn_range.start >= check_start_lsn);
         }
-        // check if the delta layer does not violate the LSN invariant, the legacy compaction should always produce a batch of
-        // layers of the same start/end LSN, and so should the force inserted layer
-        {
-            /// Checks if a overlaps with b, assume a/b = [start, end).
-            pub fn overlaps_with<T: Ord>(a: &Range<T>, b: &Range<T>) -> bool {
-                !(a.end <= b.start || b.end <= a.start)
-            }
-
-            if deltas.key_range.start.next() != deltas.key_range.end {
-                let guard = self.layers.read().await;
-                let mut invalid_layers =
-                    guard.layer_map()?.iter_historic_layers().filter(|layer| {
-                        layer.is_delta()
-                        && overlaps_with(&layer.lsn_range, &deltas.lsn_range)
-                        && layer.lsn_range != deltas.lsn_range
-                        // skip single-key layer files
-                        && layer.key_range.start.next() != layer.key_range.end
-                    });
-                if let Some(layer) = invalid_layers.next() {
-                    // If a delta layer overlaps with another delta layer AND their LSN range is not the same, panic
-                    panic!(
-                        "inserted layer violates delta layer LSN invariant: current_lsn_range={}..{}, conflict_lsn_range={}..{}",
-                        deltas.lsn_range.start, deltas.lsn_range.end, layer.lsn_range.start, layer.lsn_range.end
-                    );
-                }
-            }
-        }
         let mut delta_layer_writer = DeltaLayerWriter::new(
             self.conf,
             self.timeline_id,
diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs
index d1f06e3480..d1567b6b39 100644
--- a/pageserver/src/tenant/timeline/compaction.rs
+++ b/pageserver/src/tenant/timeline/compaction.rs
@@ -29,6 +29,7 @@ use utils::id::TimelineId;
 
 use crate::context::{AccessStatsBehavior, RequestContext, RequestContextBuilder};
 use crate::page_cache;
+use crate::tenant::checks::check_valid_layermap;
 use crate::tenant::remote_timeline_client::WaitCompletionError;
 use crate::tenant::storage_layer::merge_iterator::MergeIterator;
 use crate::tenant::storage_layer::split_writer::{
@@ -1788,20 +1789,12 @@ impl Timeline {
                 stat.visit_image_layer(desc.file_size());
             }
         }
-        for layer in &layer_selection {
-            let desc = layer.layer_desc();
-            let key_range = &desc.key_range;
-            if desc.is_delta() && key_range.start.next() != key_range.end {
-                let lsn_range = desc.lsn_range.clone();
-                let intersects = lsn_split_point.range(lsn_range).collect_vec();
-                if intersects.len() > 1 {
-                    bail!(
-                        "cannot run gc-compaction because it violates the layer map LSN split assumption: layer {} intersects with LSN [{}]",
-                        desc.key(),
-                        intersects.into_iter().map(|lsn| lsn.to_string()).join(", ")
-                    );
-                }
-            }
+        let layer_names: Vec<crate::tenant::storage_layer::LayerName> = layer_selection
+            .iter()
+            .map(|layer| layer.layer_desc().layer_name())
+            .collect_vec();
+        if let Some(err) = check_valid_layermap(&layer_names) {
+            bail!("cannot run gc-compaction because {}", err);
         }
         // The maximum LSN we are processing in this compaction loop
         let end_lsn = layer_selection
diff --git a/storage_scrubber/src/checks.rs b/storage_scrubber/src/checks.rs
index 15dfb101b5..de6918b3da 100644
--- a/storage_scrubber/src/checks.rs
+++ b/storage_scrubber/src/checks.rs
@@ -1,7 +1,8 @@
-use std::collections::{BTreeSet, HashMap, HashSet};
+use std::collections::{HashMap, HashSet};
 
 use anyhow::Context;
 use itertools::Itertools;
+use pageserver::tenant::checks::check_valid_layermap;
 use pageserver::tenant::layer_map::LayerMap;
 use pageserver::tenant::remote_timeline_client::index::LayerFileMetadata;
 use pageserver_api::shard::ShardIndex;
@@ -48,56 +49,6 @@ impl TimelineAnalysis {
     }
 }
 
-/// Checks whether a layer map is valid (i.e., is a valid result of the current compaction algorithm if nothing goes wrong).
-/// The function checks if we can split the LSN range of a delta layer only at the LSNs of the delta layers. For example,
-///
-/// ```plain
-/// |       |                 |       |
-/// |   1   |    |   2   |    |   3   |
-/// |       |    |       |    |       |
-/// ```
-///
-/// This is not a valid layer map because the LSN range of layer 1 intersects with the LSN range of layer 2. 1 and 2 should have
-/// the same LSN range.
-///
-/// The exception is that when layer 2 only contains a single key, it could be split over the LSN range. For example,
-///
-/// ```plain
-/// |       |    |   2   |    |       |
-/// |   1   |    |-------|    |   3   |
-/// |       |    |   4   |    |       |
-///
-/// If layer 2 and 4 contain the same single key, this is also a valid layer map.
-fn check_valid_layermap(metadata: &HashMap<LayerName, LayerFileMetadata>) -> Option<String> {
-    let mut lsn_split_point = BTreeSet::new(); // TODO: use a better data structure (range tree / range set?)
-    let mut all_delta_layers = Vec::new();
-    for (name, _) in metadata.iter() {
-        if let LayerName::Delta(layer) = name {
-            if layer.key_range.start.next() != layer.key_range.end {
-                all_delta_layers.push(layer.clone());
-            }
-        }
-    }
-    for layer in &all_delta_layers {
-        let lsn_range = &layer.lsn_range;
-        lsn_split_point.insert(lsn_range.start);
-        lsn_split_point.insert(lsn_range.end);
-    }
-    for layer in &all_delta_layers {
-        let lsn_range = layer.lsn_range.clone();
-        let intersects = lsn_split_point.range(lsn_range).collect_vec();
-        if intersects.len() > 1 {
-            let err = format!(
-                        "layer violates the layer map LSN split assumption: layer {} intersects with LSN [{}]",
-                        layer,
-                        intersects.into_iter().map(|lsn| lsn.to_string()).join(", ")
-                    );
-            return Some(err);
-        }
-    }
-    None
-}
-
 pub(crate) async fn branch_cleanup_and_check_errors(
     remote_client: &GenericRemoteStorage,
     id: &TenantShardTimelineId,
@@ -177,7 +128,8 @@ pub(crate) async fn branch_cleanup_and_check_errors(
                         }
                     }
 
-                    if let Some(err) = check_valid_layermap(&index_part.layer_metadata) {
+                    let layer_names = index_part.layer_metadata.keys().cloned().collect_vec();
+                    if let Some(err) = check_valid_layermap(&layer_names) {
                         result.errors.push(format!(
                             "index_part.json contains invalid layer map structure: {err}"
                         ));

From f2c08195f068ec4445347641cdde67eb170e60cc Mon Sep 17 00:00:00 2001
From: Arseny Sher <sher-ars@yandex.ru>
Date: Tue, 17 Sep 2024 11:32:49 +0300
Subject: [PATCH 098/142] Bump vendor/postgres.

Includes PRs:
- ERROR out instead of segfaulting when walsender slots are full.
- logical worker: respond to publisher even under dense stream.
---
 vendor/postgres-v14   | 2 +-
 vendor/postgres-v15   | 2 +-
 vendor/postgres-v16   | 2 +-
 vendor/postgres-v17   | 2 +-
 vendor/revisions.json | 8 ++++----
 5 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/vendor/postgres-v14 b/vendor/postgres-v14
index a317b9b5b9..87cb68f899 160000
--- a/vendor/postgres-v14
+++ b/vendor/postgres-v14
@@ -1 +1 @@
-Subproject commit a317b9b5b96978b49e78986697f3dd80d06f99a7
+Subproject commit 87cb68f899db434cd6f1908cf0ac8fdeafdd88c1
diff --git a/vendor/postgres-v15 b/vendor/postgres-v15
index 6f6d77fb59..72b904c0b3 160000
--- a/vendor/postgres-v15
+++ b/vendor/postgres-v15
@@ -1 +1 @@
-Subproject commit 6f6d77fb5960602fcd3fd130aca9f99ecb1619c9
+Subproject commit 72b904c0b3ac43bd74d1e8e6d772e2c476ae25b1
diff --git a/vendor/postgres-v16 b/vendor/postgres-v16
index 0baa7346df..3ec6e2496f 160000
--- a/vendor/postgres-v16
+++ b/vendor/postgres-v16
@@ -1 +1 @@
-Subproject commit 0baa7346dfd42d61912eeca554c9bb0a190f0a1e
+Subproject commit 3ec6e2496f64c6fec35c67cb82efd6490a6a4738
diff --git a/vendor/postgres-v17 b/vendor/postgres-v17
index 9156d63ce2..5bbb9bd93d 160000
--- a/vendor/postgres-v17
+++ b/vendor/postgres-v17
@@ -1 +1 @@
-Subproject commit 9156d63ce253bed9d1f76355ceec610e444eaffa
+Subproject commit 5bbb9bd93dd805e90bd8af15d00080363d18ec68
diff --git a/vendor/revisions.json b/vendor/revisions.json
index c2c34962bb..6289a53670 100644
--- a/vendor/revisions.json
+++ b/vendor/revisions.json
@@ -1,18 +1,18 @@
 {
   "v17": [
     "17rc1",
-    "9156d63ce253bed9d1f76355ceec610e444eaffa"
+    "5bbb9bd93dd805e90bd8af15d00080363d18ec68"
   ],
   "v16": [
     "16.4",
-    "0baa7346dfd42d61912eeca554c9bb0a190f0a1e"
+    "3ec6e2496f64c6fec35c67cb82efd6490a6a4738"
   ],
   "v15": [
     "15.8",
-    "6f6d77fb5960602fcd3fd130aca9f99ecb1619c9"
+    "72b904c0b3ac43bd74d1e8e6d772e2c476ae25b1"
   ],
   "v14": [
     "14.13",
-    "a317b9b5b96978b49e78986697f3dd80d06f99a7"
+    "87cb68f899db434cd6f1908cf0ac8fdeafdd88c1"
   ]
 }

From 3104f0f250e99dea03817c8ee6fd4022844db6ea Mon Sep 17 00:00:00 2001
From: Alexander Bayandin <alexander@neon.tech>
Date: Fri, 20 Sep 2024 12:00:05 +0100
Subject: [PATCH 099/142] Safekeeper: fix OpenAPI spec (#9066)

## Problem

Safekeeper's OpenAPI spec is incorrect:

```
Semantic error at paths./v1/tenant/{tenant_id}/timeline/{timeline_id}.get.responses.404.content.application/json.schema.$ref
$refs must reference a valid location in the document
Jump to line 126
```
Checked on https://editor.swagger.io

## Summary of changes
- Add `NotFoundError`
- Add `description` and `license` fields to make Cloud OpenAPI spec
linter happy
---
 safekeeper/src/http/openapi_spec.yaml | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/safekeeper/src/http/openapi_spec.yaml b/safekeeper/src/http/openapi_spec.yaml
index 70999853c2..3f14075345 100644
--- a/safekeeper/src/http/openapi_spec.yaml
+++ b/safekeeper/src/http/openapi_spec.yaml
@@ -1,7 +1,11 @@
 openapi: "3.0.2"
 info:
   title: Safekeeper control API
+  description: Neon Safekeeper API
   version: "1.0"
+  license:
+    name: "Apache"
+    url: https://github.com/neondatabase/neon/blob/main/LICENSE
 
 
 servers:
@@ -386,6 +390,12 @@ components:
         msg:
           type: string
 
+    NotFoundError:
+      type: object
+      properties:
+        msg:
+          type: string
+
   responses:
 
     #

From c45b56e0bb34b70c770eb23b6d13156a0d4f9913 Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Fri, 20 Sep 2024 15:55:50 +0200
Subject: [PATCH 100/142] pageserver: add counters for started smgr/getpage
 requests (#9069)

After this PR

```
curl localhost:9898/metrics | grep smgr_ | grep start
```

```
pageserver_smgr_query_started_count{shard_id="0000",smgr_query_type="get_page_at_lsn",tenant_id="...",timeline_id="..."} 0
pageserver_smgr_query_started_global_count{smgr_query_type="get_db_size"} 0
pageserver_smgr_query_started_global_count{smgr_query_type="get_page_at_lsn"} 0
pageserver_smgr_query_started_global_count{smgr_query_type="get_rel_exists"} 0
pageserver_smgr_query_started_global_count{smgr_query_type="get_rel_size"} 0
pageserver_smgr_query_started_global_count{smgr_query_type="get_slru_segment"} 0
```

We instantiate the per-tenant counter only for `get_page_at_lsn`.
---
 pageserver/src/metrics.rs       | 102 +++++++++++++++++++++++++-------
 test_runner/fixtures/metrics.py |   2 +
 2 files changed, 81 insertions(+), 23 deletions(-)

diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs
index 72229d80be..abd814f928 100644
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -1177,10 +1177,10 @@ pub(crate) mod virtual_file_io_engine {
 }
 
 struct GlobalAndPerTimelineHistogramTimer<'a, 'c> {
-    global_metric: &'a Histogram,
+    global_latency_histo: &'a Histogram,
 
     // Optional because not all op types are tracked per-timeline
-    timeline_metric: Option<&'a Histogram>,
+    per_timeline_latency_histo: Option<&'a Histogram>,
 
     ctx: &'c RequestContext,
     start: std::time::Instant,
@@ -1212,9 +1212,10 @@ impl<'a, 'c> Drop for GlobalAndPerTimelineHistogramTimer<'a, 'c> {
                 elapsed
             }
         };
-        self.global_metric.observe(ex_throttled.as_secs_f64());
-        if let Some(timeline_metric) = self.timeline_metric {
-            timeline_metric.observe(ex_throttled.as_secs_f64());
+        self.global_latency_histo
+            .observe(ex_throttled.as_secs_f64());
+        if let Some(per_timeline_getpage_histo) = self.per_timeline_latency_histo {
+            per_timeline_getpage_histo.observe(ex_throttled.as_secs_f64());
         }
     }
 }
@@ -1240,10 +1241,32 @@ pub enum SmgrQueryType {
 
 #[derive(Debug)]
 pub(crate) struct SmgrQueryTimePerTimeline {
-    global_metrics: [Histogram; SmgrQueryType::COUNT],
-    per_timeline_getpage: Histogram,
+    global_started: [IntCounter; SmgrQueryType::COUNT],
+    global_latency: [Histogram; SmgrQueryType::COUNT],
+    per_timeline_getpage_started: IntCounter,
+    per_timeline_getpage_latency: Histogram,
 }
 
+static SMGR_QUERY_STARTED_GLOBAL: Lazy<IntCounterVec> = Lazy::new(|| {
+    register_int_counter_vec!(
+        // it's a counter, but the name is prepared for extending it to a histogram of queue depth
+        "pageserver_smgr_query_started_global_count",
+        "Number of smgr queries started, aggregated by query type.",
+        &["smgr_query_type"],
+    )
+    .expect("failed to define a metric")
+});
+
+static SMGR_QUERY_STARTED_PER_TENANT_TIMELINE: Lazy<IntCounterVec> = Lazy::new(|| {
+    register_int_counter_vec!(
+        // it's a counter, but the name is prepared for extending it to a histogram of queue depth
+        "pageserver_smgr_query_started_count",
+        "Number of smgr queries started, aggregated by query type and tenant/timeline.",
+        &["smgr_query_type", "tenant_id", "shard_id", "timeline_id"],
+    )
+    .expect("failed to define a metric")
+});
+
 static SMGR_QUERY_TIME_PER_TENANT_TIMELINE: Lazy<HistogramVec> = Lazy::new(|| {
     register_histogram_vec!(
         "pageserver_smgr_query_seconds",
@@ -1319,14 +1342,20 @@ impl SmgrQueryTimePerTimeline {
         let tenant_id = tenant_shard_id.tenant_id.to_string();
         let shard_slug = format!("{}", tenant_shard_id.shard_slug());
         let timeline_id = timeline_id.to_string();
-        let global_metrics = std::array::from_fn(|i| {
+        let global_started = std::array::from_fn(|i| {
+            let op = SmgrQueryType::from_repr(i).unwrap();
+            SMGR_QUERY_STARTED_GLOBAL
+                .get_metric_with_label_values(&[op.into()])
+                .unwrap()
+        });
+        let global_latency = std::array::from_fn(|i| {
             let op = SmgrQueryType::from_repr(i).unwrap();
             SMGR_QUERY_TIME_GLOBAL
                 .get_metric_with_label_values(&[op.into()])
                 .unwrap()
         });
 
-        let per_timeline_getpage = SMGR_QUERY_TIME_PER_TENANT_TIMELINE
+        let per_timeline_getpage_started = SMGR_QUERY_STARTED_PER_TENANT_TIMELINE
             .get_metric_with_label_values(&[
                 SmgrQueryType::GetPageAtLsn.into(),
                 &tenant_id,
@@ -1334,9 +1363,20 @@ impl SmgrQueryTimePerTimeline {
                 &timeline_id,
             ])
             .unwrap();
+        let per_timeline_getpage_latency = SMGR_QUERY_TIME_PER_TENANT_TIMELINE
+            .get_metric_with_label_values(&[
+                SmgrQueryType::GetPageAtLsn.into(),
+                &tenant_id,
+                &shard_slug,
+                &timeline_id,
+            ])
+            .unwrap();
+
         Self {
-            global_metrics,
-            per_timeline_getpage,
+            global_started,
+            global_latency,
+            per_timeline_getpage_latency,
+            per_timeline_getpage_started,
         }
     }
     pub(crate) fn start_timer<'c: 'a, 'a>(
@@ -1344,8 +1384,11 @@ impl SmgrQueryTimePerTimeline {
         op: SmgrQueryType,
         ctx: &'c RequestContext,
     ) -> Option<impl Drop + '_> {
-        let global_metric = &self.global_metrics[op as usize];
         let start = Instant::now();
+
+        self.global_started[op as usize].inc();
+
+        // We subtract time spent throttled from the observed latency.
         match ctx.micros_spent_throttled.open() {
             Ok(()) => (),
             Err(error) => {
@@ -1364,15 +1407,16 @@ impl SmgrQueryTimePerTimeline {
             }
         }
 
-        let timeline_metric = if matches!(op, SmgrQueryType::GetPageAtLsn) {
-            Some(&self.per_timeline_getpage)
+        let per_timeline_latency_histo = if matches!(op, SmgrQueryType::GetPageAtLsn) {
+            self.per_timeline_getpage_started.inc();
+            Some(&self.per_timeline_getpage_latency)
         } else {
             None
         };
 
         Some(GlobalAndPerTimelineHistogramTimer {
-            global_metric,
-            timeline_metric,
+            global_latency_histo: &self.global_latency[op as usize],
+            per_timeline_latency_histo,
             ctx,
             start,
             op,
@@ -1423,9 +1467,12 @@ mod smgr_query_time_tests {
             let get_counts = || {
                 let global: u64 = ops
                     .iter()
-                    .map(|op| metrics.global_metrics[*op as usize].get_sample_count())
+                    .map(|op| metrics.global_latency[*op as usize].get_sample_count())
                     .sum();
-                (global, metrics.per_timeline_getpage.get_sample_count())
+                (
+                    global,
+                    metrics.per_timeline_getpage_latency.get_sample_count(),
+                )
             };
 
             let (pre_global, pre_per_tenant_timeline) = get_counts();
@@ -2576,6 +2623,12 @@ impl TimelineMetrics {
             let _ = STORAGE_IO_SIZE.remove_label_values(&[op, tenant_id, shard_id, timeline_id]);
         }
 
+        let _ = SMGR_QUERY_STARTED_PER_TENANT_TIMELINE.remove_label_values(&[
+            SmgrQueryType::GetPageAtLsn.into(),
+            tenant_id,
+            shard_id,
+            timeline_id,
+        ]);
         let _ = SMGR_QUERY_TIME_PER_TENANT_TIMELINE.remove_label_values(&[
             SmgrQueryType::GetPageAtLsn.into(),
             tenant_id,
@@ -3227,11 +3280,14 @@ pub fn preinitialize_metrics() {
     }
 
     // countervecs
-    [&BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT]
-        .into_iter()
-        .for_each(|c| {
-            Lazy::force(c);
-        });
+    [
+        &BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT,
+        &SMGR_QUERY_STARTED_GLOBAL,
+    ]
+    .into_iter()
+    .for_each(|c| {
+        Lazy::force(c);
+    });
 
     // gauges
     WALRECEIVER_ACTIVE_MANAGERS.get();
diff --git a/test_runner/fixtures/metrics.py b/test_runner/fixtures/metrics.py
index cda70be8da..d2db40897e 100644
--- a/test_runner/fixtures/metrics.py
+++ b/test_runner/fixtures/metrics.py
@@ -132,6 +132,7 @@ PAGESERVER_GLOBAL_METRICS: Tuple[str, ...] = (
     *histogram("pageserver_wait_lsn_seconds"),
     *histogram("pageserver_remote_operation_seconds"),
     *histogram("pageserver_io_operations_seconds"),
+    "pageserver_smgr_query_started_global_count_total",
     "pageserver_tenant_states_count",
     "pageserver_circuit_breaker_broken_total",
     "pageserver_circuit_breaker_unbroken_total",
@@ -146,6 +147,7 @@ PAGESERVER_PER_TENANT_METRICS: Tuple[str, ...] = (
     "pageserver_smgr_query_seconds_bucket",
     "pageserver_smgr_query_seconds_count",
     "pageserver_smgr_query_seconds_sum",
+    "pageserver_smgr_query_started_count_total",
     "pageserver_archive_size",
     "pageserver_pitr_history_size",
     "pageserver_layer_bytes",

From 797aa4ffaaacba20a1b27344d28d1c1a0c287e8c Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki@neon.tech>
Date: Fri, 20 Sep 2024 17:22:58 +0300
Subject: [PATCH 101/142] Skip running clippy in --release mode. (#9073)

It's pretty expensive to run, and there is very little difference
between debug and release builds that could lead to different clippy
warnings.

This is extracted from PR #8912. That PR wandered off into various
improvements we could make, but we seem to have consensus on this part
at least.
---
 .github/workflows/build_and_test.yml | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml
index a210c962cb..c1ec3f207b 100644
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -159,6 +159,10 @@ jobs:
       # This will catch compiler & clippy warnings in all feature combinations.
       # TODO: use cargo hack for build and test as well, but, that's quite expensive.
       # NB: keep clippy args in sync with ./run_clippy.sh
+      #
+      # The only difference between "clippy --debug" and "clippy --release" is that in --release mode,
+      # #[cfg(debug_assertions)] blocks are not built. It's not worth building everything a second
+      # time just for that, so skip "clippy --release".
       - run: |
           CLIPPY_COMMON_ARGS="$( source .neon_clippy_args; echo "$CLIPPY_COMMON_ARGS")"
           if [ "$CLIPPY_COMMON_ARGS" = "" ]; then
@@ -168,8 +172,6 @@ jobs:
           echo "CLIPPY_COMMON_ARGS=${CLIPPY_COMMON_ARGS}" >> $GITHUB_ENV
       - name: Run cargo clippy (debug)
         run: cargo hack --feature-powerset clippy $CLIPPY_COMMON_ARGS
-      - name: Run cargo clippy (release)
-        run: cargo hack --feature-powerset clippy --release $CLIPPY_COMMON_ARGS
 
       - name: Check documentation generation
         run: cargo doc --workspace --no-deps --document-private-items

From 6b9323027085c771ad71686b39f43f19f1a702f6 Mon Sep 17 00:00:00 2001
From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com>
Date: Fri, 20 Sep 2024 10:37:28 -0400
Subject: [PATCH 102/142] fix(pageserver): receive body error now 500 (#9052)

close https://github.com/neondatabase/neon/issues/8903

In https://github.com/neondatabase/neon/issues/8903 we observed that a JSON
decoding error produced the following error message in the log:

```
Error processing HTTP request: Resource temporarily unavailable: 3956 (pageserver-6.ap-southeast-1.aws.neon.tech) error receiving body: error decoding response body
```

This is hard to understand. In this patch, we make the error message
more reasonable.

## Summary of changes

* a receive body error caused by a decoding failure is now an internal
server error; the `reqwest::Error` is passed through as an `anyhow::Error`.
* instead of formatting the error using `to_string`, we use the
alternate `anyhow::Error` formatting, so that it prints out the cause
of the error (i.e., what exactly serde cannot decode).

I would expect to see something like `error receiving body: error
decoding response body: XXX field not found` after this patch, though I
didn't set up a testing environment to observe the exact behavior.

---------

Signed-off-by: Alex Chi Z <chi@neon.tech>
---
 libs/utils/src/http/error.rs      |  2 +-
 storage_controller/src/service.rs | 14 +++++++++++---
 2 files changed, 12 insertions(+), 4 deletions(-)

diff --git a/libs/utils/src/http/error.rs b/libs/utils/src/http/error.rs
index 3d863a6518..5e05e4e713 100644
--- a/libs/utils/src/http/error.rs
+++ b/libs/utils/src/http/error.rs
@@ -82,7 +82,7 @@ impl ApiError {
                 StatusCode::INTERNAL_SERVER_ERROR,
             ),
             ApiError::InternalServerError(err) => HttpErrorBody::response_from_msg_and_status(
-                err.to_string(),
+                format!("{err:#}"), // use alternative formatting so that we give the cause without backtrace
                 StatusCode::INTERNAL_SERVER_ERROR,
             ),
         }
diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs
index be3efaf688..957f633feb 100644
--- a/storage_controller/src/service.rs
+++ b/storage_controller/src/service.rs
@@ -3,6 +3,7 @@ use std::{
     borrow::Cow,
     cmp::Ordering,
     collections::{BTreeMap, HashMap, HashSet},
+    error::Error,
     ops::Deref,
     path::PathBuf,
     str::FromStr,
@@ -218,9 +219,16 @@ fn passthrough_api_error(node: &Node, e: mgmt_api::Error) -> ApiError {
                 format!("{node} error receiving error body: {str}").into(),
             )
         }
-        mgmt_api::Error::ReceiveBody(str) => {
-            // Presume errors receiving body are connectivity/availability issues
-            ApiError::ResourceUnavailable(format!("{node} error receiving body: {str}").into())
+        mgmt_api::Error::ReceiveBody(err) if err.is_decode() => {
+            // Return 500 for decoding errors.
+            ApiError::InternalServerError(anyhow::Error::from(err).context("error decoding body"))
+        }
+        mgmt_api::Error::ReceiveBody(err) => {
+            // Presume errors receiving body are connectivity/availability issues except for decoding errors
+            let src_str = err.source().map(|e| e.to_string()).unwrap_or_default();
+            ApiError::ResourceUnavailable(
+                format!("{node} error receiving error body: {err} {}", src_str).into(),
+            )
         }
         mgmt_api::Error::ApiError(StatusCode::NOT_FOUND, msg) => {
             ApiError::NotFound(anyhow::anyhow!(format!("{node}: {msg}")).into())

From e675a21346812b109eddc86c5729ec83e25f845d Mon Sep 17 00:00:00 2001
From: Conrad Ludgate <conrad@neon.tech>
Date: Fri, 20 Sep 2024 16:09:39 +0100
Subject: [PATCH 103/142] utils: leaky bucket should only report throttled if
 the notify queue is blocked on sleep (#9072)

## Problem

It seems that PS might be too eager in reporting throttled tasks.

## Summary of changes

Introduce a sleep counter. If the sleep counter increases, then the
acquire task was throttled.
---
 libs/utils/src/leaky_bucket.rs | 34 ++++++++++++++++++++++++++++------
 1 file changed, 28 insertions(+), 6 deletions(-)

diff --git a/libs/utils/src/leaky_bucket.rs b/libs/utils/src/leaky_bucket.rs
index a120dc0ac5..0cc58738c0 100644
--- a/libs/utils/src/leaky_bucket.rs
+++ b/libs/utils/src/leaky_bucket.rs
@@ -21,7 +21,13 @@
 //!
 //! Another explaination can be found here: <https://brandur.org/rate-limiting>
 
-use std::{sync::Mutex, time::Duration};
+use std::{
+    sync::{
+        atomic::{AtomicU64, Ordering},
+        Mutex,
+    },
+    time::Duration,
+};
 
 use tokio::{sync::Notify, time::Instant};
 
@@ -128,6 +134,7 @@ impl LeakyBucketState {
 
 pub struct RateLimiter {
     pub config: LeakyBucketConfig,
+    pub sleep_counter: AtomicU64,
     pub state: Mutex<LeakyBucketState>,
     /// a queue to provide this fair ordering.
     pub queue: Notify,
@@ -144,6 +151,7 @@ impl Drop for Requeue<'_> {
 impl RateLimiter {
     pub fn with_initial_tokens(config: LeakyBucketConfig, initial_tokens: f64) -> Self {
         RateLimiter {
+            sleep_counter: AtomicU64::new(0),
             state: Mutex::new(LeakyBucketState::with_initial_tokens(
                 &config,
                 initial_tokens,
@@ -163,15 +171,16 @@ impl RateLimiter {
 
     /// returns true if we did throttle
     pub async fn acquire(&self, count: usize) -> bool {
-        let mut throttled = false;
-
         let start = tokio::time::Instant::now();
 
+        let start_count = self.sleep_counter.load(Ordering::Acquire);
+        let mut end_count = start_count;
+
         // wait until we are the first in the queue
         let mut notified = std::pin::pin!(self.queue.notified());
         if !notified.as_mut().enable() {
-            throttled = true;
             notified.await;
+            end_count = self.sleep_counter.load(Ordering::Acquire);
         }
 
         // notify the next waiter in the queue when we are done.
@@ -184,9 +193,22 @@ impl RateLimiter {
                 .unwrap()
                 .add_tokens(&self.config, start, count as f64);
             match res {
-                Ok(()) => return throttled,
+                Ok(()) => return end_count > start_count,
                 Err(ready_at) => {
-                    throttled = true;
+                    struct Increment<'a>(&'a AtomicU64);
+
+                    impl Drop for Increment<'_> {
+                        fn drop(&mut self) {
+                            self.0.fetch_add(1, Ordering::AcqRel);
+                        }
+                    }
+
+                    // increment the counter after we finish sleeping (or cancel this task).
+                    // this ensures that tasks that have already started the acquire will observe
+                    // the new sleep count when they are allowed to resume on the notify.
+                    let _inc = Increment(&self.sleep_counter);
+                    end_count += 1;
+
                     tokio::time::sleep_until(ready_at).await;
                 }
             }

From 6014f15157b3a47f122b814b3cf109e9d1851abd Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Fri, 20 Sep 2024 17:07:09 +0100
Subject: [PATCH 104/142] pageserver: suppress noisy "layer became visible"
 logs (#9064)

## Problem

When layer visibility was added, an info log was included for the
situation where actual access to a layer disagrees with the visibility
calculation. This situation is safe, but I was interested in seeing when
it happens.

The log is pretty high volume, so this PR refines it to fire less often.

## Summary of changes

- For cases where accessing non-visible layers is normal, don't log at
all.
- Extend a unit test to increase confidence that the updates to
visibility on access are working as expected
- During compaction, only call the visibility calculation routine if
some image layers were created: previously, frequent calls resulted in
the visibility of layers getting reset every time we passed through
create_image_layers.
---
 pageserver/src/tenant/storage_layer/layer.rs  | 29 +++++++++++++++----
 .../src/tenant/storage_layer/layer/tests.rs   |  9 ++++++
 pageserver/src/tenant/timeline.rs             |  4 ++-
 3 files changed, 36 insertions(+), 6 deletions(-)

diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs
index b15cd4da39..f0e2ca5c83 100644
--- a/pageserver/src/tenant/storage_layer/layer.rs
+++ b/pageserver/src/tenant/storage_layer/layer.rs
@@ -439,11 +439,30 @@ impl Layer {
 
     fn record_access(&self, ctx: &RequestContext) {
         if self.0.access_stats.record_access(ctx) {
-            // Visibility was modified to Visible
-            tracing::info!(
-                "Layer {} became visible as a result of access",
-                self.0.desc.key()
-            );
+            // Visibility was modified to Visible: maybe log about this
+            match ctx.task_kind() {
+                TaskKind::CalculateSyntheticSize
+                | TaskKind::GarbageCollector
+                | TaskKind::MgmtRequest => {
+                    // This situation is expected in code paths that do binary searches of the LSN space to resolve
+                    // an LSN to a timestamp, which happens during GC, during GC cutoff calculations in synthetic size,
+                    // and on-demand for certain HTTP API requests.
+                }
+                _ => {
+                    // In all other contexts, it is unusual to do I/O involving layers which are not visible at
+                    // some branch tip, so we log the fact that we are accessing something that the visibility
+                    // calculation thought should not be visible.
+                    //
+                    // This case is legal in brief time windows: for example an in-flight getpage request can hold on to a layer object
+                    // which was covered by a concurrent compaction.
+                    tracing::info!(
+                        "Layer {} became visible as a result of access",
+                        self.0.desc.key()
+                    );
+                }
+            }
+
+            // Update the timeline's visible bytes count
             if let Some(tl) = self.0.timeline.upgrade() {
                 tl.metrics
                     .visible_physical_size_gauge
diff --git a/pageserver/src/tenant/storage_layer/layer/tests.rs b/pageserver/src/tenant/storage_layer/layer/tests.rs
index 0b9bde4f57..9de70f14ee 100644
--- a/pageserver/src/tenant/storage_layer/layer/tests.rs
+++ b/pageserver/src/tenant/storage_layer/layer/tests.rs
@@ -1025,6 +1025,15 @@ fn access_stats() {
     assert_eq!(access_stats.latest_activity(), lowres_time(atime));
     access_stats.set_visibility(LayerVisibilityHint::Visible);
     assert_eq!(access_stats.latest_activity(), lowres_time(atime));
+
+    // Recording access implicitly makes layer visible, if it wasn't already
+    let atime = UNIX_EPOCH + Duration::from_secs(2200000000);
+    access_stats.set_visibility(LayerVisibilityHint::Covered);
+    assert_eq!(access_stats.visibility(), LayerVisibilityHint::Covered);
+    assert!(access_stats.record_access_at(atime));
+    access_stats.set_visibility(LayerVisibilityHint::Visible);
+    assert!(!access_stats.record_access_at(atime));
+    access_stats.set_visibility(LayerVisibilityHint::Visible);
 }
 
 #[test]
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index f08f5caf95..114a6dd468 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -4316,7 +4316,9 @@ impl Timeline {
         timer.stop_and_record();
 
         // Creating image layers may have caused some previously visible layers to be covered
-        self.update_layer_visibility().await?;
+        if !image_layers.is_empty() {
+            self.update_layer_visibility().await?;
+        }
 
         Ok(image_layers)
     }

From ec5dce04ebfa51b727dfc9bc04ebb1e68aef6434 Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Fri, 20 Sep 2024 18:48:26 +0200
Subject: [PATCH 105/142] pageserver: throttling: per-tenant metrics + more
 metrics to help understand throttle queue depth (#9077)

---
 pageserver/src/metrics.rs         | 198 +++++++++++++++++++++++++-----
 pageserver/src/tenant.rs          |   4 +-
 pageserver/src/tenant/tasks.rs    |  68 +++++-----
 pageserver/src/tenant/throttle.rs |  34 +++--
 pageserver/src/tenant/timeline.rs |  10 +-
 test_runner/fixtures/metrics.py   |  13 ++
 6 files changed, 250 insertions(+), 77 deletions(-)

diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs
index abd814f928..078d12f934 100644
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -2645,6 +2645,8 @@ pub(crate) fn remove_tenant_metrics(tenant_shard_id: &TenantShardId) {
         let _ = TENANT_SYNTHETIC_SIZE_METRIC.remove_label_values(&[&tid]);
     }
 
+    tenant_throttling::remove_tenant_metrics(tenant_shard_id);
+
     // we leave the BROKEN_TENANTS_SET entry if any
 }
 
@@ -3108,41 +3110,180 @@ pub mod tokio_epoll_uring {
 pub(crate) mod tenant_throttling {
     use metrics::{register_int_counter_vec, IntCounter};
     use once_cell::sync::Lazy;
+    use utils::shard::TenantShardId;
 
     use crate::tenant::{self, throttle::Metric};
 
-    pub(crate) struct TimelineGet {
-        wait_time: IntCounter,
-        count: IntCounter,
+    struct GlobalAndPerTenantIntCounter {
+        global: IntCounter,
+        per_tenant: IntCounter,
     }
 
-    pub(crate) static TIMELINE_GET: Lazy<TimelineGet> = Lazy::new(|| {
-        static WAIT_USECS: Lazy<metrics::IntCounterVec> = Lazy::new(|| {
-            register_int_counter_vec!(
-            "pageserver_tenant_throttling_wait_usecs_sum_global",
-            "Sum of microseconds that tenants spent waiting for a tenant throttle of a given kind.",
+    impl GlobalAndPerTenantIntCounter {
+        #[inline(always)]
+        pub(crate) fn inc(&self) {
+            self.inc_by(1)
+        }
+        #[inline(always)]
+        pub(crate) fn inc_by(&self, n: u64) {
+            self.global.inc_by(n);
+            self.per_tenant.inc_by(n);
+        }
+    }
+
+    pub(crate) struct TimelineGet {
+        count_accounted_start: GlobalAndPerTenantIntCounter,
+        count_accounted_finish: GlobalAndPerTenantIntCounter,
+        wait_time: GlobalAndPerTenantIntCounter,
+        count_throttled: GlobalAndPerTenantIntCounter,
+    }
+
+    static COUNT_ACCOUNTED_START: Lazy<metrics::IntCounterVec> = Lazy::new(|| {
+        register_int_counter_vec!(
+            "pageserver_tenant_throttling_count_accounted_start_global",
+            "Count of tenant throttling starts, by kind of throttle.",
             &["kind"]
         )
-            .unwrap()
-        });
-
-        static WAIT_COUNT: Lazy<metrics::IntCounterVec> = Lazy::new(|| {
-            register_int_counter_vec!(
-                "pageserver_tenant_throttling_count_global",
-                "Count of tenant throttlings, by kind of throttle.",
-                &["kind"]
-            )
-            .unwrap()
-        });
-
-        let kind = "timeline_get";
-        TimelineGet {
-            wait_time: WAIT_USECS.with_label_values(&[kind]),
-            count: WAIT_COUNT.with_label_values(&[kind]),
-        }
+        .unwrap()
+    });
+    static COUNT_ACCOUNTED_START_PER_TENANT: Lazy<metrics::IntCounterVec> = Lazy::new(|| {
+        register_int_counter_vec!(
+            "pageserver_tenant_throttling_count_accounted_start",
+            "Count of tenant throttling starts, by kind of throttle.",
+            &["kind", "tenant_id", "shard_id"]
+        )
+        .unwrap()
+    });
+    static COUNT_ACCOUNTED_FINISH: Lazy<metrics::IntCounterVec> = Lazy::new(|| {
+        register_int_counter_vec!(
+            "pageserver_tenant_throttling_count_accounted_finish_global",
+            "Count of tenant throttling finishes, by kind of throttle.",
+            &["kind"]
+        )
+        .unwrap()
+    });
+    static COUNT_ACCOUNTED_FINISH_PER_TENANT: Lazy<metrics::IntCounterVec> = Lazy::new(|| {
+        register_int_counter_vec!(
+            "pageserver_tenant_throttling_count_accounted_finish",
+            "Count of tenant throttling finishes, by kind of throttle.",
+            &["kind", "tenant_id", "shard_id"]
+        )
+        .unwrap()
+    });
+    static WAIT_USECS: Lazy<metrics::IntCounterVec> = Lazy::new(|| {
+        register_int_counter_vec!(
+            "pageserver_tenant_throttling_wait_usecs_sum_global",
+            "Sum of microseconds that spent waiting throttle by kind of throttle.",
+            &["kind"]
+        )
+        .unwrap()
+    });
+    static WAIT_USECS_PER_TENANT: Lazy<metrics::IntCounterVec> = Lazy::new(|| {
+        register_int_counter_vec!(
+            "pageserver_tenant_throttling_wait_usecs_sum",
+            "Sum of microseconds that spent waiting throttle by kind of throttle.",
+            &["kind", "tenant_id", "shard_id"]
+        )
+        .unwrap()
     });
 
-    impl Metric for &'static TimelineGet {
+    static WAIT_COUNT: Lazy<metrics::IntCounterVec> = Lazy::new(|| {
+        register_int_counter_vec!(
+            "pageserver_tenant_throttling_count_global",
+            "Count of tenant throttlings, by kind of throttle.",
+            &["kind"]
+        )
+        .unwrap()
+    });
+    static WAIT_COUNT_PER_TENANT: Lazy<metrics::IntCounterVec> = Lazy::new(|| {
+        register_int_counter_vec!(
+            "pageserver_tenant_throttling_count",
+            "Count of tenant throttlings, by kind of throttle.",
+            &["kind", "tenant_id", "shard_id"]
+        )
+        .unwrap()
+    });
+
+    const KIND: &str = "timeline_get";
+
+    impl TimelineGet {
+        pub(crate) fn new(tenant_shard_id: &TenantShardId) -> Self {
+            TimelineGet {
+                count_accounted_start: {
+                    GlobalAndPerTenantIntCounter {
+                        global: COUNT_ACCOUNTED_START.with_label_values(&[KIND]),
+                        per_tenant: COUNT_ACCOUNTED_START_PER_TENANT.with_label_values(&[
+                            KIND,
+                            &tenant_shard_id.tenant_id.to_string(),
+                            &tenant_shard_id.shard_slug().to_string(),
+                        ]),
+                    }
+                },
+                count_accounted_finish: {
+                    GlobalAndPerTenantIntCounter {
+                        global: COUNT_ACCOUNTED_FINISH.with_label_values(&[KIND]),
+                        per_tenant: COUNT_ACCOUNTED_FINISH_PER_TENANT.with_label_values(&[
+                            KIND,
+                            &tenant_shard_id.tenant_id.to_string(),
+                            &tenant_shard_id.shard_slug().to_string(),
+                        ]),
+                    }
+                },
+                wait_time: {
+                    GlobalAndPerTenantIntCounter {
+                        global: WAIT_USECS.with_label_values(&[KIND]),
+                        per_tenant: WAIT_USECS_PER_TENANT.with_label_values(&[
+                            KIND,
+                            &tenant_shard_id.tenant_id.to_string(),
+                            &tenant_shard_id.shard_slug().to_string(),
+                        ]),
+                    }
+                },
+                count_throttled: {
+                    GlobalAndPerTenantIntCounter {
+                        global: WAIT_COUNT.with_label_values(&[KIND]),
+                        per_tenant: WAIT_COUNT_PER_TENANT.with_label_values(&[
+                            KIND,
+                            &tenant_shard_id.tenant_id.to_string(),
+                            &tenant_shard_id.shard_slug().to_string(),
+                        ]),
+                    }
+                },
+            }
+        }
+    }
+
+    pub(crate) fn preinitialize_global_metrics() {
+        Lazy::force(&COUNT_ACCOUNTED_START);
+        Lazy::force(&COUNT_ACCOUNTED_FINISH);
+        Lazy::force(&WAIT_USECS);
+        Lazy::force(&WAIT_COUNT);
+    }
+
+    pub(crate) fn remove_tenant_metrics(tenant_shard_id: &TenantShardId) {
+        for m in &[
+            &COUNT_ACCOUNTED_START_PER_TENANT,
+            &COUNT_ACCOUNTED_FINISH_PER_TENANT,
+            &WAIT_USECS_PER_TENANT,
+            &WAIT_COUNT_PER_TENANT,
+        ] {
+            let _ = m.remove_label_values(&[
+                KIND,
+                &tenant_shard_id.tenant_id.to_string(),
+                &tenant_shard_id.shard_slug().to_string(),
+            ]);
+        }
+    }
+
+    impl Metric for TimelineGet {
+        #[inline(always)]
+        fn accounting_start(&self) {
+            self.count_accounted_start.inc();
+        }
+        #[inline(always)]
+        fn accounting_finish(&self) {
+            self.count_accounted_finish.inc();
+        }
         #[inline(always)]
         fn observe_throttling(
             &self,
@@ -3150,7 +3291,7 @@ pub(crate) mod tenant_throttling {
         ) {
             let val = u64::try_from(wait_time.as_micros()).unwrap();
             self.wait_time.inc_by(val);
-            self.count.inc();
+            self.count_throttled.inc();
         }
     }
 }
@@ -3309,7 +3450,8 @@ pub fn preinitialize_metrics() {
 
     // Custom
     Lazy::force(&RECONSTRUCT_TIME);
-    Lazy::force(&tenant_throttling::TIMELINE_GET);
     Lazy::force(&BASEBACKUP_QUERY_TIME);
     Lazy::force(&COMPUTE_COMMANDS_COUNTERS);
+
+    tenant_throttling::preinitialize_global_metrics();
 }
diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index d699d56075..e328cd2044 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -302,7 +302,7 @@ pub struct Tenant {
     /// Throttle applied at the top of [`Timeline::get`].
     /// All [`Tenant::timelines`] of a given [`Tenant`] instance share the same [`throttle::Throttle`] instance.
     pub(crate) timeline_get_throttle:
-        Arc<throttle::Throttle<&'static crate::metrics::tenant_throttling::TimelineGet>>,
+        Arc<throttle::Throttle<crate::metrics::tenant_throttling::TimelineGet>>,
 
     /// An ongoing timeline detach concurrency limiter.
     ///
@@ -2831,7 +2831,7 @@ impl Tenant {
             gate: Gate::default(),
             timeline_get_throttle: Arc::new(throttle::Throttle::new(
                 Tenant::get_timeline_get_throttle_config(conf, &attached_conf.tenant_conf),
-                &crate::metrics::tenant_throttling::TIMELINE_GET,
+                crate::metrics::tenant_throttling::TimelineGet::new(&tenant_shard_id),
             )),
             tenant_conf: Arc::new(ArcSwap::from_pointee(attached_conf)),
             ongoing_timeline_detach: std::sync::Mutex::default(),
diff --git a/pageserver/src/tenant/tasks.rs b/pageserver/src/tenant/tasks.rs
index 57f0123d8f..341febb30a 100644
--- a/pageserver/src/tenant/tasks.rs
+++ b/pageserver/src/tenant/tasks.rs
@@ -163,8 +163,6 @@ async fn compaction_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
     // How many errors we have seen consequtively
     let mut error_run_count = 0;
 
-    let mut last_throttle_flag_reset_at = Instant::now();
-
     TENANT_TASK_EVENTS.with_label_values(&["start"]).inc();
     async {
         let ctx = RequestContext::todo_child(TaskKind::Compaction, DownloadBehavior::Download);
@@ -191,8 +189,6 @@ async fn compaction_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
                 }
             }
 
-
-
             let sleep_duration;
             if period == Duration::ZERO {
                 #[cfg(not(feature = "testing"))]
@@ -207,12 +203,18 @@ async fn compaction_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
                 };
 
                 // Run compaction
-                let IterationResult { output, elapsed } = iteration.run(tenant.compaction_iteration(&cancel, &ctx)).await;
+                let IterationResult { output, elapsed } = iteration
+                    .run(tenant.compaction_iteration(&cancel, &ctx))
+                    .await;
                 match output {
                     Ok(has_pending_task) => {
                         error_run_count = 0;
                         // schedule the next compaction immediately in case there is a pending compaction task
-                        sleep_duration = if has_pending_task { Duration::ZERO } else { period };
+                        sleep_duration = if has_pending_task {
+                            Duration::ZERO
+                        } else {
+                            period
+                        };
                     }
                     Err(e) => {
                         let wait_duration = backoff::exponential_backoff_duration_seconds(
@@ -233,38 +235,20 @@ async fn compaction_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
                 }
 
                 // the duration is recorded by performance tests by enabling debug in this function
-                tracing::debug!(elapsed_ms=elapsed.as_millis(), "compaction iteration complete");
+                tracing::debug!(
+                    elapsed_ms = elapsed.as_millis(),
+                    "compaction iteration complete"
+                );
             };
 
-
             // Perhaps we did no work and the walredo process has been idle for some time:
             // give it a chance to shut down to avoid leaving walredo process running indefinitely.
+            // TODO: move this to a separate task (housekeeping loop) that isn't affected by the back-off,
+            // so we get some upper bound guarantee on when walredo quiesce / this throttling reporting here happens.
             if let Some(walredo_mgr) = &tenant.walredo_mgr {
                 walredo_mgr.maybe_quiesce(period * 10);
             }
 
-            // TODO: move this (and walredo quiesce) to a separate task that isn't affected by the back-off,
-            // so we get some upper bound guarantee on when walredo quiesce / this throttling reporting here happens.
-            info_span!(parent: None, "timeline_get_throttle", tenant_id=%tenant.tenant_shard_id, shard_id=%tenant.tenant_shard_id.shard_slug()).in_scope(|| {
-                let now = Instant::now();
-                let prev = std::mem::replace(&mut last_throttle_flag_reset_at, now);
-                let Stats { count_accounted, count_throttled, sum_throttled_usecs } = tenant.timeline_get_throttle.reset_stats();
-                if count_throttled == 0 {
-                    return;
-                }
-                let allowed_rps = tenant.timeline_get_throttle.steady_rps();
-                let delta = now - prev;
-                info!(
-                    n_seconds=%format_args!("{:.3}",
-                    delta.as_secs_f64()),
-                    count_accounted,
-                    count_throttled,
-                    sum_throttled_usecs,
-                    allowed_rps=%format_args!("{allowed_rps:.0}"),
-                    "shard was throttled in the last n_seconds"
-                );
-            });
-
             // Sleep
             if tokio::time::timeout(sleep_duration, cancel.cancelled())
                 .await
@@ -437,6 +421,7 @@ async fn gc_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
 async fn ingest_housekeeping_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
     TENANT_TASK_EVENTS.with_label_values(&["start"]).inc();
     async {
+    let mut last_throttle_flag_reset_at = Instant::now();
         loop {
             tokio::select! {
                 _ = cancel.cancelled() => {
@@ -483,6 +468,29 @@ async fn ingest_housekeeping_loop(tenant: Arc<Tenant>, cancel: CancellationToken
                 kind: BackgroundLoopKind::IngestHouseKeeping,
             };
             iteration.run(tenant.ingest_housekeeping()).await;
+
+            // TODO: rename the background loop kind to something more generic, like, tenant housekeeping.
+            // Or just spawn another background loop for this throttle, it's not like it's super costly.
+            info_span!(parent: None, "timeline_get_throttle", tenant_id=%tenant.tenant_shard_id, shard_id=%tenant.tenant_shard_id.shard_slug()).in_scope(|| {
+                let now = Instant::now();
+                let prev = std::mem::replace(&mut last_throttle_flag_reset_at, now);
+                let Stats { count_accounted_start, count_accounted_finish, count_throttled, sum_throttled_usecs} = tenant.timeline_get_throttle.reset_stats();
+                if count_throttled == 0 {
+                    return;
+                }
+                let allowed_rps = tenant.timeline_get_throttle.steady_rps();
+                let delta = now - prev;
+                info!(
+                    n_seconds=%format_args!("{:.3}",
+                    delta.as_secs_f64()),
+                    count_accounted = count_accounted_finish,  // don't break existing log scraping
+                    count_throttled,
+                    sum_throttled_usecs,
+                    count_accounted_start, // log after pre-existing fields to not break existing log scraping
+                    allowed_rps=%format_args!("{allowed_rps:.0}"),
+                    "shard was throttled in the last n_seconds"
+                );
+            });
         }
     }
     .await;
diff --git a/pageserver/src/tenant/throttle.rs b/pageserver/src/tenant/throttle.rs
index f222e708e1..6a80953901 100644
--- a/pageserver/src/tenant/throttle.rs
+++ b/pageserver/src/tenant/throttle.rs
@@ -24,8 +24,10 @@ use crate::{context::RequestContext, task_mgr::TaskKind};
 pub struct Throttle<M: Metric> {
     inner: ArcSwap<Inner>,
     metric: M,
-    /// will be turned into [`Stats::count_accounted`]
-    count_accounted: AtomicU64,
+    /// will be turned into [`Stats::count_accounted_start`]
+    count_accounted_start: AtomicU64,
+    /// will be turned into [`Stats::count_accounted_finish`]
+    count_accounted_finish: AtomicU64,
     /// will be turned into [`Stats::count_throttled`]
     count_throttled: AtomicU64,
     /// will be turned into [`Stats::sum_throttled_usecs`]
@@ -43,17 +45,21 @@ pub struct Observation {
     pub wait_time: Duration,
 }
 pub trait Metric {
+    fn accounting_start(&self);
+    fn accounting_finish(&self);
     fn observe_throttling(&self, observation: &Observation);
 }
 
 /// See [`Throttle::reset_stats`].
 pub struct Stats {
-    // Number of requests that were subject to throttling, i.e., requests of the configured [`Config::task_kinds`].
-    pub count_accounted: u64,
-    // Subset of the `accounted` requests that were actually throttled.
-    // Note that the numbers are stored as two independent atomics, so, there might be a slight drift.
+    /// Number of requests that started [`Throttle::throttle`] calls.
+    pub count_accounted_start: u64,
+    /// Number of requests that finished [`Throttle::throttle`] calls.
+    pub count_accounted_finish: u64,
+    /// Subset of the `accounted` requests that were actually throttled.
+    /// Note that the numbers are stored as two independent atomics, so, there might be a slight drift.
     pub count_throttled: u64,
-    // Sum of microseconds that throttled requests spent waiting for throttling.
+    /// Sum of microseconds that throttled requests spent waiting for throttling.
     pub sum_throttled_usecs: u64,
 }
 
@@ -65,7 +71,8 @@ where
         Self {
             inner: ArcSwap::new(Arc::new(Self::new_inner(config))),
             metric,
-            count_accounted: AtomicU64::new(0),
+            count_accounted_start: AtomicU64::new(0),
+            count_accounted_finish: AtomicU64::new(0),
             count_throttled: AtomicU64::new(0),
             sum_throttled_usecs: AtomicU64::new(0),
         }
@@ -117,11 +124,13 @@ where
     /// This method allows retrieving & resetting that flag.
     /// Useful for periodic reporting.
     pub fn reset_stats(&self) -> Stats {
-        let count_accounted = self.count_accounted.swap(0, Ordering::Relaxed);
+        let count_accounted_start = self.count_accounted_start.swap(0, Ordering::Relaxed);
+        let count_accounted_finish = self.count_accounted_finish.swap(0, Ordering::Relaxed);
         let count_throttled = self.count_throttled.swap(0, Ordering::Relaxed);
         let sum_throttled_usecs = self.sum_throttled_usecs.swap(0, Ordering::Relaxed);
         Stats {
-            count_accounted,
+            count_accounted_start,
+            count_accounted_finish,
             count_throttled,
             sum_throttled_usecs,
         }
@@ -139,9 +148,12 @@ where
         };
         let start = std::time::Instant::now();
 
+        self.metric.accounting_start();
+        self.count_accounted_start.fetch_add(1, Ordering::Relaxed);
         let did_throttle = inner.rate_limiter.acquire(key_count).await;
+        self.count_accounted_finish.fetch_add(1, Ordering::Relaxed);
+        self.metric.accounting_finish();
 
-        self.count_accounted.fetch_add(1, Ordering::Relaxed);
         if did_throttle {
             self.count_throttled.fetch_add(1, Ordering::Relaxed);
             let now = Instant::now();
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index 114a6dd468..c98efd5f71 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -196,9 +196,8 @@ fn drop_wlock<T>(rlock: tokio::sync::RwLockWriteGuard<'_, T>) {
 /// The outward-facing resources required to build a Timeline
 pub struct TimelineResources {
     pub remote_client: RemoteTimelineClient,
-    pub timeline_get_throttle: Arc<
-        crate::tenant::throttle::Throttle<&'static crate::metrics::tenant_throttling::TimelineGet>,
-    >,
+    pub timeline_get_throttle:
+        Arc<crate::tenant::throttle::Throttle<crate::metrics::tenant_throttling::TimelineGet>>,
     pub l0_flush_global_state: l0_flush::L0FlushGlobalState,
 }
 
@@ -406,9 +405,8 @@ pub struct Timeline {
     gc_lock: tokio::sync::Mutex<()>,
 
     /// Cloned from [`super::Tenant::timeline_get_throttle`] on construction.
-    timeline_get_throttle: Arc<
-        crate::tenant::throttle::Throttle<&'static crate::metrics::tenant_throttling::TimelineGet>,
-    >,
+    timeline_get_throttle:
+        Arc<crate::tenant::throttle::Throttle<crate::metrics::tenant_throttling::TimelineGet>>,
 
     /// Keep aux directory cache to avoid it's reconstruction on each update
     pub(crate) aux_files: tokio::sync::Mutex<AuxFilesState>,
diff --git a/test_runner/fixtures/metrics.py b/test_runner/fixtures/metrics.py
index d2db40897e..005dc6cb0d 100644
--- a/test_runner/fixtures/metrics.py
+++ b/test_runner/fixtures/metrics.py
@@ -102,6 +102,11 @@ def histogram(prefix_without_trailing_underscore: str) -> List[str]:
     return [f"{prefix_without_trailing_underscore}_{x}" for x in ["bucket", "count", "sum"]]
 
 
+def counter(name: str) -> str:
+    # the prometheus_client package appends _total to all counters client-side
+    return f"{name}_total"
+
+
 PAGESERVER_PER_TENANT_REMOTE_TIMELINE_CLIENT_METRICS: Tuple[str, ...] = (
     "pageserver_remote_timeline_client_calls_started_total",
     "pageserver_remote_timeline_client_calls_finished_total",
@@ -136,6 +141,10 @@ PAGESERVER_GLOBAL_METRICS: Tuple[str, ...] = (
     "pageserver_tenant_states_count",
     "pageserver_circuit_breaker_broken_total",
     "pageserver_circuit_breaker_unbroken_total",
+    counter("pageserver_tenant_throttling_count_accounted_start_global"),
+    counter("pageserver_tenant_throttling_count_accounted_finish_global"),
+    counter("pageserver_tenant_throttling_wait_usecs_sum_global"),
+    counter("pageserver_tenant_throttling_count_global"),
 )
 
 PAGESERVER_PER_TENANT_METRICS: Tuple[str, ...] = (
@@ -159,6 +168,10 @@ PAGESERVER_PER_TENANT_METRICS: Tuple[str, ...] = (
     "pageserver_evictions_with_low_residence_duration_total",
     "pageserver_aux_file_estimated_size",
     "pageserver_valid_lsn_lease_count",
+    counter("pageserver_tenant_throttling_count_accounted_start"),
+    counter("pageserver_tenant_throttling_count_accounted_finish"),
+    counter("pageserver_tenant_throttling_wait_usecs_sum"),
+    counter("pageserver_tenant_throttling_count"),
     *PAGESERVER_PER_TENANT_REMOTE_TIMELINE_CLIENT_METRICS,
     # "pageserver_directory_entries_count", -- only used if above a certain threshold
     # "pageserver_broken_tenants_count" -- used only for broken

From f03f7b38680f68245f2613c5b033ef25e634b73b Mon Sep 17 00:00:00 2001
From: Anastasia Lubennikova <anastasia@neon.tech>
Date: Fri, 20 Sep 2024 18:24:40 +0100
Subject: [PATCH 106/142] Bump vendor/postgres to include extension path fix
 (#9076)

This is a prerequisite for
https://github.com/neondatabase/neon/pull/8681
---
 vendor/postgres-v14   | 2 +-
 vendor/postgres-v15   | 2 +-
 vendor/postgres-v16   | 2 +-
 vendor/postgres-v17   | 2 +-
 vendor/revisions.json | 8 ++++----
 5 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/vendor/postgres-v14 b/vendor/postgres-v14
index 87cb68f899..f9c51c1243 160000
--- a/vendor/postgres-v14
+++ b/vendor/postgres-v14
@@ -1 +1 @@
-Subproject commit 87cb68f899db434cd6f1908cf0ac8fdeafdd88c1
+Subproject commit f9c51c12438b20049b6905eb4e43d321defd6ff2
diff --git a/vendor/postgres-v15 b/vendor/postgres-v15
index 72b904c0b3..1dbd6f3164 160000
--- a/vendor/postgres-v15
+++ b/vendor/postgres-v15
@@ -1 +1 @@
-Subproject commit 72b904c0b3ac43bd74d1e8e6d772e2c476ae25b1
+Subproject commit 1dbd6f316416c8360bbd4f3d6db956cf70937cf0
diff --git a/vendor/postgres-v16 b/vendor/postgres-v16
index 3ec6e2496f..d009084a74 160000
--- a/vendor/postgres-v16
+++ b/vendor/postgres-v16
@@ -1 +1 @@
-Subproject commit 3ec6e2496f64c6fec35c67cb82efd6490a6a4738
+Subproject commit d009084a745cb4d5e6de222c778b2a562c8b2767
diff --git a/vendor/postgres-v17 b/vendor/postgres-v17
index 5bbb9bd93d..dadd6fe208 160000
--- a/vendor/postgres-v17
+++ b/vendor/postgres-v17
@@ -1 +1 @@
-Subproject commit 5bbb9bd93dd805e90bd8af15d00080363d18ec68
+Subproject commit dadd6fe208bb906cc0a48980f2ab4e13c47ba3ad
diff --git a/vendor/revisions.json b/vendor/revisions.json
index 6289a53670..c93393970f 100644
--- a/vendor/revisions.json
+++ b/vendor/revisions.json
@@ -1,18 +1,18 @@
 {
   "v17": [
     "17rc1",
-    "5bbb9bd93dd805e90bd8af15d00080363d18ec68"
+    "dadd6fe208bb906cc0a48980f2ab4e13c47ba3ad"
   ],
   "v16": [
     "16.4",
-    "3ec6e2496f64c6fec35c67cb82efd6490a6a4738"
+    "d009084a745cb4d5e6de222c778b2a562c8b2767"
   ],
   "v15": [
     "15.8",
-    "72b904c0b3ac43bd74d1e8e6d772e2c476ae25b1"
+    "1dbd6f316416c8360bbd4f3d6db956cf70937cf0"
   ],
   "v14": [
     "14.13",
-    "87cb68f899db434cd6f1908cf0ac8fdeafdd88c1"
+    "f9c51c12438b20049b6905eb4e43d321defd6ff2"
   ]
 }

From 9a32aa828d8f2b4ee5f84f81bb5cb3f6012bfeb5 Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki@neon.tech>
Date: Sat, 21 Sep 2024 04:00:38 +0300
Subject: [PATCH 107/142] Fix init of WAL page header at startup (#8914)

If the primary is started at an LSN within the first page of a 16 MB WAL
segment, the "long XLOG page header" at the beginning of the segment was
not initialized correctly. That has gone unnoticed, because under
normal circumstances, nothing looks at the page header. The WAL that is
streamed to the safekeepers starts at the new record's LSN, not at the
beginning of the page, so that bogus page header didn't propagate
elsewhere, and a primary server doesn't normally read the WAL it has
written. Which is good because the contents of the page would be bogus
anyway, as it wouldn't contain any of the records before the LSN where
the new record is written.

Except that in the following cases a primary does read its own WAL:

1. When there are two-phase transactions in prepared state at
checkpoint. The checkpointer reads the two-phase state from the
XLOG_XACT_PREPARE record, and writes it to a file in pg_twophase/.

2. Logical decoding reads the WAL starting from the replication slot's
restart LSN.

This PR fixes the problem with two-phase transactions. For that, it's
sufficient to initialize the page header correctly. The checkpointer
only needs to read XLOG_XACT_PREPARE records that were generated after
the server startup, so it's still OK that older WAL is missing / bogus.

I have not investigated if we have a problem with logical decoding,
however. Let's deal with that separately.

Special thanks to @Lzjing-1997, who independently found the same bug
and opened a PR to fix it, although I did not use that PR.
---
 test_runner/regress/test_twophase.py | 31 +++++++++++++++++++++++-----
 vendor/postgres-v14                  |  2 +-
 vendor/postgres-v15                  |  2 +-
 vendor/postgres-v16                  |  2 +-
 vendor/postgres-v17                  |  2 +-
 vendor/revisions.json                |  8 +++----
 6 files changed, 34 insertions(+), 13 deletions(-)

diff --git a/test_runner/regress/test_twophase.py b/test_runner/regress/test_twophase.py
index ebe65e7c29..75fab78d6e 100644
--- a/test_runner/regress/test_twophase.py
+++ b/test_runner/regress/test_twophase.py
@@ -8,6 +8,7 @@ from fixtures.neon_fixtures import (
     PgBin,
     fork_at_current_lsn,
     import_timeline_from_vanilla_postgres,
+    wait_for_wal_insert_lsn,
 )
 
 
@@ -22,11 +23,6 @@ def twophase_test_on_timeline(env: NeonEnv):
     conn = endpoint.connect()
     cur = conn.cursor()
 
-    # FIXME: Switch to the next WAL segment, to work around the bug fixed in
-    # https://github.com/neondatabase/neon/pull/8914.  When that is merged, this can be
-    # removed.
-    cur.execute("select pg_switch_wal()")
-
     cur.execute("CREATE TABLE foo (t text)")
 
     # Prepare a transaction that will insert a row
@@ -140,3 +136,28 @@ def test_twophase_nonzero_epoch(
     vanilla_pg.stop()  # don't need the original server anymore
 
     twophase_test_on_timeline(env)
+
+
+def test_twophase_at_wal_segment_start(neon_simple_env: NeonEnv):
+    """
+    Same as 'test_twophase' test, but the server is started at an LSN at the beginning
+    of a WAL segment. We had a bug where we didn't initialize the "long XLOG page header"
+    at the beginning of the segment correctly, which was detected when the checkpointer
+    tried to read the XLOG_XACT_PREPARE record from the WAL, if that record was on the
+    very first page of a WAL segment and the server was started up at that first page.
+    """
+    env = neon_simple_env
+    timeline_id = env.neon_cli.create_branch("test_twophase", "main")
+
+    endpoint = env.endpoints.create_start(
+        "test_twophase", config_lines=["max_prepared_transactions=5"]
+    )
+    endpoint.safe_psql("SELECT pg_switch_wal()")
+
+    # to avoid hitting https://github.com/neondatabase/neon/issues/9079, wait for the
+    # WAL to reach the pageserver.
+    wait_for_wal_insert_lsn(env, endpoint, env.initial_tenant, timeline_id)
+
+    endpoint.stop_and_destroy()
+
+    twophase_test_on_timeline(env)
diff --git a/vendor/postgres-v14 b/vendor/postgres-v14
index f9c51c1243..a38d15f323 160000
--- a/vendor/postgres-v14
+++ b/vendor/postgres-v14
@@ -1 +1 @@
-Subproject commit f9c51c12438b20049b6905eb4e43d321defd6ff2
+Subproject commit a38d15f3233a4c07f2bf3335fcbd874dd1f4e386
diff --git a/vendor/postgres-v15 b/vendor/postgres-v15
index 1dbd6f3164..16c3c6b64f 160000
--- a/vendor/postgres-v15
+++ b/vendor/postgres-v15
@@ -1 +1 @@
-Subproject commit 1dbd6f316416c8360bbd4f3d6db956cf70937cf0
+Subproject commit 16c3c6b64f1420a367a2a9b2510f20d94f791af8
diff --git a/vendor/postgres-v16 b/vendor/postgres-v16
index d009084a74..1d7081a3b0 160000
--- a/vendor/postgres-v16
+++ b/vendor/postgres-v16
@@ -1 +1 @@
-Subproject commit d009084a745cb4d5e6de222c778b2a562c8b2767
+Subproject commit 1d7081a3b076ddf5086e0b118d4329820e6a7427
diff --git a/vendor/postgres-v17 b/vendor/postgres-v17
index dadd6fe208..2cf120e739 160000
--- a/vendor/postgres-v17
+++ b/vendor/postgres-v17
@@ -1 +1 @@
-Subproject commit dadd6fe208bb906cc0a48980f2ab4e13c47ba3ad
+Subproject commit 2cf120e7393ca5f537c6a38b457585576dc035fc
diff --git a/vendor/revisions.json b/vendor/revisions.json
index c93393970f..9f6512d03e 100644
--- a/vendor/revisions.json
+++ b/vendor/revisions.json
@@ -1,18 +1,18 @@
 {
   "v17": [
     "17rc1",
-    "dadd6fe208bb906cc0a48980f2ab4e13c47ba3ad"
+    "2cf120e7393ca5f537c6a38b457585576dc035fc"
   ],
   "v16": [
     "16.4",
-    "d009084a745cb4d5e6de222c778b2a562c8b2767"
+    "1d7081a3b076ddf5086e0b118d4329820e6a7427"
   ],
   "v15": [
     "15.8",
-    "1dbd6f316416c8360bbd4f3d6db956cf70937cf0"
+    "16c3c6b64f1420a367a2a9b2510f20d94f791af8"
   ],
   "v14": [
     "14.13",
-    "f9c51c12438b20049b6905eb4e43d321defd6ff2"
+    "a38d15f3233a4c07f2bf3335fcbd874dd1f4e386"
   ]
 }

From a3800dcb0cbe44678a4d823f324b951ca3a0d4f8 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Arpad=20M=C3=BCller?= <arpad-m@users.noreply.github.com>
Date: Sat, 21 Sep 2024 14:36:41 +0200
Subject: [PATCH 108/142] Move load_timeline_metadata into separate function
 (#9080)

Moves the per-timeline code to load timeline metadata into a new
dedicated function called `load_timeline_metadata`. The old
`load_timeline_metadata` becomes `load_timelines_metadata`.

Split out of #8907

Part of #8088
---
 pageserver/src/tenant.rs | 74 +++++++++++++++++++++-------------------
 1 file changed, 38 insertions(+), 36 deletions(-)

diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index e328cd2044..be69f3d67f 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -18,7 +18,6 @@ use camino::Utf8Path;
 use camino::Utf8PathBuf;
 use enumset::EnumSet;
 use futures::stream::FuturesUnordered;
-use futures::FutureExt;
 use futures::StreamExt;
 use pageserver_api::models;
 use pageserver_api::models::AuxFilePolicy;
@@ -34,6 +33,7 @@ use remote_storage::GenericRemoteStorage;
 use remote_storage::TimeoutOrCancel;
 use std::collections::BTreeMap;
 use std::fmt;
+use std::future::Future;
 use std::sync::Weak;
 use std::time::SystemTime;
 use storage_broker::BrokerClientChannel;
@@ -1031,13 +1031,9 @@ impl Tenant {
         }
 
         Ok(TenantPreload {
-            timelines: Self::load_timeline_metadata(
-                self,
-                remote_timeline_ids,
-                remote_storage,
-                cancel,
-            )
-            .await?,
+            timelines: self
+                .load_timelines_metadata(remote_timeline_ids, remote_storage, cancel)
+                .await?,
         })
     }
 
@@ -1303,7 +1299,7 @@ impl Tenant {
         .await
     }
 
-    async fn load_timeline_metadata(
+    async fn load_timelines_metadata(
         self: &Arc<Tenant>,
         timeline_ids: HashSet<TimelineId>,
         remote_storage: &GenericRemoteStorage,
@@ -1311,33 +1307,10 @@ impl Tenant {
     ) -> anyhow::Result<HashMap<TimelineId, TimelinePreload>> {
         let mut part_downloads = JoinSet::new();
         for timeline_id in timeline_ids {
-            let client = RemoteTimelineClient::new(
-                remote_storage.clone(),
-                self.deletion_queue_client.clone(),
-                self.conf,
-                self.tenant_shard_id,
-                timeline_id,
-                self.generation,
-            );
             let cancel_clone = cancel.clone();
             part_downloads.spawn(
-                async move {
-                    debug!("starting index part download");
-
-                    let index_part = client.download_index_file(&cancel_clone).await;
-
-                    debug!("finished index part download");
-
-                    Result::<_, anyhow::Error>::Ok(TimelinePreload {
-                        client,
-                        timeline_id,
-                        index_part,
-                    })
-                }
-                .map(move |res| {
-                    res.with_context(|| format!("download index part for timeline {timeline_id}"))
-                })
-                .instrument(info_span!("download_index_part", %timeline_id)),
+                self.load_timeline_metadata(timeline_id, remote_storage.clone(), cancel_clone)
+                    .instrument(info_span!("download_index_part", %timeline_id)),
             );
         }
 
@@ -1348,8 +1321,7 @@ impl Tenant {
                 next = part_downloads.join_next() => {
                     match next {
                         Some(result) => {
-                            let preload_result = result.context("join preload task")?;
-                            let preload = preload_result?;
+                            let preload = result.context("join preload task")?;
                             timeline_preloads.insert(preload.timeline_id, preload);
                         },
                         None => {
@@ -1366,6 +1338,36 @@ impl Tenant {
         Ok(timeline_preloads)
     }
 
+    fn load_timeline_metadata(
+        self: &Arc<Tenant>,
+        timeline_id: TimelineId,
+        remote_storage: GenericRemoteStorage,
+        cancel: CancellationToken,
+    ) -> impl Future<Output = TimelinePreload> {
+        let client = RemoteTimelineClient::new(
+            remote_storage.clone(),
+            self.deletion_queue_client.clone(),
+            self.conf,
+            self.tenant_shard_id,
+            timeline_id,
+            self.generation,
+        );
+        async move {
+            debug_assert_current_span_has_tenant_and_timeline_id();
+            debug!("starting index part download");
+
+            let index_part = client.download_index_file(&cancel).await;
+
+            debug!("finished index part download");
+
+            TimelinePreload {
+                client,
+                timeline_id,
+                index_part,
+            }
+        }
+    }
+
     pub(crate) async fn apply_timeline_archival_config(
         &self,
         timeline_id: TimelineId,

From c9b2ec9ff1937e8a9465f2b4abb4d1a91a059ea7 Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki@neon.tech>
Date: Sun, 22 Sep 2024 21:46:53 +0300
Subject: [PATCH 109/142] Check submodule forward progress (#8949)

We frequently mess up our submodule references. This adds one safeguard:
it checks that the submodule references are only updated "forwards", not
to some older commit, or a commit that's not a descendant of the previous
one.

As a next step, I'm thinking that we should automate things so that when
you merge a PR to the 'neon' repository that updates the submodule
references, the REL_*_STABLE_neon branches are automatically updated to
match the submodule references. That way, you never need to manually
merge PRs in the postgres repository, it's all triggered from commits in
the 'neon' repository. But that's not included here.
---
 .github/workflows/build_and_test.yml | 54 ++++++++++++++++++++++++++++
 1 file changed, 54 insertions(+)

diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml
index c1ec3f207b..6617ca42bb 100644
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -120,6 +120,59 @@ jobs:
       - name: Run mypy to check types
         run: poetry run mypy .
 
+  # Check that the vendor/postgres-* submodules point to the
+  # corresponding REL_*_STABLE_neon branches.
+  check-submodules:
+    runs-on: ubuntu-22.04
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+        with:
+          submodules: true
+
+      - uses: dorny/paths-filter@v3
+        id: check-if-submodules-changed
+        with:
+          filters: |
+            vendor:
+              - 'vendor/**'
+
+      - name: Check vendor/postgres-v14 submodule reference
+        if: steps.check-if-submodules-changed.outputs.vendor == 'true'
+        uses: jtmullen/submodule-branch-check-action@v1
+        with:
+          path: "vendor/postgres-v14"
+          fetch_depth: "50"
+          sub_fetch_depth: "50"
+          pass_if_unchanged: true
+
+      - name: Check vendor/postgres-v15 submodule reference
+        if: steps.check-if-submodules-changed.outputs.vendor == 'true'
+        uses: jtmullen/submodule-branch-check-action@v1
+        with:
+          path: "vendor/postgres-v15"
+          fetch_depth: "50"
+          sub_fetch_depth: "50"
+          pass_if_unchanged: true
+
+      - name: Check vendor/postgres-v16 submodule reference
+        if: steps.check-if-submodules-changed.outputs.vendor == 'true'
+        uses: jtmullen/submodule-branch-check-action@v1
+        with:
+          path: "vendor/postgres-v16"
+          fetch_depth: "50"
+          sub_fetch_depth: "50"
+          pass_if_unchanged: true
+
+      - name: Check vendor/postgres-v17 submodule reference
+        if: steps.check-if-submodules-changed.outputs.vendor == 'true'
+        uses: jtmullen/submodule-branch-check-action@v1
+        with:
+          path: "vendor/postgres-v17"
+          fetch_depth: "50"
+          sub_fetch_depth: "50"
+          pass_if_unchanged: true
+
   check-codestyle-rust:
     needs: [ check-permissions, build-build-tools-image ]
     strategy:
@@ -1154,6 +1207,7 @@ jobs:
     # Usually we do `needs: [...]`
     needs:
       - build-and-test-locally
+      - check-submodules
       - check-codestyle-python
       - check-codestyle-rust
       - promote-images

From ecd615ab6d45354d781e01f9247da2378f27b91c Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki@neon.tech>
Date: Sat, 21 Sep 2024 00:46:56 +0300
Subject: [PATCH 110/142] Update "hostname" crate

We were already building v0.4.0 as an indirect dependency, so this
avoids having to build two different versions of it.
---
 Cargo.lock | 21 ++-------------------
 Cargo.toml |  2 +-
 2 files changed, 3 insertions(+), 20 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index 136f07956f..7d3b8f2a04 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -2411,17 +2411,6 @@ dependencies = [
  "digest",
 ]
 
-[[package]]
-name = "hostname"
-version = "0.3.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "3c731c3e10504cc8ed35cfe2f1db4c9274c3d35fa486e3b31df46f068ef3e867"
-dependencies = [
- "libc",
- "match_cfg",
- "winapi",
-]
-
 [[package]]
 name = "hostname"
 version = "0.4.0"
@@ -2974,12 +2963,6 @@ dependencies = [
  "hashbrown 0.14.5",
 ]
 
-[[package]]
-name = "match_cfg"
-version = "0.1.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ffbee8634e0d45d258acb448e7eaab3fce7a0a467395d4d9f228e3c1f01fb2e4"
-
 [[package]]
 name = "matchers"
 version = "0.1.0"
@@ -4350,7 +4333,7 @@ dependencies = [
  "hashlink",
  "hex",
  "hmac",
- "hostname 0.3.1",
+ "hostname",
  "http 1.1.0",
  "http-body-util",
  "humantime",
@@ -5400,7 +5383,7 @@ version = "0.32.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "eba8870c5dba2bfd9db25c75574a11429f6b95957b0a78ac02e2970dd7a5249a"
 dependencies = [
- "hostname 0.4.0",
+ "hostname",
  "libc",
  "os_info",
  "rustc_version",
diff --git a/Cargo.toml b/Cargo.toml
index fd1d4e016c..1871be7f23 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -95,7 +95,7 @@ hdrhistogram = "7.5.2"
 hex = "0.4"
 hex-literal = "0.4"
 hmac = "0.12.1"
-hostname = "0.3.1"
+hostname = "0.4"
 http = {version = "1.1.0", features = ["std"]}
 http-types = { version = "2", default-features = false }
 humantime = "2.1"

From 913af442195313af7b43559da1afbe896f0886c4 Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki@neon.tech>
Date: Sat, 21 Sep 2024 00:47:00 +0300
Subject: [PATCH 111/142] Update "memoffset" crate

To eliminate one version of it from our dependency tree.
---
 Cargo.lock | 11 +----------
 Cargo.toml |  2 +-
 2 files changed, 2 insertions(+), 11 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index 7d3b8f2a04..bd162f09dc 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -3055,15 +3055,6 @@ dependencies = [
  "autocfg",
 ]
 
-[[package]]
-name = "memoffset"
-version = "0.8.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d61c719bcfbcf5d62b3a09efa6088de8c54bc0bfcd3ea7ae39fcc186108b8de1"
-dependencies = [
- "autocfg",
-]
-
 [[package]]
 name = "memoffset"
 version = "0.9.0"
@@ -4123,7 +4114,7 @@ dependencies = [
  "crc32c",
  "env_logger",
  "log",
- "memoffset 0.8.0",
+ "memoffset 0.9.0",
  "once_cell",
  "postgres",
  "regex",
diff --git a/Cargo.toml b/Cargo.toml
index 1871be7f23..b7f06b2296 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -113,7 +113,7 @@ libc = "0.2"
 md5 = "0.7.0"
 measured = { version = "0.0.22", features=["lasso"] }
 measured-process = { version = "0.0.22" }
-memoffset = "0.8"
+memoffset = "0.9"
 nix = { version = "0.27", features = ["dir", "fs", "process", "socket", "signal", "poll"] }
 notify = "6.0.0"
 num_cpus = "1.15"

From 9f653893b9b57236fa184b08594c1f70c7222537 Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki@neon.tech>
Date: Sat, 21 Sep 2024 00:47:02 +0300
Subject: [PATCH 112/142] Update a few dependencies, removing some indirect
 dependencies

    cargo update ciborium iana-time-zone lazy_static schannel uuid
    cargo update hyper@0.14
    cargo update  --precise 2.9.7 ureq

It might be worthwhile to just update all our dependencies at some point,
but this is aimed at pruning the dependency tree, to make the build a
little faster. That's also why I didn't update ureq to the latest
version: that would've added a dependency to yet another version of
rustls.
---
 Cargo.lock                | 275 ++++++++++----------------------------
 workspace_hack/Cargo.toml |   2 +
 2 files changed, 73 insertions(+), 204 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index bd162f09dc..e4dbd8b333 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -255,12 +255,6 @@ dependencies = [
  "syn 2.0.52",
 ]
 
-[[package]]
-name = "atomic"
-version = "0.5.3"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c59bdb34bc650a32731b31bd8f0829cc15d24a708ee31559e0bb34f2bc320cba"
-
 [[package]]
 name = "atomic-take"
 version = "1.1.0"
@@ -295,8 +289,8 @@ dependencies = [
  "fastrand 2.0.0",
  "hex",
  "http 0.2.9",
- "hyper 0.14.26",
- "ring 0.17.6",
+ "hyper 0.14.30",
+ "ring",
  "time",
  "tokio",
  "tracing",
@@ -486,7 +480,7 @@ dependencies = [
  "once_cell",
  "p256 0.11.1",
  "percent-encoding",
- "ring 0.17.6",
+ "ring",
  "sha2",
  "subtle",
  "time",
@@ -593,7 +587,7 @@ dependencies = [
  "http 0.2.9",
  "http-body 0.4.5",
  "http-body 1.0.0",
- "hyper 0.14.26",
+ "hyper 0.14.30",
  "hyper-rustls 0.24.0",
  "once_cell",
  "pin-project-lite",
@@ -684,7 +678,7 @@ dependencies = [
  "futures-util",
  "http 0.2.9",
  "http-body 0.4.5",
- "hyper 0.14.26",
+ "hyper 0.14.30",
  "itoa",
  "matchit 0.7.0",
  "memchr",
@@ -1089,9 +1083,9 @@ dependencies = [
 
 [[package]]
 name = "ciborium"
-version = "0.2.1"
+version = "0.2.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "effd91f6c78e5a4ace8a5d3c0b6bfaec9e2baaef55f3efc00e45fb2e477ee926"
+checksum = "42e69ffd6f0917f5c029256a24d0161db17cea3997d185db0d35926308770f0e"
 dependencies = [
  "ciborium-io",
  "ciborium-ll",
@@ -1100,18 +1094,18 @@ dependencies = [
 
 [[package]]
 name = "ciborium-io"
-version = "0.2.1"
+version = "0.2.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "cdf919175532b369853f5d5e20b26b43112613fd6fe7aee757e35f7a44642656"
+checksum = "05afea1e0a06c9be33d539b876f1ce3692f4afea2cb41f740e7743225ed1c757"
 
 [[package]]
 name = "ciborium-ll"
-version = "0.2.1"
+version = "0.2.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "defaa24ecc093c77630e6c15e17c51f5e187bf35ee514f4e2d67baaa96dae22b"
+checksum = "57663b653d948a338bfb3eeba9bb2fd5fcfaecb9e199e87e1eda4d9e8b240fd9"
 dependencies = [
  "ciborium-io",
- "half 1.8.2",
+ "half",
 ]
 
 [[package]]
@@ -1224,7 +1218,7 @@ dependencies = [
  "compute_api",
  "flate2",
  "futures",
- "hyper 0.14.26",
+ "hyper 0.14.30",
  "nix 0.27.1",
  "notify",
  "num_cpus",
@@ -1330,7 +1324,7 @@ dependencies = [
  "git-version",
  "humantime",
  "humantime-serde",
- "hyper 0.14.26",
+ "hyper 0.14.30",
  "nix 0.27.1",
  "once_cell",
  "pageserver_api",
@@ -2304,12 +2298,6 @@ dependencies = [
  "tracing",
 ]
 
-[[package]]
-name = "half"
-version = "1.8.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "eabb4a44450da02c90444cf74558da904edde8fb4e9035a9a6a4e15445af0bd7"
-
 [[package]]
 name = "half"
 version = "2.4.1"
@@ -2419,7 +2407,7 @@ checksum = "f9c7c7c8ac16c798734b8a24560c1362120597c40d5e1459f09498f8f6c8f2ba"
 dependencies = [
  "cfg-if",
  "libc",
- "windows 0.52.0",
+ "windows",
 ]
 
 [[package]]
@@ -2528,9 +2516,9 @@ dependencies = [
 
 [[package]]
 name = "hyper"
-version = "0.14.26"
+version = "0.14.30"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ab302d72a6f11a3b910431ff93aae7e773078c769f0a3ef15fb9ec692ed147d4"
+checksum = "a152ddd61dfaec7273fe8419ab357f33aee0d914c5f4efbf0d96fa749eea5ec9"
 dependencies = [
  "bytes",
  "futures-channel",
@@ -2543,7 +2531,7 @@ dependencies = [
  "httpdate",
  "itoa",
  "pin-project-lite",
- "socket2 0.4.9",
+ "socket2",
  "tokio",
  "tower-service",
  "tracing",
@@ -2578,7 +2566,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "0646026eb1b3eea4cd9ba47912ea5ce9cc07713d105b1a14698f4e6433d348b7"
 dependencies = [
  "http 0.2.9",
- "hyper 0.14.26",
+ "hyper 0.14.30",
  "log",
  "rustls 0.21.11",
  "rustls-native-certs 0.6.2",
@@ -2609,7 +2597,7 @@ version = "0.4.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "bbb958482e8c7be4bc3cf272a766a2b0bf1a6755e7a6ae777f017a31d11b13b1"
 dependencies = [
- "hyper 0.14.26",
+ "hyper 0.14.30",
  "pin-project-lite",
  "tokio",
  "tokio-io-timeout",
@@ -2628,7 +2616,7 @@ dependencies = [
  "http-body 1.0.0",
  "hyper 1.2.0",
  "pin-project-lite",
- "socket2 0.5.5",
+ "socket2",
  "tokio",
  "tower",
  "tower-service",
@@ -2637,16 +2625,16 @@ dependencies = [
 
 [[package]]
 name = "iana-time-zone"
-version = "0.1.56"
+version = "0.1.61"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "0722cd7114b7de04316e7ea5456a0bbb20e4adb46fd27a3697adb812cff0f37c"
+checksum = "235e081f3925a06703c2d0117ea8b91f042756fd6e7a6e5d901e8ca1a996b220"
 dependencies = [
  "android_system_properties",
  "core-foundation-sys",
  "iana-time-zone-haiku",
  "js-sys",
  "wasm-bindgen",
- "windows 0.48.0",
+ "windows-core",
 ]
 
 [[package]]
@@ -2859,7 +2847,7 @@ dependencies = [
  "base64 0.21.1",
  "js-sys",
  "pem",
- "ring 0.17.6",
+ "ring",
  "serde",
  "serde_json",
  "simple_asn1",
@@ -2897,11 +2885,11 @@ dependencies = [
 
 [[package]]
 name = "lazy_static"
-version = "1.4.0"
+version = "1.5.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646"
+checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe"
 dependencies = [
- "spin 0.5.2",
+ "spin",
 ]
 
 [[package]]
@@ -3634,7 +3622,7 @@ dependencies = [
  "hex-literal",
  "humantime",
  "humantime-serde",
- "hyper 0.14.26",
+ "hyper 0.14.30",
  "indoc",
  "itertools 0.10.5",
  "md5",
@@ -3827,7 +3815,7 @@ dependencies = [
  "ahash",
  "bytes",
  "chrono",
- "half 2.4.1",
+ "half",
  "hashbrown 0.14.5",
  "num",
  "num-bigint",
@@ -4329,7 +4317,7 @@ dependencies = [
  "http-body-util",
  "humantime",
  "humantime-serde",
- "hyper 0.14.26",
+ "hyper 0.14.30",
  "hyper 1.2.0",
  "hyper-util",
  "indexmap 2.0.1",
@@ -4374,7 +4362,7 @@ dependencies = [
  "signature 2.2.0",
  "smallvec",
  "smol_str",
- "socket2 0.5.5",
+ "socket2",
  "subtle",
  "thiserror",
  "tikv-jemalloc-ctl",
@@ -4552,7 +4540,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "48406db8ac1f3cbc7dcdb56ec355343817958a356ff430259bb07baf7607e1e1"
 dependencies = [
  "pem",
- "ring 0.17.6",
+ "ring",
  "time",
  "yasna",
 ]
@@ -4576,7 +4564,7 @@ dependencies = [
  "rustls-pki-types",
  "ryu",
  "sha1_smol",
- "socket2 0.5.5",
+ "socket2",
  "tokio",
  "tokio-rustls 0.25.0",
  "tokio-util",
@@ -4688,7 +4676,7 @@ dependencies = [
  "futures-util",
  "http-types",
  "humantime-serde",
- "hyper 0.14.26",
+ "hyper 0.14.30",
  "itertools 0.10.5",
  "metrics",
  "once_cell",
@@ -4721,7 +4709,7 @@ dependencies = [
  "h2 0.3.26",
  "http 0.2.9",
  "http-body 0.4.5",
- "hyper 0.14.26",
+ "hyper 0.14.30",
  "hyper-rustls 0.24.0",
  "ipnet",
  "js-sys",
@@ -4879,21 +4867,6 @@ dependencies = [
  "subtle",
 ]
 
-[[package]]
-name = "ring"
-version = "0.16.20"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "3053cf52e236a3ed746dfc745aa9cacf1b791d846bdaf412f60a8d7d6e17c8fc"
-dependencies = [
- "cc",
- "libc",
- "once_cell",
- "spin 0.5.2",
- "untrusted 0.7.1",
- "web-sys",
- "winapi",
-]
-
 [[package]]
 name = "ring"
 version = "0.17.6"
@@ -4903,8 +4876,8 @@ dependencies = [
  "cc",
  "getrandom 0.2.11",
  "libc",
- "spin 0.9.8",
- "untrusted 0.9.0",
+ "spin",
+ "untrusted",
  "windows-sys 0.48.0",
 ]
 
@@ -4924,7 +4897,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "496c1d3718081c45ba9c31fbfc07417900aa96f4070ff90dc29961836b7a9945"
 dependencies = [
  "http 0.2.9",
- "hyper 0.14.26",
+ "hyper 0.14.30",
  "lazy_static",
  "percent-encoding",
  "regex",
@@ -5048,7 +5021,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "7fecbfb7b1444f477b345853b1fce097a2c6fb637b2bfb87e6bc5db0f043fae4"
 dependencies = [
  "log",
- "ring 0.17.6",
+ "ring",
  "rustls-webpki 0.101.7",
  "sct",
 ]
@@ -5060,7 +5033,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "bf4ef73721ac7bcd79b2b315da7779d8fc09718c6b3d2d1b2d94850eb8c18432"
 dependencies = [
  "log",
- "ring 0.17.6",
+ "ring",
  "rustls-pki-types",
  "rustls-webpki 0.102.2",
  "subtle",
@@ -5117,24 +5090,14 @@ version = "1.3.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "5ede67b28608b4c60685c7d54122d4400d90f62b40caee7700e700380a390fa8"
 
-[[package]]
-name = "rustls-webpki"
-version = "0.100.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e98ff011474fa39949b7e5c0428f9b4937eda7da7848bbb947786b7be0b27dab"
-dependencies = [
- "ring 0.16.20",
- "untrusted 0.7.1",
-]
-
 [[package]]
 name = "rustls-webpki"
 version = "0.101.7"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "8b6275d1ee7a1cd780b64aca7726599a1dbc893b1e64144529e55c3c2f745765"
 dependencies = [
- "ring 0.17.6",
- "untrusted 0.9.0",
+ "ring",
+ "untrusted",
 ]
 
 [[package]]
@@ -5143,9 +5106,9 @@ version = "0.102.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "faaa0a62740bedb9b2ef5afa303da42764c012f743917351dc9a237ea1663610"
 dependencies = [
- "ring 0.17.6",
+ "ring",
  "rustls-pki-types",
- "untrusted 0.9.0",
+ "untrusted",
 ]
 
 [[package]]
@@ -5179,7 +5142,7 @@ dependencies = [
  "git-version",
  "hex",
  "humantime",
- "hyper 0.14.26",
+ "hyper 0.14.30",
  "metrics",
  "once_cell",
  "parking_lot 0.12.1",
@@ -5236,11 +5199,11 @@ dependencies = [
 
 [[package]]
 name = "schannel"
-version = "0.1.21"
+version = "0.1.23"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "713cfb06c7059f3588fb8044c0fad1d09e3c01d225e25b9220dbfdcf16dbb1b3"
+checksum = "fbc91545643bcf3a0bbb6569265615222618bdf33ce4ffbbd13c4bbd4c093534"
 dependencies = [
- "windows-sys 0.42.0",
+ "windows-sys 0.52.0",
 ]
 
 [[package]]
@@ -5264,8 +5227,8 @@ version = "0.7.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "da046153aa2352493d6cb7da4b6e5c0c057d8a1d0a9aa8560baffdd945acd414"
 dependencies = [
- "ring 0.17.6",
- "untrusted 0.9.0",
+ "ring",
+ "untrusted",
 ]
 
 [[package]]
@@ -5686,16 +5649,6 @@ dependencies = [
  "serde",
 ]
 
-[[package]]
-name = "socket2"
-version = "0.4.9"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "64a4a911eed85daf18834cfaa86a79b7d266ff93ff5ba14005426219480ed662"
-dependencies = [
- "libc",
- "winapi",
-]
-
 [[package]]
 name = "socket2"
 version = "0.5.5"
@@ -5706,12 +5659,6 @@ dependencies = [
  "windows-sys 0.48.0",
 ]
 
-[[package]]
-name = "spin"
-version = "0.5.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "6e63cff320ae2c57904679ba7cb63280a3dc4613885beafb148ee7bf9aa9042d"
-
 [[package]]
 name = "spin"
 version = "0.9.8"
@@ -5757,7 +5704,7 @@ dependencies = [
  "futures-util",
  "git-version",
  "humantime",
- "hyper 0.14.26",
+ "hyper 0.14.30",
  "metrics",
  "once_cell",
  "parking_lot 0.12.1",
@@ -5786,7 +5733,7 @@ dependencies = [
  "git-version",
  "hex",
  "humantime",
- "hyper 0.14.26",
+ "hyper 0.14.30",
  "itertools 0.10.5",
  "lasso",
  "measured",
@@ -6202,7 +6149,7 @@ dependencies = [
  "num_cpus",
  "pin-project-lite",
  "signal-hook-registry",
- "socket2 0.5.5",
+ "socket2",
  "tokio-macros",
  "windows-sys 0.48.0",
 ]
@@ -6262,7 +6209,7 @@ dependencies = [
  "pin-project-lite",
  "postgres-protocol",
  "postgres-types",
- "socket2 0.5.5",
+ "socket2",
  "tokio",
  "tokio-util",
 ]
@@ -6274,7 +6221,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "0ea13f22eda7127c827983bdaf0d7fff9df21c8817bab02815ac277a21143677"
 dependencies = [
  "futures",
- "ring 0.17.6",
+ "ring",
  "rustls 0.22.4",
  "tokio",
  "tokio-postgres",
@@ -6408,7 +6355,7 @@ dependencies = [
  "h2 0.3.26",
  "http 0.2.9",
  "http-body 0.4.5",
- "hyper 0.14.26",
+ "hyper 0.14.30",
  "hyper-timeout",
  "percent-encoding",
  "pin-project",
@@ -6585,7 +6532,7 @@ dependencies = [
 name = "tracing-utils"
 version = "0.1.0"
 dependencies = [
- "hyper 0.14.26",
+ "hyper 0.14.30",
  "opentelemetry",
  "opentelemetry-otlp",
  "opentelemetry-semantic-conventions",
@@ -6688,12 +6635,6 @@ version = "0.2.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "f962df74c8c05a667b5ee8bcf162993134c104e96440b663c8daa176dc772d8c"
 
-[[package]]
-name = "untrusted"
-version = "0.7.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a156c684c91ea7d62626509bce3cb4e1d9ed5c4d978f7b4352658f96a4c26b4a"
-
 [[package]]
 name = "untrusted"
 version = "0.9.0"
@@ -6702,17 +6643,18 @@ checksum = "8ecb6da28b8a351d773b68d5825ac39017e680750f980f3a1a85cd8dd28a47c1"
 
 [[package]]
 name = "ureq"
-version = "2.7.1"
+version = "2.9.7"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "0b11c96ac7ee530603dcdf68ed1557050f374ce55a5a07193ebf8cbc9f8927e9"
+checksum = "d11a831e3c0b56e438a28308e7c810799e3c118417f342d30ecec080105395cd"
 dependencies = [
- "base64 0.21.1",
+ "base64 0.22.1",
  "log",
  "once_cell",
- "rustls 0.21.11",
- "rustls-webpki 0.100.2",
+ "rustls 0.22.4",
+ "rustls-pki-types",
+ "rustls-webpki 0.102.2",
  "url",
- "webpki-roots 0.23.1",
+ "webpki-roots 0.26.1",
 ]
 
 [[package]]
@@ -6776,7 +6718,7 @@ dependencies = [
  "hex",
  "hex-literal",
  "humantime",
- "hyper 0.14.26",
+ "hyper 0.14.30",
  "jsonwebtoken",
  "metrics",
  "nix 0.27.1",
@@ -6811,11 +6753,10 @@ dependencies = [
 
 [[package]]
 name = "uuid"
-version = "1.6.1"
+version = "1.10.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5e395fcf16a7a3d8127ec99782007af141946b4795001f876d54fb0d55978560"
+checksum = "81dfa00651efa65069b0b6b651f4aaa31ba9e3c3ce0137aaad053604ee7e0314"
 dependencies = [
- "atomic",
  "getrandom 0.2.11",
  "serde",
 ]
@@ -7049,15 +6990,6 @@ dependencies = [
  "wasm-bindgen",
 ]
 
-[[package]]
-name = "webpki-roots"
-version = "0.23.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b03058f88386e5ff5310d9111d53f48b17d732b401aeb83a8d5190f2ac459338"
-dependencies = [
- "rustls-webpki 0.100.2",
-]
-
 [[package]]
 name = "webpki-roots"
 version = "0.25.2"
@@ -7126,15 +7058,6 @@ version = "0.4.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f"
 
-[[package]]
-name = "windows"
-version = "0.48.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e686886bc078bc1b0b600cac0147aadb815089b6e4da64016cbd754b6342700f"
-dependencies = [
- "windows-targets 0.48.0",
-]
-
 [[package]]
 name = "windows"
 version = "0.52.0"
@@ -7154,21 +7077,6 @@ dependencies = [
  "windows-targets 0.52.4",
 ]
 
-[[package]]
-name = "windows-sys"
-version = "0.42.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5a3e1820f08b8513f676f7ab6c1f99ff312fb97b553d30ff4dd86f9f15728aa7"
-dependencies = [
- "windows_aarch64_gnullvm 0.42.2",
- "windows_aarch64_msvc 0.42.2",
- "windows_i686_gnu 0.42.2",
- "windows_i686_msvc 0.42.2",
- "windows_x86_64_gnu 0.42.2",
- "windows_x86_64_gnullvm 0.42.2",
- "windows_x86_64_msvc 0.42.2",
-]
-
 [[package]]
 name = "windows-sys"
 version = "0.48.0"
@@ -7217,12 +7125,6 @@ dependencies = [
  "windows_x86_64_msvc 0.52.4",
 ]
 
-[[package]]
-name = "windows_aarch64_gnullvm"
-version = "0.42.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "597a5118570b68bc08d8d59125332c54f1ba9d9adeedeef5b99b02ba2b0698f8"
-
 [[package]]
 name = "windows_aarch64_gnullvm"
 version = "0.48.0"
@@ -7235,12 +7137,6 @@ version = "0.52.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "bcf46cf4c365c6f2d1cc93ce535f2c8b244591df96ceee75d8e83deb70a9cac9"
 
-[[package]]
-name = "windows_aarch64_msvc"
-version = "0.42.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e08e8864a60f06ef0d0ff4ba04124db8b0fb3be5776a5cd47641e942e58c4d43"
-
 [[package]]
 name = "windows_aarch64_msvc"
 version = "0.48.0"
@@ -7253,12 +7149,6 @@ version = "0.52.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "da9f259dd3bcf6990b55bffd094c4f7235817ba4ceebde8e6d11cd0c5633b675"
 
-[[package]]
-name = "windows_i686_gnu"
-version = "0.42.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c61d927d8da41da96a81f029489353e68739737d3beca43145c8afec9a31a84f"
-
 [[package]]
 name = "windows_i686_gnu"
 version = "0.48.0"
@@ -7271,12 +7161,6 @@ version = "0.52.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "b474d8268f99e0995f25b9f095bc7434632601028cf86590aea5c8a5cb7801d3"
 
-[[package]]
-name = "windows_i686_msvc"
-version = "0.42.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "44d840b6ec649f480a41c8d80f9c65108b92d89345dd94027bfe06ac444d1060"
-
 [[package]]
 name = "windows_i686_msvc"
 version = "0.48.0"
@@ -7289,12 +7173,6 @@ version = "0.52.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "1515e9a29e5bed743cb4415a9ecf5dfca648ce85ee42e15873c3cd8610ff8e02"
 
-[[package]]
-name = "windows_x86_64_gnu"
-version = "0.42.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8de912b8b8feb55c064867cf047dda097f92d51efad5b491dfb98f6bbb70cb36"
-
 [[package]]
 name = "windows_x86_64_gnu"
 version = "0.48.0"
@@ -7307,12 +7185,6 @@ version = "0.52.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "5eee091590e89cc02ad514ffe3ead9eb6b660aedca2183455434b93546371a03"
 
-[[package]]
-name = "windows_x86_64_gnullvm"
-version = "0.42.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "26d41b46a36d453748aedef1486d5c7a85db22e56aff34643984ea85514e94a3"
-
 [[package]]
 name = "windows_x86_64_gnullvm"
 version = "0.48.0"
@@ -7325,12 +7197,6 @@ version = "0.52.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "77ca79f2451b49fa9e2af39f0747fe999fcda4f5e241b2898624dca97a1f2177"
 
-[[package]]
-name = "windows_x86_64_msvc"
-version = "0.42.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "9aec5da331524158c6d1a4ac0ab1541149c0b9505fde06423b02f5ef0106b9f0"
-
 [[package]]
 name = "windows_x86_64_msvc"
 version = "0.48.0"
@@ -7407,10 +7273,11 @@ dependencies = [
  "futures-util",
  "generic-array",
  "getrandom 0.2.11",
+ "half",
  "hashbrown 0.14.5",
  "hex",
  "hmac",
- "hyper 0.14.26",
+ "hyper 0.14.30",
  "indexmap 1.9.3",
  "itertools 0.10.5",
  "itertools 0.12.1",
@@ -7478,7 +7345,7 @@ dependencies = [
  "der 0.7.8",
  "hex",
  "pem",
- "ring 0.17.6",
+ "ring",
  "signature 2.2.0",
  "spki 0.7.3",
  "thiserror",
diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml
index 662916d42c..e6d21e9434 100644
--- a/workspace_hack/Cargo.toml
+++ b/workspace_hack/Cargo.toml
@@ -45,6 +45,7 @@ futures-io = { version = "0.3" }
 futures-util = { version = "0.3", features = ["channel", "io", "sink"] }
 generic-array = { version = "0.14", default-features = false, features = ["more_lengths", "zeroize"] }
 getrandom = { version = "0.2", default-features = false, features = ["std"] }
+half = { version = "2", default-features = false, features = ["num-traits"] }
 hashbrown = { version = "0.14", features = ["raw"] }
 hex = { version = "0.4", features = ["serde"] }
 hmac = { version = "0.12", default-features = false, features = ["reset"] }
@@ -106,6 +107,7 @@ cc = { version = "1", default-features = false, features = ["parallel"] }
 chrono = { version = "0.4", default-features = false, features = ["clock", "serde", "wasmbind"] }
 either = { version = "1" }
 getrandom = { version = "0.2", default-features = false, features = ["std"] }
+half = { version = "2", default-features = false, features = ["num-traits"] }
 hashbrown = { version = "0.14", features = ["raw"] }
 indexmap = { version = "1", default-features = false, features = ["std"] }
 itertools-5ef9efb8ec2df382 = { package = "itertools", version = "0.12", default-features = false, features = ["use_std"] }

From e16e82749f52f623c092b2ed0dd205f50dd8cdb5 Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki@neon.tech>
Date: Sat, 21 Sep 2024 00:47:05 +0300
Subject: [PATCH 113/142] Remove unused crates from workspace Cargo.toml

These were not referenced in any of the other Cargo.toml files in the
workspace. They were not being built because of that, so there was
little harm in having them listed, but let's be tidy.
---
 Cargo.toml | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/Cargo.toml b/Cargo.toml
index b7f06b2296..a788dcf3cb 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -76,8 +76,6 @@ clap = { version = "4.0", features = ["derive"] }
 comfy-table = "7.1"
 const_format = "0.2"
 crc32c = "0.6"
-crossbeam-deque = "0.8.5"
-crossbeam-utils = "0.8.5"
 dashmap = { version = "5.5.0", features = ["raw-api"] }
 either = "1.8"
 enum-map = "2.4.2"
@@ -104,7 +102,6 @@ hyper = "0.14"
 tokio-tungstenite = "0.20.0"
 indexmap = "2"
 indoc = "2"
-inotify = "0.10.2"
 ipnet = "2.9.0"
 itertools = "0.10"
 jsonwebtoken = "9"
@@ -142,7 +139,6 @@ rpds = "0.13"
 rustc-hash = "1.1.0"
 rustls = "0.22"
 rustls-pemfile = "2"
-rustls-split = "0.3"
 scopeguard = "1.1"
 sysinfo = "0.29.2"
 sd-notify = "0.4.1"
@@ -164,7 +160,6 @@ strum_macros = "0.26"
 svg_fmt = "0.4.3"
 sync_wrapper = "0.1.2"
 tar = "0.4"
-task-local-extensions = "0.1.4"
 test-context = "0.3"
 thiserror = "1.0"
 tikv-jemallocator = "0.5"

From 5432155b0d161a332d6d8ec2933a875d9959e558 Mon Sep 17 00:00:00 2001
From: Vlad Lazar <vlad@neon.tech>
Date: Mon, 23 Sep 2024 10:05:02 +0100
Subject: [PATCH 114/142] storcon: update compute hook state on detach (#9045)

## Problem

Previously, the storage controller could send compute notifications
containing stale pageservers (i.e. the pageserver serving the shard had
been detached). This happened because detaches did not update the compute
hook state.

## Summary of Changes

Update compute hook state on shard detach.

Fixes #8928
---
 storage_controller/src/compute_hook.rs | 61 ++++++++++++++++++++++++++
 storage_controller/src/reconciler.rs   | 10 +++++
 2 files changed, 71 insertions(+)

diff --git a/storage_controller/src/compute_hook.rs b/storage_controller/src/compute_hook.rs
index c46539485c..bafae1f551 100644
--- a/storage_controller/src/compute_hook.rs
+++ b/storage_controller/src/compute_hook.rs
@@ -71,6 +71,37 @@ impl ComputeHookTenant {
         }
     }
 
+    fn is_sharded(&self) -> bool {
+        matches!(self, ComputeHookTenant::Sharded(_))
+    }
+
+    /// Clear compute hook state for the specified shard.
+    /// Only valid for [`ComputeHookTenant::Sharded`] instances.
+    fn remove_shard(&mut self, tenant_shard_id: TenantShardId, stripe_size: ShardStripeSize) {
+        match self {
+            ComputeHookTenant::Sharded(sharded) => {
+                if sharded.stripe_size != stripe_size
+                    || sharded.shard_count != tenant_shard_id.shard_count
+                {
+                    tracing::warn!("Shard split detected while handling detach")
+                }
+
+                let shard_idx = sharded.shards.iter().position(|(shard_number, _node_id)| {
+                    *shard_number == tenant_shard_id.shard_number
+                });
+
+                if let Some(shard_idx) = shard_idx {
+                    sharded.shards.remove(shard_idx);
+                } else {
+                    tracing::warn!("Shard not found while handling detach")
+                }
+            }
+            ComputeHookTenant::Unsharded(_) => {
+                unreachable!("Detach of unsharded tenants is handled externally");
+            }
+        }
+    }
+
     /// Set one shard's location.  If stripe size or shard count have changed, Self is reset
     /// and drops existing content.
     fn update(
@@ -614,6 +645,36 @@ impl ComputeHook {
         self.notify_execute(maybe_send_result, tenant_shard_id, cancel)
             .await
     }
+
+    /// Reflect a detach for a particular shard in the compute hook state.
+    ///
+    /// The goal is to avoid sending compute notifications with stale information (i.e.
+    /// including detached pageservers).
+    #[tracing::instrument(skip_all, fields(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug()))]
+    pub(super) fn handle_detach(
+        &self,
+        tenant_shard_id: TenantShardId,
+        stripe_size: ShardStripeSize,
+    ) {
+        use std::collections::hash_map::Entry;
+
+        let mut state_locked = self.state.lock().unwrap();
+        match state_locked.entry(tenant_shard_id.tenant_id) {
+            Entry::Vacant(_) => {
+                tracing::warn!("Compute hook tenant not found for detach");
+            }
+            Entry::Occupied(mut e) => {
+                let sharded = e.get().is_sharded();
+                if !sharded {
+                    e.remove();
+                } else {
+                    e.get_mut().remove_shard(tenant_shard_id, stripe_size);
+                }
+
+                tracing::debug!("Compute hook handled shard detach");
+            }
+        }
+    }
 }
 
 #[cfg(test)]
diff --git a/storage_controller/src/reconciler.rs b/storage_controller/src/reconciler.rs
index 83b7b2b4f2..750bcd7c01 100644
--- a/storage_controller/src/reconciler.rs
+++ b/storage_controller/src/reconciler.rs
@@ -820,6 +820,16 @@ impl Reconciler {
             self.location_config(&node, conf, None, false).await?;
         }
 
+        // The condition below identifies a detach. We must have no attached intent and
+        // must have been attached to something previously. Pass this information to
+        // the [`ComputeHook`] such that it can update its tenant-wide state.
+        if self.intent.attached.is_none() && !self.detach.is_empty() {
+            // TODO: Consider notifying control plane about detaches. This would avoid situations
+            // where the compute tries to start-up with a stale set of pageservers.
+            self.compute_hook
+                .handle_detach(self.tenant_shard_id, self.shard.stripe_size);
+        }
+
         failpoint_support::sleep_millis_async!("sleep-on-reconcile-epilogue");
 
         Ok(())

From 59b4c2eaf956eb17d6360cfa94c05c830f1b535a Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Mon, 23 Sep 2024 12:19:37 +0200
Subject: [PATCH 115/142] walredo: add a ping method (#8952)

Not used in production, but in benchmarks, to demonstrate minimal RTT.
(It would be nice to not have to copy the 8KiB of zeroes, but that
would require larger protocol changes.)

Found this useful in investigation
https://github.com/neondatabase/neon/pull/8952.
---
 pageserver/benches/bench_walredo.rs        | 136 +++++++++++++--------
 pageserver/src/walredo.rs                  |  30 +++++
 pageserver/src/walredo/process.rs          |  21 ++++
 pageserver/src/walredo/process/protocol.rs |   5 +
 pgxn/neon_walredo/walredoproc.c            |  36 ++++++
 5 files changed, 176 insertions(+), 52 deletions(-)

diff --git a/pageserver/benches/bench_walredo.rs b/pageserver/benches/bench_walredo.rs
index edc09d0bf2..45936cb3fa 100644
--- a/pageserver/benches/bench_walredo.rs
+++ b/pageserver/benches/bench_walredo.rs
@@ -1,7 +1,7 @@
 //! Quantify a single walredo manager's throughput under N concurrent callers.
 //!
 //! The benchmark implementation ([`bench_impl`]) is parametrized by
-//! - `redo_work` => [`Request::short_request`] or [`Request::medium_request`]
+//! - `redo_work` => an async closure that takes a `PostgresRedoManager` and performs one redo
 //! - `n_redos` => number of times the benchmark shell execute the `redo_work`
 //! - `nclients` => number of clients (more on this shortly).
 //!
@@ -10,7 +10,7 @@
 //! Each task executes the `redo_work` `n_redos/nclients` times.
 //!
 //! We exercise the following combinations:
-//! - `redo_work = short / medium``
+//! - `redo_work = ping / short / medium`
 //! - `nclients = [1, 2, 4, 8, 16, 32, 64, 128]`
 //!
 //! We let `criterion` determine the `n_redos` using `iter_custom`.
@@ -27,33 +27,43 @@
 //!
 //! # Reference Numbers
 //!
-//! 2024-04-15 on i3en.3xlarge
+//! 2024-09-18 on im4gn.2xlarge
 //!
 //! ```text
-//! short/1           time:   [24.584 µs 24.737 µs 24.922 µs]
-//! short/2           time:   [33.479 µs 33.660 µs 33.888 µs]
-//! short/4           time:   [42.713 µs 43.046 µs 43.440 µs]
-//! short/8           time:   [71.814 µs 72.478 µs 73.240 µs]
-//! short/16          time:   [132.73 µs 134.45 µs 136.22 µs]
-//! short/32          time:   [258.31 µs 260.73 µs 263.27 µs]
-//! short/64          time:   [511.61 µs 514.44 µs 517.51 µs]
-//! short/128         time:   [992.64 µs 998.23 µs 1.0042 ms]
-//! medium/1          time:   [110.11 µs 110.50 µs 110.96 µs]
-//! medium/2          time:   [153.06 µs 153.85 µs 154.99 µs]
-//! medium/4          time:   [317.51 µs 319.92 µs 322.85 µs]
-//! medium/8          time:   [638.30 µs 644.68 µs 652.12 µs]
-//! medium/16         time:   [1.2651 ms 1.2773 ms 1.2914 ms]
-//! medium/32         time:   [2.5117 ms 2.5410 ms 2.5720 ms]
-//! medium/64         time:   [4.8088 ms 4.8555 ms 4.9047 ms]
-//! medium/128        time:   [8.8311 ms 8.9849 ms 9.1263 ms]
+//! ping/1                  time:   [21.789 µs 21.918 µs 22.078 µs]
+//! ping/2                  time:   [27.686 µs 27.812 µs 27.970 µs]
+//! ping/4                  time:   [35.468 µs 35.671 µs 35.926 µs]
+//! ping/8                  time:   [59.682 µs 59.987 µs 60.363 µs]
+//! ping/16                 time:   [101.79 µs 102.37 µs 103.08 µs]
+//! ping/32                 time:   [184.18 µs 185.15 µs 186.36 µs]
+//! ping/64                 time:   [349.86 µs 351.45 µs 353.47 µs]
+//! ping/128                time:   [684.53 µs 687.98 µs 692.17 µs]
+//! short/1                 time:   [31.833 µs 32.126 µs 32.428 µs]
+//! short/2                 time:   [35.558 µs 35.756 µs 35.992 µs]
+//! short/4                 time:   [44.850 µs 45.138 µs 45.484 µs]
+//! short/8                 time:   [65.985 µs 66.379 µs 66.853 µs]
+//! short/16                time:   [127.06 µs 127.90 µs 128.87 µs]
+//! short/32                time:   [252.98 µs 254.70 µs 256.73 µs]
+//! short/64                time:   [497.13 µs 499.86 µs 503.26 µs]
+//! short/128               time:   [987.46 µs 993.45 µs 1.0004 ms]
+//! medium/1                time:   [137.91 µs 138.55 µs 139.35 µs]
+//! medium/2                time:   [192.00 µs 192.91 µs 194.07 µs]
+//! medium/4                time:   [389.62 µs 391.55 µs 394.01 µs]
+//! medium/8                time:   [776.80 µs 780.33 µs 784.77 µs]
+//! medium/16               time:   [1.5323 ms 1.5383 ms 1.5459 ms]
+//! medium/32               time:   [3.0120 ms 3.0226 ms 3.0350 ms]
+//! medium/64               time:   [5.7405 ms 5.7787 ms 5.8166 ms]
+//! medium/128              time:   [10.412 ms 10.574 ms 10.718 ms]
 //! ```
 
 use anyhow::Context;
 use bytes::{Buf, Bytes};
 use criterion::{BenchmarkId, Criterion};
+use once_cell::sync::Lazy;
 use pageserver::{config::PageServerConf, walrecord::NeonWalRecord, walredo::PostgresRedoManager};
 use pageserver_api::{key::Key, shard::TenantShardId};
 use std::{
+    future::Future,
     sync::Arc,
     time::{Duration, Instant},
 };
@@ -61,40 +71,59 @@ use tokio::{sync::Barrier, task::JoinSet};
 use utils::{id::TenantId, lsn::Lsn};
 
 fn bench(c: &mut Criterion) {
-    {
-        let nclients = [1, 2, 4, 8, 16, 32, 64, 128];
-        for nclients in nclients {
-            let mut group = c.benchmark_group("short");
-            group.bench_with_input(
-                BenchmarkId::from_parameter(nclients),
-                &nclients,
-                |b, nclients| {
-                    let redo_work = Arc::new(Request::short_input());
-                    b.iter_custom(|iters| bench_impl(Arc::clone(&redo_work), iters, *nclients));
-                },
-            );
-        }
-    }
-    {
-        let nclients = [1, 2, 4, 8, 16, 32, 64, 128];
-        for nclients in nclients {
-            let mut group = c.benchmark_group("medium");
-            group.bench_with_input(
-                BenchmarkId::from_parameter(nclients),
-                &nclients,
-                |b, nclients| {
-                    let redo_work = Arc::new(Request::medium_input());
-                    b.iter_custom(|iters| bench_impl(Arc::clone(&redo_work), iters, *nclients));
-                },
-            );
-        }
+    macro_rules! bench_group {
+        ($name:expr, $redo_work:expr) => {{
+            let name: &str = $name;
+            let nclients = [1, 2, 4, 8, 16, 32, 64, 128];
+            for nclients in nclients {
+                let mut group = c.benchmark_group(name);
+                group.bench_with_input(
+                    BenchmarkId::from_parameter(nclients),
+                    &nclients,
+                    |b, nclients| {
+                        b.iter_custom(|iters| bench_impl($redo_work, iters, *nclients));
+                    },
+                );
+            }
+        }};
     }
+    //
+    // benchmark the protocol implementation
+    //
+    let pg_version = 14;
+    bench_group!(
+        "ping",
+        Arc::new(move |mgr: Arc<PostgresRedoManager>| async move {
+            let _: () = mgr.ping(pg_version).await.unwrap();
+        })
+    );
+    //
+    // benchmarks with actual record redo
+    //
+    let make_redo_work = |req: &'static Request| {
+        Arc::new(move |mgr: Arc<PostgresRedoManager>| async move {
+            let page = req.execute(&mgr).await.unwrap();
+            assert_eq!(page.remaining(), 8192);
+        })
+    };
+    bench_group!("short", {
+        static REQUEST: Lazy<Request> = Lazy::new(Request::short_input);
+        make_redo_work(&REQUEST)
+    });
+    bench_group!("medium", {
+        static REQUEST: Lazy<Request> = Lazy::new(Request::medium_input);
+        make_redo_work(&REQUEST)
+    });
 }
 criterion::criterion_group!(benches, bench);
 criterion::criterion_main!(benches);
 
 // Returns the sum of each client's wall-clock time spent executing their share of the n_redos.
-fn bench_impl(redo_work: Arc<Request>, n_redos: u64, nclients: u64) -> Duration {
+fn bench_impl<F, Fut>(redo_work: Arc<F>, n_redos: u64, nclients: u64) -> Duration
+where
+    F: Fn(Arc<PostgresRedoManager>) -> Fut + Send + Sync + 'static,
+    Fut: Future<Output = ()> + Send + 'static,
+{
     let repo_dir = camino_tempfile::tempdir_in(env!("CARGO_TARGET_TMPDIR")).unwrap();
 
     let conf = PageServerConf::dummy_conf(repo_dir.path().to_path_buf());
@@ -135,17 +164,20 @@ fn bench_impl(redo_work: Arc<Request>, n_redos: u64, nclients: u64) -> Duration
     })
 }
 
-async fn client(
+async fn client<F, Fut>(
     mgr: Arc<PostgresRedoManager>,
     start: Arc<Barrier>,
-    redo_work: Arc<Request>,
+    redo_work: Arc<F>,
     n_redos: u64,
-) -> Duration {
+) -> Duration
+where
+    F: Fn(Arc<PostgresRedoManager>) -> Fut + Send + Sync + 'static,
+    Fut: Future<Output = ()> + Send + 'static,
+{
     start.wait().await;
     let start = Instant::now();
     for _ in 0..n_redos {
-        let page = redo_work.execute(&mgr).await.unwrap();
-        assert_eq!(page.remaining(), 8192);
+        redo_work(Arc::clone(&mgr)).await;
         // The real pageserver will rarely if ever do 2 walredos in a row without
         // yielding to the executor.
         tokio::task::yield_now().await;
diff --git a/pageserver/src/walredo.rs b/pageserver/src/walredo.rs
index 0fe7def8b0..a1c9fc5651 100644
--- a/pageserver/src/walredo.rs
+++ b/pageserver/src/walredo.rs
@@ -205,6 +205,22 @@ impl PostgresRedoManager {
         }
     }
 
+    /// Do a ping request-response roundtrip.
+    ///
+    /// Not used in production, but by Rust benchmarks.
+    ///
+    /// # Cancel-Safety
+    ///
+    /// This method is cancellation-safe.
+    pub async fn ping(&self, pg_version: u32) -> Result<(), Error> {
+        self.do_with_walredo_process(pg_version, |proc| async move {
+            proc.ping(Duration::from_secs(1))
+                .await
+                .map_err(Error::Other)
+        })
+        .await
+    }
+
     pub fn status(&self) -> WalRedoManagerStatus {
         WalRedoManagerStatus {
             last_redo_at: {
@@ -297,6 +313,9 @@ impl PostgresRedoManager {
         }
     }
 
+    /// # Cancel-Safety
+    ///
+    /// This method is cancel-safe iff `closure` is cancel-safe.
     async fn do_with_walredo_process<
         F: FnOnce(Arc<Process>) -> Fut,
         Fut: Future<Output = Result<O, Error>>,
@@ -537,6 +556,17 @@ mod tests {
     use tracing::Instrument;
     use utils::{id::TenantId, lsn::Lsn};
 
+    #[tokio::test]
+    async fn test_ping() {
+        let h = RedoHarness::new().unwrap();
+
+        h.manager
+            .ping(14)
+            .instrument(h.span())
+            .await
+            .expect("ping should work");
+    }
+
     #[tokio::test]
     async fn short_v14_redo() {
         let expected = std::fs::read("test_data/short_v14_redo.page").unwrap();
diff --git a/pageserver/src/walredo/process.rs b/pageserver/src/walredo/process.rs
index 9140d4f6aa..f3197e68b5 100644
--- a/pageserver/src/walredo/process.rs
+++ b/pageserver/src/walredo/process.rs
@@ -6,6 +6,7 @@ use self::no_leak_child::NoLeakChild;
 use crate::{
     config::PageServerConf,
     metrics::{WalRedoKillCause, WAL_REDO_PROCESS_COUNTERS, WAL_REDO_RECORD_COUNTER},
+    page_cache::PAGE_SZ,
     span::debug_assert_current_span_has_tenant_id,
     walrecord::NeonWalRecord,
 };
@@ -237,6 +238,26 @@ impl WalRedoProcess {
         res
     }
 
+    /// Do a ping request-response roundtrip.
+    ///
+    /// Not used in production, but by Rust benchmarks.
+    pub(crate) async fn ping(&self, timeout: Duration) -> anyhow::Result<()> {
+        let mut writebuf: Vec<u8> = Vec::with_capacity(4);
+        protocol::build_ping_msg(&mut writebuf);
+        let Ok(res) = tokio::time::timeout(timeout, self.apply_wal_records0(&writebuf)).await
+        else {
+            anyhow::bail!("WAL redo ping timed out");
+        };
+        let response = res?;
+        if response.len() != PAGE_SZ {
+            anyhow::bail!(
+                "WAL redo ping response should respond with page-sized response: {}",
+                response.len()
+            );
+        }
+        Ok(())
+    }
+
     /// # Cancel-Safety
     ///
     /// When not polled to completion (e.g. because in `tokio::select!` another
diff --git a/pageserver/src/walredo/process/protocol.rs b/pageserver/src/walredo/process/protocol.rs
index b703344cc8..de3ca8741b 100644
--- a/pageserver/src/walredo/process/protocol.rs
+++ b/pageserver/src/walredo/process/protocol.rs
@@ -55,3 +55,8 @@ pub(crate) fn build_get_page_msg(tag: BufferTag, buf: &mut Vec<u8>) {
     tag.ser_into(buf)
         .expect("serialize BufferTag should always succeed");
 }
+
+pub(crate) fn build_ping_msg(buf: &mut Vec<u8>) {
+    buf.put_u8(b'H');
+    buf.put_u32(4);
+}
diff --git a/pgxn/neon_walredo/walredoproc.c b/pgxn/neon_walredo/walredoproc.c
index 219ca85207..f98aa1cbe7 100644
--- a/pgxn/neon_walredo/walredoproc.c
+++ b/pgxn/neon_walredo/walredoproc.c
@@ -24,6 +24,7 @@
  * PushPage ('P'): Copy a page image (in the payload) to buffer cache
  * ApplyRecord ('A'): Apply a WAL record (in the payload)
  * GetPage ('G'): Return a page image from buffer cache.
+ * Ping ('H'): Respond with a zero-filled page (BLCKSZ bytes); the input is not echoed.
  *
  * Currently, you only get a response to GetPage requests; the response is
  * simply a 8k page, without any headers. Errors are logged to stderr.
@@ -133,6 +134,7 @@ static void ApplyRecord(StringInfo input_message);
 static void apply_error_callback(void *arg);
 static bool redo_block_filter(XLogReaderState *record, uint8 block_id);
 static void GetPage(StringInfo input_message);
+static void Ping(StringInfo input_message);
 static ssize_t buffered_read(void *buf, size_t count);
 static void CreateFakeSharedMemoryAndSemaphores();
 
@@ -394,6 +396,10 @@ WalRedoMain(int argc, char *argv[])
 				GetPage(&input_message);
 				break;
 
+			case 'H': 			/* Ping */
+				Ping(&input_message);
+				break;
+
 				/*
 				 * EOF means we're done. Perform normal shutdown.
 				 */
@@ -1057,6 +1063,36 @@ GetPage(StringInfo input_message)
 }
 
 
+static void
+Ping(StringInfo input_message)
+{
+	int			tot_written;
+	/* Response: a zero-filled page of BLCKSZ bytes (the input message is not echoed) */
+	tot_written = 0;
+	do {
+		ssize_t		rc;
+		/* We don't need alignment, but it's bad practice to use char[BLCKSZ] */
+#if PG_VERSION_NUM >= 160000
+		static const PGIOAlignedBlock response;
+#else
+		static const PGAlignedBlock response;
+#endif
+		rc = write(STDOUT_FILENO, &response.data[tot_written], BLCKSZ - tot_written);
+		if (rc < 0) {
+			/* If interrupted by signal, just retry */
+			if (errno == EINTR)
+				continue;
+			ereport(ERROR,
+					(errcode_for_file_access(),
+					 errmsg("could not write to stdout: %m")));
+		}
+		tot_written += rc;
+	} while (tot_written < BLCKSZ);
+
+	elog(TRACE, "Page sent back for ping");
+}
+
+
 /* Buffer used by buffered_read() */
 static char stdin_buf[16 * 1024];
 static size_t stdin_len = 0;	/* # of bytes in buffer */

From 4d5add9ca03462f14b6e63df55e6da6ed32c3d4d Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Mon, 23 Sep 2024 15:05:22 +0200
Subject: [PATCH 116/142] compact_level0_phase1: remove final traces of value
 access mode config (#8935)

refs https://github.com/neondatabase/neon/issues/8184
stacked atop https://github.com/neondatabase/neon/pull/8934

This PR changes from ignoring the config field to rejecting configs that
contain it.

PR https://github.com/neondatabase/infra/pull/1903 removes the field
usage from `pageserver.toml`.

It rolls into prod sooner or in the same release as this PR.
---
 libs/pageserver_api/src/config.rs |  4 ----
 pageserver/src/config.rs          | 11 -----------
 2 files changed, 15 deletions(-)

diff --git a/libs/pageserver_api/src/config.rs b/libs/pageserver_api/src/config.rs
index 61e32bc9ab..95310fdbac 100644
--- a/libs/pageserver_api/src/config.rs
+++ b/libs/pageserver_api/src/config.rs
@@ -104,9 +104,6 @@ pub struct ConfigToml {
     pub image_compression: ImageCompressionAlgorithm,
     pub ephemeral_bytes_per_memory_kb: usize,
     pub l0_flush: Option<crate::models::L0FlushConfig>,
-    #[serde(skip_serializing)]
-    // TODO(https://github.com/neondatabase/neon/issues/8184): remove after this field is removed from all pageserver.toml's
-    pub compact_level0_phase1_value_access: serde::de::IgnoredAny,
     pub virtual_file_direct_io: crate::models::virtual_file::DirectIoMode,
     pub io_buffer_alignment: usize,
 }
@@ -384,7 +381,6 @@ impl Default for ConfigToml {
             image_compression: (DEFAULT_IMAGE_COMPRESSION),
             ephemeral_bytes_per_memory_kb: (DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB),
             l0_flush: None,
-            compact_level0_phase1_value_access: Default::default(),
             virtual_file_direct_io: crate::models::virtual_file::DirectIoMode::default(),
 
             io_buffer_alignment: DEFAULT_IO_BUFFER_ALIGNMENT,
diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs
index 8567c6aa52..e15f1c791b 100644
--- a/pageserver/src/config.rs
+++ b/pageserver/src/config.rs
@@ -324,7 +324,6 @@ impl PageServerConf {
             max_vectored_read_bytes,
             image_compression,
             ephemeral_bytes_per_memory_kb,
-            compact_level0_phase1_value_access: _,
             l0_flush,
             virtual_file_direct_io,
             concurrent_tenant_warmup,
@@ -535,16 +534,6 @@ mod tests {
             .expect("parse_and_validate");
     }
 
-    #[test]
-    fn test_compactl0_phase1_access_mode_is_ignored_silently() {
-        let input = indoc::indoc! {r#"
-            [compact_level0_phase1_value_access]
-            mode = "streaming-kmerge"
-            validate = "key-lsn-value"
-        "#};
-        toml_edit::de::from_str::<pageserver_api::config::ConfigToml>(input).unwrap();
-    }
-
     /// If there's a typo in the pageserver config, we'd rather catch that typo
     /// and fail pageserver startup than silently ignoring the typo, leaving whoever
     /// made it in the believe that their config change is effective.

From f446e08fb8ac68d5957b239d7f11c8f99536c960 Mon Sep 17 00:00:00 2001
From: Nikita Kalyanov <44959448+nikitakalyanov@users.noreply.github.com>
Date: Mon, 23 Sep 2024 16:53:06 +0300
Subject: [PATCH 117/142] change HTTP method to comply with spec (#9100)

There is a discrepancy with the spec, which specifies PUT.
---
 pageserver/client/src/mgmt_api.rs           | 2 +-
 pageserver/src/http/routes.rs               | 2 +-
 storage_controller/src/http.rs              | 2 +-
 storage_controller/src/pageserver_client.rs | 2 +-
 test_runner/fixtures/pageserver/http.py     | 2 +-
 5 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/pageserver/client/src/mgmt_api.rs b/pageserver/client/src/mgmt_api.rs
index a68f45a6d9..2d95ac42e6 100644
--- a/pageserver/client/src/mgmt_api.rs
+++ b/pageserver/client/src/mgmt_api.rs
@@ -432,7 +432,7 @@ impl Client {
             self.mgmt_api_endpoint
         );
 
-        self.request(Method::POST, &uri, req)
+        self.request(Method::PUT, &uri, req)
             .await?
             .json()
             .await
diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs
index d645f3b7b6..6a10d4fb1c 100644
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -2955,7 +2955,7 @@ pub fn make_router(
             "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/preserve_initdb_archive",
             |r| api_handler(r, timeline_preserve_initdb_handler),
         )
-        .post(
+        .put(
             "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/archival_config",
             |r| api_handler(r, timeline_archival_config_handler),
         )
diff --git a/storage_controller/src/http.rs b/storage_controller/src/http.rs
index 1745bf5575..95e4a469ac 100644
--- a/storage_controller/src/http.rs
+++ b/storage_controller/src/http.rs
@@ -1849,7 +1849,7 @@ pub fn make_router(
                 RequestName("v1_tenant_timeline"),
             )
         })
-        .post(
+        .put(
             "/v1/tenant/:tenant_id/timeline/:timeline_id/archival_config",
             |r| {
                 tenant_service_handler(
diff --git a/storage_controller/src/pageserver_client.rs b/storage_controller/src/pageserver_client.rs
index 961a1f78dd..b19cbc4fa3 100644
--- a/storage_controller/src/pageserver_client.rs
+++ b/storage_controller/src/pageserver_client.rs
@@ -238,7 +238,7 @@ impl PageserverClient {
     ) -> Result<()> {
         measured_request!(
             "timeline_archival_config",
-            crate::metrics::Method::Post,
+            crate::metrics::Method::Put,
             &self.node_id_label,
             self.inner
                 .timeline_archival_config(tenant_shard_id, timeline_id, req)
diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py
index 582f9c0264..0dd557c59f 100644
--- a/test_runner/fixtures/pageserver/http.py
+++ b/test_runner/fixtures/pageserver/http.py
@@ -631,7 +631,7 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
         log.info(
             f"requesting timeline archival config {config} for tenant {tenant_id} and timeline {timeline_id}"
         )
-        res = self.post(
+        res = self.put(
             f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/archival_config",
             json=config,
         )

From 29699529dfdd4642d71e018047071a01dacb0cf0 Mon Sep 17 00:00:00 2001
From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com>
Date: Mon, 23 Sep 2024 12:30:44 -0400
Subject: [PATCH 118/142] feat(pageserver): filter keys with gc-compaction
 (#9004)

Part of https://github.com/neondatabase/neon/issues/8002

Close https://github.com/neondatabase/neon/issues/8920

Legacy compaction (as well as gc-compaction) relies on the GC process to
remove unused layer files, but this relies on many factors (i.e., key
partition) to ensure data in a dropped table can be eventually removed.

In gc-compaction, we consider the keyspace information when doing the
compaction process. If a key is not in the keyspace, we will skip that
key and not include it in the final output.

However, this is not easy to implement because gc-compaction considers
branch points (i.e., retain_lsns) and the retained keyspaces could
change across different LSNs. Therefore, for now, we only remove aux v1
keys in the compaction process.

## Summary of changes

* Add `FilterIterator` to filter out keys.
* Integrate `FilterIterator` with gc-compaction.
* Add `collect_gc_compaction_keyspace` for a spec of keyspaces that can
be retained during the gc-compaction process.

---------

Signed-off-by: Alex Chi Z <chi@neon.tech>
---
 pageserver/src/pgdatadir_mapping.rs           |  30 +++
 pageserver/src/tenant/storage_layer.rs        |   2 +-
 .../tenant/storage_layer/filter_iterator.rs   | 205 ++++++++++++++++++
 pageserver/src/tenant/timeline/compaction.rs  |   9 +-
 4 files changed, 244 insertions(+), 2 deletions(-)
 create mode 100644 pageserver/src/tenant/storage_layer/filter_iterator.rs

diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs
index 5f8766ca2c..7aa313f031 100644
--- a/pageserver/src/pgdatadir_mapping.rs
+++ b/pageserver/src/pgdatadir_mapping.rs
@@ -840,6 +840,36 @@ impl Timeline {
         Ok(total_size * BLCKSZ as u64)
     }
 
+    /// Get a KeySpace that covers all the Keys that are in use at AND below the given LSN. This is only used
+    /// for gc-compaction.
+    ///
+    /// gc-compaction cannot use the same `collect_keyspace` function as the legacy compaction because it
+    /// processes data at multiple LSNs and needs to be aware of the fact that some key ranges might need to
+    /// be kept only for a specific range of LSN.
+    ///
+    /// Consider the case that the user created branches at LSN 10 and 20, where the user created a table A at
+    /// LSN 10 and dropped that table at LSN 20. `collect_keyspace` at LSN 10 will return the key range
+    /// corresponding to that table, while LSN 20 won't. The keyspace info at a single LSN is not enough to
+    /// determine which keys to retain/drop for gc-compaction.
+    ///
+    /// For now, it only drops AUX-v1 keys. But in the future, the function will be extended to return the keyspace
+    /// to be retained for each branch LSN.
+    ///
+    /// Returns (dense keyspace, sparse keyspace): every key below the metadata range except `AUX_FILES_KEY`, and the metadata range.
+    pub(crate) async fn collect_gc_compaction_keyspace(
+        &self,
+    ) -> Result<(KeySpace, SparseKeySpace), CollectKeySpaceError> {
+        let metadata_key_begin = Key::metadata_key_range().start;
+        let aux_v1_key = AUX_FILES_KEY;
+        let dense_keyspace = KeySpace {
+            ranges: vec![Key::MIN..aux_v1_key, aux_v1_key.next()..metadata_key_begin],
+        };
+        Ok((
+            dense_keyspace,
+            SparseKeySpace(KeySpace::single(Key::metadata_key_range())),
+        ))
+    }
+
     ///
     /// Get a KeySpace that covers all the Keys that are in use at the given LSN.
     /// Anything that's not listed maybe removed from the underlying storage (from
diff --git a/pageserver/src/tenant/storage_layer.rs b/pageserver/src/tenant/storage_layer.rs
index cd252aa371..99bd0ece57 100644
--- a/pageserver/src/tenant/storage_layer.rs
+++ b/pageserver/src/tenant/storage_layer.rs
@@ -1,13 +1,13 @@
 //! Common traits and structs for layers
 
 pub mod delta_layer;
+pub mod filter_iterator;
 pub mod image_layer;
 pub mod inmemory_layer;
 pub(crate) mod layer;
 mod layer_desc;
 mod layer_name;
 pub mod merge_iterator;
-
 pub mod split_writer;
 
 use crate::context::{AccessStatsBehavior, RequestContext};
diff --git a/pageserver/src/tenant/storage_layer/filter_iterator.rs b/pageserver/src/tenant/storage_layer/filter_iterator.rs
new file mode 100644
index 0000000000..f45dd4b801
--- /dev/null
+++ b/pageserver/src/tenant/storage_layer/filter_iterator.rs
@@ -0,0 +1,205 @@
+use std::ops::Range;
+
+use anyhow::bail;
+use pageserver_api::{
+    key::Key,
+    keyspace::{KeySpace, SparseKeySpace},
+};
+use utils::lsn::Lsn;
+
+use crate::repository::Value;
+
+use super::merge_iterator::MergeIterator;
+
+/// A filter iterator over merge iterators (and can be easily extended to other types of iterators).
+///
+/// The iterator will skip any keys not included in the keyspace filter. In other words, the keyspace filter contains the keys
+/// to be retained. Build it with `create`, which sorts the ranges by start key and rejects overlapping ranges.
+pub struct FilterIterator<'a> {
+    inner: MergeIterator<'a>,            // source of (key, lsn, value) items to be filtered
+    retain_key_filters: Vec<Range<Key>>, // key ranges to retain, sorted by start key
+    current_filter_idx: usize,           // index into `retain_key_filters` of the range currently being matched
+}
+
+impl<'a> FilterIterator<'a> {
+    /// Wraps `inner` so that only keys inside the dense or sparse keyspace ranges are yielded.
+    /// Returns an error if any two of the combined ranges overlap.
+    pub fn create(
+        inner: MergeIterator<'a>,
+        dense_keyspace: KeySpace,
+        sparse_keyspace: SparseKeySpace,
+    ) -> anyhow::Result<Self> {
+        let mut retain_key_filters: Vec<Range<Key>> = dense_keyspace
+            .ranges
+            .into_iter()
+            .chain(sparse_keyspace.0.ranges)
+            .collect();
+        retain_key_filters.sort_by_key(|range| range.start);
+        // The ranges are now ordered by start key; reject any adjacent pair that overlaps.
+        for pair in retain_key_filters.windows(2) {
+            let (prev, next) = (&pair[0], &pair[1]);
+            if prev.end > next.start {
+                bail!("Key filters are overlapping: {:?} and {:?}", prev, next);
+            }
+        }
+        Ok(Self {
+            inner,
+            retain_key_filters,
+            current_filter_idx: 0,
+        })
+    }
+
+    /// Pulls items from the inner iterator until one falls inside a retained key range.
+    /// Returns `Ok(None)` when the inner iterator ends or every filter range has been passed.
+    pub async fn next(&mut self) -> anyhow::Result<Option<(Key, Lsn, Value)>> {
+        while let Some(item) = self.inner.next().await? {
+            let key = item.0;
+            loop {
+                match self.retain_key_filters.get(self.current_filter_idx) {
+                    // We already exhausted all filters, so we should return now
+                    // [filter region] [filter region] [filter region]
+                    //                                                    ^ item
+                    //                                                 ^ current filter (nothing)
+                    None => return Ok(None),
+                    // The item lies at or past the end of the current filter: move on to the next filter.
+                    // [filter region]    [filter region]     [filter region]
+                    //                                     ^ item
+                    //                    ^ current filter (before advancing)
+                    Some(filter) if key >= filter.end => self.current_filter_idx += 1,
+                    Some(filter) => {
+                        if filter.contains(&key) {
+                            // [filter region]    [filter region]     [filter region]
+                            //                                              ^ item
+                            //                                        ^ current filter
+                            return Ok(Some(item));
+                        }
+                        // The item sits in the gap before the current filter: drop it and fetch the next item.
+                        break;
+                    }
+                }
+            }
+        }
+        Ok(None)
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    use itertools::Itertools;
+    use pageserver_api::key::Key;
+    use utils::lsn::Lsn;
+
+    use crate::{
+        tenant::{
+            harness::{TenantHarness, TIMELINE_ID},
+            storage_layer::delta_layer::test::produce_delta_layer,
+        },
+        DEFAULT_PG_VERSION,
+    };
+
+    /// Asserts that `filter_iter` yields exactly the items of `expect`, in the same order.
+    async fn assert_filter_iter_equal(
+        filter_iter: &mut FilterIterator<'_>,
+        expect: &[(Key, Lsn, Value)],
+    ) {
+        let mut remaining = expect.iter();
+        loop {
+            let got = filter_iter.next().await.unwrap();
+            let want = remaining.next();
+            assert_eq!(got.is_some(), want.is_some());
+            if let (Some((key, lsn, val)), Some((want_key, want_lsn, want_val))) = (got, want) {
+                assert_eq!(&key, want_key);
+                assert_eq!(lsn, *want_lsn);
+                assert_eq!(&val, want_val);
+            } else {
+                break;
+            }
+        }
+    }
+
+    #[tokio::test]
+    async fn filter_keyspace_iterator() {
+        use crate::repository::Value;
+        use bytes::Bytes;
+
+        let harness = TenantHarness::create("filter_iterator_filter_keyspace_iterator")
+            .await
+            .unwrap();
+        let (tenant, ctx) = harness.load().await;
+
+        let tline = tenant
+            .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
+            .await
+            .unwrap();
+        // Every test key is built from the same hex prefix and differs only in `field6`.
+        fn get_key(id: u32) -> Key {
+            let mut key = Key::from_hex("000000000033333333444444445500000000").unwrap();
+            key.field6 = id;
+            key
+        }
+        const N: usize = 100;
+        let test_deltas1 = (0..N)
+            .map(|idx| {
+                (
+                    get_key(idx as u32),
+                    Lsn(0x20 * ((idx as u64) % 10 + 1)),
+                    Value::Image(Bytes::from(format!("img{idx:05}"))),
+                )
+            })
+            .collect_vec();
+        let resident_layer_1 = produce_delta_layer(&tenant, &tline, test_deltas1.clone(), &ctx)
+            .await
+            .unwrap();
+        // First pass: some ranges reach beyond the N keys that were written; only written keys are expected back.
+        let merge_iter = MergeIterator::create(
+            &[resident_layer_1.get_as_delta(&ctx).await.unwrap()],
+            &[],
+            &ctx,
+        );
+
+        let mut filter_iter = FilterIterator::create(
+            merge_iter,
+            KeySpace {
+                ranges: vec![
+                    get_key(5)..get_key(10),
+                    get_key(20)..get_key(30),
+                    get_key(90)..get_key(110),
+                    get_key(1000)..get_key(2000),
+                ],
+            },
+            SparseKeySpace(KeySpace::default()),
+        )
+        .unwrap();
+        let mut result = Vec::new();
+        result.extend(test_deltas1[5..10].iter().cloned());
+        result.extend(test_deltas1[20..30].iter().cloned());
+        result.extend(test_deltas1[90..100].iter().cloned());
+        assert_filter_iter_equal(&mut filter_iter, &result).await;
+        // Second pass: a fresh merge iterator, with ranges that start at the first key and stop before the last one.
+        let merge_iter = MergeIterator::create(
+            &[resident_layer_1.get_as_delta(&ctx).await.unwrap()],
+            &[],
+            &ctx,
+        );
+
+        let mut filter_iter = FilterIterator::create(
+            merge_iter,
+            KeySpace {
+                ranges: vec![
+                    get_key(0)..get_key(10),
+                    get_key(20)..get_key(30),
+                    get_key(90)..get_key(95),
+                ],
+            },
+            SparseKeySpace(KeySpace::default()),
+        )
+        .unwrap();
+        let mut result = Vec::new();
+        result.extend(test_deltas1[0..10].iter().cloned());
+        result.extend(test_deltas1[20..30].iter().cloned());
+        result.extend(test_deltas1[90..95].iter().cloned());
+        assert_filter_iter_equal(&mut filter_iter, &result).await;
+    }
+}
diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs
index d1567b6b39..6edc28a11b 100644
--- a/pageserver/src/tenant/timeline/compaction.rs
+++ b/pageserver/src/tenant/timeline/compaction.rs
@@ -31,6 +31,7 @@ use crate::context::{AccessStatsBehavior, RequestContext, RequestContextBuilder}
 use crate::page_cache;
 use crate::tenant::checks::check_valid_layermap;
 use crate::tenant::remote_timeline_client::WaitCompletionError;
+use crate::tenant::storage_layer::filter_iterator::FilterIterator;
 use crate::tenant::storage_layer::merge_iterator::MergeIterator;
 use crate::tenant::storage_layer::split_writer::{
     SplitDeltaLayerWriter, SplitImageLayerWriter, SplitWriterResult,
@@ -1772,6 +1773,7 @@ impl Timeline {
             gc_cutoff,
             lowest_retain_lsn
         );
+
         // Step 1: (In the future) construct a k-merge iterator over all layers. For now, simply collect all keys + LSNs.
         // Also, verify if the layer map can be split by drawing a horizontal line at every LSN start/end split point.
         let mut lsn_split_point = BTreeSet::new(); // TODO: use a better data structure (range tree / range set?)
@@ -1820,7 +1822,12 @@ impl Timeline {
                 image_layers.push(layer);
             }
         }
-        let mut merge_iter = MergeIterator::create(&delta_layers, &image_layers, ctx);
+        let (dense_ks, sparse_ks) = self.collect_gc_compaction_keyspace().await?;
+        let mut merge_iter = FilterIterator::create(
+            MergeIterator::create(&delta_layers, &image_layers, ctx),
+            dense_ks,
+            sparse_ks,
+        )?;
         // Step 2: Produce images+deltas. TODO: ensure newly-produced delta does not overlap with other deltas.
         // Data of the same key.
         let mut accumulated_values = Vec::new();

From df3996265f423a727482c46eefed9e8fd266af7d Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki@neon.tech>
Date: Mon, 23 Sep 2024 21:10:22 +0300
Subject: [PATCH 119/142] test: Downgrade info message on removing empty
 directories (#9093)

It was pretty noisy. It changed from debug to info level in commit
78938d1b59, but I believe that was not on purpose.
---
 test_runner/fixtures/neon_fixtures.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index fc83cf3f7c..55c1423ed0 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -849,7 +849,7 @@ class NeonEnvBuilder:
 
         for directory_to_clean in reversed(directories_to_clean):
             if not os.listdir(directory_to_clean):
-                log.info(f"Removing empty directory {directory_to_clean}")
+                log.debug(f"Removing empty directory {directory_to_clean}")
                 try:
                     directory_to_clean.rmdir()
                 except Exception as e:

From 263dfba6eeef448864dba151e2d8d34a418b9629 Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki@neon.tech>
Date: Mon, 23 Sep 2024 21:28:50 +0300
Subject: [PATCH 120/142] Add views for metrics about pageserver requests
 (#9008)

The metrics include a histogram of how long we need to wait for a
GetPage request, number of reconnects, and number of requests among
other things.

The metrics are not yet exported anywhere, but you can query them
manually.

Note: This does *not* bump the default version of the 'neon' extension. We
will do that later, as a separate PR. The reason is that this allows us to roll back
the compute image smoothly, if necessary. Once the image that includes the
new extension .so file with the new functions has been rolled out, and we're
confident that we don't need to roll back the image anymore, we can change
default extension version and actually start using the new functions and views.

This is what the view looks like:

```
postgres=# select * from neon_perf_counters ;
                metric                 | bucket_le |  value
---------------------------------------+-----------+----------
 getpage_wait_seconds_count            |           |      300
 getpage_wait_seconds_sum              |           | 0.048506
 getpage_wait_seconds_bucket           |     2e-05 |        0
 getpage_wait_seconds_bucket           |     3e-05 |        0
 getpage_wait_seconds_bucket           |     6e-05 |       71
 getpage_wait_seconds_bucket           |    0.0001 |      124
 getpage_wait_seconds_bucket           |    0.0002 |      248
 getpage_wait_seconds_bucket           |    0.0003 |      279
 getpage_wait_seconds_bucket           |    0.0006 |      297
 getpage_wait_seconds_bucket           |     0.001 |      298
 getpage_wait_seconds_bucket           |     0.002 |      298
 getpage_wait_seconds_bucket           |     0.003 |      298
 getpage_wait_seconds_bucket           |     0.006 |      300
 getpage_wait_seconds_bucket           |      0.01 |      300
 getpage_wait_seconds_bucket           |      0.02 |      300
 getpage_wait_seconds_bucket           |      0.03 |      300
 getpage_wait_seconds_bucket           |      0.06 |      300
 getpage_wait_seconds_bucket           |       0.1 |      300
 getpage_wait_seconds_bucket           |       0.2 |      300
 getpage_wait_seconds_bucket           |       0.3 |      300
 getpage_wait_seconds_bucket           |       0.6 |      300
 getpage_wait_seconds_bucket           |         1 |      300
 getpage_wait_seconds_bucket           |         2 |      300
 getpage_wait_seconds_bucket           |         3 |      300
 getpage_wait_seconds_bucket           |         6 |      300
 getpage_wait_seconds_bucket           |        10 |      300
 getpage_wait_seconds_bucket           |        20 |      300
 getpage_wait_seconds_bucket           |        30 |      300
 getpage_wait_seconds_bucket           |        60 |      300
 getpage_wait_seconds_bucket           |       100 |      300
 getpage_wait_seconds_bucket           |  Infinity |      300
 getpage_prefetch_requests_total       |           |       69
 getpage_sync_requests_total           |           |      231
 getpage_prefetch_misses_total         |           |        0
 getpage_prefetch_discards_total       |           |        0
 pageserver_requests_sent_total        |           |      323
 pageserver_requests_disconnects_total |           |        0
 pageserver_send_flushes_total         |           |      323
 file_cache_hits_total                 |           |        0
(39 rows)
```
---
 pgxn/neon/Makefile                          |   4 +-
 pgxn/neon/libpagestore.c                    |  10 +-
 pgxn/neon/neon--1.4--1.5.sql                |  39 +++
 pgxn/neon/neon--1.5--1.4.sql                |   4 +
 pgxn/neon/neon.control                      |   2 +
 pgxn/neon/neon_perf_counters.c              | 261 ++++++++++++++++++++
 pgxn/neon/neon_perf_counters.h              | 111 +++++++++
 pgxn/neon/neon_pgversioncompat.c            |  44 ++++
 pgxn/neon/neon_pgversioncompat.h            |   6 +
 pgxn/neon/pagestore_smgr.c                  |  47 ++--
 test_runner/regress/test_compute_metrics.py |  21 ++
 test_runner/regress/test_neon_extension.py  |   4 +-
 12 files changed, 533 insertions(+), 20 deletions(-)
 create mode 100644 pgxn/neon/neon--1.4--1.5.sql
 create mode 100644 pgxn/neon/neon--1.5--1.4.sql
 create mode 100644 pgxn/neon/neon_perf_counters.c
 create mode 100644 pgxn/neon/neon_perf_counters.h
 create mode 100644 pgxn/neon/neon_pgversioncompat.c
 create mode 100644 test_runner/regress/test_compute_metrics.py

diff --git a/pgxn/neon/Makefile b/pgxn/neon/Makefile
index 3b755bb042..ddc8155ff3 100644
--- a/pgxn/neon/Makefile
+++ b/pgxn/neon/Makefile
@@ -9,6 +9,8 @@ OBJS = \
 	hll.o \
 	libpagestore.o \
 	neon.o \
+	neon_pgversioncompat.o \
+	neon_perf_counters.o \
 	neon_utils.o \
 	neon_walreader.o \
 	pagestore_smgr.o \
@@ -23,7 +25,7 @@ SHLIB_LINK_INTERNAL = $(libpq)
 SHLIB_LINK = -lcurl
 
 EXTENSION = neon
-DATA = neon--1.0.sql neon--1.0--1.1.sql neon--1.1--1.2.sql neon--1.2--1.3.sql neon--1.3--1.2.sql neon--1.2--1.1.sql neon--1.1--1.0.sql  neon--1.3--1.4.sql neon--1.4--1.3.sql
+DATA = neon--1.0.sql neon--1.0--1.1.sql neon--1.1--1.2.sql neon--1.2--1.3.sql neon--1.3--1.2.sql neon--1.2--1.1.sql neon--1.1--1.0.sql  neon--1.3--1.4.sql neon--1.4--1.3.sql neon--1.4--1.5.sql neon--1.5--1.4.sql
 PGFILEDESC = "neon - cloud storage for PostgreSQL"
 
 EXTRA_CLEAN = \
diff --git a/pgxn/neon/libpagestore.c b/pgxn/neon/libpagestore.c
index df7000acc0..07a19a7114 100644
--- a/pgxn/neon/libpagestore.c
+++ b/pgxn/neon/libpagestore.c
@@ -30,6 +30,7 @@
 #include "utils/guc.h"
 
 #include "neon.h"
+#include "neon_perf_counters.h"
 #include "neon_utils.h"
 #include "pagestore_client.h"
 #include "walproposer.h"
@@ -331,6 +332,7 @@ CLEANUP_AND_DISCONNECT(PageServer *shard)
 	}
 	if (shard->conn)
 	{
+		MyNeonCounters->pageserver_disconnects_total++;
 		PQfinish(shard->conn);
 		shard->conn = NULL;
 	}
@@ -737,6 +739,8 @@ pageserver_send(shardno_t shard_no, NeonRequest *request)
 	PageServer *shard = &page_servers[shard_no];
 	PGconn	   *pageserver_conn;
 
+	MyNeonCounters->pageserver_requests_sent_total++;
+
 	/* If the connection was lost for some reason, reconnect */
 	if (shard->state == PS_Connected && PQstatus(shard->conn) == CONNECTION_BAD)
 	{
@@ -889,6 +893,7 @@ pageserver_flush(shardno_t shard_no)
 	}
 	else
 	{
+		MyNeonCounters->pageserver_send_flushes_total++;
 		if (PQflush(pageserver_conn))
 		{
 			char	   *msg = pchomp(PQerrorMessage(pageserver_conn));
@@ -922,7 +927,7 @@ check_neon_id(char **newval, void **extra, GucSource source)
 static Size
 PagestoreShmemSize(void)
 {
-	return sizeof(PagestoreShmemState);
+	return add_size(sizeof(PagestoreShmemState), NeonPerfCountersShmemSize());
 }
 
 static bool
@@ -941,6 +946,9 @@ PagestoreShmemInit(void)
 		memset(&pagestore_shared->shard_map, 0, sizeof(ShardMap));
 		AssignPageserverConnstring(page_server_connstring, NULL);
 	}
+
+	NeonPerfCountersShmemInit();
+
 	LWLockRelease(AddinShmemInitLock);
 	return found;
 }
diff --git a/pgxn/neon/neon--1.4--1.5.sql b/pgxn/neon/neon--1.4--1.5.sql
new file mode 100644
index 0000000000..a1db7bf1b1
--- /dev/null
+++ b/pgxn/neon/neon--1.4--1.5.sql
@@ -0,0 +1,39 @@
+\echo Use "ALTER EXTENSION neon UPDATE TO '1.5'" to load this file. \quit
+
+
+CREATE FUNCTION get_backend_perf_counters()
+RETURNS SETOF RECORD
+AS 'MODULE_PATHNAME', 'neon_get_backend_perf_counters'
+LANGUAGE C PARALLEL SAFE;
+
+CREATE FUNCTION get_perf_counters()
+RETURNS SETOF RECORD
+AS 'MODULE_PATHNAME', 'neon_get_perf_counters'
+LANGUAGE C PARALLEL SAFE;
+
+-- Show various metrics, for each backend. Note that the values are not reset
+-- when a backend exits. When a new backend starts with the same backend ID, it
+-- will continue accumulating the values from where the old backend left off. If
+-- you are only interested in the changes from your own session, store the values
+-- at the beginning of the session somewhere, and subtract them on subsequent calls.
+--
+-- For histograms, 'bucket_le' is the upper bound of the histogram bucket.
+CREATE VIEW neon_backend_perf_counters AS
+  SELECT P.procno, P.pid, P.metric, P.bucket_le, P.value
+  FROM get_backend_perf_counters() AS P (
+    procno integer,
+    pid integer,
+    metric text,
+    bucket_le float8,
+    value float8
+  );
+
+-- Summary across all backends. (This could also be implemented with
+-- an aggregate query over neon_backend_perf_counters view.)
+CREATE VIEW neon_perf_counters AS
+  SELECT P.metric, P.bucket_le, P.value
+  FROM get_perf_counters() AS P (
+    metric text,
+    bucket_le float8,
+    value float8
+  );
diff --git a/pgxn/neon/neon--1.5--1.4.sql b/pgxn/neon/neon--1.5--1.4.sql
new file mode 100644
index 0000000000..7939fd8aa9
--- /dev/null
+++ b/pgxn/neon/neon--1.5--1.4.sql
@@ -0,0 +1,4 @@
+DROP VIEW IF EXISTS neon_perf_counters;
+DROP VIEW IF EXISTS neon_backend_perf_counters;
+DROP FUNCTION IF EXISTS get_perf_counters();
+DROP FUNCTION IF EXISTS get_backend_perf_counters();
diff --git a/pgxn/neon/neon.control b/pgxn/neon/neon.control
index 03bdb9a0b4..0b36bdbb65 100644
--- a/pgxn/neon/neon.control
+++ b/pgxn/neon/neon.control
@@ -1,5 +1,7 @@
 # neon extension
 comment = 'cloud storage for PostgreSQL'
+# TODO: bump default version to 1.5, after we are certain that we don't
+# need to rollback the compute image
 default_version = '1.4'
 module_pathname = '$libdir/neon'
 relocatable = true
diff --git a/pgxn/neon/neon_perf_counters.c b/pgxn/neon/neon_perf_counters.c
new file mode 100644
index 0000000000..3e86d5b262
--- /dev/null
+++ b/pgxn/neon/neon_perf_counters.c
@@ -0,0 +1,261 @@
+/*-------------------------------------------------------------------------
+ *
+ * neon_perf_counters.c
+ *	  Collect statistics about Neon I/O
+ *
+ * Each backend has its own set of counters in shared memory.
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include <math.h>
+
+#include "funcapi.h"
+#include "miscadmin.h"
+#include "storage/proc.h"
+#include "storage/shmem.h"
+#include "utils/builtins.h"
+
+#include "neon_perf_counters.h"
+#include "neon_pgversioncompat.h"
+
+neon_per_backend_counters *neon_per_backend_counters_shared;
+
+/*
+ * Shared memory needed for the perf counters: one
+ * neon_per_backend_counters struct for each of MaxBackends backends.
+ */
+Size
+NeonPerfCountersShmemSize(void)
+{
+	return mul_size(MaxBackends, sizeof(neon_per_backend_counters));
+}
+
+/* Set up the shared per-backend counters array; returns whether it already existed */
+bool
+NeonPerfCountersShmemInit(void)
+{
+	bool		found;
+
+	neon_per_backend_counters_shared =
+		ShmemInitStruct("Neon perf counters",
+						mul_size(MaxBackends,
+								 sizeof(neon_per_backend_counters)),
+						&found);
+	Assert(found == IsUnderPostmaster);
+
+	/* shared memory is initialized to zeros, so nothing more to do here */
+	return found;
+}
+
+/*
+ * Count a GetPage wait of 'latency_us' microseconds in this backend's histogram.
+ */
+void
+inc_getpage_wait(uint64 latency_us)
+{
+	int			lo = 0;
+	int			hi = NUM_GETPAGE_WAIT_BUCKETS - 1;
+
+	/* Binary search for the first bucket whose inclusive ("le") upper bound covers the latency */
+	while (lo < hi)
+	{
+		int			mid = (lo + hi) / 2;
+
+		if (latency_us <= getpage_wait_bucket_thresholds[mid])
+			hi = mid;
+		else
+			lo = mid + 1;
+	}
+	MyNeonCounters->getpage_wait_us_bucket[lo]++;
+	MyNeonCounters->getpage_wait_us_sum += latency_us;
+	MyNeonCounters->getpage_wait_us_count++;
+}
+
+/*
+ * Support functions for the views, neon_backend_perf_counters and
+ * neon_perf_counters.
+ */
+
+typedef struct
+{
+	char	   *name;
+	bool		is_bucket;
+	double		bucket_le;
+	double		value;
+} metric_t;
+
+/* Convert 'counters' into a palloc'd array of metric_t, terminated by an entry whose name is NULL */
+static metric_t *
+neon_perf_counters_to_metrics(neon_per_backend_counters *counters)
+{
+#define NUM_METRICS (2 + NUM_GETPAGE_WAIT_BUCKETS + 8)
+	metric_t   *metrics = palloc((NUM_METRICS + 1) * sizeof(metric_t));
+	uint64		bucket_accum;
+	int			i = 0;
+
+	metrics[i].name = "getpage_wait_seconds_count";
+	metrics[i].is_bucket = false;
+	metrics[i].value = (double) counters->getpage_wait_us_count;
+	i++;
+	metrics[i].name = "getpage_wait_seconds_sum";
+	metrics[i].is_bucket = false;
+	metrics[i].value = ((double) counters->getpage_wait_us_sum) / 1000000.0;
+	i++;
+	/* Buckets are emitted as running totals: each row counts all waits up to its bound */
+	bucket_accum = 0;
+	for (int bucketno = 0; bucketno < NUM_GETPAGE_WAIT_BUCKETS; bucketno++)
+	{
+		uint64		threshold = getpage_wait_bucket_thresholds[bucketno];
+
+		bucket_accum += counters->getpage_wait_us_bucket[bucketno];
+
+		metrics[i].name = "getpage_wait_seconds_bucket";
+		metrics[i].is_bucket = true;
+		metrics[i].bucket_le = (threshold == UINT64_MAX) ? INFINITY : ((double) threshold) / 1000000.0;
+		metrics[i].value = (double) bucket_accum;
+		i++;
+	}
+	metrics[i].name = "getpage_prefetch_requests_total";
+	metrics[i].is_bucket = false;
+	metrics[i].value = (double) counters->getpage_prefetch_requests_total;
+	i++;
+	metrics[i].name = "getpage_sync_requests_total";
+	metrics[i].is_bucket = false;
+	metrics[i].value = (double) counters->getpage_sync_requests_total;
+	i++;
+	metrics[i].name = "getpage_prefetch_misses_total";
+	metrics[i].is_bucket = false;
+	metrics[i].value = (double) counters->getpage_prefetch_misses_total;
+	i++;
+	metrics[i].name = "getpage_prefetch_discards_total";
+	metrics[i].is_bucket = false;
+	metrics[i].value = (double) counters->getpage_prefetch_discards_total;
+	i++;
+	metrics[i].name = "pageserver_requests_sent_total";
+	metrics[i].is_bucket = false;
+	metrics[i].value = (double) counters->pageserver_requests_sent_total;
+	i++;
+	metrics[i].name = "pageserver_requests_disconnects_total";
+	metrics[i].is_bucket = false;
+	metrics[i].value = (double) counters->pageserver_disconnects_total;
+	i++;
+	metrics[i].name = "pageserver_send_flushes_total";
+	metrics[i].is_bucket = false;
+	metrics[i].value = (double) counters->pageserver_send_flushes_total;
+	i++;
+	metrics[i].name = "file_cache_hits_total";
+	metrics[i].is_bucket = false;
+	metrics[i].value = (double) counters->file_cache_hits_total;
+	i++;
+
+	Assert(i == NUM_METRICS);
+
+	/* NULL entry marks end of array */
+	metrics[i].name = NULL;
+	metrics[i].value = 0;
+
+	return metrics;
+}
+
+/*
+ * Write metric to three output Datums: name, bucket_le (NULL unless m->is_bucket) and value
+ */
+static void
+metric_to_datums(metric_t *m, Datum *values, bool *nulls)
+{
+	values[0] = CStringGetTextDatum(m->name);
+	nulls[0] = false;
+	if (m->is_bucket)
+	{
+		values[1] = Float8GetDatum(m->bucket_le);
+		nulls[1] = false;
+	}
+	else
+	{
+		values[1] = (Datum) 0;
+		nulls[1] = true;
+	}
+	values[2] = Float8GetDatum(m->value);
+	nulls[2] = false;
+}
+
+PG_FUNCTION_INFO_V1(neon_get_backend_perf_counters);
+Datum
+neon_get_backend_perf_counters(PG_FUNCTION_ARGS)
+{
+	ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo;
+	Datum		values[5];
+	bool		nulls[5];
+
+	/* We put all the tuples into a tuplestore in one go. */
+	InitMaterializedSRF(fcinfo, 0);
+	/* Emit one row per metric for each of the MaxBackends counter slots */
+	for (int procno = 0; procno < MaxBackends; procno++)
+	{
+		PGPROC	   *proc = GetPGProcByNumber(procno);
+		int			pid = proc->pid;
+		neon_per_backend_counters *counters = &neon_per_backend_counters_shared[procno];
+		metric_t   *metrics = neon_perf_counters_to_metrics(counters);
+
+		values[0] = Int32GetDatum(procno);
+		nulls[0] = false;
+		values[1] = Int32GetDatum(pid);
+		nulls[1] = false;
+		/* columns 0-1 (procno, pid) stay fixed; metric_to_datums fills columns 2-4 per row */
+		for (int i = 0; metrics[i].name != NULL; i++)
+		{
+			metric_to_datums(&metrics[i], &values[2], &nulls[2]);
+			tuplestore_putvalues(rsinfo->setResult, rsinfo->setDesc, values, nulls);
+		}
+
+		pfree(metrics);
+	}
+
+	return (Datum) 0;
+}
+
+/* Set-returning function: one row per metric, with counters summed over all backend slots */
+PG_FUNCTION_INFO_V1(neon_get_perf_counters);
+Datum
+neon_get_perf_counters(PG_FUNCTION_ARGS)
+{
+	ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo;
+	Datum		values[3];
+	bool		nulls[3];
+	neon_per_backend_counters totals = {0};
+	metric_t   *metrics;
+
+	/* We put all the tuples into a tuplestore in one go. */
+	InitMaterializedSRF(fcinfo, 0);
+
+	/* Aggregate the counters across all backends */
+	for (int procno = 0; procno < MaxBackends; procno++)
+	{
+		neon_per_backend_counters *counters = &neon_per_backend_counters_shared[procno];
+
+		totals.getpage_wait_us_count += counters->getpage_wait_us_count;
+		totals.getpage_wait_us_sum += counters->getpage_wait_us_sum;
+		for (int bucketno = 0; bucketno < NUM_GETPAGE_WAIT_BUCKETS; bucketno++)
+			totals.getpage_wait_us_bucket[bucketno] += counters->getpage_wait_us_bucket[bucketno];
+		totals.getpage_prefetch_requests_total += counters->getpage_prefetch_requests_total;
+		totals.getpage_sync_requests_total += counters->getpage_sync_requests_total;
+		totals.getpage_prefetch_misses_total += counters->getpage_prefetch_misses_total;
+		totals.getpage_prefetch_discards_total += counters->getpage_prefetch_discards_total;
+		totals.pageserver_requests_sent_total += counters->pageserver_requests_sent_total;
+		totals.pageserver_disconnects_total += counters->pageserver_disconnects_total;
+		totals.pageserver_send_flushes_total += counters->pageserver_send_flushes_total;
+		totals.file_cache_hits_total += counters->file_cache_hits_total;
+	}
+
+	metrics = neon_perf_counters_to_metrics(&totals);
+	for (int i = 0; metrics[i].name != NULL; i++)
+	{
+		metric_to_datums(&metrics[i], &values[0], &nulls[0]);
+		tuplestore_putvalues(rsinfo->setResult, rsinfo->setDesc, values, nulls);
+	}
+	pfree(metrics);
+
+	return (Datum) 0;
+}
diff --git a/pgxn/neon/neon_perf_counters.h b/pgxn/neon/neon_perf_counters.h
new file mode 100644
index 0000000000..ae35e8c3a5
--- /dev/null
+++ b/pgxn/neon/neon_perf_counters.h
@@ -0,0 +1,111 @@
+/*-------------------------------------------------------------------------
+ *
+ * neon_perf_counters.h
+ *	  Performance counters for neon storage requests
+ *-------------------------------------------------------------------------
+ */
+
+#ifndef NEON_PERF_COUNTERS_H
+#define NEON_PERF_COUNTERS_H
+
+#if PG_VERSION_NUM >= 170000
+#include "storage/procnumber.h"
+#else
+#include "storage/backendid.h"
+#include "storage/proc.h"
+#endif
+
+static const uint64 getpage_wait_bucket_thresholds[] = {
+	      20,       30,       60,       100,  /* 0      -  100 us */
+	     200,      300,      600,	   1000,  /* 100 us - 1 ms */
+	    2000,     3000,     6000,     10000,  /* 1 ms   - 10 ms */
+	   20000,    30000,    60000,    100000,  /* 10 ms  - 100 ms */
+	  200000,   300000,   600000,   1000000,  /* 100 ms - 1 s */
+	 2000000,  3000000,  6000000,  10000000,  /* 1 s - 10 s */
+    20000000, 30000000, 60000000, 100000000,  /* 10 s - 100 s */
+	UINT64_MAX,
+};
+#define NUM_GETPAGE_WAIT_BUCKETS (lengthof(getpage_wait_bucket_thresholds))
+
+typedef struct
+{
+	/*
+	 * Histogram for how long an smgrread() request needs to wait for response
+	 * from pageserver. When prefetching is effective, these wait times can be
+	 * lower than the network latency to the pageserver, even zero, if the
+	 * page is already readily prefetched whenever we need to read a page.
+	 *
+	 * Note: we accumulate these in microseconds, because that's convenient in
+	 * the backend, but the 'neon_backend_perf_counters' view will convert
+	 * them to seconds, to make them more idiomatic as prometheus metrics.
+	 */
+	uint64		getpage_wait_us_count;
+	uint64		getpage_wait_us_sum;
+	uint64		getpage_wait_us_bucket[NUM_GETPAGE_WAIT_BUCKETS];
+
+	/*
+	 * Total number of speculative prefetch Getpage requests and synchronous
+	 * GetPage requests sent.
+	 */
+	uint64		getpage_prefetch_requests_total;
+	uint64		getpage_sync_requests_total;
+
+	/* XXX: It's not clear to me when these misses happen. */
+	uint64		getpage_prefetch_misses_total;
+
+	/*
+	 * Number of prefetched responses that were discarded because the
+	 * prefetched page was not needed or because it was concurrently fetched /
+	 * modified by another backend.
+	 */
+	uint64		getpage_prefetch_discards_total;
+
+	/*
+	 * Total number of requests sent to pageserver. (prefetch_requests_total
+	 * and sync_requests_total count only GetPage requests, this counts all
+	 * request types.)
+	 */
+	uint64		pageserver_requests_sent_total;
+
+	/*
+	 * Number of times the connection to the pageserver was lost and the
+	 * backend had to reconnect. Note that this doesn't count the first
+	 * connection in each backend, only reconnects.
+	 */
+	uint64		pageserver_disconnects_total;
+
+	/*
+	 * Number of network flushes to the pageserver. Synchronous requests are
+	 * flushed immediately, but when prefetching requests are sent in batches,
+	 * this can be smaller than pageserver_requests_sent_total.
+	 */
+	uint64		pageserver_send_flushes_total;
+
+	/*
+	 * Number of requests satisfied from the LFC.
+	 *
+	 * This is redundant with the server-wide file_cache_hits, but this gives
+	 * per-backend granularity, and it's handy to have this in the same place
+	 * as counters for requests that went to the pageserver. Maybe move all
+	 * the LFC stats to this struct in the future?
+	 */
+	uint64		file_cache_hits_total;
+
+} neon_per_backend_counters;
+
+/* Pointer to the shared memory array of neon_per_backend_counters structs */
+extern neon_per_backend_counters *neon_per_backend_counters_shared;
+
+#if PG_VERSION_NUM >= 170000
+#define MyNeonCounters (&neon_per_backend_counters_shared[MyProcNumber])
+#else
+#define MyNeonCounters (&neon_per_backend_counters_shared[MyProc->pgprocno])
+#endif
+
+extern void inc_getpage_wait(uint64 latency);
+
+extern Size NeonPerfCountersShmemSize(void);
+extern bool NeonPerfCountersShmemInit(void);
+
+
+#endif							/* NEON_PERF_COUNTERS_H */
diff --git a/pgxn/neon/neon_pgversioncompat.c b/pgxn/neon/neon_pgversioncompat.c
new file mode 100644
index 0000000000..a0dbddde4b
--- /dev/null
+++ b/pgxn/neon/neon_pgversioncompat.c
@@ -0,0 +1,44 @@
+/*
+ * Support functions for the compatibility macros in neon_pgversioncompat.h
+ */
+#include "postgres.h"
+
+#include "funcapi.h"
+#include "miscadmin.h"
+#include "utils/tuplestore.h"
+
+#include "neon_pgversioncompat.h"
+
+#if PG_MAJORVERSION_NUM < 15
+void
+InitMaterializedSRF(FunctionCallInfo fcinfo, bits32 flags)
+{
+	ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo;
+	Tuplestorestate *tupstore;
+	MemoryContext old_context,
+				per_query_ctx;
+	TupleDesc	stored_tupdesc;
+
+	/* check to see if caller supports returning a tuplestore */
+	if (rsinfo == NULL || !IsA(rsinfo, ReturnSetInfo))
+		ereport(ERROR,
+				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+				 errmsg("set-valued function called in context that cannot accept a set")));
+
+	/*
+	 * Store the tuplestore and the tuple descriptor in ReturnSetInfo.  This
+	 * must be done in the per-query memory context.
+	 */
+	per_query_ctx = rsinfo->econtext->ecxt_per_query_memory;
+	old_context = MemoryContextSwitchTo(per_query_ctx);
+
+	if (get_call_result_type(fcinfo, NULL, &stored_tupdesc) != TYPEFUNC_COMPOSITE)
+		elog(ERROR, "return type must be a row type");
+
+	tupstore = tuplestore_begin_heap(false, false, work_mem);
+	rsinfo->returnMode = SFRM_Materialize;
+	rsinfo->setResult = tupstore;
+	rsinfo->setDesc = stored_tupdesc;
+	MemoryContextSwitchTo(old_context);
+}
+#endif
diff --git a/pgxn/neon/neon_pgversioncompat.h b/pgxn/neon/neon_pgversioncompat.h
index 59b97d64fe..e4754ec7ea 100644
--- a/pgxn/neon/neon_pgversioncompat.h
+++ b/pgxn/neon/neon_pgversioncompat.h
@@ -6,6 +6,8 @@
 #ifndef NEON_PGVERSIONCOMPAT_H
 #define NEON_PGVERSIONCOMPAT_H
 
+#include "fmgr.h"
+
 #if PG_MAJORVERSION_NUM < 17
 #define NRelFileInfoBackendIsTemp(rinfo) (rinfo.backend != InvalidBackendId)
 #else
@@ -123,4 +125,8 @@
 #define AmAutoVacuumWorkerProcess() (IsAutoVacuumWorkerProcess())
 #endif
 
+#if PG_MAJORVERSION_NUM < 15
+extern void InitMaterializedSRF(FunctionCallInfo fcinfo, bits32 flags);
+#endif
+
 #endif							/* NEON_PGVERSIONCOMPAT_H */
diff --git a/pgxn/neon/pagestore_smgr.c b/pgxn/neon/pagestore_smgr.c
index 36538ea5e2..1c87f4405c 100644
--- a/pgxn/neon/pagestore_smgr.c
+++ b/pgxn/neon/pagestore_smgr.c
@@ -66,6 +66,7 @@
 #include "storage/md.h"
 #include "storage/smgr.h"
 
+#include "neon_perf_counters.h"
 #include "pagestore_client.h"
 #include "bitmap.h"
 
@@ -289,7 +290,6 @@ static PrefetchState *MyPState;
 
 static bool compact_prefetch_buffers(void);
 static void consume_prefetch_responses(void);
-static uint64 prefetch_register_buffer(BufferTag tag, neon_request_lsns *force_request_lsns);
 static bool prefetch_read(PrefetchRequest *slot);
 static void prefetch_do_request(PrefetchRequest *slot, neon_request_lsns *force_request_lsns);
 static bool prefetch_wait_for(uint64 ring_index);
@@ -780,21 +780,27 @@ prefetch_do_request(PrefetchRequest *slot, neon_request_lsns *force_request_lsns
 }
 
 /*
- * prefetch_register_buffer() - register and prefetch buffer
+ * prefetch_register_bufferv() - register and prefetch buffers
  *
  * Register that we may want the contents of BufferTag in the near future.
+ * This is used when issuing a speculative prefetch request, but also when
+ * performing a synchronous request and need the buffer right now.
  *
  * If force_request_lsns is not NULL, those values are sent to the
  * pageserver. If NULL, we utilize the lastWrittenLsn -infrastructure
  * to calculate the LSNs to send.
  *
+ * When performing a prefetch rather than a synchronous request,
+ * is_prefetch==true. Currently, it only affects how the request is accounted
+ * in the perf counters.
+ *
  * NOTE: this function may indirectly update MyPState->pfs_hash; which
  * invalidates any active pointers into the hash table.
  */
-
 static uint64
 prefetch_register_bufferv(BufferTag tag, neon_request_lsns *frlsns,
-						  BlockNumber nblocks, const bits8 *mask)
+						  BlockNumber nblocks, const bits8 *mask,
+						  bool is_prefetch)
 {
 	uint64		min_ring_index;
 	PrefetchRequest req;
@@ -815,6 +821,7 @@ Retry:
 		PrfHashEntry *entry = NULL;
 		uint64		ring_index;
 		neon_request_lsns *lsns;
+
 		if (PointerIsValid(mask) && !BITMAP_ISSET(mask, i))
 			continue;
 
@@ -858,6 +865,7 @@ Retry:
 					prefetch_set_unused(ring_index);
 					entry = NULL;
 					slot = NULL;
+					MyNeonCounters->getpage_prefetch_discards_total++;
 				}
 			}
 
@@ -972,6 +980,11 @@ Retry:
 
 		min_ring_index = Min(min_ring_index, ring_index);
 
+		if (is_prefetch)
+			MyNeonCounters->getpage_prefetch_requests_total++;
+		else
+			MyNeonCounters->getpage_sync_requests_total++;
+
 		prefetch_do_request(slot, lsns);
 	}
 
@@ -1000,13 +1013,6 @@ Retry:
 }
 
 
-static uint64
-prefetch_register_buffer(BufferTag tag, neon_request_lsns *force_request_lsns)
-{
-	return prefetch_register_bufferv(tag, force_request_lsns, 1, NULL);
-}
-
-
 /*
  * Note: this function can get canceled and use a long jump to the next catch
  * context. Take care.
@@ -2612,7 +2618,7 @@ neon_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
 			lfc_present[i] = ~(lfc_present[i]);
 
 		ring_index = prefetch_register_bufferv(tag, NULL, iterblocks,
-											   lfc_present);
+											   lfc_present, true);
 		nblocks -= iterblocks;
 		blocknum += iterblocks;
 
@@ -2656,7 +2662,7 @@ neon_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum)
 
 	CopyNRelFileInfoToBufTag(tag, InfoFromSMgrRel(reln));
 
-	ring_index = prefetch_register_buffer(tag, NULL);
+	ring_index = prefetch_register_bufferv(tag, NULL, 1, NULL, true);
 
 	Assert(ring_index < MyPState->ring_unused &&
 		   MyPState->ring_last <= ring_index);
@@ -2747,17 +2753,20 @@ neon_read_at_lsnv(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber base_block
 	 * weren't for the behaviour of the LwLsn cache that uses the highest
 	 * value of the LwLsn cache when the entry is not found.
 	 */
-	prefetch_register_bufferv(buftag, request_lsns, nblocks, mask);
+	prefetch_register_bufferv(buftag, request_lsns, nblocks, mask, false);
 
 	for (int i = 0; i < nblocks; i++)
 	{
 		void	   *buffer = buffers[i];
 		BlockNumber blockno = base_blockno + i;
 		neon_request_lsns *reqlsns = &request_lsns[i];
+		TimestampTz		start_ts, end_ts;
 
 		if (PointerIsValid(mask) && !BITMAP_ISSET(mask, i))
 			continue;
 
+		start_ts = GetCurrentTimestamp();
+
 		if (RecoveryInProgress() && MyBackendType != B_STARTUP)
 			XLogWaitForReplayOf(reqlsns[0].request_lsn);
 
@@ -2794,6 +2803,7 @@ Retry:
 				/* drop caches */
 				prefetch_set_unused(slot->my_ring_index);
 				pgBufferUsage.prefetch.expired += 1;
+				MyNeonCounters->getpage_prefetch_discards_total++;
 				/* make it look like a prefetch cache miss */
 				entry = NULL;
 			}
@@ -2804,8 +2814,9 @@ Retry:
 			if (entry == NULL)
 			{
 				pgBufferUsage.prefetch.misses += 1;
+				MyNeonCounters->getpage_prefetch_misses_total++;
 
-				ring_index = prefetch_register_bufferv(buftag, reqlsns, 1, NULL);
+				ring_index = prefetch_register_bufferv(buftag, reqlsns, 1, NULL, false);
 				Assert(ring_index != UINT64_MAX);
 				slot = GetPrfSlot(ring_index);
 			}
@@ -2860,6 +2871,9 @@ Retry:
 		/* buffer was used, clean up for later reuse */
 		prefetch_set_unused(ring_index);
 		prefetch_cleanup_trailing_unused();
+
+		end_ts = GetCurrentTimestamp();
+		inc_getpage_wait(end_ts >= start_ts ? (end_ts - start_ts) : 0);
 	}
 }
 
@@ -2913,6 +2927,7 @@ neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void *buffer
 	/* Try to read from local file cache */
 	if (lfc_read(InfoFromSMgrRel(reln), forkNum, blkno, buffer))
 	{
+		MyNeonCounters->file_cache_hits_total++;
 		return;
 	}
 
@@ -3097,7 +3112,7 @@ neon_readv(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
 				/* assume heap */
 				RmgrTable[RM_HEAP_ID].rm_mask(mdbuf_masked, blkno);
 				RmgrTable[RM_HEAP_ID].rm_mask(pageserver_masked, blkno);
-	
+
 				if (memcmp(mdbuf_masked, pageserver_masked, BLCKSZ) != 0)
 				{
 					neon_log(PANIC, "heap buffers differ at blk %u in rel %u/%u/%u fork %u (request LSN %X/%08X):\n------ MD ------\n%s\n------ Page Server ------\n%s\n",
diff --git a/test_runner/regress/test_compute_metrics.py b/test_runner/regress/test_compute_metrics.py
new file mode 100644
index 0000000000..6138c322d7
--- /dev/null
+++ b/test_runner/regress/test_compute_metrics.py
@@ -0,0 +1,21 @@
+from fixtures.neon_fixtures import NeonEnv
+
+
+def test_compute_metrics(neon_simple_env: NeonEnv):
+    """
+    Test compute metrics, exposed in the neon_backend_perf_counters and
+    neon_perf_counters views
+    """
+    env = neon_simple_env
+    endpoint = env.endpoints.create_start("main")
+
+    conn = endpoint.connect()
+    cur = conn.cursor()
+
+    # We don't check that the values make sense, this is just a very
+    # basic check that the server doesn't crash or something like that.
+    #
+    # 1.5 is the minimum version to contain these views.
+    cur.execute("CREATE EXTENSION neon VERSION '1.5'")
+    cur.execute("SELECT * FROM neon_perf_counters")
+    cur.execute("SELECT * FROM neon_backend_perf_counters")
diff --git a/test_runner/regress/test_neon_extension.py b/test_runner/regress/test_neon_extension.py
index bb844244e3..22a6013225 100644
--- a/test_runner/regress/test_neon_extension.py
+++ b/test_runner/regress/test_neon_extension.py
@@ -50,8 +50,8 @@ def test_neon_extension_compatibility(neon_env_builder: NeonEnvBuilder):
             # Ensure that the default version is also updated in the neon.control file
             assert cur.fetchone() == ("1.4",)
             cur.execute("SELECT * from neon.NEON_STAT_FILE_CACHE")
-            all_versions = ["1.4", "1.3", "1.2", "1.1", "1.0"]
-            current_version = "1.4"
+            all_versions = ["1.5", "1.4", "1.3", "1.2", "1.1", "1.0"]
+            current_version = "1.5"
             for idx, begin_version in enumerate(all_versions):
                 for target_version in all_versions[idx + 1 :]:
                     if current_version != begin_version:

From 1c5d6e59a0c53349b58a7f1af1f9d021d34b147a Mon Sep 17 00:00:00 2001
From: Konstantin Knizhnik <knizhnik@garret.ru>
Date: Mon, 23 Sep 2024 22:05:32 +0300
Subject: [PATCH 121/142] Maintain number of used pages for LFC (#9088)

## Problem

An LFC cache entry is a chunk (right now the size of a chunk is 1MB). LFC
statistics show the number of chunks, but not the number of used pages. And
the autoscaling team wants to know how sparse the LFC is:
https://neondb.slack.com/archives/C04DGM6SMTM/p1726782793595969
It is possible to obtain this from the view with `select count(*) from
local_cache`.
But that is an expensive operation, enumerating all entries in the LFC under
lock.

## Summary of changes

This PR added "file_cache_used_pages" to `neon_lfc_stats` view:
```
 select * from neon_lfc_stats;
        lfc_key        | lfc_value
-----------------------+-----------
 file_cache_misses     |   3139029
 file_cache_hits       |   4098394
 file_cache_used       |      1024
 file_cache_writes     |   3173728
 file_cache_size       |      1024
 file_cache_used_pages |     25689
(6 rows)
```

Please note that this PR doesn't change the neon extension API, so there is no
need to create a new version of the Neon extension.

## Checklist before requesting a review

- [ ] I have performed a self-review of my code.
- [ ] If it is a core feature, I have added thorough tests.
- [ ] Do we need to implement analytics? if so did you add the relevant
metrics to the dashboard?
- [ ] If this PR requires public announcement, mark it with
/release-notes label and add several sentences in this section.

## Checklist before merging

- [ ] Do not forget to reformat commit message to not include the above
checklist

Co-authored-by: Konstantin Knizhnik <knizhnik@neon.tech>
---
 pgxn/neon/file_cache.c | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/pgxn/neon/file_cache.c b/pgxn/neon/file_cache.c
index ab6739465b..2b461c8641 100644
--- a/pgxn/neon/file_cache.c
+++ b/pgxn/neon/file_cache.c
@@ -109,6 +109,7 @@ typedef struct FileCacheControl
 								 * reenabling */
 	uint32		size;			/* size of cache file in chunks */
 	uint32		used;			/* number of used chunks */
+	uint32		used_pages;		/* number of used pages */
 	uint32		limit;			/* shared copy of lfc_size_limit */
 	uint64		hits;
 	uint64		misses;
@@ -905,6 +906,10 @@ lfc_writev(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
 				/* Cache overflow: evict least recently used chunk */
 				FileCacheEntry *victim = dlist_container(FileCacheEntry, list_node, dlist_pop_head_node(&lfc_ctl->lru));
 	
+				for (int i = 0; i < BLOCKS_PER_CHUNK; i++)
+				{
+					lfc_ctl->used_pages -= (victim->bitmap[i >> 5] >> (i & 31)) & 1;
+				}
 				CriticalAssert(victim->access_count == 0);
 				entry->offset = victim->offset; /* grab victim's chunk */
 				hash_search_with_hash_value(lfc_hash, &victim->key, victim->hash, HASH_REMOVE, NULL);
@@ -959,6 +964,7 @@ lfc_writev(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
 
 				for (int i = 0; i < blocks_in_chunk; i++)
 				{
+					lfc_ctl->used_pages += 1 - ((entry->bitmap[(chunk_offs + i) >> 5] >> ((chunk_offs + i) & 31)) & 1);
 					entry->bitmap[(chunk_offs + i) >> 5] |=
 						(1 << ((chunk_offs + i) & 31));
 				}
@@ -1051,6 +1057,11 @@ neon_get_lfc_stats(PG_FUNCTION_ARGS)
 			if (lfc_ctl)
 				value = lfc_ctl->size;
 			break;
+		case 5:
+			key = "file_cache_used_pages";
+			if (lfc_ctl)
+				value = lfc_ctl->used_pages;
+			break;
 		default:
 			SRF_RETURN_DONE(funcctx);
 	}

From d865881d59621e2425dd9028f2768c1e847163bf Mon Sep 17 00:00:00 2001
From: Matthias van de Meent <matthias@neon.tech>
Date: Mon, 23 Sep 2024 23:16:42 +0200
Subject: [PATCH 122/142] NOAI (#9084)

We can't FlushOneBuffer when we're in redo-only mode on PageServer, so
make execution of that function conditional on us not running in
pageserver walredo mode.
---
 .github/workflows/build_and_test.yml |  1 -
 test_runner/regress/test_unlogged.py | 16 +++++++++++++++-
 vendor/postgres-v14                  |  2 +-
 vendor/postgres-v15                  |  2 +-
 vendor/postgres-v16                  |  2 +-
 vendor/postgres-v17                  |  2 +-
 vendor/revisions.json                |  8 ++++----
 7 files changed, 23 insertions(+), 10 deletions(-)

diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml
index 6617ca42bb..f36dbfb1f0 100644
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -1207,7 +1207,6 @@ jobs:
     # Usually we do `needs: [...]`
     needs:
       - build-and-test-locally
-      - check-submodules
       - check-codestyle-python
       - check-codestyle-rust
       - promote-images
diff --git a/test_runner/regress/test_unlogged.py b/test_runner/regress/test_unlogged.py
index deba29536c..4431ccd959 100644
--- a/test_runner/regress/test_unlogged.py
+++ b/test_runner/regress/test_unlogged.py
@@ -15,8 +15,13 @@ def test_unlogged(neon_simple_env: NeonEnv):
     cur = conn.cursor()
 
     cur.execute("CREATE UNLOGGED TABLE iut (id int);")
-    # create index to test unlogged index relation as well
+    # create index to test unlogged index relations as well
     cur.execute("CREATE UNIQUE INDEX iut_idx ON iut (id);")
+    cur.execute("CREATE INDEX ON iut USING gist (int4range(id, id, '[]'));")
+    cur.execute("CREATE INDEX ON iut USING spgist (int4range(id, id, '[]'));")
+    cur.execute("CREATE INDEX ON iut USING gin ((id::text::jsonb));")
+    cur.execute("CREATE INDEX ON iut USING brin (id);")
+    cur.execute("CREATE INDEX ON iut USING hash (id);")
     cur.execute("ALTER TABLE iut ADD COLUMN seq int GENERATED ALWAYS AS IDENTITY;")
     cur.execute("INSERT INTO iut (id) values (42);")
 
@@ -39,3 +44,12 @@ def test_unlogged(neon_simple_env: NeonEnv):
         assert results == [(43, 2)]
     else:
         assert results == [(43, 1)]
+
+    # Flush all data and compact it, so we detect any errors related to
+    # unlogged indexes materialization.
+    ps_http = env.pageserver.http_client()
+    ps_http.timeline_compact(
+        tenant_id=env.initial_tenant,
+        timeline_id=env.initial_timeline,
+        force_image_layer_creation=True,
+    )
diff --git a/vendor/postgres-v14 b/vendor/postgres-v14
index a38d15f323..2199b83fb7 160000
--- a/vendor/postgres-v14
+++ b/vendor/postgres-v14
@@ -1 +1 @@
-Subproject commit a38d15f3233a4c07f2bf3335fcbd874dd1f4e386
+Subproject commit 2199b83fb72680001ce0f43bf6187a21dfb8f45d
diff --git a/vendor/postgres-v15 b/vendor/postgres-v15
index 16c3c6b64f..22e580fe9f 160000
--- a/vendor/postgres-v15
+++ b/vendor/postgres-v15
@@ -1 +1 @@
-Subproject commit 16c3c6b64f1420a367a2a9b2510f20d94f791af8
+Subproject commit 22e580fe9ffcea7e02592110b1c9bf426d83cada
diff --git a/vendor/postgres-v16 b/vendor/postgres-v16
index 1d7081a3b0..e131a9c027 160000
--- a/vendor/postgres-v16
+++ b/vendor/postgres-v16
@@ -1 +1 @@
-Subproject commit 1d7081a3b076ddf5086e0b118d4329820e6a7427
+Subproject commit e131a9c027b202ce92bd7b9cf2569d48a6f9948e
diff --git a/vendor/postgres-v17 b/vendor/postgres-v17
index 2cf120e739..7b3e52c75c 160000
--- a/vendor/postgres-v17
+++ b/vendor/postgres-v17
@@ -1 +1 @@
-Subproject commit 2cf120e7393ca5f537c6a38b457585576dc035fc
+Subproject commit 7b3e52c75ca384de9c69477c158b1f5dcdcbb4be
diff --git a/vendor/revisions.json b/vendor/revisions.json
index 9f6512d03e..bc7070744a 100644
--- a/vendor/revisions.json
+++ b/vendor/revisions.json
@@ -1,18 +1,18 @@
 {
   "v17": [
     "17rc1",
-    "2cf120e7393ca5f537c6a38b457585576dc035fc"
+    "7b3e52c75ca384de9c69477c158b1f5dcdcbb4be"
   ],
   "v16": [
     "16.4",
-    "1d7081a3b076ddf5086e0b118d4329820e6a7427"
+    "e131a9c027b202ce92bd7b9cf2569d48a6f9948e"
   ],
   "v15": [
     "15.8",
-    "16c3c6b64f1420a367a2a9b2510f20d94f791af8"
+    "22e580fe9ffcea7e02592110b1c9bf426d83cada"
   ],
   "v14": [
     "14.13",
-    "a38d15f3233a4c07f2bf3335fcbd874dd1f4e386"
+    "2199b83fb72680001ce0f43bf6187a21dfb8f45d"
   ]
 }

From e7e6319e209cb0d90a7f0657e2fd7af5711cfab1 Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki@neon.tech>
Date: Tue, 24 Sep 2024 00:31:32 +0300
Subject: [PATCH 123/142] Fix compiler warnings with nightly rustc about elided
 lifetimes having names (#9105)

The warnings:

    warning: elided lifetime has a name
        --> pageserver/src/metrics.rs:1386:29
         |
    1382 |     pub(crate) fn start_timer<'c: 'a, 'a>(
| -- lifetime `'a` declared here
    ...
    1386 |     ) -> Option<impl Drop + '_> {
| ^^ this elided lifetime gets resolved as `'a`
         |
         = note: `#[warn(elided_named_lifetimes)]` on by default

    warning: elided lifetime has a name
        --> pageserver/src/metrics.rs:1537:46
         |
    1534 |     pub(crate) fn start_recording<'c: 'a, 'a>(
| -- lifetime `'a` declared here
    ...
    1537 |     ) -> BasebackupQueryTimeOngoingRecording<'_, '_> {
| ^^ this elided lifetime gets resolved as `'a`

    warning: elided lifetime has a name
        --> pageserver/src/metrics.rs:1537:50
         |
    1534 |     pub(crate) fn start_recording<'c: 'a, 'a>(
| -- lifetime `'a` declared here
    ...
    1537 |     ) -> BasebackupQueryTimeOngoingRecording<'_, '_> {
| ^^ this elided lifetime gets resolved as `'a`

    warning: elided lifetime has a name
        --> pageserver/src/tenant.rs:3630:25
         |
    3622 |     async fn prepare_new_timeline<'a>(
| -- lifetime `'a` declared here
    ...
    3630 |     ) -> anyhow::Result<UninitializedTimeline> {
| ^^^^^^^^^^^^^^^^^^^^^ this elided lifetime gets resolved as `'a`
---
 pageserver/src/metrics.rs | 4 ++--
 pageserver/src/tenant.rs  | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs
index 078d12f934..162e8d1836 100644
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -1383,7 +1383,7 @@ impl SmgrQueryTimePerTimeline {
         &'a self,
         op: SmgrQueryType,
         ctx: &'c RequestContext,
-    ) -> Option<impl Drop + '_> {
+    ) -> Option<impl Drop + 'a> {
         let start = Instant::now();
 
         self.global_started[op as usize].inc();
@@ -1534,7 +1534,7 @@ impl BasebackupQueryTime {
     pub(crate) fn start_recording<'c: 'a, 'a>(
         &'a self,
         ctx: &'c RequestContext,
-    ) -> BasebackupQueryTimeOngoingRecording<'_, '_> {
+    ) -> BasebackupQueryTimeOngoingRecording<'a, 'a> {
         let start = Instant::now();
         match ctx.micros_spent_throttled.open() {
             Ok(()) => (),
diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index be69f3d67f..5ed63734f4 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -3627,7 +3627,7 @@ impl Tenant {
         start_lsn: Lsn,
         ancestor: Option<Arc<Timeline>>,
         last_aux_file_policy: Option<AuxFilePolicy>,
-    ) -> anyhow::Result<UninitializedTimeline> {
+    ) -> anyhow::Result<UninitializedTimeline<'a>> {
         let tenant_shard_id = self.tenant_shard_id;
 
         let resources = self.build_timeline_resources(new_timeline_id);

From 3a110e45ed01d553e3f9229136ef969e1efb5adc Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki@neon.tech>
Date: Mon, 23 Sep 2024 20:27:09 +0300
Subject: [PATCH 124/142] Move files related to building compute image into
 compute/ dir

Seems nice to keep all these together. This also provides a nice place
for a README file to describe the compute image build process. For
now, it briefly describes the contents of the directory, but can be
expanded.
---
 .dockerignore                                  |  1 +
 .github/workflows/build_and_test.yml           |  8 ++++----
 .github/workflows/trigger-e2e-tests.yml        |  2 +-
 .../Dockerfile.compute-node                    |  0
 compute/README.md                              | 18 ++++++++++++++++++
 {patches => compute/patches}/pg_anon.patch     |  0
 {patches => compute/patches}/pg_cron.patch     |  0
 .../patches}/pg_hint_plan.patch                |  0
 {patches => compute/patches}/pgvector.patch    |  0
 {patches => compute/patches}/rum.patch         |  0
 .../vm-image-spec.yaml                         |  0
 11 files changed, 24 insertions(+), 5 deletions(-)
 rename Dockerfile.compute-node => compute/Dockerfile.compute-node (100%)
 create mode 100644 compute/README.md
 rename {patches => compute/patches}/pg_anon.patch (100%)
 rename {patches => compute/patches}/pg_cron.patch (100%)
 rename {patches => compute/patches}/pg_hint_plan.patch (100%)
 rename {patches => compute/patches}/pgvector.patch (100%)
 rename {patches => compute/patches}/rum.patch (100%)
 rename vm-image-spec.yaml => compute/vm-image-spec.yaml (100%)

diff --git a/.dockerignore b/.dockerignore
index c7a2f78e32..3c4a748cf7 100644
--- a/.dockerignore
+++ b/.dockerignore
@@ -13,6 +13,7 @@
 # Directories
 !.cargo/
 !.config/
+!compute/
 !compute_tools/
 !control_plane/
 !libs/
diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml
index f36dbfb1f0..a634edb96b 100644
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -651,7 +651,7 @@ jobs:
           provenance: false
           push: true
           pull: true
-          file: Dockerfile.compute-node
+          file: compute/Dockerfile.compute-node
           cache-from: type=registry,ref=cache.neon.build/compute-node-${{ matrix.version }}:cache-${{ matrix.arch }}
           cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=cache.neon.build/compute-node-{0}:cache-{1},mode=max', matrix.version, matrix.arch) || '' }}
           tags: |
@@ -670,7 +670,7 @@ jobs:
           provenance: false
           push: true
           pull: true
-          file: Dockerfile.compute-node
+          file: compute/Dockerfile.compute-node
           target: neon-pg-ext-test
           cache-from: type=registry,ref=cache.neon.build/neon-test-extensions-${{ matrix.version }}:cache-${{ matrix.arch }}
           cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=cache.neon.build/neon-test-extensions-{0}:cache-{1},mode=max', matrix.version, matrix.arch) || '' }}
@@ -691,7 +691,7 @@ jobs:
           provenance: false
           push: true
           pull: true
-          file: Dockerfile.compute-node
+          file: compute/Dockerfile.compute-node
           tags: |
             neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }}-${{ matrix.arch }}
 
@@ -779,7 +779,7 @@ jobs:
       - name: Build vm image
         run: |
           ./vm-builder \
-            -spec=vm-image-spec.yaml \
+            -spec=compute/vm-image-spec.yaml \
             -src=neondatabase/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }} \
             -dst=neondatabase/vm-compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}
 
diff --git a/.github/workflows/trigger-e2e-tests.yml b/.github/workflows/trigger-e2e-tests.yml
index b299cf9b99..f25c1051cd 100644
--- a/.github/workflows/trigger-e2e-tests.yml
+++ b/.github/workflows/trigger-e2e-tests.yml
@@ -107,7 +107,7 @@ jobs:
           if [ "$GITHUB_EVENT_NAME" == "pull_request" ]; then
             for f in $(gh api "/repos/${GITHUB_REPOSITORY}/pulls/${PR_NUMBER}/files" --paginate --jq '.[].filename'); do
               case "$f" in
-                vendor/*|pgxn/*|libs/vm_monitor/*|Dockerfile.compute-node)
+                vendor/*|pgxn/*|libs/vm_monitor/*|compute/Dockerfile.compute-node)
                   platforms=$(echo "${platforms}" | jq --compact-output '. += ["k8s-neonvm"] | unique')
                   ;;
                 *)
diff --git a/Dockerfile.compute-node b/compute/Dockerfile.compute-node
similarity index 100%
rename from Dockerfile.compute-node
rename to compute/Dockerfile.compute-node
diff --git a/compute/README.md b/compute/README.md
new file mode 100644
index 0000000000..981d290fc0
--- /dev/null
+++ b/compute/README.md
@@ -0,0 +1,18 @@
+This directory contains files that are needed to build the compute
+images, or included in the compute images.
+
+Dockerfile.compute-node
+	To build the compute image
+
+vm-image-spec.yaml
+	Instructions for vm-builder, to turn the compute-node image into
+	corresponding vm-compute-node image.
+
+patches/
+	Some extensions need to be patched to work with Neon. This
+	directory contains such patches. They are applied to the extension
+	sources in Dockerfile.compute-node
+
+In addition to these, postgres itself, the neon postgres extension,
+and compute_ctl are built and copied into the compute image by
+Dockerfile.compute-node.
diff --git a/patches/pg_anon.patch b/compute/patches/pg_anon.patch
similarity index 100%
rename from patches/pg_anon.patch
rename to compute/patches/pg_anon.patch
diff --git a/patches/pg_cron.patch b/compute/patches/pg_cron.patch
similarity index 100%
rename from patches/pg_cron.patch
rename to compute/patches/pg_cron.patch
diff --git a/patches/pg_hint_plan.patch b/compute/patches/pg_hint_plan.patch
similarity index 100%
rename from patches/pg_hint_plan.patch
rename to compute/patches/pg_hint_plan.patch
diff --git a/patches/pgvector.patch b/compute/patches/pgvector.patch
similarity index 100%
rename from patches/pgvector.patch
rename to compute/patches/pgvector.patch
diff --git a/patches/rum.patch b/compute/patches/rum.patch
similarity index 100%
rename from patches/rum.patch
rename to compute/patches/rum.patch
diff --git a/vm-image-spec.yaml b/compute/vm-image-spec.yaml
similarity index 100%
rename from vm-image-spec.yaml
rename to compute/vm-image-spec.yaml

From 3ad567290c99b48a3293ed3f609a701375541382 Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki@neon.tech>
Date: Mon, 23 Sep 2024 20:27:38 +0300
Subject: [PATCH 125/142] Move metric exporter and pgbouncer config files

Instead of adding them to the VM image late in the build process, when
putting together the final VM image, include them in the earlier
compute image already. That makes it more convenient to edit the
files, and to test them.
---
 compute/Dockerfile.compute-node            |  61 ++-
 compute/README.md                          |   3 +
 compute/etc/neon_collector.yml             | 247 ++++++++++++
 compute/etc/neon_collector_autoscaling.yml |  55 +++
 compute/etc/pgbouncer.ini                  |  17 +
 compute/etc/sql_exporter.yml               |  33 ++
 compute/etc/sql_exporter_autoscaling.yml   |  33 ++
 compute/vm-image-spec.yaml                 | 440 +--------------------
 8 files changed, 444 insertions(+), 445 deletions(-)
 create mode 100644 compute/etc/neon_collector.yml
 create mode 100644 compute/etc/neon_collector_autoscaling.yml
 create mode 100644 compute/etc/pgbouncer.ini
 create mode 100644 compute/etc/sql_exporter.yml
 create mode 100644 compute/etc/sql_exporter_autoscaling.yml

diff --git a/compute/Dockerfile.compute-node b/compute/Dockerfile.compute-node
index 6bf6fb650f..18c68c116a 100644
--- a/compute/Dockerfile.compute-node
+++ b/compute/Dockerfile.compute-node
@@ -280,7 +280,7 @@ FROM build-deps AS vector-pg-build
 ARG PG_VERSION
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
 
-COPY patches/pgvector.patch /pgvector.patch
+COPY compute/patches/pgvector.patch /pgvector.patch
 
 # By default, pgvector Makefile uses `-march=native`. We don't want that,
 # because we build the images on different machines than where we run them.
@@ -366,7 +366,7 @@ FROM build-deps AS rum-pg-build
 ARG PG_VERSION
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
 
-COPY patches/rum.patch /rum.patch
+COPY compute/patches/rum.patch /rum.patch
 
 RUN case "${PG_VERSION}" in "v17") \
     echo "v17 extensions are not supported yet. Quit" && exit 0;; \
@@ -1031,6 +1031,41 @@ FROM debian:bullseye-slim AS compute-tools-image
 
 COPY --from=compute-tools /home/nonroot/target/release-line-debug-size-lto/compute_ctl /usr/local/bin/compute_ctl
 
+#########################################################################################
+#
+# Layer "pgbouncer"
+#
+#########################################################################################
+
+FROM debian:bullseye-slim AS pgbouncer
+RUN set -e \
+    && apt-get update \
+    && apt-get install -y \
+        build-essential \
+        git \
+        libevent-dev \
+        libtool \
+        pkg-config
+
+# Use `dist_man_MANS=` to skip manpage generation (which requires python3/pandoc)
+ENV PGBOUNCER_TAG=pgbouncer_1_22_1
+RUN set -e \
+    && git clone --recurse-submodules --depth 1 --branch ${PGBOUNCER_TAG} https://github.com/pgbouncer/pgbouncer.git pgbouncer \
+    && cd pgbouncer \
+    && ./autogen.sh \
+    && LDFLAGS=-static ./configure --prefix=/usr/local/pgbouncer --without-openssl \
+    && make -j $(nproc) dist_man_MANS= \
+    && make install dist_man_MANS=
+
+#########################################################################################
+#
+# Layers "postgres-exporter" and "sql-exporter"
+#
+#########################################################################################
+
+FROM quay.io/prometheuscommunity/postgres-exporter:v0.12.1 AS postgres-exporter
+FROM burningalchemist/sql_exporter:0.13 AS sql-exporter
+
 #########################################################################################
 #
 # Clean up postgres folder before inclusion
@@ -1078,7 +1113,7 @@ COPY --from=pgjwt-pg-build /pgjwt.tar.gz /ext-src
 COPY --from=hypopg-pg-build /hypopg.tar.gz /ext-src
 COPY --from=pg-hashids-pg-build /pg_hashids.tar.gz /ext-src
 COPY --from=rum-pg-build /rum.tar.gz /ext-src
-COPY patches/rum.patch /ext-src
+COPY compute/patches/rum.patch /ext-src
 #COPY --from=pgtap-pg-build /pgtap.tar.gz /ext-src
 COPY --from=ip4r-pg-build /ip4r.tar.gz /ext-src
 COPY --from=prefix-pg-build /prefix.tar.gz /ext-src
@@ -1086,9 +1121,9 @@ COPY --from=hll-pg-build /hll.tar.gz /ext-src
 COPY --from=plpgsql-check-pg-build /plpgsql_check.tar.gz /ext-src
 #COPY --from=timescaledb-pg-build /timescaledb.tar.gz /ext-src
 COPY --from=pg-hint-plan-pg-build /pg_hint_plan.tar.gz /ext-src
-COPY patches/pg_hint_plan.patch /ext-src
+COPY compute/patches/pg_hint_plan.patch /ext-src
 COPY --from=pg-cron-pg-build /pg_cron.tar.gz /ext-src
-COPY patches/pg_cron.patch /ext-src
+COPY compute/patches/pg_cron.patch /ext-src
 #COPY --from=pg-pgx-ulid-build /home/nonroot/pgx_ulid.tar.gz /ext-src
 #COPY --from=rdkit-pg-build /rdkit.tar.gz /ext-src
 COPY --from=pg-uuidv7-pg-build /pg_uuidv7.tar.gz /ext-src
@@ -1097,7 +1132,7 @@ COPY --from=pg-semver-pg-build /pg_semver.tar.gz /ext-src
 #COPY --from=pg-embedding-pg-build /home/nonroot/pg_embedding-src/ /ext-src
 #COPY --from=wal2json-pg-build /wal2json_2_5.tar.gz /ext-src
 COPY --from=pg-anon-pg-build /pg_anon.tar.gz /ext-src
-COPY patches/pg_anon.patch /ext-src
+COPY compute/patches/pg_anon.patch /ext-src
 COPY --from=pg-ivm-build /pg_ivm.tar.gz /ext-src
 COPY --from=pg-partman-build /pg_partman.tar.gz /ext-src
 RUN case "${PG_VERSION}" in "v17") \
@@ -1160,9 +1195,23 @@ RUN mkdir /var/db && useradd -m -d /var/db/postgres postgres && \
 COPY --from=postgres-cleanup-layer --chown=postgres /usr/local/pgsql /usr/local
 COPY --from=compute-tools --chown=postgres /home/nonroot/target/release-line-debug-size-lto/compute_ctl /usr/local/bin/compute_ctl
 
+# pgbouncer and its config
+COPY --from=pgbouncer         /usr/local/pgbouncer/bin/pgbouncer /usr/local/bin/pgbouncer
+COPY --chmod=0666 --chown=postgres compute/etc/pgbouncer.ini /etc/pgbouncer.ini
+
+# Metrics exporter binaries and configuration files
+COPY --from=postgres-exporter /bin/postgres_exporter /bin/postgres_exporter
+COPY --from=sql-exporter      /bin/sql_exporter      /bin/sql_exporter
+
+COPY --chmod=0644 compute/etc/sql_exporter.yml               /etc/sql_exporter.yml
+COPY --chmod=0644 compute/etc/neon_collector.yml             /etc/neon_collector.yml
+COPY --chmod=0644 compute/etc/sql_exporter_autoscaling.yml   /etc/sql_exporter_autoscaling.yml
+COPY --chmod=0644 compute/etc/neon_collector_autoscaling.yml /etc/neon_collector_autoscaling.yml
+
 # Create remote extension download directory
 RUN mkdir /usr/local/download_extensions && chown -R postgres:postgres /usr/local/download_extensions
 
+
 # Install:
 # libreadline8 for psql
 # libicu67, locales for collations (including ICU and plpgsql_check)
diff --git a/compute/README.md b/compute/README.md
index 981d290fc0..bb1e42ab53 100644
--- a/compute/README.md
+++ b/compute/README.md
@@ -8,6 +8,9 @@ vm-image-spec.yaml
 	Instructions for vm-builder, to turn the compute-node image into
 	corresponding vm-compute-node image.
 
+etc/
+	Configuration files included in /etc in the compute image
+
 patches/
 	Some extensions need to be patched to work with Neon. This
 	directory contains such patches. They are applied to the extension
diff --git a/compute/etc/neon_collector.yml b/compute/etc/neon_collector.yml
new file mode 100644
index 0000000000..29be0958dd
--- /dev/null
+++ b/compute/etc/neon_collector.yml
@@ -0,0 +1,247 @@
+collector_name: neon_collector
+metrics:
+- metric_name: lfc_misses
+  type: gauge
+  help: 'lfc_misses'
+  key_labels:
+  values: [lfc_misses]
+  query: |
+    select lfc_value as lfc_misses from neon.neon_lfc_stats where lfc_key='file_cache_misses';
+
+- metric_name: lfc_used
+  type: gauge
+  help: 'LFC chunks used (chunk = 1MB)'
+  key_labels:
+  values: [lfc_used]
+  query: |
+    select lfc_value as lfc_used from neon.neon_lfc_stats where lfc_key='file_cache_used';
+
+- metric_name: lfc_hits
+  type: gauge
+  help: 'lfc_hits'
+  key_labels:
+  values: [lfc_hits]
+  query: |
+    select lfc_value as lfc_hits from neon.neon_lfc_stats where lfc_key='file_cache_hits';
+
+- metric_name: lfc_writes
+  type: gauge
+  help: 'lfc_writes'
+  key_labels:
+  values: [lfc_writes]
+  query: |
+    select lfc_value as lfc_writes from neon.neon_lfc_stats where lfc_key='file_cache_writes';
+
+- metric_name: lfc_cache_size_limit
+  type: gauge
+  help: 'LFC cache size limit in bytes'
+  key_labels:
+  values: [lfc_cache_size_limit]
+  query: |
+    select pg_size_bytes(current_setting('neon.file_cache_size_limit')) as lfc_cache_size_limit;
+
+- metric_name: connection_counts
+  type: gauge
+  help: 'Connection counts'
+  key_labels:
+    - datname
+    - state
+  values: [count]
+  query: |
+    select datname, state, count(*) as count from pg_stat_activity where state <> '' group by datname, state;
+
+- metric_name: pg_stats_userdb
+  type: gauge
+  help: 'Stats for several oldest non-system dbs'
+  key_labels:
+    - datname
+  value_label: kind
+  values:
+    - db_size
+    - deadlocks
+    # Rows
+    - inserted
+    - updated
+    - deleted
+  # We export stats for 10 non-system databases. Without this limit
+  # it is too easy to abuse the system by creating lots of databases.
+  query: |
+    select pg_database_size(datname) as db_size, deadlocks,
+       tup_inserted as inserted, tup_updated as updated, tup_deleted as deleted,
+       datname
+     from pg_stat_database
+     where datname IN (
+       select datname
+       from pg_database
+       where datname <> 'postgres' and not datistemplate
+       order by oid
+       limit 10
+     );
+
+- metric_name: max_cluster_size
+  type: gauge
+  help: 'neon.max_cluster_size setting'
+  key_labels:
+  values: [max_cluster_size]
+  query: |
+    select setting::int as max_cluster_size from pg_settings where name = 'neon.max_cluster_size';
+
+- metric_name: db_total_size
+  type: gauge
+  help: 'Size of all databases'
+  key_labels:
+  values: [total]
+  query: |
+    select sum(pg_database_size(datname)) as total from pg_database;
+
+# DEPRECATED
+- metric_name: lfc_approximate_working_set_size
+  type: gauge
+  help: 'Approximate working set size in pages of 8192 bytes'
+  key_labels:
+  values: [approximate_working_set_size]
+  query: |
+    select neon.approximate_working_set_size(false) as approximate_working_set_size;
+
+- metric_name: lfc_approximate_working_set_size_windows
+  type: gauge
+  help: 'Approximate working set size in pages of 8192 bytes'
+  key_labels: [duration]
+  values: [size]
+  # NOTE: This is the "public" / "human-readable" version. Here, we supply a small selection
+  # of durations in a pretty-printed form.
+  query: |
+    select
+      x as duration,
+      neon.approximate_working_set_size_seconds(extract('epoch' from x::interval)::int) as size
+    from
+      (values ('5m'),('15m'),('1h')) as t (x);
+
+- metric_name: compute_current_lsn
+  type: gauge
+  help: 'Current LSN of the database'
+  key_labels:
+  values: [lsn]
+  query: |
+    select
+      case
+        when pg_catalog.pg_is_in_recovery()
+        then (pg_last_wal_replay_lsn() - '0/0')::FLOAT8
+        else (pg_current_wal_lsn() - '0/0')::FLOAT8
+      end as lsn;
+
+- metric_name: compute_receive_lsn
+  type: gauge
+  help: 'Returns the last write-ahead log location that has been received and synced to disk by streaming replication'
+  key_labels:
+  values: [lsn]
+  query: |
+    SELECT
+      CASE
+        WHEN pg_catalog.pg_is_in_recovery()
+        THEN (pg_last_wal_receive_lsn() - '0/0')::FLOAT8
+        ELSE 0
+      END AS lsn;
+
+- metric_name: replication_delay_bytes
+  type: gauge
+  help: 'Bytes between received and replayed LSN'
+  key_labels:
+  values: [replication_delay_bytes]
+  # We use a GREATEST call here because this calculation can be negative.
+  # The calculation is not atomic, meaning after we've gotten the receive
+  # LSN, the replay LSN may have advanced past the receive LSN we
+  # are using for the calculation.
+  query: |
+    SELECT GREATEST(0, pg_wal_lsn_diff(pg_last_wal_receive_lsn(), pg_last_wal_replay_lsn())) AS replication_delay_bytes;
+
+- metric_name: replication_delay_seconds
+  type: gauge
+  help: 'Time since last LSN was replayed'
+  key_labels:
+  values: [replication_delay_seconds]
+  query: |
+    SELECT
+      CASE
+        WHEN pg_last_wal_receive_lsn() = pg_last_wal_replay_lsn() THEN 0
+        ELSE GREATEST (0, EXTRACT (EPOCH FROM now() - pg_last_xact_replay_timestamp()))
+     END AS replication_delay_seconds;
+
+- metric_name: checkpoints_req
+  type: gauge
+  help: 'Number of requested checkpoints'
+  key_labels:
+  values: [checkpoints_req]
+  query: |
+    SELECT checkpoints_req FROM pg_stat_bgwriter;
+
+- metric_name: checkpoints_timed
+  type: gauge
+  help: 'Number of scheduled checkpoints'
+  key_labels:
+  values: [checkpoints_timed]
+  query: |
+    SELECT checkpoints_timed FROM pg_stat_bgwriter;
+
+- metric_name: compute_logical_snapshot_files
+  type: gauge
+  help: 'Number of snapshot files in pg_logical/snapshot'
+  key_labels:
+    - timeline_id
+  values: [num_logical_snapshot_files]
+  query: |
+    SELECT
+      (SELECT setting FROM pg_settings WHERE name = 'neon.timeline_id') AS timeline_id,
+      -- Postgres creates temporary snapshot files of the form %X-%X.snap.%d.tmp. These
+      -- temporary snapshot files are renamed to the actual snapshot files after they are
+      -- completely built. We only WAL-log the completely built snapshot files.
+      (SELECT COUNT(*) FROM pg_ls_logicalsnapdir() WHERE name LIKE '%.snap') AS num_logical_snapshot_files;
+
+# In all the below metrics, we cast LSNs to floats because Prometheus only supports floats.
+# It's probably fine because float64 can store integers from -2^53 to +2^53 exactly.
+
+# Number of slots is limited by max_replication_slots, so collecting position for all of them shouldn't be bad.
+- metric_name: logical_slot_restart_lsn
+  type: gauge
+  help: 'restart_lsn of logical slots'
+  key_labels:
+    - slot_name
+  values: [restart_lsn]
+  query: |
+    select slot_name, (restart_lsn - '0/0')::FLOAT8 as restart_lsn
+    from pg_replication_slots
+    where slot_type = 'logical';
+
+- metric_name: compute_subscriptions_count
+  type: gauge
+  help: 'Number of logical replication subscriptions grouped by enabled/disabled'
+  key_labels:
+    - enabled
+  values: [subscriptions_count]
+  query: |
+    select subenabled::text as enabled, count(*) as subscriptions_count
+    from pg_subscription
+    group by subenabled;
+
+- metric_name: retained_wal
+  type: gauge
+  help: 'Retained WAL in inactive replication slots'
+  key_labels:
+    - slot_name
+  values: [retained_wal]
+  query: |
+    SELECT slot_name, pg_wal_lsn_diff(pg_current_wal_lsn(), restart_lsn)::FLOAT8 AS retained_wal
+    FROM pg_replication_slots
+    WHERE active = false;
+
+- metric_name: wal_is_lost
+  type: gauge
+  help: 'Whether or not the replication slot wal_status is lost'
+  key_labels:
+    - slot_name
+  values: [wal_is_lost]
+  query: |
+    SELECT slot_name,
+           CASE WHEN wal_status = 'lost' THEN 1 ELSE 0 END AS wal_is_lost
+    FROM pg_replication_slots;
+
diff --git a/compute/etc/neon_collector_autoscaling.yml b/compute/etc/neon_collector_autoscaling.yml
new file mode 100644
index 0000000000..5616264eba
--- /dev/null
+++ b/compute/etc/neon_collector_autoscaling.yml
@@ -0,0 +1,55 @@
+collector_name: neon_collector_autoscaling
+metrics:
+- metric_name: lfc_misses
+  type: gauge
+  help: 'lfc_misses'
+  key_labels:
+  values: [lfc_misses]
+  query: |
+    select lfc_value as lfc_misses from neon.neon_lfc_stats where lfc_key='file_cache_misses';
+
+- metric_name: lfc_used
+  type: gauge
+  help: 'LFC chunks used (chunk = 1MB)'
+  key_labels:
+  values: [lfc_used]
+  query: |
+    select lfc_value as lfc_used from neon.neon_lfc_stats where lfc_key='file_cache_used';
+
+- metric_name: lfc_hits
+  type: gauge
+  help: 'lfc_hits'
+  key_labels:
+  values: [lfc_hits]
+  query: |
+    select lfc_value as lfc_hits from neon.neon_lfc_stats where lfc_key='file_cache_hits';
+
+- metric_name: lfc_writes
+  type: gauge
+  help: 'lfc_writes'
+  key_labels:
+  values: [lfc_writes]
+  query: |
+    select lfc_value as lfc_writes from neon.neon_lfc_stats where lfc_key='file_cache_writes';
+
+- metric_name: lfc_cache_size_limit
+  type: gauge
+  help: 'LFC cache size limit in bytes'
+  key_labels:
+  values: [lfc_cache_size_limit]
+  query: |
+    select pg_size_bytes(current_setting('neon.file_cache_size_limit')) as lfc_cache_size_limit;
+
+- metric_name: lfc_approximate_working_set_size_windows
+  type: gauge
+  help: 'Approximate working set size in pages of 8192 bytes'
+  key_labels: [duration_seconds]
+  values: [size]
+  # NOTE: This is the "internal" / "machine-readable" version. This outputs the working set
+  # size looking back 1..60 minutes, labeled with the window length in seconds.
+  query: |
+    select
+      x::text as duration_seconds,
+      neon.approximate_working_set_size_seconds(x) as size
+    from
+      (select generate_series * 60 as x from generate_series(1, 60)) as t (x);
diff --git a/compute/etc/pgbouncer.ini b/compute/etc/pgbouncer.ini
new file mode 100644
index 0000000000..cb994f961c
--- /dev/null
+++ b/compute/etc/pgbouncer.ini
@@ -0,0 +1,17 @@
+[databases]
+*=host=localhost port=5432 auth_user=cloud_admin
+[pgbouncer]
+listen_port=6432
+listen_addr=0.0.0.0
+auth_type=scram-sha-256
+auth_user=cloud_admin
+auth_dbname=postgres
+client_tls_sslmode=disable
+server_tls_sslmode=disable
+pool_mode=transaction
+max_client_conn=10000
+default_pool_size=64
+max_prepared_statements=0
+admin_users=postgres
+unix_socket_dir=/tmp/
+unix_socket_mode=0777
diff --git a/compute/etc/sql_exporter.yml b/compute/etc/sql_exporter.yml
new file mode 100644
index 0000000000..139d04468a
--- /dev/null
+++ b/compute/etc/sql_exporter.yml
@@ -0,0 +1,33 @@
+# Configuration for sql_exporter
+# Global defaults.
+global:
+  # If scrape_timeout <= 0, no timeout is set unless Prometheus provides one. The default is 10s.
+  scrape_timeout: 10s
+  # Subtracted from Prometheus' scrape_timeout to give us some headroom and prevent Prometheus from timing out first.
+  scrape_timeout_offset: 500ms
+  # Minimum interval between collector runs: by default (0s) collectors are executed on every scrape.
+  min_interval: 0s
+  # Maximum number of open connections to any one target. Metric queries will run concurrently on multiple connections,
+  # as will concurrent scrapes.
+  max_connections: 1
+  # Maximum number of idle connections to any one target. Unless you use very long collection intervals, this should
+  # always be the same as max_connections.
+  max_idle_connections: 1
+  # Maximum amount of time a connection may be reused. Expired connections may be closed lazily before reuse.
+  # If 0, connections are not closed due to a connection's age.
+  max_connection_lifetime: 5m
+
+# The target to monitor and the collectors to execute on it.
+target:
+  # Data source name always has a URI schema that matches the driver name. In some cases (e.g. MySQL)
+  # the schema gets dropped or replaced to match the driver expected DSN format.
+  data_source_name: 'postgresql://cloud_admin@127.0.0.1:5432/postgres?sslmode=disable&application_name=sql_exporter'
+
+  # Collectors (referenced by name) to execute on the target.
+  # Glob patterns are supported (see <https://pkg.go.dev/path/filepath#Match> for syntax).
+  collectors: [neon_collector]
+
+# Collector files specifies a list of globs. One collector definition is read from each matching file.
+# Glob patterns are supported (see <https://pkg.go.dev/path/filepath#Match> for syntax).
+collector_files:
+  - "neon_collector.yml"
diff --git a/compute/etc/sql_exporter_autoscaling.yml b/compute/etc/sql_exporter_autoscaling.yml
new file mode 100644
index 0000000000..044557233e
--- /dev/null
+++ b/compute/etc/sql_exporter_autoscaling.yml
@@ -0,0 +1,33 @@
+# Configuration for sql_exporter for autoscaling-agent
+# Global defaults.
+global:
+  # If scrape_timeout <= 0, no timeout is set unless Prometheus provides one. The default is 10s.
+  scrape_timeout: 10s
+  # Subtracted from Prometheus' scrape_timeout to give us some headroom and prevent Prometheus from timing out first.
+  scrape_timeout_offset: 500ms
+  # Minimum interval between collector runs: by default (0s) collectors are executed on every scrape.
+  min_interval: 0s
+  # Maximum number of open connections to any one target. Metric queries will run concurrently on multiple connections,
+  # as will concurrent scrapes.
+  max_connections: 1
+  # Maximum number of idle connections to any one target. Unless you use very long collection intervals, this should
+  # always be the same as max_connections.
+  max_idle_connections: 1
+  # Maximum amount of time a connection may be reused. Expired connections may be closed lazily before reuse.
+  # If 0, connections are not closed due to a connection's age.
+  max_connection_lifetime: 5m
+
+# The target to monitor and the collectors to execute on it.
+target:
+  # Data source name always has a URI schema that matches the driver name. In some cases (e.g. MySQL)
+  # the schema gets dropped or replaced to match the driver expected DSN format.
+  data_source_name: 'postgresql://cloud_admin@127.0.0.1:5432/postgres?sslmode=disable&application_name=sql_exporter_autoscaling'
+
+  # Collectors (referenced by name) to execute on the target.
+  # Glob patterns are supported (see <https://pkg.go.dev/path/filepath#Match> for syntax).
+  collectors: [neon_collector_autoscaling]
+
+# Collector files specifies a list of globs. One collector definition is read from each matching file.
+# Glob patterns are supported (see <https://pkg.go.dev/path/filepath#Match> for syntax).
+collector_files:
+  - "neon_collector_autoscaling.yml"
diff --git a/compute/vm-image-spec.yaml b/compute/vm-image-spec.yaml
index c94f95f447..0af44745e5 100644
--- a/compute/vm-image-spec.yaml
+++ b/compute/vm-image-spec.yaml
@@ -35,25 +35,6 @@ files:
       # Allow postgres user (which is what compute_ctl runs as) to run /neonvm/bin/resize-swap
       # as root without requiring entering a password (NOPASSWD), regardless of hostname (ALL)
       postgres ALL=(root) NOPASSWD: /neonvm/bin/resize-swap
-  - filename: pgbouncer.ini
-    content: |
-      [databases]
-      *=host=localhost port=5432 auth_user=cloud_admin
-      [pgbouncer]
-      listen_port=6432
-      listen_addr=0.0.0.0
-      auth_type=scram-sha-256
-      auth_user=cloud_admin
-      auth_dbname=postgres
-      client_tls_sslmode=disable
-      server_tls_sslmode=disable
-      pool_mode=transaction
-      max_client_conn=10000
-      default_pool_size=64
-      max_prepared_statements=0
-      admin_users=postgres
-      unix_socket_dir=/tmp/
-      unix_socket_mode=0777
   - filename: cgconfig.conf
     content: |
       # Configuration for cgroups in VM compute nodes
@@ -68,385 +49,6 @@ files:
           }
           memory {}
       }
-  - filename: sql_exporter.yml
-    content: |
-      # Configuration for sql_exporter
-      # Global defaults.
-      global:
-        # If scrape_timeout <= 0, no timeout is set unless Prometheus provides one. The default is 10s.
-        scrape_timeout: 10s
-        # Subtracted from Prometheus' scrape_timeout to give us some headroom and prevent Prometheus from timing out first.
-        scrape_timeout_offset: 500ms
-        # Minimum interval between collector runs: by default (0s) collectors are executed on every scrape.
-        min_interval: 0s
-        # Maximum number of open connections to any one target. Metric queries will run concurrently on multiple connections,
-        # as will concurrent scrapes.
-        max_connections: 1
-        # Maximum number of idle connections to any one target. Unless you use very long collection intervals, this should
-        # always be the same as max_connections.
-        max_idle_connections: 1
-        # Maximum number of maximum amount of time a connection may be reused. Expired connections may be closed lazily before reuse.
-        # If 0, connections are not closed due to a connection's age.
-        max_connection_lifetime: 5m
-
-      # The target to monitor and the collectors to execute on it.
-      target:
-        # Data source name always has a URI schema that matches the driver name. In some cases (e.g. MySQL)
-        # the schema gets dropped or replaced to match the driver expected DSN format.
-        data_source_name: 'postgresql://cloud_admin@127.0.0.1:5432/postgres?sslmode=disable&application_name=sql_exporter'
-
-        # Collectors (referenced by name) to execute on the target.
-        # Glob patterns are supported (see <https://pkg.go.dev/path/filepath#Match> for syntax).
-        collectors: [neon_collector]
-
-      # Collector files specifies a list of globs. One collector definition is read from each matching file.
-      # Glob patterns are supported (see <https://pkg.go.dev/path/filepath#Match> for syntax).
-      collector_files:
-        - "neon_collector.yml"
-  - filename: sql_exporter_autoscaling.yml
-    content: |
-      # Configuration for sql_exporter for autoscaling-agent
-      # Global defaults.
-      global:
-        # If scrape_timeout <= 0, no timeout is set unless Prometheus provides one. The default is 10s.
-        scrape_timeout: 10s
-        # Subtracted from Prometheus' scrape_timeout to give us some headroom and prevent Prometheus from timing out first.
-        scrape_timeout_offset: 500ms
-        # Minimum interval between collector runs: by default (0s) collectors are executed on every scrape.
-        min_interval: 0s
-        # Maximum number of open connections to any one target. Metric queries will run concurrently on multiple connections,
-        # as will concurrent scrapes.
-        max_connections: 1
-        # Maximum number of idle connections to any one target. Unless you use very long collection intervals, this should
-        # always be the same as max_connections.
-        max_idle_connections: 1
-        # Maximum number of maximum amount of time a connection may be reused. Expired connections may be closed lazily before reuse.
-        # If 0, connections are not closed due to a connection's age.
-        max_connection_lifetime: 5m
-
-      # The target to monitor and the collectors to execute on it.
-      target:
-        # Data source name always has a URI schema that matches the driver name. In some cases (e.g. MySQL)
-        # the schema gets dropped or replaced to match the driver expected DSN format.
-        data_source_name: 'postgresql://cloud_admin@127.0.0.1:5432/postgres?sslmode=disable&application_name=sql_exporter_autoscaling'
-
-        # Collectors (referenced by name) to execute on the target.
-        # Glob patterns are supported (see <https://pkg.go.dev/path/filepath#Match> for syntax).
-        collectors: [neon_collector_autoscaling]
-
-      # Collector files specifies a list of globs. One collector definition is read from each matching file.
-      # Glob patterns are supported (see <https://pkg.go.dev/path/filepath#Match> for syntax).
-      collector_files:
-        - "neon_collector_autoscaling.yml"
-  - filename: neon_collector.yml
-    content: |
-      collector_name: neon_collector
-      metrics:
-      - metric_name: lfc_misses
-        type: gauge
-        help: 'lfc_misses'
-        key_labels:
-        values: [lfc_misses]
-        query: |
-          select lfc_value as lfc_misses from neon.neon_lfc_stats where lfc_key='file_cache_misses';
-
-      - metric_name: lfc_used
-        type: gauge
-        help: 'LFC chunks used (chunk = 1MB)'
-        key_labels:
-        values: [lfc_used]
-        query: |
-          select lfc_value as lfc_used from neon.neon_lfc_stats where lfc_key='file_cache_used';
-
-      - metric_name: lfc_hits
-        type: gauge
-        help: 'lfc_hits'
-        key_labels:
-        values: [lfc_hits]
-        query: |
-          select lfc_value as lfc_hits from neon.neon_lfc_stats where lfc_key='file_cache_hits';
-
-      - metric_name: lfc_writes
-        type: gauge
-        help: 'lfc_writes'
-        key_labels:
-        values: [lfc_writes]
-        query: |
-          select lfc_value as lfc_writes from neon.neon_lfc_stats where lfc_key='file_cache_writes';
-
-      - metric_name: lfc_cache_size_limit
-        type: gauge
-        help: 'LFC cache size limit in bytes'
-        key_labels:
-        values: [lfc_cache_size_limit]
-        query: |
-          select pg_size_bytes(current_setting('neon.file_cache_size_limit')) as lfc_cache_size_limit;
-
-      - metric_name: connection_counts
-        type: gauge
-        help: 'Connection counts'
-        key_labels:
-          - datname
-          - state
-        values: [count]
-        query: |
-          select datname, state, count(*) as count from pg_stat_activity where state <> '' group by datname, state;
-
-      - metric_name: pg_stats_userdb
-        type: gauge
-        help: 'Stats for several oldest non-system dbs'
-        key_labels:
-          - datname
-        value_label: kind
-        values:
-          - db_size
-          - deadlocks
-          # Rows
-          - inserted
-          - updated
-          - deleted
-        # We export stats for 10 non-system database. Without this limit
-        # it is too easy to abuse the system by creating lots of databases.
-        query: |
-          select pg_database_size(datname) as db_size, deadlocks,
-                 tup_inserted as inserted, tup_updated as updated, tup_deleted as deleted,
-                 datname
-            from pg_stat_database
-           where datname IN (
-             select datname
-               from pg_database
-              where datname <> 'postgres' and not datistemplate
-              order by oid
-              limit 10
-           );
-
-      - metric_name: max_cluster_size
-        type: gauge
-        help: 'neon.max_cluster_size setting'
-        key_labels:
-        values: [max_cluster_size]
-        query: |
-          select setting::int as max_cluster_size from pg_settings where name = 'neon.max_cluster_size';
-
-      - metric_name: db_total_size
-        type: gauge
-        help: 'Size of all databases'
-        key_labels:
-        values: [total]
-        query: |
-          select sum(pg_database_size(datname)) as total from pg_database;
-
-      # DEPRECATED
-      - metric_name: lfc_approximate_working_set_size
-        type: gauge
-        help: 'Approximate working set size in pages of 8192 bytes'
-        key_labels:
-        values: [approximate_working_set_size]
-        query: |
-          select neon.approximate_working_set_size(false) as approximate_working_set_size;
-
-      - metric_name: lfc_approximate_working_set_size_windows
-        type: gauge
-        help: 'Approximate working set size in pages of 8192 bytes'
-        key_labels: [duration]
-        values: [size]
-        # NOTE: This is the "public" / "human-readable" version. Here, we supply a small selection
-        # of durations in a pretty-printed form.
-        query: |
-          select
-            x as duration,
-            neon.approximate_working_set_size_seconds(extract('epoch' from x::interval)::int) as size
-          from
-            (values ('5m'),('15m'),('1h')) as t (x);
-
-      - metric_name: compute_current_lsn
-        type: gauge
-        help: 'Current LSN of the database'
-        key_labels:
-        values: [lsn]
-        query: |
-          select
-            case
-              when pg_catalog.pg_is_in_recovery()
-              then (pg_last_wal_replay_lsn() - '0/0')::FLOAT8
-              else (pg_current_wal_lsn() - '0/0')::FLOAT8
-            end as lsn;
-
-      - metric_name: compute_receive_lsn
-        type: gauge
-        help: 'Returns the last write-ahead log location that has been received and synced to disk by streaming replication'
-        key_labels:
-        values: [lsn]
-        query: |
-          SELECT
-            CASE
-              WHEN pg_catalog.pg_is_in_recovery()
-              THEN (pg_last_wal_receive_lsn() - '0/0')::FLOAT8
-              ELSE 0
-            END AS lsn;
-
-      - metric_name: replication_delay_bytes
-        type: gauge
-        help: 'Bytes between received and replayed LSN'
-        key_labels:
-        values: [replication_delay_bytes]
-        # We use a GREATEST call here because this calculation can be negative.
-        # The calculation is not atomic, meaning after we've gotten the receive
-        # LSN, the replay LSN may have advanced past the receive LSN we
-        # are using for the calculation.
-        query: |
-          SELECT GREATEST(0, pg_wal_lsn_diff(pg_last_wal_receive_lsn(), pg_last_wal_replay_lsn())) AS replication_delay_bytes;
-
-      - metric_name: replication_delay_seconds
-        type: gauge
-        help: 'Time since last LSN was replayed'
-        key_labels:
-        values: [replication_delay_seconds]
-        query: |
-          SELECT
-            CASE
-              WHEN pg_last_wal_receive_lsn() = pg_last_wal_replay_lsn() THEN 0
-              ELSE GREATEST (0, EXTRACT (EPOCH FROM now() - pg_last_xact_replay_timestamp()))
-            END AS replication_delay_seconds;
-
-      - metric_name: checkpoints_req
-        type: gauge
-        help: 'Number of requested checkpoints'
-        key_labels:
-        values: [checkpoints_req]
-        query: |
-          SELECT checkpoints_req FROM pg_stat_bgwriter;
-
-      - metric_name: checkpoints_timed
-        type: gauge
-        help: 'Number of scheduled checkpoints'
-        key_labels:
-        values: [checkpoints_timed]
-        query: |
-          SELECT checkpoints_timed FROM pg_stat_bgwriter;
-
-      - metric_name: compute_logical_snapshot_files
-        type: gauge
-        help: 'Number of snapshot files in pg_logical/snapshot'
-        key_labels:
-          - timeline_id
-        values: [num_logical_snapshot_files]
-        query: |
-          SELECT
-            (SELECT setting FROM pg_settings WHERE name = 'neon.timeline_id') AS timeline_id,
-            -- Postgres creates temporary snapshot files of the form %X-%X.snap.%d.tmp. These
-            -- temporary snapshot files are renamed to the actual snapshot files after they are
-            -- completely built. We only WAL-log the completely built snapshot files.
-            (SELECT COUNT(*) FROM pg_ls_logicalsnapdir() WHERE name LIKE '%.snap') AS num_logical_snapshot_files;
-
-      # In all the below metrics, we cast LSNs to floats because Prometheus only supports floats.
-      # It's probably fine because float64 can store integers from -2^53 to +2^53 exactly.
-
-      # Number of slots is limited by max_replication_slots, so collecting position for all of them shouldn't be bad.
-      - metric_name: logical_slot_restart_lsn
-        type: gauge
-        help: 'restart_lsn of logical slots'
-        key_labels:
-          - slot_name
-        values: [restart_lsn]
-        query: |
-          select slot_name, (restart_lsn - '0/0')::FLOAT8 as restart_lsn
-          from pg_replication_slots
-          where slot_type = 'logical';
-
-      - metric_name: compute_subscriptions_count
-        type: gauge
-        help: 'Number of logical replication subscriptions grouped by enabled/disabled'
-        key_labels:
-          - enabled
-        values: [subscriptions_count]
-        query: |
-          select subenabled::text as enabled, count(*) as subscriptions_count
-          from pg_subscription
-          group by subenabled;
-
-      - metric_name: retained_wal
-        type: gauge
-        help: 'Retained WAL in inactive replication slots'
-        key_labels:
-          - slot_name
-        values: [retained_wal]
-        query: |
-          SELECT slot_name, pg_wal_lsn_diff(pg_current_wal_lsn(), restart_lsn)::FLOAT8 AS retained_wal
-          FROM pg_replication_slots
-          WHERE active = false;
-
-      - metric_name: wal_is_lost
-        type: gauge
-        help: 'Whether or not the replication slot wal_status is lost'
-        key_labels:
-          - slot_name
-        values: [wal_is_lost]
-        query: |
-          SELECT slot_name,
-          CASE
-            WHEN wal_status = 'lost' THEN 1
-            ELSE 0
-          END AS wal_is_lost
-          FROM pg_replication_slots;
-
-  - filename: neon_collector_autoscaling.yml
-    content: |
-      collector_name: neon_collector_autoscaling
-      metrics:
-      - metric_name: lfc_misses
-        type: gauge
-        help: 'lfc_misses'
-        key_labels:
-        values: [lfc_misses]
-        query: |
-          select lfc_value as lfc_misses from neon.neon_lfc_stats where lfc_key='file_cache_misses';
-
-      - metric_name: lfc_used
-        type: gauge
-        help: 'LFC chunks used (chunk = 1MB)'
-        key_labels:
-        values: [lfc_used]
-        query: |
-          select lfc_value as lfc_used from neon.neon_lfc_stats where lfc_key='file_cache_used';
-
-      - metric_name: lfc_hits
-        type: gauge
-        help: 'lfc_hits'
-        key_labels:
-        values: [lfc_hits]
-        query: |
-          select lfc_value as lfc_hits from neon.neon_lfc_stats where lfc_key='file_cache_hits';
-
-      - metric_name: lfc_writes
-        type: gauge
-        help: 'lfc_writes'
-        key_labels:
-        values: [lfc_writes]
-        query: |
-          select lfc_value as lfc_writes from neon.neon_lfc_stats where lfc_key='file_cache_writes';
-
-      - metric_name: lfc_cache_size_limit
-        type: gauge
-        help: 'LFC cache size limit in bytes'
-        key_labels:
-        values: [lfc_cache_size_limit]
-        query: |
-          select pg_size_bytes(current_setting('neon.file_cache_size_limit')) as lfc_cache_size_limit;
-
-      - metric_name: lfc_approximate_working_set_size_windows
-        type: gauge
-        help: 'Approximate working set size in pages of 8192 bytes'
-        key_labels: [duration_seconds]
-        values: [size]
-        # NOTE: This is the "internal" / "machine-readable" version. This outputs the working set
-        # size looking back 1..60 minutes, labeled with the number of minutes.
-        query: |
-          select
-            x::text as duration_seconds,
-            neon.approximate_working_set_size_seconds(x) as size
-          from
-            (select generate_series * 60 as x from generate_series(1, 60)) as t (x);
 build: |
   # Build cgroup-tools
   #
@@ -480,32 +82,6 @@ build: |
       && CFLAGS="-O3" ./configure --prefix="$INSTALL_DIR" --sysconfdir=/etc --localstatedir=/var --enable-opaque-hierarchy="name=systemd" \
       # actually build the thing...
       && make install
-
-  FROM quay.io/prometheuscommunity/postgres-exporter:v0.12.1 AS postgres-exporter
-
-  FROM burningalchemist/sql_exporter:0.13 AS sql-exporter
-
-  # Build pgbouncer
-  #
-  FROM debian:bullseye-slim AS pgbouncer
-  RUN set -e \
-      && apt-get update \
-      && apt-get install -y \
-          build-essential \
-          git \
-          libevent-dev \
-          libtool \
-          pkg-config
-
-  # Use `dist_man_MANS=` to skip manpage generation (which requires python3/pandoc)
-  ENV PGBOUNCER_TAG=pgbouncer_1_22_1
-  RUN set -e \
-      && git clone --recurse-submodules --depth 1 --branch ${PGBOUNCER_TAG} https://github.com/pgbouncer/pgbouncer.git pgbouncer \
-      && cd pgbouncer \
-      && ./autogen.sh \
-      && LDFLAGS=-static ./configure --prefix=/usr/local/pgbouncer --without-openssl \
-      && make -j $(nproc) dist_man_MANS= \
-      && make install dist_man_MANS=
 merge: |
   # tweak nofile limits
   RUN set -e \
@@ -527,24 +103,10 @@ merge: |
   COPY compute_ctl-resize-swap /etc/sudoers.d/compute_ctl-resize-swap
 
   COPY cgconfig.conf /etc/cgconfig.conf
-  COPY pgbouncer.ini /etc/pgbouncer.ini
-  COPY sql_exporter.yml /etc/sql_exporter.yml
-  COPY neon_collector.yml /etc/neon_collector.yml
-  COPY sql_exporter_autoscaling.yml /etc/sql_exporter_autoscaling.yml
-  COPY neon_collector_autoscaling.yml /etc/neon_collector_autoscaling.yml
 
   RUN set -e \
-      && chown postgres:postgres /etc/pgbouncer.ini \
-      && chmod 0666 /etc/pgbouncer.ini \
-      && chmod 0644 /etc/cgconfig.conf \
-      && chmod 0644 /etc/sql_exporter.yml \
-      && chmod 0644 /etc/neon_collector.yml \
-      && chmod 0644 /etc/sql_exporter_autoscaling.yml \
-      && chmod 0644 /etc/neon_collector_autoscaling.yml
+      && chmod 0644 /etc/cgconfig.conf
 
   COPY --from=libcgroup-builder /libcgroup-install/bin/*  /usr/bin/
   COPY --from=libcgroup-builder /libcgroup-install/lib/*  /usr/lib/
   COPY --from=libcgroup-builder /libcgroup-install/sbin/* /usr/sbin/
-  COPY --from=postgres-exporter /bin/postgres_exporter /bin/postgres_exporter
-  COPY --from=sql-exporter      /bin/sql_exporter      /bin/sql_exporter
-  COPY --from=pgbouncer         /usr/local/pgbouncer/bin/pgbouncer /usr/local/bin/pgbouncer

From 37aa6fd953285da7480cf23ab1ddfd2f6958b55e Mon Sep 17 00:00:00 2001
From: Yuchen Liang <70461588+yliang412@users.noreply.github.com>
Date: Mon, 23 Sep 2024 17:58:12 -0400
Subject: [PATCH 126/142] scrubber: retry when missing index key in the listing
 (#8873)

Part of #8128, fixes #8872.

## Problem

See #8872.

## Summary of changes

- Retry `list_timeline_blobs` one more time if
  - layer file keys are listed but the index key is not, or
  - the index download fails.
- Instrument code with `analyze-tenant` and `analyze-timeline` spans.
- Remove the `initdb_archive` check, since the archive could have been deleted.
- Return with exit code 1 on fatal error if the `--exit-code` parameter is set.

Signed-off-by: Yuchen Liang <yuchen@neon.tech>
---
 storage_scrubber/src/checks.rs                | 133 +++++++++++++-----
 storage_scrubber/src/main.rs                  |  15 ++
 .../src/scan_pageserver_metadata.rs           |  86 ++++++-----
 3 files changed, 163 insertions(+), 71 deletions(-)

diff --git a/storage_scrubber/src/checks.rs b/storage_scrubber/src/checks.rs
index de6918b3da..525f412b56 100644
--- a/storage_scrubber/src/checks.rs
+++ b/storage_scrubber/src/checks.rs
@@ -1,13 +1,12 @@
 use std::collections::{HashMap, HashSet};
 
-use anyhow::Context;
 use itertools::Itertools;
 use pageserver::tenant::checks::check_valid_layermap;
 use pageserver::tenant::layer_map::LayerMap;
 use pageserver::tenant::remote_timeline_client::index::LayerFileMetadata;
 use pageserver_api::shard::ShardIndex;
 use tokio_util::sync::CancellationToken;
-use tracing::{error, info, warn};
+use tracing::{info, warn};
 use utils::generation::Generation;
 use utils::id::TimelineId;
 
@@ -29,9 +28,8 @@ pub(crate) struct TimelineAnalysis {
     /// yet.
     pub(crate) warnings: Vec<String>,
 
-    /// Keys not referenced in metadata: candidates for removal, but NOT NECESSARILY: beware
-    /// of races between reading the metadata and reading the objects.
-    pub(crate) garbage_keys: Vec<String>,
+    /// Objects whose keys were not recognized at all, i.e. not layer files, not indices, and not initdb archive.
+    pub(crate) unknown_keys: Vec<String>,
 }
 
 impl TimelineAnalysis {
@@ -39,7 +37,7 @@ impl TimelineAnalysis {
         Self {
             errors: Vec::new(),
             warnings: Vec::new(),
-            garbage_keys: Vec::new(),
+            unknown_keys: Vec::new(),
         }
     }
 
@@ -59,7 +57,7 @@ pub(crate) async fn branch_cleanup_and_check_errors(
 ) -> TimelineAnalysis {
     let mut result = TimelineAnalysis::new();
 
-    info!("Checking timeline {id}");
+    info!("Checking timeline");
 
     if let Some(s3_active_branch) = s3_active_branch {
         info!(
@@ -80,7 +78,7 @@ pub(crate) async fn branch_cleanup_and_check_errors(
     match s3_data {
         Some(s3_data) => {
             result
-                .garbage_keys
+                .unknown_keys
                 .extend(s3_data.unknown_keys.into_iter().map(|k| k.key.to_string()));
 
             match s3_data.blob_data {
@@ -204,10 +202,10 @@ pub(crate) async fn branch_cleanup_and_check_errors(
         warn!("Timeline metadata warnings: {0:?}", result.warnings);
     }
 
-    if !result.garbage_keys.is_empty() {
-        error!(
-            "The following keys should be removed from S3: {0:?}",
-            result.garbage_keys
+    if !result.unknown_keys.is_empty() {
+        warn!(
+            "The following keys are not recognized: {0:?}",
+            result.unknown_keys
         )
     }
 
@@ -294,10 +292,10 @@ impl TenantObjectListing {
 pub(crate) struct RemoteTimelineBlobData {
     pub(crate) blob_data: BlobDataParseResult,
 
-    // Index objects that were not used when loading `blob_data`, e.g. those from old generations
+    /// Index objects that were not used when loading `blob_data`, e.g. those from old generations
     pub(crate) unused_index_keys: Vec<ListingObject>,
 
-    // Objects whose keys were not recognized at all, i.e. not layer files, not indices
+    /// Objects whose keys were not recognized at all, i.e. not layer files, not indices
     pub(crate) unknown_keys: Vec<ListingObject>,
 }
 
@@ -329,11 +327,54 @@ pub(crate) fn parse_layer_object_name(name: &str) -> Result<(LayerName, Generati
     }
 }
 
+/// Note (<https://github.com/neondatabase/neon/issues/8872>):
+/// Since we do not guarantee the order of the listing, we could list layer keys right before
+/// pageserver `RemoteTimelineClient` deletes the layer files and then the index.
+/// In this rare case, the listing would report a transient error where the index key is missing.
+///
+/// To avoid generating false positives, we try streaming the listing a second time.
 pub(crate) async fn list_timeline_blobs(
     remote_client: &GenericRemoteStorage,
     id: TenantShardTimelineId,
     root_target: &RootTarget,
 ) -> anyhow::Result<RemoteTimelineBlobData> {
+    let res = list_timeline_blobs_impl(remote_client, id, root_target).await?;
+    match res {
+        ListTimelineBlobsResult::Ready(data) => Ok(data),
+        ListTimelineBlobsResult::MissingIndexPart(_) => {
+            // Retry if index is missing.
+            let data = list_timeline_blobs_impl(remote_client, id, root_target)
+                .await?
+                .into_data();
+            Ok(data)
+        }
+    }
+}
+
+enum ListTimelineBlobsResult {
+    /// Blob data is ready to be interpreted.
+    Ready(RemoteTimelineBlobData),
+    /// List timeline blobs has layer files but is missing [`IndexPart`].
+    MissingIndexPart(RemoteTimelineBlobData),
+}
+
+impl ListTimelineBlobsResult {
+    /// Get the inner blob data regardless of the status.
+    pub fn into_data(self) -> RemoteTimelineBlobData {
+        match self {
+            ListTimelineBlobsResult::Ready(data) => data,
+            ListTimelineBlobsResult::MissingIndexPart(data) => data,
+        }
+    }
+}
+
+/// Returns [`ListTimelineBlobsResult::MissingIndexPart`] if blob data has layer files
+/// but is missing [`IndexPart`], otherwise returns [`ListTimelineBlobsResult::Ready`].
+async fn list_timeline_blobs_impl(
+    remote_client: &GenericRemoteStorage,
+    id: TenantShardTimelineId,
+    root_target: &RootTarget,
+) -> anyhow::Result<ListTimelineBlobsResult> {
     let mut s3_layers = HashSet::new();
 
     let mut errors = Vec::new();
@@ -375,30 +416,28 @@ pub(crate) async fn list_timeline_blobs(
                     s3_layers.insert((new_layer, gen));
                 }
                 Err(e) => {
-                    tracing::info!("Error parsing key {maybe_layer_name}");
-                    errors.push(
-                        format!("S3 list response got an object with key {key} that is not a layer name: {e}"),
-                    );
+                    tracing::info!("Error parsing {maybe_layer_name} as layer name: {e}");
                     unknown_keys.push(obj);
                 }
             },
             None => {
-                tracing::warn!("Unknown key {key}");
-                errors.push(format!("S3 list response got an object with odd key {key}"));
+                tracing::info!("S3 listed an unknown key: {key}");
                 unknown_keys.push(obj);
             }
         }
     }
 
-    if index_part_keys.is_empty() && s3_layers.is_empty() && initdb_archive {
-        tracing::debug!(
-            "Timeline is empty apart from initdb archive: expected post-deletion state."
-        );
-        return Ok(RemoteTimelineBlobData {
+    if index_part_keys.is_empty() && s3_layers.is_empty() {
+        tracing::debug!("Timeline is empty: expected post-deletion state.");
+        if initdb_archive {
+            tracing::info!("Timeline is post deletion but initdb archive is still present.");
+        }
+
+        return Ok(ListTimelineBlobsResult::Ready(RemoteTimelineBlobData {
             blob_data: BlobDataParseResult::Relic,
             unused_index_keys: index_part_keys,
-            unknown_keys: Vec::new(),
-        });
+            unknown_keys,
+        }));
     }
 
     // Choose the index_part with the highest generation
@@ -424,19 +463,43 @@ pub(crate) async fn list_timeline_blobs(
     match index_part_object.as_ref() {
         Some(selected) => index_part_keys.retain(|k| k != selected),
         None => {
-            errors.push("S3 list response got no index_part.json file".to_string());
+            // It is possible that the branch gets deleted after we got some layer files listed
+            // and we no longer have the index file in the listing.
+            errors.push(
+                "S3 list response got no index_part.json file but still has layer files"
+                    .to_string(),
+            );
+            return Ok(ListTimelineBlobsResult::MissingIndexPart(
+                RemoteTimelineBlobData {
+                    blob_data: BlobDataParseResult::Incorrect { errors, s3_layers },
+                    unused_index_keys: index_part_keys,
+                    unknown_keys,
+                },
+            ));
         }
     }
 
     if let Some(index_part_object_key) = index_part_object.as_ref() {
         let index_part_bytes =
-            download_object_with_retries(remote_client, &index_part_object_key.key)
-                .await
-                .context("index_part.json download")?;
+            match download_object_with_retries(remote_client, &index_part_object_key.key).await {
+                Ok(index_part_bytes) => index_part_bytes,
+                Err(e) => {
+                    // It is possible that the branch gets deleted between listing the objects
+                    // and downloading the index part file.
+                    errors.push(format!("failed to download index_part.json: {e}"));
+                    return Ok(ListTimelineBlobsResult::MissingIndexPart(
+                        RemoteTimelineBlobData {
+                            blob_data: BlobDataParseResult::Incorrect { errors, s3_layers },
+                            unused_index_keys: index_part_keys,
+                            unknown_keys,
+                        },
+                    ));
+                }
+            };
 
         match serde_json::from_slice(&index_part_bytes) {
             Ok(index_part) => {
-                return Ok(RemoteTimelineBlobData {
+                return Ok(ListTimelineBlobsResult::Ready(RemoteTimelineBlobData {
                     blob_data: BlobDataParseResult::Parsed {
                         index_part: Box::new(index_part),
                         index_part_generation,
@@ -444,7 +507,7 @@ pub(crate) async fn list_timeline_blobs(
                     },
                     unused_index_keys: index_part_keys,
                     unknown_keys,
-                })
+                }))
             }
             Err(index_parse_error) => errors.push(format!(
                 "index_part.json body parsing error: {index_parse_error}"
@@ -458,9 +521,9 @@ pub(crate) async fn list_timeline_blobs(
         );
     }
 
-    Ok(RemoteTimelineBlobData {
+    Ok(ListTimelineBlobsResult::Ready(RemoteTimelineBlobData {
         blob_data: BlobDataParseResult::Incorrect { errors, s3_layers },
         unused_index_keys: index_part_keys,
         unknown_keys,
-    })
+    }))
 }
diff --git a/storage_scrubber/src/main.rs b/storage_scrubber/src/main.rs
index ee133e2e58..ee816534c6 100644
--- a/storage_scrubber/src/main.rs
+++ b/storage_scrubber/src/main.rs
@@ -41,6 +41,10 @@ struct Cli {
     #[arg(long)]
     /// JWT token for authenticating with storage controller.  Requires scope 'scrubber' or 'admin'.
     controller_jwt: Option<String>,
+
+    /// If set to true, the scrubber will exit with error code on fatal error.
+    #[arg(long, default_value_t = false)]
+    exit_code: bool,
 }
 
 #[derive(Subcommand, Debug)]
@@ -203,6 +207,7 @@ async fn main() -> anyhow::Result<()> {
                     tenant_ids,
                     json,
                     post_to_storcon,
+                    cli.exit_code,
                 )
                 .await
             }
@@ -269,6 +274,7 @@ async fn main() -> anyhow::Result<()> {
                 gc_min_age,
                 gc_mode,
                 post_to_storcon,
+                cli.exit_code,
             )
             .await
         }
@@ -284,6 +290,7 @@ pub async fn run_cron_job(
     gc_min_age: humantime::Duration,
     gc_mode: GcMode,
     post_to_storcon: bool,
+    exit_code: bool,
 ) -> anyhow::Result<()> {
     tracing::info!(%gc_min_age, %gc_mode, "Running pageserver-physical-gc");
     pageserver_physical_gc_cmd(
@@ -301,6 +308,7 @@ pub async fn run_cron_job(
         Vec::new(),
         true,
         post_to_storcon,
+        exit_code,
     )
     .await?;
 
@@ -349,6 +357,7 @@ pub async fn scan_pageserver_metadata_cmd(
     tenant_shard_ids: Vec<TenantShardId>,
     json: bool,
     post_to_storcon: bool,
+    exit_code: bool,
 ) -> anyhow::Result<()> {
     if controller_client.is_none() && post_to_storcon {
         return Err(anyhow!("Posting pageserver scan health status to storage controller requires `--controller-api` and `--controller-jwt` to run"));
@@ -380,6 +389,9 @@ pub async fn scan_pageserver_metadata_cmd(
 
             if summary.is_fatal() {
                 tracing::error!("Fatal scrub errors detected");
+                if exit_code {
+                    std::process::exit(1);
+                }
             } else if summary.is_empty() {
                 // Strictly speaking an empty bucket is a valid bucket, but if someone ran the
                 // scrubber they were likely expecting to scan something, and if we see no timelines
@@ -391,6 +403,9 @@ pub async fn scan_pageserver_metadata_cmd(
                         .prefix_in_bucket
                         .unwrap_or("<none>".to_string())
                 );
+                if exit_code {
+                    std::process::exit(1);
+                }
             }
 
             Ok(())
diff --git a/storage_scrubber/src/scan_pageserver_metadata.rs b/storage_scrubber/src/scan_pageserver_metadata.rs
index 151ef27672..c1ea589f7f 100644
--- a/storage_scrubber/src/scan_pageserver_metadata.rs
+++ b/storage_scrubber/src/scan_pageserver_metadata.rs
@@ -12,6 +12,7 @@ use pageserver_api::controller_api::MetadataHealthUpdateRequest;
 use pageserver_api::shard::TenantShardId;
 use remote_storage::GenericRemoteStorage;
 use serde::Serialize;
+use tracing::{info_span, Instrument};
 use utils::id::TenantId;
 use utils::shard::ShardCount;
 
@@ -169,45 +170,54 @@ pub async fn scan_pageserver_metadata(
         let mut timeline_ids = HashSet::new();
         let mut timeline_generations = HashMap::new();
         for (ttid, data) in timelines {
-            if ttid.tenant_shard_id.shard_count == highest_shard_count {
-                // Only analyze `TenantShardId`s with highest shard count.
+            async {
+                if ttid.tenant_shard_id.shard_count == highest_shard_count {
+                    // Only analyze `TenantShardId`s with highest shard count.
 
-                // Stash the generation of each timeline, for later use identifying orphan layers
-                if let BlobDataParseResult::Parsed {
-                    index_part,
-                    index_part_generation,
-                    s3_layers: _s3_layers,
-                } = &data.blob_data
-                {
-                    if index_part.deleted_at.is_some() {
-                        // skip deleted timeline.
-                        tracing::info!("Skip analysis of {} b/c timeline is already deleted", ttid);
-                        continue;
+                    // Stash the generation of each timeline, for later use identifying orphan layers
+                    if let BlobDataParseResult::Parsed {
+                        index_part,
+                        index_part_generation,
+                        s3_layers: _s3_layers,
+                    } = &data.blob_data
+                    {
+                        if index_part.deleted_at.is_some() {
+                            // skip deleted timeline.
+                            tracing::info!(
+                                "Skip analysis of {} b/c timeline is already deleted",
+                                ttid
+                            );
+                            return;
+                        }
+                        timeline_generations.insert(ttid, *index_part_generation);
                     }
-                    timeline_generations.insert(ttid, *index_part_generation);
+
+                    // Apply checks to this timeline shard's metadata, and in the process update `tenant_objects`
+                    // reference counts for layers across the tenant.
+                    let analysis = branch_cleanup_and_check_errors(
+                        remote_client,
+                        &ttid,
+                        &mut tenant_objects,
+                        None,
+                        None,
+                        Some(data),
+                    )
+                    .await;
+                    summary.update_analysis(&ttid, &analysis);
+
+                    timeline_ids.insert(ttid.timeline_id);
+                } else {
+                    tracing::info!(
+                        "Skip analysis of {} b/c a lower shard count than {}",
+                        ttid,
+                        highest_shard_count.0,
+                    );
                 }
-
-                // Apply checks to this timeline shard's metadata, and in the process update `tenant_objects`
-                // reference counts for layers across the tenant.
-                let analysis = branch_cleanup_and_check_errors(
-                    remote_client,
-                    &ttid,
-                    &mut tenant_objects,
-                    None,
-                    None,
-                    Some(data),
-                )
-                .await;
-                summary.update_analysis(&ttid, &analysis);
-
-                timeline_ids.insert(ttid.timeline_id);
-            } else {
-                tracing::info!(
-                    "Skip analysis of {} b/c a lower shard count than {}",
-                    ttid,
-                    highest_shard_count.0,
-                );
             }
+            .instrument(
+                info_span!("analyze-timeline", shard = %ttid.tenant_shard_id.shard_slug(), timeline = %ttid.timeline_id),
+            )
+            .await
         }
 
         summary.timeline_count += timeline_ids.len();
@@ -278,6 +288,7 @@ pub async fn scan_pageserver_metadata(
                         timelines,
                         highest_shard_count,
                     )
+                    .instrument(info_span!("analyze-tenant", tenant = %prev_tenant_id))
                     .await;
                     tenant_id = Some(ttid.tenant_shard_id.tenant_id);
                     highest_shard_count = ttid.tenant_shard_id.shard_count;
@@ -306,15 +317,18 @@ pub async fn scan_pageserver_metadata(
         tenant_timeline_results.push((ttid, data));
     }
 
+    let tenant_id = tenant_id.expect("Must be set if results are present");
+
     if !tenant_timeline_results.is_empty() {
         analyze_tenant(
             &remote_client,
-            tenant_id.expect("Must be set if results are present"),
+            tenant_id,
             &mut summary,
             tenant_objects,
             tenant_timeline_results,
             highest_shard_count,
         )
+        .instrument(info_span!("analyze-tenant", tenant = %tenant_id))
         .await;
     }
 

From 91d947654ec755820b0c7f74ea111d4865b17224 Mon Sep 17 00:00:00 2001
From: a-masterov <72613290+a-masterov@users.noreply.github.com>
Date: Tue, 24 Sep 2024 09:44:45 +0200
Subject: [PATCH 127/142] Add regression tests for a cloud-based Neon instance
 (#8681)

## Problem
We need to be able to run the regression tests against a cloud-based
Neon staging instance to prepare the migration to the arm architecture.

## Summary of changes
Some tests were modified to work on the cloud instance (e.g. passwords
were added, server-side copy was changed to client-side, etc.)

---------

Co-authored-by: Alexander Bayandin <alexander@neon.tech>
---
 .github/workflows/cloud-regress.yml           |  102 +
 patches/cloud_regress_pg16.patch              | 3949 +++++++++++++++++
 .../cloud_regress/test_cloud_regress.py       |  100 +
 test_runner/fixtures/utils.py                 |    2 +-
 4 files changed, 4152 insertions(+), 1 deletion(-)
 create mode 100644 .github/workflows/cloud-regress.yml
 create mode 100644 patches/cloud_regress_pg16.patch
 create mode 100644 test_runner/cloud_regress/test_cloud_regress.py

diff --git a/.github/workflows/cloud-regress.yml b/.github/workflows/cloud-regress.yml
new file mode 100644
index 0000000000..de6babdde3
--- /dev/null
+++ b/.github/workflows/cloud-regress.yml
@@ -0,0 +1,102 @@
+name: Cloud Regression Test
+on:
+  schedule:
+    # * is a special character in YAML so you have to quote this string
+    #          ┌───────────── minute (0 - 59)
+    #          │ ┌───────────── hour (0 - 23)
+    #          │ │ ┌───────────── day of the month (1 - 31)
+    #          │ │ │ ┌───────────── month (1 - 12 or JAN-DEC)
+    #          │ │ │ │ ┌───────────── day of the week (0 - 6 or SUN-SAT)
+    - cron:  '45 1 * * *' # run once a day, timezone is utc
+  workflow_dispatch: # adds ability to run this manually
+
+defaults:
+  run:
+    shell: bash -euxo pipefail {0}
+
+concurrency:
+  # Allow only one workflow
+  group: ${{ github.workflow }}
+  cancel-in-progress: true
+
+jobs:
+  regress:
+    env:
+      POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install
+      DEFAULT_PG_VERSION: 16
+      TEST_OUTPUT: /tmp/test_output
+      BUILD_TYPE: remote
+      AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }}
+      AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }}
+
+    runs-on: us-east-2
+    container:
+      image: neondatabase/build-tools:pinned
+      options: --init
+
+    steps:
+      - uses: actions/checkout@v4
+        with:
+          submodules: true
+
+      - name: Patch the test
+        run: |
+          cd "vendor/postgres-v${DEFAULT_PG_VERSION}"
+          patch -p1 < "../../patches/cloud_regress_pg${DEFAULT_PG_VERSION}.patch"
+
+      - name: Generate a random password
+        id: pwgen
+        run: |
+          set +x
+          DBPASS=$(dd if=/dev/random bs=48 count=1 2>/dev/null | base64)
+          echo "::add-mask::${DBPASS//\//}"
+          echo DBPASS="${DBPASS//\//}" >> "${GITHUB_OUTPUT}"
+
+      - name: Change tests according to the generated password
+        env:
+          DBPASS: ${{ steps.pwgen.outputs.DBPASS }}
+        run: |
+          cd vendor/postgres-v"${DEFAULT_PG_VERSION}"/src/test/regress
+          for fname in sql/*.sql expected/*.out; do
+            sed -i.bak s/NEON_PASSWORD_PLACEHOLDER/"'${DBPASS}'"/ "${fname}"
+          done
+          for ph in $(grep NEON_MD5_PLACEHOLDER expected/password.out | awk '{print $3;}' | sort | uniq); do
+            USER=$(echo "${ph}" | cut -c 22-)
+            MD5=md5$(echo -n "${DBPASS}${USER}" | md5sum | awk '{print $1;}')
+            sed -i.bak "s/${ph}/${MD5}/" expected/password.out
+          done
+
+      - name: Download Neon artifact
+        uses: ./.github/actions/download
+        with:
+          name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact
+          path: /tmp/neon/
+          prefix: latest
+
+      - name: Run the regression tests
+        uses: ./.github/actions/run-python-test-set
+        with:
+          build_type: ${{ env.BUILD_TYPE }}
+          test_selection: cloud_regress
+          pg_version: ${{ env.DEFAULT_PG_VERSION }}
+          extra_params: -m remote_cluster
+        env:
+          BENCHMARK_CONNSTR: ${{ secrets.PG_REGRESS_CONNSTR }}
+
+      - name: Create Allure report
+        id: create-allure-report
+        if: ${{ !cancelled() }}
+        uses: ./.github/actions/allure-report-generate
+
+      - name: Post to a Slack channel
+        if: ${{ github.event.schedule && failure() }}
+        uses: slackapi/slack-github-action@v1
+        with:
+          channel-id: "C033QLM5P7D" # on-call-staging-stream
+          slack-message: |
+            Periodic pg_regress on staging: ${{ job.status }}
+            <${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|GitHub Run>
+            <${{ steps.create-allure-report.outputs.report-url }}|Allure report>
+        env:
+          SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
+
diff --git a/patches/cloud_regress_pg16.patch b/patches/cloud_regress_pg16.patch
new file mode 100644
index 0000000000..d15d0cffeb
--- /dev/null
+++ b/patches/cloud_regress_pg16.patch
@@ -0,0 +1,3949 @@
+diff --git a/src/test/regress/expected/aggregates.out b/src/test/regress/expected/aggregates.out
+index 0c24f6afe4..dd808ac2b4 100644
+--- a/src/test/regress/expected/aggregates.out
++++ b/src/test/regress/expected/aggregates.out
+@@ -11,7 +11,8 @@ CREATE TABLE aggtest (
+ 	b			float4
+ );
+ \set filename :abs_srcdir '/data/agg.data'
+-COPY aggtest FROM :'filename';
++\set command '\\copy aggtest FROM ' :'filename';
++:command
+ ANALYZE aggtest;
+ SELECT avg(four) AS avg_1 FROM onek;
+        avg_1        
+diff --git a/src/test/regress/expected/alter_generic.out b/src/test/regress/expected/alter_generic.out
+index ae54cb254f..888e2ee8bc 100644
+--- a/src/test/regress/expected/alter_generic.out
++++ b/src/test/regress/expected/alter_generic.out
+@@ -15,9 +15,9 @@ DROP ROLE IF EXISTS regress_alter_generic_user1;
+ DROP ROLE IF EXISTS regress_alter_generic_user2;
+ DROP ROLE IF EXISTS regress_alter_generic_user3;
+ RESET client_min_messages;
+-CREATE USER regress_alter_generic_user3;
+-CREATE USER regress_alter_generic_user2;
+-CREATE USER regress_alter_generic_user1 IN ROLE regress_alter_generic_user3;
++CREATE USER regress_alter_generic_user3 PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE USER regress_alter_generic_user2 PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE USER regress_alter_generic_user1 PASSWORD NEON_PASSWORD_PLACEHOLDER IN ROLE regress_alter_generic_user3;
+ CREATE SCHEMA alt_nsp1;
+ CREATE SCHEMA alt_nsp2;
+ GRANT ALL ON SCHEMA alt_nsp1, alt_nsp2 TO public;
+@@ -370,7 +370,7 @@ ERROR:  STORAGE cannot be specified in ALTER OPERATOR FAMILY
+ DROP OPERATOR FAMILY alt_opf4 USING btree;
+ -- Should fail. Need to be SUPERUSER to do ALTER OPERATOR FAMILY .. ADD / DROP
+ BEGIN TRANSACTION;
+-CREATE ROLE regress_alter_generic_user5 NOSUPERUSER;
++CREATE ROLE regress_alter_generic_user5 PASSWORD NEON_PASSWORD_PLACEHOLDER NOSUPERUSER;
+ CREATE OPERATOR FAMILY alt_opf5 USING btree;
+ SET ROLE regress_alter_generic_user5;
+ ALTER OPERATOR FAMILY alt_opf5 USING btree ADD OPERATOR 1 < (int4, int2), FUNCTION 1 btint42cmp(int4, int2);
+@@ -382,7 +382,7 @@ ERROR:  current transaction is aborted, commands ignored until end of transactio
+ ROLLBACK;
+ -- Should fail. Need rights to namespace for ALTER OPERATOR FAMILY .. ADD / DROP
+ BEGIN TRANSACTION;
+-CREATE ROLE regress_alter_generic_user6;
++CREATE ROLE regress_alter_generic_user6 PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ CREATE SCHEMA alt_nsp6;
+ REVOKE ALL ON SCHEMA alt_nsp6 FROM regress_alter_generic_user6;
+ CREATE OPERATOR FAMILY alt_nsp6.alt_opf6 USING btree;
+diff --git a/src/test/regress/expected/alter_operator.out b/src/test/regress/expected/alter_operator.out
+index 71bd484282..066ea4ec0d 100644
+--- a/src/test/regress/expected/alter_operator.out
++++ b/src/test/regress/expected/alter_operator.out
+@@ -127,7 +127,7 @@ ERROR:  operator attribute "Restrict" not recognized
+ --
+ -- Test permission check. Must be owner to ALTER OPERATOR.
+ --
+-CREATE USER regress_alter_op_user;
++CREATE USER regress_alter_op_user PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ SET SESSION AUTHORIZATION regress_alter_op_user;
+ ALTER OPERATOR === (boolean, boolean) SET (RESTRICT = NONE);
+ ERROR:  must be owner of operator ===
+diff --git a/src/test/regress/expected/alter_table.out b/src/test/regress/expected/alter_table.out
+index 0e439a6488..393f316c3e 100644
+--- a/src/test/regress/expected/alter_table.out
++++ b/src/test/regress/expected/alter_table.out
+@@ -5,7 +5,7 @@
+ SET client_min_messages TO 'warning';
+ DROP ROLE IF EXISTS regress_alter_table_user1;
+ RESET client_min_messages;
+-CREATE USER regress_alter_table_user1;
++CREATE USER regress_alter_table_user1 PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ --
+ -- add attribute
+ --
+@@ -3896,8 +3896,8 @@ DROP TABLE fail_part;
+ ALTER TABLE list_parted ATTACH PARTITION nonexistent FOR VALUES IN (1);
+ ERROR:  relation "nonexistent" does not exist
+ -- check ownership of the source table
+-CREATE ROLE regress_test_me;
+-CREATE ROLE regress_test_not_me;
++CREATE ROLE regress_test_me PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE ROLE regress_test_not_me PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ CREATE TABLE not_owned_by_me (LIKE list_parted);
+ ALTER TABLE not_owned_by_me OWNER TO regress_test_not_me;
+ SET SESSION AUTHORIZATION regress_test_me;
+diff --git a/src/test/regress/expected/arrays.out b/src/test/regress/expected/arrays.out
+index 57a283dc59..9672d526b4 100644
+--- a/src/test/regress/expected/arrays.out
++++ b/src/test/regress/expected/arrays.out
+@@ -18,7 +18,8 @@ CREATE TABLE array_op_test (
+ 	t			text[]
+ );
+ \set filename :abs_srcdir '/data/array.data'
+-COPY array_op_test FROM :'filename';
++\set command '\\copy array_op_test FROM ' :'filename';
++:command
+ ANALYZE array_op_test;
+ --
+ -- only the 'e' array is 0-based, the others are 1-based.
+diff --git a/src/test/regress/expected/btree_index.out b/src/test/regress/expected/btree_index.out
+index 93ed5e8cc0..54bd7d535c 100644
+--- a/src/test/regress/expected/btree_index.out
++++ b/src/test/regress/expected/btree_index.out
+@@ -20,13 +20,17 @@ CREATE TABLE bt_f8_heap (
+ 	random 		int4
+ );
+ \set filename :abs_srcdir '/data/desc.data'
+-COPY bt_i4_heap FROM :'filename';
++\set command '\\copy bt_i4_heap FROM ' :'filename';
++:command
+ \set filename :abs_srcdir '/data/hash.data'
+-COPY bt_name_heap FROM :'filename';
++\set command '\\copy bt_name_heap FROM ' :'filename';
++:command
+ \set filename :abs_srcdir '/data/desc.data'
+-COPY bt_txt_heap FROM :'filename';
++\set command '\\copy bt_txt_heap FROM ' :'filename';
++:command
+ \set filename :abs_srcdir '/data/hash.data'
+-COPY bt_f8_heap FROM :'filename';
++\set command '\\copy bt_f8_heap FROM ' :'filename';
++:command
+ ANALYZE bt_i4_heap;
+ ANALYZE bt_name_heap;
+ ANALYZE bt_txt_heap;
+diff --git a/src/test/regress/expected/cluster.out b/src/test/regress/expected/cluster.out
+index 542c2e098c..0062d3024f 100644
+--- a/src/test/regress/expected/cluster.out
++++ b/src/test/regress/expected/cluster.out
+@@ -308,7 +308,7 @@ WHERE pg_class.oid=indexrelid
+ -- Verify that toast tables are clusterable
+ CLUSTER pg_toast.pg_toast_826 USING pg_toast_826_index;
+ -- Verify that clustering all tables does in fact cluster the right ones
+-CREATE USER regress_clstr_user;
++CREATE USER regress_clstr_user PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ CREATE TABLE clstr_1 (a INT PRIMARY KEY);
+ CREATE TABLE clstr_2 (a INT PRIMARY KEY);
+ CREATE TABLE clstr_3 (a INT PRIMARY KEY);
+@@ -497,7 +497,7 @@ DROP TABLE clstrpart;
+ CREATE TABLE ptnowner(i int unique) PARTITION BY LIST (i);
+ CREATE INDEX ptnowner_i_idx ON ptnowner(i);
+ CREATE TABLE ptnowner1 PARTITION OF ptnowner FOR VALUES IN (1);
+-CREATE ROLE regress_ptnowner;
++CREATE ROLE regress_ptnowner PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ CREATE TABLE ptnowner2 PARTITION OF ptnowner FOR VALUES IN (2);
+ ALTER TABLE ptnowner1 OWNER TO regress_ptnowner;
+ ALTER TABLE ptnowner OWNER TO regress_ptnowner;
+diff --git a/src/test/regress/expected/collate.icu.utf8.out b/src/test/regress/expected/collate.icu.utf8.out
+index 97bbe53b64..eac3d42a79 100644
+--- a/src/test/regress/expected/collate.icu.utf8.out
++++ b/src/test/regress/expected/collate.icu.utf8.out
+@@ -1016,7 +1016,7 @@ select * from collate_test1 where b ilike 'ABC';
+ 
+ reset enable_seqscan;
+ -- schema manipulation commands
+-CREATE ROLE regress_test_role;
++CREATE ROLE regress_test_role PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ CREATE SCHEMA test_schema;
+ -- We need to do this this way to cope with varying names for encodings:
+ SET client_min_messages TO WARNING;
+diff --git a/src/test/regress/expected/constraints.out b/src/test/regress/expected/constraints.out
+index cf0b80d616..e8e2a14a4a 100644
+--- a/src/test/regress/expected/constraints.out
++++ b/src/test/regress/expected/constraints.out
+@@ -349,7 +349,8 @@ CREATE TABLE COPY_TBL (x INT, y TEXT, z INT,
+ 	CONSTRAINT COPY_CON
+ 	CHECK (x > 3 AND y <> 'check failed' AND x < 7 ));
+ \set filename :abs_srcdir '/data/constro.data'
+-COPY COPY_TBL FROM :'filename';
++\set command '\\copy COPY_TBL FROM ' :'filename';
++:command
+ SELECT * FROM COPY_TBL;
+  x |       y       | z 
+ ---+---------------+---
+@@ -358,7 +359,8 @@ SELECT * FROM COPY_TBL;
+ (2 rows)
+ 
+ \set filename :abs_srcdir '/data/constrf.data'
+-COPY COPY_TBL FROM :'filename';
++\set command '\\copy COPY_TBL FROM ' :'filename';
++:command
+ ERROR:  new row for relation "copy_tbl" violates check constraint "copy_con"
+ DETAIL:  Failing row contains (7, check failed, 6).
+ CONTEXT:  COPY copy_tbl, line 2: "7	check failed	6"
+@@ -799,7 +801,7 @@ DETAIL:  Key (f1)=(3) conflicts with key (f1)=(3).
+ DROP TABLE deferred_excl;
+ -- Comments
+ -- Setup a low-level role to enforce non-superuser checks.
+-CREATE ROLE regress_constraint_comments;
++CREATE ROLE regress_constraint_comments PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ SET SESSION AUTHORIZATION regress_constraint_comments;
+ CREATE TABLE constraint_comments_tbl (a int CONSTRAINT the_constraint CHECK (a > 0));
+ CREATE DOMAIN constraint_comments_dom AS int CONSTRAINT the_constraint CHECK (value > 0);
+@@ -819,7 +821,7 @@ COMMENT ON CONSTRAINT the_constraint ON constraint_comments_tbl IS NULL;
+ COMMENT ON CONSTRAINT the_constraint ON DOMAIN constraint_comments_dom IS NULL;
+ -- unauthorized user
+ RESET SESSION AUTHORIZATION;
+-CREATE ROLE regress_constraint_comments_noaccess;
++CREATE ROLE regress_constraint_comments_noaccess PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ SET SESSION AUTHORIZATION regress_constraint_comments_noaccess;
+ COMMENT ON CONSTRAINT the_constraint ON constraint_comments_tbl IS 'no, the comment';
+ ERROR:  must be owner of relation constraint_comments_tbl
+diff --git a/src/test/regress/expected/conversion.out b/src/test/regress/expected/conversion.out
+index 442e7aff2b..525f732b03 100644
+--- a/src/test/regress/expected/conversion.out
++++ b/src/test/regress/expected/conversion.out
+@@ -8,7 +8,7 @@
+ CREATE FUNCTION test_enc_conversion(bytea, name, name, bool, validlen OUT int, result OUT bytea)
+     AS :'regresslib', 'test_enc_conversion'
+     LANGUAGE C STRICT;
+-CREATE USER regress_conversion_user WITH NOCREATEDB NOCREATEROLE;
++CREATE USER regress_conversion_user WITH NOCREATEDB NOCREATEROLE PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ SET SESSION AUTHORIZATION regress_conversion_user;
+ CREATE CONVERSION myconv FOR 'LATIN1' TO 'UTF8' FROM iso8859_1_to_utf8;
+ --
+diff --git a/src/test/regress/expected/copy.out b/src/test/regress/expected/copy.out
+index b48365ec98..a6ef910055 100644
+--- a/src/test/regress/expected/copy.out
++++ b/src/test/regress/expected/copy.out
+@@ -15,9 +15,11 @@ insert into copytest values('Unix',E'abc\ndef',2);
+ insert into copytest values('Mac',E'abc\rdef',3);
+ insert into copytest values(E'esc\\ape',E'a\\r\\\r\\\n\\nb',4);
+ \set filename :abs_builddir '/results/copytest.csv'
+-copy copytest to :'filename' csv;
++\set command '\\copy copytest to ' :'filename' csv;
++:command
+ create temp table copytest2 (like copytest);
+-copy copytest2 from :'filename' csv;
++\set command '\\copy copytest2 from ' :'filename' csv;
++:command
+ select * from copytest except select * from copytest2;
+  style | test | filler 
+ -------+------+--------
+@@ -25,8 +27,10 @@ select * from copytest except select * from copytest2;
+ 
+ truncate copytest2;
+ --- same test but with an escape char different from quote char
+-copy copytest to :'filename' csv quote '''' escape E'\\';
+-copy copytest2 from :'filename' csv quote '''' escape E'\\';
++\set command '\\copy copytest to ' :'filename' ' csv quote ' '\'\'\'\'' ' escape ' 'E\'' '\\\\\'';
++:command
++\set command '\\copy copytest2 from ' :'filename' ' csv quote ' '\'\'\'\'' ' escape ' 'E\'' '\\\\\'';
++:command
+ select * from copytest except select * from copytest2;
+  style | test | filler 
+ -------+------+--------
+@@ -66,13 +70,16 @@ insert into parted_copytest select x,1,'One' from generate_series(1,1000) x;
+ insert into parted_copytest select x,2,'Two' from generate_series(1001,1010) x;
+ insert into parted_copytest select x,1,'One' from generate_series(1011,1020) x;
+ \set filename :abs_builddir '/results/parted_copytest.csv'
+-copy (select * from parted_copytest order by a) to :'filename';
++\set command '\\copy (select * from parted_copytest order by a) to ' :'filename';
++:command
+ truncate parted_copytest;
+-copy parted_copytest from :'filename';
++\set command '\\copy parted_copytest from ' :'filename';
++:command
+ -- Ensure COPY FREEZE errors for partitioned tables.
+ begin;
+ truncate parted_copytest;
+-copy parted_copytest from :'filename' (freeze);
++\set command '\\copy parted_copytest from ' :'filename' (freeze);
++:command
+ ERROR:  cannot perform COPY FREEZE on a partitioned table
+ rollback;
+ select tableoid::regclass,count(*),sum(a) from parted_copytest
+@@ -94,7 +101,8 @@ create trigger part_ins_trig
+ 	before insert on parted_copytest_a2
+ 	for each row
+ 	execute procedure part_ins_func();
+-copy parted_copytest from :'filename';
++\set command '\\copy parted_copytest from ' :'filename';
++:command
+ select tableoid::regclass,count(*),sum(a) from parted_copytest
+ group by tableoid order by tableoid::regclass::name;
+       tableoid      | count |  sum   
+@@ -106,7 +114,8 @@ group by tableoid order by tableoid::regclass::name;
+ truncate table parted_copytest;
+ create index on parted_copytest (b);
+ drop trigger part_ins_trig on parted_copytest_a2;
+-copy parted_copytest from stdin;
++\set command '\\copy parted_copytest from ' stdin;
++:command
+ -- Ensure index entries were properly added during the copy.
+ select * from parted_copytest where b = 1;
+  a | b |  c   
+@@ -170,9 +179,9 @@ INFO:  progress: {"type": "PIPE", "command": "COPY FROM", "relname": "tab_progre
+ -- Generate COPY FROM report with FILE, with some excluded tuples.
+ truncate tab_progress_reporting;
+ \set filename :abs_srcdir '/data/emp.data'
+-copy tab_progress_reporting from :'filename'
+-	where (salary < 2000);
+-INFO:  progress: {"type": "FILE", "command": "COPY FROM", "relname": "tab_progress_reporting", "has_bytes_total": true, "tuples_excluded": 1, "tuples_processed": 2, "has_bytes_processed": true}
++\set command '\\copy tab_progress_reporting from ' :'filename' 'where (salary < 2000)';
++:command
++INFO:  progress: {"type": "PIPE", "command": "COPY FROM", "relname": "tab_progress_reporting", "has_bytes_total": false, "tuples_excluded": 1, "tuples_processed": 2, "has_bytes_processed": true}
+ drop trigger check_after_tab_progress_reporting on tab_progress_reporting;
+ drop function notice_after_tab_progress_reporting();
+ drop table tab_progress_reporting;
+@@ -281,7 +290,8 @@ CREATE TABLE parted_si_p_odd PARTITION OF parted_si FOR VALUES IN (1);
+ -- https://postgr.es/m/18130-7a86a7356a75209d%40postgresql.org
+ -- https://postgr.es/m/257696.1695670946%40sss.pgh.pa.us
+ \set filename :abs_srcdir '/data/desc.data'
+-COPY parted_si(id, data) FROM :'filename';
++\set command '\\COPY parted_si(id, data) FROM ' :'filename';
++:command
+ -- An earlier bug (see commit b1ecb9b3fcf) could end up using a buffer from
+ -- the wrong partition. This test is *not* guaranteed to trigger that bug, but
+ -- does so when shared_buffers is small enough.  To test if we encountered the
+diff --git a/src/test/regress/expected/copy2.out b/src/test/regress/expected/copy2.out
+index faf1a4d1b0..a44c97db52 100644
+--- a/src/test/regress/expected/copy2.out
++++ b/src/test/regress/expected/copy2.out
+@@ -553,8 +553,8 @@ select * from check_con_tbl;
+ (2 rows)
+ 
+ -- test with RLS enabled.
+-CREATE ROLE regress_rls_copy_user;
+-CREATE ROLE regress_rls_copy_user_colperms;
++CREATE ROLE regress_rls_copy_user PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE ROLE regress_rls_copy_user_colperms PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ CREATE TABLE rls_t1 (a int, b int, c int);
+ COPY rls_t1 (a, b, c) from stdin;
+ CREATE POLICY p1 ON rls_t1 FOR SELECT USING (a % 2 = 0);
+diff --git a/src/test/regress/expected/create_function_sql.out b/src/test/regress/expected/create_function_sql.out
+index 50aca5940f..42527142f6 100644
+--- a/src/test/regress/expected/create_function_sql.out
++++ b/src/test/regress/expected/create_function_sql.out
+@@ -4,7 +4,7 @@
+ -- Assorted tests using SQL-language functions
+ --
+ -- All objects made in this test are in temp_func_test schema
+-CREATE USER regress_unpriv_user;
++CREATE USER regress_unpriv_user PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ CREATE SCHEMA temp_func_test;
+ GRANT ALL ON SCHEMA temp_func_test TO public;
+ SET search_path TO temp_func_test, public;
+diff --git a/src/test/regress/expected/create_index.out b/src/test/regress/expected/create_index.out
+index acfd9d1f4f..0eeb64e47a 100644
+--- a/src/test/regress/expected/create_index.out
++++ b/src/test/regress/expected/create_index.out
+@@ -51,7 +51,8 @@ CREATE TABLE fast_emp4000 (
+ 	home_base	 box
+ );
+ \set filename :abs_srcdir '/data/rect.data'
+-COPY slow_emp4000 FROM :'filename';
++\set command '\\copy slow_emp4000 FROM ' :'filename';
++:command
+ INSERT INTO fast_emp4000 SELECT * FROM slow_emp4000;
+ ANALYZE slow_emp4000;
+ ANALYZE fast_emp4000;
+@@ -655,7 +656,8 @@ CREATE TABLE array_index_op_test (
+ 	t			text[]
+ );
+ \set filename :abs_srcdir '/data/array.data'
+-COPY array_index_op_test FROM :'filename';
++\set command '\\copy array_index_op_test FROM ' :'filename';
++:command
+ ANALYZE array_index_op_test;
+ SELECT * FROM array_index_op_test WHERE i = '{NULL}' ORDER BY seqno;
+  seqno |   i    |   t    
+@@ -2822,7 +2824,7 @@ END;
+ -- concurrently
+ REINDEX SCHEMA CONCURRENTLY schema_to_reindex;
+ -- Failure for unauthorized user
+-CREATE ROLE regress_reindexuser NOLOGIN;
++CREATE ROLE regress_reindexuser NOLOGIN PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ SET SESSION ROLE regress_reindexuser;
+ REINDEX SCHEMA schema_to_reindex;
+ ERROR:  must be owner of schema schema_to_reindex
+diff --git a/src/test/regress/expected/create_procedure.out b/src/test/regress/expected/create_procedure.out
+index 2177ba3509..ae3ca94d00 100644
+--- a/src/test/regress/expected/create_procedure.out
++++ b/src/test/regress/expected/create_procedure.out
+@@ -421,7 +421,7 @@ ERROR:  cp_testfunc1(integer) is not a procedure
+ DROP PROCEDURE nonexistent();
+ ERROR:  procedure nonexistent() does not exist
+ -- privileges
+-CREATE USER regress_cp_user1;
++CREATE USER regress_cp_user1 PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ GRANT INSERT ON cp_test TO regress_cp_user1;
+ REVOKE EXECUTE ON PROCEDURE ptest1(text) FROM PUBLIC;
+ SET ROLE regress_cp_user1;
+diff --git a/src/test/regress/expected/create_role.out b/src/test/regress/expected/create_role.out
+index 46d4f9efe9..fc2a28a2f6 100644
+--- a/src/test/regress/expected/create_role.out
++++ b/src/test/regress/expected/create_role.out
+@@ -1,28 +1,28 @@
+ -- ok, superuser can create users with any set of privileges
+-CREATE ROLE regress_role_super SUPERUSER;
+-CREATE ROLE regress_role_admin CREATEDB CREATEROLE REPLICATION BYPASSRLS;
++CREATE ROLE regress_role_super SUPERUSER PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE ROLE regress_role_admin CREATEDB CREATEROLE REPLICATION BYPASSRLS PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ GRANT CREATE ON DATABASE regression TO regress_role_admin WITH GRANT OPTION;
+-CREATE ROLE regress_role_limited_admin CREATEROLE;
+-CREATE ROLE regress_role_normal;
++CREATE ROLE regress_role_limited_admin CREATEROLE PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE ROLE regress_role_normal PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ -- fail, CREATEROLE user can't give away role attributes without having them
+ SET SESSION AUTHORIZATION regress_role_limited_admin;
+-CREATE ROLE regress_nosuch_superuser SUPERUSER;
++CREATE ROLE regress_nosuch_superuser SUPERUSER PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ ERROR:  permission denied to create role
+ DETAIL:  Only roles with the SUPERUSER attribute may create roles with the SUPERUSER attribute.
+-CREATE ROLE regress_nosuch_replication_bypassrls REPLICATION BYPASSRLS;
++CREATE ROLE regress_nosuch_replication_bypassrls REPLICATION BYPASSRLS PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ ERROR:  permission denied to create role
+ DETAIL:  Only roles with the REPLICATION attribute may create roles with the REPLICATION attribute.
+-CREATE ROLE regress_nosuch_replication REPLICATION;
++CREATE ROLE regress_nosuch_replication REPLICATION PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ ERROR:  permission denied to create role
+ DETAIL:  Only roles with the REPLICATION attribute may create roles with the REPLICATION attribute.
+-CREATE ROLE regress_nosuch_bypassrls BYPASSRLS;
++CREATE ROLE regress_nosuch_bypassrls BYPASSRLS PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ ERROR:  permission denied to create role
+ DETAIL:  Only roles with the BYPASSRLS attribute may create roles with the BYPASSRLS attribute.
+-CREATE ROLE regress_nosuch_createdb CREATEDB;
++CREATE ROLE regress_nosuch_createdb CREATEDB PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ ERROR:  permission denied to create role
+ DETAIL:  Only roles with the CREATEDB attribute may create roles with the CREATEDB attribute.
+ -- ok, can create a role without any special attributes
+-CREATE ROLE regress_role_limited;
++CREATE ROLE regress_role_limited PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ -- fail, can't give it in any of the restricted attributes
+ ALTER ROLE regress_role_limited SUPERUSER;
+ ERROR:  permission denied to alter role
+@@ -39,10 +39,10 @@ DETAIL:  Only roles with the BYPASSRLS attribute may change the BYPASSRLS attrib
+ DROP ROLE regress_role_limited;
+ -- ok, can give away these role attributes if you have them
+ SET SESSION AUTHORIZATION regress_role_admin;
+-CREATE ROLE regress_replication_bypassrls REPLICATION BYPASSRLS;
+-CREATE ROLE regress_replication REPLICATION;
+-CREATE ROLE regress_bypassrls BYPASSRLS;
+-CREATE ROLE regress_createdb CREATEDB;
++CREATE ROLE regress_replication_bypassrls REPLICATION BYPASSRLS PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE ROLE regress_replication REPLICATION PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE ROLE regress_bypassrls BYPASSRLS PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE ROLE regress_createdb CREATEDB PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ -- ok, can toggle these role attributes off and on if you have them
+ ALTER ROLE regress_replication NOREPLICATION;
+ ALTER ROLE regress_replication REPLICATION;
+@@ -58,48 +58,48 @@ ALTER ROLE regress_createdb NOSUPERUSER;
+ ERROR:  permission denied to alter role
+ DETAIL:  Only roles with the SUPERUSER attribute may change the SUPERUSER attribute.
+ -- ok, having CREATEROLE is enough to create users with these privileges
+-CREATE ROLE regress_createrole CREATEROLE NOINHERIT;
++CREATE ROLE regress_createrole CREATEROLE NOINHERIT PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ GRANT CREATE ON DATABASE regression TO regress_createrole WITH GRANT OPTION;
+-CREATE ROLE regress_login LOGIN;
+-CREATE ROLE regress_inherit INHERIT;
+-CREATE ROLE regress_connection_limit CONNECTION LIMIT 5;
+-CREATE ROLE regress_encrypted_password ENCRYPTED PASSWORD 'foo';
+-CREATE ROLE regress_password_null PASSWORD NULL;
++CREATE ROLE regress_login LOGIN PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE ROLE regress_inherit INHERIT PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE ROLE regress_connection_limit CONNECTION LIMIT 5 PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE ROLE regress_encrypted_password ENCRYPTED PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE ROLE regress_password_null PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ -- ok, backwards compatible noise words should be ignored
+-CREATE ROLE regress_noiseword SYSID 12345;
++CREATE ROLE regress_noiseword SYSID 12345 PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ NOTICE:  SYSID can no longer be specified
+ -- fail, cannot grant membership in superuser role
+-CREATE ROLE regress_nosuch_super IN ROLE regress_role_super;
++CREATE ROLE regress_nosuch_super IN ROLE regress_role_super PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ ERROR:  permission denied to grant role "regress_role_super"
+ DETAIL:  Only roles with the SUPERUSER attribute may grant roles with the SUPERUSER attribute.
+ -- fail, database owner cannot have members
+-CREATE ROLE regress_nosuch_dbowner IN ROLE pg_database_owner;
++CREATE ROLE regress_nosuch_dbowner IN ROLE pg_database_owner PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ ERROR:  role "pg_database_owner" cannot have explicit members
+ -- ok, can grant other users into a role
+ CREATE ROLE regress_inroles ROLE
+ 	regress_role_super, regress_createdb, regress_createrole, regress_login,
+-	regress_inherit, regress_connection_limit, regress_encrypted_password, regress_password_null;
++	regress_inherit, regress_connection_limit, regress_encrypted_password, regress_password_null PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ -- fail, cannot grant a role into itself
+-CREATE ROLE regress_nosuch_recursive ROLE regress_nosuch_recursive;
++CREATE ROLE regress_nosuch_recursive ROLE regress_nosuch_recursive PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ ERROR:  role "regress_nosuch_recursive" is a member of role "regress_nosuch_recursive"
+ -- ok, can grant other users into a role with admin option
+ CREATE ROLE regress_adminroles ADMIN
+ 	regress_role_super, regress_createdb, regress_createrole, regress_login,
+-	regress_inherit, regress_connection_limit, regress_encrypted_password, regress_password_null;
++	regress_inherit, regress_connection_limit, regress_encrypted_password, regress_password_null PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ -- fail, cannot grant a role into itself with admin option
+-CREATE ROLE regress_nosuch_admin_recursive ADMIN regress_nosuch_admin_recursive;
++CREATE ROLE regress_nosuch_admin_recursive ADMIN regress_nosuch_admin_recursive PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ ERROR:  role "regress_nosuch_admin_recursive" is a member of role "regress_nosuch_admin_recursive"
+ -- fail, regress_createrole does not have CREATEDB privilege
+ SET SESSION AUTHORIZATION regress_createrole;
+ CREATE DATABASE regress_nosuch_db;
+ ERROR:  permission denied to create database
+ -- ok, regress_createrole can create new roles
+-CREATE ROLE regress_plainrole;
++CREATE ROLE regress_plainrole PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ -- ok, roles with CREATEROLE can create new roles with it
+-CREATE ROLE regress_rolecreator CREATEROLE;
++CREATE ROLE regress_rolecreator CREATEROLE PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ -- ok, roles with CREATEROLE can create new roles with different role
+ -- attributes, including CREATEROLE
+-CREATE ROLE regress_hasprivs CREATEROLE LOGIN INHERIT CONNECTION LIMIT 5;
++CREATE ROLE regress_hasprivs CREATEROLE LOGIN INHERIT CONNECTION LIMIT 5 PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ -- ok, we should be able to modify a role we created
+ COMMENT ON ROLE regress_hasprivs IS 'some comment';
+ ALTER ROLE regress_hasprivs RENAME TO regress_tenant;
+@@ -141,7 +141,7 @@ ERROR:  permission denied to reassign objects
+ DETAIL:  Only roles with privileges of role "regress_tenant" may reassign objects owned by it.
+ -- ok, create a role with a value for createrole_self_grant
+ SET createrole_self_grant = 'set, inherit';
+-CREATE ROLE regress_tenant2;
++CREATE ROLE regress_tenant2 PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ GRANT CREATE ON DATABASE regression TO regress_tenant2;
+ -- ok, regress_tenant2 can create objects within the database
+ SET SESSION AUTHORIZATION regress_tenant2;
+@@ -165,34 +165,34 @@ ALTER TABLE tenant2_table OWNER TO regress_tenant2;
+ ERROR:  must be able to SET ROLE "regress_tenant2"
+ DROP TABLE tenant2_table;
+ -- fail, CREATEROLE is not enough to create roles in privileged roles
+-CREATE ROLE regress_read_all_data IN ROLE pg_read_all_data;
++CREATE ROLE regress_read_all_data PASSWORD NEON_PASSWORD_PLACEHOLDER IN ROLE pg_read_all_data;
+ ERROR:  permission denied to grant role "pg_read_all_data"
+ DETAIL:  Only roles with the ADMIN option on role "pg_read_all_data" may grant this role.
+-CREATE ROLE regress_write_all_data IN ROLE pg_write_all_data;
++CREATE ROLE regress_write_all_data PASSWORD NEON_PASSWORD_PLACEHOLDER IN ROLE pg_write_all_data;
+ ERROR:  permission denied to grant role "pg_write_all_data"
+ DETAIL:  Only roles with the ADMIN option on role "pg_write_all_data" may grant this role.
+-CREATE ROLE regress_monitor IN ROLE pg_monitor;
++CREATE ROLE regress_monitor PASSWORD NEON_PASSWORD_PLACEHOLDER IN ROLE pg_monitor;
+ ERROR:  permission denied to grant role "pg_monitor"
+ DETAIL:  Only roles with the ADMIN option on role "pg_monitor" may grant this role.
+-CREATE ROLE regress_read_all_settings IN ROLE pg_read_all_settings;
++CREATE ROLE regress_read_all_settings PASSWORD NEON_PASSWORD_PLACEHOLDER IN ROLE pg_read_all_settings;
+ ERROR:  permission denied to grant role "pg_read_all_settings"
+ DETAIL:  Only roles with the ADMIN option on role "pg_read_all_settings" may grant this role.
+-CREATE ROLE regress_read_all_stats IN ROLE pg_read_all_stats;
++CREATE ROLE regress_read_all_stats PASSWORD NEON_PASSWORD_PLACEHOLDER IN ROLE pg_read_all_stats;
+ ERROR:  permission denied to grant role "pg_read_all_stats"
+ DETAIL:  Only roles with the ADMIN option on role "pg_read_all_stats" may grant this role.
+-CREATE ROLE regress_stat_scan_tables IN ROLE pg_stat_scan_tables;
++CREATE ROLE regress_stat_scan_tables PASSWORD NEON_PASSWORD_PLACEHOLDER IN ROLE pg_stat_scan_tables;
+ ERROR:  permission denied to grant role "pg_stat_scan_tables"
+ DETAIL:  Only roles with the ADMIN option on role "pg_stat_scan_tables" may grant this role.
+-CREATE ROLE regress_read_server_files IN ROLE pg_read_server_files;
++CREATE ROLE regress_read_server_files PASSWORD NEON_PASSWORD_PLACEHOLDER IN ROLE pg_read_server_files;
+ ERROR:  permission denied to grant role "pg_read_server_files"
+ DETAIL:  Only roles with the ADMIN option on role "pg_read_server_files" may grant this role.
+-CREATE ROLE regress_write_server_files IN ROLE pg_write_server_files;
++CREATE ROLE regress_write_server_files PASSWORD NEON_PASSWORD_PLACEHOLDER IN ROLE pg_write_server_files;
+ ERROR:  permission denied to grant role "pg_write_server_files"
+ DETAIL:  Only roles with the ADMIN option on role "pg_write_server_files" may grant this role.
+-CREATE ROLE regress_execute_server_program IN ROLE pg_execute_server_program;
++CREATE ROLE regress_execute_server_program PASSWORD NEON_PASSWORD_PLACEHOLDER IN ROLE pg_execute_server_program;
+ ERROR:  permission denied to grant role "pg_execute_server_program"
+ DETAIL:  Only roles with the ADMIN option on role "pg_execute_server_program" may grant this role.
+-CREATE ROLE regress_signal_backend IN ROLE pg_signal_backend;
++CREATE ROLE regress_signal_backend PASSWORD NEON_PASSWORD_PLACEHOLDER IN ROLE pg_signal_backend;
+ ERROR:  permission denied to grant role "pg_signal_backend"
+ DETAIL:  Only roles with the ADMIN option on role "pg_signal_backend" may grant this role.
+ -- fail, role still owns database objects
+diff --git a/src/test/regress/expected/create_schema.out b/src/test/regress/expected/create_schema.out
+index 93302a07ef..1a73f083ac 100644
+--- a/src/test/regress/expected/create_schema.out
++++ b/src/test/regress/expected/create_schema.out
+@@ -2,7 +2,7 @@
+ -- CREATE_SCHEMA
+ --
+ -- Schema creation with elements.
+-CREATE ROLE regress_create_schema_role SUPERUSER;
++CREATE ROLE regress_create_schema_role SUPERUSER PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ -- Cases where schema creation fails as objects are qualified with a schema
+ -- that does not match with what's expected.
+ -- This checks all the object types that include schema qualifications.
+diff --git a/src/test/regress/expected/create_view.out b/src/test/regress/expected/create_view.out
+index f3f8c7b5a2..3e3e54ff4c 100644
+--- a/src/test/regress/expected/create_view.out
++++ b/src/test/regress/expected/create_view.out
+@@ -18,7 +18,8 @@ CREATE TABLE real_city (
+ 	outline 	path
+ );
+ \set filename :abs_srcdir '/data/real_city.data'
+-COPY real_city FROM :'filename';
++\set command '\\copy real_city FROM ' :'filename';
++:command
+ ANALYZE real_city;
+ SELECT *
+    INTO TABLE ramp
+diff --git a/src/test/regress/expected/database.out b/src/test/regress/expected/database.out
+index 454db91ec0..01378d7081 100644
+--- a/src/test/regress/expected/database.out
++++ b/src/test/regress/expected/database.out
+@@ -1,8 +1,7 @@
+ CREATE DATABASE regression_tbd
+ 	ENCODING utf8 LC_COLLATE "C" LC_CTYPE "C" TEMPLATE template0;
+ ALTER DATABASE regression_tbd RENAME TO regression_utf8;
+-ALTER DATABASE regression_utf8 SET TABLESPACE regress_tblspace;
+-ALTER DATABASE regression_utf8 RESET TABLESPACE;
++WARNING:  you need to manually restart any running background workers after this command
+ ALTER DATABASE regression_utf8 CONNECTION_LIMIT 123;
+ -- Test PgDatabaseToastTable.  Doing this with GRANT would be slow.
+ BEGIN;
+diff --git a/src/test/regress/expected/dependency.out b/src/test/regress/expected/dependency.out
+index 6d9498cdd1..692cf979d0 100644
+--- a/src/test/regress/expected/dependency.out
++++ b/src/test/regress/expected/dependency.out
+@@ -1,10 +1,10 @@
+ --
+ -- DEPENDENCIES
+ --
+-CREATE USER regress_dep_user;
+-CREATE USER regress_dep_user2;
+-CREATE USER regress_dep_user3;
+-CREATE GROUP regress_dep_group;
++CREATE USER regress_dep_user PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE USER regress_dep_user2 PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE USER regress_dep_user3 PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE GROUP regress_dep_group PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ CREATE TABLE deptest (f1 serial primary key, f2 text);
+ GRANT SELECT ON TABLE deptest TO GROUP regress_dep_group;
+ GRANT ALL ON TABLE deptest TO regress_dep_user, regress_dep_user2;
+@@ -41,9 +41,9 @@ ERROR:  role "regress_dep_user3" cannot be dropped because some objects depend o
+ DROP TABLE deptest;
+ DROP USER regress_dep_user3;
+ -- Test DROP OWNED
+-CREATE USER regress_dep_user0;
+-CREATE USER regress_dep_user1;
+-CREATE USER regress_dep_user2;
++CREATE USER regress_dep_user0 PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE USER regress_dep_user1 PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE USER regress_dep_user2 PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ SET SESSION AUTHORIZATION regress_dep_user0;
+ -- permission denied
+ DROP OWNED BY regress_dep_user1;
+diff --git a/src/test/regress/expected/drop_if_exists.out b/src/test/regress/expected/drop_if_exists.out
+index 5e44c2c3ce..eb3bb329fb 100644
+--- a/src/test/regress/expected/drop_if_exists.out
++++ b/src/test/regress/expected/drop_if_exists.out
+@@ -64,9 +64,9 @@ ERROR:  type "test_domain_exists" does not exist
+ ---
+ --- role/user/group
+ ---
+-CREATE USER regress_test_u1;
+-CREATE ROLE regress_test_r1;
+-CREATE GROUP regress_test_g1;
++CREATE USER regress_test_u1 PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE ROLE regress_test_r1 PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE GROUP regress_test_g1 PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ DROP USER regress_test_u2;
+ ERROR:  role "regress_test_u2" does not exist
+ DROP USER IF EXISTS regress_test_u1, regress_test_u2;
+diff --git a/src/test/regress/expected/equivclass.out b/src/test/regress/expected/equivclass.out
+index 126f7047fe..0e2cc73426 100644
+--- a/src/test/regress/expected/equivclass.out
++++ b/src/test/regress/expected/equivclass.out
+@@ -384,7 +384,7 @@ set enable_nestloop = on;
+ set enable_mergejoin = off;
+ alter table ec1 enable row level security;
+ create policy p1 on ec1 using (f1 < '5'::int8alias1);
+-create user regress_user_ectest;
++create user regress_user_ectest PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ grant select on ec0 to regress_user_ectest;
+ grant select on ec1 to regress_user_ectest;
+ -- without any RLS, we'll treat {a.ff, b.ff, 43} as an EquivalenceClass
+diff --git a/src/test/regress/expected/event_trigger.out b/src/test/regress/expected/event_trigger.out
+index 5a10958df5..a578c06ebd 100644
+--- a/src/test/regress/expected/event_trigger.out
++++ b/src/test/regress/expected/event_trigger.out
+@@ -85,7 +85,7 @@ create event trigger regress_event_trigger2 on ddl_command_start
+ -- OK
+ comment on event trigger regress_event_trigger is 'test comment';
+ -- drop as non-superuser should fail
+-create role regress_evt_user;
++create role regress_evt_user PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ set role regress_evt_user;
+ create event trigger regress_event_trigger_noperms on ddl_command_start
+    execute procedure test_event_trigger();
+diff --git a/src/test/regress/expected/foreign_data.out b/src/test/regress/expected/foreign_data.out
+index 6ed50fdcfa..caa00a345d 100644
+--- a/src/test/regress/expected/foreign_data.out
++++ b/src/test/regress/expected/foreign_data.out
+@@ -14,13 +14,13 @@ CREATE FUNCTION test_fdw_handler()
+ SET client_min_messages TO 'warning';
+ DROP ROLE IF EXISTS regress_foreign_data_user, regress_test_role, regress_test_role2, regress_test_role_super, regress_test_indirect, regress_unprivileged_role;
+ RESET client_min_messages;
+-CREATE ROLE regress_foreign_data_user LOGIN SUPERUSER;
++CREATE ROLE regress_foreign_data_user LOGIN SUPERUSER PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ SET SESSION AUTHORIZATION 'regress_foreign_data_user';
+-CREATE ROLE regress_test_role;
+-CREATE ROLE regress_test_role2;
+-CREATE ROLE regress_test_role_super SUPERUSER;
+-CREATE ROLE regress_test_indirect;
+-CREATE ROLE regress_unprivileged_role;
++CREATE ROLE regress_test_role PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE ROLE regress_test_role2 PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE ROLE regress_test_role_super SUPERUSER PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE ROLE regress_test_indirect PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE ROLE regress_unprivileged_role PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ CREATE FOREIGN DATA WRAPPER dummy;
+ COMMENT ON FOREIGN DATA WRAPPER dummy IS 'useless';
+ CREATE FOREIGN DATA WRAPPER postgresql VALIDATOR postgresql_fdw_validator;
+diff --git a/src/test/regress/expected/foreign_key.out b/src/test/regress/expected/foreign_key.out
+index 12e523c737..8872e23935 100644
+--- a/src/test/regress/expected/foreign_key.out
++++ b/src/test/regress/expected/foreign_key.out
+@@ -1968,7 +1968,7 @@ ALTER TABLE fk_partitioned_fk ATTACH PARTITION fk_partitioned_fk_2
+   FOR VALUES IN (1600);
+ -- leave these tables around intentionally
+ -- test the case when the referenced table is owned by a different user
+-create role regress_other_partitioned_fk_owner;
++create role regress_other_partitioned_fk_owner PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ grant references on fk_notpartitioned_pk to regress_other_partitioned_fk_owner;
+ set role regress_other_partitioned_fk_owner;
+ create table other_partitioned_fk(a int, b int) partition by list (a);
+diff --git a/src/test/regress/expected/generated.out b/src/test/regress/expected/generated.out
+index 0f623f7119..b48588a54e 100644
+--- a/src/test/regress/expected/generated.out
++++ b/src/test/regress/expected/generated.out
+@@ -534,7 +534,7 @@ CREATE TABLE gtest10a (a int PRIMARY KEY, b int GENERATED ALWAYS AS (a * 2) STOR
+ ALTER TABLE gtest10a DROP COLUMN b;
+ INSERT INTO gtest10a (a) VALUES (1);
+ -- privileges
+-CREATE USER regress_user11;
++CREATE USER regress_user11 PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ CREATE TABLE gtest11s (a int PRIMARY KEY, b int, c int GENERATED ALWAYS AS (b * 2) STORED);
+ INSERT INTO gtest11s VALUES (1, 10), (2, 20);
+ GRANT SELECT (a, c) ON gtest11s TO regress_user11;
+diff --git a/src/test/regress/expected/guc.out b/src/test/regress/expected/guc.out
+index 127c953297..e6f8272f99 100644
+--- a/src/test/regress/expected/guc.out
++++ b/src/test/regress/expected/guc.out
+@@ -584,7 +584,7 @@ PREPARE foo AS SELECT 1;
+ LISTEN foo_event;
+ SET vacuum_cost_delay = 13;
+ CREATE TEMP TABLE tmp_foo (data text) ON COMMIT DELETE ROWS;
+-CREATE ROLE regress_guc_user;
++CREATE ROLE regress_guc_user PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ SET SESSION AUTHORIZATION regress_guc_user;
+ -- look changes
+ SELECT pg_listening_channels();
+diff --git a/src/test/regress/expected/hash_index.out b/src/test/regress/expected/hash_index.out
+index a2036a1597..805d73b9d2 100644
+--- a/src/test/regress/expected/hash_index.out
++++ b/src/test/regress/expected/hash_index.out
+@@ -20,10 +20,14 @@ CREATE TABLE hash_f8_heap (
+ 	random 		float8
+ );
+ \set filename :abs_srcdir '/data/hash.data'
+-COPY hash_i4_heap FROM :'filename';
+-COPY hash_name_heap FROM :'filename';
+-COPY hash_txt_heap FROM :'filename';
+-COPY hash_f8_heap FROM :'filename';
++\set command '\\copy hash_i4_heap FROM ' :'filename';
++:command
++\set command '\\copy hash_name_heap FROM ' :'filename';
++:command
++\set command '\\copy hash_txt_heap FROM ' :'filename';
++:command
++\set command '\\copy hash_f8_heap FROM ' :'filename';
++:command
+ -- the data in this file has a lot of duplicates in the index key
+ -- fields, leading to long bucket chains and lots of table expansion.
+ -- this is therefore a stress test of the bucket overflow code (unlike
+diff --git a/src/test/regress/expected/identity.out b/src/test/regress/expected/identity.out
+index cc7772349f..98a08eb48d 100644
+--- a/src/test/regress/expected/identity.out
++++ b/src/test/regress/expected/identity.out
+@@ -520,7 +520,7 @@ ALTER TABLE itest7 ALTER COLUMN a SET GENERATED BY DEFAULT;
+ ALTER TABLE itest7 ALTER COLUMN a RESTART;
+ ALTER TABLE itest7 ALTER COLUMN a DROP IDENTITY;
+ -- privileges
+-CREATE USER regress_identity_user1;
++CREATE USER regress_identity_user1 PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ CREATE TABLE itest8 (a int GENERATED ALWAYS AS IDENTITY, b text);
+ GRANT SELECT, INSERT ON itest8 TO regress_identity_user1;
+ SET ROLE regress_identity_user1;
+diff --git a/src/test/regress/expected/inherit.out b/src/test/regress/expected/inherit.out
+index 4943429e9b..0257f22b15 100644
+--- a/src/test/regress/expected/inherit.out
++++ b/src/test/regress/expected/inherit.out
+@@ -2606,7 +2606,7 @@ create index on permtest_parent (left(c, 3));
+ insert into permtest_parent
+   select 1, 'a', left(fipshash(i::text), 5) from generate_series(0, 100) i;
+ analyze permtest_parent;
+-create role regress_no_child_access;
++create role regress_no_child_access PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ revoke all on permtest_grandchild from regress_no_child_access;
+ grant select on permtest_parent to regress_no_child_access;
+ set session authorization regress_no_child_access;
+diff --git a/src/test/regress/expected/insert.out b/src/test/regress/expected/insert.out
+index cf4b5221a8..fa6ccb639c 100644
+--- a/src/test/regress/expected/insert.out
++++ b/src/test/regress/expected/insert.out
+@@ -802,7 +802,7 @@ drop table mlparted5;
+ -- appropriate key description (or none) in various situations
+ create table key_desc (a int, b int) partition by list ((a+0));
+ create table key_desc_1 partition of key_desc for values in (1) partition by range (b);
+-create user regress_insert_other_user;
++create user regress_insert_other_user PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ grant select (a) on key_desc_1 to regress_insert_other_user;
+ grant insert on key_desc to regress_insert_other_user;
+ set role regress_insert_other_user;
+@@ -914,7 +914,7 @@ DETAIL:  Failing row contains (2, hi there).
+ -- check that the message shows the appropriate column description in a
+ -- situation where the partitioned table is not the primary ModifyTable node
+ create table inserttest3 (f1 text default 'foo', f2 text default 'bar', f3 int);
+-create role regress_coldesc_role;
++create role regress_coldesc_role PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ grant insert on inserttest3 to regress_coldesc_role;
+ grant insert on brtrigpartcon to regress_coldesc_role;
+ revoke select on brtrigpartcon from regress_coldesc_role;
+diff --git a/src/test/regress/expected/jsonb.out b/src/test/regress/expected/jsonb.out
+index f8a7dac960..64dcaf171c 100644
+--- a/src/test/regress/expected/jsonb.out
++++ b/src/test/regress/expected/jsonb.out
+@@ -4,7 +4,8 @@ CREATE TABLE testjsonb (
+        j jsonb
+ );
+ \set filename :abs_srcdir '/data/jsonb.data'
+-COPY testjsonb FROM :'filename';
++\set command '\\copy testjsonb FROM ' :'filename';
++:command
+ -- Strings.
+ SELECT '""'::jsonb;				-- OK.
+  jsonb 
+diff --git a/src/test/regress/expected/largeobject.out b/src/test/regress/expected/largeobject.out
+index 4921dd79ae..d18a3cdd66 100644
+--- a/src/test/regress/expected/largeobject.out
++++ b/src/test/regress/expected/largeobject.out
+@@ -7,7 +7,7 @@
+ -- ensure consistent test output regardless of the default bytea format
+ SET bytea_output TO escape;
+ -- Test ALTER LARGE OBJECT OWNER
+-CREATE ROLE regress_lo_user;
++CREATE ROLE regress_lo_user PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ SELECT lo_create(42);
+  lo_create 
+ -----------
+@@ -346,7 +346,8 @@ SELECT lo_unlink(loid) from lotest_stash_values;
+ 
+ TRUNCATE lotest_stash_values;
+ \set filename :abs_srcdir '/data/tenk.data'
+-INSERT INTO lotest_stash_values (loid) SELECT lo_import(:'filename');
++\lo_import :filename
++INSERT INTO lotest_stash_values (loid) VALUES (:LASTOID);
+ BEGIN;
+ UPDATE lotest_stash_values SET fd=lo_open(loid, CAST(x'20000' | x'40000' AS integer));
+ -- verify length of large object
+@@ -410,12 +411,8 @@ SELECT lo_close(fd) FROM lotest_stash_values;
+ 
+ END;
+ \set filename :abs_builddir '/results/lotest.txt'
+-SELECT lo_export(loid, :'filename') FROM lotest_stash_values;
+- lo_export 
+------------
+-         1
+-(1 row)
+-
++SELECT loid FROM lotest_stash_values \gset
++\lo_export :loid, :filename
+ \lo_import :filename
+ \set newloid :LASTOID
+ -- just make sure \lo_export does not barf
+diff --git a/src/test/regress/expected/lock.out b/src/test/regress/expected/lock.out
+index ad137d3645..8dac447436 100644
+--- a/src/test/regress/expected/lock.out
++++ b/src/test/regress/expected/lock.out
+@@ -16,7 +16,7 @@ CREATE VIEW lock_view3 AS SELECT * from lock_view2;
+ CREATE VIEW lock_view4 AS SELECT (select a from lock_tbl1a limit 1) from lock_tbl1;
+ CREATE VIEW lock_view5 AS SELECT * from lock_tbl1 where a in (select * from lock_tbl1a);
+ CREATE VIEW lock_view6 AS SELECT * from (select * from lock_tbl1) sub;
+-CREATE ROLE regress_rol_lock1;
++CREATE ROLE regress_rol_lock1 PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ ALTER ROLE regress_rol_lock1 SET search_path = lock_schema1;
+ GRANT USAGE ON SCHEMA lock_schema1 TO regress_rol_lock1;
+ -- Try all valid lock options; also try omitting the optional TABLE keyword.
+diff --git a/src/test/regress/expected/matview.out b/src/test/regress/expected/matview.out
+index 67a50bde3d..7eeafd2603 100644
+--- a/src/test/regress/expected/matview.out
++++ b/src/test/regress/expected/matview.out
+@@ -549,7 +549,7 @@ SELECT * FROM mvtest_mv_v;
+ DROP TABLE mvtest_v CASCADE;
+ NOTICE:  drop cascades to materialized view mvtest_mv_v
+ -- make sure running as superuser works when MV owned by another role (bug #11208)
+-CREATE ROLE regress_user_mvtest;
++CREATE ROLE regress_user_mvtest PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ SET ROLE regress_user_mvtest;
+ -- this test case also checks for ambiguity in the queries issued by
+ -- refresh_by_match_merge(), by choosing column names that intentionally
+@@ -615,7 +615,7 @@ HINT:  Use the REFRESH MATERIALIZED VIEW command.
+ ROLLBACK;
+ -- INSERT privileges if relation owner is not allowed to insert.
+ CREATE SCHEMA matview_schema;
+-CREATE USER regress_matview_user;
++CREATE USER regress_matview_user PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ ALTER DEFAULT PRIVILEGES FOR ROLE regress_matview_user
+   REVOKE INSERT ON TABLES FROM regress_matview_user;
+ GRANT ALL ON SCHEMA matview_schema TO public;
+diff --git a/src/test/regress/expected/merge.out b/src/test/regress/expected/merge.out
+index bc9a59803f..5b9ddf0626 100644
+--- a/src/test/regress/expected/merge.out
++++ b/src/test/regress/expected/merge.out
+@@ -1,9 +1,9 @@
+ --
+ -- MERGE
+ --
+-CREATE USER regress_merge_privs;
+-CREATE USER regress_merge_no_privs;
+-CREATE USER regress_merge_none;
++CREATE USER regress_merge_privs PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE USER regress_merge_no_privs PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE USER regress_merge_none PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ DROP TABLE IF EXISTS target;
+ NOTICE:  table "target" does not exist, skipping
+ DROP TABLE IF EXISTS source;
+diff --git a/src/test/regress/expected/misc.out b/src/test/regress/expected/misc.out
+index 6e816c57f1..6ef45b468e 100644
+--- a/src/test/regress/expected/misc.out
++++ b/src/test/regress/expected/misc.out
+@@ -59,9 +59,11 @@ DROP TABLE tmp;
+ -- copy
+ --
+ \set filename :abs_builddir '/results/onek.data'
+-COPY onek TO :'filename';
++\set command '\\copy onek TO ' :'filename';
++:command
+ CREATE TEMP TABLE onek_copy (LIKE onek);
+-COPY onek_copy FROM :'filename';
++\set command '\\copy onek_copy FROM ' :'filename';
++:command
+ SELECT * FROM onek EXCEPT ALL SELECT * FROM onek_copy;
+  unique1 | unique2 | two | four | ten | twenty | hundred | thousand | twothousand | fivethous | tenthous | odd | even | stringu1 | stringu2 | string4 
+ ---------+---------+-----+------+-----+--------+---------+----------+-------------+-----------+----------+-----+------+----------+----------+---------
+@@ -73,9 +75,11 @@ SELECT * FROM onek_copy EXCEPT ALL SELECT * FROM onek;
+ (0 rows)
+ 
+ \set filename :abs_builddir '/results/stud_emp.data'
+-COPY BINARY stud_emp TO :'filename';
++\set command '\\COPY BINARY stud_emp TO ' :'filename';
++:command
+ CREATE TEMP TABLE stud_emp_copy (LIKE stud_emp);
+-COPY BINARY stud_emp_copy FROM :'filename';
++\set command '\\COPY BINARY stud_emp_copy FROM ' :'filename';
++:command
+ SELECT * FROM stud_emp_copy;
+  name  | age |  location  | salary | manager | gpa | percent 
+ -------+-----+------------+--------+---------+-----+---------
+diff --git a/src/test/regress/expected/misc_functions.out b/src/test/regress/expected/misc_functions.out
+index c669948370..47111b1d24 100644
+--- a/src/test/regress/expected/misc_functions.out
++++ b/src/test/regress/expected/misc_functions.out
+@@ -297,7 +297,7 @@ SELECT pg_log_backend_memory_contexts(pid) FROM pg_stat_activity
+  t
+ (1 row)
+ 
+-CREATE ROLE regress_log_memory;
++CREATE ROLE regress_log_memory PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ SELECT has_function_privilege('regress_log_memory',
+   'pg_log_backend_memory_contexts(integer)', 'EXECUTE'); -- no
+  has_function_privilege 
+@@ -483,7 +483,7 @@ select count(*) > 0 from
+ --
+ -- Test replication slot directory functions
+ --
+-CREATE ROLE regress_slot_dir_funcs;
++CREATE ROLE regress_slot_dir_funcs PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ -- Not available by default.
+ SELECT has_function_privilege('regress_slot_dir_funcs',
+   'pg_ls_logicalsnapdir()', 'EXECUTE');
+diff --git a/src/test/regress/expected/object_address.out b/src/test/regress/expected/object_address.out
+index fc42d418bf..e38f517574 100644
+--- a/src/test/regress/expected/object_address.out
++++ b/src/test/regress/expected/object_address.out
+@@ -5,7 +5,7 @@
+ SET client_min_messages TO 'warning';
+ DROP ROLE IF EXISTS regress_addr_user;
+ RESET client_min_messages;
+-CREATE USER regress_addr_user;
++CREATE USER regress_addr_user PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ -- Test generic object addressing/identification functions
+ CREATE SCHEMA addr_nsp;
+ SET search_path TO 'addr_nsp';
+diff --git a/src/test/regress/expected/password.out b/src/test/regress/expected/password.out
+index 8475231735..1afae5395f 100644
+--- a/src/test/regress/expected/password.out
++++ b/src/test/regress/expected/password.out
+@@ -12,11 +12,11 @@ SET password_encryption = 'md5'; -- ok
+ SET password_encryption = 'scram-sha-256'; -- ok
+ -- consistency of password entries
+ SET password_encryption = 'md5';
+-CREATE ROLE regress_passwd1 PASSWORD 'role_pwd1';
+-CREATE ROLE regress_passwd2 PASSWORD 'role_pwd2';
++CREATE ROLE regress_passwd1 PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE ROLE regress_passwd2 PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ SET password_encryption = 'scram-sha-256';
+-CREATE ROLE regress_passwd3 PASSWORD 'role_pwd3';
+-CREATE ROLE regress_passwd4 PASSWORD NULL;
++CREATE ROLE regress_passwd3 PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE ROLE regress_passwd4 PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ -- check list of created entries
+ --
+ -- The scram secret will look something like:
+@@ -30,10 +30,10 @@ SELECT rolname, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+
+     ORDER BY rolname, rolpassword;
+      rolname     |                rolpassword_masked                 
+ -----------------+---------------------------------------------------
+- regress_passwd1 | md5783277baca28003b33453252be4dbb34
+- regress_passwd2 | md54044304ba511dd062133eb5b4b84a2a3
++ regress_passwd1 | NEON_MD5_PLACEHOLDER_regress_passwd1
++ regress_passwd2 | NEON_MD5_PLACEHOLDER_regress_passwd2
+  regress_passwd3 | SCRAM-SHA-256$4096:<salt>$<storedkey>:<serverkey>
+- regress_passwd4 | 
++ regress_passwd4 | SCRAM-SHA-256$4096:<salt>$<storedkey>:<serverkey>
+ (4 rows)
+ 
+ -- Rename a role
+@@ -54,24 +54,30 @@ ALTER ROLE regress_passwd2_new RENAME TO regress_passwd2;
+ -- passwords.
+ SET password_encryption = 'md5';
+ -- encrypt with MD5
+-ALTER ROLE regress_passwd2 PASSWORD 'foo';
++ALTER ROLE regress_passwd2 PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ -- already encrypted, use as they are
+ ALTER ROLE regress_passwd1 PASSWORD 'md5cd3578025fe2c3d7ed1b9a9b26238b70';
++ERROR:  Received HTTP code 400 from control plane: {"error":"Neon only supports being given plaintext passwords"}
+ ALTER ROLE regress_passwd3 PASSWORD 'SCRAM-SHA-256$4096:VLK4RMaQLCvNtQ==$6YtlR4t69SguDiwFvbVgVZtuz6gpJQQqUMZ7IQJK5yI=:ps75jrHeYU4lXCcXI4O8oIdJ3eO8o2jirjruw9phBTo=';
++ERROR:  Received HTTP code 400 from control plane: {"error":"Neon only supports being given plaintext passwords"}
+ SET password_encryption = 'scram-sha-256';
+ -- create SCRAM secret
+-ALTER ROLE  regress_passwd4 PASSWORD 'foo';
++ALTER ROLE  regress_passwd4 PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ -- already encrypted with MD5, use as it is
+ CREATE ROLE regress_passwd5 PASSWORD 'md5e73a4b11df52a6068f8b39f90be36023';
++ERROR:  Received HTTP code 400 from control plane: {"error":"Neon only supports being given plaintext passwords"}
+ -- This looks like a valid SCRAM-SHA-256 secret, but it is not
+ -- so it should be hashed with SCRAM-SHA-256.
+ CREATE ROLE regress_passwd6 PASSWORD 'SCRAM-SHA-256$1234';
++ERROR:  Received HTTP code 400 from control plane: {"error":"Neon only supports being given plaintext passwords"}
+ -- These may look like valid MD5 secrets, but they are not, so they
+ -- should be hashed with SCRAM-SHA-256.
+ -- trailing garbage at the end
+ CREATE ROLE regress_passwd7 PASSWORD 'md5012345678901234567890123456789zz';
++ERROR:  Received HTTP code 400 from control plane: {"error":"Neon only supports being given plaintext passwords"}
+ -- invalid length
+ CREATE ROLE regress_passwd8 PASSWORD 'md501234567890123456789012345678901zz';
++ERROR:  Received HTTP code 400 from control plane: {"error":"Neon only supports being given plaintext passwords"}
+ -- Changing the SCRAM iteration count
+ SET scram_iterations = 1024;
+ CREATE ROLE regress_passwd9 PASSWORD 'alterediterationcount';
+@@ -81,63 +87,67 @@ SELECT rolname, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+
+     ORDER BY rolname, rolpassword;
+      rolname     |                rolpassword_masked                 
+ -----------------+---------------------------------------------------
+- regress_passwd1 | md5cd3578025fe2c3d7ed1b9a9b26238b70
+- regress_passwd2 | md5dfa155cadd5f4ad57860162f3fab9cdb
++ regress_passwd1 | NEON_MD5_PLACEHOLDER_regress_passwd1
++ regress_passwd2 | NEON_MD5_PLACEHOLDER_regress_passwd2
+  regress_passwd3 | SCRAM-SHA-256$4096:<salt>$<storedkey>:<serverkey>
+  regress_passwd4 | SCRAM-SHA-256$4096:<salt>$<storedkey>:<serverkey>
+- regress_passwd5 | md5e73a4b11df52a6068f8b39f90be36023
+- regress_passwd6 | SCRAM-SHA-256$4096:<salt>$<storedkey>:<serverkey>
+- regress_passwd7 | SCRAM-SHA-256$4096:<salt>$<storedkey>:<serverkey>
+- regress_passwd8 | SCRAM-SHA-256$4096:<salt>$<storedkey>:<serverkey>
+  regress_passwd9 | SCRAM-SHA-256$1024:<salt>$<storedkey>:<serverkey>
+-(9 rows)
++(5 rows)
+ 
+ -- An empty password is not allowed, in any form
+ CREATE ROLE regress_passwd_empty PASSWORD '';
+ NOTICE:  empty string is not a valid password, clearing password
++ERROR:  Failed to get encrypted password: User "regress_passwd_empty" has no password assigned.
+ ALTER ROLE regress_passwd_empty PASSWORD 'md585939a5ce845f1a1b620742e3c659e0a';
+-NOTICE:  empty string is not a valid password, clearing password
++ERROR:  role "regress_passwd_empty" does not exist
+ ALTER ROLE regress_passwd_empty PASSWORD 'SCRAM-SHA-256$4096:hpFyHTUsSWcR7O9P$LgZFIt6Oqdo27ZFKbZ2nV+vtnYM995pDh9ca6WSi120=:qVV5NeluNfUPkwm7Vqat25RjSPLkGeoZBQs6wVv+um4=';
+-NOTICE:  empty string is not a valid password, clearing password
++ERROR:  role "regress_passwd_empty" does not exist
+ SELECT rolpassword FROM pg_authid WHERE rolname='regress_passwd_empty';
+  rolpassword 
+ -------------
+- 
+-(1 row)
++(0 rows)
+ 
+ -- Test with invalid stored and server keys.
+ --
+ -- The first is valid, to act as a control. The others have too long
+ -- stored/server keys. They will be re-hashed.
+ CREATE ROLE regress_passwd_sha_len0 PASSWORD 'SCRAM-SHA-256$4096:A6xHKoH/494E941doaPOYg==$Ky+A30sewHIH3VHQLRN9vYsuzlgNyGNKCh37dy96Rqw=:COPdlNiIkrsacU5QoxydEuOH6e/KfiipeETb/bPw8ZI=';
++ERROR:  Received HTTP code 400 from control plane: {"error":"Neon only supports being given plaintext passwords"}
+ CREATE ROLE regress_passwd_sha_len1 PASSWORD 'SCRAM-SHA-256$4096:A6xHKoH/494E941doaPOYg==$Ky+A30sewHIH3VHQLRN9vYsuzlgNyGNKCh37dy96RqwAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA=:COPdlNiIkrsacU5QoxydEuOH6e/KfiipeETb/bPw8ZI=';
++ERROR:  Received HTTP code 400 from control plane: {"error":"Neon only supports being given plaintext passwords"}
+ CREATE ROLE regress_passwd_sha_len2 PASSWORD 'SCRAM-SHA-256$4096:A6xHKoH/494E941doaPOYg==$Ky+A30sewHIH3VHQLRN9vYsuzlgNyGNKCh37dy96Rqw=:COPdlNiIkrsacU5QoxydEuOH6e/KfiipeETb/bPw8ZIAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA=';
++ERROR:  Received HTTP code 400 from control plane: {"error":"Neon only supports being given plaintext passwords"}
+ -- Check that the invalid secrets were re-hashed. A re-hashed secret
+ -- should not contain the original salt.
+ SELECT rolname, rolpassword not like '%A6xHKoH/494E941doaPOYg==%' as is_rolpassword_rehashed
+     FROM pg_authid
+     WHERE rolname LIKE 'regress_passwd_sha_len%'
+     ORDER BY rolname;
+-         rolname         | is_rolpassword_rehashed 
+--------------------------+-------------------------
+- regress_passwd_sha_len0 | f
+- regress_passwd_sha_len1 | t
+- regress_passwd_sha_len2 | t
+-(3 rows)
++ rolname | is_rolpassword_rehashed 
++---------+-------------------------
++(0 rows)
+ 
+ DROP ROLE regress_passwd1;
+ DROP ROLE regress_passwd2;
+ DROP ROLE regress_passwd3;
+ DROP ROLE regress_passwd4;
+ DROP ROLE regress_passwd5;
++ERROR:  role "regress_passwd5" does not exist
+ DROP ROLE regress_passwd6;
++ERROR:  role "regress_passwd6" does not exist
+ DROP ROLE regress_passwd7;
++ERROR:  role "regress_passwd7" does not exist
+ DROP ROLE regress_passwd8;
++ERROR:  role "regress_passwd8" does not exist
+ DROP ROLE regress_passwd9;
+ DROP ROLE regress_passwd_empty;
++ERROR:  role "regress_passwd_empty" does not exist
+ DROP ROLE regress_passwd_sha_len0;
++ERROR:  role "regress_passwd_sha_len0" does not exist
+ DROP ROLE regress_passwd_sha_len1;
++ERROR:  role "regress_passwd_sha_len1" does not exist
+ DROP ROLE regress_passwd_sha_len2;
++ERROR:  role "regress_passwd_sha_len2" does not exist
+ -- all entries should have been removed
+ SELECT rolname, rolpassword
+     FROM pg_authid
+diff --git a/src/test/regress/expected/privileges.out b/src/test/regress/expected/privileges.out
+index fbb0489a4f..2905194e2c 100644
+--- a/src/test/regress/expected/privileges.out
++++ b/src/test/regress/expected/privileges.out
+@@ -20,19 +20,19 @@ SELECT lo_unlink(oid) FROM pg_largeobject_metadata WHERE oid >= 1000 AND oid < 3
+ 
+ RESET client_min_messages;
+ -- test proper begins here
+-CREATE USER regress_priv_user1;
+-CREATE USER regress_priv_user2;
+-CREATE USER regress_priv_user3;
+-CREATE USER regress_priv_user4;
+-CREATE USER regress_priv_user5;
+-CREATE USER regress_priv_user5;	-- duplicate
++CREATE USER regress_priv_user1 PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE USER regress_priv_user2 PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE USER regress_priv_user3 PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE USER regress_priv_user4 PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE USER regress_priv_user5 PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE USER regress_priv_user5 PASSWORD NEON_PASSWORD_PLACEHOLDER;	-- duplicate
+ ERROR:  role "regress_priv_user5" already exists
+-CREATE USER regress_priv_user6;
+-CREATE USER regress_priv_user7;
+-CREATE USER regress_priv_user8;
+-CREATE USER regress_priv_user9;
+-CREATE USER regress_priv_user10;
+-CREATE ROLE regress_priv_role;
++CREATE USER regress_priv_user6 PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE USER regress_priv_user7 PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE USER regress_priv_user8 PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE USER regress_priv_user9 PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE USER regress_priv_user10 PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE ROLE regress_priv_role PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ -- circular ADMIN OPTION grants should be disallowed
+ GRANT regress_priv_user1 TO regress_priv_user2 WITH ADMIN OPTION;
+ GRANT regress_priv_user1 TO regress_priv_user3 WITH ADMIN OPTION GRANTED BY regress_priv_user2;
+@@ -108,11 +108,11 @@ ERROR:  role "regress_priv_user5" cannot be dropped because some objects depend
+ DETAIL:  privileges for membership of role regress_priv_user6 in role regress_priv_user1
+ DROP ROLE regress_priv_user1, regress_priv_user5; -- ok, despite order
+ -- recreate the roles we just dropped
+-CREATE USER regress_priv_user1;
+-CREATE USER regress_priv_user2;
+-CREATE USER regress_priv_user3;
+-CREATE USER regress_priv_user4;
+-CREATE USER regress_priv_user5;
++CREATE USER regress_priv_user1 PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE USER regress_priv_user2 PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE USER regress_priv_user3 PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE USER regress_priv_user4 PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE USER regress_priv_user5 PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ GRANT pg_read_all_data TO regress_priv_user6;
+ GRANT pg_write_all_data TO regress_priv_user7;
+ GRANT pg_read_all_settings TO regress_priv_user8 WITH ADMIN OPTION;
+@@ -145,8 +145,8 @@ REVOKE pg_read_all_settings FROM regress_priv_user8;
+ DROP USER regress_priv_user10;
+ DROP USER regress_priv_user9;
+ DROP USER regress_priv_user8;
+-CREATE GROUP regress_priv_group1;
+-CREATE GROUP regress_priv_group2 WITH ADMIN regress_priv_user1 USER regress_priv_user2;
++CREATE GROUP regress_priv_group1 PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE GROUP regress_priv_group2 WITH ADMIN regress_priv_user1 PASSWORD NEON_PASSWORD_PLACEHOLDER USER regress_priv_user2;
+ ALTER GROUP regress_priv_group1 ADD USER regress_priv_user4;
+ GRANT regress_priv_group2 TO regress_priv_user2 GRANTED BY regress_priv_user1;
+ SET SESSION AUTHORIZATION regress_priv_user1;
+@@ -172,12 +172,16 @@ GRANT regress_priv_role TO regress_priv_user1 WITH ADMIN OPTION GRANTED BY regre
+ ERROR:  permission denied to grant privileges as role "regress_priv_role"
+ DETAIL:  The grantor must have the ADMIN option on role "regress_priv_role".
+ GRANT regress_priv_role TO regress_priv_user1 WITH ADMIN OPTION GRANTED BY CURRENT_ROLE;
++ERROR:  permission denied to grant privileges as role "neondb_owner"
++DETAIL:  The grantor must have the ADMIN option on role "regress_priv_role".
+ REVOKE ADMIN OPTION FOR regress_priv_role FROM regress_priv_user1 GRANTED BY foo; -- error
+ ERROR:  role "foo" does not exist
+ REVOKE ADMIN OPTION FOR regress_priv_role FROM regress_priv_user1 GRANTED BY regress_priv_user2; -- warning, noop
+ WARNING:  role "regress_priv_user1" has not been granted membership in role "regress_priv_role" by role "regress_priv_user2"
+ REVOKE ADMIN OPTION FOR regress_priv_role FROM regress_priv_user1 GRANTED BY CURRENT_USER;
++WARNING:  role "regress_priv_user1" has not been granted membership in role "regress_priv_role" by role "neondb_owner"
+ REVOKE regress_priv_role FROM regress_priv_user1 GRANTED BY CURRENT_ROLE;
++WARNING:  role "regress_priv_user1" has not been granted membership in role "regress_priv_role" by role "neondb_owner"
+ DROP ROLE regress_priv_role;
+ SET SESSION AUTHORIZATION regress_priv_user1;
+ SELECT session_user, current_user;
+@@ -1709,7 +1713,7 @@ SELECT has_table_privilege('regress_priv_user1', 'atest4', 'SELECT WITH GRANT OP
+ 
+ -- security-restricted operations
+ \c -
+-CREATE ROLE regress_sro_user;
++CREATE ROLE regress_sro_user PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ -- Check that index expressions and predicates are run as the table's owner
+ -- A dummy index function checking current_user
+ CREATE FUNCTION sro_ifun(int) RETURNS int AS $$
+@@ -2601,8 +2605,8 @@ drop cascades to function testns.priv_testagg(integer)
+ drop cascades to function testns.priv_testproc(integer)
+ -- Change owner of the schema & and rename of new schema owner
+ \c -
+-CREATE ROLE regress_schemauser1 superuser login;
+-CREATE ROLE regress_schemauser2 superuser login;
++CREATE ROLE regress_schemauser1 superuser login PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE ROLE regress_schemauser2 superuser login PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ SET SESSION ROLE regress_schemauser1;
+ CREATE SCHEMA testns;
+ SELECT nspname, rolname FROM pg_namespace, pg_roles WHERE pg_namespace.nspname = 'testns' AND pg_namespace.nspowner = pg_roles.oid;
+@@ -2725,7 +2729,7 @@ DROP USER regress_priv_user7;
+ DROP USER regress_priv_user8; -- does not exist
+ ERROR:  role "regress_priv_user8" does not exist
+ -- permissions with LOCK TABLE
+-CREATE USER regress_locktable_user;
++CREATE USER regress_locktable_user PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ CREATE TABLE lock_table (a int);
+ -- LOCK TABLE and SELECT permission
+ GRANT SELECT ON lock_table TO regress_locktable_user;
+@@ -2807,7 +2811,7 @@ DROP USER regress_locktable_user;
+ -- pg_backend_memory_contexts.
+ -- switch to superuser
+ \c -
+-CREATE ROLE regress_readallstats;
++CREATE ROLE regress_readallstats PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ SELECT has_table_privilege('regress_readallstats','pg_backend_memory_contexts','SELECT'); -- no
+  has_table_privilege 
+ ---------------------
+@@ -2851,10 +2855,10 @@ RESET ROLE;
+ -- clean up
+ DROP ROLE regress_readallstats;
+ -- test role grantor machinery
+-CREATE ROLE regress_group;
+-CREATE ROLE regress_group_direct_manager;
+-CREATE ROLE regress_group_indirect_manager;
+-CREATE ROLE regress_group_member;
++CREATE ROLE regress_group PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE ROLE regress_group_direct_manager PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE ROLE regress_group_indirect_manager PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE ROLE regress_group_member PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ GRANT regress_group TO regress_group_direct_manager WITH INHERIT FALSE, ADMIN TRUE;
+ GRANT regress_group_direct_manager TO regress_group_indirect_manager;
+ SET SESSION AUTHORIZATION regress_group_direct_manager;
+@@ -2883,9 +2887,9 @@ DROP ROLE regress_group_direct_manager;
+ DROP ROLE regress_group_indirect_manager;
+ DROP ROLE regress_group_member;
+ -- test SET and INHERIT options with object ownership changes
+-CREATE ROLE regress_roleoption_protagonist;
+-CREATE ROLE regress_roleoption_donor;
+-CREATE ROLE regress_roleoption_recipient;
++CREATE ROLE regress_roleoption_protagonist PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE ROLE regress_roleoption_donor PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE ROLE regress_roleoption_recipient PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ CREATE SCHEMA regress_roleoption;
+ GRANT CREATE, USAGE ON SCHEMA regress_roleoption TO PUBLIC;
+ GRANT regress_roleoption_donor TO regress_roleoption_protagonist WITH INHERIT TRUE, SET FALSE;
+diff --git a/src/test/regress/expected/psql.out b/src/test/regress/expected/psql.out
+index 7cd0c27cca..d7a124ed68 100644
+--- a/src/test/regress/expected/psql.out
++++ b/src/test/regress/expected/psql.out
+@@ -2857,7 +2857,7 @@ Type                | func
+ -- check conditional am display
+ \pset expanded off
+ CREATE SCHEMA tableam_display;
+-CREATE ROLE regress_display_role;
++CREATE ROLE regress_display_role PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ ALTER SCHEMA tableam_display OWNER TO regress_display_role;
+ SET search_path TO tableam_display;
+ CREATE ACCESS METHOD heap_psql TYPE TABLE HANDLER heap_tableam_handler;
+@@ -4808,7 +4808,7 @@ last error message: division by zero
+ last error code: 22012
+ \unset FETCH_COUNT
+ create schema testpart;
+-create role regress_partitioning_role;
++create role regress_partitioning_role PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ alter schema testpart owner to regress_partitioning_role;
+ set role to regress_partitioning_role;
+ -- run test inside own schema and hide other partitions
+@@ -5260,7 +5260,7 @@ reset work_mem;
+ 
+ -- check \df+
+ -- we have to use functions with a predictable owner name, so make a role
+-create role regress_psql_user superuser;
++create role regress_psql_user superuser PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ begin;
+ set session authorization regress_psql_user;
+ create function psql_df_internal (float8)
+@@ -5544,11 +5544,14 @@ CREATE TEMPORARY TABLE reload_output(
+   line text
+ );
+ SELECT 1 AS a \g :g_out_file
+-COPY reload_output(line) FROM :'g_out_file';
++\set command '\\COPY reload_output(line) FROM ' :'g_out_file';
++:command
+ SELECT 2 AS b\; SELECT 3 AS c\; SELECT 4 AS d \g :g_out_file
+-COPY reload_output(line) FROM :'g_out_file';
++\set command '\\COPY reload_output(line) FROM ' :'g_out_file';
++:command
+ COPY (SELECT 'foo') TO STDOUT \; COPY (SELECT 'bar') TO STDOUT \g :g_out_file
+-COPY reload_output(line) FROM :'g_out_file';
++\set command '\\COPY reload_output(line) FROM ' :'g_out_file';
++:command
+ SELECT line FROM reload_output ORDER BY lineno;
+   line   
+ ---------
+@@ -5587,13 +5590,15 @@ SELECT 1 AS a\; SELECT 2 AS b\; SELECT 3 AS c;
+ -- COPY TO file
+ -- The data goes to :g_out_file and the status to :o_out_file
+ \set QUIET false
+-COPY (SELECT unique1 FROM onek ORDER BY unique1 LIMIT 10) TO :'g_out_file';
++\set command '\\COPY (SELECT unique1 FROM onek ORDER BY unique1 LIMIT 10) TO ' :'g_out_file';
++:command
+ -- DML command status
+ UPDATE onek SET unique1 = unique1 WHERE false;
+ \set QUIET true
+ \o
+ -- Check the contents of the files generated.
+-COPY reload_output(line) FROM :'g_out_file';
++\set command '\\COPY reload_output(line) FROM ' :'g_out_file';
++:command
+ SELECT line FROM reload_output ORDER BY lineno;
+  line 
+ ------
+@@ -5610,7 +5615,8 @@ SELECT line FROM reload_output ORDER BY lineno;
+ (10 rows)
+ 
+ TRUNCATE TABLE reload_output;
+-COPY reload_output(line) FROM :'o_out_file';
++\set command '\\COPY reload_output(line) FROM ' :'o_out_file';
++:command
+ SELECT line FROM reload_output ORDER BY lineno;
+    line   
+ ----------
+@@ -5647,7 +5653,8 @@ COPY (SELECT 'foo1') TO STDOUT \; COPY (SELECT 'bar1') TO STDOUT;
+ COPY (SELECT 'foo2') TO STDOUT \; COPY (SELECT 'bar2') TO STDOUT \g :g_out_file
+ \o
+ -- Check the contents of the files generated.
+-COPY reload_output(line) FROM :'g_out_file';
++\set command '\\COPY reload_output(line) FROM ' :'g_out_file';
++:command
+ SELECT line FROM reload_output ORDER BY lineno;
+  line 
+ ------
+@@ -5656,7 +5663,8 @@ SELECT line FROM reload_output ORDER BY lineno;
+ (2 rows)
+ 
+ TRUNCATE TABLE reload_output;
+-COPY reload_output(line) FROM :'o_out_file';
++\set command '\\COPY reload_output(line) FROM ' :'o_out_file';
++:command
+ SELECT line FROM reload_output ORDER BY lineno;
+  line 
+ ------
+@@ -6619,10 +6627,10 @@ cross-database references are not implemented: "no.such.database"."no.such.schem
+ \dX "no.such.database"."no.such.schema"."no.such.extended.statistics"
+ cross-database references are not implemented: "no.such.database"."no.such.schema"."no.such.extended.statistics"
+ -- check \drg and \du
+-CREATE ROLE regress_du_role0;
+-CREATE ROLE regress_du_role1;
+-CREATE ROLE regress_du_role2;
+-CREATE ROLE regress_du_admin;
++CREATE ROLE regress_du_role0 PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE ROLE regress_du_role1 PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE ROLE regress_du_role2 PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE ROLE regress_du_admin PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ GRANT regress_du_role0 TO regress_du_admin WITH ADMIN TRUE;
+ GRANT regress_du_role1 TO regress_du_admin WITH ADMIN TRUE;
+ GRANT regress_du_role2 TO regress_du_admin WITH ADMIN TRUE;
+diff --git a/src/test/regress/expected/publication.out b/src/test/regress/expected/publication.out
+index 69dc6cfd85..68390cc18a 100644
+--- a/src/test/regress/expected/publication.out
++++ b/src/test/regress/expected/publication.out
+@@ -1,9 +1,9 @@
+ --
+ -- PUBLICATION
+ --
+-CREATE ROLE regress_publication_user LOGIN SUPERUSER;
+-CREATE ROLE regress_publication_user2;
+-CREATE ROLE regress_publication_user_dummy LOGIN NOSUPERUSER;
++CREATE ROLE regress_publication_user LOGIN SUPERUSER PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE ROLE regress_publication_user2 PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE ROLE regress_publication_user_dummy LOGIN NOSUPERUSER PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ SET SESSION AUTHORIZATION 'regress_publication_user';
+ -- suppress warning that depends on wal_level
+ SET client_min_messages = 'ERROR';
+@@ -1211,7 +1211,7 @@ ALTER PUBLICATION testpub2 ADD TABLE testpub_tbl1;  -- ok
+ DROP PUBLICATION testpub2;
+ DROP PUBLICATION testpub3;
+ SET ROLE regress_publication_user;
+-CREATE ROLE regress_publication_user3;
++CREATE ROLE regress_publication_user3 PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ GRANT regress_publication_user2 TO regress_publication_user3;
+ SET client_min_messages = 'ERROR';
+ CREATE PUBLICATION testpub4 FOR TABLES IN SCHEMA pub_test;
+diff --git a/src/test/regress/expected/regproc.out b/src/test/regress/expected/regproc.out
+index a9420850b8..bd3b5f312d 100644
+--- a/src/test/regress/expected/regproc.out
++++ b/src/test/regress/expected/regproc.out
+@@ -2,7 +2,7 @@
+ -- regproc
+ --
+ /* If objects exist, return oids */
+-CREATE ROLE regress_regrole_test;
++CREATE ROLE regress_regrole_test PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ -- without schemaname
+ SELECT regoper('||/');
+  regoper 
+diff --git a/src/test/regress/expected/roleattributes.out b/src/test/regress/expected/roleattributes.out
+index 5e6969b173..2c4d52237f 100644
+--- a/src/test/regress/expected/roleattributes.out
++++ b/src/test/regress/expected/roleattributes.out
+@@ -1,233 +1,233 @@
+ -- default for superuser is false
+-CREATE ROLE regress_test_def_superuser;
+-SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_def_superuser';
+-          rolname           | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | rolpassword | rolvaliduntil 
+-----------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+-------------+---------------
+- regress_test_def_superuser | f        | t          | f             | f           | f           | f              | f            |           -1 |             | 
++CREATE ROLE regress_test_def_superuser PASSWORD NEON_PASSWORD_PLACEHOLDER;
++SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:<salt>$<storedkey>:<serverkey>'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_def_superuser';
++          rolname           | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit |                  regexp_replace                   | rolvaliduntil 
++----------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+---------------------------------------------------+---------------
++ regress_test_def_superuser | f        | t          | f             | f           | f           | f              | f            |           -1 | SCRAM-SHA-256$4096:<salt>$<storedkey>:<serverkey> | 
+ (1 row)
+ 
+-CREATE ROLE regress_test_superuser WITH SUPERUSER;
+-SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_superuser';
+-        rolname         | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | rolpassword | rolvaliduntil 
+-------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+-------------+---------------
+- regress_test_superuser | t        | t          | f             | f           | f           | f              | f            |           -1 |             | 
++CREATE ROLE regress_test_superuser WITH SUPERUSER PASSWORD NEON_PASSWORD_PLACEHOLDER;
++SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:<salt>$<storedkey>:<serverkey>'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_superuser';
++        rolname         | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit |                  regexp_replace                   | rolvaliduntil 
++------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+---------------------------------------------------+---------------
++ regress_test_superuser | t        | t          | f             | f           | f           | f              | f            |           -1 | SCRAM-SHA-256$4096:<salt>$<storedkey>:<serverkey> | 
+ (1 row)
+ 
+ ALTER ROLE regress_test_superuser WITH NOSUPERUSER;
+-SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_superuser';
+-        rolname         | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | rolpassword | rolvaliduntil 
+-------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+-------------+---------------
+- regress_test_superuser | f        | t          | f             | f           | f           | f              | f            |           -1 |             | 
++SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:<salt>$<storedkey>:<serverkey>'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_superuser';
++        rolname         | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit |                  regexp_replace                   | rolvaliduntil 
++------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+---------------------------------------------------+---------------
++ regress_test_superuser | f        | t          | f             | f           | f           | f              | f            |           -1 | SCRAM-SHA-256$4096:<salt>$<storedkey>:<serverkey> | 
+ (1 row)
+ 
+ ALTER ROLE regress_test_superuser WITH SUPERUSER;
+-SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_superuser';
+-        rolname         | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | rolpassword | rolvaliduntil 
+-------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+-------------+---------------
+- regress_test_superuser | t        | t          | f             | f           | f           | f              | f            |           -1 |             | 
++SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:<salt>$<storedkey>:<serverkey>'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_superuser';
++        rolname         | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit |                  regexp_replace                   | rolvaliduntil 
++------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+---------------------------------------------------+---------------
++ regress_test_superuser | t        | t          | f             | f           | f           | f              | f            |           -1 | SCRAM-SHA-256$4096:<salt>$<storedkey>:<serverkey> | 
+ (1 row)
+ 
+ -- default for inherit is true
+-CREATE ROLE regress_test_def_inherit;
+-SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_def_inherit';
+-         rolname          | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | rolpassword | rolvaliduntil 
+---------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+-------------+---------------
+- regress_test_def_inherit | f        | t          | f             | f           | f           | f              | f            |           -1 |             | 
++CREATE ROLE regress_test_def_inherit PASSWORD NEON_PASSWORD_PLACEHOLDER;
++SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:<salt>$<storedkey>:<serverkey>'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_def_inherit';
++         rolname          | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit |                  regexp_replace                   | rolvaliduntil 
++--------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+---------------------------------------------------+---------------
++ regress_test_def_inherit | f        | t          | f             | f           | f           | f              | f            |           -1 | SCRAM-SHA-256$4096:<salt>$<storedkey>:<serverkey> | 
+ (1 row)
+ 
+-CREATE ROLE regress_test_inherit WITH NOINHERIT;
+-SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_inherit';
+-       rolname        | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | rolpassword | rolvaliduntil 
+-----------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+-------------+---------------
+- regress_test_inherit | f        | f          | f             | f           | f           | f              | f            |           -1 |             | 
++CREATE ROLE regress_test_inherit WITH NOINHERIT PASSWORD NEON_PASSWORD_PLACEHOLDER;
++SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:<salt>$<storedkey>:<serverkey>'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_inherit';
++       rolname        | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit |                  regexp_replace                   | rolvaliduntil 
++----------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+---------------------------------------------------+---------------
++ regress_test_inherit | f        | f          | f             | f           | f           | f              | f            |           -1 | SCRAM-SHA-256$4096:<salt>$<storedkey>:<serverkey> | 
+ (1 row)
+ 
+ ALTER ROLE regress_test_inherit WITH INHERIT;
+-SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_inherit';
+-       rolname        | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | rolpassword | rolvaliduntil 
+-----------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+-------------+---------------
+- regress_test_inherit | f        | t          | f             | f           | f           | f              | f            |           -1 |             | 
++SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:<salt>$<storedkey>:<serverkey>'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_inherit';
++       rolname        | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit |                  regexp_replace                   | rolvaliduntil 
++----------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+---------------------------------------------------+---------------
++ regress_test_inherit | f        | t          | f             | f           | f           | f              | f            |           -1 | SCRAM-SHA-256$4096:<salt>$<storedkey>:<serverkey> | 
+ (1 row)
+ 
+ ALTER ROLE regress_test_inherit WITH NOINHERIT;
+-SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_inherit';
+-       rolname        | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | rolpassword | rolvaliduntil 
+-----------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+-------------+---------------
+- regress_test_inherit | f        | f          | f             | f           | f           | f              | f            |           -1 |             | 
++SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:<salt>$<storedkey>:<serverkey>'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_inherit';
++       rolname        | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit |                  regexp_replace                   | rolvaliduntil 
++----------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+---------------------------------------------------+---------------
++ regress_test_inherit | f        | f          | f             | f           | f           | f              | f            |           -1 | SCRAM-SHA-256$4096:<salt>$<storedkey>:<serverkey> | 
+ (1 row)
+ 
+ -- default for create role is false
+-CREATE ROLE regress_test_def_createrole;
+-SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_def_createrole';
+-           rolname           | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | rolpassword | rolvaliduntil 
+------------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+-------------+---------------
+- regress_test_def_createrole | f        | t          | f             | f           | f           | f              | f            |           -1 |             | 
++CREATE ROLE regress_test_def_createrole PASSWORD NEON_PASSWORD_PLACEHOLDER;
++SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:<salt>$<storedkey>:<serverkey>'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_def_createrole';
++           rolname           | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit |                  regexp_replace                   | rolvaliduntil 
++-----------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+---------------------------------------------------+---------------
++ regress_test_def_createrole | f        | t          | f             | f           | f           | f              | f            |           -1 | SCRAM-SHA-256$4096:<salt>$<storedkey>:<serverkey> | 
+ (1 row)
+ 
+-CREATE ROLE regress_test_createrole WITH CREATEROLE;
+-SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_createrole';
+-         rolname         | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | rolpassword | rolvaliduntil 
+--------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+-------------+---------------
+- regress_test_createrole | f        | t          | t             | f           | f           | f              | f            |           -1 |             | 
++CREATE ROLE regress_test_createrole WITH CREATEROLE PASSWORD NEON_PASSWORD_PLACEHOLDER;
++SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:<salt>$<storedkey>:<serverkey>'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_createrole';
++         rolname         | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit |                  regexp_replace                   | rolvaliduntil 
++-------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+---------------------------------------------------+---------------
++ regress_test_createrole | f        | t          | t             | f           | f           | f              | f            |           -1 | SCRAM-SHA-256$4096:<salt>$<storedkey>:<serverkey> | 
+ (1 row)
+ 
+ ALTER ROLE regress_test_createrole WITH NOCREATEROLE;
+-SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_createrole';
+-         rolname         | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | rolpassword | rolvaliduntil 
+--------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+-------------+---------------
+- regress_test_createrole | f        | t          | f             | f           | f           | f              | f            |           -1 |             | 
++SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:<salt>$<storedkey>:<serverkey>'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_createrole';
++         rolname         | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit |                  regexp_replace                   | rolvaliduntil 
++-------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+---------------------------------------------------+---------------
++ regress_test_createrole | f        | t          | f             | f           | f           | f              | f            |           -1 | SCRAM-SHA-256$4096:<salt>$<storedkey>:<serverkey> | 
+ (1 row)
+ 
+ ALTER ROLE regress_test_createrole WITH CREATEROLE;
+-SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_createrole';
+-         rolname         | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | rolpassword | rolvaliduntil 
+--------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+-------------+---------------
+- regress_test_createrole | f        | t          | t             | f           | f           | f              | f            |           -1 |             | 
++SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:<salt>$<storedkey>:<serverkey>'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_createrole';
++         rolname         | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit |                  regexp_replace                   | rolvaliduntil 
++-------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+---------------------------------------------------+---------------
++ regress_test_createrole | f        | t          | t             | f           | f           | f              | f            |           -1 | SCRAM-SHA-256$4096:<salt>$<storedkey>:<serverkey> | 
+ (1 row)
+ 
+ -- default for create database is false
+-CREATE ROLE regress_test_def_createdb;
+-SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_def_createdb';
+-          rolname          | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | rolpassword | rolvaliduntil 
+----------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+-------------+---------------
+- regress_test_def_createdb | f        | t          | f             | f           | f           | f              | f            |           -1 |             | 
++CREATE ROLE regress_test_def_createdb PASSWORD NEON_PASSWORD_PLACEHOLDER;
++SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:<salt>$<storedkey>:<serverkey>'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_def_createdb';
++          rolname          | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit |                  regexp_replace                   | rolvaliduntil 
++---------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+---------------------------------------------------+---------------
++ regress_test_def_createdb | f        | t          | f             | f           | f           | f              | f            |           -1 | SCRAM-SHA-256$4096:<salt>$<storedkey>:<serverkey> | 
+ (1 row)
+ 
+-CREATE ROLE regress_test_createdb WITH CREATEDB;
+-SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_createdb';
+-        rolname        | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | rolpassword | rolvaliduntil 
+------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+-------------+---------------
+- regress_test_createdb | f        | t          | f             | t           | f           | f              | f            |           -1 |             | 
++CREATE ROLE regress_test_createdb WITH CREATEDB PASSWORD NEON_PASSWORD_PLACEHOLDER;
++SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:<salt>$<storedkey>:<serverkey>'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_createdb';
++        rolname        | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit |                  regexp_replace                   | rolvaliduntil 
++-----------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+---------------------------------------------------+---------------
++ regress_test_createdb | f        | t          | f             | t           | f           | f              | f            |           -1 | SCRAM-SHA-256$4096:<salt>$<storedkey>:<serverkey> | 
+ (1 row)
+ 
+ ALTER ROLE regress_test_createdb WITH NOCREATEDB;
+-SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_createdb';
+-        rolname        | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | rolpassword | rolvaliduntil 
+------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+-------------+---------------
+- regress_test_createdb | f        | t          | f             | f           | f           | f              | f            |           -1 |             | 
++SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:<salt>$<storedkey>:<serverkey>'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_createdb';
++        rolname        | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit |                  regexp_replace                   | rolvaliduntil 
++-----------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+---------------------------------------------------+---------------
++ regress_test_createdb | f        | t          | f             | f           | f           | f              | f            |           -1 | SCRAM-SHA-256$4096:<salt>$<storedkey>:<serverkey> | 
+ (1 row)
+ 
+ ALTER ROLE regress_test_createdb WITH CREATEDB;
+-SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_createdb';
+-        rolname        | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | rolpassword | rolvaliduntil 
+------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+-------------+---------------
+- regress_test_createdb | f        | t          | f             | t           | f           | f              | f            |           -1 |             | 
++SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:<salt>$<storedkey>:<serverkey>'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_createdb';
++        rolname        | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit |                  regexp_replace                   | rolvaliduntil 
++-----------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+---------------------------------------------------+---------------
++ regress_test_createdb | f        | t          | f             | t           | f           | f              | f            |           -1 | SCRAM-SHA-256$4096:<salt>$<storedkey>:<serverkey> | 
+ (1 row)
+ 
+ -- default for can login is false for role
+-CREATE ROLE regress_test_def_role_canlogin;
+-SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_def_role_canlogin';
+-            rolname             | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | rolpassword | rolvaliduntil 
+---------------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+-------------+---------------
+- regress_test_def_role_canlogin | f        | t          | f             | f           | f           | f              | f            |           -1 |             | 
++CREATE ROLE regress_test_def_role_canlogin PASSWORD NEON_PASSWORD_PLACEHOLDER;
++SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:<salt>$<storedkey>:<serverkey>'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_def_role_canlogin';
++            rolname             | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit |                  regexp_replace                   | rolvaliduntil 
++--------------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+---------------------------------------------------+---------------
++ regress_test_def_role_canlogin | f        | t          | f             | f           | f           | f              | f            |           -1 | SCRAM-SHA-256$4096:<salt>$<storedkey>:<serverkey> | 
+ (1 row)
+ 
+-CREATE ROLE regress_test_role_canlogin WITH LOGIN;
+-SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_role_canlogin';
+-          rolname           | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | rolpassword | rolvaliduntil 
+-----------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+-------------+---------------
+- regress_test_role_canlogin | f        | t          | f             | f           | t           | f              | f            |           -1 |             | 
++CREATE ROLE regress_test_role_canlogin WITH LOGIN PASSWORD NEON_PASSWORD_PLACEHOLDER;
++SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:<salt>$<storedkey>:<serverkey>'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_role_canlogin';
++          rolname           | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit |                  regexp_replace                   | rolvaliduntil 
++----------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+---------------------------------------------------+---------------
++ regress_test_role_canlogin | f        | t          | f             | f           | t           | f              | f            |           -1 | SCRAM-SHA-256$4096:<salt>$<storedkey>:<serverkey> | 
+ (1 row)
+ 
+ ALTER ROLE regress_test_role_canlogin WITH NOLOGIN;
+-SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_role_canlogin';
+-          rolname           | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | rolpassword | rolvaliduntil 
+-----------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+-------------+---------------
+- regress_test_role_canlogin | f        | t          | f             | f           | f           | f              | f            |           -1 |             | 
++SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:<salt>$<storedkey>:<serverkey>'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_role_canlogin';
++          rolname           | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit |                  regexp_replace                   | rolvaliduntil 
++----------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+---------------------------------------------------+---------------
++ regress_test_role_canlogin | f        | t          | f             | f           | f           | f              | f            |           -1 | SCRAM-SHA-256$4096:<salt>$<storedkey>:<serverkey> | 
+ (1 row)
+ 
+ ALTER ROLE regress_test_role_canlogin WITH LOGIN;
+-SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_role_canlogin';
+-          rolname           | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | rolpassword | rolvaliduntil 
+-----------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+-------------+---------------
+- regress_test_role_canlogin | f        | t          | f             | f           | t           | f              | f            |           -1 |             | 
++SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:<salt>$<storedkey>:<serverkey>'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_role_canlogin';
++          rolname           | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit |                  regexp_replace                   | rolvaliduntil 
++----------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+---------------------------------------------------+---------------
++ regress_test_role_canlogin | f        | t          | f             | f           | t           | f              | f            |           -1 | SCRAM-SHA-256$4096:<salt>$<storedkey>:<serverkey> | 
+ (1 row)
+ 
+ -- default for can login is true for user
+-CREATE USER regress_test_def_user_canlogin;
+-SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_def_user_canlogin';
+-            rolname             | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | rolpassword | rolvaliduntil 
+---------------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+-------------+---------------
+- regress_test_def_user_canlogin | f        | t          | f             | f           | t           | f              | f            |           -1 |             | 
++CREATE USER regress_test_def_user_canlogin PASSWORD NEON_PASSWORD_PLACEHOLDER;
++SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:<salt>$<storedkey>:<serverkey>'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_def_user_canlogin';
++            rolname             | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit |                  regexp_replace                   | rolvaliduntil 
++--------------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+---------------------------------------------------+---------------
++ regress_test_def_user_canlogin | f        | t          | f             | f           | t           | f              | f            |           -1 | SCRAM-SHA-256$4096:<salt>$<storedkey>:<serverkey> | 
+ (1 row)
+ 
+-CREATE USER regress_test_user_canlogin WITH NOLOGIN;
+-SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_user_canlogin';
+-          rolname           | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | rolpassword | rolvaliduntil 
+-----------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+-------------+---------------
+- regress_test_user_canlogin | f        | t          | f             | f           | f           | f              | f            |           -1 |             | 
++CREATE USER regress_test_user_canlogin WITH NOLOGIN PASSWORD NEON_PASSWORD_PLACEHOLDER;
++SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:<salt>$<storedkey>:<serverkey>'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_user_canlogin';
++          rolname           | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit |                  regexp_replace                   | rolvaliduntil 
++----------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+---------------------------------------------------+---------------
++ regress_test_user_canlogin | f        | t          | f             | f           | f           | f              | f            |           -1 | SCRAM-SHA-256$4096:<salt>$<storedkey>:<serverkey> | 
+ (1 row)
+ 
+ ALTER USER regress_test_user_canlogin WITH LOGIN;
+-SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_user_canlogin';
+-          rolname           | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | rolpassword | rolvaliduntil 
+-----------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+-------------+---------------
+- regress_test_user_canlogin | f        | t          | f             | f           | t           | f              | f            |           -1 |             | 
++SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:<salt>$<storedkey>:<serverkey>'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_user_canlogin';
++          rolname           | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit |                  regexp_replace                   | rolvaliduntil 
++----------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+---------------------------------------------------+---------------
++ regress_test_user_canlogin | f        | t          | f             | f           | t           | f              | f            |           -1 | SCRAM-SHA-256$4096:<salt>$<storedkey>:<serverkey> | 
+ (1 row)
+ 
+ ALTER USER regress_test_user_canlogin WITH NOLOGIN;
+-SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_user_canlogin';
+-          rolname           | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | rolpassword | rolvaliduntil 
+-----------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+-------------+---------------
+- regress_test_user_canlogin | f        | t          | f             | f           | f           | f              | f            |           -1 |             | 
++SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:<salt>$<storedkey>:<serverkey>'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_user_canlogin';
++          rolname           | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit |                  regexp_replace                   | rolvaliduntil 
++----------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+---------------------------------------------------+---------------
++ regress_test_user_canlogin | f        | t          | f             | f           | f           | f              | f            |           -1 | SCRAM-SHA-256$4096:<salt>$<storedkey>:<serverkey> | 
+ (1 row)
+ 
+ -- default for replication is false
+-CREATE ROLE regress_test_def_replication;
+-SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_def_replication';
+-           rolname            | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | rolpassword | rolvaliduntil 
+-------------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+-------------+---------------
+- regress_test_def_replication | f        | t          | f             | f           | f           | f              | f            |           -1 |             | 
++CREATE ROLE regress_test_def_replication PASSWORD NEON_PASSWORD_PLACEHOLDER;
++SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:<salt>$<storedkey>:<serverkey>'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_def_replication';
++           rolname            | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit |                  regexp_replace                   | rolvaliduntil 
++------------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+---------------------------------------------------+---------------
++ regress_test_def_replication | f        | t          | f             | f           | f           | f              | f            |           -1 | SCRAM-SHA-256$4096:<salt>$<storedkey>:<serverkey> | 
+ (1 row)
+ 
+-CREATE ROLE regress_test_replication WITH REPLICATION;
+-SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_replication';
+-         rolname          | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | rolpassword | rolvaliduntil 
+---------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+-------------+---------------
+- regress_test_replication | f        | t          | f             | f           | f           | t              | f            |           -1 |             | 
++CREATE ROLE regress_test_replication WITH REPLICATION PASSWORD NEON_PASSWORD_PLACEHOLDER;
++SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:<salt>$<storedkey>:<serverkey>'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_replication';
++         rolname          | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit |                  regexp_replace                   | rolvaliduntil 
++--------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+---------------------------------------------------+---------------
++ regress_test_replication | f        | t          | f             | f           | f           | t              | f            |           -1 | SCRAM-SHA-256$4096:<salt>$<storedkey>:<serverkey> | 
+ (1 row)
+ 
+ ALTER ROLE regress_test_replication WITH NOREPLICATION;
+-SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_replication';
+-         rolname          | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | rolpassword | rolvaliduntil 
+---------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+-------------+---------------
+- regress_test_replication | f        | t          | f             | f           | f           | f              | f            |           -1 |             | 
++SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:<salt>$<storedkey>:<serverkey>'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_replication';
++         rolname          | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit |                  regexp_replace                   | rolvaliduntil 
++--------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+---------------------------------------------------+---------------
++ regress_test_replication | f        | t          | f             | f           | f           | f              | f            |           -1 | SCRAM-SHA-256$4096:<salt>$<storedkey>:<serverkey> | 
+ (1 row)
+ 
+ ALTER ROLE regress_test_replication WITH REPLICATION;
+-SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_replication';
+-         rolname          | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | rolpassword | rolvaliduntil 
+---------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+-------------+---------------
+- regress_test_replication | f        | t          | f             | f           | f           | t              | f            |           -1 |             | 
++SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:<salt>$<storedkey>:<serverkey>'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_replication';
++         rolname          | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit |                  regexp_replace                   | rolvaliduntil 
++--------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+---------------------------------------------------+---------------
++ regress_test_replication | f        | t          | f             | f           | f           | t              | f            |           -1 | SCRAM-SHA-256$4096:<salt>$<storedkey>:<serverkey> | 
+ (1 row)
+ 
+ -- default for bypassrls is false
+-CREATE ROLE regress_test_def_bypassrls;
+-SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_def_bypassrls';
+-          rolname           | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | rolpassword | rolvaliduntil 
+-----------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+-------------+---------------
+- regress_test_def_bypassrls | f        | t          | f             | f           | f           | f              | f            |           -1 |             | 
++CREATE ROLE regress_test_def_bypassrls PASSWORD NEON_PASSWORD_PLACEHOLDER;
++SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:<salt>$<storedkey>:<serverkey>'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_def_bypassrls';
++          rolname           | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit |                  regexp_replace                   | rolvaliduntil 
++----------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+---------------------------------------------------+---------------
++ regress_test_def_bypassrls | f        | t          | f             | f           | f           | f              | f            |           -1 | SCRAM-SHA-256$4096:<salt>$<storedkey>:<serverkey> | 
+ (1 row)
+ 
+-CREATE ROLE regress_test_bypassrls WITH BYPASSRLS;
+-SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_bypassrls';
+-        rolname         | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | rolpassword | rolvaliduntil 
+-------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+-------------+---------------
+- regress_test_bypassrls | f        | t          | f             | f           | f           | f              | t            |           -1 |             | 
++CREATE ROLE regress_test_bypassrls WITH BYPASSRLS PASSWORD NEON_PASSWORD_PLACEHOLDER;
++SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:<salt>$<storedkey>:<serverkey>'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_bypassrls';
++        rolname         | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit |                  regexp_replace                   | rolvaliduntil 
++------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+---------------------------------------------------+---------------
++ regress_test_bypassrls | f        | t          | f             | f           | f           | f              | t            |           -1 | SCRAM-SHA-256$4096:<salt>$<storedkey>:<serverkey> | 
+ (1 row)
+ 
+ ALTER ROLE regress_test_bypassrls WITH NOBYPASSRLS;
+-SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_bypassrls';
+-        rolname         | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | rolpassword | rolvaliduntil 
+-------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+-------------+---------------
+- regress_test_bypassrls | f        | t          | f             | f           | f           | f              | f            |           -1 |             | 
++SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:<salt>$<storedkey>:<serverkey>'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_bypassrls';
++        rolname         | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit |                  regexp_replace                   | rolvaliduntil 
++------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+---------------------------------------------------+---------------
++ regress_test_bypassrls | f        | t          | f             | f           | f           | f              | f            |           -1 | SCRAM-SHA-256$4096:<salt>$<storedkey>:<serverkey> | 
+ (1 row)
+ 
+ ALTER ROLE regress_test_bypassrls WITH BYPASSRLS;
+-SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_bypassrls';
+-        rolname         | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | rolpassword | rolvaliduntil 
+-------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+-------------+---------------
+- regress_test_bypassrls | f        | t          | f             | f           | f           | f              | t            |           -1 |             | 
++SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:<salt>$<storedkey>:<serverkey>'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_bypassrls';
++        rolname         | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit |                  regexp_replace                   | rolvaliduntil 
++------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+---------------------------------------------------+---------------
++ regress_test_bypassrls | f        | t          | f             | f           | f           | f              | t            |           -1 | SCRAM-SHA-256$4096:<salt>$<storedkey>:<serverkey> | 
+ (1 row)
+ 
+ -- clean up roles
+diff --git a/src/test/regress/expected/rowsecurity.out b/src/test/regress/expected/rowsecurity.out
+index 97ca9bf72c..b2a7a6f710 100644
+--- a/src/test/regress/expected/rowsecurity.out
++++ b/src/test/regress/expected/rowsecurity.out
+@@ -14,13 +14,13 @@ DROP ROLE IF EXISTS regress_rls_group2;
+ DROP SCHEMA IF EXISTS regress_rls_schema CASCADE;
+ RESET client_min_messages;
+ -- initial setup
+-CREATE USER regress_rls_alice NOLOGIN;
+-CREATE USER regress_rls_bob NOLOGIN;
+-CREATE USER regress_rls_carol NOLOGIN;
+-CREATE USER regress_rls_dave NOLOGIN;
+-CREATE USER regress_rls_exempt_user BYPASSRLS NOLOGIN;
+-CREATE ROLE regress_rls_group1 NOLOGIN;
+-CREATE ROLE regress_rls_group2 NOLOGIN;
++CREATE USER regress_rls_alice NOLOGIN PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE USER regress_rls_bob NOLOGIN PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE USER regress_rls_carol NOLOGIN PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE USER regress_rls_dave NOLOGIN PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE USER regress_rls_exempt_user BYPASSRLS NOLOGIN PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE ROLE regress_rls_group1 NOLOGIN PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE ROLE regress_rls_group2 NOLOGIN PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ GRANT regress_rls_group1 TO regress_rls_bob;
+ GRANT regress_rls_group2 TO regress_rls_carol;
+ CREATE SCHEMA regress_rls_schema;
+@@ -4352,8 +4352,8 @@ SELECT count(*) = 0 FROM pg_depend
+ 
+ -- DROP OWNED BY testing
+ RESET SESSION AUTHORIZATION;
+-CREATE ROLE regress_rls_dob_role1;
+-CREATE ROLE regress_rls_dob_role2;
++CREATE ROLE regress_rls_dob_role1 PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE ROLE regress_rls_dob_role2 PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ CREATE TABLE dob_t1 (c1 int);
+ CREATE TABLE dob_t2 (c1 int) PARTITION BY RANGE (c1);
+ CREATE POLICY p1 ON dob_t1 TO regress_rls_dob_role1 USING (true);
+diff --git a/src/test/regress/expected/rules.out b/src/test/regress/expected/rules.out
+index 09a255649b..15895f0c53 100644
+--- a/src/test/regress/expected/rules.out
++++ b/src/test/regress/expected/rules.out
+@@ -3708,7 +3708,7 @@ DROP TABLE ruletest2;
+ -- Test non-SELECT rule on security invoker view.
+ -- Should use view owner's permissions.
+ --
+-CREATE USER regress_rule_user1;
++CREATE USER regress_rule_user1 PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ CREATE TABLE ruletest_t1 (x int);
+ CREATE TABLE ruletest_t2 (x int);
+ CREATE VIEW ruletest_v1 WITH (security_invoker=true) AS
+diff --git a/src/test/regress/expected/security_label.out b/src/test/regress/expected/security_label.out
+index a8e01a6220..5a9cef4ede 100644
+--- a/src/test/regress/expected/security_label.out
++++ b/src/test/regress/expected/security_label.out
+@@ -6,8 +6,8 @@ SET client_min_messages TO 'warning';
+ DROP ROLE IF EXISTS regress_seclabel_user1;
+ DROP ROLE IF EXISTS regress_seclabel_user2;
+ RESET client_min_messages;
+-CREATE USER regress_seclabel_user1 WITH CREATEROLE;
+-CREATE USER regress_seclabel_user2;
++CREATE USER regress_seclabel_user1 WITH CREATEROLE PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE USER regress_seclabel_user2 PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ CREATE TABLE seclabel_tbl1 (a int, b text);
+ CREATE TABLE seclabel_tbl2 (x int, y text);
+ CREATE VIEW seclabel_view1 AS SELECT * FROM seclabel_tbl2;
+@@ -19,21 +19,21 @@ ALTER TABLE seclabel_tbl2 OWNER TO regress_seclabel_user2;
+ -- Test of SECURITY LABEL statement without a plugin
+ --
+ SECURITY LABEL ON TABLE seclabel_tbl1 IS 'classified';			-- fail
+-ERROR:  no security label providers have been loaded
++ERROR:  must specify provider when multiple security label providers have been loaded
+ SECURITY LABEL FOR 'dummy' ON TABLE seclabel_tbl1 IS 'classified';		-- fail
+ ERROR:  security label provider "dummy" is not loaded
+ SECURITY LABEL ON TABLE seclabel_tbl1 IS '...invalid label...';		-- fail
+-ERROR:  no security label providers have been loaded
++ERROR:  must specify provider when multiple security label providers have been loaded
+ SECURITY LABEL ON TABLE seclabel_tbl3 IS 'unclassified';			-- fail
+-ERROR:  no security label providers have been loaded
++ERROR:  must specify provider when multiple security label providers have been loaded
+ SECURITY LABEL ON ROLE regress_seclabel_user1 IS 'classified';			-- fail
+-ERROR:  no security label providers have been loaded
++ERROR:  must specify provider when multiple security label providers have been loaded
+ SECURITY LABEL FOR 'dummy' ON ROLE regress_seclabel_user1 IS 'classified';		-- fail
+ ERROR:  security label provider "dummy" is not loaded
+ SECURITY LABEL ON ROLE regress_seclabel_user1 IS '...invalid label...';		-- fail
+-ERROR:  no security label providers have been loaded
++ERROR:  must specify provider when multiple security label providers have been loaded
+ SECURITY LABEL ON ROLE regress_seclabel_user3 IS 'unclassified';			-- fail
+-ERROR:  no security label providers have been loaded
++ERROR:  must specify provider when multiple security label providers have been loaded
+ -- clean up objects
+ DROP FUNCTION seclabel_four();
+ DROP DOMAIN seclabel_domain;
+diff --git a/src/test/regress/expected/select_into.out b/src/test/regress/expected/select_into.out
+index b79fe9a1c0..e29fab88ab 100644
+--- a/src/test/regress/expected/select_into.out
++++ b/src/test/regress/expected/select_into.out
+@@ -15,7 +15,7 @@ DROP TABLE sitmp1;
+ -- SELECT INTO and INSERT permission, if owner is not allowed to insert.
+ --
+ CREATE SCHEMA selinto_schema;
+-CREATE USER regress_selinto_user;
++CREATE USER regress_selinto_user PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ ALTER DEFAULT PRIVILEGES FOR ROLE regress_selinto_user
+ 	  REVOKE INSERT ON TABLES FROM regress_selinto_user;
+ GRANT ALL ON SCHEMA selinto_schema TO public;
+diff --git a/src/test/regress/expected/select_views.out b/src/test/regress/expected/select_views.out
+index 1aeed8452b..7d9427d070 100644
+--- a/src/test/regress/expected/select_views.out
++++ b/src/test/regress/expected/select_views.out
+@@ -1250,7 +1250,7 @@ SELECT * FROM toyemp WHERE name = 'sharon';
+ --
+ -- Test for Leaky view scenario
+ --
+-CREATE ROLE regress_alice;
++CREATE ROLE regress_alice PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ CREATE FUNCTION f_leak (text)
+        RETURNS bool LANGUAGE 'plpgsql' COST 0.0000001
+        AS 'BEGIN RAISE NOTICE ''f_leak => %'', $1; RETURN true; END';
+diff --git a/src/test/regress/expected/sequence.out b/src/test/regress/expected/sequence.out
+index f02f020542..c9e0fda350 100644
+--- a/src/test/regress/expected/sequence.out
++++ b/src/test/regress/expected/sequence.out
+@@ -22,7 +22,7 @@ CREATE SEQUENCE sequence_testx OWNED BY pg_class_oid_index.oid;  -- not a table
+ ERROR:  sequence cannot be owned by relation "pg_class_oid_index"
+ DETAIL:  This operation is not supported for indexes.
+ CREATE SEQUENCE sequence_testx OWNED BY pg_class.relname;  -- not same schema
+-ERROR:  sequence must be in same schema as table it is linked to
++ERROR:  sequence must have same owner as table it is linked to
+ CREATE TABLE sequence_test_table (a int);
+ CREATE SEQUENCE sequence_testx OWNED BY sequence_test_table.b;  -- wrong column
+ ERROR:  column "b" of relation "sequence_test_table" does not exist
+@@ -639,7 +639,7 @@ SELECT setval('sequence_test2', 1);  -- error
+ ERROR:  cannot execute setval() in a read-only transaction
+ ROLLBACK;
+ -- privileges tests
+-CREATE USER regress_seq_user;
++CREATE USER regress_seq_user PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ -- nextval
+ BEGIN;
+ SET LOCAL SESSION AUTHORIZATION regress_seq_user;
+diff --git a/src/test/regress/expected/stats.out b/src/test/regress/expected/stats.out
+index 94187e59cf..72346e2c71 100644
+--- a/src/test/regress/expected/stats.out
++++ b/src/test/regress/expected/stats.out
+@@ -1283,37 +1283,6 @@ SELECT current_setting('fsync') = 'off'
+  t
+ (1 row)
+ 
+--- Change the tablespace so that the table is rewritten directly, then SELECT
+--- from it to cause it to be read back into shared buffers.
+-SELECT sum(reads) AS io_sum_shared_before_reads
+-  FROM pg_stat_io WHERE context = 'normal' AND object = 'relation' \gset
+--- Do this in a transaction to prevent spurious failures due to concurrent accesses to our newly
+--- rewritten table, e.g. by autovacuum.
+-BEGIN;
+-ALTER TABLE test_io_shared SET TABLESPACE regress_tblspace;
+--- SELECT from the table so that the data is read into shared buffers and
+--- context 'normal', object 'relation' reads are counted.
+-SELECT COUNT(*) FROM test_io_shared;
+- count 
+--------
+-   100
+-(1 row)
+-
+-COMMIT;
+-SELECT pg_stat_force_next_flush();
+- pg_stat_force_next_flush 
+---------------------------
+- 
+-(1 row)
+-
+-SELECT sum(reads) AS io_sum_shared_after_reads
+-  FROM pg_stat_io WHERE context = 'normal' AND object = 'relation'  \gset
+-SELECT :io_sum_shared_after_reads > :io_sum_shared_before_reads;
+- ?column? 
+-----------
+- t
+-(1 row)
+-
+ SELECT sum(hits) AS io_sum_shared_before_hits
+   FROM pg_stat_io WHERE context = 'normal' AND object = 'relation' \gset
+ -- Select from the table again to count hits.
+@@ -1415,6 +1384,7 @@ SELECT :io_sum_local_after_evictions > :io_sum_local_before_evictions,
+ -- local buffers, exercising a different codepath than standard local buffer
+ -- writes.
+ ALTER TABLE test_io_local SET TABLESPACE regress_tblspace;
++ERROR:  tablespace "regress_tblspace" does not exist
+ SELECT pg_stat_force_next_flush();
+  pg_stat_force_next_flush 
+ --------------------------
+@@ -1426,7 +1396,7 @@ SELECT sum(writes) AS io_sum_local_new_tblspc_writes
+ SELECT :io_sum_local_new_tblspc_writes > :io_sum_local_after_writes;
+  ?column? 
+ ----------
+- t
++ f
+ (1 row)
+ 
+ RESET temp_buffers;
+diff --git a/src/test/regress/expected/stats_ext.out b/src/test/regress/expected/stats_ext.out
+index b4c85613de..d32a9a69ad 100644
+--- a/src/test/regress/expected/stats_ext.out
++++ b/src/test/regress/expected/stats_ext.out
+@@ -70,7 +70,7 @@ DROP TABLE ext_stats_test;
+ CREATE TABLE ab1 (a INTEGER, b INTEGER, c INTEGER);
+ CREATE STATISTICS IF NOT EXISTS ab1_a_b_stats ON a, b FROM ab1;
+ COMMENT ON STATISTICS ab1_a_b_stats IS 'new comment';
+-CREATE ROLE regress_stats_ext;
++CREATE ROLE regress_stats_ext PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ SET SESSION AUTHORIZATION regress_stats_ext;
+ COMMENT ON STATISTICS ab1_a_b_stats IS 'changed comment';
+ ERROR:  must be owner of statistics object ab1_a_b_stats
+@@ -3214,7 +3214,7 @@ set search_path to public, stts_s1;
+  stts_s1 | stts_foo               | col1, col2 FROM stts_t3                                          | defined   | defined      | defined
+ (10 rows)
+ 
+-create role regress_stats_ext nosuperuser;
++create role regress_stats_ext nosuperuser PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ set role regress_stats_ext;
+ \dX
+                                                        List of extended statistics
+@@ -3237,7 +3237,7 @@ drop schema stts_s1, stts_s2 cascade;
+ drop user regress_stats_ext;
+ reset search_path;
+ -- User with no access
+-CREATE USER regress_stats_user1;
++CREATE USER regress_stats_user1 PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ GRANT USAGE ON SCHEMA tststats TO regress_stats_user1;
+ SET SESSION AUTHORIZATION regress_stats_user1;
+ SELECT * FROM tststats.priv_test_tbl; -- Permission denied
+diff --git a/src/test/regress/expected/subscription.out b/src/test/regress/expected/subscription.out
+index b15eddbff3..e9ba4568eb 100644
+--- a/src/test/regress/expected/subscription.out
++++ b/src/test/regress/expected/subscription.out
+@@ -1,10 +1,10 @@
+ --
+ -- SUBSCRIPTION
+ --
+-CREATE ROLE regress_subscription_user LOGIN SUPERUSER;
+-CREATE ROLE regress_subscription_user2;
+-CREATE ROLE regress_subscription_user3 IN ROLE pg_create_subscription;
+-CREATE ROLE regress_subscription_user_dummy LOGIN NOSUPERUSER;
++CREATE ROLE regress_subscription_user LOGIN SUPERUSER PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE ROLE regress_subscription_user2 PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE ROLE regress_subscription_user3 PASSWORD NEON_PASSWORD_PLACEHOLDER IN ROLE pg_create_subscription;
++CREATE ROLE regress_subscription_user_dummy LOGIN NOSUPERUSER PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ SET SESSION AUTHORIZATION 'regress_subscription_user';
+ -- fail - no publications
+ CREATE SUBSCRIPTION regress_testsub CONNECTION 'foo';
+diff --git a/src/test/regress/expected/test_setup.out b/src/test/regress/expected/test_setup.out
+index 5d9e6bf12b..c5fddfdca6 100644
+--- a/src/test/regress/expected/test_setup.out
++++ b/src/test/regress/expected/test_setup.out
+@@ -21,6 +21,7 @@ GRANT ALL ON SCHEMA public TO public;
+ -- Create a tablespace we can use in tests.
+ SET allow_in_place_tablespaces = true;
+ CREATE TABLESPACE regress_tblspace LOCATION '';
++ERROR:  CREATE TABLESPACE is not supported on Neon
+ --
+ -- These tables have traditionally been referenced by many tests,
+ -- so create and populate them.  Insert only non-error values here.
+@@ -111,7 +112,8 @@ CREATE TABLE onek (
+ 	string4		name
+ );
+ \set filename :abs_srcdir '/data/onek.data'
+-COPY onek FROM :'filename';
++\set command '\\copy onek FROM ' :'filename';
++:command
+ VACUUM ANALYZE onek;
+ CREATE TABLE onek2 AS SELECT * FROM onek;
+ VACUUM ANALYZE onek2;
+@@ -134,7 +136,8 @@ CREATE TABLE tenk1 (
+ 	string4		name
+ );
+ \set filename :abs_srcdir '/data/tenk.data'
+-COPY tenk1 FROM :'filename';
++\set command '\\copy tenk1 FROM ' :'filename';
++:command
+ VACUUM ANALYZE tenk1;
+ CREATE TABLE tenk2 AS SELECT * FROM tenk1;
+ VACUUM ANALYZE tenk2;
+@@ -144,20 +147,23 @@ CREATE TABLE person (
+ 	location 	point
+ );
+ \set filename :abs_srcdir '/data/person.data'
+-COPY person FROM :'filename';
++\set command '\\copy person FROM ' :'filename';
++:command
+ VACUUM ANALYZE person;
+ CREATE TABLE emp (
+ 	salary 		int4,
+ 	manager 	name
+ ) INHERITS (person);
+ \set filename :abs_srcdir '/data/emp.data'
+-COPY emp FROM :'filename';
++\set command '\\copy emp FROM ' :'filename';
++:command
+ VACUUM ANALYZE emp;
+ CREATE TABLE student (
+ 	gpa 		float8
+ ) INHERITS (person);
+ \set filename :abs_srcdir '/data/student.data'
+-COPY student FROM :'filename';
++\set command '\\copy student FROM ' :'filename';
++:command
+ VACUUM ANALYZE student;
+ CREATE TABLE stud_emp (
+ 	percent 	int4
+@@ -166,14 +172,16 @@ NOTICE:  merging multiple inherited definitions of column "name"
+ NOTICE:  merging multiple inherited definitions of column "age"
+ NOTICE:  merging multiple inherited definitions of column "location"
+ \set filename :abs_srcdir '/data/stud_emp.data'
+-COPY stud_emp FROM :'filename';
++\set command '\\copy stud_emp FROM ' :'filename';
++:command
+ VACUUM ANALYZE stud_emp;
+ CREATE TABLE road (
+ 	name		text,
+ 	thepath 	path
+ );
+ \set filename :abs_srcdir '/data/streets.data'
+-COPY road FROM :'filename';
++\set command '\\copy road FROM ' :'filename';
++:command
+ VACUUM ANALYZE road;
+ CREATE TABLE ihighway () INHERITS (road);
+ INSERT INTO ihighway
+diff --git a/src/test/regress/expected/tsearch.out b/src/test/regress/expected/tsearch.out
+index 9fad6c8b04..a1b8e82389 100644
+--- a/src/test/regress/expected/tsearch.out
++++ b/src/test/regress/expected/tsearch.out
+@@ -63,7 +63,8 @@ CREATE TABLE test_tsvector(
+ 	a tsvector
+ );
+ \set filename :abs_srcdir '/data/tsearch.data'
+-COPY test_tsvector FROM :'filename';
++\set command '\\copy test_tsvector FROM ' :'filename';
++:command
+ ANALYZE test_tsvector;
+ -- test basic text search behavior without indexes, then with
+ SELECT count(*) FROM test_tsvector WHERE a @@ 'wr|qh';
+diff --git a/src/test/regress/expected/updatable_views.out b/src/test/regress/expected/updatable_views.out
+index ba46c32029..eac3017bac 100644
+--- a/src/test/regress/expected/updatable_views.out
++++ b/src/test/regress/expected/updatable_views.out
+@@ -999,9 +999,9 @@ NOTICE:  drop cascades to 2 other objects
+ DETAIL:  drop cascades to view rw_view1
+ drop cascades to function rw_view1_aa(rw_view1)
+ -- permissions checks
+-CREATE USER regress_view_user1;
+-CREATE USER regress_view_user2;
+-CREATE USER regress_view_user3;
++CREATE USER regress_view_user1 PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE USER regress_view_user2 PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE USER regress_view_user3 PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ SET SESSION AUTHORIZATION regress_view_user1;
+ CREATE TABLE base_tbl(a int, b text, c float);
+ INSERT INTO base_tbl VALUES (1, 'Row 1', 1.0);
+@@ -3094,8 +3094,8 @@ DETAIL:  View columns that are not columns of their base relation are not updata
+ drop view uv_iocu_view;
+ drop table uv_iocu_tab;
+ -- ON CONFLICT DO UPDATE permissions checks
+-create user regress_view_user1;
+-create user regress_view_user2;
++create user regress_view_user1 PASSWORD NEON_PASSWORD_PLACEHOLDER;
++create user regress_view_user2 PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ set session authorization regress_view_user1;
+ create table base_tbl(a int unique, b text, c float);
+ insert into base_tbl values (1,'xxx',1.0);
+diff --git a/src/test/regress/expected/update.out b/src/test/regress/expected/update.out
+index c809f88f54..d1d57852d4 100644
+--- a/src/test/regress/expected/update.out
++++ b/src/test/regress/expected/update.out
+@@ -602,7 +602,7 @@ DROP FUNCTION func_parted_mod_b();
+ -- RLS policies with update-row-movement
+ -----------------------------------------
+ ALTER TABLE range_parted ENABLE ROW LEVEL SECURITY;
+-CREATE USER regress_range_parted_user;
++CREATE USER regress_range_parted_user PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ GRANT ALL ON range_parted, mintab TO regress_range_parted_user;
+ CREATE POLICY seeall ON range_parted AS PERMISSIVE FOR SELECT USING (true);
+ CREATE POLICY policy_range_parted ON range_parted for UPDATE USING (true) WITH CHECK (c % 2 = 0);
+diff --git a/src/test/regress/expected/vacuum.out b/src/test/regress/expected/vacuum.out
+index 4aaf4f025d..40a339758a 100644
+--- a/src/test/regress/expected/vacuum.out
++++ b/src/test/regress/expected/vacuum.out
+@@ -433,7 +433,7 @@ CREATE TABLE vacowned (a int);
+ CREATE TABLE vacowned_parted (a int) PARTITION BY LIST (a);
+ CREATE TABLE vacowned_part1 PARTITION OF vacowned_parted FOR VALUES IN (1);
+ CREATE TABLE vacowned_part2 PARTITION OF vacowned_parted FOR VALUES IN (2);
+-CREATE ROLE regress_vacuum;
++CREATE ROLE regress_vacuum PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ SET ROLE regress_vacuum;
+ -- Simple table
+ VACUUM vacowned;
+diff --git a/src/test/regress/parallel_schedule b/src/test/regress/parallel_schedule
+index 3d14bf4e4f..87f351b1d1 100644
+--- a/src/test/regress/parallel_schedule
++++ b/src/test/regress/parallel_schedule
+@@ -130,4 +130,4 @@ test: fast_default
+ 
+ # run tablespace test at the end because it drops the tablespace created during
+ # setup that other tests may use.
+-test: tablespace
++#test: tablespace
+diff --git a/src/test/regress/sql/aggregates.sql b/src/test/regress/sql/aggregates.sql
+index f51726e8ed..8854104eff 100644
+--- a/src/test/regress/sql/aggregates.sql
++++ b/src/test/regress/sql/aggregates.sql
+@@ -15,7 +15,8 @@ CREATE TABLE aggtest (
+ );
+ 
+ \set filename :abs_srcdir '/data/agg.data'
+-COPY aggtest FROM :'filename';
++\set command '\\copy aggtest FROM ' :'filename';
++:command
+ 
+ ANALYZE aggtest;
+ 
+diff --git a/src/test/regress/sql/alter_generic.sql b/src/test/regress/sql/alter_generic.sql
+index de58d268d3..9d38df7f42 100644
+--- a/src/test/regress/sql/alter_generic.sql
++++ b/src/test/regress/sql/alter_generic.sql
+@@ -22,9 +22,9 @@ DROP ROLE IF EXISTS regress_alter_generic_user3;
+ 
+ RESET client_min_messages;
+ 
+-CREATE USER regress_alter_generic_user3;
+-CREATE USER regress_alter_generic_user2;
+-CREATE USER regress_alter_generic_user1 IN ROLE regress_alter_generic_user3;
++CREATE USER regress_alter_generic_user3 PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE USER regress_alter_generic_user2 PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE USER regress_alter_generic_user1 PASSWORD NEON_PASSWORD_PLACEHOLDER IN ROLE regress_alter_generic_user3;
+ 
+ CREATE SCHEMA alt_nsp1;
+ CREATE SCHEMA alt_nsp2;
+@@ -316,7 +316,7 @@ DROP OPERATOR FAMILY alt_opf4 USING btree;
+ 
+ -- Should fail. Need to be SUPERUSER to do ALTER OPERATOR FAMILY .. ADD / DROP
+ BEGIN TRANSACTION;
+-CREATE ROLE regress_alter_generic_user5 NOSUPERUSER;
++CREATE ROLE regress_alter_generic_user5 PASSWORD NEON_PASSWORD_PLACEHOLDER NOSUPERUSER;
+ CREATE OPERATOR FAMILY alt_opf5 USING btree;
+ SET ROLE regress_alter_generic_user5;
+ ALTER OPERATOR FAMILY alt_opf5 USING btree ADD OPERATOR 1 < (int4, int2), FUNCTION 1 btint42cmp(int4, int2);
+@@ -326,7 +326,7 @@ ROLLBACK;
+ 
+ -- Should fail. Need rights to namespace for ALTER OPERATOR FAMILY .. ADD / DROP
+ BEGIN TRANSACTION;
+-CREATE ROLE regress_alter_generic_user6;
++CREATE ROLE regress_alter_generic_user6 PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ CREATE SCHEMA alt_nsp6;
+ REVOKE ALL ON SCHEMA alt_nsp6 FROM regress_alter_generic_user6;
+ CREATE OPERATOR FAMILY alt_nsp6.alt_opf6 USING btree;
+diff --git a/src/test/regress/sql/alter_operator.sql b/src/test/regress/sql/alter_operator.sql
+index fd40370165..ca8055e06d 100644
+--- a/src/test/regress/sql/alter_operator.sql
++++ b/src/test/regress/sql/alter_operator.sql
+@@ -87,7 +87,7 @@ ALTER OPERATOR & (bit, bit) SET ("Restrict" = _int_contsel, "Join" = _int_contjo
+ --
+ -- Test permission check. Must be owner to ALTER OPERATOR.
+ --
+-CREATE USER regress_alter_op_user;
++CREATE USER regress_alter_op_user PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ SET SESSION AUTHORIZATION regress_alter_op_user;
+ 
+ ALTER OPERATOR === (boolean, boolean) SET (RESTRICT = NONE);
+diff --git a/src/test/regress/sql/alter_table.sql b/src/test/regress/sql/alter_table.sql
+index d2845abc97..a0719b8d0e 100644
+--- a/src/test/regress/sql/alter_table.sql
++++ b/src/test/regress/sql/alter_table.sql
+@@ -7,7 +7,7 @@ SET client_min_messages TO 'warning';
+ DROP ROLE IF EXISTS regress_alter_table_user1;
+ RESET client_min_messages;
+ 
+-CREATE USER regress_alter_table_user1;
++CREATE USER regress_alter_table_user1 PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ 
+ --
+ -- add attribute
+@@ -2397,8 +2397,8 @@ DROP TABLE fail_part;
+ ALTER TABLE list_parted ATTACH PARTITION nonexistent FOR VALUES IN (1);
+ 
+ -- check ownership of the source table
+-CREATE ROLE regress_test_me;
+-CREATE ROLE regress_test_not_me;
++CREATE ROLE regress_test_me PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE ROLE regress_test_not_me PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ CREATE TABLE not_owned_by_me (LIKE list_parted);
+ ALTER TABLE not_owned_by_me OWNER TO regress_test_not_me;
+ SET SESSION AUTHORIZATION regress_test_me;
+diff --git a/src/test/regress/sql/arrays.sql b/src/test/regress/sql/arrays.sql
+index e414fa560d..79a75a0e57 100644
+--- a/src/test/regress/sql/arrays.sql
++++ b/src/test/regress/sql/arrays.sql
+@@ -22,7 +22,8 @@ CREATE TABLE array_op_test (
+ );
+ 
+ \set filename :abs_srcdir '/data/array.data'
+-COPY array_op_test FROM :'filename';
++\set command '\\copy array_op_test FROM ' :'filename';
++:command
+ ANALYZE array_op_test;
+ 
+ --
+diff --git a/src/test/regress/sql/btree_index.sql b/src/test/regress/sql/btree_index.sql
+index 239f4a4755..f29d87bdff 100644
+--- a/src/test/regress/sql/btree_index.sql
++++ b/src/test/regress/sql/btree_index.sql
+@@ -26,16 +26,20 @@ CREATE TABLE bt_f8_heap (
+ );
+ 
+ \set filename :abs_srcdir '/data/desc.data'
+-COPY bt_i4_heap FROM :'filename';
++\set command '\\copy bt_i4_heap FROM ' :'filename';
++:command
+ 
+ \set filename :abs_srcdir '/data/hash.data'
+-COPY bt_name_heap FROM :'filename';
++\set command '\\copy bt_name_heap FROM ' :'filename';
++:command
+ 
+ \set filename :abs_srcdir '/data/desc.data'
+-COPY bt_txt_heap FROM :'filename';
++\set command '\\copy bt_txt_heap FROM ' :'filename';
++:command
+ 
+ \set filename :abs_srcdir '/data/hash.data'
+-COPY bt_f8_heap FROM :'filename';
++\set command '\\copy bt_f8_heap FROM ' :'filename';
++:command
+ 
+ ANALYZE bt_i4_heap;
+ ANALYZE bt_name_heap;
+diff --git a/src/test/regress/sql/cluster.sql b/src/test/regress/sql/cluster.sql
+index 6cb9c926c0..5e689e4062 100644
+--- a/src/test/regress/sql/cluster.sql
++++ b/src/test/regress/sql/cluster.sql
+@@ -108,7 +108,7 @@ WHERE pg_class.oid=indexrelid
+ CLUSTER pg_toast.pg_toast_826 USING pg_toast_826_index;
+ 
+ -- Verify that clustering all tables does in fact cluster the right ones
+-CREATE USER regress_clstr_user;
++CREATE USER regress_clstr_user PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ CREATE TABLE clstr_1 (a INT PRIMARY KEY);
+ CREATE TABLE clstr_2 (a INT PRIMARY KEY);
+ CREATE TABLE clstr_3 (a INT PRIMARY KEY);
+@@ -233,7 +233,7 @@ DROP TABLE clstrpart;
+ CREATE TABLE ptnowner(i int unique) PARTITION BY LIST (i);
+ CREATE INDEX ptnowner_i_idx ON ptnowner(i);
+ CREATE TABLE ptnowner1 PARTITION OF ptnowner FOR VALUES IN (1);
+-CREATE ROLE regress_ptnowner;
++CREATE ROLE regress_ptnowner PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ CREATE TABLE ptnowner2 PARTITION OF ptnowner FOR VALUES IN (2);
+ ALTER TABLE ptnowner1 OWNER TO regress_ptnowner;
+ ALTER TABLE ptnowner OWNER TO regress_ptnowner;
+diff --git a/src/test/regress/sql/collate.icu.utf8.sql b/src/test/regress/sql/collate.icu.utf8.sql
+index 3db9e25913..c66d5aa2c2 100644
+--- a/src/test/regress/sql/collate.icu.utf8.sql
++++ b/src/test/regress/sql/collate.icu.utf8.sql
+@@ -353,7 +353,7 @@ reset enable_seqscan;
+ 
+ -- schema manipulation commands
+ 
+-CREATE ROLE regress_test_role;
++CREATE ROLE regress_test_role PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ CREATE SCHEMA test_schema;
+ 
+ -- We need to do this this way to cope with varying names for encodings:
+diff --git a/src/test/regress/sql/constraints.sql b/src/test/regress/sql/constraints.sql
+index e3e3bea709..fa86ddc326 100644
+--- a/src/test/regress/sql/constraints.sql
++++ b/src/test/regress/sql/constraints.sql
+@@ -243,12 +243,14 @@ CREATE TABLE COPY_TBL (x INT, y TEXT, z INT,
+ 	CHECK (x > 3 AND y <> 'check failed' AND x < 7 ));
+ 
+ \set filename :abs_srcdir '/data/constro.data'
+-COPY COPY_TBL FROM :'filename';
++\set command '\\copy COPY_TBL FROM ' :'filename';
++:command
+ 
+ SELECT * FROM COPY_TBL;
+ 
+ \set filename :abs_srcdir '/data/constrf.data'
+-COPY COPY_TBL FROM :'filename';
++\set command '\\copy COPY_TBL FROM ' :'filename';
++:command
+ 
+ SELECT * FROM COPY_TBL;
+ 
+@@ -599,7 +601,7 @@ DROP TABLE deferred_excl;
+ 
+ -- Comments
+ -- Setup a low-level role to enforce non-superuser checks.
+-CREATE ROLE regress_constraint_comments;
++CREATE ROLE regress_constraint_comments PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ SET SESSION AUTHORIZATION regress_constraint_comments;
+ 
+ CREATE TABLE constraint_comments_tbl (a int CONSTRAINT the_constraint CHECK (a > 0));
+@@ -621,7 +623,7 @@ COMMENT ON CONSTRAINT the_constraint ON DOMAIN constraint_comments_dom IS NULL;
+ 
+ -- unauthorized user
+ RESET SESSION AUTHORIZATION;
+-CREATE ROLE regress_constraint_comments_noaccess;
++CREATE ROLE regress_constraint_comments_noaccess PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ SET SESSION AUTHORIZATION regress_constraint_comments_noaccess;
+ COMMENT ON CONSTRAINT the_constraint ON constraint_comments_tbl IS 'no, the comment';
+ COMMENT ON CONSTRAINT the_constraint ON DOMAIN constraint_comments_dom IS 'no, another comment';
+diff --git a/src/test/regress/sql/conversion.sql b/src/test/regress/sql/conversion.sql
+index 9a65fca91f..58431a3056 100644
+--- a/src/test/regress/sql/conversion.sql
++++ b/src/test/regress/sql/conversion.sql
+@@ -12,7 +12,7 @@ CREATE FUNCTION test_enc_conversion(bytea, name, name, bool, validlen OUT int, r
+     AS :'regresslib', 'test_enc_conversion'
+     LANGUAGE C STRICT;
+ 
+-CREATE USER regress_conversion_user WITH NOCREATEDB NOCREATEROLE;
++CREATE USER regress_conversion_user WITH NOCREATEDB NOCREATEROLE PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ SET SESSION AUTHORIZATION regress_conversion_user;
+ CREATE CONVERSION myconv FOR 'LATIN1' TO 'UTF8' FROM iso8859_1_to_utf8;
+ --
+diff --git a/src/test/regress/sql/copy.sql b/src/test/regress/sql/copy.sql
+index 43d2e906dd..6c993d70f0 100644
+--- a/src/test/regress/sql/copy.sql
++++ b/src/test/regress/sql/copy.sql
+@@ -20,11 +20,13 @@ insert into copytest values('Mac',E'abc\rdef',3);
+ insert into copytest values(E'esc\\ape',E'a\\r\\\r\\\n\\nb',4);
+ 
+ \set filename :abs_builddir '/results/copytest.csv'
+-copy copytest to :'filename' csv;
++\set command '\\copy copytest to ' :'filename' csv;
++:command
+ 
+ create temp table copytest2 (like copytest);
+ 
+-copy copytest2 from :'filename' csv;
++\set command '\\copy copytest2 from ' :'filename' csv;
++:command
+ 
+ select * from copytest except select * from copytest2;
+ 
+@@ -32,9 +34,11 @@ truncate copytest2;
+ 
+ --- same test but with an escape char different from quote char
+ 
+-copy copytest to :'filename' csv quote '''' escape E'\\';
++\set command '\\copy copytest to ' :'filename' ' csv quote ' '\'\'\'\'' ' escape ' 'E\'' '\\\\\'';
++:command
+ 
+-copy copytest2 from :'filename' csv quote '''' escape E'\\';
++\set command '\\copy copytest2 from ' :'filename' ' csv quote ' '\'\'\'\'' ' escape ' 'E\'' '\\\\\'';
++:command
+ 
+ select * from copytest except select * from copytest2;
+ 
+@@ -86,16 +90,19 @@ insert into parted_copytest select x,2,'Two' from generate_series(1001,1010) x;
+ insert into parted_copytest select x,1,'One' from generate_series(1011,1020) x;
+ 
+ \set filename :abs_builddir '/results/parted_copytest.csv'
+-copy (select * from parted_copytest order by a) to :'filename';
++\set command '\\copy (select * from parted_copytest order by a) to ' :'filename';
++:command
+ 
+ truncate parted_copytest;
+ 
+-copy parted_copytest from :'filename';
++\set command '\\copy parted_copytest from ' :'filename';
++:command
+ 
+ -- Ensure COPY FREEZE errors for partitioned tables.
+ begin;
+ truncate parted_copytest;
+-copy parted_copytest from :'filename' (freeze);
++\set command '\\copy parted_copytest from ' :'filename' (freeze);
++:command
+ rollback;
+ 
+ select tableoid::regclass,count(*),sum(a) from parted_copytest
+@@ -115,7 +122,8 @@ create trigger part_ins_trig
+ 	for each row
+ 	execute procedure part_ins_func();
+ 
+-copy parted_copytest from :'filename';
++\set command '\\copy parted_copytest from ' :'filename';
++:command
+ 
+ select tableoid::regclass,count(*),sum(a) from parted_copytest
+ group by tableoid order by tableoid::regclass::name;
+@@ -124,7 +132,8 @@ truncate table parted_copytest;
+ create index on parted_copytest (b);
+ drop trigger part_ins_trig on parted_copytest_a2;
+ 
+-copy parted_copytest from stdin;
++\set command '\\copy parted_copytest from ' stdin;
++:command
+ 1	1	str1
+ 2	2	str2
+ \.
+@@ -191,8 +200,8 @@ bill	20	(11,10)	1000	sharon
+ -- Generate COPY FROM report with FILE, with some excluded tuples.
+ truncate tab_progress_reporting;
+ \set filename :abs_srcdir '/data/emp.data'
+-copy tab_progress_reporting from :'filename'
+-	where (salary < 2000);
++\set command '\\copy tab_progress_reporting from ' :'filename' 'where (salary < 2000)';
++:command
+ 
+ drop trigger check_after_tab_progress_reporting on tab_progress_reporting;
+ drop function notice_after_tab_progress_reporting();
+@@ -311,7 +320,8 @@ CREATE TABLE parted_si_p_odd PARTITION OF parted_si FOR VALUES IN (1);
+ -- https://postgr.es/m/18130-7a86a7356a75209d%40postgresql.org
+ -- https://postgr.es/m/257696.1695670946%40sss.pgh.pa.us
+ \set filename :abs_srcdir '/data/desc.data'
+-COPY parted_si(id, data) FROM :'filename';
++\set command '\\COPY parted_si(id, data) FROM ' :'filename';
++:command
+ 
+ -- An earlier bug (see commit b1ecb9b3fcf) could end up using a buffer from
+ -- the wrong partition. This test is *not* guaranteed to trigger that bug, but
+diff --git a/src/test/regress/sql/copy2.sql b/src/test/regress/sql/copy2.sql
+index d759635068..d58e50dcc5 100644
+--- a/src/test/regress/sql/copy2.sql
++++ b/src/test/regress/sql/copy2.sql
+@@ -365,8 +365,8 @@ copy check_con_tbl from stdin;
+ select * from check_con_tbl;
+ 
+ -- test with RLS enabled.
+-CREATE ROLE regress_rls_copy_user;
+-CREATE ROLE regress_rls_copy_user_colperms;
++CREATE ROLE regress_rls_copy_user PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE ROLE regress_rls_copy_user_colperms PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ CREATE TABLE rls_t1 (a int, b int, c int);
+ 
+ COPY rls_t1 (a, b, c) from stdin;
+diff --git a/src/test/regress/sql/create_function_sql.sql b/src/test/regress/sql/create_function_sql.sql
+index 89e9af3a49..2b86fe2285 100644
+--- a/src/test/regress/sql/create_function_sql.sql
++++ b/src/test/regress/sql/create_function_sql.sql
+@@ -6,7 +6,7 @@
+ 
+ -- All objects made in this test are in temp_func_test schema
+ 
+-CREATE USER regress_unpriv_user;
++CREATE USER regress_unpriv_user PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ 
+ CREATE SCHEMA temp_func_test;
+ GRANT ALL ON SCHEMA temp_func_test TO public;
+diff --git a/src/test/regress/sql/create_index.sql b/src/test/regress/sql/create_index.sql
+index d49ce9f300..47fa813bc8 100644
+--- a/src/test/regress/sql/create_index.sql
++++ b/src/test/regress/sql/create_index.sql
+@@ -71,7 +71,8 @@ CREATE TABLE fast_emp4000 (
+ );
+ 
+ \set filename :abs_srcdir '/data/rect.data'
+-COPY slow_emp4000 FROM :'filename';
++\set command '\\copy slow_emp4000 FROM ' :'filename';
++:command
+ 
+ INSERT INTO fast_emp4000 SELECT * FROM slow_emp4000;
+ 
+@@ -269,7 +270,8 @@ CREATE TABLE array_index_op_test (
+ );
+ 
+ \set filename :abs_srcdir '/data/array.data'
+-COPY array_index_op_test FROM :'filename';
++\set command '\\copy array_index_op_test FROM ' :'filename';
++:command
+ ANALYZE array_index_op_test;
+ 
+ SELECT * FROM array_index_op_test WHERE i = '{NULL}' ORDER BY seqno;
+@@ -1246,7 +1248,7 @@ END;
+ REINDEX SCHEMA CONCURRENTLY schema_to_reindex;
+ 
+ -- Failure for unauthorized user
+-CREATE ROLE regress_reindexuser NOLOGIN;
++CREATE ROLE regress_reindexuser NOLOGIN PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ SET SESSION ROLE regress_reindexuser;
+ REINDEX SCHEMA schema_to_reindex;
+ -- Permission failures with toast tables and indexes (pg_authid here)
+diff --git a/src/test/regress/sql/create_procedure.sql b/src/test/regress/sql/create_procedure.sql
+index 069a3727ce..faeeb3f744 100644
+--- a/src/test/regress/sql/create_procedure.sql
++++ b/src/test/regress/sql/create_procedure.sql
+@@ -255,7 +255,7 @@ DROP PROCEDURE nonexistent();
+ 
+ -- privileges
+ 
+-CREATE USER regress_cp_user1;
++CREATE USER regress_cp_user1 PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ GRANT INSERT ON cp_test TO regress_cp_user1;
+ REVOKE EXECUTE ON PROCEDURE ptest1(text) FROM PUBLIC;
+ SET ROLE regress_cp_user1;
+diff --git a/src/test/regress/sql/create_role.sql b/src/test/regress/sql/create_role.sql
+index 4491a28a8a..3045434865 100644
+--- a/src/test/regress/sql/create_role.sql
++++ b/src/test/regress/sql/create_role.sql
+@@ -1,20 +1,20 @@
+ -- ok, superuser can create users with any set of privileges
+-CREATE ROLE regress_role_super SUPERUSER;
+-CREATE ROLE regress_role_admin CREATEDB CREATEROLE REPLICATION BYPASSRLS;
++CREATE ROLE regress_role_super SUPERUSER PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE ROLE regress_role_admin CREATEDB CREATEROLE REPLICATION BYPASSRLS PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ GRANT CREATE ON DATABASE regression TO regress_role_admin WITH GRANT OPTION;
+-CREATE ROLE regress_role_limited_admin CREATEROLE;
+-CREATE ROLE regress_role_normal;
++CREATE ROLE regress_role_limited_admin CREATEROLE PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE ROLE regress_role_normal PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ 
+ -- fail, CREATEROLE user can't give away role attributes without having them
+ SET SESSION AUTHORIZATION regress_role_limited_admin;
+-CREATE ROLE regress_nosuch_superuser SUPERUSER;
+-CREATE ROLE regress_nosuch_replication_bypassrls REPLICATION BYPASSRLS;
+-CREATE ROLE regress_nosuch_replication REPLICATION;
+-CREATE ROLE regress_nosuch_bypassrls BYPASSRLS;
+-CREATE ROLE regress_nosuch_createdb CREATEDB;
++CREATE ROLE regress_nosuch_superuser SUPERUSER PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE ROLE regress_nosuch_replication_bypassrls REPLICATION BYPASSRLS PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE ROLE regress_nosuch_replication REPLICATION PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE ROLE regress_nosuch_bypassrls BYPASSRLS PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE ROLE regress_nosuch_createdb CREATEDB PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ 
+ -- ok, can create a role without any special attributes
+-CREATE ROLE regress_role_limited;
++CREATE ROLE regress_role_limited PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ 
+ -- fail, can't give it in any of the restricted attributes
+ ALTER ROLE regress_role_limited SUPERUSER;
+@@ -25,10 +25,10 @@ DROP ROLE regress_role_limited;
+ 
+ -- ok, can give away these role attributes if you have them
+ SET SESSION AUTHORIZATION regress_role_admin;
+-CREATE ROLE regress_replication_bypassrls REPLICATION BYPASSRLS;
+-CREATE ROLE regress_replication REPLICATION;
+-CREATE ROLE regress_bypassrls BYPASSRLS;
+-CREATE ROLE regress_createdb CREATEDB;
++CREATE ROLE regress_replication_bypassrls REPLICATION BYPASSRLS PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE ROLE regress_replication REPLICATION PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE ROLE regress_bypassrls BYPASSRLS PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE ROLE regress_createdb CREATEDB PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ 
+ -- ok, can toggle these role attributes off and on if you have them
+ ALTER ROLE regress_replication NOREPLICATION;
+@@ -43,52 +43,52 @@ ALTER ROLE regress_createdb SUPERUSER;
+ ALTER ROLE regress_createdb NOSUPERUSER;
+ 
+ -- ok, having CREATEROLE is enough to create users with these privileges
+-CREATE ROLE regress_createrole CREATEROLE NOINHERIT;
++CREATE ROLE regress_createrole CREATEROLE NOINHERIT PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ GRANT CREATE ON DATABASE regression TO regress_createrole WITH GRANT OPTION;
+-CREATE ROLE regress_login LOGIN;
+-CREATE ROLE regress_inherit INHERIT;
+-CREATE ROLE regress_connection_limit CONNECTION LIMIT 5;
+-CREATE ROLE regress_encrypted_password ENCRYPTED PASSWORD 'foo';
+-CREATE ROLE regress_password_null PASSWORD NULL;
++CREATE ROLE regress_login LOGIN PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE ROLE regress_inherit INHERIT PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE ROLE regress_connection_limit CONNECTION LIMIT 5 PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE ROLE regress_encrypted_password ENCRYPTED PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE ROLE regress_password_null PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ 
+ -- ok, backwards compatible noise words should be ignored
+-CREATE ROLE regress_noiseword SYSID 12345;
++CREATE ROLE regress_noiseword SYSID 12345 PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ 
+ -- fail, cannot grant membership in superuser role
+-CREATE ROLE regress_nosuch_super IN ROLE regress_role_super;
++CREATE ROLE regress_nosuch_super IN ROLE regress_role_super PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ 
+ -- fail, database owner cannot have members
+-CREATE ROLE regress_nosuch_dbowner IN ROLE pg_database_owner;
++CREATE ROLE regress_nosuch_dbowner IN ROLE pg_database_owner PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ 
+ -- ok, can grant other users into a role
+ CREATE ROLE regress_inroles ROLE
+ 	regress_role_super, regress_createdb, regress_createrole, regress_login,
+-	regress_inherit, regress_connection_limit, regress_encrypted_password, regress_password_null;
++	regress_inherit, regress_connection_limit, regress_encrypted_password, regress_password_null PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ 
+ -- fail, cannot grant a role into itself
+-CREATE ROLE regress_nosuch_recursive ROLE regress_nosuch_recursive;
++CREATE ROLE regress_nosuch_recursive ROLE regress_nosuch_recursive PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ 
+ -- ok, can grant other users into a role with admin option
+ CREATE ROLE regress_adminroles ADMIN
+ 	regress_role_super, regress_createdb, regress_createrole, regress_login,
+-	regress_inherit, regress_connection_limit, regress_encrypted_password, regress_password_null;
++	regress_inherit, regress_connection_limit, regress_encrypted_password, regress_password_null PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ 
+ -- fail, cannot grant a role into itself with admin option
+-CREATE ROLE regress_nosuch_admin_recursive ADMIN regress_nosuch_admin_recursive;
++CREATE ROLE regress_nosuch_admin_recursive ADMIN regress_nosuch_admin_recursive PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ 
+ -- fail, regress_createrole does not have CREATEDB privilege
+ SET SESSION AUTHORIZATION regress_createrole;
+ CREATE DATABASE regress_nosuch_db;
+ 
+ -- ok, regress_createrole can create new roles
+-CREATE ROLE regress_plainrole;
++CREATE ROLE regress_plainrole PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ 
+ -- ok, roles with CREATEROLE can create new roles with it
+-CREATE ROLE regress_rolecreator CREATEROLE;
++CREATE ROLE regress_rolecreator CREATEROLE PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ 
+ -- ok, roles with CREATEROLE can create new roles with different role
+ -- attributes, including CREATEROLE
+-CREATE ROLE regress_hasprivs CREATEROLE LOGIN INHERIT CONNECTION LIMIT 5;
++CREATE ROLE regress_hasprivs CREATEROLE LOGIN INHERIT CONNECTION LIMIT 5 PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ 
+ -- ok, we should be able to modify a role we created
+ COMMENT ON ROLE regress_hasprivs IS 'some comment';
+@@ -123,7 +123,7 @@ REASSIGN OWNED BY regress_tenant TO regress_createrole;
+ 
+ -- ok, create a role with a value for createrole_self_grant
+ SET createrole_self_grant = 'set, inherit';
+-CREATE ROLE regress_tenant2;
++CREATE ROLE regress_tenant2 PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ GRANT CREATE ON DATABASE regression TO regress_tenant2;
+ 
+ -- ok, regress_tenant2 can create objects within the database
+@@ -150,16 +150,16 @@ ALTER TABLE tenant2_table OWNER TO regress_tenant2;
+ DROP TABLE tenant2_table;
+ 
+ -- fail, CREATEROLE is not enough to create roles in privileged roles
+-CREATE ROLE regress_read_all_data IN ROLE pg_read_all_data;
+-CREATE ROLE regress_write_all_data IN ROLE pg_write_all_data;
+-CREATE ROLE regress_monitor IN ROLE pg_monitor;
+-CREATE ROLE regress_read_all_settings IN ROLE pg_read_all_settings;
+-CREATE ROLE regress_read_all_stats IN ROLE pg_read_all_stats;
+-CREATE ROLE regress_stat_scan_tables IN ROLE pg_stat_scan_tables;
+-CREATE ROLE regress_read_server_files IN ROLE pg_read_server_files;
+-CREATE ROLE regress_write_server_files IN ROLE pg_write_server_files;
+-CREATE ROLE regress_execute_server_program IN ROLE pg_execute_server_program;
+-CREATE ROLE regress_signal_backend IN ROLE pg_signal_backend;
++CREATE ROLE regress_read_all_data PASSWORD NEON_PASSWORD_PLACEHOLDER IN ROLE pg_read_all_data;
++CREATE ROLE regress_write_all_data PASSWORD NEON_PASSWORD_PLACEHOLDER IN ROLE pg_write_all_data;
++CREATE ROLE regress_monitor PASSWORD NEON_PASSWORD_PLACEHOLDER IN ROLE pg_monitor;
++CREATE ROLE regress_read_all_settings PASSWORD NEON_PASSWORD_PLACEHOLDER IN ROLE pg_read_all_settings;
++CREATE ROLE regress_read_all_stats PASSWORD NEON_PASSWORD_PLACEHOLDER IN ROLE pg_read_all_stats;
++CREATE ROLE regress_stat_scan_tables PASSWORD NEON_PASSWORD_PLACEHOLDER IN ROLE pg_stat_scan_tables;
++CREATE ROLE regress_read_server_files PASSWORD NEON_PASSWORD_PLACEHOLDER IN ROLE pg_read_server_files;
++CREATE ROLE regress_write_server_files PASSWORD NEON_PASSWORD_PLACEHOLDER IN ROLE pg_write_server_files;
++CREATE ROLE regress_execute_server_program PASSWORD NEON_PASSWORD_PLACEHOLDER IN ROLE pg_execute_server_program;
++CREATE ROLE regress_signal_backend PASSWORD NEON_PASSWORD_PLACEHOLDER IN ROLE pg_signal_backend;
+ 
+ -- fail, role still owns database objects
+ DROP ROLE regress_tenant;
+diff --git a/src/test/regress/sql/create_schema.sql b/src/test/regress/sql/create_schema.sql
+index 1b7064247a..be5b662ce1 100644
+--- a/src/test/regress/sql/create_schema.sql
++++ b/src/test/regress/sql/create_schema.sql
+@@ -4,7 +4,7 @@
+ 
+ -- Schema creation with elements.
+ 
+-CREATE ROLE regress_create_schema_role SUPERUSER;
++CREATE ROLE regress_create_schema_role SUPERUSER PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ 
+ -- Cases where schema creation fails as objects are qualified with a schema
+ -- that does not match with what's expected.
+diff --git a/src/test/regress/sql/create_view.sql b/src/test/regress/sql/create_view.sql
+index 3a78be1b0c..617d2dc8d6 100644
+--- a/src/test/regress/sql/create_view.sql
++++ b/src/test/regress/sql/create_view.sql
+@@ -23,7 +23,8 @@ CREATE TABLE real_city (
+ );
+ 
+ \set filename :abs_srcdir '/data/real_city.data'
+-COPY real_city FROM :'filename';
++\set command '\\copy real_city FROM ' :'filename';
++:command
+ ANALYZE real_city;
+ 
+ SELECT *
+diff --git a/src/test/regress/sql/database.sql b/src/test/regress/sql/database.sql
+index 0367c0e37a..a23b98c4bd 100644
+--- a/src/test/regress/sql/database.sql
++++ b/src/test/regress/sql/database.sql
+@@ -1,8 +1,6 @@
+ CREATE DATABASE regression_tbd
+ 	ENCODING utf8 LC_COLLATE "C" LC_CTYPE "C" TEMPLATE template0;
+ ALTER DATABASE regression_tbd RENAME TO regression_utf8;
+-ALTER DATABASE regression_utf8 SET TABLESPACE regress_tblspace;
+-ALTER DATABASE regression_utf8 RESET TABLESPACE;
+ ALTER DATABASE regression_utf8 CONNECTION_LIMIT 123;
+ 
+ -- Test PgDatabaseToastTable.  Doing this with GRANT would be slow.
+diff --git a/src/test/regress/sql/dependency.sql b/src/test/regress/sql/dependency.sql
+index 2559c62d0b..06c3aa1a36 100644
+--- a/src/test/regress/sql/dependency.sql
++++ b/src/test/regress/sql/dependency.sql
+@@ -2,10 +2,10 @@
+ -- DEPENDENCIES
+ --
+ 
+-CREATE USER regress_dep_user;
+-CREATE USER regress_dep_user2;
+-CREATE USER regress_dep_user3;
+-CREATE GROUP regress_dep_group;
++CREATE USER regress_dep_user PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE USER regress_dep_user2 PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE USER regress_dep_user3 PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE GROUP regress_dep_group PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ 
+ CREATE TABLE deptest (f1 serial primary key, f2 text);
+ 
+@@ -45,9 +45,9 @@ DROP TABLE deptest;
+ DROP USER regress_dep_user3;
+ 
+ -- Test DROP OWNED
+-CREATE USER regress_dep_user0;
+-CREATE USER regress_dep_user1;
+-CREATE USER regress_dep_user2;
++CREATE USER regress_dep_user0 PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE USER regress_dep_user1 PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE USER regress_dep_user2 PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ SET SESSION AUTHORIZATION regress_dep_user0;
+ -- permission denied
+ DROP OWNED BY regress_dep_user1;
+diff --git a/src/test/regress/sql/drop_if_exists.sql b/src/test/regress/sql/drop_if_exists.sql
+index ac6168b91f..4270062ec7 100644
+--- a/src/test/regress/sql/drop_if_exists.sql
++++ b/src/test/regress/sql/drop_if_exists.sql
+@@ -86,9 +86,9 @@ DROP DOMAIN test_domain_exists;
+ --- role/user/group
+ ---
+ 
+-CREATE USER regress_test_u1;
+-CREATE ROLE regress_test_r1;
+-CREATE GROUP regress_test_g1;
++CREATE USER regress_test_u1 PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE ROLE regress_test_r1 PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE GROUP regress_test_g1 PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ 
+ DROP USER regress_test_u2;
+ 
+diff --git a/src/test/regress/sql/equivclass.sql b/src/test/regress/sql/equivclass.sql
+index 247b0a3105..bf018fd3a1 100644
+--- a/src/test/regress/sql/equivclass.sql
++++ b/src/test/regress/sql/equivclass.sql
+@@ -230,7 +230,7 @@ set enable_mergejoin = off;
+ alter table ec1 enable row level security;
+ create policy p1 on ec1 using (f1 < '5'::int8alias1);
+ 
+-create user regress_user_ectest;
++create user regress_user_ectest PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ grant select on ec0 to regress_user_ectest;
+ grant select on ec1 to regress_user_ectest;
+ 
+diff --git a/src/test/regress/sql/event_trigger.sql b/src/test/regress/sql/event_trigger.sql
+index 1aeaddbe71..89a410ec4a 100644
+--- a/src/test/regress/sql/event_trigger.sql
++++ b/src/test/regress/sql/event_trigger.sql
+@@ -86,7 +86,7 @@ create event trigger regress_event_trigger2 on ddl_command_start
+ comment on event trigger regress_event_trigger is 'test comment';
+ 
+ -- drop as non-superuser should fail
+-create role regress_evt_user;
++create role regress_evt_user PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ set role regress_evt_user;
+ create event trigger regress_event_trigger_noperms on ddl_command_start
+    execute procedure test_event_trigger();
+diff --git a/src/test/regress/sql/foreign_data.sql b/src/test/regress/sql/foreign_data.sql
+index aa147b14a9..370e0dd570 100644
+--- a/src/test/regress/sql/foreign_data.sql
++++ b/src/test/regress/sql/foreign_data.sql
+@@ -22,14 +22,14 @@ DROP ROLE IF EXISTS regress_foreign_data_user, regress_test_role, regress_test_r
+ 
+ RESET client_min_messages;
+ 
+-CREATE ROLE regress_foreign_data_user LOGIN SUPERUSER;
++CREATE ROLE regress_foreign_data_user LOGIN SUPERUSER PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ SET SESSION AUTHORIZATION 'regress_foreign_data_user';
+ 
+-CREATE ROLE regress_test_role;
+-CREATE ROLE regress_test_role2;
+-CREATE ROLE regress_test_role_super SUPERUSER;
+-CREATE ROLE regress_test_indirect;
+-CREATE ROLE regress_unprivileged_role;
++CREATE ROLE regress_test_role PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE ROLE regress_test_role2 PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE ROLE regress_test_role_super SUPERUSER PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE ROLE regress_test_indirect PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE ROLE regress_unprivileged_role PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ 
+ CREATE FOREIGN DATA WRAPPER dummy;
+ COMMENT ON FOREIGN DATA WRAPPER dummy IS 'useless';
+diff --git a/src/test/regress/sql/foreign_key.sql b/src/test/regress/sql/foreign_key.sql
+index 22e177f89b..7138d5e1d4 100644
+--- a/src/test/regress/sql/foreign_key.sql
++++ b/src/test/regress/sql/foreign_key.sql
+@@ -1418,7 +1418,7 @@ ALTER TABLE fk_partitioned_fk ATTACH PARTITION fk_partitioned_fk_2
+ -- leave these tables around intentionally
+ 
+ -- test the case when the referenced table is owned by a different user
+-create role regress_other_partitioned_fk_owner;
++create role regress_other_partitioned_fk_owner PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ grant references on fk_notpartitioned_pk to regress_other_partitioned_fk_owner;
+ set role regress_other_partitioned_fk_owner;
+ create table other_partitioned_fk(a int, b int) partition by list (a);
+diff --git a/src/test/regress/sql/generated.sql b/src/test/regress/sql/generated.sql
+index 298f6b3aa8..f058913ae0 100644
+--- a/src/test/regress/sql/generated.sql
++++ b/src/test/regress/sql/generated.sql
+@@ -263,7 +263,7 @@ ALTER TABLE gtest10a DROP COLUMN b;
+ INSERT INTO gtest10a (a) VALUES (1);
+ 
+ -- privileges
+-CREATE USER regress_user11;
++CREATE USER regress_user11 PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ 
+ CREATE TABLE gtest11s (a int PRIMARY KEY, b int, c int GENERATED ALWAYS AS (b * 2) STORED);
+ INSERT INTO gtest11s VALUES (1, 10), (2, 20);
+diff --git a/src/test/regress/sql/guc.sql b/src/test/regress/sql/guc.sql
+index dc79761955..a9ead75349 100644
+--- a/src/test/regress/sql/guc.sql
++++ b/src/test/regress/sql/guc.sql
+@@ -188,7 +188,7 @@ PREPARE foo AS SELECT 1;
+ LISTEN foo_event;
+ SET vacuum_cost_delay = 13;
+ CREATE TEMP TABLE tmp_foo (data text) ON COMMIT DELETE ROWS;
+-CREATE ROLE regress_guc_user;
++CREATE ROLE regress_guc_user PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ SET SESSION AUTHORIZATION regress_guc_user;
+ -- look changes
+ SELECT pg_listening_channels();
+diff --git a/src/test/regress/sql/hash_index.sql b/src/test/regress/sql/hash_index.sql
+index 527024f710..de49c0b85f 100644
+--- a/src/test/regress/sql/hash_index.sql
++++ b/src/test/regress/sql/hash_index.sql
+@@ -26,10 +26,14 @@ CREATE TABLE hash_f8_heap (
+ );
+ 
+ \set filename :abs_srcdir '/data/hash.data'
+-COPY hash_i4_heap FROM :'filename';
+-COPY hash_name_heap FROM :'filename';
+-COPY hash_txt_heap FROM :'filename';
+-COPY hash_f8_heap FROM :'filename';
++\set command '\\copy hash_i4_heap FROM ' :'filename';
++:command
++\set command '\\copy hash_name_heap FROM ' :'filename';
++:command
++\set command '\\copy hash_txt_heap FROM ' :'filename';
++:command
++\set command '\\copy hash_f8_heap FROM ' :'filename';
++:command
+ 
+ -- the data in this file has a lot of duplicates in the index key
+ -- fields, leading to long bucket chains and lots of table expansion.
+diff --git a/src/test/regress/sql/identity.sql b/src/test/regress/sql/identity.sql
+index 91d2e443b4..241c93f373 100644
+--- a/src/test/regress/sql/identity.sql
++++ b/src/test/regress/sql/identity.sql
+@@ -287,7 +287,7 @@ ALTER TABLE itest7 ALTER COLUMN a RESTART;
+ ALTER TABLE itest7 ALTER COLUMN a DROP IDENTITY;
+ 
+ -- privileges
+-CREATE USER regress_identity_user1;
++CREATE USER regress_identity_user1 PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ CREATE TABLE itest8 (a int GENERATED ALWAYS AS IDENTITY, b text);
+ GRANT SELECT, INSERT ON itest8 TO regress_identity_user1;
+ SET ROLE regress_identity_user1;
+diff --git a/src/test/regress/sql/inherit.sql b/src/test/regress/sql/inherit.sql
+index fe699c54d5..bdd5993f45 100644
+--- a/src/test/regress/sql/inherit.sql
++++ b/src/test/regress/sql/inherit.sql
+@@ -950,7 +950,7 @@ create index on permtest_parent (left(c, 3));
+ insert into permtest_parent
+   select 1, 'a', left(fipshash(i::text), 5) from generate_series(0, 100) i;
+ analyze permtest_parent;
+-create role regress_no_child_access;
++create role regress_no_child_access PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ revoke all on permtest_grandchild from regress_no_child_access;
+ grant select on permtest_parent to regress_no_child_access;
+ set session authorization regress_no_child_access;
+diff --git a/src/test/regress/sql/insert.sql b/src/test/regress/sql/insert.sql
+index 2b086eeb6d..913d8a0aed 100644
+--- a/src/test/regress/sql/insert.sql
++++ b/src/test/regress/sql/insert.sql
+@@ -513,7 +513,7 @@ drop table mlparted5;
+ create table key_desc (a int, b int) partition by list ((a+0));
+ create table key_desc_1 partition of key_desc for values in (1) partition by range (b);
+ 
+-create user regress_insert_other_user;
++create user regress_insert_other_user PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ grant select (a) on key_desc_1 to regress_insert_other_user;
+ grant insert on key_desc to regress_insert_other_user;
+ 
+@@ -597,7 +597,7 @@ insert into brtrigpartcon1 values (1, 'hi there');
+ -- check that the message shows the appropriate column description in a
+ -- situation where the partitioned table is not the primary ModifyTable node
+ create table inserttest3 (f1 text default 'foo', f2 text default 'bar', f3 int);
+-create role regress_coldesc_role;
++create role regress_coldesc_role PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ grant insert on inserttest3 to regress_coldesc_role;
+ grant insert on brtrigpartcon to regress_coldesc_role;
+ revoke select on brtrigpartcon from regress_coldesc_role;
+diff --git a/src/test/regress/sql/jsonb.sql b/src/test/regress/sql/jsonb.sql
+index 6dae715afd..aa320ba7be 100644
+--- a/src/test/regress/sql/jsonb.sql
++++ b/src/test/regress/sql/jsonb.sql
+@@ -6,7 +6,8 @@ CREATE TABLE testjsonb (
+ );
+ 
+ \set filename :abs_srcdir '/data/jsonb.data'
+-COPY testjsonb FROM :'filename';
++\set command '\\copy testjsonb FROM ' :'filename';
++:command
+ 
+ -- Strings.
+ SELECT '""'::jsonb;				-- OK.
+diff --git a/src/test/regress/sql/largeobject.sql b/src/test/regress/sql/largeobject.sql
+index a4aee02e3a..8839c9496a 100644
+--- a/src/test/regress/sql/largeobject.sql
++++ b/src/test/regress/sql/largeobject.sql
+@@ -10,7 +10,7 @@
+ SET bytea_output TO escape;
+ 
+ -- Test ALTER LARGE OBJECT OWNER
+-CREATE ROLE regress_lo_user;
++CREATE ROLE regress_lo_user PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ SELECT lo_create(42);
+ ALTER LARGE OBJECT 42 OWNER TO regress_lo_user;
+ 
+@@ -189,7 +189,8 @@ SELECT lo_unlink(loid) from lotest_stash_values;
+ TRUNCATE lotest_stash_values;
+ 
+ \set filename :abs_srcdir '/data/tenk.data'
+-INSERT INTO lotest_stash_values (loid) SELECT lo_import(:'filename');
++\lo_import :filename
++INSERT INTO lotest_stash_values (loid) VALUES (:LASTOID);
+ 
+ BEGIN;
+ UPDATE lotest_stash_values SET fd=lo_open(loid, CAST(x'20000' | x'40000' AS integer));
+@@ -219,8 +220,8 @@ SELECT lo_close(fd) FROM lotest_stash_values;
+ END;
+ 
+ \set filename :abs_builddir '/results/lotest.txt'
+-SELECT lo_export(loid, :'filename') FROM lotest_stash_values;
+-
++SELECT loid FROM lotest_stash_values \gset
++\lo_export :loid, :filename
+ \lo_import :filename
+ 
+ \set newloid :LASTOID
+diff --git a/src/test/regress/sql/lock.sql b/src/test/regress/sql/lock.sql
+index b88488c6d0..78b31e6dd3 100644
+--- a/src/test/regress/sql/lock.sql
++++ b/src/test/regress/sql/lock.sql
+@@ -19,7 +19,7 @@ CREATE VIEW lock_view3 AS SELECT * from lock_view2;
+ CREATE VIEW lock_view4 AS SELECT (select a from lock_tbl1a limit 1) from lock_tbl1;
+ CREATE VIEW lock_view5 AS SELECT * from lock_tbl1 where a in (select * from lock_tbl1a);
+ CREATE VIEW lock_view6 AS SELECT * from (select * from lock_tbl1) sub;
+-CREATE ROLE regress_rol_lock1;
++CREATE ROLE regress_rol_lock1 PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ ALTER ROLE regress_rol_lock1 SET search_path = lock_schema1;
+ GRANT USAGE ON SCHEMA lock_schema1 TO regress_rol_lock1;
+ 
+diff --git a/src/test/regress/sql/matview.sql b/src/test/regress/sql/matview.sql
+index 235123de1e..58e73cec5d 100644
+--- a/src/test/regress/sql/matview.sql
++++ b/src/test/regress/sql/matview.sql
+@@ -209,7 +209,7 @@ SELECT * FROM mvtest_mv_v;
+ DROP TABLE mvtest_v CASCADE;
+ 
+ -- make sure running as superuser works when MV owned by another role (bug #11208)
+-CREATE ROLE regress_user_mvtest;
++CREATE ROLE regress_user_mvtest PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ SET ROLE regress_user_mvtest;
+ -- this test case also checks for ambiguity in the queries issued by
+ -- refresh_by_match_merge(), by choosing column names that intentionally
+@@ -264,7 +264,7 @@ ROLLBACK;
+ 
+ -- INSERT privileges if relation owner is not allowed to insert.
+ CREATE SCHEMA matview_schema;
+-CREATE USER regress_matview_user;
++CREATE USER regress_matview_user PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ ALTER DEFAULT PRIVILEGES FOR ROLE regress_matview_user
+   REVOKE INSERT ON TABLES FROM regress_matview_user;
+ GRANT ALL ON SCHEMA matview_schema TO public;
+diff --git a/src/test/regress/sql/merge.sql b/src/test/regress/sql/merge.sql
+index 2a220a248f..91a404d51e 100644
+--- a/src/test/regress/sql/merge.sql
++++ b/src/test/regress/sql/merge.sql
+@@ -2,9 +2,9 @@
+ -- MERGE
+ --
+ 
+-CREATE USER regress_merge_privs;
+-CREATE USER regress_merge_no_privs;
+-CREATE USER regress_merge_none;
++CREATE USER regress_merge_privs PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE USER regress_merge_no_privs PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE USER regress_merge_none PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ 
+ DROP TABLE IF EXISTS target;
+ DROP TABLE IF EXISTS source;
+diff --git a/src/test/regress/sql/misc.sql b/src/test/regress/sql/misc.sql
+index 165a2e175f..08d7096e2c 100644
+--- a/src/test/regress/sql/misc.sql
++++ b/src/test/regress/sql/misc.sql
+@@ -74,22 +74,26 @@ DROP TABLE tmp;
+ -- copy
+ --
+ \set filename :abs_builddir '/results/onek.data'
+-COPY onek TO :'filename';
++\set command '\\copy onek TO ' :'filename';
++:command
+ 
+ CREATE TEMP TABLE onek_copy (LIKE onek);
+ 
+-COPY onek_copy FROM :'filename';
++\set command '\\copy onek_copy FROM ' :'filename';
++:command
+ 
+ SELECT * FROM onek EXCEPT ALL SELECT * FROM onek_copy;
+ 
+ SELECT * FROM onek_copy EXCEPT ALL SELECT * FROM onek;
+ 
+ \set filename :abs_builddir '/results/stud_emp.data'
+-COPY BINARY stud_emp TO :'filename';
++\set command '\\COPY BINARY stud_emp TO ' :'filename';
++:command
+ 
+ CREATE TEMP TABLE stud_emp_copy (LIKE stud_emp);
+ 
+-COPY BINARY stud_emp_copy FROM :'filename';
++\set command '\\COPY BINARY stud_emp_copy FROM ' :'filename';
++:command
+ 
+ SELECT * FROM stud_emp_copy;
+ 
+diff --git a/src/test/regress/sql/misc_functions.sql b/src/test/regress/sql/misc_functions.sql
+index b57f01f3e9..3e05aa6400 100644
+--- a/src/test/regress/sql/misc_functions.sql
++++ b/src/test/regress/sql/misc_functions.sql
+@@ -82,7 +82,7 @@ SELECT pg_log_backend_memory_contexts(pg_backend_pid());
+ SELECT pg_log_backend_memory_contexts(pid) FROM pg_stat_activity
+   WHERE backend_type = 'checkpointer';
+ 
+-CREATE ROLE regress_log_memory;
++CREATE ROLE regress_log_memory PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ 
+ SELECT has_function_privilege('regress_log_memory',
+   'pg_log_backend_memory_contexts(integer)', 'EXECUTE'); -- no
+@@ -169,7 +169,7 @@ select count(*) > 0 from
+ --
+ -- Test replication slot directory functions
+ --
+-CREATE ROLE regress_slot_dir_funcs;
++CREATE ROLE regress_slot_dir_funcs PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ -- Not available by default.
+ SELECT has_function_privilege('regress_slot_dir_funcs',
+   'pg_ls_logicalsnapdir()', 'EXECUTE');
+diff --git a/src/test/regress/sql/object_address.sql b/src/test/regress/sql/object_address.sql
+index 1a6c61f49d..1c31ac6a53 100644
+--- a/src/test/regress/sql/object_address.sql
++++ b/src/test/regress/sql/object_address.sql
+@@ -7,7 +7,7 @@ SET client_min_messages TO 'warning';
+ DROP ROLE IF EXISTS regress_addr_user;
+ RESET client_min_messages;
+ 
+-CREATE USER regress_addr_user;
++CREATE USER regress_addr_user PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ 
+ -- Test generic object addressing/identification functions
+ CREATE SCHEMA addr_nsp;
+diff --git a/src/test/regress/sql/password.sql b/src/test/regress/sql/password.sql
+index 53e86b0b6c..f07cf1ec54 100644
+--- a/src/test/regress/sql/password.sql
++++ b/src/test/regress/sql/password.sql
+@@ -10,11 +10,11 @@ SET password_encryption = 'scram-sha-256'; -- ok
+ 
+ -- consistency of password entries
+ SET password_encryption = 'md5';
+-CREATE ROLE regress_passwd1 PASSWORD 'role_pwd1';
+-CREATE ROLE regress_passwd2 PASSWORD 'role_pwd2';
++CREATE ROLE regress_passwd1 PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE ROLE regress_passwd2 PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ SET password_encryption = 'scram-sha-256';
+-CREATE ROLE regress_passwd3 PASSWORD 'role_pwd3';
+-CREATE ROLE regress_passwd4 PASSWORD NULL;
++CREATE ROLE regress_passwd3 PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE ROLE regress_passwd4 PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ 
+ -- check list of created entries
+ --
+@@ -42,14 +42,14 @@ ALTER ROLE regress_passwd2_new RENAME TO regress_passwd2;
+ SET password_encryption = 'md5';
+ 
+ -- encrypt with MD5
+-ALTER ROLE regress_passwd2 PASSWORD 'foo';
++ALTER ROLE regress_passwd2 PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ -- already encrypted, use as they are
+ ALTER ROLE regress_passwd1 PASSWORD 'md5cd3578025fe2c3d7ed1b9a9b26238b70';
+ ALTER ROLE regress_passwd3 PASSWORD 'SCRAM-SHA-256$4096:VLK4RMaQLCvNtQ==$6YtlR4t69SguDiwFvbVgVZtuz6gpJQQqUMZ7IQJK5yI=:ps75jrHeYU4lXCcXI4O8oIdJ3eO8o2jirjruw9phBTo=';
+ 
+ SET password_encryption = 'scram-sha-256';
+ -- create SCRAM secret
+-ALTER ROLE  regress_passwd4 PASSWORD 'foo';
++ALTER ROLE  regress_passwd4 PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ -- already encrypted with MD5, use as it is
+ CREATE ROLE regress_passwd5 PASSWORD 'md5e73a4b11df52a6068f8b39f90be36023';
+ 
+diff --git a/src/test/regress/sql/privileges.sql b/src/test/regress/sql/privileges.sql
+index 3f68cafcd1..004b26831d 100644
+--- a/src/test/regress/sql/privileges.sql
++++ b/src/test/regress/sql/privileges.sql
+@@ -24,18 +24,18 @@ RESET client_min_messages;
+ 
+ -- test proper begins here
+ 
+-CREATE USER regress_priv_user1;
+-CREATE USER regress_priv_user2;
+-CREATE USER regress_priv_user3;
+-CREATE USER regress_priv_user4;
+-CREATE USER regress_priv_user5;
+-CREATE USER regress_priv_user5;	-- duplicate
+-CREATE USER regress_priv_user6;
+-CREATE USER regress_priv_user7;
+-CREATE USER regress_priv_user8;
+-CREATE USER regress_priv_user9;
+-CREATE USER regress_priv_user10;
+-CREATE ROLE regress_priv_role;
++CREATE USER regress_priv_user1 PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE USER regress_priv_user2 PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE USER regress_priv_user3 PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE USER regress_priv_user4 PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE USER regress_priv_user5 PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE USER regress_priv_user5 PASSWORD NEON_PASSWORD_PLACEHOLDER;	-- duplicate
++CREATE USER regress_priv_user6 PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE USER regress_priv_user7 PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE USER regress_priv_user8 PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE USER regress_priv_user9 PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE USER regress_priv_user10 PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE ROLE regress_priv_role PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ 
+ -- circular ADMIN OPTION grants should be disallowed
+ GRANT regress_priv_user1 TO regress_priv_user2 WITH ADMIN OPTION;
+@@ -84,11 +84,11 @@ DROP ROLE regress_priv_user5; -- should fail, dependency
+ DROP ROLE regress_priv_user1, regress_priv_user5; -- ok, despite order
+ 
+ -- recreate the roles we just dropped
+-CREATE USER regress_priv_user1;
+-CREATE USER regress_priv_user2;
+-CREATE USER regress_priv_user3;
+-CREATE USER regress_priv_user4;
+-CREATE USER regress_priv_user5;
++CREATE USER regress_priv_user1 PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE USER regress_priv_user2 PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE USER regress_priv_user3 PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE USER regress_priv_user4 PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE USER regress_priv_user5 PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ 
+ GRANT pg_read_all_data TO regress_priv_user6;
+ GRANT pg_write_all_data TO regress_priv_user7;
+@@ -130,8 +130,8 @@ DROP USER regress_priv_user10;
+ DROP USER regress_priv_user9;
+ DROP USER regress_priv_user8;
+ 
+-CREATE GROUP regress_priv_group1;
+-CREATE GROUP regress_priv_group2 WITH ADMIN regress_priv_user1 USER regress_priv_user2;
++CREATE GROUP regress_priv_group1 PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE GROUP regress_priv_group2 WITH ADMIN regress_priv_user1 PASSWORD NEON_PASSWORD_PLACEHOLDER USER regress_priv_user2;
+ 
+ ALTER GROUP regress_priv_group1 ADD USER regress_priv_user4;
+ 
+@@ -1124,7 +1124,7 @@ SELECT has_table_privilege('regress_priv_user1', 'atest4', 'SELECT WITH GRANT OP
+ 
+ -- security-restricted operations
+ \c -
+-CREATE ROLE regress_sro_user;
++CREATE ROLE regress_sro_user PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ 
+ -- Check that index expressions and predicates are run as the table's owner
+ 
+@@ -1620,8 +1620,8 @@ DROP SCHEMA testns CASCADE;
+ -- Change owner of the schema & and rename of new schema owner
+ \c -
+ 
+-CREATE ROLE regress_schemauser1 superuser login;
+-CREATE ROLE regress_schemauser2 superuser login;
++CREATE ROLE regress_schemauser1 superuser login PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE ROLE regress_schemauser2 superuser login PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ 
+ SET SESSION ROLE regress_schemauser1;
+ CREATE SCHEMA testns;
+@@ -1715,7 +1715,7 @@ DROP USER regress_priv_user8; -- does not exist
+ 
+ 
+ -- permissions with LOCK TABLE
+-CREATE USER regress_locktable_user;
++CREATE USER regress_locktable_user PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ CREATE TABLE lock_table (a int);
+ 
+ -- LOCK TABLE and SELECT permission
+@@ -1803,7 +1803,7 @@ DROP USER regress_locktable_user;
+ -- switch to superuser
+ \c -
+ 
+-CREATE ROLE regress_readallstats;
++CREATE ROLE regress_readallstats PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ 
+ SELECT has_table_privilege('regress_readallstats','pg_backend_memory_contexts','SELECT'); -- no
+ SELECT has_table_privilege('regress_readallstats','pg_shmem_allocations','SELECT'); -- no
+@@ -1823,10 +1823,10 @@ RESET ROLE;
+ DROP ROLE regress_readallstats;
+ 
+ -- test role grantor machinery
+-CREATE ROLE regress_group;
+-CREATE ROLE regress_group_direct_manager;
+-CREATE ROLE regress_group_indirect_manager;
+-CREATE ROLE regress_group_member;
++CREATE ROLE regress_group PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE ROLE regress_group_direct_manager PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE ROLE regress_group_indirect_manager PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE ROLE regress_group_member PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ 
+ GRANT regress_group TO regress_group_direct_manager WITH INHERIT FALSE, ADMIN TRUE;
+ GRANT regress_group_direct_manager TO regress_group_indirect_manager;
+@@ -1848,9 +1848,9 @@ DROP ROLE regress_group_indirect_manager;
+ DROP ROLE regress_group_member;
+ 
+ -- test SET and INHERIT options with object ownership changes
+-CREATE ROLE regress_roleoption_protagonist;
+-CREATE ROLE regress_roleoption_donor;
+-CREATE ROLE regress_roleoption_recipient;
++CREATE ROLE regress_roleoption_protagonist PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE ROLE regress_roleoption_donor PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE ROLE regress_roleoption_recipient PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ CREATE SCHEMA regress_roleoption;
+ GRANT CREATE, USAGE ON SCHEMA regress_roleoption TO PUBLIC;
+ GRANT regress_roleoption_donor TO regress_roleoption_protagonist WITH INHERIT TRUE, SET FALSE;
+diff --git a/src/test/regress/sql/psql.sql b/src/test/regress/sql/psql.sql
+index f3bc6cd07e..f1a2f58069 100644
+--- a/src/test/regress/sql/psql.sql
++++ b/src/test/regress/sql/psql.sql
+@@ -496,7 +496,7 @@ select 1 where false;
+ \pset expanded off
+ 
+ CREATE SCHEMA tableam_display;
+-CREATE ROLE regress_display_role;
++CREATE ROLE regress_display_role PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ ALTER SCHEMA tableam_display OWNER TO regress_display_role;
+ SET search_path TO tableam_display;
+ CREATE ACCESS METHOD heap_psql TYPE TABLE HANDLER heap_tableam_handler;
+@@ -1174,7 +1174,7 @@ select 1/(15-unique2) from tenk1 order by unique2 limit 19;
+ \unset FETCH_COUNT
+ 
+ create schema testpart;
+-create role regress_partitioning_role;
++create role regress_partitioning_role PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ 
+ alter schema testpart owner to regress_partitioning_role;
+ 
+@@ -1285,7 +1285,7 @@ reset work_mem;
+ 
+ -- check \df+
+ -- we have to use functions with a predictable owner name, so make a role
+-create role regress_psql_user superuser;
++create role regress_psql_user superuser PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ begin;
+ set session authorization regress_psql_user;
+ 
+@@ -1431,11 +1431,14 @@ CREATE TEMPORARY TABLE reload_output(
+ );
+ 
+ SELECT 1 AS a \g :g_out_file
+-COPY reload_output(line) FROM :'g_out_file';
++\set command '\\COPY reload_output(line) FROM ' :'g_out_file';
++:command
+ SELECT 2 AS b\; SELECT 3 AS c\; SELECT 4 AS d \g :g_out_file
+-COPY reload_output(line) FROM :'g_out_file';
++\set command '\\COPY reload_output(line) FROM ' :'g_out_file';
++:command
+ COPY (SELECT 'foo') TO STDOUT \; COPY (SELECT 'bar') TO STDOUT \g :g_out_file
+-COPY reload_output(line) FROM :'g_out_file';
++\set command '\\COPY reload_output(line) FROM ' :'g_out_file';
++:command
+ 
+ SELECT line FROM reload_output ORDER BY lineno;
+ TRUNCATE TABLE reload_output;
+@@ -1452,17 +1455,20 @@ SELECT 1 AS a\; SELECT 2 AS b\; SELECT 3 AS c;
+ -- COPY TO file
+ -- The data goes to :g_out_file and the status to :o_out_file
+ \set QUIET false
+-COPY (SELECT unique1 FROM onek ORDER BY unique1 LIMIT 10) TO :'g_out_file';
++\set command '\\COPY (SELECT unique1 FROM onek ORDER BY unique1 LIMIT 10) TO ' :'g_out_file';
++:command
+ -- DML command status
+ UPDATE onek SET unique1 = unique1 WHERE false;
+ \set QUIET true
+ \o
+ 
+ -- Check the contents of the files generated.
+-COPY reload_output(line) FROM :'g_out_file';
++\set command '\\COPY reload_output(line) FROM ' :'g_out_file';
++:command
+ SELECT line FROM reload_output ORDER BY lineno;
+ TRUNCATE TABLE reload_output;
+-COPY reload_output(line) FROM :'o_out_file';
++\set command '\\COPY reload_output(line) FROM ' :'o_out_file';
++:command
+ SELECT line FROM reload_output ORDER BY lineno;
+ TRUNCATE TABLE reload_output;
+ 
+@@ -1475,10 +1481,12 @@ COPY (SELECT 'foo2') TO STDOUT \; COPY (SELECT 'bar2') TO STDOUT \g :g_out_file
+ \o
+ 
+ -- Check the contents of the files generated.
+-COPY reload_output(line) FROM :'g_out_file';
++\set command '\\COPY reload_output(line) FROM ' :'g_out_file';
++:command
+ SELECT line FROM reload_output ORDER BY lineno;
+ TRUNCATE TABLE reload_output;
+-COPY reload_output(line) FROM :'o_out_file';
++\set command '\\COPY reload_output(line) FROM ' :'o_out_file';
++:command
+ SELECT line FROM reload_output ORDER BY lineno;
+ 
+ DROP TABLE reload_output;
+@@ -1825,10 +1833,10 @@ DROP FUNCTION psql_error;
+ \dX "no.such.database"."no.such.schema"."no.such.extended.statistics"
+ 
+ -- check \drg and \du
+-CREATE ROLE regress_du_role0;
+-CREATE ROLE regress_du_role1;
+-CREATE ROLE regress_du_role2;
+-CREATE ROLE regress_du_admin;
++CREATE ROLE regress_du_role0 PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE ROLE regress_du_role1 PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE ROLE regress_du_role2 PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE ROLE regress_du_admin PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ 
+ GRANT regress_du_role0 TO regress_du_admin WITH ADMIN TRUE;
+ GRANT regress_du_role1 TO regress_du_admin WITH ADMIN TRUE;
+diff --git a/src/test/regress/sql/publication.sql b/src/test/regress/sql/publication.sql
+index d5051a5e74..b32d729271 100644
+--- a/src/test/regress/sql/publication.sql
++++ b/src/test/regress/sql/publication.sql
+@@ -1,9 +1,9 @@
+ --
+ -- PUBLICATION
+ --
+-CREATE ROLE regress_publication_user LOGIN SUPERUSER;
+-CREATE ROLE regress_publication_user2;
+-CREATE ROLE regress_publication_user_dummy LOGIN NOSUPERUSER;
++CREATE ROLE regress_publication_user LOGIN SUPERUSER PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE ROLE regress_publication_user2 PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE ROLE regress_publication_user_dummy LOGIN NOSUPERUSER PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ SET SESSION AUTHORIZATION 'regress_publication_user';
+ 
+ -- suppress warning that depends on wal_level
+@@ -801,7 +801,7 @@ DROP PUBLICATION testpub2;
+ DROP PUBLICATION testpub3;
+ 
+ SET ROLE regress_publication_user;
+-CREATE ROLE regress_publication_user3;
++CREATE ROLE regress_publication_user3 PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ GRANT regress_publication_user2 TO regress_publication_user3;
+ SET client_min_messages = 'ERROR';
+ CREATE PUBLICATION testpub4 FOR TABLES IN SCHEMA pub_test;
+diff --git a/src/test/regress/sql/regproc.sql b/src/test/regress/sql/regproc.sql
+index de2aa881a8..41a675fd35 100644
+--- a/src/test/regress/sql/regproc.sql
++++ b/src/test/regress/sql/regproc.sql
+@@ -4,7 +4,7 @@
+ 
+ /* If objects exist, return oids */
+ 
+-CREATE ROLE regress_regrole_test;
++CREATE ROLE regress_regrole_test PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ 
+ -- without schemaname
+ 
+diff --git a/src/test/regress/sql/roleattributes.sql b/src/test/regress/sql/roleattributes.sql
+index c961b2d730..0859b89c4f 100644
+--- a/src/test/regress/sql/roleattributes.sql
++++ b/src/test/regress/sql/roleattributes.sql
+@@ -1,83 +1,83 @@
+ -- default for superuser is false
+-CREATE ROLE regress_test_def_superuser;
++CREATE ROLE regress_test_def_superuser PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ 
+-SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_def_superuser';
+-CREATE ROLE regress_test_superuser WITH SUPERUSER;
+-SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_superuser';
++SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:<salt>$<storedkey>:<serverkey>'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_def_superuser';
++CREATE ROLE regress_test_superuser WITH SUPERUSER PASSWORD NEON_PASSWORD_PLACEHOLDER;
++SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:<salt>$<storedkey>:<serverkey>'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_superuser';
+ ALTER ROLE regress_test_superuser WITH NOSUPERUSER;
+-SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_superuser';
++SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:<salt>$<storedkey>:<serverkey>'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_superuser';
+ ALTER ROLE regress_test_superuser WITH SUPERUSER;
+-SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_superuser';
++SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:<salt>$<storedkey>:<serverkey>'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_superuser';
+ 
+ -- default for inherit is true
+-CREATE ROLE regress_test_def_inherit;
+-SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_def_inherit';
+-CREATE ROLE regress_test_inherit WITH NOINHERIT;
+-SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_inherit';
++CREATE ROLE regress_test_def_inherit PASSWORD NEON_PASSWORD_PLACEHOLDER;
++SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:<salt>$<storedkey>:<serverkey>'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_def_inherit';
++CREATE ROLE regress_test_inherit WITH NOINHERIT PASSWORD NEON_PASSWORD_PLACEHOLDER;
++SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:<salt>$<storedkey>:<serverkey>'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_inherit';
+ ALTER ROLE regress_test_inherit WITH INHERIT;
+-SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_inherit';
++SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:<salt>$<storedkey>:<serverkey>'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_inherit';
+ ALTER ROLE regress_test_inherit WITH NOINHERIT;
+-SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_inherit';
++SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:<salt>$<storedkey>:<serverkey>'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_inherit';
+ 
+ -- default for create role is false
+-CREATE ROLE regress_test_def_createrole;
+-SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_def_createrole';
+-CREATE ROLE regress_test_createrole WITH CREATEROLE;
+-SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_createrole';
++CREATE ROLE regress_test_def_createrole PASSWORD NEON_PASSWORD_PLACEHOLDER;
++SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:<salt>$<storedkey>:<serverkey>'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_def_createrole';
++CREATE ROLE regress_test_createrole WITH CREATEROLE PASSWORD NEON_PASSWORD_PLACEHOLDER;
++SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:<salt>$<storedkey>:<serverkey>'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_createrole';
+ ALTER ROLE regress_test_createrole WITH NOCREATEROLE;
+-SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_createrole';
++SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:<salt>$<storedkey>:<serverkey>'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_createrole';
+ ALTER ROLE regress_test_createrole WITH CREATEROLE;
+-SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_createrole';
++SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:<salt>$<storedkey>:<serverkey>'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_createrole';
+ 
+ -- default for create database is false
+-CREATE ROLE regress_test_def_createdb;
+-SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_def_createdb';
+-CREATE ROLE regress_test_createdb WITH CREATEDB;
+-SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_createdb';
++CREATE ROLE regress_test_def_createdb PASSWORD NEON_PASSWORD_PLACEHOLDER;
++SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:<salt>$<storedkey>:<serverkey>'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_def_createdb';
++CREATE ROLE regress_test_createdb WITH CREATEDB PASSWORD NEON_PASSWORD_PLACEHOLDER;
++SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:<salt>$<storedkey>:<serverkey>'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_createdb';
+ ALTER ROLE regress_test_createdb WITH NOCREATEDB;
+-SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_createdb';
++SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:<salt>$<storedkey>:<serverkey>'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_createdb';
+ ALTER ROLE regress_test_createdb WITH CREATEDB;
+-SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_createdb';
++SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:<salt>$<storedkey>:<serverkey>'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_createdb';
+ 
+ -- default for can login is false for role
+-CREATE ROLE regress_test_def_role_canlogin;
+-SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_def_role_canlogin';
+-CREATE ROLE regress_test_role_canlogin WITH LOGIN;
+-SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_role_canlogin';
++CREATE ROLE regress_test_def_role_canlogin PASSWORD NEON_PASSWORD_PLACEHOLDER;
++SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:<salt>$<storedkey>:<serverkey>'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_def_role_canlogin';
++CREATE ROLE regress_test_role_canlogin WITH LOGIN PASSWORD NEON_PASSWORD_PLACEHOLDER;
++SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:<salt>$<storedkey>:<serverkey>'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_role_canlogin';
+ ALTER ROLE regress_test_role_canlogin WITH NOLOGIN;
+-SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_role_canlogin';
++SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:<salt>$<storedkey>:<serverkey>'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_role_canlogin';
+ ALTER ROLE regress_test_role_canlogin WITH LOGIN;
+-SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_role_canlogin';
++SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:<salt>$<storedkey>:<serverkey>'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_role_canlogin';
+ 
+ -- default for can login is true for user
+-CREATE USER regress_test_def_user_canlogin;
+-SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_def_user_canlogin';
+-CREATE USER regress_test_user_canlogin WITH NOLOGIN;
+-SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_user_canlogin';
++CREATE USER regress_test_def_user_canlogin PASSWORD NEON_PASSWORD_PLACEHOLDER;
++SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:<salt>$<storedkey>:<serverkey>'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_def_user_canlogin';
++CREATE USER regress_test_user_canlogin WITH NOLOGIN PASSWORD NEON_PASSWORD_PLACEHOLDER;
++SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:<salt>$<storedkey>:<serverkey>'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_user_canlogin';
+ ALTER USER regress_test_user_canlogin WITH LOGIN;
+-SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_user_canlogin';
++SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:<salt>$<storedkey>:<serverkey>'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_user_canlogin';
+ ALTER USER regress_test_user_canlogin WITH NOLOGIN;
+-SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_user_canlogin';
++SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:<salt>$<storedkey>:<serverkey>'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_user_canlogin';
+ 
+ -- default for replication is false
+-CREATE ROLE regress_test_def_replication;
+-SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_def_replication';
+-CREATE ROLE regress_test_replication WITH REPLICATION;
+-SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_replication';
++CREATE ROLE regress_test_def_replication PASSWORD NEON_PASSWORD_PLACEHOLDER;
++SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:<salt>$<storedkey>:<serverkey>'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_def_replication';
++CREATE ROLE regress_test_replication WITH REPLICATION PASSWORD NEON_PASSWORD_PLACEHOLDER;
++SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:<salt>$<storedkey>:<serverkey>'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_replication';
+ ALTER ROLE regress_test_replication WITH NOREPLICATION;
+-SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_replication';
++SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:<salt>$<storedkey>:<serverkey>'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_replication';
+ ALTER ROLE regress_test_replication WITH REPLICATION;
+-SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_replication';
++SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:<salt>$<storedkey>:<serverkey>'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_replication';
+ 
+ -- default for bypassrls is false
+-CREATE ROLE regress_test_def_bypassrls;
+-SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_def_bypassrls';
+-CREATE ROLE regress_test_bypassrls WITH BYPASSRLS;
+-SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_bypassrls';
++CREATE ROLE regress_test_def_bypassrls PASSWORD NEON_PASSWORD_PLACEHOLDER;
++SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:<salt>$<storedkey>:<serverkey>'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_def_bypassrls';
++CREATE ROLE regress_test_bypassrls WITH BYPASSRLS PASSWORD NEON_PASSWORD_PLACEHOLDER;
++SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:<salt>$<storedkey>:<serverkey>'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_bypassrls';
+ ALTER ROLE regress_test_bypassrls WITH NOBYPASSRLS;
+-SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_bypassrls';
++SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:<salt>$<storedkey>:<serverkey>'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_bypassrls';
+ ALTER ROLE regress_test_bypassrls WITH BYPASSRLS;
+-SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_bypassrls';
++SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:<salt>$<storedkey>:<serverkey>'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_bypassrls';
+ 
+ -- clean up roles
+ DROP ROLE regress_test_def_superuser;
+diff --git a/src/test/regress/sql/rowsecurity.sql b/src/test/regress/sql/rowsecurity.sql
+index dec7340538..cdbc03a5cc 100644
+--- a/src/test/regress/sql/rowsecurity.sql
++++ b/src/test/regress/sql/rowsecurity.sql
+@@ -20,13 +20,13 @@ DROP SCHEMA IF EXISTS regress_rls_schema CASCADE;
+ RESET client_min_messages;
+ 
+ -- initial setup
+-CREATE USER regress_rls_alice NOLOGIN;
+-CREATE USER regress_rls_bob NOLOGIN;
+-CREATE USER regress_rls_carol NOLOGIN;
+-CREATE USER regress_rls_dave NOLOGIN;
+-CREATE USER regress_rls_exempt_user BYPASSRLS NOLOGIN;
+-CREATE ROLE regress_rls_group1 NOLOGIN;
+-CREATE ROLE regress_rls_group2 NOLOGIN;
++CREATE USER regress_rls_alice NOLOGIN PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE USER regress_rls_bob NOLOGIN PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE USER regress_rls_carol NOLOGIN PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE USER regress_rls_dave NOLOGIN PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE USER regress_rls_exempt_user BYPASSRLS NOLOGIN PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE ROLE regress_rls_group1 NOLOGIN PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE ROLE regress_rls_group2 NOLOGIN PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ 
+ GRANT regress_rls_group1 TO regress_rls_bob;
+ GRANT regress_rls_group2 TO regress_rls_carol;
+@@ -2065,8 +2065,8 @@ SELECT count(*) = 0 FROM pg_depend
+ -- DROP OWNED BY testing
+ RESET SESSION AUTHORIZATION;
+ 
+-CREATE ROLE regress_rls_dob_role1;
+-CREATE ROLE regress_rls_dob_role2;
++CREATE ROLE regress_rls_dob_role1 PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE ROLE regress_rls_dob_role2 PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ 
+ CREATE TABLE dob_t1 (c1 int);
+ CREATE TABLE dob_t2 (c1 int) PARTITION BY RANGE (c1);
+diff --git a/src/test/regress/sql/rules.sql b/src/test/regress/sql/rules.sql
+index 8b7e255dcd..c58d095c05 100644
+--- a/src/test/regress/sql/rules.sql
++++ b/src/test/regress/sql/rules.sql
+@@ -1356,7 +1356,7 @@ DROP TABLE ruletest2;
+ -- Test non-SELECT rule on security invoker view.
+ -- Should use view owner's permissions.
+ --
+-CREATE USER regress_rule_user1;
++CREATE USER regress_rule_user1 PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ 
+ CREATE TABLE ruletest_t1 (x int);
+ CREATE TABLE ruletest_t2 (x int);
+diff --git a/src/test/regress/sql/security_label.sql b/src/test/regress/sql/security_label.sql
+index 98e6a5f211..68c868fef2 100644
+--- a/src/test/regress/sql/security_label.sql
++++ b/src/test/regress/sql/security_label.sql
+@@ -10,8 +10,8 @@ DROP ROLE IF EXISTS regress_seclabel_user2;
+ 
+ RESET client_min_messages;
+ 
+-CREATE USER regress_seclabel_user1 WITH CREATEROLE;
+-CREATE USER regress_seclabel_user2;
++CREATE USER regress_seclabel_user1 WITH CREATEROLE PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE USER regress_seclabel_user2 PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ 
+ CREATE TABLE seclabel_tbl1 (a int, b text);
+ CREATE TABLE seclabel_tbl2 (x int, y text);
+diff --git a/src/test/regress/sql/select_into.sql b/src/test/regress/sql/select_into.sql
+index 689c448cc2..223ceb1d75 100644
+--- a/src/test/regress/sql/select_into.sql
++++ b/src/test/regress/sql/select_into.sql
+@@ -20,7 +20,7 @@ DROP TABLE sitmp1;
+ -- SELECT INTO and INSERT permission, if owner is not allowed to insert.
+ --
+ CREATE SCHEMA selinto_schema;
+-CREATE USER regress_selinto_user;
++CREATE USER regress_selinto_user PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ ALTER DEFAULT PRIVILEGES FOR ROLE regress_selinto_user
+ 	  REVOKE INSERT ON TABLES FROM regress_selinto_user;
+ GRANT ALL ON SCHEMA selinto_schema TO public;
+diff --git a/src/test/regress/sql/select_views.sql b/src/test/regress/sql/select_views.sql
+index e742f13699..7bd0255df8 100644
+--- a/src/test/regress/sql/select_views.sql
++++ b/src/test/regress/sql/select_views.sql
+@@ -12,7 +12,7 @@ SELECT * FROM toyemp WHERE name = 'sharon';
+ --
+ -- Test for Leaky view scenario
+ --
+-CREATE ROLE regress_alice;
++CREATE ROLE regress_alice PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ 
+ CREATE FUNCTION f_leak (text)
+        RETURNS bool LANGUAGE 'plpgsql' COST 0.0000001
+diff --git a/src/test/regress/sql/sequence.sql b/src/test/regress/sql/sequence.sql
+index 793f1415f6..ec07c1f193 100644
+--- a/src/test/regress/sql/sequence.sql
++++ b/src/test/regress/sql/sequence.sql
+@@ -293,7 +293,7 @@ ROLLBACK;
+ 
+ -- privileges tests
+ 
+-CREATE USER regress_seq_user;
++CREATE USER regress_seq_user PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ 
+ -- nextval
+ BEGIN;
+diff --git a/src/test/regress/sql/stats.sql b/src/test/regress/sql/stats.sql
+index 1e21e55c6d..2251f50c5e 100644
+--- a/src/test/regress/sql/stats.sql
++++ b/src/test/regress/sql/stats.sql
+@@ -622,23 +622,6 @@ SELECT :io_sum_shared_after_writes > :io_sum_shared_before_writes;
+ SELECT current_setting('fsync') = 'off'
+   OR :io_sum_shared_after_fsyncs > :io_sum_shared_before_fsyncs;
+ 
+--- Change the tablespace so that the table is rewritten directly, then SELECT
+--- from it to cause it to be read back into shared buffers.
+-SELECT sum(reads) AS io_sum_shared_before_reads
+-  FROM pg_stat_io WHERE context = 'normal' AND object = 'relation' \gset
+--- Do this in a transaction to prevent spurious failures due to concurrent accesses to our newly
+--- rewritten table, e.g. by autovacuum.
+-BEGIN;
+-ALTER TABLE test_io_shared SET TABLESPACE regress_tblspace;
+--- SELECT from the table so that the data is read into shared buffers and
+--- context 'normal', object 'relation' reads are counted.
+-SELECT COUNT(*) FROM test_io_shared;
+-COMMIT;
+-SELECT pg_stat_force_next_flush();
+-SELECT sum(reads) AS io_sum_shared_after_reads
+-  FROM pg_stat_io WHERE context = 'normal' AND object = 'relation'  \gset
+-SELECT :io_sum_shared_after_reads > :io_sum_shared_before_reads;
+-
+ SELECT sum(hits) AS io_sum_shared_before_hits
+   FROM pg_stat_io WHERE context = 'normal' AND object = 'relation' \gset
+ -- Select from the table again to count hits.
+diff --git a/src/test/regress/sql/stats_ext.sql b/src/test/regress/sql/stats_ext.sql
+index 1b80d3687b..4d8798b0b1 100644
+--- a/src/test/regress/sql/stats_ext.sql
++++ b/src/test/regress/sql/stats_ext.sql
+@@ -50,7 +50,7 @@ DROP TABLE ext_stats_test;
+ CREATE TABLE ab1 (a INTEGER, b INTEGER, c INTEGER);
+ CREATE STATISTICS IF NOT EXISTS ab1_a_b_stats ON a, b FROM ab1;
+ COMMENT ON STATISTICS ab1_a_b_stats IS 'new comment';
+-CREATE ROLE regress_stats_ext;
++CREATE ROLE regress_stats_ext PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ SET SESSION AUTHORIZATION regress_stats_ext;
+ COMMENT ON STATISTICS ab1_a_b_stats IS 'changed comment';
+ DROP STATISTICS ab1_a_b_stats;
+@@ -1607,7 +1607,7 @@ drop statistics stts_t1_expr_expr_stat;
+ set search_path to public, stts_s1;
+ \dX
+ 
+-create role regress_stats_ext nosuperuser;
++create role regress_stats_ext nosuperuser PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ set role regress_stats_ext;
+ \dX
+ reset role;
+@@ -1618,7 +1618,7 @@ drop user regress_stats_ext;
+ reset search_path;
+ 
+ -- User with no access
+-CREATE USER regress_stats_user1;
++CREATE USER regress_stats_user1 PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ GRANT USAGE ON SCHEMA tststats TO regress_stats_user1;
+ SET SESSION AUTHORIZATION regress_stats_user1;
+ SELECT * FROM tststats.priv_test_tbl; -- Permission denied
+diff --git a/src/test/regress/sql/subscription.sql b/src/test/regress/sql/subscription.sql
+index 444e563ff3..1a538a98a0 100644
+--- a/src/test/regress/sql/subscription.sql
++++ b/src/test/regress/sql/subscription.sql
+@@ -2,10 +2,10 @@
+ -- SUBSCRIPTION
+ --
+ 
+-CREATE ROLE regress_subscription_user LOGIN SUPERUSER;
+-CREATE ROLE regress_subscription_user2;
+-CREATE ROLE regress_subscription_user3 IN ROLE pg_create_subscription;
+-CREATE ROLE regress_subscription_user_dummy LOGIN NOSUPERUSER;
++CREATE ROLE regress_subscription_user LOGIN SUPERUSER PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE ROLE regress_subscription_user2 PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE ROLE regress_subscription_user3 PASSWORD NEON_PASSWORD_PLACEHOLDER IN ROLE pg_create_subscription;
++CREATE ROLE regress_subscription_user_dummy LOGIN NOSUPERUSER PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ SET SESSION AUTHORIZATION 'regress_subscription_user';
+ 
+ -- fail - no publications
+diff --git a/src/test/regress/sql/test_setup.sql b/src/test/regress/sql/test_setup.sql
+index 1b2d434683..b765c748b8 100644
+--- a/src/test/regress/sql/test_setup.sql
++++ b/src/test/regress/sql/test_setup.sql
+@@ -135,7 +135,8 @@ CREATE TABLE onek (
+ );
+ 
+ \set filename :abs_srcdir '/data/onek.data'
+-COPY onek FROM :'filename';
++\set command '\\copy onek FROM ' :'filename';
++:command
+ VACUUM ANALYZE onek;
+ 
+ CREATE TABLE onek2 AS SELECT * FROM onek;
+@@ -161,7 +162,8 @@ CREATE TABLE tenk1 (
+ );
+ 
+ \set filename :abs_srcdir '/data/tenk.data'
+-COPY tenk1 FROM :'filename';
++\set command '\\copy tenk1 FROM ' :'filename';
++:command
+ VACUUM ANALYZE tenk1;
+ 
+ CREATE TABLE tenk2 AS SELECT * FROM tenk1;
+@@ -174,7 +176,8 @@ CREATE TABLE person (
+ );
+ 
+ \set filename :abs_srcdir '/data/person.data'
+-COPY person FROM :'filename';
++\set command '\\copy person FROM ' :'filename';
++:command
+ VACUUM ANALYZE person;
+ 
+ CREATE TABLE emp (
+@@ -183,7 +186,8 @@ CREATE TABLE emp (
+ ) INHERITS (person);
+ 
+ \set filename :abs_srcdir '/data/emp.data'
+-COPY emp FROM :'filename';
++\set command '\\copy emp FROM ' :'filename';
++:command
+ VACUUM ANALYZE emp;
+ 
+ CREATE TABLE student (
+@@ -191,7 +195,8 @@ CREATE TABLE student (
+ ) INHERITS (person);
+ 
+ \set filename :abs_srcdir '/data/student.data'
+-COPY student FROM :'filename';
++\set command '\\copy student FROM ' :'filename';
++:command
+ VACUUM ANALYZE student;
+ 
+ CREATE TABLE stud_emp (
+@@ -199,7 +204,8 @@ CREATE TABLE stud_emp (
+ ) INHERITS (emp, student);
+ 
+ \set filename :abs_srcdir '/data/stud_emp.data'
+-COPY stud_emp FROM :'filename';
++\set command '\\copy stud_emp FROM ' :'filename';
++:command
+ VACUUM ANALYZE stud_emp;
+ 
+ CREATE TABLE road (
+@@ -208,7 +214,8 @@ CREATE TABLE road (
+ );
+ 
+ \set filename :abs_srcdir '/data/streets.data'
+-COPY road FROM :'filename';
++\set command '\\copy road FROM ' :'filename';
++:command
+ VACUUM ANALYZE road;
+ 
+ CREATE TABLE ihighway () INHERITS (road);
+diff --git a/src/test/regress/sql/tsearch.sql b/src/test/regress/sql/tsearch.sql
+index fbd26cdba4..7ec2d78eee 100644
+--- a/src/test/regress/sql/tsearch.sql
++++ b/src/test/regress/sql/tsearch.sql
+@@ -49,7 +49,8 @@ CREATE TABLE test_tsvector(
+ );
+ 
+ \set filename :abs_srcdir '/data/tsearch.data'
+-COPY test_tsvector FROM :'filename';
++\set command '\\copy test_tsvector FROM ' :'filename';
++:command
+ 
+ ANALYZE test_tsvector;
+ 
+diff --git a/src/test/regress/sql/updatable_views.sql b/src/test/regress/sql/updatable_views.sql
+index 0a3176e25d..7744ef68f5 100644
+--- a/src/test/regress/sql/updatable_views.sql
++++ b/src/test/regress/sql/updatable_views.sql
+@@ -425,9 +425,9 @@ DROP TABLE base_tbl CASCADE;
+ 
+ -- permissions checks
+ 
+-CREATE USER regress_view_user1;
+-CREATE USER regress_view_user2;
+-CREATE USER regress_view_user3;
++CREATE USER regress_view_user1 PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE USER regress_view_user2 PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE USER regress_view_user3 PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ 
+ SET SESSION AUTHORIZATION regress_view_user1;
+ CREATE TABLE base_tbl(a int, b text, c float);
+@@ -1586,8 +1586,8 @@ drop view uv_iocu_view;
+ drop table uv_iocu_tab;
+ 
+ -- ON CONFLICT DO UPDATE permissions checks
+-create user regress_view_user1;
+-create user regress_view_user2;
++create user regress_view_user1 PASSWORD NEON_PASSWORD_PLACEHOLDER;
++create user regress_view_user2 PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ 
+ set session authorization regress_view_user1;
+ create table base_tbl(a int unique, b text, c float);
+diff --git a/src/test/regress/sql/update.sql b/src/test/regress/sql/update.sql
+index 7a7bee77b9..07b480cd59 100644
+--- a/src/test/regress/sql/update.sql
++++ b/src/test/regress/sql/update.sql
+@@ -339,7 +339,7 @@ DROP FUNCTION func_parted_mod_b();
+ -----------------------------------------
+ 
+ ALTER TABLE range_parted ENABLE ROW LEVEL SECURITY;
+-CREATE USER regress_range_parted_user;
++CREATE USER regress_range_parted_user PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ GRANT ALL ON range_parted, mintab TO regress_range_parted_user;
+ CREATE POLICY seeall ON range_parted AS PERMISSIVE FOR SELECT USING (true);
+ CREATE POLICY policy_range_parted ON range_parted for UPDATE USING (true) WITH CHECK (c % 2 = 0);
+diff --git a/src/test/regress/sql/vacuum.sql b/src/test/regress/sql/vacuum.sql
+index ae36b54641..5612b8e162 100644
+--- a/src/test/regress/sql/vacuum.sql
++++ b/src/test/regress/sql/vacuum.sql
+@@ -335,7 +335,7 @@ CREATE TABLE vacowned (a int);
+ CREATE TABLE vacowned_parted (a int) PARTITION BY LIST (a);
+ CREATE TABLE vacowned_part1 PARTITION OF vacowned_parted FOR VALUES IN (1);
+ CREATE TABLE vacowned_part2 PARTITION OF vacowned_parted FOR VALUES IN (2);
+-CREATE ROLE regress_vacuum;
++CREATE ROLE regress_vacuum PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ SET ROLE regress_vacuum;
+ -- Simple table
+ VACUUM vacowned;
diff --git a/test_runner/cloud_regress/test_cloud_regress.py b/test_runner/cloud_regress/test_cloud_regress.py
new file mode 100644
index 0000000000..de71357232
--- /dev/null
+++ b/test_runner/cloud_regress/test_cloud_regress.py
@@ -0,0 +1,100 @@
+"""
+Run the regression tests on the cloud instance of Neon
+"""
+
+from pathlib import Path
+from typing import Any
+
+import psycopg2
+import pytest
+from fixtures.log_helper import log
+from fixtures.neon_fixtures import RemotePostgres
+from fixtures.pg_version import PgVersion
+
+
+@pytest.fixture
+def setup(remote_pg: RemotePostgres):
+    """
+    Setup and teardown of the tests
+    """
+    with psycopg2.connect(remote_pg.connstr()) as conn:
+        with conn.cursor() as cur:
+            log.info("Creating the extension")
+            cur.execute("CREATE EXTENSION IF NOT EXISTS regress_so")
+            conn.commit()
+            # TODO: Migrate to branches and remove this code
+            log.info("Looking for subscriptions in the regress database")
+            cur.execute(
+                "SELECT subname FROM pg_catalog.pg_subscription WHERE "
+                "subdbid = (SELECT oid FROM pg_catalog.pg_database WHERE datname='regression');"
+            )
+            if cur.rowcount > 0:
+                with psycopg2.connect(
+                    dbname="regression",
+                    host=remote_pg.default_options["host"],
+                    user=remote_pg.default_options["user"],
+                    password=remote_pg.default_options["password"],
+                ) as regress_conn:
+                    with regress_conn.cursor() as regress_cur:
+                        for sub in cur:
+                            regress_cur.execute(f"ALTER SUBSCRIPTION {sub[0]} DISABLE")
+                            regress_cur.execute(
+                                f"ALTER SUBSCRIPTION {sub[0]} SET (slot_name = NONE)"
+                            )
+                            regress_cur.execute(f"DROP SUBSCRIPTION {sub[0]}")
+                        regress_conn.commit()
+
+    yield
+    # TODO: Migrate to branches and remove this code
+    log.info("Looking for extra roles...")
+    with psycopg2.connect(remote_pg.connstr()) as conn:
+        with conn.cursor() as cur:
+            cur.execute(
+                "SELECT rolname FROM pg_catalog.pg_roles WHERE oid > 16384 AND rolname <> 'neondb_owner'"
+            )
+            roles: list[Any] = []
+            for role in cur:
+                log.info("Role found: %s", role[0])
+                roles.append(role[0])
+            for role in roles:
+                cur.execute(f"DROP ROLE {role}")
+            conn.commit()
+
+
+@pytest.mark.timeout(7200)
+@pytest.mark.remote_cluster
+def test_cloud_regress(
+    setup,
+    remote_pg: RemotePostgres,
+    pg_version: PgVersion,
+    pg_distrib_dir: Path,
+    base_dir: Path,
+    test_output_dir: Path,
+):
+    """
+    Run the regression tests
+    """
+    regress_bin = (
+        pg_distrib_dir / f"{pg_version.v_prefixed}/lib/postgresql/pgxs/src/test/regress/pg_regress"
+    )
+    test_path = base_dir / f"vendor/postgres-{pg_version.v_prefixed}/src/test/regress"
+
+    env_vars = {
+        "PGHOST": remote_pg.default_options["host"],
+        "PGPORT": str(
+            remote_pg.default_options["port"] if "port" in remote_pg.default_options else 5432
+        ),
+        "PGUSER": remote_pg.default_options["user"],
+        "PGPASSWORD": remote_pg.default_options["password"],
+        "PGDATABASE": remote_pg.default_options["dbname"],
+    }
+    regress_cmd = [
+        str(regress_bin),
+        f"--inputdir={test_path}",
+        f"--bindir={pg_distrib_dir}/{pg_version.v_prefixed}/bin",
+        "--dlpath=/usr/local/lib",
+        "--max-concurrent-tests=20",
+        f"--schedule={test_path}/parallel_schedule",
+        "--max-connections=5",
+    ]
+    remote_pg.pg_bin.run(regress_cmd, env=env_vars, cwd=test_output_dir)
diff --git a/test_runner/fixtures/utils.py b/test_runner/fixtures/utils.py
index 80f1c9e4e3..10e8412b19 100644
--- a/test_runner/fixtures/utils.py
+++ b/test_runner/fixtures/utils.py
@@ -236,7 +236,7 @@ def get_scale_for_db(size_mb: int) -> int:
 
 
 ATTACHMENT_NAME_REGEX: re.Pattern = re.compile(  # type: ignore[type-arg]
-    r"regression\.diffs|.+\.(?:log|stderr|stdout|filediff|metrics|html|walredo)"
+    r"regression\.(diffs|out)|.+\.(?:log|stderr|stdout|filediff|metrics|html|walredo)"
 )
 
 
From 9490360df428aa7183034a396e8018607f3c4159 Mon Sep 17 00:00:00 2001
From: Vlad Lazar <vlad@neon.tech>
Date: Tue, 24 Sep 2024 10:03:41 +0100
Subject: [PATCH 128/142] storcon: improve initial shard scheduling (#9081)

## Problem

Scheduling on tenant creation uses different heuristics compared to the
scheduling done during
background optimizations. This results in scenarios where shards are
created and then immediately
migrated by the optimizer.

## Summary of changes

1. Make scheduler aware of the type of the shard it is scheduling
(attached vs secondary).
We wish to have different heuristics.
2. For attached shards, include the attached shard count from the
context in the node score
calculation. This brings initial shard scheduling in line with what the
optimization passes do.
3. Add a test for (2).

This looks like a bigger change than required, but the refactoring
serves as the basis for az-aware
shard scheduling where we also need to make the distinction between
attached and secondary shards.

Closes https://github.com/neondatabase/neon/issues/8969
---
 storage_controller/src/scheduler.rs    | 203 +++++++++++++++++++++----
 storage_controller/src/service.rs      |   8 +-
 storage_controller/src/tenant_shard.rs |  82 ++++++++--
 3 files changed, 247 insertions(+), 46 deletions(-)

diff --git a/storage_controller/src/scheduler.rs b/storage_controller/src/scheduler.rs
index deb5f27226..1cb1fb104d 100644
--- a/storage_controller/src/scheduler.rs
+++ b/storage_controller/src/scheduler.rs
@@ -2,7 +2,7 @@ use crate::{node::Node, tenant_shard::TenantShard};
 use itertools::Itertools;
 use pageserver_api::models::PageserverUtilization;
 use serde::Serialize;
-use std::collections::HashMap;
+use std::{collections::HashMap, fmt::Debug};
 use utils::{http::error::ApiError, id::NodeId};
 
 /// Scenarios in which we cannot find a suitable location for a tenant shard
@@ -27,7 +27,7 @@ pub enum MaySchedule {
 }
 
 #[derive(Serialize)]
-struct SchedulerNode {
+pub(crate) struct SchedulerNode {
     /// How many shards are currently scheduled on this node, via their [`crate::tenant_shard::IntentState`].
     shard_count: usize,
     /// How many shards are currently attached on this node, via their [`crate::tenant_shard::IntentState`].
@@ -38,6 +38,137 @@ struct SchedulerNode {
     may_schedule: MaySchedule,
 }
 
+pub(crate) trait NodeSchedulingScore: Debug + Ord + Copy + Sized {
+    fn generate(
+        node_id: &NodeId,
+        node: &mut SchedulerNode,
+        context: &ScheduleContext,
+    ) -> Option<Self>;
+    fn is_overloaded(&self) -> bool;
+    fn node_id(&self) -> NodeId;
+}
+
+pub(crate) trait ShardTag {
+    type Score: NodeSchedulingScore;
+}
+
+pub(crate) struct AttachedShardTag {}
+impl ShardTag for AttachedShardTag {
+    type Score = NodeAttachmentSchedulingScore;
+}
+
+pub(crate) struct SecondaryShardTag {}
+impl ShardTag for SecondaryShardTag {
+    type Score = NodeSecondarySchedulingScore;
+}
+
+/// Scheduling score of a given node for shard attachments.
+/// Lower scores indicate more suitable nodes.
+/// Ordering is given by member declaration order (top to bottom).
+#[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Clone, Copy)]
+pub(crate) struct NodeAttachmentSchedulingScore {
+    /// The number of shards belonging to the tenant currently being
+    /// scheduled that are attached to this node.
+    affinity_score: AffinityScore,
+    /// Size of [`ScheduleContext::attached_nodes`] for the current node.
+    /// This normally tracks the number of attached shards belonging to the
+    /// tenant being scheduled that are already on this node.
+    attached_shards_in_context: usize,
+    /// Utilisation score that combines shard count and disk utilisation
+    utilization_score: u64,
+    /// Total number of shards attached to this node. When nodes have identical utilisation, this
+    /// acts as an anti-affinity between attached shards.
+    total_attached_shard_count: usize,
+    /// Convenience to make selection deterministic in tests and empty systems
+    node_id: NodeId,
+}
+
+impl NodeSchedulingScore for NodeAttachmentSchedulingScore {
+    fn generate(
+        node_id: &NodeId,
+        node: &mut SchedulerNode,
+        context: &ScheduleContext,
+    ) -> Option<Self> {
+        let utilization = match &mut node.may_schedule {
+            MaySchedule::Yes(u) => u,
+            MaySchedule::No => {
+                return None;
+            }
+        };
+
+        Some(Self {
+            affinity_score: context
+                .nodes
+                .get(node_id)
+                .copied()
+                .unwrap_or(AffinityScore::FREE),
+            attached_shards_in_context: context.attached_nodes.get(node_id).copied().unwrap_or(0),
+            utilization_score: utilization.cached_score(),
+            total_attached_shard_count: node.attached_shard_count,
+            node_id: *node_id,
+        })
+    }
+
+    fn is_overloaded(&self) -> bool {
+        PageserverUtilization::is_overloaded(self.utilization_score)
+    }
+
+    fn node_id(&self) -> NodeId {
+        self.node_id
+    }
+}
+
+/// Scheduling score of a given node for shard secondaries.
+/// Lower scores indicate more suitable nodes.
+/// Ordering is given by member declaration order (top to bottom).
+#[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Clone, Copy)]
+pub(crate) struct NodeSecondarySchedulingScore {
+    /// The number of shards belonging to the tenant currently being
+    /// scheduled that are attached to this node.
+    affinity_score: AffinityScore,
+    /// Utilisation score that combines shard count and disk utilisation
+    utilization_score: u64,
+    /// Total number of shards attached to this node. When nodes have identical utilisation, this
+    /// acts as an anti-affinity between attached shards.
+    total_attached_shard_count: usize,
+    /// Convenience to make selection deterministic in tests and empty systems
+    node_id: NodeId,
+}
+
+impl NodeSchedulingScore for NodeSecondarySchedulingScore {
+    fn generate(
+        node_id: &NodeId,
+        node: &mut SchedulerNode,
+        context: &ScheduleContext,
+    ) -> Option<Self> {
+        let utilization = match &mut node.may_schedule {
+            MaySchedule::Yes(u) => u,
+            MaySchedule::No => {
+                return None;
+            }
+        };
+
+        Some(Self {
+            affinity_score: context
+                .nodes
+                .get(node_id)
+                .copied()
+                .unwrap_or(AffinityScore::FREE),
+            utilization_score: utilization.cached_score(),
+            total_attached_shard_count: node.attached_shard_count,
+            node_id: *node_id,
+        })
+    }
+
+    fn is_overloaded(&self) -> bool {
+        PageserverUtilization::is_overloaded(self.utilization_score)
+    }
+
+    fn node_id(&self) -> NodeId {
+        self.node_id
+    }
+}
+
 impl PartialEq for SchedulerNode {
     fn eq(&self, other: &Self) -> bool {
         let may_schedule_matches = matches!(
@@ -406,6 +537,28 @@ impl Scheduler {
         node.and_then(|(node_id, may_schedule)| if may_schedule { Some(node_id) } else { None })
     }
 
+    /// Compute a scheduling score for each node that the scheduler knows of
+    /// minus a set of hard excluded nodes.
+    fn compute_node_scores<Score>(
+        &mut self,
+        hard_exclude: &[NodeId],
+        context: &ScheduleContext,
+    ) -> Vec<Score>
+    where
+        Score: NodeSchedulingScore,
+    {
+        self.nodes
+            .iter_mut()
+            .filter_map(|(k, v)| {
+                if hard_exclude.contains(k) {
+                    None
+                } else {
+                    Score::generate(k, v, context)
+                }
+            })
+            .collect()
+    }
+
     /// hard_exclude: it is forbidden to use nodes in this list, typically becacuse they
     /// are already in use by this shard -- we use this to avoid picking the same node
     /// as both attached and secondary location.  This is a hard constraint: if we cannot
@@ -415,7 +568,7 @@ impl Scheduler {
     /// to their anti-affinity score.  We use this to prefeer to avoid placing shards in
     /// the same tenant on the same node.  This is a soft constraint: the context will never
     /// cause us to fail to schedule a shard.
-    pub(crate) fn schedule_shard(
+    pub(crate) fn schedule_shard<Tag: ShardTag>(
         &mut self,
         hard_exclude: &[NodeId],
         context: &ScheduleContext,
@@ -424,20 +577,7 @@ impl Scheduler {
             return Err(ScheduleError::NoPageservers);
         }
 
-        let mut scores: Vec<(NodeId, AffinityScore, u64, usize)> = self
-            .nodes
-            .iter_mut()
-            .filter_map(|(k, v)| match &mut v.may_schedule {
-                MaySchedule::No => None,
-                MaySchedule::Yes(_) if hard_exclude.contains(k) => None,
-                MaySchedule::Yes(utilization) => Some((
-                    *k,
-                    context.nodes.get(k).copied().unwrap_or(AffinityScore::FREE),
-                    utilization.cached_score(),
-                    v.attached_shard_count,
-                )),
-            })
-            .collect();
+        let mut scores = self.compute_node_scores::<Tag::Score>(hard_exclude, context);
 
         // Exclude nodes whose utilization is critically high, if there are alternatives available.  This will
         // cause us to violate affinity rules if it is necessary to avoid critically overloading nodes: for example
@@ -445,20 +585,18 @@ impl Scheduler {
         // overloaded.
         let non_overloaded_scores = scores
             .iter()
-            .filter(|i| !PageserverUtilization::is_overloaded(i.2))
+            .filter(|i| !i.is_overloaded())
             .copied()
             .collect::<Vec<_>>();
         if !non_overloaded_scores.is_empty() {
             scores = non_overloaded_scores;
         }
 
-        // Sort by, in order of precedence:
-        //  1st: Affinity score.  We should never pick a higher-score node if a lower-score node is available
-        //  2nd: Utilization score (this combines shard count and disk utilization)
-        //  3rd: Attached shard count.  When nodes have identical utilization (e.g. when populating some
-        //       empty nodes), this acts as an anti-affinity between attached shards.
-        //  4th: Node ID.  This is a convenience to make selection deterministic in tests and empty systems.
-        scores.sort_by_key(|i| (i.1, i.2, i.3, i.0));
+        // Sort the nodes by score. The one with the lowest score will be the preferred node.
+        // Refer to [`NodeAttachmentSchedulingScore`] for attached locations and
+        // [`NodeSecondarySchedulingScore`] for secondary locations to understand how the nodes
+        // are ranked.
+        scores.sort();
 
         if scores.is_empty() {
             // After applying constraints, no pageservers were left.
@@ -481,12 +619,12 @@ impl Scheduler {
         }
 
         // Lowest score wins
-        let node_id = scores.first().unwrap().0;
+        let node_id = scores.first().unwrap().node_id();
 
         if !matches!(context.mode, ScheduleMode::Speculative) {
             tracing::info!(
             "scheduler selected node {node_id} (elegible nodes {:?}, hard exclude: {hard_exclude:?}, soft exclude: {context:?})",
-            scores.iter().map(|i| i.0 .0).collect::<Vec<_>>()
+            scores.iter().map(|i| i.node_id().0).collect::<Vec<_>>()
         );
         }
 
@@ -556,9 +694,9 @@ mod tests {
 
         let context = ScheduleContext::default();
 
-        let scheduled = scheduler.schedule_shard(&[], &context)?;
+        let scheduled = scheduler.schedule_shard::<AttachedShardTag>(&[], &context)?;
         t1_intent.set_attached(&mut scheduler, Some(scheduled));
-        let scheduled = scheduler.schedule_shard(&[], &context)?;
+        let scheduled = scheduler.schedule_shard::<AttachedShardTag>(&[], &context)?;
         t2_intent.set_attached(&mut scheduler, Some(scheduled));
 
         assert_eq!(scheduler.get_node_shard_count(NodeId(1)), 1);
@@ -567,7 +705,8 @@ mod tests {
         assert_eq!(scheduler.get_node_shard_count(NodeId(2)), 1);
         assert_eq!(scheduler.get_node_attached_shard_count(NodeId(2)), 1);
 
-        let scheduled = scheduler.schedule_shard(&t1_intent.all_pageservers(), &context)?;
+        let scheduled =
+            scheduler.schedule_shard::<AttachedShardTag>(&t1_intent.all_pageservers(), &context)?;
         t1_intent.push_secondary(&mut scheduler, scheduled);
 
         assert_eq!(scheduler.get_node_shard_count(NodeId(1)), 1);
@@ -621,7 +760,9 @@ mod tests {
             scheduler: &mut Scheduler,
             context: &ScheduleContext,
         ) {
-            let scheduled = scheduler.schedule_shard(&[], context).unwrap();
+            let scheduled = scheduler
+                .schedule_shard::<AttachedShardTag>(&[], context)
+                .unwrap();
             let mut intent = IntentState::new();
             intent.set_attached(scheduler, Some(scheduled));
             scheduled_intents.push(intent);
diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs
index 957f633feb..5555505b81 100644
--- a/storage_controller/src/service.rs
+++ b/storage_controller/src/service.rs
@@ -26,7 +26,7 @@ use crate::{
         ShardGenerationState, TenantFilter,
     },
     reconciler::{ReconcileError, ReconcileUnits, ReconcilerConfig, ReconcilerConfigBuilder},
-    scheduler::{MaySchedule, ScheduleContext, ScheduleError, ScheduleMode},
+    scheduler::{AttachedShardTag, MaySchedule, ScheduleContext, ScheduleError, ScheduleMode},
     tenant_shard::{
         MigrateAttachment, ReconcileNeeded, ReconcilerStatus, ScheduleOptimization,
         ScheduleOptimizationAction,
@@ -2629,7 +2629,8 @@ impl Service {
             let scheduler = &mut locked.scheduler;
             // Right now we only perform the operation on a single node without parallelization
             // TODO fan out the operation to multiple nodes for better performance
-            let node_id = scheduler.schedule_shard(&[], &ScheduleContext::default())?;
+            let node_id =
+                scheduler.schedule_shard::<AttachedShardTag>(&[], &ScheduleContext::default())?;
             let node = locked
                 .nodes
                 .get(&node_id)
@@ -2815,7 +2816,8 @@ impl Service {
 
             // Pick an arbitrary node to use for remote deletions (does not have to be where the tenant
             // was attached, just has to be able to see the S3 content)
-            let node_id = scheduler.schedule_shard(&[], &ScheduleContext::default())?;
+            let node_id =
+                scheduler.schedule_shard::<AttachedShardTag>(&[], &ScheduleContext::default())?;
             let node = nodes
                 .get(&node_id)
                 .expect("Pageservers may not be deleted while lock is active");
diff --git a/storage_controller/src/tenant_shard.rs b/storage_controller/src/tenant_shard.rs
index cdb0633e2b..1f5eb423be 100644
--- a/storage_controller/src/tenant_shard.rs
+++ b/storage_controller/src/tenant_shard.rs
@@ -8,7 +8,10 @@ use crate::{
     metrics::{self, ReconcileCompleteLabelGroup, ReconcileOutcome},
     persistence::TenantShardPersistence,
     reconciler::{ReconcileUnits, ReconcilerConfig},
-    scheduler::{AffinityScore, MaySchedule, RefCountUpdate, ScheduleContext},
+    scheduler::{
+        AffinityScore, AttachedShardTag, MaySchedule, RefCountUpdate, ScheduleContext,
+        SecondaryShardTag,
+    },
     service::ReconcileResultRequest,
 };
 use pageserver_api::controller_api::{
@@ -335,19 +338,19 @@ pub(crate) enum ReconcileWaitError {
     Failed(TenantShardId, Arc<ReconcileError>),
 }
 
-#[derive(Eq, PartialEq, Debug)]
+#[derive(Eq, PartialEq, Debug, Clone)]
 pub(crate) struct ReplaceSecondary {
     old_node_id: NodeId,
     new_node_id: NodeId,
 }
 
-#[derive(Eq, PartialEq, Debug)]
+#[derive(Eq, PartialEq, Debug, Clone)]
 pub(crate) struct MigrateAttachment {
     pub(crate) old_attached_node_id: NodeId,
     pub(crate) new_attached_node_id: NodeId,
 }
 
-#[derive(Eq, PartialEq, Debug)]
+#[derive(Eq, PartialEq, Debug, Clone)]
 pub(crate) enum ScheduleOptimizationAction {
     // Replace one of our secondary locations with a different node
     ReplaceSecondary(ReplaceSecondary),
@@ -355,7 +358,7 @@ pub(crate) enum ScheduleOptimizationAction {
     MigrateAttachment(MigrateAttachment),
 }
 
-#[derive(Eq, PartialEq, Debug)]
+#[derive(Eq, PartialEq, Debug, Clone)]
 pub(crate) struct ScheduleOptimization {
     // What was the reconcile sequence when we generated this optimization?  The optimization
     // should only be applied if the shard's sequence is still at this value, in case other changes
@@ -537,7 +540,8 @@ impl TenantShard {
             Ok((true, promote_secondary))
         } else {
             // Pick a fresh node: either we had no secondaries or none were schedulable
-            let node_id = scheduler.schedule_shard(&self.intent.secondary, context)?;
+            let node_id =
+                scheduler.schedule_shard::<AttachedShardTag>(&self.intent.secondary, context)?;
             tracing::debug!("Selected {} as attached", node_id);
             self.intent.set_attached(scheduler, Some(node_id));
             Ok((true, node_id))
@@ -613,7 +617,8 @@ impl TenantShard {
 
                 let mut used_pageservers = vec![attached_node_id];
                 while self.intent.secondary.len() < secondary_count {
-                    let node_id = scheduler.schedule_shard(&used_pageservers, context)?;
+                    let node_id = scheduler
+                        .schedule_shard::<SecondaryShardTag>(&used_pageservers, context)?;
                     self.intent.push_secondary(scheduler, node_id);
                     used_pageservers.push(node_id);
                     modified = true;
@@ -626,7 +631,7 @@ impl TenantShard {
                     modified = true;
                 } else if self.intent.secondary.is_empty() {
                     // Populate secondary by scheduling a fresh node
-                    let node_id = scheduler.schedule_shard(&[], context)?;
+                    let node_id = scheduler.schedule_shard::<SecondaryShardTag>(&[], context)?;
                     self.intent.push_secondary(scheduler, node_id);
                     modified = true;
                 }
@@ -803,9 +808,10 @@ impl TenantShard {
             // Let the scheduler suggest a node, where it would put us if we were scheduling afresh
             // This implicitly limits the choice to nodes that are available, and prefers nodes
             // with lower utilization.
-            let Ok(candidate_node) =
-                scheduler.schedule_shard(&self.intent.all_pageservers(), schedule_context)
-            else {
+            let Ok(candidate_node) = scheduler.schedule_shard::<SecondaryShardTag>(
+                &self.intent.all_pageservers(),
+                schedule_context,
+            ) else {
                 // A scheduling error means we have no possible candidate replacements
                 continue;
             };
@@ -1333,6 +1339,8 @@ impl TenantShard {
 
 #[cfg(test)]
 pub(crate) mod tests {
+    use std::{cell::RefCell, rc::Rc};
+
     use pageserver_api::{
         controller_api::NodeAvailability,
         shard::{ShardCount, ShardNumber},
@@ -1637,12 +1645,14 @@ pub(crate) mod tests {
 
     // Optimize til quiescent: this emulates what Service::optimize_all does, when
     // called repeatedly in the background.
+    // Returns the applied optimizations
     fn optimize_til_idle(
         nodes: &HashMap<NodeId, Node>,
         scheduler: &mut Scheduler,
         shards: &mut [TenantShard],
-    ) {
+    ) -> Vec<ScheduleOptimization> {
         let mut loop_n = 0;
+        let mut optimizations = Vec::default();
         loop {
             let mut schedule_context = ScheduleContext::default();
             let mut any_changed = false;
@@ -1657,6 +1667,7 @@ pub(crate) mod tests {
             for shard in shards.iter_mut() {
                 let optimization = shard.optimize_attachment(nodes, &schedule_context);
                 if let Some(optimization) = optimization {
+                    optimizations.push(optimization.clone());
                     shard.apply_optimization(scheduler, optimization);
                     any_changed = true;
                     break;
@@ -1664,6 +1675,7 @@ pub(crate) mod tests {
 
                 let optimization = shard.optimize_secondary(scheduler, &schedule_context);
                 if let Some(optimization) = optimization {
+                    optimizations.push(optimization.clone());
                     shard.apply_optimization(scheduler, optimization);
                     any_changed = true;
                     break;
@@ -1678,6 +1690,8 @@ pub(crate) mod tests {
             loop_n += 1;
             assert!(loop_n < 1000);
         }
+
+        optimizations
     }
 
     /// Test the balancing behavior of shard scheduling: that it achieves a balance, and
@@ -1730,4 +1744,48 @@ pub(crate) mod tests {
 
         Ok(())
     }
+
+    /// Test that initial shard scheduling is optimal. By optimal we mean
+    /// that the optimizer cannot find a way to improve it.
+    ///
+    /// This test is an example of the scheduling issue described in
+    /// https://github.com/neondatabase/neon/issues/8969
+    #[test]
+    fn initial_scheduling_is_optimal() -> anyhow::Result<()> {
+        use itertools::Itertools;
+
+        let nodes = make_test_nodes(2);
+
+        let mut scheduler = Scheduler::new([].iter());
+        scheduler.node_upsert(nodes.get(&NodeId(1)).unwrap());
+        scheduler.node_upsert(nodes.get(&NodeId(2)).unwrap());
+
+        let mut a = make_test_tenant(PlacementPolicy::Attached(1), ShardCount::new(4));
+        let a_context = Rc::new(RefCell::new(ScheduleContext::default()));
+
+        let mut b = make_test_tenant(PlacementPolicy::Attached(1), ShardCount::new(4));
+        let b_context = Rc::new(RefCell::new(ScheduleContext::default()));
+
+        let a_shards_with_context = a.iter_mut().map(|shard| (shard, a_context.clone()));
+        let b_shards_with_context = b.iter_mut().map(|shard| (shard, b_context.clone()));
+
+        let schedule_order = a_shards_with_context.interleave(b_shards_with_context);
+
+        for (shard, context) in schedule_order {
+            let context = &mut *context.borrow_mut();
+            shard.schedule(&mut scheduler, context).unwrap();
+        }
+
+        let applied_to_a = optimize_til_idle(&nodes, &mut scheduler, &mut a);
+        assert_eq!(applied_to_a, vec![]);
+
+        let applied_to_b = optimize_til_idle(&nodes, &mut scheduler, &mut b);
+        assert_eq!(applied_to_b, vec![]);
+
+        for shard in a.iter_mut().chain(b.iter_mut()) {
+            shard.intent.clear(&mut scheduler);
+        }
+
+        Ok(())
+    }
 }

From 2b65a2b53eb065a7f664564adbfa04e74d422b9c Mon Sep 17 00:00:00 2001
From: Folke Behrens <folke@neon.tech>
Date: Tue, 24 Sep 2024 11:52:25 +0200
Subject: [PATCH 129/142] proxy: check if IP is allowed during webauth flow
 (#9101)

neondatabase/cloud#12018
---
 proxy/src/auth/backend.rs     |  2 +-
 proxy/src/auth/backend/web.rs | 10 ++++++++++
 proxy/src/console/messages.rs | 19 +++++++++++++++++++
 3 files changed, 30 insertions(+), 1 deletion(-)

diff --git a/proxy/src/auth/backend.rs b/proxy/src/auth/backend.rs
index 5bc2f2ff65..4e9f4591ad 100644
--- a/proxy/src/auth/backend.rs
+++ b/proxy/src/auth/backend.rs
@@ -444,7 +444,7 @@ impl<'a> Backend<'a, ComputeUserInfoMaybeEndpoint, &()> {
             Self::Web(url, ()) => {
                 info!("performing web authentication");
 
-                let info = web::authenticate(ctx, &url, client).await?;
+                let info = web::authenticate(ctx, config, &url, client).await?;
 
                 Backend::Web(url, info)
             }
diff --git a/proxy/src/auth/backend/web.rs b/proxy/src/auth/backend/web.rs
index 58a4bef62e..05f437355e 100644
--- a/proxy/src/auth/backend/web.rs
+++ b/proxy/src/auth/backend/web.rs
@@ -1,5 +1,6 @@
 use crate::{
     auth, compute,
+    config::AuthenticationConfig,
     console::{self, provider::NodeInfo},
     context::RequestMonitoring,
     error::{ReportableError, UserFacingError},
@@ -58,6 +59,7 @@ pub(crate) fn new_psql_session_id() -> String {
 
 pub(super) async fn authenticate(
     ctx: &RequestMonitoring,
+    auth_config: &'static AuthenticationConfig,
     link_uri: &reqwest::Url,
     client: &mut PqStream<impl AsyncRead + AsyncWrite + Unpin>,
 ) -> auth::Result<NodeInfo> {
@@ -89,6 +91,14 @@ pub(super) async fn authenticate(
     info!(parent: &span, "waiting for console's reply...");
     let db_info = waiter.await.map_err(WebAuthError::from)?;
 
+    if auth_config.ip_allowlist_check_enabled {
+        if let Some(allowed_ips) = &db_info.allowed_ips {
+            if !auth::check_peer_addr_is_in_list(&ctx.peer_addr(), allowed_ips) {
+                return Err(auth::AuthError::ip_address_not_allowed(ctx.peer_addr()));
+            }
+        }
+    }
+
     client.write_message_noflush(&Be::NoticeResponse("Connecting to database."))?;
 
     // This config should be self-contained, because we won't
diff --git a/proxy/src/console/messages.rs b/proxy/src/console/messages.rs
index 9b66333cd4..85683acb82 100644
--- a/proxy/src/console/messages.rs
+++ b/proxy/src/console/messages.rs
@@ -284,6 +284,8 @@ pub(crate) struct DatabaseInfo {
     /// be inconvenient for debug with local PG instance.
     pub(crate) password: Option<Box<str>>,
     pub(crate) aux: MetricsAuxInfo,
+    #[serde(default)]
+    pub(crate) allowed_ips: Option<Vec<IpPattern>>,
 }
 
 // Manually implement debug to omit sensitive info.
@@ -294,6 +296,7 @@ impl fmt::Debug for DatabaseInfo {
             .field("port", &self.port)
             .field("dbname", &self.dbname)
             .field("user", &self.user)
+            .field("allowed_ips", &self.allowed_ips)
             .finish_non_exhaustive()
     }
 }
@@ -432,6 +435,22 @@ mod tests {
             "aux": dummy_aux(),
         }))?;
 
+        // with allowed_ips
+        let dbinfo = serde_json::from_value::<DatabaseInfo>(json!({
+            "host": "localhost",
+            "port": 5432,
+            "dbname": "postgres",
+            "user": "john_doe",
+            "password": "password",
+            "aux": dummy_aux(),
+            "allowed_ips": ["127.0.0.1"],
+        }))?;
+
+        assert_eq!(
+            dbinfo.allowed_ips,
+            Some(vec![IpPattern::Single("127.0.0.1".parse()?)])
+        );
+
         Ok(())
     }
 

From fc67f8dc6087a0b4f4f0bcd74f6e1dc25fab8cf3 Mon Sep 17 00:00:00 2001
From: Matthias van de Meent <matthias@neon.tech>
Date: Tue, 24 Sep 2024 14:15:52 +0200
Subject: [PATCH 130/142] Update PostgreSQL 17 from 17rc1 to 17.0 (#9119)

The PostgreSQL 17 vendor module is now based on postgres/postgres @
d7ec59a63d745ba74fba0e280bbf85dc6d1caa3e, presumably the final code
change before the V17 tag.
---
 vendor/postgres-v17   | 2 +-
 vendor/revisions.json | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/vendor/postgres-v17 b/vendor/postgres-v17
index 7b3e52c75c..68b5038f27 160000
--- a/vendor/postgres-v17
+++ b/vendor/postgres-v17
@@ -1 +1 @@
-Subproject commit 7b3e52c75ca384de9c69477c158b1f5dcdcbb4be
+Subproject commit 68b5038f27e493bde6ae552fe066f10cbdfe6a14
diff --git a/vendor/revisions.json b/vendor/revisions.json
index bc7070744a..896a75814e 100644
--- a/vendor/revisions.json
+++ b/vendor/revisions.json
@@ -1,7 +1,7 @@
 {
   "v17": [
-    "17rc1",
-    "7b3e52c75ca384de9c69477c158b1f5dcdcbb4be"
+    "17.0",
+    "68b5038f27e493bde6ae552fe066f10cbdfe6a14"
   ],
   "v16": [
     "16.4",

From a65d4379309e29a23f9e3544988712b33a89a75a Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Tue, 24 Sep 2024 15:05:07 +0200
Subject: [PATCH 131/142] chore(#9077): cleanups & code dedup (#9082)

Punted from https://github.com/neondatabase/neon/pull/9077
---
 pageserver/src/metrics.rs      | 33 +++++++++++++--------------------
 pageserver/src/tenant/tasks.rs |  3 +--
 2 files changed, 14 insertions(+), 22 deletions(-)

diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs
index 162e8d1836..366bd82903 100644
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -3208,45 +3208,38 @@ pub(crate) mod tenant_throttling {
 
     impl TimelineGet {
         pub(crate) fn new(tenant_shard_id: &TenantShardId) -> Self {
+            let per_tenant_label_values = &[
+                KIND,
+                &tenant_shard_id.tenant_id.to_string(),
+                &tenant_shard_id.shard_slug().to_string(),
+            ];
             TimelineGet {
                 count_accounted_start: {
                     GlobalAndPerTenantIntCounter {
                         global: COUNT_ACCOUNTED_START.with_label_values(&[KIND]),
-                        per_tenant: COUNT_ACCOUNTED_START_PER_TENANT.with_label_values(&[
-                            KIND,
-                            &tenant_shard_id.tenant_id.to_string(),
-                            &tenant_shard_id.shard_slug().to_string(),
-                        ]),
+                        per_tenant: COUNT_ACCOUNTED_START_PER_TENANT
+                            .with_label_values(per_tenant_label_values),
                     }
                 },
                 count_accounted_finish: {
                     GlobalAndPerTenantIntCounter {
                         global: COUNT_ACCOUNTED_FINISH.with_label_values(&[KIND]),
-                        per_tenant: COUNT_ACCOUNTED_FINISH_PER_TENANT.with_label_values(&[
-                            KIND,
-                            &tenant_shard_id.tenant_id.to_string(),
-                            &tenant_shard_id.shard_slug().to_string(),
-                        ]),
+                        per_tenant: COUNT_ACCOUNTED_FINISH_PER_TENANT
+                            .with_label_values(per_tenant_label_values),
                     }
                 },
                 wait_time: {
                     GlobalAndPerTenantIntCounter {
                         global: WAIT_USECS.with_label_values(&[KIND]),
-                        per_tenant: WAIT_USECS_PER_TENANT.with_label_values(&[
-                            KIND,
-                            &tenant_shard_id.tenant_id.to_string(),
-                            &tenant_shard_id.shard_slug().to_string(),
-                        ]),
+                        per_tenant: WAIT_USECS_PER_TENANT
+                            .with_label_values(per_tenant_label_values),
                     }
                 },
                 count_throttled: {
                     GlobalAndPerTenantIntCounter {
                         global: WAIT_COUNT.with_label_values(&[KIND]),
-                        per_tenant: WAIT_COUNT_PER_TENANT.with_label_values(&[
-                            KIND,
-                            &tenant_shard_id.tenant_id.to_string(),
-                            &tenant_shard_id.shard_slug().to_string(),
-                        ]),
+                        per_tenant: WAIT_COUNT_PER_TENANT
+                            .with_label_values(per_tenant_label_values),
                     }
                 },
             }
diff --git a/pageserver/src/tenant/tasks.rs b/pageserver/src/tenant/tasks.rs
index 341febb30a..3f0f8a21c8 100644
--- a/pageserver/src/tenant/tasks.rs
+++ b/pageserver/src/tenant/tasks.rs
@@ -481,8 +481,7 @@ async fn ingest_housekeeping_loop(tenant: Arc<Tenant>, cancel: CancellationToken
                 let allowed_rps = tenant.timeline_get_throttle.steady_rps();
                 let delta = now - prev;
                 info!(
-                    n_seconds=%format_args!("{:.3}",
-                    delta.as_secs_f64()),
+                    n_seconds=%format_args!("{:.3}", delta.as_secs_f64()),
                     count_accounted = count_accounted_finish,  // don't break existing log scraping
                     count_throttled,
                     sum_throttled_usecs,

From b224a5a37734d05ffc88143750352eb318cba90d Mon Sep 17 00:00:00 2001
From: a-masterov <72613290+a-masterov@users.noreply.github.com>
Date: Tue, 24 Sep 2024 15:13:18 +0200
Subject: [PATCH 132/142] Move the patch to compute (#9120)

## Problem
All the other patches were moved to the compute directory, and only one
was left in the patches subdirectory in the root directory.

## Summary of changes
The patch was moved to the compute directory, like the others.
---
 .github/workflows/cloud-regress.yml                   | 2 +-
 {patches => compute/patches}/cloud_regress_pg16.patch | 0
 2 files changed, 1 insertion(+), 1 deletion(-)
 rename {patches => compute/patches}/cloud_regress_pg16.patch (100%)

diff --git a/.github/workflows/cloud-regress.yml b/.github/workflows/cloud-regress.yml
index de6babdde3..ecafe183f8 100644
--- a/.github/workflows/cloud-regress.yml
+++ b/.github/workflows/cloud-regress.yml
@@ -42,7 +42,7 @@ jobs:
       - name: Patch the test
         run: |
           cd "vendor/postgres-v${DEFAULT_PG_VERSION}"
-          patch -p1 < "../../patches/cloud_regress_pg${DEFAULT_PG_VERSION}.patch"
+          patch -p1 < "../../compute/patches/cloud_regress_pg${DEFAULT_PG_VERSION}.patch"
 
       - name: Generate a random password
         id: pwgen
diff --git a/patches/cloud_regress_pg16.patch b/compute/patches/cloud_regress_pg16.patch
similarity index 100%
rename from patches/cloud_regress_pg16.patch
rename to compute/patches/cloud_regress_pg16.patch

From 70fe0075192d5bc4cbfec5f472ca466d0df477b9 Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki@neon.tech>
Date: Tue, 24 Sep 2024 16:41:59 +0300
Subject: [PATCH 133/142] test: Make test_hot_standby_feedback more forgiving
 of slow initialization (#9113)

Don't start waiting for the index to appear in the secondary until it
has been created in the primary. Before, if the "pgbench -i" step took
more than 60 s, we would give up.

There was a flaky test failure along those lines at:
https://neon-github-public-dev.s3.amazonaws.com/reports/pr-9105/10997477941/index.html#suites/950eff205b552e248417890b8b8f189e/73cf4b5648fa6f74/
Hopefully, this avoids such failures in the future.
---
 test_runner/regress/test_hot_standby.py | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/test_runner/regress/test_hot_standby.py b/test_runner/regress/test_hot_standby.py
index 35e0c0decb..be8f70bb70 100644
--- a/test_runner/regress/test_hot_standby.py
+++ b/test_runner/regress/test_hot_standby.py
@@ -198,9 +198,6 @@ def test_hot_standby_gc(neon_env_builder: NeonEnvBuilder, pause_apply: bool):
 
 def run_pgbench(connstr: str, pg_bin: PgBin):
     log.info(f"Start a pgbench workload on pg {connstr}")
-    # s10 is about 150MB of data. In debug mode init takes about 15s on SSD.
-    pg_bin.run_capture(["pgbench", "-i", "-I", "dtGvp", "-s10", connstr])
-    log.info("pgbench init done")
     pg_bin.run_capture(["pgbench", "-T60", connstr])
 
 
@@ -247,9 +244,15 @@ def test_hot_standby_feedback(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
             log.info(
                 f"primary connstr is {primary.connstr()}, secondary connstr {secondary.connstr()}"
             )
+
+            # s10 is about 150MB of data. In debug mode init takes about 15s on SSD.
+            pg_bin.run_capture(["pgbench", "-i", "-I", "dtGvp", "-s10", primary.connstr()])
+            log.info("pgbench init done in primary")
+
             t = threading.Thread(target=run_pgbench, args=(primary.connstr(), pg_bin))
             t.start()
-            # Wait until pgbench_accounts is created + filled on replica *and*
+
+            # Wait until we see that the pgbench_accounts is created + filled on replica *and*
             # index is created. Otherwise index creation would conflict with
             # read queries and hs feedback won't save us.
             wait_until(60, 1.0, partial(pgbench_accounts_initialized, secondary))

From 589594c2e1447632b28d31ec69602782ce4634d7 Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki@neon.tech>
Date: Mon, 23 Sep 2024 20:48:41 +0300
Subject: [PATCH 134/142] test: Skip fsync when initdb'ing the storage
 controller db

After initdb, we configure it with "fsync=off" anyway.
---
 control_plane/src/storage_controller.rs | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/control_plane/src/storage_controller.rs b/control_plane/src/storage_controller.rs
index 2b714fbfbf..0c0e67dff0 100644
--- a/control_plane/src/storage_controller.rs
+++ b/control_plane/src/storage_controller.rs
@@ -346,7 +346,14 @@ impl StorageController {
             let pg_log_path = pg_data_path.join("postgres.log");
 
             if !tokio::fs::try_exists(&pg_data_path).await? {
-                let initdb_args = ["-D", pg_data_path.as_ref(), "--username", &username()];
+                let initdb_args = [
+                    "-D",
+                    pg_data_path.as_ref(),
+                    "--username",
+                    &username(),
+                    "--no-sync",
+                    "--no-instructions",
+                ];
                 tracing::info!(
                     "Initializing storage controller database with args: {:?}",
                     initdb_args

From 2f7cecaf6a92e29df0a576b793820899e889ba81 Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki@neon.tech>
Date: Mon, 23 Sep 2024 20:48:43 +0300
Subject: [PATCH 135/142] test: Poll pageserver availability more aggressively
 at test startup

Even with the 100 ms interval, on my laptop the pageserver always
becomes available on the second attempt, so this saves about 900 ms at
every test startup.
---
 test_runner/fixtures/neon_fixtures.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index 55c1423ed0..8c178ae63a 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -2553,7 +2553,7 @@ class NeonStorageController(MetricsGetter, LogUtils):
         desired_availability: Optional[PageserverAvailability],
         desired_scheduling_policy: Optional[PageserverSchedulingPolicy],
         max_attempts: int,
-        backoff: int,
+        backoff: float,
     ):
         """
         Poll the node status until it reaches 'desired_scheduling_policy' and 'desired_availability'
@@ -2948,7 +2948,7 @@ class NeonPageserver(PgProtocol, LogUtils):
             self.id
         ):
             self.env.storage_controller.poll_node_status(
-                self.id, PageserverAvailability.ACTIVE, None, max_attempts=20, backoff=1
+                self.id, PageserverAvailability.ACTIVE, None, max_attempts=200, backoff=0.1
             )
 
         return self

From 4f67b0225bb946c32f5b9c8d1d96eafbb05295ca Mon Sep 17 00:00:00 2001
From: Yuchen Liang <70461588+yliang412@users.noreply.github.com>
Date: Tue, 24 Sep 2024 12:41:38 -0400
Subject: [PATCH 136/142] pageserver: handle decompression outside vectored
 `read_blobs` (#8942)

Part of #8130.

## Problem

Currently, decompression is performed within the `read_blobs`
implementation and the decompressed blob will be appended to the end of
the `BytesMut` buffer. We will lose this flexibility of extending the
buffer when we switch to using our own dio-aligned buffer (WIP in
https://github.com/neondatabase/neon/pull/8730). To facilitate the
adoption of aligned buffer, we need to refactor the code to perform
decompression outside `read_blobs`.

## Summary of changes

- `VectoredBlobReader::read_blobs` will return `VectoredBlob` without
performing decompression or appending the decompressed blob. It becomes the
caller's responsibility to decompress the buffer.
- Added a new `BufView` type that functions as `Cow<Bytes, &[u8]>`.
- Perform decompression within `VectoredBlob::read` so that people don't
have to explicitly think about compression when using the reader
interface.

Signed-off-by: Yuchen Liang <yuchen@neon.tech>
---
 .../src/tenant/storage_layer/delta_layer.rs   |  58 +++++--
 .../src/tenant/storage_layer/image_layer.rs   |  41 +++--
 pageserver/src/tenant/vectored_blob_io.rs     | 162 ++++++++++++++----
 3 files changed, 200 insertions(+), 61 deletions(-)

diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs
index 34f1b15138..2b212cfed5 100644
--- a/pageserver/src/tenant/storage_layer/delta_layer.rs
+++ b/pageserver/src/tenant/storage_layer/delta_layer.rs
@@ -39,7 +39,7 @@ use crate::tenant::disk_btree::{
 use crate::tenant::storage_layer::layer::S3_UPLOAD_LIMIT;
 use crate::tenant::timeline::GetVectoredError;
 use crate::tenant::vectored_blob_io::{
-    BlobFlag, StreamingVectoredReadPlanner, VectoredBlobReader, VectoredRead,
+    BlobFlag, BufView, StreamingVectoredReadPlanner, VectoredBlobReader, VectoredRead,
     VectoredReadCoalesceMode, VectoredReadPlanner,
 };
 use crate::tenant::PageReconstructError;
@@ -1021,13 +1021,30 @@ impl DeltaLayerInner {
                     continue;
                 }
             };
-
+            let view = BufView::new_slice(&blobs_buf.buf);
             for meta in blobs_buf.blobs.iter().rev() {
                 if Some(meta.meta.key) == ignore_key_with_err {
                     continue;
                 }
+                let blob_read = meta.read(&view).await;
+                let blob_read = match blob_read {
+                    Ok(buf) => buf,
+                    Err(e) => {
+                        reconstruct_state.on_key_error(
+                            meta.meta.key,
+                            PageReconstructError::Other(anyhow!(e).context(format!(
+                                "Failed to decompress blob from virtual file {}",
+                                self.file.path,
+                            ))),
+                        );
+
+                        ignore_key_with_err = Some(meta.meta.key);
+                        continue;
+                    }
+                };
+
+                let value = Value::des(&blob_read);
 
-                let value = Value::des(&blobs_buf.buf[meta.start..meta.end]);
                 let value = match value {
                     Ok(v) => v,
                     Err(e) => {
@@ -1243,21 +1260,21 @@ impl DeltaLayerInner {
                 buf.reserve(read.size());
                 let res = reader.read_blobs(&read, buf, ctx).await?;
 
+                let view = BufView::new_slice(&res.buf);
+
                 for blob in res.blobs {
                     let key = blob.meta.key;
                     let lsn = blob.meta.lsn;
-                    let data = &res.buf[blob.start..blob.end];
+
+                    let data = blob.read(&view).await?;
 
                     #[cfg(debug_assertions)]
-                    Value::des(data)
+                    Value::des(&data)
                         .with_context(|| {
                             format!(
-                                "blob failed to deserialize for {}@{}, {}..{}: {:?}",
-                                blob.meta.key,
-                                blob.meta.lsn,
-                                blob.start,
-                                blob.end,
-                                utils::Hex(data)
+                                "blob failed to deserialize for {}: {:?}",
+                                blob,
+                                utils::Hex(&data)
                             )
                         })
                         .unwrap();
@@ -1265,15 +1282,15 @@ impl DeltaLayerInner {
                     // is it an image or will_init walrecord?
                     // FIXME: this could be handled by threading the BlobRef to the
                     // VectoredReadBuilder
-                    let will_init = crate::repository::ValueBytes::will_init(data)
+                    let will_init = crate::repository::ValueBytes::will_init(&data)
                         .inspect_err(|_e| {
                             #[cfg(feature = "testing")]
-                            tracing::error!(data=?utils::Hex(data), err=?_e, %key, %lsn, "failed to parse will_init out of serialized value");
+                            tracing::error!(data=?utils::Hex(&data), err=?_e, %key, %lsn, "failed to parse will_init out of serialized value");
                         })
                         .unwrap_or(false);
 
                     per_blob_copy.clear();
-                    per_blob_copy.extend_from_slice(data);
+                    per_blob_copy.extend_from_slice(&data);
 
                     let (tmp, res) = writer
                         .put_value_bytes(
@@ -1538,8 +1555,11 @@ impl<'a> DeltaLayerIterator<'a> {
             .read_blobs(&plan, buf, self.ctx)
             .await?;
         let frozen_buf = blobs_buf.buf.freeze();
+        let view = BufView::new_bytes(frozen_buf);
         for meta in blobs_buf.blobs.iter() {
-            let value = Value::des(&frozen_buf[meta.start..meta.end])?;
+            let blob_read = meta.read(&view).await?;
+            let value = Value::des(&blob_read)?;
+
             next_batch.push_back((meta.meta.key, meta.meta.lsn, value));
         }
         self.key_values_batch = next_batch;
@@ -1916,9 +1936,13 @@ pub(crate) mod test {
                 let blobs_buf = vectored_blob_reader
                     .read_blobs(&read, buf.take().expect("Should have a buffer"), &ctx)
                     .await?;
+                let view = BufView::new_slice(&blobs_buf.buf);
                 for meta in blobs_buf.blobs.iter() {
-                    let value = &blobs_buf.buf[meta.start..meta.end];
-                    assert_eq!(value, entries_meta.index[&(meta.meta.key, meta.meta.lsn)]);
+                    let value = meta.read(&view).await?;
+                    assert_eq!(
+                        &value[..],
+                        &entries_meta.index[&(meta.meta.key, meta.meta.lsn)]
+                    );
                 }
 
                 buf = Some(blobs_buf.buf);
diff --git a/pageserver/src/tenant/storage_layer/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs
index 5de2582ab7..940d169db0 100644
--- a/pageserver/src/tenant/storage_layer/image_layer.rs
+++ b/pageserver/src/tenant/storage_layer/image_layer.rs
@@ -36,7 +36,8 @@ use crate::tenant::disk_btree::{
 };
 use crate::tenant::timeline::GetVectoredError;
 use crate::tenant::vectored_blob_io::{
-    BlobFlag, StreamingVectoredReadPlanner, VectoredBlobReader, VectoredRead, VectoredReadPlanner,
+    BlobFlag, BufView, StreamingVectoredReadPlanner, VectoredBlobReader, VectoredRead,
+    VectoredReadPlanner,
 };
 use crate::tenant::PageReconstructError;
 use crate::virtual_file::owned_buffers_io::io_buf_ext::IoBufExt;
@@ -547,15 +548,15 @@ impl ImageLayerInner {
 
             let buf = BytesMut::with_capacity(buf_size);
             let blobs_buf = vectored_blob_reader.read_blobs(&read, buf, ctx).await?;
-
             let frozen_buf = blobs_buf.buf.freeze();
+            let view = BufView::new_bytes(frozen_buf);
 
             for meta in blobs_buf.blobs.iter() {
-                let img_buf = frozen_buf.slice(meta.start..meta.end);
+                let img_buf = meta.read(&view).await?;
 
                 key_count += 1;
                 writer
-                    .put_image(meta.meta.key, img_buf, ctx)
+                    .put_image(meta.meta.key, img_buf.into_bytes(), ctx)
                     .await
                     .context(format!("Storing key {}", meta.meta.key))?;
             }
@@ -602,13 +603,28 @@ impl ImageLayerInner {
             match res {
                 Ok(blobs_buf) => {
                     let frozen_buf = blobs_buf.buf.freeze();
-
+                    let view = BufView::new_bytes(frozen_buf);
                     for meta in blobs_buf.blobs.iter() {
-                        let img_buf = frozen_buf.slice(meta.start..meta.end);
+                        let img_buf = meta.read(&view).await;
+
+                        let img_buf = match img_buf {
+                            Ok(img_buf) => img_buf,
+                            Err(e) => {
+                                reconstruct_state.on_key_error(
+                                    meta.meta.key,
+                                    PageReconstructError::Other(anyhow!(e).context(format!(
+                                        "Failed to decompress blob from virtual file {}",
+                                        self.file.path,
+                                    ))),
+                                );
+
+                                continue;
+                            }
+                        };
                         reconstruct_state.update_key(
                             &meta.meta.key,
                             self.lsn,
-                            Value::Image(img_buf),
+                            Value::Image(img_buf.into_bytes()),
                         );
                     }
                 }
@@ -1025,10 +1041,15 @@ impl<'a> ImageLayerIterator<'a> {
         let blobs_buf = vectored_blob_reader
             .read_blobs(&plan, buf, self.ctx)
             .await?;
-        let frozen_buf: Bytes = blobs_buf.buf.freeze();
+        let frozen_buf = blobs_buf.buf.freeze();
+        let view = BufView::new_bytes(frozen_buf);
         for meta in blobs_buf.blobs.iter() {
-            let img_buf = frozen_buf.slice(meta.start..meta.end);
-            next_batch.push_back((meta.meta.key, self.image_layer.lsn, Value::Image(img_buf)));
+            let img_buf = meta.read(&view).await?;
+            next_batch.push_back((
+                meta.meta.key,
+                self.image_layer.lsn,
+                Value::Image(img_buf.into_bytes()),
+            ));
         }
         self.key_values_batch = next_batch;
         Ok(())
diff --git a/pageserver/src/tenant/vectored_blob_io.rs b/pageserver/src/tenant/vectored_blob_io.rs
index 553edf6d8b..aa37a45898 100644
--- a/pageserver/src/tenant/vectored_blob_io.rs
+++ b/pageserver/src/tenant/vectored_blob_io.rs
@@ -16,8 +16,9 @@
 //! Note that the vectored blob api does *not* go through the page cache.
 
 use std::collections::BTreeMap;
+use std::ops::Deref;
 
-use bytes::BytesMut;
+use bytes::{Bytes, BytesMut};
 use pageserver_api::key::Key;
 use tokio::io::AsyncWriteExt;
 use tokio_epoll_uring::BoundedBuf;
@@ -35,11 +36,123 @@ pub struct BlobMeta {
     pub lsn: Lsn,
 }
 
-/// Blob offsets into [`VectoredBlobsBuf::buf`]
+/// A view into the vectored blobs read buffer.
+#[derive(Clone, Debug)]
+pub(crate) enum BufView<'a> {
+    Slice(&'a [u8]),
+    Bytes(bytes::Bytes),
+}
+
+impl<'a> BufView<'a> {
+    /// Creates a new slice-based view on the blob.
+    pub fn new_slice(slice: &'a [u8]) -> Self {
+        Self::Slice(slice)
+    }
+
+    /// Creates a new [`bytes::Bytes`]-based view on the blob.
+    pub fn new_bytes(bytes: bytes::Bytes) -> Self {
+        Self::Bytes(bytes)
+    }
+
+    /// Convert the view into `Bytes`.
+    ///
+    /// If using slice as the underlying storage, the copy will be an O(n) operation.
+    pub fn into_bytes(self) -> Bytes {
+        match self {
+            BufView::Slice(slice) => Bytes::copy_from_slice(slice),
+            BufView::Bytes(bytes) => bytes,
+        }
+    }
+
+    /// Creates a sub-view of the blob based on the range.
+    fn view(&self, range: std::ops::Range<usize>) -> Self {
+        match self {
+            BufView::Slice(slice) => BufView::Slice(&slice[range]),
+            BufView::Bytes(bytes) => BufView::Bytes(bytes.slice(range)),
+        }
+    }
+}
+
+impl<'a> Deref for BufView<'a> {
+    type Target = [u8];
+
+    fn deref(&self) -> &Self::Target {
+        match self {
+            BufView::Slice(slice) => slice,
+            BufView::Bytes(bytes) => bytes,
+        }
+    }
+}
+
+impl<'a> AsRef<[u8]> for BufView<'a> {
+    fn as_ref(&self) -> &[u8] {
+        match self {
+            BufView::Slice(slice) => slice,
+            BufView::Bytes(bytes) => bytes.as_ref(),
+        }
+    }
+}
+
+impl<'a> From<&'a [u8]> for BufView<'a> {
+    fn from(value: &'a [u8]) -> Self {
+        Self::new_slice(value)
+    }
+}
+
+impl From<Bytes> for BufView<'_> {
+    fn from(value: Bytes) -> Self {
+        Self::new_bytes(value)
+    }
+}
+
+/// Blob offsets into [`VectoredBlobsBuf::buf`]. The byte range is potentially compressed,
+/// subject to [`VectoredBlob::compression_bits`].
 pub struct VectoredBlob {
-    pub start: usize,
-    pub end: usize,
+    /// Blob metadata.
     pub meta: BlobMeta,
+    /// Start offset.
+    start: usize,
+    /// End offset.
+    end: usize,
+    /// Compression used on the blob.
+    compression_bits: u8,
+}
+
+impl VectoredBlob {
+    /// Reads a decompressed view of the blob.
+    pub(crate) async fn read<'a>(&self, buf: &BufView<'a>) -> Result<BufView<'a>, std::io::Error> {
+        let view = buf.view(self.start..self.end);
+
+        match self.compression_bits {
+            BYTE_UNCOMPRESSED => Ok(view),
+            BYTE_ZSTD => {
+                let mut decompressed_vec = Vec::new();
+                let mut decoder =
+                    async_compression::tokio::write::ZstdDecoder::new(&mut decompressed_vec);
+                decoder.write_all(&view).await?;
+                decoder.flush().await?;
+                // Zero-copy conversion from `Vec` to `Bytes`
+                Ok(BufView::new_bytes(Bytes::from(decompressed_vec)))
+            }
+            bits => {
+                let error = std::io::Error::new(
+                    std::io::ErrorKind::InvalidData,
+                    format!("Failed to decompress blob for {}@{}, {}..{}: invalid compression byte {bits:x}", self.meta.key, self.meta.lsn, self.start, self.end),
+                );
+                Err(error)
+            }
+        }
+    }
+}
+
+impl std::fmt::Display for VectoredBlob {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(
+            f,
+            "{}@{}, {}..{}",
+            self.meta.key, self.meta.lsn, self.start, self.end
+        )
+    }
 }
 
 /// Return type of [`VectoredBlobReader::read_blobs`]
@@ -514,7 +627,7 @@ impl<'a> VectoredBlobReader<'a> {
             );
         }
 
-        let mut buf = self
+        let buf = self
             .file
             .read_exact_at(buf.slice(0..read.size()), read.start, ctx)
             .await?
@@ -529,9 +642,6 @@ impl<'a> VectoredBlobReader<'a> {
         // of a blob is implicit: the start of the next blob if one exists
         // or the end of the read.
 
-        // Some scratch space, put here for reusing the allocation
-        let mut decompressed_vec = Vec::new();
-
         for (blob_start, meta) in blobs_at {
             let blob_start_in_buf = blob_start - start_offset;
             let first_len_byte = buf[blob_start_in_buf as usize];
@@ -557,35 +667,14 @@ impl<'a> VectoredBlobReader<'a> {
                 )
             };
 
-            let start_raw = blob_start_in_buf + size_length;
-            let end_raw = start_raw + blob_size;
-            let (start, end);
-            if compression_bits == BYTE_UNCOMPRESSED {
-                start = start_raw as usize;
-                end = end_raw as usize;
-            } else if compression_bits == BYTE_ZSTD {
-                let mut decoder =
-                    async_compression::tokio::write::ZstdDecoder::new(&mut decompressed_vec);
-                decoder
-                    .write_all(&buf[start_raw as usize..end_raw as usize])
-                    .await?;
-                decoder.flush().await?;
-                start = buf.len();
-                buf.extend_from_slice(&decompressed_vec);
-                end = buf.len();
-                decompressed_vec.clear();
-            } else {
-                let error = std::io::Error::new(
-                    std::io::ErrorKind::InvalidData,
-                    format!("invalid compression byte {compression_bits:x}"),
-                );
-                return Err(error);
-            }
+            let start = (blob_start_in_buf + size_length) as usize;
+            let end = start + blob_size as usize;
 
             metas.push(VectoredBlob {
                 start,
                 end,
                 meta: *meta,
+                compression_bits,
             });
         }
 
@@ -1020,8 +1109,13 @@ mod tests {
             let result = vectored_blob_reader.read_blobs(&read, buf, &ctx).await?;
             assert_eq!(result.blobs.len(), 1);
             let read_blob = &result.blobs[0];
-            let read_buf = &result.buf[read_blob.start..read_blob.end];
-            assert_eq!(blob, read_buf, "mismatch for idx={idx} at offset={offset}");
+            let view = BufView::new_slice(&result.buf);
+            let read_buf = read_blob.read(&view).await?;
+            assert_eq!(
+                &blob[..],
+                &read_buf[..],
+                "mismatch for idx={idx} at offset={offset}"
+            );
             buf = result.buf;
         }
         Ok(())

From c47f355ec1d35401d227f02518c24bb19d051085 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Arpad=20M=C3=BCller?= <arpad-m@users.noreply.github.com>
Date: Tue, 24 Sep 2024 19:28:56 +0200
Subject: [PATCH 137/142] Catch Cancelled and don't print a warning for it
 (#9121)

In the `imitate_synthetic_size_calculation_worker` function, we might
obtain the `Cancelled` error variant instead of hitting the cancellation
token based path. Therefore, catch `Cancelled` and handle it analogously
to the cancellation case.

Fixes #8886.
---
 pageserver/src/tenant/timeline/eviction_task.rs | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/pageserver/src/tenant/timeline/eviction_task.rs b/pageserver/src/tenant/timeline/eviction_task.rs
index 2f6cb4d73a..26c2861b93 100644
--- a/pageserver/src/tenant/timeline/eviction_task.rs
+++ b/pageserver/src/tenant/timeline/eviction_task.rs
@@ -30,8 +30,8 @@ use crate::{
     pgdatadir_mapping::CollectKeySpaceError,
     task_mgr::{self, TaskKind, BACKGROUND_RUNTIME},
     tenant::{
-        storage_layer::LayerVisibilityHint, tasks::BackgroundLoopKind, timeline::EvictionError,
-        LogicalSizeCalculationCause, Tenant,
+        size::CalculateSyntheticSizeError, storage_layer::LayerVisibilityHint,
+        tasks::BackgroundLoopKind, timeline::EvictionError, LogicalSizeCalculationCause, Tenant,
     },
 };
 
@@ -557,6 +557,8 @@ impl Timeline {
             gather_result = gather => {
                 match gather_result {
                     Ok(_) => {},
+                    // It can happen sometimes that we hit this instead of the cancellation token firing above
+                    Err(CalculateSyntheticSizeError::Cancelled) => {}
                     Err(e) => {
                         // We don't care about the result, but, if it failed, we should log it,
                         // since consumption metric might be hitting the cached value and

From 523cf71721128ad6f58bfce3952fb33fe0086a8c Mon Sep 17 00:00:00 2001
From: Alexander Bayandin <alexander@neon.tech>
Date: Tue, 24 Sep 2024 19:11:31 +0100
Subject: [PATCH 138/142] Fix compiler warnings on macOS (#9128)

## Problem

Compilation of neon extension on macOS produces a warning
```
pgxn/neon/neon_perf_counters.c:50:1: error: non-void function does not return a value [-Werror,-Wreturn-type]
```

## Summary of changes
- Change the return type of `NeonPerfCountersShmemInit` to void
---
 pgxn/neon/neon_perf_counters.c | 2 +-
 pgxn/neon/neon_perf_counters.h | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/pgxn/neon/neon_perf_counters.c b/pgxn/neon/neon_perf_counters.c
index 3e86d5b262..de653826c0 100644
--- a/pgxn/neon/neon_perf_counters.c
+++ b/pgxn/neon/neon_perf_counters.c
@@ -32,7 +32,7 @@ NeonPerfCountersShmemSize(void)
 	return size;
 }
 
-bool
+void
 NeonPerfCountersShmemInit(void)
 {
 	bool		found;
diff --git a/pgxn/neon/neon_perf_counters.h b/pgxn/neon/neon_perf_counters.h
index ae35e8c3a5..02163ada55 100644
--- a/pgxn/neon/neon_perf_counters.h
+++ b/pgxn/neon/neon_perf_counters.h
@@ -105,7 +105,7 @@ extern neon_per_backend_counters *neon_per_backend_counters_shared;
 extern void inc_getpage_wait(uint64 latency);
 
 extern Size NeonPerfCountersShmemSize(void);
-extern bool NeonPerfCountersShmemInit(void);
+extern void NeonPerfCountersShmemInit(void);
 
 
 #endif							/* NEON_PERF_COUNTERS_H */

From af5c54ed14f34dfee477659af39628c0d7ec3502 Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki@neon.tech>
Date: Tue, 24 Sep 2024 23:38:16 +0300
Subject: [PATCH 139/142] test: Make test_lfc_resize more robust (#9117)

1. Increase statement_timeout. It defaults to 120 s, which is not quite
enough on slow or busy systems with debug build. On my laptop, the index
creation takes about 100 s. On buildfarm, we've seen failures, e.g:
https://neon-github-public-dev.s3.amazonaws.com/reports/pr-9084/10997888708/index.html#suites/821f97908a487f1d7d3a2a4dd1571e99/db1834bddfe8c5b9/

2. Keep twiddling the LFC size through the whole test. Before, we would
do it for the first 10 seconds, but that only covers a small part of the
pgbench initialization phase. Change the loop so that the pgbench run
time determines how long the test runs, and we keep changing the LFC for
the whole time.

In passing, also fix the bogus test description, copy-pasted from a
completely unrelated test.
---
 test_runner/regress/test_lfc_resize.py | 51 ++++++++++++++++++--------
 1 file changed, 36 insertions(+), 15 deletions(-)

diff --git a/test_runner/regress/test_lfc_resize.py b/test_runner/regress/test_lfc_resize.py
index cb0b30d9c6..0f791e9247 100644
--- a/test_runner/regress/test_lfc_resize.py
+++ b/test_runner/regress/test_lfc_resize.py
@@ -10,11 +10,11 @@ from fixtures.log_helper import log
 from fixtures.neon_fixtures import NeonEnv, PgBin
 
 
-#
-# Test branching, when a transaction is in prepared state
-#
 @pytest.mark.timeout(600)
 def test_lfc_resize(neon_simple_env: NeonEnv, pg_bin: PgBin):
+    """
+    Test resizing the Local File Cache
+    """
     env = neon_simple_env
     endpoint = env.endpoints.create_start(
         "main",
@@ -32,27 +32,48 @@ def test_lfc_resize(neon_simple_env: NeonEnv, pg_bin: PgBin):
         pg_bin.run_capture(["pgbench", "-i", f"-s{scale}", connstr])
         pg_bin.run_capture(["pgbench", "-c10", f"-T{n_resize}", "-Mprepared", "-S", connstr])
 
-    thread = threading.Thread(target=run_pgbench, args=(endpoint.connstr(),), daemon=True)
+    # Initializing the pgbench database can be very slow, especially on debug builds.
+    connstr = endpoint.connstr(options="-cstatement_timeout=300s")
+
+    thread = threading.Thread(target=run_pgbench, args=(connstr,), daemon=True)
     thread.start()
 
     conn = endpoint.connect()
     cur = conn.cursor()
 
-    for _ in range(n_resize):
+    # For as long as pgbench is running, twiddle the LFC size once a second.
+    # Note that we launch this immediately, already while the "pgbench -i"
+    # initialization step is still running. That's quite a different workload
+    # than the actual pgbench benchmark run, so this gives us coverage of both.
+    while thread.is_alive():
         size = random.randint(1, 512)
         cur.execute(f"alter system set neon.file_cache_size_limit='{size}MB'")
         cur.execute("select pg_reload_conf()")
         time.sleep(1)
-
-    cur.execute("alter system set neon.file_cache_size_limit='100MB'")
-    cur.execute("select pg_reload_conf()")
-
     thread.join()
 
-    lfc_file_path = f"{endpoint.pg_data_dir_path()}/file.cache"
-    lfc_file_size = os.path.getsize(lfc_file_path)
-    res = subprocess.run(["ls", "-sk", lfc_file_path], check=True, text=True, capture_output=True)
-    lfc_file_blocks = re.findall("([0-9A-F]+)", res.stdout)[0]
-    log.info(f"Size of LFC file {lfc_file_size}, blocks {lfc_file_blocks}")
-    assert lfc_file_size <= 512 * 1024 * 1024
+    # At the end, set it at 100 MB, and perform a final check that the disk usage
+    # of the file is in that ballpark.
+    #
+    # We retry the check a few times, because it might take a while for the
+    # system to react to changing the setting and shrinking the file.
+    cur.execute("alter system set neon.file_cache_size_limit='100MB'")
+    cur.execute("select pg_reload_conf()")
+    nretries = 10
+    while True:
+        lfc_file_path = f"{endpoint.pg_data_dir_path()}/file.cache"
+        lfc_file_size = os.path.getsize(lfc_file_path)
+        res = subprocess.run(
+            ["ls", "-sk", lfc_file_path], check=True, text=True, capture_output=True
+        )
+        lfc_file_blocks = re.findall("([0-9A-F]+)", res.stdout)[0]
+        log.info(f"Size of LFC file {lfc_file_size}, blocks {lfc_file_blocks}")
+        assert lfc_file_size <= 512 * 1024 * 1024
+
+        if int(lfc_file_blocks) <= 128 * 1024 or nretries == 0:
+            break
+
+        nretries = nretries - 1
+        time.sleep(1)
+
     assert int(lfc_file_blocks) <= 128 * 1024

From 5cbf5b45ae337cc643812a2e6bb76e6eb79142e4 Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki@neon.tech>
Date: Tue, 24 Sep 2024 23:58:54 +0300
Subject: [PATCH 140/142] Remove TenantState::Loading (#9118)

The last real use was removed in commit de90bf4663. It was still used in
a few unit tests, but they can use Attaching too.
---
 libs/pageserver_api/src/models.rs | 27 +++++----------------------
 pageserver/src/tenant.rs          | 21 +++++----------------
 2 files changed, 10 insertions(+), 38 deletions(-)

diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs
index c9be53f0b0..45abda0ad8 100644
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -37,14 +37,11 @@ use bytes::{Buf, BufMut, Bytes, BytesMut};
 /// ```mermaid
 /// stateDiagram-v2
 ///
-///     [*] --> Loading: spawn_load()
 ///     [*] --> Attaching: spawn_attach()
 ///
-///     Loading --> Activating: activate()
 ///     Attaching --> Activating: activate()
 ///     Activating --> Active: infallible
 ///
-///     Loading --> Broken: load() failure
 ///     Attaching --> Broken: attach() failure
 ///
 ///     Active --> Stopping: set_stopping(), part of shutdown & detach
@@ -68,10 +65,6 @@ use bytes::{Buf, BufMut, Bytes, BytesMut};
 )]
 #[serde(tag = "slug", content = "data")]
 pub enum TenantState {
-    /// This tenant is being loaded from local disk.
-    ///
-    /// `set_stopping()` and `set_broken()` do not work in this state and wait for it to pass.
-    Loading,
     /// This tenant is being attached to the pageserver.
     ///
     /// `set_stopping()` and `set_broken()` do not work in this state and wait for it to pass.
@@ -121,8 +114,6 @@ impl TenantState {
             // But, our attach task might still be fetching the remote timelines, etc.
             // So, return `Maybe` while Attaching, making Console wait for the attach task to finish.
             Self::Attaching | Self::Activating(ActivatingFrom::Attaching) => Maybe,
-            // tenant mgr startup distinguishes attaching from loading via marker file.
-            Self::Loading | Self::Activating(ActivatingFrom::Loading) => Attached,
             // We only reach Active after successful load / attach.
             // So, call atttachment status Attached.
             Self::Active => Attached,
@@ -191,10 +182,11 @@ impl LsnLease {
 }
 
 /// The only [`TenantState`] variants we could be `TenantState::Activating` from.
+///
+/// XXX: We used to have more variants here, but now it's just one, which makes this rather
+/// useless. Remove, once we've checked that there's no client code left that looks at this.
 #[derive(Clone, Copy, Debug, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
 pub enum ActivatingFrom {
-    /// Arrived to [`TenantState::Activating`] from [`TenantState::Loading`]
-    Loading,
     /// Arrived to [`TenantState::Activating`] from [`TenantState::Attaching`]
     Attaching,
 }
@@ -1562,11 +1554,8 @@ mod tests {
 
     #[test]
     fn tenantstatus_activating_serde() {
-        let states = [
-            TenantState::Activating(ActivatingFrom::Loading),
-            TenantState::Activating(ActivatingFrom::Attaching),
-        ];
-        let expected = "[{\"slug\":\"Activating\",\"data\":\"Loading\"},{\"slug\":\"Activating\",\"data\":\"Attaching\"}]";
+        let states = [TenantState::Activating(ActivatingFrom::Attaching)];
+        let expected = "[{\"slug\":\"Activating\",\"data\":\"Attaching\"}]";
 
         let actual = serde_json::to_string(&states).unwrap();
 
@@ -1581,13 +1570,7 @@ mod tests {
     fn tenantstatus_activating_strum() {
         // tests added, because we use these for metrics
         let examples = [
-            (line!(), TenantState::Loading, "Loading"),
             (line!(), TenantState::Attaching, "Attaching"),
-            (
-                line!(),
-                TenantState::Activating(ActivatingFrom::Loading),
-                "Activating",
-            ),
             (
                 line!(),
                 TenantState::Activating(ActivatingFrom::Attaching),
diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index 5ed63734f4..53cbaea621 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -1968,9 +1968,6 @@ impl Tenant {
                 TenantState::Activating(_) | TenantState::Active | TenantState::Broken { .. } | TenantState::Stopping { .. } => {
                     panic!("caller is responsible for calling activate() only on Loading / Attaching tenants, got {state:?}", state = current_state);
                 }
-                TenantState::Loading => {
-                    *current_state = TenantState::Activating(ActivatingFrom::Loading);
-                }
                 TenantState::Attaching => {
                     *current_state = TenantState::Activating(ActivatingFrom::Attaching);
                 }
@@ -2151,7 +2148,7 @@ impl Tenant {
     async fn set_stopping(
         &self,
         progress: completion::Barrier,
-        allow_transition_from_loading: bool,
+        _allow_transition_from_loading: bool,
         allow_transition_from_attaching: bool,
     ) -> Result<(), SetStoppingError> {
         let mut rx = self.state.subscribe();
@@ -2166,7 +2163,6 @@ impl Tenant {
                 );
                 false
             }
-            TenantState::Loading => allow_transition_from_loading,
             TenantState::Active | TenantState::Broken { .. } | TenantState::Stopping { .. } => true,
         })
         .await
@@ -2185,13 +2181,6 @@ impl Tenant {
                 *current_state = TenantState::Stopping { progress };
                 true
             }
-            TenantState::Loading => {
-                if !allow_transition_from_loading {
-                    unreachable!("3we ensured above that we're done with activation, and, there is no re-activation")
-                };
-                *current_state = TenantState::Stopping { progress };
-                true
-            }
             TenantState::Active => {
                 // FIXME: due to time-of-check vs time-of-use issues, it can happen that new timelines
                 // are created after the transition to Stopping. That's harmless, as the Timelines
@@ -2247,7 +2236,7 @@ impl Tenant {
         // The load & attach routines own the tenant state until it has reached `Active`.
         // So, wait until it's done.
         rx.wait_for(|state| match state {
-            TenantState::Activating(_) | TenantState::Loading | TenantState::Attaching => {
+            TenantState::Activating(_) | TenantState::Attaching => {
                 info!(
                     "waiting for {} to turn Active|Broken|Stopping",
                     <&'static str>::from(state)
@@ -2267,7 +2256,7 @@ impl Tenant {
         let reason = reason.to_string();
         self.state.send_modify(|current_state| {
             match *current_state {
-                TenantState::Activating(_) | TenantState::Loading | TenantState::Attaching => {
+                TenantState::Activating(_) | TenantState::Attaching => {
                     unreachable!("we ensured above that we're done with activation, and, there is no re-activation")
                 }
                 TenantState::Active => {
@@ -2311,7 +2300,7 @@ impl Tenant {
         loop {
             let current_state = receiver.borrow_and_update().clone();
             match current_state {
-                TenantState::Loading | TenantState::Attaching | TenantState::Activating(_) => {
+                TenantState::Attaching | TenantState::Activating(_) => {
                     // in these states, there's a chance that we can reach ::Active
                     self.activate_now();
                     match timeout_cancellable(timeout, &self.cancel, receiver.changed()).await {
@@ -4144,7 +4133,7 @@ pub(crate) mod harness {
             let walredo_mgr = Arc::new(WalRedoManager::from(TestRedoManager));
 
             let tenant = Arc::new(Tenant::new(
-                TenantState::Loading,
+                TenantState::Attaching,
                 self.conf,
                 AttachedTenantConf::try_from(LocationConf::attached_single(
                     TenantConfOpt::from(self.tenant_conf.clone()),

From 938b163b42d614ecc747931e35380b27bf6e1e62 Mon Sep 17 00:00:00 2001
From: Damian972 <25445518+Damian972@users.noreply.github.com>
Date: Wed, 25 Sep 2024 00:05:23 +0200
Subject: [PATCH 141/142] chore(docker-compose): fix typo in readme (#9133)

Typo in the readme inside docker-compose folder

## Summary of changes
- Update the readme
---
 docker-compose/README.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/docker-compose/README.md b/docker-compose/README.md
index bd47805a67..648e4ca030 100644
--- a/docker-compose/README.md
+++ b/docker-compose/README.md
@@ -2,8 +2,8 @@
 # Example docker compose configuration
 
 The configuration in this directory is used for testing Neon docker images: it is
-not intended for deploying a usable system.  To run a development environment where
-you can experiment with a minature Neon system, use `cargo neon` rather than container images.
+not intended for deploying a usable system. To run a development environment where
+you can experiment with a miniature Neon system, use `cargo neon` rather than container images.
 
 This configuration does not start the storage controller, because the controller
 needs a way to reconfigure running computes, and no such thing exists in this setup.

From 5f2f31e87933be05bd93a239ddc66764ff877546 Mon Sep 17 00:00:00 2001
From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com>
Date: Tue, 24 Sep 2024 18:33:03 -0400
Subject: [PATCH 142/142] fix(test): storage scrubber should only log to stdout
 with info (#9067)

As @koivunej mentioned in the storage channel, regression tests don't
need a log file for the scrubber, and we should reduce noisy logs.

## Summary of changes

* Disable log file creation for storage scrubber
* Only log at info level

---------

Signed-off-by: Alex Chi Z <chi@neon.tech>
---
 test_runner/fixtures/neon_fixtures.py | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index 8c178ae63a..201eb1087d 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -4617,7 +4617,8 @@ class StorageScrubber:
             "REGION": s3_storage.bucket_region,
             "BUCKET": s3_storage.bucket_name,
             "BUCKET_PREFIX": s3_storage.prefix_in_bucket,
-            "RUST_LOG": "DEBUG",
+            "RUST_LOG": "INFO",
+            "PAGESERVER_DISABLE_FILE_LOGGING": "1",
         }
         env.update(s3_storage.access_env_vars())
 
@@ -4637,10 +4638,8 @@ class StorageScrubber:
         (output_path, stdout, status_code) = subprocess_capture(
             self.log_dir,
             args,
-            echo_stderr=True,
-            echo_stdout=True,
             env=env,
-            check=False,
+            check=True,
             capture_stdout=True,
             timeout=timeout,
         )