From 5559b169535b67850129173e694e5297a5a1a960 Mon Sep 17 00:00:00 2001
From: Conrad Ludgate <conrad@neon.tech>
Date: Mon, 22 Jan 2024 09:14:30 +0000
Subject: [PATCH 01/70] bump shlex (#6421)

## Problem

https://rustsec.org/advisories/RUSTSEC-2024-0006

## Summary of changes

`cargo update -p shlex`
---
 Cargo.lock | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index 75337481bb..952034a16b 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -5031,9 +5031,9 @@ dependencies = [
 
 [[package]]
 name = "shlex"
-version = "1.1.0"
+version = "1.3.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "43b2853a4d09f215c24cc5489c992ce46052d359b5109343cbafbf26bc62f8a3"
+checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64"
 
 [[package]]
 name = "signal-hook"

From efdb2bf948e43f7cde867da12ca61682b276e960 Mon Sep 17 00:00:00 2001
From: hamishc <hamishc@users.noreply.github.com>
Date: Mon, 22 Jan 2024 22:05:27 +1100
Subject: [PATCH 02/70] Added missing PG_VERSION arg into compute node
 dockerfile (#6382)

## Problem

If you build the compute-node dockerfile with the PG_VERSION argument
passed in (e.g. `docker build -f Dockerfile.compute-node --build-arg
PG_VERSION=v15 .`, it fails, as some of stages doesn't have the
PG_VERSION arg defined.

## Summary of changes

Added the PG_VERSION arg to the plv8-build, neon-pg-ext-build, and
pg-embedding-pg-build stages of Dockerfile.compute-node
---
 Dockerfile.compute-node | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/Dockerfile.compute-node b/Dockerfile.compute-node
index 908460018f..a5c1f3157d 100644
--- a/Dockerfile.compute-node
+++ b/Dockerfile.compute-node
@@ -143,6 +143,8 @@ RUN wget https://github.com/pgRouting/pgrouting/archive/v3.4.2.tar.gz -O pgrouti
 #########################################################################################
 FROM build-deps AS plv8-build
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
+
+ARG PG_VERSION
 RUN apt update && \
     apt install -y ninja-build python3-dev libncurses5 binutils clang
 
@@ -617,6 +619,7 @@ RUN wget https://github.com/theory/pg-semver/archive/refs/tags/v0.32.1.tar.gz -O
 FROM build-deps AS pg-embedding-pg-build
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
 
+ARG PG_VERSION
 ENV PATH "/usr/local/pgsql/bin/:$PATH"
 RUN case "${PG_VERSION}" in \
       "v14" | "v15") \
@@ -779,6 +782,8 @@ RUN wget https://github.com/eulerto/wal2json/archive/refs/tags/wal2json_2_5.tar.
 #
 #########################################################################################
 FROM build-deps AS neon-pg-ext-build
+ARG PG_VERSION
+
 # Public extensions
 COPY --from=postgis-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY --from=postgis-build /sfcgal/* /

From 3290fb09bfacc3fca21a9a8d6be8a0237f8671bf Mon Sep 17 00:00:00 2001
From: Anna Khanova <32508607+khanova@users.noreply.github.com>
Date: Mon, 22 Jan 2024 14:24:10 +0100
Subject: [PATCH 03/70] Proxy: fix gc (#6426)

## Problem

Gc currently doesn't work properly.

## Summary of changes

Change statement on running gc.
---
 proxy/src/cache/project_info.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/proxy/src/cache/project_info.rs b/proxy/src/cache/project_info.rs
index 7af2118873..57d9e5289d 100644
--- a/proxy/src/cache/project_info.rs
+++ b/proxy/src/cache/project_info.rs
@@ -266,7 +266,7 @@ impl ProjectInfoCacheImpl {
             tokio::time::interval(self.config.gc_interval / (self.cache.shards().len()) as u32);
         loop {
             interval.tick().await;
-            if self.cache.len() <= self.config.size {
+            if self.cache.len() < self.config.size {
                 // If there are not too many entries, wait until the next gc cycle.
                 continue;
             }

From 15c0df4de7ee71b43526d9850b47c9107efe303e Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Mon, 22 Jan 2024 15:27:29 +0100
Subject: [PATCH 04/70] fixup(#6037): actually fix the issue, #6388 failed to
 do so (#6429)

Before this patch, the select! still retured immediately if `futs` was
empty. Must have tested a stale build in my manual testing of #6388.
---
 pageserver/src/page_service.rs | 37 ++++++++++++++++++++--------------
 1 file changed, 22 insertions(+), 15 deletions(-)

diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs
index 1013131a99..77ce9981f0 100644
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
@@ -386,12 +386,18 @@ impl PageServerHandler {
 
     /// Future that completes when we need to shut down the connection.
     ///
-    /// Reasons for need to shut down are:
-    /// - any of the timelines we hold GateGuards for in `shard_timelines` is cancelled
-    /// - task_mgr requests shutdown of the connection
+    /// We currently need to shut down when any of the following happens:
+    /// 1. any of the timelines we hold GateGuards for in `shard_timelines` is cancelled
+    /// 2. task_mgr requests shutdown of the connection
     ///
-    /// The need to check for `task_mgr` cancellation arises mainly from `handle_pagerequests`
-    /// where, at first, `shard_timelines` is empty, see <https://github.com/neondatabase/neon/pull/6388>
+    /// NB on (1): the connection's lifecycle is not actually tied to any of the
+    /// `shard_timelines`s' lifecycles. But it's _necessary_ in the current
+    /// implementation to be responsive to timeline cancellation because
+    /// the connection holds their `GateGuards` open (sored in `shard_timelines`).
+    /// We currently do the easy thing and terminate the connection if any of the
+    /// shard_timelines gets cancelled. But really, we cuold spend more effort
+    /// and simply remove the cancelled timeline from the `shard_timelines`, thereby
+    /// dropping the guard.
     ///
     /// NB: keep in sync with [`Self::is_connection_cancelled`]
     async fn await_connection_cancelled(&self) {
@@ -404,16 +410,17 @@ impl PageServerHandler {
         // immutable &self).  So it's fine to evaluate shard_timelines after the sleep, we don't risk
         // missing any inserts to the map.
 
-        let mut futs = self
-            .shard_timelines
-            .values()
-            .map(|ht| ht.timeline.cancel.cancelled())
-            .collect::<FuturesUnordered<_>>();
-
-        tokio::select! {
-            _ = task_mgr::shutdown_watcher() => { }
-            _ = futs.next() => {}
-        }
+        let mut cancellation_sources = Vec::with_capacity(1 + self.shard_timelines.len());
+        use futures::future::Either;
+        cancellation_sources.push(Either::Left(task_mgr::shutdown_watcher()));
+        cancellation_sources.extend(
+            self.shard_timelines
+                .values()
+                .map(|ht| Either::Right(ht.timeline.cancel.cancelled())),
+        );
+        FuturesUnordered::from_iter(cancellation_sources)
+            .next()
+            .await;
     }
 
     /// Checking variant of [`Self::await_connection_cancelled`].

From 93572a3e99f572f51529b3fbb3b11dafa88f7f5c Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Mon, 22 Jan 2024 15:50:32 +0000
Subject: [PATCH 05/70] pageserver: mark tenant broken when cancelling attach
 (#6430)

## Problem

When a tenant is in Attaching state, and waiting for the
`concurrent_tenant_warmup` semaphore, it also listens for the tenant
cancellation token. When that token fires, Tenant::attach drops out.
Meanwhile, Tenant::set_stopping waits forever for the tenant to exit
Attaching state.

Fixes: https://github.com/neondatabase/neon/issues/6423

## Summary of changes

- In the absence of a valid state for the tenant, it is set to Broken in
this path. A more elegant solution will require more refactoring, beyond
this minimal fix.
---
 pageserver/src/tenant.rs | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index ce99569beb..1d9b91c9ce 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -716,6 +716,10 @@ impl Tenant {
                             // stayed in Activating for such a long time that shutdown found it in
                             // that state.
                             tracing::info!(state=%tenant_clone.current_state(), "Tenant shut down before activation");
+                            // Make the tenant broken so that set_stopping will not hang waiting for it to leave
+                            // the Attaching state.  This is an over-reaction (nothing really broke, the tenant is
+                            // just shutting down), but ensures progress.
+                            make_broken(&tenant_clone, anyhow::anyhow!("Shut down while Attaching"));
                             return Ok(());
                         },
                     )

From 205b6111e614ae458e4a9774873846d3d3303a25 Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Mon, 22 Jan 2024 19:27:05 +0100
Subject: [PATCH 06/70] attachment_service: /attach-hook: correctly handle
 detach (#6433)

Before this patch, we would update the `tenant_state.intent` in memory
but not persist the detachment to disk.

I noticed this in https://github.com/neondatabase/neon/pull/6214 where
we stop, then restart, the attachment service.
---
 .../attachment_service/src/persistence.rs     | 24 ++++++++++++-------
 .../attachment_service/src/reconciler.rs      |  4 ++--
 .../attachment_service/src/service.rs         |  6 +++--
 3 files changed, 22 insertions(+), 12 deletions(-)

diff --git a/control_plane/attachment_service/src/persistence.rs b/control_plane/attachment_service/src/persistence.rs
index 58708be140..db4f04d001 100644
--- a/control_plane/attachment_service/src/persistence.rs
+++ b/control_plane/attachment_service/src/persistence.rs
@@ -184,7 +184,7 @@ impl Persistence {
     pub(crate) async fn increment_generation(
         &self,
         tenant_shard_id: TenantShardId,
-        node_id: Option<NodeId>,
+        node_id: NodeId,
     ) -> anyhow::Result<Generation> {
         let (write, gen) = {
             let mut locked = self.state.lock().unwrap();
@@ -192,14 +192,9 @@ impl Persistence {
                 anyhow::bail!("Tried to increment generation of unknown shard");
             };
 
-            // If we're called with a None pageserver, we need only update the generation
-            // record to disassociate it with this pageserver, not actually increment the number, as
-            // the increment is guaranteed to happen the next time this tenant is attached.
-            if node_id.is_some() {
-                shard.generation += 1;
-            }
+            shard.generation += 1;
+            shard.generation_pageserver = Some(node_id);
 
-            shard.generation_pageserver = node_id;
             let gen = Generation::new(shard.generation);
             (locked.save(), gen)
         };
@@ -208,6 +203,19 @@ impl Persistence {
         Ok(gen)
     }
 
+    pub(crate) async fn detach(&self, tenant_shard_id: TenantShardId) -> anyhow::Result<()> {
+        let write = {
+            let mut locked = self.state.lock().unwrap();
+            let Some(shard) = locked.tenants.get_mut(&tenant_shard_id) else {
+                anyhow::bail!("Tried to increment generation of unknown shard");
+            };
+            shard.generation_pageserver = None;
+            locked.save()
+        };
+        write.commit().await?;
+        Ok(())
+    }
+
     pub(crate) async fn re_attach(
         &self,
         node_id: NodeId,
diff --git a/control_plane/attachment_service/src/reconciler.rs b/control_plane/attachment_service/src/reconciler.rs
index b08339b3b4..d7f4c0406a 100644
--- a/control_plane/attachment_service/src/reconciler.rs
+++ b/control_plane/attachment_service/src/reconciler.rs
@@ -296,7 +296,7 @@ impl Reconciler {
         // Increment generation before attaching to new pageserver
         self.generation = self
             .persistence
-            .increment_generation(self.tenant_shard_id, Some(dest_ps_id))
+            .increment_generation(self.tenant_shard_id, dest_ps_id)
             .await?;
 
         let dest_conf = build_location_config(
@@ -395,7 +395,7 @@ impl Reconciler {
                     // as locations with unknown (None) observed state.
                     self.generation = self
                         .persistence
-                        .increment_generation(self.tenant_shard_id, Some(node_id))
+                        .increment_generation(self.tenant_shard_id, node_id)
                         .await?;
                     wanted_conf.generation = self.generation.into();
                     tracing::info!("Observed configuration requires update.");
diff --git a/control_plane/attachment_service/src/service.rs b/control_plane/attachment_service/src/service.rs
index 5999d48fd9..5b44c097b1 100644
--- a/control_plane/attachment_service/src/service.rs
+++ b/control_plane/attachment_service/src/service.rs
@@ -362,13 +362,14 @@ impl Service {
             );
         }
 
-        let new_generation = if attach_req.node_id.is_some() {
+        let new_generation = if let Some(req_node_id) = attach_req.node_id {
             Some(
                 self.persistence
-                    .increment_generation(attach_req.tenant_shard_id, attach_req.node_id)
+                    .increment_generation(attach_req.tenant_shard_id, req_node_id)
                     .await?,
             )
         } else {
+            self.persistence.detach(attach_req.tenant_shard_id).await?;
             None
         };
 
@@ -407,6 +408,7 @@ impl Service {
             "attach_hook: tenant {} set generation {:?}, pageserver {}",
             attach_req.tenant_shard_id,
             tenant_state.generation,
+            // TODO: this is an odd number of 0xf's
             attach_req.node_id.unwrap_or(utils::id::NodeId(0xfffffff))
         );
 

From b41ee81308734743497acf15e2e31cb59cd8a9fd Mon Sep 17 00:00:00 2001
From: Arthur Petukhovsky <petuhovskiy@yandex.ru>
Date: Mon, 22 Jan 2024 21:38:05 +0300
Subject: [PATCH 07/70] Log warning on slow WAL removal (#6432)

Also add `safekeeper_active_timelines` metric.
Should help investigating #6403
---
 safekeeper/src/metrics.rs    | 18 +++++++++++++++++-
 safekeeper/src/remove_wal.rs | 15 +++++++++++++++
 2 files changed, 32 insertions(+), 1 deletion(-)

diff --git a/safekeeper/src/metrics.rs b/safekeeper/src/metrics.rs
index 8810252e0b..fbba2e00fc 100644
--- a/safekeeper/src/metrics.rs
+++ b/safekeeper/src/metrics.rs
@@ -110,7 +110,7 @@ pub static REMOVED_WAL_SEGMENTS: Lazy<IntCounter> = Lazy::new(|| {
 pub static BACKED_UP_SEGMENTS: Lazy<IntCounter> = Lazy::new(|| {
     register_int_counter!(
         "safekeeper_backed_up_segments_total",
-        "Number of WAL segments backed up to the broker"
+        "Number of WAL segments backed up to the S3"
     )
     .expect("Failed to register safekeeper_backed_up_segments_total counter")
 });
@@ -337,6 +337,7 @@ pub struct TimelineCollector {
     flushed_wal_seconds: GaugeVec,
     collect_timeline_metrics: Gauge,
     timelines_count: IntGauge,
+    active_timelines_count: IntGauge,
 }
 
 impl Default for TimelineCollector {
@@ -520,6 +521,13 @@ impl TimelineCollector {
         .unwrap();
         descs.extend(timelines_count.desc().into_iter().cloned());
 
+        let active_timelines_count = IntGauge::new(
+            "safekeeper_active_timelines",
+            "Total number of active timelines",
+        )
+        .unwrap();
+        descs.extend(active_timelines_count.desc().into_iter().cloned());
+
         TimelineCollector {
             descs,
             commit_lsn,
@@ -540,6 +548,7 @@ impl TimelineCollector {
             flushed_wal_seconds,
             collect_timeline_metrics,
             timelines_count,
+            active_timelines_count,
         }
     }
 }
@@ -572,6 +581,7 @@ impl Collector for TimelineCollector {
 
         let timelines = GlobalTimelines::get_all();
         let timelines_count = timelines.len();
+        let mut active_timelines_count = 0;
 
         // Prometheus Collector is sync, and data is stored under async lock. To
         // bridge the gap with a crutch, collect data in spawned thread with
@@ -590,6 +600,10 @@ impl Collector for TimelineCollector {
             let timeline_id = tli.ttid.timeline_id.to_string();
             let labels = &[tenant_id.as_str(), timeline_id.as_str()];
 
+            if tli.timeline_is_active {
+                active_timelines_count += 1;
+            }
+
             self.commit_lsn
                 .with_label_values(labels)
                 .set(tli.mem_state.commit_lsn.into());
@@ -681,6 +695,8 @@ impl Collector for TimelineCollector {
 
         // report total number of timelines
         self.timelines_count.set(timelines_count as i64);
+        self.active_timelines_count
+            .set(active_timelines_count as i64);
         mfs.extend(self.timelines_count.collect());
 
         mfs
diff --git a/safekeeper/src/remove_wal.rs b/safekeeper/src/remove_wal.rs
index d96eedf401..87c3ecaf60 100644
--- a/safekeeper/src/remove_wal.rs
+++ b/safekeeper/src/remove_wal.rs
@@ -10,11 +10,15 @@ use crate::{GlobalTimelines, SafeKeeperConf};
 pub async fn task_main(conf: SafeKeeperConf) -> anyhow::Result<()> {
     let wal_removal_interval = Duration::from_millis(5000);
     loop {
+        let now = tokio::time::Instant::now();
+        let mut active_timelines = 0;
+
         let tlis = GlobalTimelines::get_all();
         for tli in &tlis {
             if !tli.is_active().await {
                 continue;
             }
+            active_timelines += 1;
             let ttid = tli.ttid;
             async {
                 if let Err(e) = tli.maybe_persist_control_file().await {
@@ -27,6 +31,17 @@ pub async fn task_main(conf: SafeKeeperConf) -> anyhow::Result<()> {
             .instrument(info_span!("WAL removal", ttid = %ttid))
             .await;
         }
+
+        let elapsed = now.elapsed();
+        let total_timelines = tlis.len();
+
+        if elapsed > wal_removal_interval {
+            info!(
+                "WAL removal is too long, processed {} active timelines ({} total) in {:?}",
+                active_timelines, total_timelines, elapsed
+            );
+        }
+
         sleep(wal_removal_interval).await;
     }
 }

From f1901833a62c38b4e53703e52a9593e6334fc2a9 Mon Sep 17 00:00:00 2001
From: Vlad Lazar <vlad@neon.tech>
Date: Mon, 22 Jan 2024 19:16:38 +0000
Subject: [PATCH 08/70] pageserver_api: migrate keyspace related functions from
 `pgdatadir_mapping` (#6406)

The idea is to achieve separation between keyspace layout definition
and operating on said keyspace. I've inlined all these function since
they're small and we don't use LTO in the storage release builds
at the moment.

Closes https://github.com/neondatabase/neon/issues/6347
---
 libs/pageserver_api/src/key.rs      | 384 +++++++++++++++++++++++++++-
 pageserver/src/page_service.rs      |   3 +-
 pageserver/src/pgdatadir_mapping.rs | 367 +-------------------------
 pageserver/src/tenant/timeline.rs   |   2 +-
 pageserver/src/walingest.rs         |   3 +-
 pageserver/src/walredo.rs           |   3 +-
 6 files changed, 394 insertions(+), 368 deletions(-)

diff --git a/libs/pageserver_api/src/key.rs b/libs/pageserver_api/src/key.rs
index 6a3679292e..852670af2c 100644
--- a/libs/pageserver_api/src/key.rs
+++ b/libs/pageserver_api/src/key.rs
@@ -1,9 +1,11 @@
 use anyhow::{bail, Result};
 use byteorder::{ByteOrder, BE};
+use postgres_ffi::relfile_utils::{FSM_FORKNUM, VISIBILITYMAP_FORKNUM};
+use postgres_ffi::{Oid, TransactionId};
 use serde::{Deserialize, Serialize};
-use std::fmt;
+use std::{fmt, ops::Range};
 
-use crate::reltag::{BlockNumber, RelTag};
+use crate::reltag::{BlockNumber, RelTag, SlruKind};
 
 /// Key used in the Repository kv-store.
 ///
@@ -143,12 +145,390 @@ impl Key {
     }
 }
 
+// Layout of the Key address space
+//
+// The Key struct, used to address the underlying key-value store, consists of
+// 18 bytes, split into six fields. See 'Key' in repository.rs. We need to map
+// all the data and metadata keys into those 18 bytes.
+//
+// Principles for the mapping:
+//
+// - Things that are often accessed or modified together, should be close to
+//   each other in the key space. For example, if a relation is extended by one
+//   block, we create a new key-value pair for the block data, and update the
+//   relation size entry. Because of that, the RelSize key comes after all the
+//   RelBlocks of a relation: the RelSize and the last RelBlock are always next
+//   to each other.
+//
+// The key space is divided into four major sections, identified by the first
+// byte, and the form a hierarchy:
+//
+// 00 Relation data and metadata
+//
+//   DbDir    () -> (dbnode, spcnode)
+//   Filenodemap
+//   RelDir   -> relnode forknum
+//       RelBlocks
+//       RelSize
+//
+// 01 SLRUs
+//
+//   SlruDir  kind
+//   SlruSegBlocks segno
+//   SlruSegSize
+//
+// 02 pg_twophase
+//
+// 03 misc
+//    Controlfile
+//    checkpoint
+//    pg_version
+//
+// 04 aux files
+//
+// Below is a full list of the keyspace allocation:
+//
+// DbDir:
+// 00 00000000 00000000 00000000 00   00000000
+//
+// Filenodemap:
+// 00 SPCNODE  DBNODE   00000000 00   00000000
+//
+// RelDir:
+// 00 SPCNODE  DBNODE   00000000 00   00000001 (Postgres never uses relfilenode 0)
+//
+// RelBlock:
+// 00 SPCNODE  DBNODE   RELNODE  FORK BLKNUM
+//
+// RelSize:
+// 00 SPCNODE  DBNODE   RELNODE  FORK FFFFFFFF
+//
+// SlruDir:
+// 01 kind     00000000 00000000 00   00000000
+//
+// SlruSegBlock:
+// 01 kind     00000001 SEGNO    00   BLKNUM
+//
+// SlruSegSize:
+// 01 kind     00000001 SEGNO    00   FFFFFFFF
+//
+// TwoPhaseDir:
+// 02 00000000 00000000 00000000 00   00000000
+//
+// TwoPhaseFile:
+// 02 00000000 00000000 00000000 00   XID
+//
+// ControlFile:
+// 03 00000000 00000000 00000000 00   00000000
+//
+// Checkpoint:
+// 03 00000000 00000000 00000000 00   00000001
+//
+// AuxFiles:
+// 03 00000000 00000000 00000000 00   00000002
+//
+
+//-- Section 01: relation data and metadata
+
+pub const DBDIR_KEY: Key = Key {
+    field1: 0x00,
+    field2: 0,
+    field3: 0,
+    field4: 0,
+    field5: 0,
+    field6: 0,
+};
+
+#[inline(always)]
+pub fn dbdir_key_range(spcnode: Oid, dbnode: Oid) -> Range<Key> {
+    Key {
+        field1: 0x00,
+        field2: spcnode,
+        field3: dbnode,
+        field4: 0,
+        field5: 0,
+        field6: 0,
+    }..Key {
+        field1: 0x00,
+        field2: spcnode,
+        field3: dbnode,
+        field4: 0xffffffff,
+        field5: 0xff,
+        field6: 0xffffffff,
+    }
+}
+
+#[inline(always)]
+pub fn relmap_file_key(spcnode: Oid, dbnode: Oid) -> Key {
+    Key {
+        field1: 0x00,
+        field2: spcnode,
+        field3: dbnode,
+        field4: 0,
+        field5: 0,
+        field6: 0,
+    }
+}
+
+#[inline(always)]
+pub fn rel_dir_to_key(spcnode: Oid, dbnode: Oid) -> Key {
+    Key {
+        field1: 0x00,
+        field2: spcnode,
+        field3: dbnode,
+        field4: 0,
+        field5: 0,
+        field6: 1,
+    }
+}
+
+#[inline(always)]
+pub fn rel_block_to_key(rel: RelTag, blknum: BlockNumber) -> Key {
+    Key {
+        field1: 0x00,
+        field2: rel.spcnode,
+        field3: rel.dbnode,
+        field4: rel.relnode,
+        field5: rel.forknum,
+        field6: blknum,
+    }
+}
+
+#[inline(always)]
+pub fn rel_size_to_key(rel: RelTag) -> Key {
+    Key {
+        field1: 0x00,
+        field2: rel.spcnode,
+        field3: rel.dbnode,
+        field4: rel.relnode,
+        field5: rel.forknum,
+        field6: 0xffffffff,
+    }
+}
+
+#[inline(always)]
+pub fn rel_key_range(rel: RelTag) -> Range<Key> {
+    Key {
+        field1: 0x00,
+        field2: rel.spcnode,
+        field3: rel.dbnode,
+        field4: rel.relnode,
+        field5: rel.forknum,
+        field6: 0,
+    }..Key {
+        field1: 0x00,
+        field2: rel.spcnode,
+        field3: rel.dbnode,
+        field4: rel.relnode,
+        field5: rel.forknum + 1,
+        field6: 0,
+    }
+}
+
+//-- Section 02: SLRUs
+
+#[inline(always)]
+pub fn slru_dir_to_key(kind: SlruKind) -> Key {
+    Key {
+        field1: 0x01,
+        field2: match kind {
+            SlruKind::Clog => 0x00,
+            SlruKind::MultiXactMembers => 0x01,
+            SlruKind::MultiXactOffsets => 0x02,
+        },
+        field3: 0,
+        field4: 0,
+        field5: 0,
+        field6: 0,
+    }
+}
+
+#[inline(always)]
+pub fn slru_block_to_key(kind: SlruKind, segno: u32, blknum: BlockNumber) -> Key {
+    Key {
+        field1: 0x01,
+        field2: match kind {
+            SlruKind::Clog => 0x00,
+            SlruKind::MultiXactMembers => 0x01,
+            SlruKind::MultiXactOffsets => 0x02,
+        },
+        field3: 1,
+        field4: segno,
+        field5: 0,
+        field6: blknum,
+    }
+}
+
+#[inline(always)]
+pub fn slru_segment_size_to_key(kind: SlruKind, segno: u32) -> Key {
+    Key {
+        field1: 0x01,
+        field2: match kind {
+            SlruKind::Clog => 0x00,
+            SlruKind::MultiXactMembers => 0x01,
+            SlruKind::MultiXactOffsets => 0x02,
+        },
+        field3: 1,
+        field4: segno,
+        field5: 0,
+        field6: 0xffffffff,
+    }
+}
+
+#[inline(always)]
+pub fn slru_segment_key_range(kind: SlruKind, segno: u32) -> Range<Key> {
+    let field2 = match kind {
+        SlruKind::Clog => 0x00,
+        SlruKind::MultiXactMembers => 0x01,
+        SlruKind::MultiXactOffsets => 0x02,
+    };
+
+    Key {
+        field1: 0x01,
+        field2,
+        field3: 1,
+        field4: segno,
+        field5: 0,
+        field6: 0,
+    }..Key {
+        field1: 0x01,
+        field2,
+        field3: 1,
+        field4: segno,
+        field5: 1,
+        field6: 0,
+    }
+}
+
+//-- Section 03: pg_twophase
+
+pub const TWOPHASEDIR_KEY: Key = Key {
+    field1: 0x02,
+    field2: 0,
+    field3: 0,
+    field4: 0,
+    field5: 0,
+    field6: 0,
+};
+
+#[inline(always)]
+pub fn twophase_file_key(xid: TransactionId) -> Key {
+    Key {
+        field1: 0x02,
+        field2: 0,
+        field3: 0,
+        field4: 0,
+        field5: 0,
+        field6: xid,
+    }
+}
+
+#[inline(always)]
+pub fn twophase_key_range(xid: TransactionId) -> Range<Key> {
+    let (next_xid, overflowed) = xid.overflowing_add(1);
+
+    Key {
+        field1: 0x02,
+        field2: 0,
+        field3: 0,
+        field4: 0,
+        field5: 0,
+        field6: xid,
+    }..Key {
+        field1: 0x02,
+        field2: 0,
+        field3: 0,
+        field4: 0,
+        field5: u8::from(overflowed),
+        field6: next_xid,
+    }
+}
+
+//-- Section 03: Control file
+pub const CONTROLFILE_KEY: Key = Key {
+    field1: 0x03,
+    field2: 0,
+    field3: 0,
+    field4: 0,
+    field5: 0,
+    field6: 0,
+};
+
+pub const CHECKPOINT_KEY: Key = Key {
+    field1: 0x03,
+    field2: 0,
+    field3: 0,
+    field4: 0,
+    field5: 0,
+    field6: 1,
+};
+
+pub const AUX_FILES_KEY: Key = Key {
+    field1: 0x03,
+    field2: 0,
+    field3: 0,
+    field4: 0,
+    field5: 0,
+    field6: 2,
+};
+
+// Reverse mappings for a few Keys.
+// These are needed by WAL redo manager.
+
+// AUX_FILES currently stores only data for logical replication (slots etc), and
+// we don't preserve these on a branch because safekeepers can't follow timeline
+// switch (and generally it likely should be optional), so ignore these.
+#[inline(always)]
+pub fn is_inherited_key(key: Key) -> bool {
+    key != AUX_FILES_KEY
+}
+
+#[inline(always)]
+pub fn is_rel_fsm_block_key(key: Key) -> bool {
+    key.field1 == 0x00 && key.field4 != 0 && key.field5 == FSM_FORKNUM && key.field6 != 0xffffffff
+}
+
+#[inline(always)]
+pub fn is_rel_vm_block_key(key: Key) -> bool {
+    key.field1 == 0x00
+        && key.field4 != 0
+        && key.field5 == VISIBILITYMAP_FORKNUM
+        && key.field6 != 0xffffffff
+}
+
+#[inline(always)]
+pub fn key_to_slru_block(key: Key) -> anyhow::Result<(SlruKind, u32, BlockNumber)> {
+    Ok(match key.field1 {
+        0x01 => {
+            let kind = match key.field2 {
+                0x00 => SlruKind::Clog,
+                0x01 => SlruKind::MultiXactMembers,
+                0x02 => SlruKind::MultiXactOffsets,
+                _ => anyhow::bail!("unrecognized slru kind 0x{:02x}", key.field2),
+            };
+            let segno = key.field4;
+            let blknum = key.field6;
+
+            (kind, segno, blknum)
+        }
+        _ => anyhow::bail!("unexpected value kind 0x{:02x}", key.field1),
+    })
+}
+
+#[inline(always)]
+pub fn is_slru_block_key(key: Key) -> bool {
+    key.field1 == 0x01                // SLRU-related
+        && key.field3 == 0x00000001   // but not SlruDir
+        && key.field6 != 0xffffffff // and not SlruSegSize
+}
+
 #[inline(always)]
 pub fn is_rel_block_key(key: &Key) -> bool {
     key.field1 == 0x00 && key.field4 != 0 && key.field6 != 0xffffffff
 }
 
 /// Guaranteed to return `Ok()` if [[is_rel_block_key]] returns `true` for `key`.
+#[inline(always)]
 pub fn key_to_rel_block(key: Key) -> anyhow::Result<(RelTag, BlockNumber)> {
     Ok(match key.field1 {
         0x00 => (
diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs
index 77ce9981f0..d9dba137e4 100644
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
@@ -61,7 +61,7 @@ use crate::context::{DownloadBehavior, RequestContext};
 use crate::import_datadir::import_wal_from_tar;
 use crate::metrics;
 use crate::metrics::LIVE_CONNECTIONS_COUNT;
-use crate::pgdatadir_mapping::{rel_block_to_key, Version};
+use crate::pgdatadir_mapping::Version;
 use crate::task_mgr;
 use crate::task_mgr::TaskKind;
 use crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id;
@@ -75,6 +75,7 @@ use crate::tenant::PageReconstructError;
 use crate::tenant::Timeline;
 use crate::trace::Tracer;
 
+use pageserver_api::key::rel_block_to_key;
 use postgres_ffi::pg_constants::DEFAULTTABLESPACE_OID;
 use postgres_ffi::BLCKSZ;
 
diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs
index d9cc85319e..ae182b8dc6 100644
--- a/pageserver/src/pgdatadir_mapping.rs
+++ b/pageserver/src/pgdatadir_mapping.rs
@@ -13,7 +13,12 @@ use crate::repository::*;
 use crate::walrecord::NeonWalRecord;
 use anyhow::{ensure, Context};
 use bytes::{Buf, Bytes};
-use pageserver_api::key::is_rel_block_key;
+use pageserver_api::key::{
+    dbdir_key_range, is_rel_block_key, is_slru_block_key, rel_block_to_key, rel_dir_to_key,
+    rel_key_range, rel_size_to_key, relmap_file_key, slru_block_to_key, slru_dir_to_key,
+    slru_segment_key_range, slru_segment_size_to_key, twophase_file_key, twophase_key_range,
+    AUX_FILES_KEY, CHECKPOINT_KEY, CONTROLFILE_KEY, DBDIR_KEY, TWOPHASEDIR_KEY,
+};
 use pageserver_api::reltag::{BlockNumber, RelTag, SlruKind};
 use postgres_ffi::relfile_utils::{FSM_FORKNUM, VISIBILITYMAP_FORKNUM};
 use postgres_ffi::BLCKSZ;
@@ -1535,366 +1540,6 @@ struct SlruSegmentDirectory {
 
 static ZERO_PAGE: Bytes = Bytes::from_static(&[0u8; BLCKSZ as usize]);
 
-// Layout of the Key address space
-//
-// The Key struct, used to address the underlying key-value store, consists of
-// 18 bytes, split into six fields. See 'Key' in repository.rs. We need to map
-// all the data and metadata keys into those 18 bytes.
-//
-// Principles for the mapping:
-//
-// - Things that are often accessed or modified together, should be close to
-//   each other in the key space. For example, if a relation is extended by one
-//   block, we create a new key-value pair for the block data, and update the
-//   relation size entry. Because of that, the RelSize key comes after all the
-//   RelBlocks of a relation: the RelSize and the last RelBlock are always next
-//   to each other.
-//
-// The key space is divided into four major sections, identified by the first
-// byte, and the form a hierarchy:
-//
-// 00 Relation data and metadata
-//
-//   DbDir    () -> (dbnode, spcnode)
-//   Filenodemap
-//   RelDir   -> relnode forknum
-//       RelBlocks
-//       RelSize
-//
-// 01 SLRUs
-//
-//   SlruDir  kind
-//   SlruSegBlocks segno
-//   SlruSegSize
-//
-// 02 pg_twophase
-//
-// 03 misc
-//    Controlfile
-//    checkpoint
-//    pg_version
-//
-// 04 aux files
-//
-// Below is a full list of the keyspace allocation:
-//
-// DbDir:
-// 00 00000000 00000000 00000000 00   00000000
-//
-// Filenodemap:
-// 00 SPCNODE  DBNODE   00000000 00   00000000
-//
-// RelDir:
-// 00 SPCNODE  DBNODE   00000000 00   00000001 (Postgres never uses relfilenode 0)
-//
-// RelBlock:
-// 00 SPCNODE  DBNODE   RELNODE  FORK BLKNUM
-//
-// RelSize:
-// 00 SPCNODE  DBNODE   RELNODE  FORK FFFFFFFF
-//
-// SlruDir:
-// 01 kind     00000000 00000000 00   00000000
-//
-// SlruSegBlock:
-// 01 kind     00000001 SEGNO    00   BLKNUM
-//
-// SlruSegSize:
-// 01 kind     00000001 SEGNO    00   FFFFFFFF
-//
-// TwoPhaseDir:
-// 02 00000000 00000000 00000000 00   00000000
-//
-// TwoPhaseFile:
-// 02 00000000 00000000 00000000 00   XID
-//
-// ControlFile:
-// 03 00000000 00000000 00000000 00   00000000
-//
-// Checkpoint:
-// 03 00000000 00000000 00000000 00   00000001
-//
-// AuxFiles:
-// 03 00000000 00000000 00000000 00   00000002
-//
-
-//-- Section 01: relation data and metadata
-
-const DBDIR_KEY: Key = Key {
-    field1: 0x00,
-    field2: 0,
-    field3: 0,
-    field4: 0,
-    field5: 0,
-    field6: 0,
-};
-
-fn dbdir_key_range(spcnode: Oid, dbnode: Oid) -> Range<Key> {
-    Key {
-        field1: 0x00,
-        field2: spcnode,
-        field3: dbnode,
-        field4: 0,
-        field5: 0,
-        field6: 0,
-    }..Key {
-        field1: 0x00,
-        field2: spcnode,
-        field3: dbnode,
-        field4: 0xffffffff,
-        field5: 0xff,
-        field6: 0xffffffff,
-    }
-}
-
-fn relmap_file_key(spcnode: Oid, dbnode: Oid) -> Key {
-    Key {
-        field1: 0x00,
-        field2: spcnode,
-        field3: dbnode,
-        field4: 0,
-        field5: 0,
-        field6: 0,
-    }
-}
-
-fn rel_dir_to_key(spcnode: Oid, dbnode: Oid) -> Key {
-    Key {
-        field1: 0x00,
-        field2: spcnode,
-        field3: dbnode,
-        field4: 0,
-        field5: 0,
-        field6: 1,
-    }
-}
-
-pub(crate) fn rel_block_to_key(rel: RelTag, blknum: BlockNumber) -> Key {
-    Key {
-        field1: 0x00,
-        field2: rel.spcnode,
-        field3: rel.dbnode,
-        field4: rel.relnode,
-        field5: rel.forknum,
-        field6: blknum,
-    }
-}
-
-fn rel_size_to_key(rel: RelTag) -> Key {
-    Key {
-        field1: 0x00,
-        field2: rel.spcnode,
-        field3: rel.dbnode,
-        field4: rel.relnode,
-        field5: rel.forknum,
-        field6: 0xffffffff,
-    }
-}
-
-fn rel_key_range(rel: RelTag) -> Range<Key> {
-    Key {
-        field1: 0x00,
-        field2: rel.spcnode,
-        field3: rel.dbnode,
-        field4: rel.relnode,
-        field5: rel.forknum,
-        field6: 0,
-    }..Key {
-        field1: 0x00,
-        field2: rel.spcnode,
-        field3: rel.dbnode,
-        field4: rel.relnode,
-        field5: rel.forknum + 1,
-        field6: 0,
-    }
-}
-
-//-- Section 02: SLRUs
-
-fn slru_dir_to_key(kind: SlruKind) -> Key {
-    Key {
-        field1: 0x01,
-        field2: match kind {
-            SlruKind::Clog => 0x00,
-            SlruKind::MultiXactMembers => 0x01,
-            SlruKind::MultiXactOffsets => 0x02,
-        },
-        field3: 0,
-        field4: 0,
-        field5: 0,
-        field6: 0,
-    }
-}
-
-fn slru_block_to_key(kind: SlruKind, segno: u32, blknum: BlockNumber) -> Key {
-    Key {
-        field1: 0x01,
-        field2: match kind {
-            SlruKind::Clog => 0x00,
-            SlruKind::MultiXactMembers => 0x01,
-            SlruKind::MultiXactOffsets => 0x02,
-        },
-        field3: 1,
-        field4: segno,
-        field5: 0,
-        field6: blknum,
-    }
-}
-
-fn slru_segment_size_to_key(kind: SlruKind, segno: u32) -> Key {
-    Key {
-        field1: 0x01,
-        field2: match kind {
-            SlruKind::Clog => 0x00,
-            SlruKind::MultiXactMembers => 0x01,
-            SlruKind::MultiXactOffsets => 0x02,
-        },
-        field3: 1,
-        field4: segno,
-        field5: 0,
-        field6: 0xffffffff,
-    }
-}
-
-fn slru_segment_key_range(kind: SlruKind, segno: u32) -> Range<Key> {
-    let field2 = match kind {
-        SlruKind::Clog => 0x00,
-        SlruKind::MultiXactMembers => 0x01,
-        SlruKind::MultiXactOffsets => 0x02,
-    };
-
-    Key {
-        field1: 0x01,
-        field2,
-        field3: 1,
-        field4: segno,
-        field5: 0,
-        field6: 0,
-    }..Key {
-        field1: 0x01,
-        field2,
-        field3: 1,
-        field4: segno,
-        field5: 1,
-        field6: 0,
-    }
-}
-
-//-- Section 03: pg_twophase
-
-const TWOPHASEDIR_KEY: Key = Key {
-    field1: 0x02,
-    field2: 0,
-    field3: 0,
-    field4: 0,
-    field5: 0,
-    field6: 0,
-};
-
-fn twophase_file_key(xid: TransactionId) -> Key {
-    Key {
-        field1: 0x02,
-        field2: 0,
-        field3: 0,
-        field4: 0,
-        field5: 0,
-        field6: xid,
-    }
-}
-
-fn twophase_key_range(xid: TransactionId) -> Range<Key> {
-    let (next_xid, overflowed) = xid.overflowing_add(1);
-
-    Key {
-        field1: 0x02,
-        field2: 0,
-        field3: 0,
-        field4: 0,
-        field5: 0,
-        field6: xid,
-    }..Key {
-        field1: 0x02,
-        field2: 0,
-        field3: 0,
-        field4: 0,
-        field5: u8::from(overflowed),
-        field6: next_xid,
-    }
-}
-
-//-- Section 03: Control file
-const CONTROLFILE_KEY: Key = Key {
-    field1: 0x03,
-    field2: 0,
-    field3: 0,
-    field4: 0,
-    field5: 0,
-    field6: 0,
-};
-
-const CHECKPOINT_KEY: Key = Key {
-    field1: 0x03,
-    field2: 0,
-    field3: 0,
-    field4: 0,
-    field5: 0,
-    field6: 1,
-};
-
-const AUX_FILES_KEY: Key = Key {
-    field1: 0x03,
-    field2: 0,
-    field3: 0,
-    field4: 0,
-    field5: 0,
-    field6: 2,
-};
-
-// Reverse mappings for a few Keys.
-// These are needed by WAL redo manager.
-
-// AUX_FILES currently stores only data for logical replication (slots etc), and
-// we don't preserve these on a branch because safekeepers can't follow timeline
-// switch (and generally it likely should be optional), so ignore these.
-pub fn is_inherited_key(key: Key) -> bool {
-    key != AUX_FILES_KEY
-}
-
-pub fn is_rel_fsm_block_key(key: Key) -> bool {
-    key.field1 == 0x00 && key.field4 != 0 && key.field5 == FSM_FORKNUM && key.field6 != 0xffffffff
-}
-
-pub fn is_rel_vm_block_key(key: Key) -> bool {
-    key.field1 == 0x00
-        && key.field4 != 0
-        && key.field5 == VISIBILITYMAP_FORKNUM
-        && key.field6 != 0xffffffff
-}
-
-pub fn key_to_slru_block(key: Key) -> anyhow::Result<(SlruKind, u32, BlockNumber)> {
-    Ok(match key.field1 {
-        0x01 => {
-            let kind = match key.field2 {
-                0x00 => SlruKind::Clog,
-                0x01 => SlruKind::MultiXactMembers,
-                0x02 => SlruKind::MultiXactOffsets,
-                _ => anyhow::bail!("unrecognized slru kind 0x{:02x}", key.field2),
-            };
-            let segno = key.field4;
-            let blknum = key.field6;
-
-            (kind, segno, blknum)
-        }
-        _ => anyhow::bail!("unexpected value kind 0x{:02x}", key.field1),
-    })
-}
-
-fn is_slru_block_key(key: Key) -> bool {
-    key.field1 == 0x01                // SLRU-related
-        && key.field3 == 0x00000001   // but not SlruDir
-        && key.field6 != 0xffffffff // and not SlruSegSize
-}
-
 #[allow(clippy::bool_assert_comparison)]
 #[cfg(test)]
 mod tests {
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index 0cb7cf26f2..69d0d0b320 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -73,8 +73,8 @@ use crate::metrics::{
     TimelineMetrics, MATERIALIZED_PAGE_CACHE_HIT, MATERIALIZED_PAGE_CACHE_HIT_DIRECT,
 };
 use crate::pgdatadir_mapping::CalculateLogicalSizeError;
-use crate::pgdatadir_mapping::{is_inherited_key, is_rel_fsm_block_key, is_rel_vm_block_key};
 use crate::tenant::config::TenantConfOpt;
+use pageserver_api::key::{is_inherited_key, is_rel_fsm_block_key, is_rel_vm_block_key};
 use pageserver_api::reltag::RelTag;
 use pageserver_api::shard::ShardIndex;
 
diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs
index f2c35436db..3183608862 100644
--- a/pageserver/src/walingest.rs
+++ b/pageserver/src/walingest.rs
@@ -33,11 +33,12 @@ use utils::failpoint_support;
 
 use crate::context::RequestContext;
 use crate::metrics::WAL_INGEST;
-use crate::pgdatadir_mapping::*;
+use crate::pgdatadir_mapping::{DatadirModification, Version};
 use crate::tenant::PageReconstructError;
 use crate::tenant::Timeline;
 use crate::walrecord::*;
 use crate::ZERO_PAGE;
+use pageserver_api::key::rel_block_to_key;
 use pageserver_api::reltag::{BlockNumber, RelTag, SlruKind};
 use postgres_ffi::pg_constants;
 use postgres_ffi::relfile_utils::{FSM_FORKNUM, INIT_FORKNUM, MAIN_FORKNUM, VISIBILITYMAP_FORKNUM};
diff --git a/pageserver/src/walredo.rs b/pageserver/src/walredo.rs
index b4aadb2a8c..189d77d101 100644
--- a/pageserver/src/walredo.rs
+++ b/pageserver/src/walredo.rs
@@ -47,11 +47,10 @@ use crate::metrics::{
     WAL_REDO_PROCESS_LAUNCH_DURATION_HISTOGRAM, WAL_REDO_RECORDS_HISTOGRAM,
     WAL_REDO_RECORD_COUNTER, WAL_REDO_TIME,
 };
-use crate::pgdatadir_mapping::key_to_slru_block;
 use crate::repository::Key;
 use crate::walrecord::NeonWalRecord;
 
-use pageserver_api::key::key_to_rel_block;
+use pageserver_api::key::{key_to_rel_block, key_to_slru_block};
 use pageserver_api::reltag::{RelTag, SlruKind};
 use postgres_ffi::pg_constants;
 use postgres_ffi::relfile_utils::VISIBILITYMAP_FORKNUM;

From 1bf8bb88c58a8737f887215ffebd8f6606e84010 Mon Sep 17 00:00:00 2001
From: Sasha Krassovsky <krassovskysasha@gmail.com>
Date: Fri, 8 Dec 2023 13:18:35 -0800
Subject: [PATCH 09/70] Add support for migrations within compute_ctl

---
 compute_tools/src/compute.rs |  5 +++++
 compute_tools/src/spec.rs    | 30 ++++++++++++++++++++++++++++++
 2 files changed, 35 insertions(+)

diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs
index 5f5363105c..77ffe8da34 100644
--- a/compute_tools/src/compute.rs
+++ b/compute_tools/src/compute.rs
@@ -738,6 +738,7 @@ impl ComputeNode {
         handle_grants(spec, &mut client, self.connstr.as_str())?;
         handle_extensions(spec, &mut client)?;
         handle_extension_neon(&mut client)?;
+        handle_migrations(&mut client)?;
         create_availability_check_data(&mut client)?;
 
         // 'Close' connection
@@ -807,6 +808,10 @@ impl ComputeNode {
             handle_grants(&spec, &mut client, self.connstr.as_str())?;
             handle_extensions(&spec, &mut client)?;
             handle_extension_neon(&mut client)?;
+            // We can skip handle_migrations here because a new migration can only appear
+            // if we have a new version of the compute_ctl binary, which can only happen
+            // if compute got restarted, in which case we'll end up inside of apply_config
+            // instead of reconfigure.
         }
 
         // 'Close' connection
diff --git a/compute_tools/src/spec.rs b/compute_tools/src/spec.rs
index ef5f55622d..09b8b19d10 100644
--- a/compute_tools/src/spec.rs
+++ b/compute_tools/src/spec.rs
@@ -727,3 +727,33 @@ pub fn handle_extension_neon(client: &mut Client) -> Result<()> {
 
     Ok(())
 }
+
+#[instrument(skip_all)]
+pub fn handle_migrations(client: &mut Client) -> Result<()> {
+    info!("handle migrations");
+    let migrations = vec![""];
+
+    let mut query = "CREATE SCHEMA IF NOT EXISTS neon_migration AUTHORIZATION cloud_admin";
+    client.simple_query(query)?;
+
+    query = "CREATE SEQUENCE IF NOT EXISTS neon_migration.migration_id START WITH 0 INCREMENT BY 1 MINVALUE 0";
+    client.simple_query(query)?;
+
+    query = "GRANT USAGE, SELECT ON SEQUENCE neon_migration.migration_id TO cloud_admin";
+    client.simple_query(query)?;
+
+    query = "SELECT last_value FROM neon_migration.migration_id";
+    let row = client.query_one(query, &[])?;
+    let mut current_migration: usize = row.get::<&str, i64>("last_value") as usize;
+
+    while current_migration < migrations.len() {
+        client.simple_query(migrations[current_migration])?;
+        current_migration += 1;
+    }
+    let setval = format!(
+        "SELECT setval('neon_migration.migration_id', {})",
+        migrations.len()
+    );
+    client.simple_query(&setval)?;
+    Ok(())
+}

From a40ed86d874a03a5346ab85921f55d45e58c7cfb Mon Sep 17 00:00:00 2001
From: Sasha Krassovsky <krassovskysasha@gmail.com>
Date: Mon, 11 Dec 2023 12:06:29 -0800
Subject: [PATCH 10/70] Add test for migrations, add initial migration

---
 compute_tools/src/spec.rs           | 43 +++++++++++++++++++++++++++--
 pgxn/neon/control_plane_connector.c |  2 +-
 2 files changed, 42 insertions(+), 3 deletions(-)

diff --git a/compute_tools/src/spec.rs b/compute_tools/src/spec.rs
index 09b8b19d10..005e3334b0 100644
--- a/compute_tools/src/spec.rs
+++ b/compute_tools/src/spec.rs
@@ -731,9 +731,43 @@ pub fn handle_extension_neon(client: &mut Client) -> Result<()> {
 #[instrument(skip_all)]
 pub fn handle_migrations(client: &mut Client) -> Result<()> {
     info!("handle migrations");
-    let migrations = vec![""];
+    let migrations = vec![
+        "ALTER ROLE neon_superuser BYPASSRLS",
+        r#"
+DO $$
+DECLARE
+    role_name text;
+BEGIN
+    FOR role_name IN SELECT rolname FROM pg_roles WHERE pg_has_role(rolname, 'neon_superuser', 'member')
+    LOOP
+        EXECUTE 'ALTER ROLE ' || quote_ident(role_name) || ' INHERIT';
+    END LOOP;
 
-    let mut query = "CREATE SCHEMA IF NOT EXISTS neon_migration AUTHORIZATION cloud_admin";
+    FOR role_name IN SELECT rolname FROM pg_roles
+        WHERE
+            NOT pg_has_role(rolname, 'neon_superuser', 'member') AND
+            rolname NOT IN ('pg_read_all_data',
+                            'pg_write_all_data',
+                            'pg_read_all_settings',
+                            'pg_read_all_stats',
+                            'pg_stat_scan_tables',
+                            'pg_monitor',
+                            'pg_database_owner',
+                            'pg_signal_backend',
+                            'pg_read_server_files',
+                            'pg_write_server_files',
+                            'pg_execute_server_program',
+                            'pg_checkpoint',
+                            'pg_use_reserved_connections',
+                            'pg_create_subscription')
+    LOOP
+        EXECUTE 'ALTER ROLE ' || quote_ident(role_name) || ' NOBYPASSRLS';
+    END LOOP;
+END $$;
+"#,
+    ];
+
+    let mut query = "CREATE SCHEMA IF NOT EXISTS neon_migration";
     client.simple_query(query)?;
 
     query = "CREATE SEQUENCE IF NOT EXISTS neon_migration.migration_id START WITH 0 INCREMENT BY 1 MINVALUE 0";
@@ -745,6 +779,7 @@ pub fn handle_migrations(client: &mut Client) -> Result<()> {
     query = "SELECT last_value FROM neon_migration.migration_id";
     let row = client.query_one(query, &[])?;
     let mut current_migration: usize = row.get::<&str, i64>("last_value") as usize;
+    let starting_migration_id = current_migration;
 
     while current_migration < migrations.len() {
         client.simple_query(migrations[current_migration])?;
@@ -755,5 +790,9 @@ pub fn handle_migrations(client: &mut Client) -> Result<()> {
         migrations.len()
     );
     client.simple_query(&setval)?;
+    info!(
+        "Ran {} migrations",
+        (migrations.len() - starting_migration_id)
+    );
     Ok(())
 }
diff --git a/pgxn/neon/control_plane_connector.c b/pgxn/neon/control_plane_connector.c
index e467a9c43a..f6f006cba4 100644
--- a/pgxn/neon/control_plane_connector.c
+++ b/pgxn/neon/control_plane_connector.c
@@ -637,7 +637,7 @@ HandleAlterRole(AlterRoleStmt *stmt)
 	ListCell   *option;
 	const char *role_name = stmt->role->rolename;
 
-	if (RoleIsNeonSuperuser(role_name))
+	if (RoleIsNeonSuperuser(role_name) && !superuser())
 		elog(ERROR, "can't ALTER neon_superuser");
 
 	foreach(option, stmt->options)

From 3f90b2d33725beb25b26a0106b9539ffafa72537 Mon Sep 17 00:00:00 2001
From: Sasha Krassovsky <krassovskysasha@gmail.com>
Date: Tue, 12 Dec 2023 10:33:58 -0800
Subject: [PATCH 11/70] Fix test_ddl_forwarding

---
 test_runner/regress/test_ddl_forwarding.py | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/test_runner/regress/test_ddl_forwarding.py b/test_runner/regress/test_ddl_forwarding.py
index 01aeb88bca..7174487e68 100644
--- a/test_runner/regress/test_ddl_forwarding.py
+++ b/test_runner/regress/test_ddl_forwarding.py
@@ -248,8 +248,15 @@ def test_ddl_forwarding(ddl: DdlForwardingContext):
     # We don't have compute_ctl, so here, so create neon_superuser here manually
     cur.execute("CREATE ROLE neon_superuser NOLOGIN CREATEDB CREATEROLE")
 
-    with pytest.raises(psycopg2.InternalError):
-        cur.execute("ALTER ROLE neon_superuser LOGIN")
+    # Contrary to popular belief, being superman does not make you superuser
+    cur.execute("CREATE ROLE superman LOGIN NOSUPERUSER PASSWORD 'jungle_man'")
+
+    with ddl.pg.cursor(user="superman", password="jungle_man") as superman_cur:
+        # We allow real SUPERUSERs to ALTER neon_superuser
+        with pytest.raises(psycopg2.InternalError):
+            superman_cur.execute("ALTER ROLE neon_superuser LOGIN")
+
+    cur.execute("ALTER ROLE neon_superuser LOGIN")
 
     with pytest.raises(psycopg2.InternalError):
         cur.execute("CREATE DATABASE trololobus WITH OWNER neon_superuser")

From 2eac1adcb9a559351c666e703848e14ebcdc459d Mon Sep 17 00:00:00 2001
From: Sasha Krassovsky <krassovskysasha@gmail.com>
Date: Tue, 12 Dec 2023 10:43:01 -0800
Subject: [PATCH 12/70] Make clippy happy

---
 compute_tools/src/spec.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/compute_tools/src/spec.rs b/compute_tools/src/spec.rs
index 005e3334b0..bbb3d6b3a9 100644
--- a/compute_tools/src/spec.rs
+++ b/compute_tools/src/spec.rs
@@ -731,7 +731,7 @@ pub fn handle_extension_neon(client: &mut Client) -> Result<()> {
 #[instrument(skip_all)]
 pub fn handle_migrations(client: &mut Client) -> Result<()> {
     info!("handle migrations");
-    let migrations = vec![
+    let migrations = [
         "ALTER ROLE neon_superuser BYPASSRLS",
         r#"
 DO $$

From a718287902b5796e5240c516bc21e1778f92fcae Mon Sep 17 00:00:00 2001
From: Sasha Krassovsky <krassovskysasha@gmail.com>
Date: Thu, 14 Dec 2023 17:56:22 -0800
Subject: [PATCH 13/70] Make migrations happen on a separate thread

---
 compute_tools/src/compute.rs | 22 +++++++++++++++-------
 1 file changed, 15 insertions(+), 7 deletions(-)

diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs
index 77ffe8da34..b200720034 100644
--- a/compute_tools/src/compute.rs
+++ b/compute_tools/src/compute.rs
@@ -700,13 +700,14 @@ impl ComputeNode {
         // In this case we need to connect with old `zenith_admin` name
         // and create new user. We cannot simply rename connected user,
         // but we can create a new one and grant it all privileges.
-        let mut client = match Client::connect(self.connstr.as_str(), NoTls) {
+        let connstr = self.connstr.clone();
+        let mut client = match Client::connect(connstr.as_str(), NoTls) {
             Err(e) => {
                 info!(
                     "cannot connect to postgres: {}, retrying with `zenith_admin` username",
                     e
                 );
-                let mut zenith_admin_connstr = self.connstr.clone();
+                let mut zenith_admin_connstr = connstr.clone();
 
                 zenith_admin_connstr
                     .set_username("zenith_admin")
@@ -719,8 +720,8 @@ impl ComputeNode {
                 client.simple_query("GRANT zenith_admin TO cloud_admin")?;
                 drop(client);
 
-                // reconnect with connsting with expected name
-                Client::connect(self.connstr.as_str(), NoTls)?
+                // reconnect with connstring with expected name
+                Client::connect(connstr.as_str(), NoTls)?
             }
             Ok(client) => client,
         };
@@ -734,16 +735,23 @@ impl ComputeNode {
         cleanup_instance(&mut client)?;
         handle_roles(spec, &mut client)?;
         handle_databases(spec, &mut client)?;
-        handle_role_deletions(spec, self.connstr.as_str(), &mut client)?;
-        handle_grants(spec, &mut client, self.connstr.as_str())?;
+        handle_role_deletions(spec, connstr.as_str(), &mut client)?;
+        handle_grants(spec, &mut client, connstr.as_str())?;
         handle_extensions(spec, &mut client)?;
         handle_extension_neon(&mut client)?;
-        handle_migrations(&mut client)?;
         create_availability_check_data(&mut client)?;
 
         // 'Close' connection
         drop(client);
 
+        /*
+        thread::spawn(
+            move ||
+            {
+                let mut client = Client::connect(connstr.as_str(), NoTls)?;
+                handle_migrations(&mut client)
+            });
+         */
         Ok(())
     }
 

From 394ef013d0f4913d7dbc8b22c4b95ff1f772a6bf Mon Sep 17 00:00:00 2001
From: Sasha Krassovsky <krassovskysasha@gmail.com>
Date: Fri, 15 Dec 2023 12:25:34 -0800
Subject: [PATCH 14/70] Push the migrations test

---
 test_runner/regress/test_migrations.py | 27 ++++++++++++++++++++++++++
 1 file changed, 27 insertions(+)
 create mode 100644 test_runner/regress/test_migrations.py

diff --git a/test_runner/regress/test_migrations.py b/test_runner/regress/test_migrations.py
new file mode 100644
index 0000000000..ab05ef1e86
--- /dev/null
+++ b/test_runner/regress/test_migrations.py
@@ -0,0 +1,27 @@
+from fixtures.neon_fixtures import NeonEnv
+
+
+def test_migrations(neon_simple_env: NeonEnv):
+    env = neon_simple_env
+    env.neon_cli.create_branch("test_migrations", "empty")
+    endpoint = env.endpoints.create("test_migrations")
+    endpoint.respec(skip_pg_catalog_updates=False)
+    endpoint.start()
+
+    with endpoint.cursor() as cur:
+        cur.execute("SELECT last_value FROM neon_migration.migration_id")
+        migration_id = cur.fetchall()
+        assert migration_id[0][0] == 2
+
+    endpoint.stop()
+    endpoint.start()
+    with endpoint.cursor() as cur:
+        cur.execute("SELECT last_value FROM neon_migration.migration_id")
+        migration_id = cur.fetchall()
+        assert migration_id[0][0] == 2
+
+    log_path = endpoint.endpoint_path() / "compute.log"
+    with open(log_path, "r") as log_file:
+        logs = log_file.read()
+        assert "INFO start_compute:apply_config:handle_migrations: Ran 2 migrations" in logs
+        assert "INFO start_compute:apply_config:handle_migrations: Ran 0 migrations" in logs

From 11a91eaf7bb3733ac00930c4abf2f3b6160787f8 Mon Sep 17 00:00:00 2001
From: Sasha Krassovsky <krassovskysasha@gmail.com>
Date: Fri, 15 Dec 2023 14:14:18 -0800
Subject: [PATCH 15/70] Uncomment the thread

---
 compute_tools/src/compute.rs | 12 ++++--------
 1 file changed, 4 insertions(+), 8 deletions(-)

diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs
index b200720034..4ea6bf9987 100644
--- a/compute_tools/src/compute.rs
+++ b/compute_tools/src/compute.rs
@@ -744,14 +744,10 @@ impl ComputeNode {
         // 'Close' connection
         drop(client);
 
-        /*
-        thread::spawn(
-            move ||
-            {
-                let mut client = Client::connect(connstr.as_str(), NoTls)?;
-                handle_migrations(&mut client)
-            });
-         */
+        thread::spawn(move || {
+            let mut client = Client::connect(connstr.as_str(), NoTls)?;
+            handle_migrations(&mut client)
+        });
         Ok(())
     }
 

From 869acfe29bbc1e44ffd2a23512b463715dc1b285 Mon Sep 17 00:00:00 2001
From: Sasha Krassovsky <krassovskysasha@gmail.com>
Date: Wed, 3 Jan 2024 21:08:14 -0800
Subject: [PATCH 16/70] Make migrations transactional

---
 compute_tools/src/spec.rs | 41 ++++++++++++++++++++-------------------
 1 file changed, 21 insertions(+), 20 deletions(-)

diff --git a/compute_tools/src/spec.rs b/compute_tools/src/spec.rs
index bbb3d6b3a9..b61df7dd18 100644
--- a/compute_tools/src/spec.rs
+++ b/compute_tools/src/spec.rs
@@ -740,27 +740,15 @@ DECLARE
 BEGIN
     FOR role_name IN SELECT rolname FROM pg_roles WHERE pg_has_role(rolname, 'neon_superuser', 'member')
     LOOP
+        RAISE NOTICE 'EXECUTING ALTER ROLE ' || quote_ident(role_name) || ' INHERIT';
         EXECUTE 'ALTER ROLE ' || quote_ident(role_name) || ' INHERIT';
     END LOOP;
 
     FOR role_name IN SELECT rolname FROM pg_roles
         WHERE
-            NOT pg_has_role(rolname, 'neon_superuser', 'member') AND
-            rolname NOT IN ('pg_read_all_data',
-                            'pg_write_all_data',
-                            'pg_read_all_settings',
-                            'pg_read_all_stats',
-                            'pg_stat_scan_tables',
-                            'pg_monitor',
-                            'pg_database_owner',
-                            'pg_signal_backend',
-                            'pg_read_server_files',
-                            'pg_write_server_files',
-                            'pg_execute_server_program',
-                            'pg_checkpoint',
-                            'pg_use_reserved_connections',
-                            'pg_create_subscription')
+            NOT pg_has_role(rolname, 'neon_superuser', 'member') AND NOT starts_with(rolname, 'pg_')
     LOOP
+        RAISE NOTICE 'EXECUTING ALTER ROLE ' || quote_ident(role_name) || ' NOBYPASSLRS';
         EXECUTE 'ALTER ROLE ' || quote_ident(role_name) || ' NOBYPASSRLS';
     END LOOP;
 END $$;
@@ -770,26 +758,39 @@ END $$;
     let mut query = "CREATE SCHEMA IF NOT EXISTS neon_migration";
     client.simple_query(query)?;
 
-    query = "CREATE SEQUENCE IF NOT EXISTS neon_migration.migration_id START WITH 0 INCREMENT BY 1 MINVALUE 0";
+    query = "CREATE TABLE IF NOT EXISTS neon_migration.migration_id (id serial PRIMARY KEY, value integer NOT NULL DEFAULT 0)";
     client.simple_query(query)?;
 
-    query = "GRANT USAGE, SELECT ON SEQUENCE neon_migration.migration_id TO cloud_admin";
+    query = "INSERT INTO neon_migration.migration_id VALUES (0) ON CONFLICT DO NOTHING";
     client.simple_query(query)?;
 
-    query = "SELECT last_value FROM neon_migration.migration_id";
+    query = "ALTER SCHEMA neon_migration OWNER TO cloud_admin";
+    client.simple_query(query)?;
+
+    query = "REVOKE ALL ON SCHEMA neon_migration FROM PUBLIC";
+    client.simple_query(query)?;
+
+    query = "SELECT id FROM neon_migration.migration_id";
     let row = client.query_one(query, &[])?;
-    let mut current_migration: usize = row.get::<&str, i64>("last_value") as usize;
+    let mut current_migration: usize = row.get::<&str, i64>("id") as usize;
     let starting_migration_id = current_migration;
 
+    query = "BEGIN";
+    client.simple_query(query)?;
+
     while current_migration < migrations.len() {
         client.simple_query(migrations[current_migration])?;
         current_migration += 1;
     }
     let setval = format!(
-        "SELECT setval('neon_migration.migration_id', {})",
+        "UPDATE neon_migration.migration_id SET value={}",
         migrations.len()
     );
     client.simple_query(&setval)?;
+
+    query = "COMMIT";
+    client.simple_query(query)?;
+
     info!(
         "Ran {} migrations",
         (migrations.len() - starting_migration_id)

From 30064eb197e502e958e75dd25144a1412f25a2e1 Mon Sep 17 00:00:00 2001
From: Sasha Krassovsky <krassovskysasha@gmail.com>
Date: Wed, 3 Jan 2024 21:17:36 -0800
Subject: [PATCH 17/70] Add scary comment

---
 compute_tools/src/spec.rs | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/compute_tools/src/spec.rs b/compute_tools/src/spec.rs
index b61df7dd18..0bf0ea2b9a 100644
--- a/compute_tools/src/spec.rs
+++ b/compute_tools/src/spec.rs
@@ -731,6 +731,11 @@ pub fn handle_extension_neon(client: &mut Client) -> Result<()> {
 #[instrument(skip_all)]
 pub fn handle_migrations(client: &mut Client) -> Result<()> {
     info!("handle migrations");
+
+    // !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+    // !BE SURE TO ONLY ADD MIGRATIONS TO THE END OF THIS ARRAY. IF YOU DO NOT, VERY VERY BAD THINGS MAY HAPPEN!
+    // !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+
     let migrations = [
         "ALTER ROLE neon_superuser BYPASSRLS",
         r#"

From 3c3b53f8adaae86c36e5879c2dcb27be91343342 Mon Sep 17 00:00:00 2001
From: Sasha Krassovsky <krassovskysasha@gmail.com>
Date: Thu, 4 Jan 2024 11:15:53 -0800
Subject: [PATCH 18/70] Update test

---
 test_runner/regress/test_migrations.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/test_runner/regress/test_migrations.py b/test_runner/regress/test_migrations.py
index ab05ef1e86..885f355f0c 100644
--- a/test_runner/regress/test_migrations.py
+++ b/test_runner/regress/test_migrations.py
@@ -9,19 +9,19 @@ def test_migrations(neon_simple_env: NeonEnv):
     endpoint.start()
 
     with endpoint.cursor() as cur:
-        cur.execute("SELECT last_value FROM neon_migration.migration_id")
+        cur.execute("SELECT id FROM neon_migration.migration_id")
         migration_id = cur.fetchall()
-        assert migration_id[0][0] == 2
+        assert migration_id[0][0] == 1
 
     endpoint.stop()
     endpoint.start()
     with endpoint.cursor() as cur:
-        cur.execute("SELECT last_value FROM neon_migration.migration_id")
+        cur.execute("SELECT id FROM neon_migration.migration_id")
         migration_id = cur.fetchall()
-        assert migration_id[0][0] == 2
+        assert migration_id[0][0] == 1
 
     log_path = endpoint.endpoint_path() / "compute.log"
     with open(log_path, "r") as log_file:
         logs = log_file.read()
-        assert "INFO start_compute:apply_config:handle_migrations: Ran 2 migrations" in logs
+        assert "INFO start_compute:apply_config:handle_migrations: Ran 1 migrations" in logs
         assert "INFO start_compute:apply_config:handle_migrations: Ran 0 migrations" in logs

From 6d8df2579b9b23a678fb4f0c0872a61e828ed7bc Mon Sep 17 00:00:00 2001
From: Sasha Krassovsky <krassovskysasha@gmail.com>
Date: Fri, 5 Jan 2024 00:29:54 -0800
Subject: [PATCH 19/70] Fix dumb thing

---
 compute_tools/src/spec.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/compute_tools/src/spec.rs b/compute_tools/src/spec.rs
index 0bf0ea2b9a..87b9e0887e 100644
--- a/compute_tools/src/spec.rs
+++ b/compute_tools/src/spec.rs
@@ -763,7 +763,7 @@ END $$;
     let mut query = "CREATE SCHEMA IF NOT EXISTS neon_migration";
     client.simple_query(query)?;
 
-    query = "CREATE TABLE IF NOT EXISTS neon_migration.migration_id (id serial PRIMARY KEY, value integer NOT NULL DEFAULT 0)";
+    query = "CREATE TABLE IF NOT EXISTS neon_migration.migration_id (id integer NOT NULL DEFAULT 0)";
     client.simple_query(query)?;
 
     query = "INSERT INTO neon_migration.migration_id VALUES (0) ON CONFLICT DO NOTHING";

From 844303255a523e1e62bbbd36bfb816ae49800e59 Mon Sep 17 00:00:00 2001
From: Sasha Krassovsky <krassovskysasha@gmail.com>
Date: Mon, 8 Jan 2024 13:36:50 -0800
Subject: [PATCH 20/70] Cargo fmt

---
 compute_tools/src/spec.rs | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/compute_tools/src/spec.rs b/compute_tools/src/spec.rs
index 87b9e0887e..da28ff91ea 100644
--- a/compute_tools/src/spec.rs
+++ b/compute_tools/src/spec.rs
@@ -763,7 +763,8 @@ END $$;
     let mut query = "CREATE SCHEMA IF NOT EXISTS neon_migration";
     client.simple_query(query)?;
 
-    query = "CREATE TABLE IF NOT EXISTS neon_migration.migration_id (id integer NOT NULL DEFAULT 0)";
+    query =
+        "CREATE TABLE IF NOT EXISTS neon_migration.migration_id (id integer NOT NULL DEFAULT 0)";
     client.simple_query(query)?;
 
     query = "INSERT INTO neon_migration.migration_id VALUES (0) ON CONFLICT DO NOTHING";

From b2e7249979c22360afc0f4069ccd891140ac3f6e Mon Sep 17 00:00:00 2001
From: Sasha Krassovsky <krassovskysasha@gmail.com>
Date: Mon, 8 Jan 2024 14:39:51 -0800
Subject: [PATCH 21/70] Sleep

---
 test_runner/regress/test_migrations.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/test_runner/regress/test_migrations.py b/test_runner/regress/test_migrations.py
index 885f355f0c..dc6d695e94 100644
--- a/test_runner/regress/test_migrations.py
+++ b/test_runner/regress/test_migrations.py
@@ -1,3 +1,5 @@
+import time
+
 from fixtures.neon_fixtures import NeonEnv
 
 
@@ -7,6 +9,7 @@ def test_migrations(neon_simple_env: NeonEnv):
     endpoint = env.endpoints.create("test_migrations")
     endpoint.respec(skip_pg_catalog_updates=False)
     endpoint.start()
+    time.sleep(1)  # Sleep to let migrations run
 
     with endpoint.cursor() as cur:
         cur.execute("SELECT id FROM neon_migration.migration_id")
@@ -15,6 +18,7 @@ def test_migrations(neon_simple_env: NeonEnv):
 
     endpoint.stop()
     endpoint.start()
+    time.sleep(1)  # Sleep to let migrations run
     with endpoint.cursor() as cur:
         cur.execute("SELECT id FROM neon_migration.migration_id")
         migration_id = cur.fetchall()

From 65a98e425d81069029a7af4592cbbc00c4d8c4cb Mon Sep 17 00:00:00 2001
From: Sasha Krassovsky <krassovskysasha@gmail.com>
Date: Tue, 9 Jan 2024 14:40:53 -0800
Subject: [PATCH 22/70] Switch to bigint

---
 compute_tools/src/spec.rs | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/compute_tools/src/spec.rs b/compute_tools/src/spec.rs
index da28ff91ea..3f6b88897f 100644
--- a/compute_tools/src/spec.rs
+++ b/compute_tools/src/spec.rs
@@ -763,8 +763,7 @@ END $$;
     let mut query = "CREATE SCHEMA IF NOT EXISTS neon_migration";
     client.simple_query(query)?;
 
-    query =
-        "CREATE TABLE IF NOT EXISTS neon_migration.migration_id (id integer NOT NULL DEFAULT 0)";
+    query = "CREATE TABLE IF NOT EXISTS neon_migration.migration_id (id bigint NOT NULL DEFAULT 0)";
     client.simple_query(query)?;
 
     query = "INSERT INTO neon_migration.migration_id VALUES (0) ON CONFLICT DO NOTHING";

From 585687d5637324b68ef8ecbbc0a067eeb4172f83 Mon Sep 17 00:00:00 2001
From: Sasha Krassovsky <krassovskysasha@gmail.com>
Date: Tue, 9 Jan 2024 15:12:09 -0800
Subject: [PATCH 23/70] Fix syntax error

---
 compute_tools/src/spec.rs | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/compute_tools/src/spec.rs b/compute_tools/src/spec.rs
index 3f6b88897f..b3b47cff4d 100644
--- a/compute_tools/src/spec.rs
+++ b/compute_tools/src/spec.rs
@@ -745,7 +745,7 @@ DECLARE
 BEGIN
     FOR role_name IN SELECT rolname FROM pg_roles WHERE pg_has_role(rolname, 'neon_superuser', 'member')
     LOOP
-        RAISE NOTICE 'EXECUTING ALTER ROLE ' || quote_ident(role_name) || ' INHERIT';
+        RAISE NOTICE 'EXECUTING ALTER ROLE % INHERIT', quote_ident(role_name);
         EXECUTE 'ALTER ROLE ' || quote_ident(role_name) || ' INHERIT';
     END LOOP;
 
@@ -753,7 +753,7 @@ BEGIN
         WHERE
             NOT pg_has_role(rolname, 'neon_superuser', 'member') AND NOT starts_with(rolname, 'pg_')
     LOOP
-        RAISE NOTICE 'EXECUTING ALTER ROLE ' || quote_ident(role_name) || ' NOBYPASSLRS';
+        RAISE NOTICE 'EXECUTING ALTER ROLE % NOBYPASSRLS', quote_ident(role_name);
         EXECUTE 'ALTER ROLE ' || quote_ident(role_name) || ' NOBYPASSRLS';
     END LOOP;
 END $$;

From 9f186b4d3e724482bd88bea9fe04ec1450d9c8cd Mon Sep 17 00:00:00 2001
From: Sasha Krassovsky <krassovskysasha@gmail.com>
Date: Tue, 9 Jan 2024 16:01:10 -0800
Subject: [PATCH 24/70] Fix query

---
 compute_tools/src/spec.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/compute_tools/src/spec.rs b/compute_tools/src/spec.rs
index b3b47cff4d..a075edb8fc 100644
--- a/compute_tools/src/spec.rs
+++ b/compute_tools/src/spec.rs
@@ -788,7 +788,7 @@ END $$;
         current_migration += 1;
     }
     let setval = format!(
-        "UPDATE neon_migration.migration_id SET value={}",
+        "UPDATE neon_migration.migration_id SET id={}",
         migrations.len()
     );
     client.simple_query(&setval)?;

From 55aede2762134ef9d6df99b26c8c6fcd36cda2e0 Mon Sep 17 00:00:00 2001
From: Sasha Krassovsky <krassovskysasha@gmail.com>
Date: Tue, 9 Jan 2024 16:23:51 -0800
Subject: [PATCH 25/70] Prevnet duplicate insertions

---
 compute_tools/src/spec.rs | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/compute_tools/src/spec.rs b/compute_tools/src/spec.rs
index a075edb8fc..23a63c4f95 100644
--- a/compute_tools/src/spec.rs
+++ b/compute_tools/src/spec.rs
@@ -763,10 +763,10 @@ END $$;
     let mut query = "CREATE SCHEMA IF NOT EXISTS neon_migration";
     client.simple_query(query)?;
 
-    query = "CREATE TABLE IF NOT EXISTS neon_migration.migration_id (id bigint NOT NULL DEFAULT 0)";
+    query = "CREATE TABLE IF NOT EXISTS neon_migration.migration_id (key INT NOT NULL PRIMARY KEY, id bigint NOT NULL DEFAULT 0)";
     client.simple_query(query)?;
 
-    query = "INSERT INTO neon_migration.migration_id VALUES (0) ON CONFLICT DO NOTHING";
+    query = "INSERT INTO neon_migration.migration_id VALUES (0, 0) ON CONFLICT DO NOTHING";
     client.simple_query(query)?;
 
     query = "ALTER SCHEMA neon_migration OWNER TO cloud_admin";

From 27587e155d232259573e565b83aefb6d7ea1bfa0 Mon Sep 17 00:00:00 2001
From: Sasha Krassovsky <krassovskysasha@gmail.com>
Date: Tue, 9 Jan 2024 16:59:47 -0800
Subject: [PATCH 26/70] Fix test

---
 compute_tools/src/spec.rs              | 1 +
 test_runner/regress/test_migrations.py | 4 ++--
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/compute_tools/src/spec.rs b/compute_tools/src/spec.rs
index 23a63c4f95..e87dc0b732 100644
--- a/compute_tools/src/spec.rs
+++ b/compute_tools/src/spec.rs
@@ -784,6 +784,7 @@ END $$;
     client.simple_query(query)?;
 
     while current_migration < migrations.len() {
+        info!("Running migration:\n{}\n", migrations[current_migration]);
         client.simple_query(migrations[current_migration])?;
         current_migration += 1;
     }
diff --git a/test_runner/regress/test_migrations.py b/test_runner/regress/test_migrations.py
index dc6d695e94..44613db9e1 100644
--- a/test_runner/regress/test_migrations.py
+++ b/test_runner/regress/test_migrations.py
@@ -14,7 +14,7 @@ def test_migrations(neon_simple_env: NeonEnv):
     with endpoint.cursor() as cur:
         cur.execute("SELECT id FROM neon_migration.migration_id")
         migration_id = cur.fetchall()
-        assert migration_id[0][0] == 1
+        assert migration_id[0][0] == 2
 
     endpoint.stop()
     endpoint.start()
@@ -22,7 +22,7 @@ def test_migrations(neon_simple_env: NeonEnv):
     with endpoint.cursor() as cur:
         cur.execute("SELECT id FROM neon_migration.migration_id")
         migration_id = cur.fetchall()
-        assert migration_id[0][0] == 1
+        assert migration_id[0][0] == 2
 
     log_path = endpoint.endpoint_path() / "compute.log"
     with open(log_path, "r") as log_file:

From d90b2b99dfe33266305d3b10791386c78e2fab24 Mon Sep 17 00:00:00 2001
From: Sasha Krassovsky <krassovskysasha@gmail.com>
Date: Tue, 9 Jan 2024 18:26:59 -0800
Subject: [PATCH 27/70] Fix test again

---
 test_runner/regress/test_migrations.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test_runner/regress/test_migrations.py b/test_runner/regress/test_migrations.py
index 44613db9e1..fc26d6a6db 100644
--- a/test_runner/regress/test_migrations.py
+++ b/test_runner/regress/test_migrations.py
@@ -27,5 +27,5 @@ def test_migrations(neon_simple_env: NeonEnv):
     log_path = endpoint.endpoint_path() / "compute.log"
     with open(log_path, "r") as log_file:
         logs = log_file.read()
-        assert "INFO start_compute:apply_config:handle_migrations: Ran 1 migrations" in logs
+        assert "INFO start_compute:apply_config:handle_migrations: Ran 2 migrations" in logs
         assert "INFO start_compute:apply_config:handle_migrations: Ran 0 migrations" in logs

From 55bfa91bd7bb293f24a4f7781b69186ddc6b5c98 Mon Sep 17 00:00:00 2001
From: Sasha Krassovsky <krassovskysasha@gmail.com>
Date: Tue, 9 Jan 2024 20:03:53 -0800
Subject: [PATCH 28/70] Fix test again again

---
 test_runner/regress/test_migrations.py | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/test_runner/regress/test_migrations.py b/test_runner/regress/test_migrations.py
index fc26d6a6db..994d52a5d0 100644
--- a/test_runner/regress/test_migrations.py
+++ b/test_runner/regress/test_migrations.py
@@ -6,9 +6,13 @@ from fixtures.neon_fixtures import NeonEnv
 def test_migrations(neon_simple_env: NeonEnv):
     env = neon_simple_env
     env.neon_cli.create_branch("test_migrations", "empty")
+
     endpoint = env.endpoints.create("test_migrations")
+    log_path = endpoint.endpoint_path() / "compute.log"
+
     endpoint.respec(skip_pg_catalog_updates=False)
     endpoint.start()
+
     time.sleep(1)  # Sleep to let migrations run
 
     with endpoint.cursor() as cur:
@@ -16,6 +20,10 @@ def test_migrations(neon_simple_env: NeonEnv):
         migration_id = cur.fetchall()
         assert migration_id[0][0] == 2
 
+    with open(log_path, "r") as log_file:
+        logs = log_file.read()
+        assert "INFO start_compute:apply_config:handle_migrations: Ran 2 migrations" in logs
+
     endpoint.stop()
     endpoint.start()
     time.sleep(1)  # Sleep to let migrations run
@@ -24,8 +32,6 @@ def test_migrations(neon_simple_env: NeonEnv):
         migration_id = cur.fetchall()
         assert migration_id[0][0] == 2
 
-    log_path = endpoint.endpoint_path() / "compute.log"
     with open(log_path, "r") as log_file:
         logs = log_file.read()
-        assert "INFO start_compute:apply_config:handle_migrations: Ran 2 migrations" in logs
         assert "INFO start_compute:apply_config:handle_migrations: Ran 0 migrations" in logs

From 0a7e050144b008843dffc6882575c39a8a735647 Mon Sep 17 00:00:00 2001
From: Sasha Krassovsky <sasha@neon.tech>
Date: Thu, 11 Jan 2024 21:45:40 +0000
Subject: [PATCH 29/70] Fix test one last time

---
 test_runner/regress/test_migrations.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/test_runner/regress/test_migrations.py b/test_runner/regress/test_migrations.py
index 994d52a5d0..51079e243e 100644
--- a/test_runner/regress/test_migrations.py
+++ b/test_runner/regress/test_migrations.py
@@ -22,7 +22,7 @@ def test_migrations(neon_simple_env: NeonEnv):
 
     with open(log_path, "r") as log_file:
         logs = log_file.read()
-        assert "INFO start_compute:apply_config:handle_migrations: Ran 2 migrations" in logs
+        assert "INFO handle_migrations: Ran 2 migrations" in logs
 
     endpoint.stop()
     endpoint.start()
@@ -34,4 +34,4 @@ def test_migrations(neon_simple_env: NeonEnv):
 
     with open(log_path, "r") as log_file:
         logs = log_file.read()
-        assert "INFO start_compute:apply_config:handle_migrations: Ran 0 migrations" in logs
+        assert "INFO handle_migrations: Ran 0 migrations" in logs

From 71f495c7f7006495b2c7046926ccc88d4b1aa2af Mon Sep 17 00:00:00 2001
From: Sasha Krassovsky <sasha@neon.tech>
Date: Thu, 18 Jan 2024 02:30:06 +0000
Subject: [PATCH 30/70] Gate it behind feature flags

---
 compute_tools/src/compute.rs           | 10 ++++++----
 control_plane/src/endpoint.rs          | 11 +++++++++--
 libs/compute_api/src/spec.rs           |  3 +++
 test_runner/fixtures/neon_fixtures.py  |  1 +
 test_runner/regress/test_migrations.py |  2 +-
 5 files changed, 20 insertions(+), 7 deletions(-)

diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs
index 4ea6bf9987..07e0abe6ff 100644
--- a/compute_tools/src/compute.rs
+++ b/compute_tools/src/compute.rs
@@ -744,10 +744,12 @@ impl ComputeNode {
         // 'Close' connection
         drop(client);
 
-        thread::spawn(move || {
-            let mut client = Client::connect(connstr.as_str(), NoTls)?;
-            handle_migrations(&mut client)
-        });
+        if self.has_feature(ComputeFeature::Migrations) {
+            thread::spawn(move || {
+                let mut client = Client::connect(connstr.as_str(), NoTls)?;
+                handle_migrations(&mut client)
+            });
+        }
         Ok(())
     }
 
diff --git a/control_plane/src/endpoint.rs b/control_plane/src/endpoint.rs
index 43f8ea3b43..d3b0366d31 100644
--- a/control_plane/src/endpoint.rs
+++ b/control_plane/src/endpoint.rs
@@ -57,7 +57,7 @@ use crate::local_env::LocalEnv;
 use crate::postgresql_conf::PostgresConf;
 
 use compute_api::responses::{ComputeState, ComputeStatus};
-use compute_api::spec::{Cluster, ComputeMode, ComputeSpec};
+use compute_api::spec::{Cluster, ComputeFeature, ComputeMode, ComputeSpec};
 
 // contents of a endpoint.json file
 #[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)]
@@ -70,6 +70,7 @@ pub struct EndpointConf {
     http_port: u16,
     pg_version: u32,
     skip_pg_catalog_updates: bool,
+    features: Vec<ComputeFeature>,
 }
 
 //
@@ -140,6 +141,7 @@ impl ComputeControlPlane {
             // with this we basically test a case of waking up an idle compute, where
             // we also skip catalog updates in the cloud.
             skip_pg_catalog_updates: true,
+            features: vec![],
         });
 
         ep.create_endpoint_dir()?;
@@ -154,6 +156,7 @@ impl ComputeControlPlane {
                 pg_port,
                 pg_version,
                 skip_pg_catalog_updates: true,
+                features: vec![],
             })?,
         )?;
         std::fs::write(
@@ -215,6 +218,9 @@ pub struct Endpoint {
 
     // Optimizations
     skip_pg_catalog_updates: bool,
+
+    // Feature flags
+    features: Vec<ComputeFeature>,
 }
 
 impl Endpoint {
@@ -244,6 +250,7 @@ impl Endpoint {
             tenant_id: conf.tenant_id,
             pg_version: conf.pg_version,
             skip_pg_catalog_updates: conf.skip_pg_catalog_updates,
+            features: conf.features,
         })
     }
 
@@ -519,7 +526,7 @@ impl Endpoint {
             skip_pg_catalog_updates: self.skip_pg_catalog_updates,
             format_version: 1.0,
             operation_uuid: None,
-            features: vec![],
+            features: self.features.clone(),
             cluster: Cluster {
                 cluster_id: None, // project ID: not used
                 name: None,       // project name: not used
diff --git a/libs/compute_api/src/spec.rs b/libs/compute_api/src/spec.rs
index 13ac18e0c5..5361d14004 100644
--- a/libs/compute_api/src/spec.rs
+++ b/libs/compute_api/src/spec.rs
@@ -90,6 +90,9 @@ pub enum ComputeFeature {
     /// track short-lived connections as user activity.
     ActivityMonitorExperimental,
 
+    /// Enable running migrations
+    Migrations,
+
     /// This is a special feature flag that is used to represent unknown feature flags.
     /// Basically all unknown to enum flags are represented as this one. See unit test
     /// `parse_unknown_features()` for more details.
diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index d98aedf4d0..fb1925216a 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -2914,6 +2914,7 @@ class Endpoint(PgProtocol):
 
         # Write it back updated
         with open(config_path, "w") as file:
+            log.info(json.dumps(dict(data_dict, **kwargs)))
             json.dump(dict(data_dict, **kwargs), file, indent=4)
 
     # Mock the extension part of spec passed from control plane for local testing
diff --git a/test_runner/regress/test_migrations.py b/test_runner/regress/test_migrations.py
index 51079e243e..121fa91f66 100644
--- a/test_runner/regress/test_migrations.py
+++ b/test_runner/regress/test_migrations.py
@@ -10,7 +10,7 @@ def test_migrations(neon_simple_env: NeonEnv):
     endpoint = env.endpoints.create("test_migrations")
     log_path = endpoint.endpoint_path() / "compute.log"
 
-    endpoint.respec(skip_pg_catalog_updates=False)
+    endpoint.respec(skip_pg_catalog_updates=False, features=["migrations"])
     endpoint.start()
 
     time.sleep(1)  # Sleep to let migrations run

From 00d9bf5b611202c3186231d615c65e1487b617da Mon Sep 17 00:00:00 2001
From: Konstantin Knizhnik <knizhnik@garret.ru>
Date: Tue, 23 Jan 2024 07:55:05 +0200
Subject: [PATCH 31/70] Implement lockless update of pageserver_connstring GUC
 in shared memory (#6314)

## Problem

There is "neon.pageserver_connstring" GUC with PGC_SIGHUP option,
allowing to change it using
pg_reload_conf(). It is used by control plane to update pageserver
connection string if page server is crashed,
relocated or new shards are added.
It is copied to shared memory because config can not be loaded during
query execution and we need to
reestablish connection to page server.

## Summary of changes

Copying connection string to shared memory is done by postmaster. And
other backends
should check update counter to determine of connection URL is changed
and connection needs to be reestablished.
We can not use standard Postgres LW-locks, because postmaster has proc
entry and so can not wait
on this primitive. This is why lockless access algorithm is implemented
using two atomic counters to enforce
consistent reading of connection string value from shared memory.


## Checklist before requesting a review

- [ ] I have performed a self-review of my code.
- [ ] If it is a core feature, I have added thorough tests.
- [ ] Do we need to implement analytics? if so did you add the relevant
metrics to the dashboard?
- [ ] If this PR requires public announcement, mark it with
/release-notes label and add several sentences in this section.

## Checklist before merging

- [ ] Do not forget to reformat commit message to not include the above
checklist

---------

Co-authored-by: Konstantin Knizhnik <knizhnik@neon.tech>
Co-authored-by: Heikki Linnakangas <heikki@neon.tech>
---
 pgxn/neon/libpagestore.c                      | 80 ++++++++++++++-----
 .../regress/test_pageserver_reconnect.py      | 42 ++++++++++
 2 files changed, 103 insertions(+), 19 deletions(-)
 create mode 100644 test_runner/regress/test_pageserver_reconnect.py

diff --git a/pgxn/neon/libpagestore.c b/pgxn/neon/libpagestore.c
index 3a7c0f1bb6..894c9729f2 100644
--- a/pgxn/neon/libpagestore.c
+++ b/pgxn/neon/libpagestore.c
@@ -64,10 +64,26 @@ static int max_reconnect_attempts = 60;
 
 #define MAX_PAGESERVER_CONNSTRING_SIZE 256
 
+/*
+ * The "neon.pageserver_connstring" GUC is marked with the PGC_SIGHUP option,
+ * allowing it to be changed using pg_reload_conf(). The control plane can
+ * update the connection string if the pageserver crashes, is relocated, or
+ * new shards are added. A copy of the current value of the GUC is kept in
+ * shared memory, updated by the postmaster, because regular backends don't
+ * reload the config during query execution, but we might need to re-establish
+ * the pageserver connection with the new connection string even in the middle
+ * of a query.
+ *
+ * The shared memory copy is protected by a lockless algorithm using two
+ * atomic counters. The counters allow a backend to quickly check if the value
+ * has changed since last access, and to detect and retry copying the value if
+ * the postmaster changes the value concurrently. (Postmaster doesn't have a
+ * PGPROC entry and therefore cannot use LWLocks.)
+ */
 typedef struct
 {
-	LWLockId	lock;
-	pg_atomic_uint64 update_counter;
+	pg_atomic_uint64 begin_update_counter;
+	pg_atomic_uint64 end_update_counter;
 	char		pageserver_connstring[MAX_PAGESERVER_CONNSTRING_SIZE];
 } PagestoreShmemState;
 
@@ -84,7 +100,7 @@ static bool pageserver_flush(void);
 static void pageserver_disconnect(void);
 
 static bool
-PagestoreShmemIsValid()
+PagestoreShmemIsValid(void)
 {
 	return pagestore_shared && UsedShmemSegAddr;
 }
@@ -98,31 +114,58 @@ CheckPageserverConnstring(char **newval, void **extra, GucSource source)
 static void
 AssignPageserverConnstring(const char *newval, void *extra)
 {
-	if (!PagestoreShmemIsValid())
+	/*
+	 * Only postmaster updates the copy in shared memory.
+	 */
+	if (!PagestoreShmemIsValid() || IsUnderPostmaster)
 		return;
-	LWLockAcquire(pagestore_shared->lock, LW_EXCLUSIVE);
+
+	pg_atomic_add_fetch_u64(&pagestore_shared->begin_update_counter, 1);
+	pg_write_barrier();
 	strlcpy(pagestore_shared->pageserver_connstring, newval, MAX_PAGESERVER_CONNSTRING_SIZE);
-	pg_atomic_fetch_add_u64(&pagestore_shared->update_counter, 1);
-	LWLockRelease(pagestore_shared->lock);
+	pg_write_barrier();
+	pg_atomic_add_fetch_u64(&pagestore_shared->end_update_counter, 1);
 }
 
 static bool
-CheckConnstringUpdated()
+CheckConnstringUpdated(void)
 {
 	if (!PagestoreShmemIsValid())
 		return false;
-	return pagestore_local_counter < pg_atomic_read_u64(&pagestore_shared->update_counter);
+	return pagestore_local_counter < pg_atomic_read_u64(&pagestore_shared->begin_update_counter);
 }
 
 static void
-ReloadConnstring()
+ReloadConnstring(void)
 {
+	uint64		begin_update_counter;
+	uint64		end_update_counter;
+
 	if (!PagestoreShmemIsValid())
 		return;
-	LWLockAcquire(pagestore_shared->lock, LW_SHARED);
-	strlcpy(local_pageserver_connstring, pagestore_shared->pageserver_connstring, sizeof(local_pageserver_connstring));
-	pagestore_local_counter = pg_atomic_read_u64(&pagestore_shared->update_counter);
-	LWLockRelease(pagestore_shared->lock);
+
+	/*
+	 * Copy the current settnig from shared to local memory. Postmaster can
+	 * update the value concurrently, in which case we would copy a garbled
+	 * mix of the old and new values. We will detect it because the counter's
+	 * won't match, and retry. But it's important that we don't do anything
+	 * within the retry-loop that would depend on the string having valid
+	 * contents.
+	 */
+	do
+	{
+		begin_update_counter = pg_atomic_read_u64(&pagestore_shared->begin_update_counter);
+		end_update_counter = pg_atomic_read_u64(&pagestore_shared->end_update_counter);
+		pg_read_barrier();
+
+		strlcpy(local_pageserver_connstring, pagestore_shared->pageserver_connstring, sizeof(local_pageserver_connstring));
+		pg_read_barrier();
+	}
+	while (begin_update_counter != end_update_counter
+		   || begin_update_counter != pg_atomic_read_u64(&pagestore_shared->begin_update_counter)
+		   || end_update_counter != pg_atomic_read_u64(&pagestore_shared->end_update_counter));
+
+	pagestore_local_counter = end_update_counter;
 }
 
 static bool
@@ -137,7 +180,7 @@ pageserver_connect(int elevel)
 	static TimestampTz last_connect_time = 0;
 	static uint64_t delay_us = MIN_RECONNECT_INTERVAL_USEC;
 	TimestampTz now;
-        uint64_t us_since_last_connect;
+	uint64_t	us_since_last_connect;
 
 	Assert(!connected);
 
@@ -147,7 +190,7 @@ pageserver_connect(int elevel)
 	}
 
 	now = GetCurrentTimestamp();
-        us_since_last_connect = now - last_connect_time;
+	us_since_last_connect = now - last_connect_time;
 	if (us_since_last_connect < delay_us)
 	{
 		pg_usleep(delay_us - us_since_last_connect);
@@ -505,8 +548,8 @@ PagestoreShmemInit(void)
 									   &found);
 	if (!found)
 	{
-		pagestore_shared->lock = &(GetNamedLWLockTranche("neon_libpagestore")->lock);
-		pg_atomic_init_u64(&pagestore_shared->update_counter, 0);
+		pg_atomic_init_u64(&pagestore_shared->begin_update_counter, 0);
+		pg_atomic_init_u64(&pagestore_shared->end_update_counter, 0);
 		AssignPageserverConnstring(page_server_connstring, NULL);
 	}
 	LWLockRelease(AddinShmemInitLock);
@@ -531,7 +574,6 @@ pagestore_shmem_request(void)
 #endif
 
 	RequestAddinShmemSpace(PagestoreShmemSize());
-	RequestNamedLWLockTranche("neon_libpagestore", 1);
 }
 
 static void
diff --git a/test_runner/regress/test_pageserver_reconnect.py b/test_runner/regress/test_pageserver_reconnect.py
new file mode 100644
index 0000000000..aecfcdd262
--- /dev/null
+++ b/test_runner/regress/test_pageserver_reconnect.py
@@ -0,0 +1,42 @@
+import threading
+import time
+from contextlib import closing
+
+from fixtures.log_helper import log
+from fixtures.neon_fixtures import NeonEnv, PgBin
+
+
+# Test updating neon.pageserver_connstring setting on the fly.
+#
+# This merely changes some whitespace in the connection string, so
+# this doesn't prove that the new string actually takes effect. But at
+# least the code gets exercised.
+def test_pageserver_reconnect(neon_simple_env: NeonEnv, pg_bin: PgBin):
+    env = neon_simple_env
+    env.neon_cli.create_branch("test_pageserver_restarts")
+    endpoint = env.endpoints.create_start("test_pageserver_restarts")
+    n_reconnects = 1000
+    timeout = 0.01
+    scale = 10
+
+    def run_pgbench(connstr: str):
+        log.info(f"Start a pgbench workload on pg {connstr}")
+        pg_bin.run_capture(["pgbench", "-i", f"-s{scale}", connstr])
+        pg_bin.run_capture(["pgbench", f"-T{int(n_reconnects*timeout)}", connstr])
+
+    thread = threading.Thread(target=run_pgbench, args=(endpoint.connstr(),), daemon=True)
+    thread.start()
+
+    with closing(endpoint.connect()) as con:
+        with con.cursor() as c:
+            c.execute("SELECT setting FROM pg_settings WHERE name='neon.pageserver_connstring'")
+            connstring = c.fetchall()[0][0]
+            for i in range(n_reconnects):
+                time.sleep(timeout)
+                c.execute(
+                    "alter system set neon.pageserver_connstring=%s",
+                    (connstring + (" " * (i % 2)),),
+                )
+                c.execute("select pg_reload_conf()")
+
+    thread.join()

From 72de1cb511a3e4fb4a49721c2900be4b0741cc32 Mon Sep 17 00:00:00 2001
From: Conrad Ludgate <conrad@neon.tech>
Date: Tue, 23 Jan 2024 11:17:15 +0000
Subject: [PATCH 32/70] remove some duped deps (#6422)

## Problem

duplicated deps

## Summary of changes

little bit of fiddling with deps to reduce duplicates

needs consideration:
https://github.com/notify-rs/notify/blob/main/CHANGELOG.md#notify-600-2023-05-17
---
 Cargo.lock                     | 187 +++++++++++++--------------------
 Cargo.toml                     |  14 +--
 libs/utils/src/nonblock.rs     |   4 +-
 libs/utils/src/tcp_listener.rs |   3 +-
 pageserver/src/walredo.rs      |   5 +-
 workspace_hack/Cargo.toml      |   3 +-
 6 files changed, 86 insertions(+), 130 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index 952034a16b..37dba60aee 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -10,9 +10,9 @@ checksum = "8b5ace29ee3216de37c0546865ad08edef58b0f9e76838ed8959a84a990e58c5"
 
 [[package]]
 name = "addr2line"
-version = "0.19.0"
+version = "0.21.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a76fd60b23679b7d19bd066031410fb7e458ccc5e958eb5c325888ce4baedc97"
+checksum = "8a30b2e23b9e17a9f90641c7ab1549cd9b44f296d3ccbf309d2863cfe398a0cb"
 dependencies = [
  "gimli",
 ]
@@ -840,15 +840,15 @@ dependencies = [
 
 [[package]]
 name = "backtrace"
-version = "0.3.67"
+version = "0.3.69"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "233d376d6d185f2a3093e58f283f60f880315b6c60075b01f36b3b85154564ca"
+checksum = "2089b7e3f35b9dd2d0ed921ead4f6d318c27680d4a5bd167b3ee120edb105837"
 dependencies = [
  "addr2line",
  "cc",
  "cfg-if",
  "libc",
- "miniz_oxide 0.6.2",
+ "miniz_oxide",
  "object",
  "rustc-demangle",
 ]
@@ -1215,7 +1215,7 @@ dependencies = [
  "flate2",
  "futures",
  "hyper",
- "nix 0.26.2",
+ "nix 0.27.1",
  "notify",
  "num_cpus",
  "opentelemetry",
@@ -1331,7 +1331,7 @@ dependencies = [
  "git-version",
  "hex",
  "hyper",
- "nix 0.26.2",
+ "nix 0.27.1",
  "once_cell",
  "pageserver_api",
  "pageserver_client",
@@ -1872,13 +1872,13 @@ dependencies = [
 
 [[package]]
 name = "filetime"
-version = "0.2.21"
+version = "0.2.22"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5cbc844cecaee9d4443931972e1289c8ff485cb4cc2767cb03ca139ed6885153"
+checksum = "d4029edd3e734da6fe05b6cd7bd2960760a616bd2ddd0d59a0124746d6272af0"
 dependencies = [
  "cfg-if",
  "libc",
- "redox_syscall 0.2.16",
+ "redox_syscall 0.3.5",
  "windows-sys 0.48.0",
 ]
 
@@ -1895,7 +1895,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "3b9429470923de8e8cbd4d2dc513535400b4b3fef0319fb5c4e1f520a7bef743"
 dependencies = [
  "crc32fast",
- "miniz_oxide 0.7.1",
+ "miniz_oxide",
 ]
 
 [[package]]
@@ -2093,9 +2093,9 @@ dependencies = [
 
 [[package]]
 name = "gimli"
-version = "0.27.2"
+version = "0.28.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ad0a93d233ebf96623465aad4046a8d3aa4da22d4f4beba5388838c8a434bbb4"
+checksum = "4271d37baee1b8c7e4b708028c57d816cf9d2434acb33a549475f78c181f6253"
 
 [[package]]
 name = "git-version"
@@ -2748,18 +2748,18 @@ checksum = "f665ee40bc4a3c5590afb1e9677db74a508659dfd71e126420da8274909a0167"
 
 [[package]]
 name = "memoffset"
-version = "0.7.1"
+version = "0.8.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5de893c32cde5f383baa4c04c5d6dbdd735cfd4a794b0debdb2bb1b421da5ff4"
+checksum = "d61c719bcfbcf5d62b3a09efa6088de8c54bc0bfcd3ea7ae39fcc186108b8de1"
 dependencies = [
  "autocfg",
 ]
 
 [[package]]
 name = "memoffset"
-version = "0.8.0"
+version = "0.9.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d61c719bcfbcf5d62b3a09efa6088de8c54bc0bfcd3ea7ae39fcc186108b8de1"
+checksum = "5a634b1c61a95585bd15607c6ab0c4e5b226e695ff2800ba0cdccddf208c406c"
 dependencies = [
  "autocfg",
 ]
@@ -2797,15 +2797,6 @@ version = "0.2.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "68354c5c6bd36d73ff3feceb05efa59b6acb7626617f4962be322a825e61f79a"
 
-[[package]]
-name = "miniz_oxide"
-version = "0.6.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b275950c28b37e794e8c55d88aeb5e139d0ce23fdbbeda68f8d7174abdf9e8fa"
-dependencies = [
- "adler",
-]
-
 [[package]]
 name = "miniz_oxide"
 version = "0.7.1"
@@ -2865,16 +2856,14 @@ dependencies = [
 
 [[package]]
 name = "nix"
-version = "0.26.2"
+version = "0.27.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "bfdda3d196821d6af13126e40375cdf7da646a96114af134d5f417a9a1dc8e1a"
+checksum = "2eb04e9c688eff1c89d72b407f168cf79bb9e867a9d3323ed6c01519eb9cc053"
 dependencies = [
- "bitflags 1.3.2",
+ "bitflags 2.4.1",
  "cfg-if",
  "libc",
- "memoffset 0.7.1",
- "pin-utils",
- "static_assertions",
+ "memoffset 0.9.0",
 ]
 
 [[package]]
@@ -2889,20 +2878,21 @@ dependencies = [
 
 [[package]]
 name = "notify"
-version = "5.2.0"
+version = "6.1.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "729f63e1ca555a43fe3efa4f3efdf4801c479da85b432242a7b726f353c88486"
+checksum = "6205bd8bb1e454ad2e27422015fb5e4f2bcc7e08fa8f27058670d208324a4d2d"
 dependencies = [
- "bitflags 1.3.2",
+ "bitflags 2.4.1",
  "crossbeam-channel",
  "filetime",
  "fsevent-sys",
  "inotify 0.9.6",
  "kqueue",
  "libc",
+ "log",
  "mio",
  "walkdir",
- "windows-sys 0.45.0",
+ "windows-sys 0.48.0",
 ]
 
 [[package]]
@@ -3028,9 +3018,9 @@ dependencies = [
 
 [[package]]
 name = "object"
-version = "0.30.3"
+version = "0.32.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ea86265d3d3dcb6a27fc51bd29a4bf387fae9d2986b823079d4986af253eb439"
+checksum = "a6a622008b6e321afc04970976f62ee297fdbaa6f95318ca343e3eebb9648441"
 dependencies = [
  "memchr",
 ]
@@ -3102,9 +3092,9 @@ dependencies = [
 
 [[package]]
 name = "opentelemetry"
-version = "0.19.0"
+version = "0.20.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5f4b8347cc26099d3aeee044065ecc3ae11469796b4d65d065a23a584ed92a6f"
+checksum = "9591d937bc0e6d2feb6f71a559540ab300ea49955229c347a517a28d27784c54"
 dependencies = [
  "opentelemetry_api",
  "opentelemetry_sdk",
@@ -3112,9 +3102,9 @@ dependencies = [
 
 [[package]]
 name = "opentelemetry-http"
-version = "0.8.0"
+version = "0.9.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a819b71d6530c4297b49b3cae2939ab3a8cc1b9f382826a1bc29dd0ca3864906"
+checksum = "c7594ec0e11d8e33faf03530a4c49af7064ebba81c1480e01be67d90b356508b"
 dependencies = [
  "async-trait",
  "bytes",
@@ -3125,54 +3115,56 @@ dependencies = [
 
 [[package]]
 name = "opentelemetry-otlp"
-version = "0.12.0"
+version = "0.13.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8af72d59a4484654ea8eb183fea5ae4eb6a41d7ac3e3bae5f4d2a282a3a7d3ca"
+checksum = "7e5e5a5c4135864099f3faafbe939eb4d7f9b80ebf68a8448da961b32a7c1275"
 dependencies = [
  "async-trait",
- "futures",
- "futures-util",
+ "futures-core",
  "http",
- "opentelemetry",
  "opentelemetry-http",
  "opentelemetry-proto",
+ "opentelemetry-semantic-conventions",
+ "opentelemetry_api",
+ "opentelemetry_sdk",
  "prost",
  "reqwest",
  "thiserror",
+ "tokio",
+ "tonic",
 ]
 
 [[package]]
 name = "opentelemetry-proto"
-version = "0.2.0"
+version = "0.3.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "045f8eea8c0fa19f7d48e7bc3128a39c2e5c533d5c61298c548dfefc1064474c"
+checksum = "b1e3f814aa9f8c905d0ee4bde026afd3b2577a97c10e1699912e3e44f0c4cbeb"
 dependencies = [
- "futures",
- "futures-util",
- "opentelemetry",
+ "opentelemetry_api",
+ "opentelemetry_sdk",
  "prost",
- "tonic 0.8.3",
+ "tonic",
 ]
 
 [[package]]
 name = "opentelemetry-semantic-conventions"
-version = "0.11.0"
+version = "0.12.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "24e33428e6bf08c6f7fcea4ddb8e358fab0fe48ab877a87c70c6ebe20f673ce5"
+checksum = "73c9f9340ad135068800e7f1b24e9e09ed9e7143f5bf8518ded3d3ec69789269"
 dependencies = [
  "opentelemetry",
 ]
 
 [[package]]
 name = "opentelemetry_api"
-version = "0.19.0"
+version = "0.20.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ed41783a5bf567688eb38372f2b7a8530f5a607a4b49d38dd7573236c23ca7e2"
+checksum = "8a81f725323db1b1206ca3da8bb19874bbd3f57c3bcd59471bfb04525b265b9b"
 dependencies = [
- "fnv",
  "futures-channel",
  "futures-util",
  "indexmap 1.9.3",
+ "js-sys",
  "once_cell",
  "pin-project-lite",
  "thiserror",
@@ -3181,21 +3173,22 @@ dependencies = [
 
 [[package]]
 name = "opentelemetry_sdk"
-version = "0.19.0"
+version = "0.20.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8b3a2a91fdbfdd4d212c0dcc2ab540de2c2bcbbd90be17de7a7daf8822d010c1"
+checksum = "fa8e705a0612d48139799fcbaba0d4a90f06277153e43dd2bdc16c6f0edd8026"
 dependencies = [
  "async-trait",
  "crossbeam-channel",
- "dashmap",
- "fnv",
  "futures-channel",
  "futures-executor",
  "futures-util",
  "once_cell",
  "opentelemetry_api",
+ "ordered-float 3.9.2",
  "percent-encoding",
  "rand 0.8.5",
+ "regex",
+ "serde_json",
  "thiserror",
  "tokio",
  "tokio-stream",
@@ -3210,6 +3203,15 @@ dependencies = [
  "num-traits",
 ]
 
+[[package]]
+name = "ordered-float"
+version = "3.9.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f1e1c390732d15f1d48471625cd92d154e66db2c56645e29a9cd26f4699f72dc"
+dependencies = [
+ "num-traits",
+]
+
 [[package]]
 name = "ordered-multimap"
 version = "0.7.1"
@@ -3325,7 +3327,7 @@ dependencies = [
  "itertools",
  "md5",
  "metrics",
- "nix 0.26.2",
+ "nix 0.27.1",
  "num-traits",
  "num_cpus",
  "once_cell",
@@ -4339,9 +4341,9 @@ dependencies = [
 
 [[package]]
 name = "reqwest-tracing"
-version = "0.4.5"
+version = "0.4.7"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1b97ad83c2fc18113346b7158d79732242002427c30f620fa817c1f32901e0a8"
+checksum = "5a0152176687dd5cfe7f507ac1cb1a491c679cfe483afd133a7db7aaea818bb3"
 dependencies = [
  "anyhow",
  "async-trait",
@@ -5195,7 +5197,7 @@ dependencies = [
  "prost",
  "tokio",
  "tokio-stream",
- "tonic 0.9.2",
+ "tonic",
  "tonic-build",
  "tracing",
  "utils",
@@ -5415,7 +5417,7 @@ checksum = "7e54bc85fc7faa8bc175c4bab5b92ba8d9a3ce893d0e9f42cc455c8ab16a9e09"
 dependencies = [
  "byteorder",
  "integer-encoding",
- "ordered-float",
+ "ordered-float 2.10.1",
 ]
 
 [[package]]
@@ -5681,38 +5683,6 @@ dependencies = [
  "winnow",
 ]
 
-[[package]]
-name = "tonic"
-version = "0.8.3"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8f219fad3b929bef19b1f86fbc0358d35daed8f2cac972037ac0dc10bbb8d5fb"
-dependencies = [
- "async-stream",
- "async-trait",
- "axum",
- "base64 0.13.1",
- "bytes",
- "futures-core",
- "futures-util",
- "h2",
- "http",
- "http-body",
- "hyper",
- "hyper-timeout",
- "percent-encoding",
- "pin-project",
- "prost",
- "prost-derive",
- "tokio",
- "tokio-stream",
- "tokio-util",
- "tower",
- "tower-layer",
- "tower-service",
- "tracing",
- "tracing-futures",
-]
-
 [[package]]
 name = "tonic"
 version = "0.9.2"
@@ -5856,16 +5826,6 @@ dependencies = [
  "tracing-subscriber",
 ]
 
-[[package]]
-name = "tracing-futures"
-version = "0.2.5"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "97d095ae15e245a057c8e8451bab9b3ee1e1f68e9ba2b4fbc18d0ac5237835f2"
-dependencies = [
- "pin-project",
- "tracing",
-]
-
 [[package]]
 name = "tracing-log"
 version = "0.1.3"
@@ -5879,9 +5839,9 @@ dependencies = [
 
 [[package]]
 name = "tracing-opentelemetry"
-version = "0.19.0"
+version = "0.20.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "00a39dcf9bfc1742fa4d6215253b33a6e474be78275884c216fc2a06267b3600"
+checksum = "fc09e402904a5261e42cf27aea09ccb7d5318c6717a9eec3d8e2e65c56b18f19"
 dependencies = [
  "once_cell",
  "opentelemetry",
@@ -6118,7 +6078,7 @@ dependencies = [
  "hyper",
  "jsonwebtoken",
  "metrics",
- "nix 0.26.2",
+ "nix 0.27.1",
  "once_cell",
  "pin-project-lite",
  "postgres_connection",
@@ -6626,10 +6586,8 @@ dependencies = [
  "clap",
  "clap_builder",
  "crossbeam-utils",
- "dashmap",
  "either",
  "fail",
- "futures",
  "futures-channel",
  "futures-core",
  "futures-executor",
@@ -6674,6 +6632,7 @@ dependencies = [
  "tokio-util",
  "toml_datetime",
  "toml_edit",
+ "tonic",
  "tower",
  "tracing",
  "tracing-core",
diff --git a/Cargo.toml b/Cargo.toml
index 5d5d2f4a55..eefd1cb114 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -99,14 +99,14 @@ libc = "0.2"
 md5 = "0.7.0"
 memoffset = "0.8"
 native-tls = "0.2"
-nix = "0.26"
-notify = "5.0.0"
+nix = { version = "0.27", features = ["fs", "process", "socket", "signal", "poll"] }
+notify = "6.0.0"
 num_cpus = "1.15"
 num-traits = "0.2.15"
 once_cell = "1.13"
-opentelemetry = "0.19.0"
-opentelemetry-otlp = { version = "0.12.0", default_features=false, features = ["http-proto", "trace", "http", "reqwest-client"] }
-opentelemetry-semantic-conventions = "0.11.0"
+opentelemetry = "0.20.0"
+opentelemetry-otlp = { version = "0.13.0", default_features=false, features = ["http-proto", "trace", "http", "reqwest-client"] }
+opentelemetry-semantic-conventions = "0.12.0"
 parking_lot = "0.12"
 parquet = { version = "49.0.0", default-features = false, features = ["zstd"] }
 parquet_derive = "49.0.0"
@@ -118,7 +118,7 @@ rand = "0.8"
 redis = { version = "0.24.0", features = ["tokio-rustls-comp", "keep-alive"] }
 regex = "1.10.2"
 reqwest = { version = "0.11", default-features = false, features = ["rustls-tls"] }
-reqwest-tracing = { version = "0.4.0", features = ["opentelemetry_0_19"] }
+reqwest-tracing = { version = "0.4.7", features = ["opentelemetry_0_20"] }
 reqwest-middleware = "0.2.0"
 reqwest-retry = "0.2.2"
 routerify = "3"
@@ -162,7 +162,7 @@ toml_edit = "0.19"
 tonic = {version = "0.9", features = ["tls", "tls-roots"]}
 tracing = "0.1"
 tracing-error = "0.2.0"
-tracing-opentelemetry = "0.19.0"
+tracing-opentelemetry = "0.20.0"
 tracing-subscriber = { version = "0.3", default_features = false, features = ["smallvec", "fmt", "tracing-log", "std", "env-filter", "json"] }
 url = "2.2"
 uuid = { version = "1.6.1", features = ["v4", "v7", "serde"] }
diff --git a/libs/utils/src/nonblock.rs b/libs/utils/src/nonblock.rs
index 8b1fd71ae6..05e2e3af4c 100644
--- a/libs/utils/src/nonblock.rs
+++ b/libs/utils/src/nonblock.rs
@@ -5,10 +5,10 @@ use std::os::unix::io::RawFd;
 pub fn set_nonblock(fd: RawFd) -> Result<(), std::io::Error> {
     let bits = fcntl(fd, F_GETFL)?;
 
-    // Safety: If F_GETFL returns some unknown bits, they should be valid
+    // If F_GETFL returns some unknown bits, they should be valid
     // for passing back to F_SETFL, too. If we left them out, the F_SETFL
     // would effectively clear them, which is not what we want.
-    let mut flags = unsafe { OFlag::from_bits_unchecked(bits) };
+    let mut flags = OFlag::from_bits_retain(bits);
     flags |= OFlag::O_NONBLOCK;
 
     fcntl(fd, F_SETFL(flags))?;
diff --git a/libs/utils/src/tcp_listener.rs b/libs/utils/src/tcp_listener.rs
index 7666ad138c..6b35d3d63a 100644
--- a/libs/utils/src/tcp_listener.rs
+++ b/libs/utils/src/tcp_listener.rs
@@ -1,7 +1,6 @@
 use std::{
     io,
     net::{TcpListener, ToSocketAddrs},
-    os::unix::prelude::AsRawFd,
 };
 
 use nix::sys::socket::{setsockopt, sockopt::ReuseAddr};
@@ -10,7 +9,7 @@ use nix::sys::socket::{setsockopt, sockopt::ReuseAddr};
 pub fn bind<A: ToSocketAddrs>(addr: A) -> io::Result<TcpListener> {
     let listener = TcpListener::bind(addr)?;
 
-    setsockopt(listener.as_raw_fd(), ReuseAddr, &true)?;
+    setsockopt(&listener, ReuseAddr, &true)?;
 
     Ok(listener)
 }
diff --git a/pageserver/src/walredo.rs b/pageserver/src/walredo.rs
index 189d77d101..cfb8052cf1 100644
--- a/pageserver/src/walredo.rs
+++ b/pageserver/src/walredo.rs
@@ -836,9 +836,8 @@ impl WalRedoProcess {
         let mut proc = { input }; // TODO: remove this legacy rename, but this keep the patch small.
         let mut nwrite = 0usize;
 
-        let mut stdin_pollfds = [PollFd::new(proc.stdin.as_raw_fd(), PollFlags::POLLOUT)];
-
         while nwrite < writebuf.len() {
+            let mut stdin_pollfds = [PollFd::new(&proc.stdin, PollFlags::POLLOUT)];
             let n = loop {
                 match nix::poll::poll(&mut stdin_pollfds[..], wal_redo_timeout.as_millis() as i32) {
                     Err(nix::errno::Errno::EINTR) => continue,
@@ -877,7 +876,6 @@ impl WalRedoProcess {
         // advancing processed responses number.
 
         let mut output = self.stdout.lock().unwrap();
-        let mut stdout_pollfds = [PollFd::new(output.stdout.as_raw_fd(), PollFlags::POLLIN)];
         let n_processed_responses = output.n_processed_responses;
         while n_processed_responses + output.pending_responses.len() <= request_no {
             // We expect the WAL redo process to respond with an 8k page image. We read it
@@ -885,6 +883,7 @@ impl WalRedoProcess {
             let mut resultbuf = vec![0; BLCKSZ.into()];
             let mut nresult: usize = 0; // # of bytes read into 'resultbuf' so far
             while nresult < BLCKSZ.into() {
+                let mut stdout_pollfds = [PollFd::new(&output.stdout, PollFlags::POLLIN)];
                 // We do two things simultaneously: reading response from stdout
                 // and forward any logging information that the child writes to its stderr to the page server's log.
                 let n = loop {
diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml
index dbd46054a4..b72e0f3c26 100644
--- a/workspace_hack/Cargo.toml
+++ b/workspace_hack/Cargo.toml
@@ -29,10 +29,8 @@ chrono = { version = "0.4", default-features = false, features = ["clock", "serd
 clap = { version = "4", features = ["derive", "string"] }
 clap_builder = { version = "4", default-features = false, features = ["color", "help", "std", "string", "suggestions", "usage"] }
 crossbeam-utils = { version = "0.8" }
-dashmap = { version = "5", default-features = false, features = ["raw-api"] }
 either = { version = "1" }
 fail = { version = "0.5", default-features = false, features = ["failpoints"] }
-futures = { version = "0.3" }
 futures-channel = { version = "0.3", features = ["sink"] }
 futures-core = { version = "0.3" }
 futures-executor = { version = "0.3" }
@@ -74,6 +72,7 @@ tokio-rustls = { version = "0.24" }
 tokio-util = { version = "0.7", features = ["codec", "compat", "io", "rt"] }
 toml_datetime = { version = "0.6", default-features = false, features = ["serde"] }
 toml_edit = { version = "0.19", features = ["serde"] }
+tonic = { version = "0.9", features = ["tls-roots"] }
 tower = { version = "0.4", default-features = false, features = ["balance", "buffer", "limit", "log", "timeout", "util"] }
 tracing = { version = "0.1", features = ["log"] }
 tracing-core = { version = "0.1" }

From 1905f0bcedab1ad85cde80be10ebc97da7280e92 Mon Sep 17 00:00:00 2001
From: Anna Khanova <32508607+khanova@users.noreply.github.com>
Date: Tue, 23 Jan 2024 13:15:05 +0100
Subject: [PATCH 33/70] proxy: store role not found in cache (#6439)

## Problem

There are a lot of responses with 404 role not found error, which are
not getting cached in proxy.

## Summary of changes

If there was returned an empty secret but with the project_id, store it
in cache.
---
 proxy/src/auth/backend.rs          | 11 +++-----
 proxy/src/cache/project_info.rs    | 40 +++++++++++++++++++++--------
 proxy/src/console/provider.rs      |  6 ++---
 proxy/src/console/provider/mock.rs | 10 +++-----
 proxy/src/console/provider/neon.rs | 41 +++++++++++++++++-------------
 5 files changed, 64 insertions(+), 44 deletions(-)

diff --git a/proxy/src/auth/backend.rs b/proxy/src/auth/backend.rs
index 34171d4d3f..a6164f7bfb 100644
--- a/proxy/src/auth/backend.rs
+++ b/proxy/src/auth/backend.rs
@@ -202,21 +202,18 @@ async fn auth_quirks(
     if !check_peer_addr_is_in_list(&ctx.peer_addr, &allowed_ips) {
         return Err(auth::AuthError::ip_address_not_allowed());
     }
-    let maybe_secret = api.get_role_secret(ctx, &info).await?;
+    let cached_secret = api.get_role_secret(ctx, &info).await?;
 
-    let cached_secret = maybe_secret.unwrap_or_else(|| {
+    let secret = cached_secret.value.clone().unwrap_or_else(|| {
         // If we don't have an authentication secret, we mock one to
         // prevent malicious probing (possible due to missing protocol steps).
         // This mocked secret will never lead to successful authentication.
         info!("authentication info not found, mocking it");
-        Cached::new_uncached(AuthSecret::Scram(scram::ServerSecret::mock(
-            &info.user,
-            rand::random(),
-        )))
+        AuthSecret::Scram(scram::ServerSecret::mock(&info.user, rand::random()))
     });
     match authenticate_with_secret(
         ctx,
-        cached_secret.value.clone(),
+        secret,
         info,
         client,
         unauthenticated_password,
diff --git a/proxy/src/cache/project_info.rs b/proxy/src/cache/project_info.rs
index 57d9e5289d..b04208556e 100644
--- a/proxy/src/cache/project_info.rs
+++ b/proxy/src/cache/project_info.rs
@@ -44,7 +44,7 @@ impl<T> From<T> for Entry<T> {
 
 #[derive(Default)]
 struct EndpointInfo {
-    secret: std::collections::HashMap<SmolStr, Entry<AuthSecret>>,
+    secret: std::collections::HashMap<SmolStr, Entry<Option<AuthSecret>>>,
     allowed_ips: Option<Entry<Arc<Vec<SmolStr>>>>,
 }
 
@@ -60,7 +60,7 @@ impl EndpointInfo {
         role_name: &SmolStr,
         valid_since: Instant,
         ignore_cache_since: Option<Instant>,
-    ) -> Option<(AuthSecret, bool)> {
+    ) -> Option<(Option<AuthSecret>, bool)> {
         if let Some(secret) = self.secret.get(role_name) {
             if valid_since < secret.created_at {
                 return Some((
@@ -169,7 +169,7 @@ impl ProjectInfoCacheImpl {
         &self,
         endpoint_id: &SmolStr,
         role_name: &SmolStr,
-    ) -> Option<Cached<&Self, AuthSecret>> {
+    ) -> Option<Cached<&Self, Option<AuthSecret>>> {
         let (valid_since, ignore_cache_since) = self.get_cache_times();
         let endpoint_info = self.cache.get(endpoint_id)?;
         let (value, ignore_cache) =
@@ -208,7 +208,7 @@ impl ProjectInfoCacheImpl {
         project_id: &SmolStr,
         endpoint_id: &SmolStr,
         role_name: &SmolStr,
-        secret: AuthSecret,
+        secret: Option<AuthSecret>,
     ) {
         if self.cache.len() >= self.config.size {
             // If there are too many entries, wait until the next gc cycle.
@@ -364,8 +364,11 @@ mod tests {
         let endpoint_id = "endpoint".into();
         let user1: SmolStr = "user1".into();
         let user2: SmolStr = "user2".into();
-        let secret1 = AuthSecret::Scram(ServerSecret::mock(user1.as_str(), [1; 32]));
-        let secret2 = AuthSecret::Scram(ServerSecret::mock(user2.as_str(), [2; 32]));
+        let secret1 = Some(AuthSecret::Scram(ServerSecret::mock(
+            user1.as_str(),
+            [1; 32],
+        )));
+        let secret2 = None;
         let allowed_ips = Arc::new(vec!["allowed_ip1".into(), "allowed_ip2".into()]);
         cache.insert_role_secret(&project_id, &endpoint_id, &user1, secret1.clone());
         cache.insert_role_secret(&project_id, &endpoint_id, &user2, secret2.clone());
@@ -380,7 +383,10 @@ mod tests {
 
         // Shouldn't add more than 2 roles.
         let user3: SmolStr = "user3".into();
-        let secret3 = AuthSecret::Scram(ServerSecret::mock(user3.as_str(), [3; 32]));
+        let secret3 = Some(AuthSecret::Scram(ServerSecret::mock(
+            user3.as_str(),
+            [3; 32],
+        )));
         cache.insert_role_secret(&project_id, &endpoint_id, &user3, secret3.clone());
         assert!(cache.get_role_secret(&endpoint_id, &user3).is_none());
 
@@ -413,8 +419,14 @@ mod tests {
         let endpoint_id = "endpoint".into();
         let user1: SmolStr = "user1".into();
         let user2: SmolStr = "user2".into();
-        let secret1 = AuthSecret::Scram(ServerSecret::mock(user1.as_str(), [1; 32]));
-        let secret2 = AuthSecret::Scram(ServerSecret::mock(user2.as_str(), [2; 32]));
+        let secret1 = Some(AuthSecret::Scram(ServerSecret::mock(
+            user1.as_str(),
+            [1; 32],
+        )));
+        let secret2 = Some(AuthSecret::Scram(ServerSecret::mock(
+            user2.as_str(),
+            [2; 32],
+        )));
         let allowed_ips = Arc::new(vec!["allowed_ip1".into(), "allowed_ip2".into()]);
         cache.insert_role_secret(&project_id, &endpoint_id, &user1, secret1.clone());
         cache.insert_role_secret(&project_id, &endpoint_id, &user2, secret2.clone());
@@ -459,8 +471,14 @@ mod tests {
         let endpoint_id = "endpoint".into();
         let user1: SmolStr = "user1".into();
         let user2: SmolStr = "user2".into();
-        let secret1 = AuthSecret::Scram(ServerSecret::mock(user1.as_str(), [1; 32]));
-        let secret2 = AuthSecret::Scram(ServerSecret::mock(user2.as_str(), [2; 32]));
+        let secret1 = Some(AuthSecret::Scram(ServerSecret::mock(
+            user1.as_str(),
+            [1; 32],
+        )));
+        let secret2 = Some(AuthSecret::Scram(ServerSecret::mock(
+            user2.as_str(),
+            [2; 32],
+        )));
         let allowed_ips = Arc::new(vec!["allowed_ip1".into(), "allowed_ip2".into()]);
         cache.insert_role_secret(&project_id, &endpoint_id, &user1, secret1.clone());
         cache.clone().disable_ttl();
diff --git a/proxy/src/console/provider.rs b/proxy/src/console/provider.rs
index 178a7a2f4c..bbcddae86c 100644
--- a/proxy/src/console/provider.rs
+++ b/proxy/src/console/provider.rs
@@ -235,7 +235,7 @@ pub struct NodeInfo {
 
 pub type NodeInfoCache = TimedLru<SmolStr, NodeInfo>;
 pub type CachedNodeInfo = Cached<&'static NodeInfoCache>;
-pub type CachedRoleSecret = Cached<&'static ProjectInfoCacheImpl, AuthSecret>;
+pub type CachedRoleSecret = Cached<&'static ProjectInfoCacheImpl, Option<AuthSecret>>;
 pub type CachedAllowedIps = Cached<&'static ProjectInfoCacheImpl, Arc<Vec<SmolStr>>>;
 
 /// This will allocate per each call, but the http requests alone
@@ -249,7 +249,7 @@ pub trait Api {
         &self,
         ctx: &mut RequestMonitoring,
         user_info: &ComputeUserInfo,
-    ) -> Result<Option<CachedRoleSecret>, errors::GetAuthInfoError>;
+    ) -> Result<CachedRoleSecret, errors::GetAuthInfoError>;
 
     async fn get_allowed_ips(
         &self,
@@ -280,7 +280,7 @@ impl Api for ConsoleBackend {
         &self,
         ctx: &mut RequestMonitoring,
         user_info: &ComputeUserInfo,
-    ) -> Result<Option<CachedRoleSecret>, errors::GetAuthInfoError> {
+    ) -> Result<CachedRoleSecret, errors::GetAuthInfoError> {
         use ConsoleBackend::*;
         match self {
             Console(api) => api.get_role_secret(ctx, user_info).await,
diff --git a/proxy/src/console/provider/mock.rs b/proxy/src/console/provider/mock.rs
index cc35a06708..49db96a613 100644
--- a/proxy/src/console/provider/mock.rs
+++ b/proxy/src/console/provider/mock.rs
@@ -150,12 +150,10 @@ impl super::Api for Api {
         &self,
         _ctx: &mut RequestMonitoring,
         user_info: &ComputeUserInfo,
-    ) -> Result<Option<CachedRoleSecret>, GetAuthInfoError> {
-        Ok(self
-            .do_get_auth_info(user_info)
-            .await?
-            .secret
-            .map(CachedRoleSecret::new_uncached))
+    ) -> Result<CachedRoleSecret, GetAuthInfoError> {
+        Ok(CachedRoleSecret::new_uncached(
+            self.do_get_auth_info(user_info).await?.secret,
+        ))
     }
 
     async fn get_allowed_ips(
diff --git a/proxy/src/console/provider/neon.rs b/proxy/src/console/provider/neon.rs
index e8e36815c7..5cee86a9b6 100644
--- a/proxy/src/console/provider/neon.rs
+++ b/proxy/src/console/provider/neon.rs
@@ -86,9 +86,14 @@ impl Api {
                 },
             };
 
-            let secret = scram::ServerSecret::parse(&body.role_secret)
-                .map(AuthSecret::Scram)
-                .ok_or(GetAuthInfoError::BadSecret)?;
+            let secret = if body.role_secret.is_empty() {
+                None
+            } else {
+                let secret = scram::ServerSecret::parse(&body.role_secret)
+                    .map(AuthSecret::Scram)
+                    .ok_or(GetAuthInfoError::BadSecret)?;
+                Some(secret)
+            };
             let allowed_ips = body
                 .allowed_ips
                 .into_iter()
@@ -97,7 +102,7 @@ impl Api {
                 .collect_vec();
             ALLOWED_IPS_NUMBER.observe(allowed_ips.len() as f64);
             Ok(AuthInfo {
-                secret: Some(secret),
+                secret,
                 allowed_ips,
                 project_id: body.project_id.map(SmolStr::from),
             })
@@ -172,19 +177,20 @@ impl super::Api for Api {
         &self,
         ctx: &mut RequestMonitoring,
         user_info: &ComputeUserInfo,
-    ) -> Result<Option<CachedRoleSecret>, GetAuthInfoError> {
+    ) -> Result<CachedRoleSecret, GetAuthInfoError> {
         let ep = &user_info.endpoint;
         let user = &user_info.user;
         if let Some(role_secret) = self.caches.project_info.get_role_secret(ep, user) {
-            return Ok(Some(role_secret));
+            return Ok(role_secret);
         }
         let auth_info = self.do_get_auth_info(ctx, user_info).await?;
         if let Some(project_id) = auth_info.project_id {
-            if let Some(secret) = &auth_info.secret {
-                self.caches
-                    .project_info
-                    .insert_role_secret(&project_id, ep, user, secret.clone())
-            }
+            self.caches.project_info.insert_role_secret(
+                &project_id,
+                ep,
+                user,
+                auth_info.secret.clone(),
+            );
             self.caches.project_info.insert_allowed_ips(
                 &project_id,
                 ep,
@@ -192,7 +198,7 @@ impl super::Api for Api {
             );
         }
         // When we just got a secret, we don't need to invalidate it.
-        Ok(auth_info.secret.map(Cached::new_uncached))
+        Ok(Cached::new_uncached(auth_info.secret))
     }
 
     async fn get_allowed_ips(
@@ -214,11 +220,12 @@ impl super::Api for Api {
         let allowed_ips = Arc::new(auth_info.allowed_ips);
         let user = &user_info.user;
         if let Some(project_id) = auth_info.project_id {
-            if let Some(secret) = &auth_info.secret {
-                self.caches
-                    .project_info
-                    .insert_role_secret(&project_id, ep, user, secret.clone())
-            }
+            self.caches.project_info.insert_role_secret(
+                &project_id,
+                ep,
+                user,
+                auth_info.secret.clone(),
+            );
             self.caches
                 .project_info
                 .insert_allowed_ips(&project_id, ep, allowed_ips.clone());

From e03f8abba93c86b98abcbb2aedc2220c5e4e2e3b Mon Sep 17 00:00:00 2001
From: Conrad Ludgate <conrad@neon.tech>
Date: Tue, 23 Jan 2024 13:25:01 +0000
Subject: [PATCH 34/70] eager parsing of ip addr (#6446)

## Problem

Parsing the IP address at check time is a little wasteful.

## Summary of changes

Parse the IP when we get it from cplane. Adding a `None` variant to
still allow malformed patterns
---
 proxy/src/auth.rs                  |  4 +-
 proxy/src/auth/backend.rs          |  4 +-
 proxy/src/auth/credentials.rs      | 85 ++++++++++++++++++------------
 proxy/src/cache/project_info.rs    | 25 ++++++---
 proxy/src/console/messages.rs      |  4 +-
 proxy/src/console/provider.rs      |  6 +--
 proxy/src/console/provider/mock.rs | 11 ++--
 proxy/src/console/provider/neon.rs |  8 +--
 proxy/src/proxy/tests.rs           |  4 +-
 9 files changed, 90 insertions(+), 61 deletions(-)

diff --git a/proxy/src/auth.rs b/proxy/src/auth.rs
index 0707c1331f..8d1b861a66 100644
--- a/proxy/src/auth.rs
+++ b/proxy/src/auth.rs
@@ -4,7 +4,9 @@ pub mod backend;
 pub use backend::BackendType;
 
 mod credentials;
-pub use credentials::{check_peer_addr_is_in_list, endpoint_sni, ComputeUserInfoMaybeEndpoint};
+pub use credentials::{
+    check_peer_addr_is_in_list, endpoint_sni, ComputeUserInfoMaybeEndpoint, IpPattern,
+};
 
 mod password_hack;
 pub use password_hack::parse_endpoint_param;
diff --git a/proxy/src/auth/backend.rs b/proxy/src/auth/backend.rs
index a6164f7bfb..1e03510119 100644
--- a/proxy/src/auth/backend.rs
+++ b/proxy/src/auth/backend.rs
@@ -35,6 +35,8 @@ use std::sync::Arc;
 use tokio::io::{AsyncRead, AsyncWrite};
 use tracing::{error, info, warn};
 
+use super::IpPattern;
+
 /// This type serves two purposes:
 ///
 /// * When `T` is `()`, it's just a regular auth backend selector
@@ -55,7 +57,7 @@ pub enum BackendType<'a, T> {
 
 pub trait TestBackend: Send + Sync + 'static {
     fn wake_compute(&self) -> Result<CachedNodeInfo, console::errors::WakeComputeError>;
-    fn get_allowed_ips(&self) -> Result<Vec<SmolStr>, console::errors::GetAuthInfoError>;
+    fn get_allowed_ips(&self) -> Result<Vec<IpPattern>, console::errors::GetAuthInfoError>;
 }
 
 impl std::fmt::Display for BackendType<'_, ()> {
diff --git a/proxy/src/auth/credentials.rs b/proxy/src/auth/credentials.rs
index ada7f3614c..342fd6fce9 100644
--- a/proxy/src/auth/credentials.rs
+++ b/proxy/src/auth/credentials.rs
@@ -7,7 +7,7 @@ use crate::{
 use itertools::Itertools;
 use pq_proto::StartupMessageParams;
 use smol_str::SmolStr;
-use std::{collections::HashSet, net::IpAddr};
+use std::{collections::HashSet, net::IpAddr, str::FromStr};
 use thiserror::Error;
 use tracing::{info, warn};
 
@@ -151,30 +151,51 @@ impl ComputeUserInfoMaybeEndpoint {
     }
 }
 
-pub fn check_peer_addr_is_in_list(peer_addr: &IpAddr, ip_list: &Vec<SmolStr>) -> bool {
-    if ip_list.is_empty() {
-        return true;
-    }
-    for ip in ip_list {
-        // We expect that all ip addresses from control plane are correct.
-        // However, if some of them are broken, we still can check the others.
-        match parse_ip_pattern(ip) {
-            Ok(pattern) => {
-                if check_ip(peer_addr, &pattern) {
-                    return true;
-                }
-            }
-            Err(err) => warn!("Cannot parse ip: {}; err: {}", ip, err),
-        }
-    }
-    false
+pub fn check_peer_addr_is_in_list(peer_addr: &IpAddr, ip_list: &[IpPattern]) -> bool {
+    ip_list.is_empty() || ip_list.iter().any(|pattern| check_ip(peer_addr, pattern))
 }
 
 #[derive(Debug, Clone, Eq, PartialEq)]
-enum IpPattern {
+pub enum IpPattern {
     Subnet(ipnet::IpNet),
     Range(IpAddr, IpAddr),
     Single(IpAddr),
+    None,
+}
+
+impl<'de> serde::de::Deserialize<'de> for IpPattern {
+    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
+    where
+        D: serde::Deserializer<'de>,
+    {
+        struct StrVisitor;
+        impl<'de> serde::de::Visitor<'de> for StrVisitor {
+            type Value = IpPattern;
+
+            fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result {
+                write!(formatter, "comma separated list with ip address, ip address range, or ip address subnet mask")
+            }
+
+            fn visit_str<E>(self, v: &str) -> Result<Self::Value, E>
+            where
+                E: serde::de::Error,
+            {
+                Ok(parse_ip_pattern(v).unwrap_or_else(|e| {
+                    warn!("Cannot parse ip pattern {v}: {e}");
+                    IpPattern::None
+                }))
+            }
+        }
+        deserializer.deserialize_str(StrVisitor)
+    }
+}
+
+impl FromStr for IpPattern {
+    type Err = anyhow::Error;
+
+    fn from_str(s: &str) -> Result<Self, Self::Err> {
+        parse_ip_pattern(s)
+    }
 }
 
 fn parse_ip_pattern(pattern: &str) -> anyhow::Result<IpPattern> {
@@ -196,6 +217,7 @@ fn check_ip(ip: &IpAddr, pattern: &IpPattern) -> bool {
         IpPattern::Subnet(subnet) => subnet.contains(ip),
         IpPattern::Range(start, end) => start <= ip && ip <= end,
         IpPattern::Single(addr) => addr == ip,
+        IpPattern::None => false,
     }
 }
 
@@ -206,6 +228,7 @@ fn project_name_valid(name: &str) -> bool {
 #[cfg(test)]
 mod tests {
     use super::*;
+    use serde_json::json;
     use ComputeUserInfoParseError::*;
 
     #[test]
@@ -415,21 +438,17 @@ mod tests {
 
     #[test]
     fn test_check_peer_addr_is_in_list() {
-        let peer_addr = IpAddr::from([127, 0, 0, 1]);
-        assert!(check_peer_addr_is_in_list(&peer_addr, &vec![]));
-        assert!(check_peer_addr_is_in_list(
-            &peer_addr,
-            &vec!["127.0.0.1".into()]
-        ));
-        assert!(!check_peer_addr_is_in_list(
-            &peer_addr,
-            &vec!["8.8.8.8".into()]
-        ));
+        fn check(v: serde_json::Value) -> bool {
+            let peer_addr = IpAddr::from([127, 0, 0, 1]);
+            let ip_list: Vec<IpPattern> = serde_json::from_value(v).unwrap();
+            check_peer_addr_is_in_list(&peer_addr, &ip_list)
+        }
+
+        assert!(check(json!([])));
+        assert!(check(json!(["127.0.0.1"])));
+        assert!(!check(json!(["8.8.8.8"])));
         // If there is an incorrect address, it will be skipped.
-        assert!(check_peer_addr_is_in_list(
-            &peer_addr,
-            &vec!["88.8.8".into(), "127.0.0.1".into()]
-        ));
+        assert!(check(json!(["88.8.8", "127.0.0.1"])));
     }
     #[test]
     fn test_parse_ip_v4() -> anyhow::Result<()> {
diff --git a/proxy/src/cache/project_info.rs b/proxy/src/cache/project_info.rs
index b04208556e..fa3d5d0e31 100644
--- a/proxy/src/cache/project_info.rs
+++ b/proxy/src/cache/project_info.rs
@@ -11,7 +11,7 @@ use smol_str::SmolStr;
 use tokio::time::Instant;
 use tracing::{debug, info};
 
-use crate::{config::ProjectInfoCacheOptions, console::AuthSecret};
+use crate::{auth::IpPattern, config::ProjectInfoCacheOptions, console::AuthSecret};
 
 use super::{Cache, Cached};
 
@@ -45,7 +45,7 @@ impl<T> From<T> for Entry<T> {
 #[derive(Default)]
 struct EndpointInfo {
     secret: std::collections::HashMap<SmolStr, Entry<Option<AuthSecret>>>,
-    allowed_ips: Option<Entry<Arc<Vec<SmolStr>>>>,
+    allowed_ips: Option<Entry<Arc<Vec<IpPattern>>>>,
 }
 
 impl EndpointInfo {
@@ -76,7 +76,7 @@ impl EndpointInfo {
         &self,
         valid_since: Instant,
         ignore_cache_since: Option<Instant>,
-    ) -> Option<(Arc<Vec<SmolStr>>, bool)> {
+    ) -> Option<(Arc<Vec<IpPattern>>, bool)> {
         if let Some(allowed_ips) = &self.allowed_ips {
             if valid_since < allowed_ips.created_at {
                 return Some((
@@ -189,7 +189,7 @@ impl ProjectInfoCacheImpl {
     pub fn get_allowed_ips(
         &self,
         endpoint_id: &SmolStr,
-    ) -> Option<Cached<&Self, Arc<Vec<SmolStr>>>> {
+    ) -> Option<Cached<&Self, Arc<Vec<IpPattern>>>> {
         let (valid_since, ignore_cache_since) = self.get_cache_times();
         let endpoint_info = self.cache.get(endpoint_id)?;
         let value = endpoint_info.get_allowed_ips(valid_since, ignore_cache_since);
@@ -224,7 +224,7 @@ impl ProjectInfoCacheImpl {
         &self,
         project_id: &SmolStr,
         endpoint_id: &SmolStr,
-        allowed_ips: Arc<Vec<SmolStr>>,
+        allowed_ips: Arc<Vec<IpPattern>>,
     ) {
         if self.cache.len() >= self.config.size {
             // If there are too many entries, wait until the next gc cycle.
@@ -369,7 +369,10 @@ mod tests {
             [1; 32],
         )));
         let secret2 = None;
-        let allowed_ips = Arc::new(vec!["allowed_ip1".into(), "allowed_ip2".into()]);
+        let allowed_ips = Arc::new(vec![
+            "127.0.0.1".parse().unwrap(),
+            "127.0.0.2".parse().unwrap(),
+        ]);
         cache.insert_role_secret(&project_id, &endpoint_id, &user1, secret1.clone());
         cache.insert_role_secret(&project_id, &endpoint_id, &user2, secret2.clone());
         cache.insert_allowed_ips(&project_id, &endpoint_id, allowed_ips.clone());
@@ -427,7 +430,10 @@ mod tests {
             user2.as_str(),
             [2; 32],
         )));
-        let allowed_ips = Arc::new(vec!["allowed_ip1".into(), "allowed_ip2".into()]);
+        let allowed_ips = Arc::new(vec![
+            "127.0.0.1".parse().unwrap(),
+            "127.0.0.2".parse().unwrap(),
+        ]);
         cache.insert_role_secret(&project_id, &endpoint_id, &user1, secret1.clone());
         cache.insert_role_secret(&project_id, &endpoint_id, &user2, secret2.clone());
         cache.insert_allowed_ips(&project_id, &endpoint_id, allowed_ips.clone());
@@ -479,7 +485,10 @@ mod tests {
             user2.as_str(),
             [2; 32],
         )));
-        let allowed_ips = Arc::new(vec!["allowed_ip1".into(), "allowed_ip2".into()]);
+        let allowed_ips = Arc::new(vec![
+            "127.0.0.1".parse().unwrap(),
+            "127.0.0.2".parse().unwrap(),
+        ]);
         cache.insert_role_secret(&project_id, &endpoint_id, &user1, secret1.clone());
         cache.clone().disable_ttl();
         tokio::time::advance(Duration::from_millis(100)).await;
diff --git a/proxy/src/console/messages.rs b/proxy/src/console/messages.rs
index c02d65668f..1cfa2d6192 100644
--- a/proxy/src/console/messages.rs
+++ b/proxy/src/console/messages.rs
@@ -2,6 +2,8 @@ use serde::Deserialize;
 use smol_str::SmolStr;
 use std::fmt;
 
+use crate::auth::IpPattern;
+
 /// Generic error response with human-readable description.
 /// Note that we can't always present it to user as is.
 #[derive(Debug, Deserialize)]
@@ -14,7 +16,7 @@ pub struct ConsoleError {
 #[derive(Deserialize)]
 pub struct GetRoleSecret {
     pub role_secret: Box<str>,
-    pub allowed_ips: Option<Vec<Box<str>>>,
+    pub allowed_ips: Option<Vec<IpPattern>>,
     pub project_id: Option<Box<str>>,
 }
 
diff --git a/proxy/src/console/provider.rs b/proxy/src/console/provider.rs
index bbcddae86c..53c394f52f 100644
--- a/proxy/src/console/provider.rs
+++ b/proxy/src/console/provider.rs
@@ -4,7 +4,7 @@ pub mod neon;
 
 use super::messages::MetricsAuxInfo;
 use crate::{
-    auth::backend::ComputeUserInfo,
+    auth::{backend::ComputeUserInfo, IpPattern},
     cache::{project_info::ProjectInfoCacheImpl, Cached, TimedLru},
     compute,
     config::{CacheOptions, ProjectInfoCacheOptions},
@@ -212,7 +212,7 @@ pub enum AuthSecret {
 pub struct AuthInfo {
     pub secret: Option<AuthSecret>,
     /// List of IP addresses allowed for the autorization.
-    pub allowed_ips: Vec<SmolStr>,
+    pub allowed_ips: Vec<IpPattern>,
     /// Project ID. This is used for cache invalidation.
     pub project_id: Option<SmolStr>,
 }
@@ -236,7 +236,7 @@ pub struct NodeInfo {
 pub type NodeInfoCache = TimedLru<SmolStr, NodeInfo>;
 pub type CachedNodeInfo = Cached<&'static NodeInfoCache>;
 pub type CachedRoleSecret = Cached<&'static ProjectInfoCacheImpl, Option<AuthSecret>>;
-pub type CachedAllowedIps = Cached<&'static ProjectInfoCacheImpl, Arc<Vec<SmolStr>>>;
+pub type CachedAllowedIps = Cached<&'static ProjectInfoCacheImpl, Arc<Vec<IpPattern>>>;
 
 /// This will allocate per each call, but the http requests alone
 /// already require a few allocations, so it should be fine.
diff --git a/proxy/src/console/provider/mock.rs b/proxy/src/console/provider/mock.rs
index 49db96a613..55f395a403 100644
--- a/proxy/src/console/provider/mock.rs
+++ b/proxy/src/console/provider/mock.rs
@@ -4,14 +4,13 @@ use super::{
     errors::{ApiError, GetAuthInfoError, WakeComputeError},
     AuthInfo, AuthSecret, CachedNodeInfo, NodeInfo,
 };
-use crate::cache::Cached;
 use crate::console::provider::{CachedAllowedIps, CachedRoleSecret};
 use crate::context::RequestMonitoring;
 use crate::{auth::backend::ComputeUserInfo, compute, error::io_error, scram, url::ApiUrl};
+use crate::{auth::IpPattern, cache::Cached};
 use async_trait::async_trait;
 use futures::TryFutureExt;
-use smol_str::SmolStr;
-use std::sync::Arc;
+use std::{str::FromStr, sync::Arc};
 use thiserror::Error;
 use tokio_postgres::{config::SslMode, Client};
 use tracing::{error, info, info_span, warn, Instrument};
@@ -88,7 +87,9 @@ impl Api {
             {
                 Some(s) => {
                     info!("got allowed_ips: {s}");
-                    s.split(',').map(String::from).collect()
+                    s.split(',')
+                        .map(|s| IpPattern::from_str(s).unwrap())
+                        .collect()
                 }
                 None => vec![],
             };
@@ -100,7 +101,7 @@ impl Api {
         .await?;
         Ok(AuthInfo {
             secret,
-            allowed_ips: allowed_ips.iter().map(SmolStr::from).collect(),
+            allowed_ips,
             project_id: None,
         })
     }
diff --git a/proxy/src/console/provider/neon.rs b/proxy/src/console/provider/neon.rs
index 5cee86a9b6..6574e079d5 100644
--- a/proxy/src/console/provider/neon.rs
+++ b/proxy/src/console/provider/neon.rs
@@ -14,7 +14,6 @@ use crate::{
 };
 use async_trait::async_trait;
 use futures::TryFutureExt;
-use itertools::Itertools;
 use smol_str::SmolStr;
 use std::sync::Arc;
 use tokio::time::Instant;
@@ -94,12 +93,7 @@ impl Api {
                     .ok_or(GetAuthInfoError::BadSecret)?;
                 Some(secret)
             };
-            let allowed_ips = body
-                .allowed_ips
-                .into_iter()
-                .flatten()
-                .map(SmolStr::from)
-                .collect_vec();
+            let allowed_ips = body.allowed_ips.unwrap_or_default();
             ALLOWED_IPS_NUMBER.observe(allowed_ips.len() as f64);
             Ok(AuthInfo {
                 secret,
diff --git a/proxy/src/proxy/tests.rs b/proxy/src/proxy/tests.rs
index 73fde2d7d0..a552a857b9 100644
--- a/proxy/src/proxy/tests.rs
+++ b/proxy/src/proxy/tests.rs
@@ -6,13 +6,13 @@ use super::connect_compute::ConnectMechanism;
 use super::retry::ShouldRetry;
 use super::*;
 use crate::auth::backend::{ComputeUserInfo, TestBackend};
+use crate::auth::IpPattern;
 use crate::config::CertResolver;
 use crate::console::{self, CachedNodeInfo, NodeInfo};
 use crate::proxy::retry::{retry_after, NUM_RETRIES_CONNECT};
 use crate::{auth, http, sasl, scram};
 use async_trait::async_trait;
 use rstest::rstest;
-use smol_str::SmolStr;
 use tokio_postgres::config::SslMode;
 use tokio_postgres::tls::{MakeTlsConnect, NoTls};
 use tokio_postgres_rustls::{MakeRustlsConnect, RustlsStream};
@@ -471,7 +471,7 @@ impl TestBackend for TestConnectMechanism {
         }
     }
 
-    fn get_allowed_ips(&self) -> Result<Vec<SmolStr>, console::errors::GetAuthInfoError> {
+    fn get_allowed_ips(&self) -> Result<Vec<IpPattern>, console::errors::GetAuthInfoError> {
         unimplemented!("not used in tests")
     }
 }

From 50288c16b16b220aba00e8ea99a3ac889cdbfb45 Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Tue, 23 Jan 2024 15:11:32 +0100
Subject: [PATCH 35/70] fix(pagebench): avoid CopyFail error in success case
 (#6443)

PR #6392 fixed CopyFail in the case where we get cancelled.
But, we also want to use `client.shutdown()` if we don't get cancelled.
---
 pageserver/pagebench/src/cmd/getpage_latest_lsn.rs | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs b/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs
index 98f1852acd..400b5476b7 100644
--- a/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs
+++ b/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs
@@ -423,8 +423,8 @@ async fn client(
     tokio::select! {
         res = do_requests => { res },
         _ = cancel.cancelled() => {
-            client.shutdown().await;
-            return;
+            // fallthrough to shutdown
         }
     }
+    client.shutdown().await;
 }

From 37638fce792b5e2f1b70494e3f464e2730d6ec0f Mon Sep 17 00:00:00 2001
From: Vlad Lazar <vlad@neon.tech>
Date: Tue, 23 Jan 2024 14:23:53 +0000
Subject: [PATCH 36/70] pageserver: introduce vectored Timeline::get interface
 (#6372)

1. Introduce a naive  `Timeline::get_vectored` implementation

The return type is intended to be flexible enough for various types of
callers. We return the pages in a map keyed by `Key` such that the
caller doesn't have to map back to the key if it needs to know it. Some
callers can ignore errors
for specific pages, so we return a separate `Result<Bytes,
PageReconstructError>` for each page and an overarching
`GetVectoredError` for API misuse. The overhead of the mapping will be
small and bounded since we enforce a maximum key count for the
operation.

2. Use the `get_vectored` API for SLRU segment reconstruction and image
layer creation.
---
 libs/pageserver_api/src/keyspace.rs |  45 ++++++
 libs/pageserver_api/src/reltag.rs   |  14 +-
 pageserver/src/basebackup.rs        | 146 ++++++++++++------
 pageserver/src/pgdatadir_mapping.rs |  28 ++++
 pageserver/src/tenant/layer_map.rs  |  49 +++---
 pageserver/src/tenant/timeline.rs   | 225 ++++++++++++++++++++++------
 6 files changed, 383 insertions(+), 124 deletions(-)

diff --git a/libs/pageserver_api/src/keyspace.rs b/libs/pageserver_api/src/keyspace.rs
index cab7b3d860..2316acb616 100644
--- a/libs/pageserver_api/src/keyspace.rs
+++ b/libs/pageserver_api/src/keyspace.rs
@@ -104,6 +104,7 @@ pub struct KeySpaceAccum {
     accum: Option<Range<Key>>,
 
     ranges: Vec<Range<Key>>,
+    size: u64,
 }
 
 impl KeySpaceAccum {
@@ -111,6 +112,7 @@ impl KeySpaceAccum {
         Self {
             accum: None,
             ranges: Vec::new(),
+            size: 0,
         }
     }
 
@@ -121,6 +123,8 @@ impl KeySpaceAccum {
 
     #[inline(always)]
     pub fn add_range(&mut self, range: Range<Key>) {
+        self.size += key_range_size(&range) as u64;
+
         match self.accum.as_mut() {
             Some(accum) => {
                 if range.start == accum.end {
@@ -146,6 +150,23 @@ impl KeySpaceAccum {
             ranges: self.ranges,
         }
     }
+
+    pub fn consume_keyspace(&mut self) -> KeySpace {
+        if let Some(accum) = self.accum.take() {
+            self.ranges.push(accum);
+        }
+
+        let mut prev_accum = KeySpaceAccum::new();
+        std::mem::swap(self, &mut prev_accum);
+
+        KeySpace {
+            ranges: prev_accum.ranges,
+        }
+    }
+
+    pub fn size(&self) -> u64 {
+        self.size
+    }
 }
 
 ///
@@ -254,6 +275,30 @@ mod tests {
         }
     }
 
+    #[test]
+    fn keyspace_consume() {
+        let ranges = vec![kr(0..10), kr(20..35), kr(40..45)];
+
+        let mut accum = KeySpaceAccum::new();
+        for range in &ranges {
+            accum.add_range(range.clone());
+        }
+
+        let expected_size: u64 = ranges.iter().map(|r| key_range_size(r) as u64).sum();
+        assert_eq!(accum.size(), expected_size);
+
+        assert_ks_eq(&accum.consume_keyspace(), ranges.clone());
+        assert_eq!(accum.size(), 0);
+
+        assert_ks_eq(&accum.consume_keyspace(), vec![]);
+        assert_eq!(accum.size(), 0);
+
+        for range in &ranges {
+            accum.add_range(range.clone());
+        }
+        assert_ks_eq(&accum.to_keyspace(), ranges);
+    }
+
     #[test]
     fn keyspace_add_range() {
         // two separate ranges
diff --git a/libs/pageserver_api/src/reltag.rs b/libs/pageserver_api/src/reltag.rs
index e3a7da2ad9..3f37af600d 100644
--- a/libs/pageserver_api/src/reltag.rs
+++ b/libs/pageserver_api/src/reltag.rs
@@ -111,7 +111,19 @@ impl RelTag {
 /// These files are divided into segments, which are divided into
 /// pages of the same BLCKSZ as used for relation files.
 ///
-#[derive(Debug, Clone, Copy, Hash, Serialize, Deserialize, PartialEq, Eq, PartialOrd, Ord)]
+#[derive(
+    Debug,
+    Clone,
+    Copy,
+    Hash,
+    Serialize,
+    Deserialize,
+    PartialEq,
+    Eq,
+    PartialOrd,
+    Ord,
+    strum_macros::EnumIter,
+)]
 pub enum SlruKind {
     Clog,
     MultiXactMembers,
diff --git a/pageserver/src/basebackup.rs b/pageserver/src/basebackup.rs
index 7e5ae892ad..781f4119c3 100644
--- a/pageserver/src/basebackup.rs
+++ b/pageserver/src/basebackup.rs
@@ -11,8 +11,9 @@
 //! from data stored in object storage.
 //!
 use anyhow::{anyhow, bail, ensure, Context};
-use bytes::{BufMut, BytesMut};
+use bytes::{BufMut, Bytes, BytesMut};
 use fail::fail_point;
+use pageserver_api::key::Key;
 use postgres_ffi::pg_constants;
 use std::fmt::Write as FmtWrite;
 use std::time::SystemTime;
@@ -23,7 +24,7 @@ use tracing::*;
 use tokio_tar::{Builder, EntryType, Header};
 
 use crate::context::RequestContext;
-use crate::pgdatadir_mapping::Version;
+use crate::pgdatadir_mapping::{key_to_slru_block, Version};
 use crate::tenant::Timeline;
 use pageserver_api::reltag::{RelTag, SlruKind};
 
@@ -133,6 +134,87 @@ where
     ctx: &'a RequestContext,
 }
 
+/// A sink that accepts SLRU blocks ordered by key and forwards
+/// full segments to the archive.
+struct SlruSegmentsBuilder<'a, 'b, W>
+where
+    W: AsyncWrite + Send + Sync + Unpin,
+{
+    ar: &'a mut Builder<&'b mut W>,
+    buf: Vec<u8>,
+    current_segment: Option<(SlruKind, u32)>,
+}
+
+impl<'a, 'b, W> SlruSegmentsBuilder<'a, 'b, W>
+where
+    W: AsyncWrite + Send + Sync + Unpin,
+{
+    fn new(ar: &'a mut Builder<&'b mut W>) -> Self {
+        Self {
+            ar,
+            buf: Vec::new(),
+            current_segment: None,
+        }
+    }
+
+    async fn add_block(&mut self, key: &Key, block: Bytes) -> anyhow::Result<()> {
+        let (kind, segno, _) = key_to_slru_block(*key)?;
+
+        match kind {
+            SlruKind::Clog => {
+                ensure!(block.len() == BLCKSZ as usize || block.len() == BLCKSZ as usize + 8);
+            }
+            SlruKind::MultiXactMembers | SlruKind::MultiXactOffsets => {
+                ensure!(block.len() == BLCKSZ as usize);
+            }
+        }
+
+        let segment = (kind, segno);
+        match self.current_segment {
+            None => {
+                self.current_segment = Some(segment);
+                self.buf
+                    .extend_from_slice(block.slice(..BLCKSZ as usize).as_ref());
+            }
+            Some(current_seg) if current_seg == segment => {
+                self.buf
+                    .extend_from_slice(block.slice(..BLCKSZ as usize).as_ref());
+            }
+            Some(_) => {
+                self.flush().await?;
+
+                self.current_segment = Some(segment);
+                self.buf
+                    .extend_from_slice(block.slice(..BLCKSZ as usize).as_ref());
+            }
+        }
+
+        Ok(())
+    }
+
+    async fn flush(&mut self) -> anyhow::Result<()> {
+        let nblocks = self.buf.len() / BLCKSZ as usize;
+        let (kind, segno) = self.current_segment.take().unwrap();
+        let segname = format!("{}/{:>04X}", kind.to_str(), segno);
+        let header = new_tar_header(&segname, self.buf.len() as u64)?;
+        self.ar.append(&header, self.buf.as_slice()).await?;
+
+        trace!("Added to basebackup slru {} relsize {}", segname, nblocks);
+
+        self.buf.clear();
+
+        Ok(())
+    }
+
+    async fn finish(mut self) -> anyhow::Result<()> {
+        if self.current_segment.is_none() || self.buf.is_empty() {
+            return Ok(());
+        }
+
+        self.flush().await
+    }
+}
+
 impl<'a, W> Basebackup<'a, W>
 where
     W: AsyncWrite + Send + Sync + Unpin,
@@ -168,20 +250,27 @@ where
         }
 
         // Gather non-relational files from object storage pages.
-        for kind in [
-            SlruKind::Clog,
-            SlruKind::MultiXactOffsets,
-            SlruKind::MultiXactMembers,
-        ] {
-            for segno in self
+        let slru_partitions = self
+            .timeline
+            .get_slru_keyspace(Version::Lsn(self.lsn), self.ctx)
+            .await?
+            .partition(Timeline::MAX_GET_VECTORED_KEYS * BLCKSZ as u64);
+
+        let mut slru_builder = SlruSegmentsBuilder::new(&mut self.ar);
+
+        for part in slru_partitions.parts {
+            let blocks = self
                 .timeline
-                .list_slru_segments(kind, Version::Lsn(self.lsn), self.ctx)
-                .await?
-            {
-                self.add_slru_segment(kind, segno).await?;
+                .get_vectored(&part.ranges, self.lsn, self.ctx)
+                .await?;
+
+            for (key, block) in blocks {
+                slru_builder.add_block(&key, block?).await?;
             }
         }
 
+        slru_builder.finish().await?;
+
         let mut min_restart_lsn: Lsn = Lsn::MAX;
         // Create tablespace directories
         for ((spcnode, dbnode), has_relmap_file) in
@@ -305,39 +394,6 @@ where
         Ok(())
     }
 
-    //
-    // Generate SLRU segment files from repository.
-    //
-    async fn add_slru_segment(&mut self, slru: SlruKind, segno: u32) -> anyhow::Result<()> {
-        let nblocks = self
-            .timeline
-            .get_slru_segment_size(slru, segno, Version::Lsn(self.lsn), self.ctx)
-            .await?;
-
-        let mut slru_buf: Vec<u8> = Vec::with_capacity(nblocks as usize * BLCKSZ as usize);
-        for blknum in 0..nblocks {
-            let img = self
-                .timeline
-                .get_slru_page_at_lsn(slru, segno, blknum, self.lsn, self.ctx)
-                .await?;
-
-            if slru == SlruKind::Clog {
-                ensure!(img.len() == BLCKSZ as usize || img.len() == BLCKSZ as usize + 8);
-            } else {
-                ensure!(img.len() == BLCKSZ as usize);
-            }
-
-            slru_buf.extend_from_slice(&img[..BLCKSZ as usize]);
-        }
-
-        let segname = format!("{}/{:>04X}", slru.to_str(), segno);
-        let header = new_tar_header(&segname, slru_buf.len() as u64)?;
-        self.ar.append(&header, slru_buf.as_slice()).await?;
-
-        trace!("Added to basebackup slru {} relsize {}", segname, nblocks);
-        Ok(())
-    }
-
     //
     // Include database/tablespace directories.
     //
diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs
index ae182b8dc6..b65fe1eddd 100644
--- a/pageserver/src/pgdatadir_mapping.rs
+++ b/pageserver/src/pgdatadir_mapping.rs
@@ -27,6 +27,7 @@ use serde::{Deserialize, Serialize};
 use std::collections::{hash_map, HashMap, HashSet};
 use std::ops::ControlFlow;
 use std::ops::Range;
+use strum::IntoEnumIterator;
 use tokio_util::sync::CancellationToken;
 use tracing::{debug, trace, warn};
 use utils::bin_ser::DeserializeError;
@@ -533,6 +534,33 @@ impl Timeline {
         Ok(Default::default())
     }
 
+    pub(crate) async fn get_slru_keyspace(
+        &self,
+        version: Version<'_>,
+        ctx: &RequestContext,
+    ) -> Result<KeySpace, PageReconstructError> {
+        let mut accum = KeySpaceAccum::new();
+
+        for kind in SlruKind::iter() {
+            let mut segments: Vec<u32> = self
+                .list_slru_segments(kind, version, ctx)
+                .await?
+                .into_iter()
+                .collect();
+            segments.sort_unstable();
+
+            for seg in segments {
+                let block_count = self.get_slru_segment_size(kind, seg, version, ctx).await?;
+
+                accum.add_range(
+                    slru_block_to_key(kind, seg, 0)..slru_block_to_key(kind, seg, block_count),
+                );
+            }
+        }
+
+        Ok(accum.to_keyspace())
+    }
+
     /// Get a list of SLRU segments
     pub(crate) async fn list_slru_segments(
         &self,
diff --git a/pageserver/src/tenant/layer_map.rs b/pageserver/src/tenant/layer_map.rs
index 9b6225501f..c31d401e84 100644
--- a/pageserver/src/tenant/layer_map.rs
+++ b/pageserver/src/tenant/layer_map.rs
@@ -283,15 +283,15 @@ impl LayerMap {
     ///
     /// This is used for garbage collection, to determine if an old layer can
     /// be deleted.
-    pub fn image_layer_exists(&self, key: &Range<Key>, lsn: &Range<Lsn>) -> Result<bool> {
+    pub fn image_layer_exists(&self, key: &Range<Key>, lsn: &Range<Lsn>) -> bool {
         if key.is_empty() {
             // Vacuously true. There's a newer image for all 0 of the kerys in the range.
-            return Ok(true);
+            return true;
         }
 
         let version = match self.historic.get().unwrap().get_version(lsn.end.0 - 1) {
             Some(v) => v,
-            None => return Ok(false),
+            None => return false,
         };
 
         let start = key.start.to_i128();
@@ -304,17 +304,17 @@ impl LayerMap {
 
         // Check the start is covered
         if !layer_covers(version.image_coverage.query(start)) {
-            return Ok(false);
+            return false;
         }
 
         // Check after all changes of coverage
         for (_, change_val) in version.image_coverage.range(start..end) {
             if !layer_covers(change_val) {
-                return Ok(false);
+                return false;
             }
         }
 
-        Ok(true)
+        true
     }
 
     pub fn iter_historic_layers(&self) -> impl '_ + Iterator<Item = Arc<PersistentLayerDesc>> {
@@ -325,18 +325,14 @@ impl LayerMap {
     /// Divide the whole given range of keys into sub-ranges based on the latest
     /// image layer that covers each range at the specified lsn (inclusive).
     /// This is used when creating  new image layers.
-    ///
-    // FIXME: clippy complains that the result type is very complex. She's probably
-    // right...
-    #[allow(clippy::type_complexity)]
     pub fn image_coverage(
         &self,
         key_range: &Range<Key>,
         lsn: Lsn,
-    ) -> Result<Vec<(Range<Key>, Option<Arc<PersistentLayerDesc>>)>> {
+    ) -> Vec<(Range<Key>, Option<Arc<PersistentLayerDesc>>)> {
         let version = match self.historic.get().unwrap().get_version(lsn.0) {
             Some(v) => v,
-            None => return Ok(vec![]),
+            None => return vec![],
         };
 
         let start = key_range.start.to_i128();
@@ -359,7 +355,7 @@ impl LayerMap {
         let kr = Key::from_i128(current_key)..Key::from_i128(end);
         coverage.push((kr, current_val.take()));
 
-        Ok(coverage)
+        coverage
     }
 
     pub fn is_l0(layer: &PersistentLayerDesc) -> bool {
@@ -410,24 +406,19 @@ impl LayerMap {
     /// This number is used to compute the largest number of deltas that
     /// we'll need to visit for any page reconstruction in this region.
     /// We use this heuristic to decide whether to create an image layer.
-    pub fn count_deltas(
-        &self,
-        key: &Range<Key>,
-        lsn: &Range<Lsn>,
-        limit: Option<usize>,
-    ) -> Result<usize> {
+    pub fn count_deltas(&self, key: &Range<Key>, lsn: &Range<Lsn>, limit: Option<usize>) -> usize {
         // We get the delta coverage of the region, and for each part of the coverage
         // we recurse right underneath the delta. The recursion depth is limited by
         // the largest result this function could return, which is in practice between
         // 3 and 10 (since we usually try to create an image when the number gets larger).
 
         if lsn.is_empty() || key.is_empty() || limit == Some(0) {
-            return Ok(0);
+            return 0;
         }
 
         let version = match self.historic.get().unwrap().get_version(lsn.end.0 - 1) {
             Some(v) => v,
-            None => return Ok(0),
+            None => return 0,
         };
 
         let start = key.start.to_i128();
@@ -448,8 +439,7 @@ impl LayerMap {
                     if !kr.is_empty() {
                         let base_count = Self::is_reimage_worthy(&val, key) as usize;
                         let new_limit = limit.map(|l| l - base_count);
-                        let max_stacked_deltas_underneath =
-                            self.count_deltas(&kr, &lr, new_limit)?;
+                        let max_stacked_deltas_underneath = self.count_deltas(&kr, &lr, new_limit);
                         max_stacked_deltas = std::cmp::max(
                             max_stacked_deltas,
                             base_count + max_stacked_deltas_underneath,
@@ -471,7 +461,7 @@ impl LayerMap {
                 if !kr.is_empty() {
                     let base_count = Self::is_reimage_worthy(&val, key) as usize;
                     let new_limit = limit.map(|l| l - base_count);
-                    let max_stacked_deltas_underneath = self.count_deltas(&kr, &lr, new_limit)?;
+                    let max_stacked_deltas_underneath = self.count_deltas(&kr, &lr, new_limit);
                     max_stacked_deltas = std::cmp::max(
                         max_stacked_deltas,
                         base_count + max_stacked_deltas_underneath,
@@ -480,7 +470,7 @@ impl LayerMap {
             }
         }
 
-        Ok(max_stacked_deltas)
+        max_stacked_deltas
     }
 
     /// Count how many reimage-worthy layers we need to visit for given key-lsn pair.
@@ -592,10 +582,7 @@ impl LayerMap {
                     if limit == Some(difficulty) {
                         break;
                     }
-                    for (img_range, last_img) in self
-                        .image_coverage(range, lsn)
-                        .expect("why would this err?")
-                    {
+                    for (img_range, last_img) in self.image_coverage(range, lsn) {
                         if limit == Some(difficulty) {
                             break;
                         }
@@ -606,9 +593,7 @@ impl LayerMap {
                         };
 
                         if img_lsn < lsn {
-                            let num_deltas = self
-                                .count_deltas(&img_range, &(img_lsn..lsn), limit)
-                                .expect("why would this err lol?");
+                            let num_deltas = self.count_deltas(&img_range, &(img_lsn..lsn), limit);
                             difficulty = std::cmp::max(difficulty, num_deltas);
                         }
                     }
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index 69d0d0b320..bac9bf6573 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -14,6 +14,7 @@ use enumset::EnumSet;
 use fail::fail_point;
 use itertools::Itertools;
 use pageserver_api::{
+    keyspace::{key_range_size, KeySpaceAccum},
     models::{
         DownloadRemoteLayersTaskInfo, DownloadRemoteLayersTaskSpawnRequest, EvictionPolicy,
         LayerMapInfo, TimelineState,
@@ -32,7 +33,7 @@ use tokio_util::sync::CancellationToken;
 use tracing::*;
 use utils::sync::gate::Gate;
 
-use std::collections::{BinaryHeap, HashMap, HashSet};
+use std::collections::{BTreeMap, BinaryHeap, HashMap, HashSet};
 use std::ops::{Deref, Range};
 use std::pin::pin;
 use std::sync::atomic::Ordering as AtomicOrdering;
@@ -404,6 +405,21 @@ pub(crate) enum PageReconstructError {
     WalRedo(anyhow::Error),
 }
 
+#[derive(thiserror::Error, Debug)]
+enum CreateImageLayersError {
+    #[error("timeline shutting down")]
+    Cancelled,
+
+    #[error(transparent)]
+    GetVectoredError(GetVectoredError),
+
+    #[error(transparent)]
+    PageReconstructError(PageReconstructError),
+
+    #[error(transparent)]
+    Other(#[from] anyhow::Error),
+}
+
 #[derive(thiserror::Error, Debug)]
 enum FlushLayerError {
     /// Timeline cancellation token was cancelled
@@ -411,12 +427,24 @@ enum FlushLayerError {
     Cancelled,
 
     #[error(transparent)]
-    PageReconstructError(#[from] PageReconstructError),
+    CreateImageLayersError(CreateImageLayersError),
 
     #[error(transparent)]
     Other(#[from] anyhow::Error),
 }
 
+#[derive(thiserror::Error, Debug)]
+pub(crate) enum GetVectoredError {
+    #[error("timeline shutting down")]
+    Cancelled,
+
+    #[error("Requested too many keys: {0} > {}", Timeline::MAX_GET_VECTORED_KEYS)]
+    Oversized(u64),
+
+    #[error("Requested at invalid LSN: {0}")]
+    InvalidLsn(Lsn),
+}
+
 #[derive(Clone, Copy)]
 pub enum LogicalSizeCalculationCause {
     Initial,
@@ -456,6 +484,45 @@ pub(crate) enum WaitLsnError {
     Timeout(String),
 }
 
+// The impls below achieve cancellation mapping for errors.
+// Perhaps there's a way of achieving this with less cruft.
+
+impl From<CreateImageLayersError> for CompactionError {
+    fn from(e: CreateImageLayersError) -> Self {
+        match e {
+            CreateImageLayersError::Cancelled => CompactionError::ShuttingDown,
+            _ => CompactionError::Other(e.into()),
+        }
+    }
+}
+
+impl From<CreateImageLayersError> for FlushLayerError {
+    fn from(e: CreateImageLayersError) -> Self {
+        match e {
+            CreateImageLayersError::Cancelled => FlushLayerError::Cancelled,
+            any => FlushLayerError::CreateImageLayersError(any),
+        }
+    }
+}
+
+impl From<PageReconstructError> for CreateImageLayersError {
+    fn from(e: PageReconstructError) -> Self {
+        match e {
+            PageReconstructError::Cancelled => CreateImageLayersError::Cancelled,
+            _ => CreateImageLayersError::PageReconstructError(e),
+        }
+    }
+}
+
+impl From<GetVectoredError> for CreateImageLayersError {
+    fn from(e: GetVectoredError) -> Self {
+        match e {
+            GetVectoredError::Cancelled => CreateImageLayersError::Cancelled,
+            _ => CreateImageLayersError::GetVectoredError(e),
+        }
+    }
+}
+
 /// Public interface functions
 impl Timeline {
     /// Get the LSN where this branch was created
@@ -575,6 +642,53 @@ impl Timeline {
         res
     }
 
+    pub(crate) const MAX_GET_VECTORED_KEYS: u64 = 32;
+
+    /// Look up multiple page versions at a given LSN
+    ///
+    /// This naive implementation will be replaced with a more efficient one
+    /// which actually vectorizes the read path.
+    pub(crate) async fn get_vectored(
+        &self,
+        key_ranges: &[Range<Key>],
+        lsn: Lsn,
+        ctx: &RequestContext,
+    ) -> Result<BTreeMap<Key, Result<Bytes, PageReconstructError>>, GetVectoredError> {
+        if !lsn.is_valid() {
+            return Err(GetVectoredError::InvalidLsn(lsn));
+        }
+
+        let key_count = key_ranges
+            .iter()
+            .map(|range| key_range_size(range) as u64)
+            .sum();
+        if key_count > Timeline::MAX_GET_VECTORED_KEYS {
+            return Err(GetVectoredError::Oversized(key_count));
+        }
+
+        let mut values = BTreeMap::new();
+        for range in key_ranges {
+            let mut key = range.start;
+            while key != range.end {
+                assert!(!self.shard_identity.is_key_disposable(&key));
+
+                let block = self.get(key, lsn, ctx).await;
+
+                if matches!(
+                    block,
+                    Err(PageReconstructError::Cancelled | PageReconstructError::AncestorStopping(_))
+                ) {
+                    return Err(GetVectoredError::Cancelled);
+                }
+
+                values.insert(key, block);
+                key = key.next();
+            }
+        }
+
+        Ok(values)
+    }
+
     /// Get last or prev record separately. Same as get_last_record_rlsn().last/prev.
     pub fn get_last_record_lsn(&self) -> Lsn {
         self.last_record_lsn.load().last
@@ -2582,7 +2696,7 @@ impl Timeline {
                         return;
                     }
                     err @ Err(
-                        FlushLayerError::Other(_) | FlushLayerError::PageReconstructError(_),
+                        FlushLayerError::Other(_) | FlushLayerError::CreateImageLayersError(_),
                     ) => {
                         error!("could not flush frozen layer: {err:?}");
                         break err;
@@ -2950,11 +3064,7 @@ impl Timeline {
     }
 
     // Is it time to create a new image layer for the given partition?
-    async fn time_for_new_image_layer(
-        &self,
-        partition: &KeySpace,
-        lsn: Lsn,
-    ) -> anyhow::Result<bool> {
+    async fn time_for_new_image_layer(&self, partition: &KeySpace, lsn: Lsn) -> bool {
         let threshold = self.get_image_creation_threshold();
 
         let guard = self.layers.read().await;
@@ -2974,20 +3084,20 @@ impl Timeline {
                     // but the range is already covered by image layers at more recent LSNs. Before we
                     // create a new image layer, check if the range is already covered at more recent LSNs.
                     if !layers
-                        .image_layer_exists(&img_range, &(Lsn::min(lsn, *cutoff_lsn)..lsn + 1))?
+                        .image_layer_exists(&img_range, &(Lsn::min(lsn, *cutoff_lsn)..lsn + 1))
                     {
                         debug!(
                             "Force generation of layer {}-{} wanted by GC, cutoff={}, lsn={})",
                             img_range.start, img_range.end, cutoff_lsn, lsn
                         );
-                        return Ok(true);
+                        return true;
                     }
                 }
             }
         }
 
         for part_range in &partition.ranges {
-            let image_coverage = layers.image_coverage(part_range, lsn)?;
+            let image_coverage = layers.image_coverage(part_range, lsn);
             for (img_range, last_img) in image_coverage {
                 let img_lsn = if let Some(last_img) = last_img {
                     last_img.get_lsn_range().end
@@ -3008,7 +3118,7 @@ impl Timeline {
                 // after we read last_record_lsn, which is passed here in the 'lsn' argument.
                 if img_lsn < lsn {
                     let num_deltas =
-                        layers.count_deltas(&img_range, &(img_lsn..lsn), Some(threshold))?;
+                        layers.count_deltas(&img_range, &(img_lsn..lsn), Some(threshold));
 
                     max_deltas = max_deltas.max(num_deltas);
                     if num_deltas >= threshold {
@@ -3016,7 +3126,7 @@ impl Timeline {
                             "key range {}-{}, has {} deltas on this timeline in LSN range {}..{}",
                             img_range.start, img_range.end, num_deltas, img_lsn, lsn
                         );
-                        return Ok(true);
+                        return true;
                     }
                 }
             }
@@ -3026,7 +3136,7 @@ impl Timeline {
             max_deltas,
             "none of the partitioned ranges had >= {threshold} deltas"
         );
-        Ok(false)
+        false
     }
 
     #[tracing::instrument(skip_all, fields(%lsn, %force))]
@@ -3036,7 +3146,7 @@ impl Timeline {
         lsn: Lsn,
         force: bool,
         ctx: &RequestContext,
-    ) -> Result<Vec<ResidentLayer>, PageReconstructError> {
+    ) -> Result<Vec<ResidentLayer>, CreateImageLayersError> {
         let timer = self.metrics.create_images_time_histo.start_timer();
         let mut image_layers = Vec::new();
 
@@ -3054,7 +3164,7 @@ impl Timeline {
         for partition in partitioning.parts.iter() {
             let img_range = start..partition.ranges.last().unwrap().end;
             start = img_range.end;
-            if force || self.time_for_new_image_layer(partition, lsn).await? {
+            if force || self.time_for_new_image_layer(partition, lsn).await {
                 let mut image_layer_writer = ImageLayerWriter::new(
                     self.conf,
                     self.timeline_id,
@@ -3065,10 +3175,12 @@ impl Timeline {
                 .await?;
 
                 fail_point!("image-layer-writer-fail-before-finish", |_| {
-                    Err(PageReconstructError::Other(anyhow::anyhow!(
+                    Err(CreateImageLayersError::Other(anyhow::anyhow!(
                         "failpoint image-layer-writer-fail-before-finish"
                     )))
                 });
+
+                let mut key_request_accum = KeySpaceAccum::new();
                 for range in &partition.ranges {
                     let mut key = range.start;
                     while key < range.end {
@@ -3081,34 +3193,55 @@ impl Timeline {
                             key = key.next();
                             continue;
                         }
-                        let img = match self.get(key, lsn, ctx).await {
-                            Ok(img) => img,
-                            Err(err) => {
-                                // If we fail to reconstruct a VM or FSM page, we can zero the
-                                // page without losing any actual user data. That seems better
-                                // than failing repeatedly and getting stuck.
-                                //
-                                // We had a bug at one point, where we truncated the FSM and VM
-                                // in the pageserver, but the Postgres didn't know about that
-                                // and continued to generate incremental WAL records for pages
-                                // that didn't exist in the pageserver. Trying to replay those
-                                // WAL records failed to find the previous image of the page.
-                                // This special case allows us to recover from that situation.
-                                // See https://github.com/neondatabase/neon/issues/2601.
-                                //
-                                // Unfortunately we cannot do this for the main fork, or for
-                                // any metadata keys, keys, as that would lead to actual data
-                                // loss.
-                                if is_rel_fsm_block_key(key) || is_rel_vm_block_key(key) {
-                                    warn!("could not reconstruct FSM or VM key {key}, filling with zeros: {err:?}");
-                                    ZERO_PAGE.clone()
-                                } else {
-                                    return Err(err);
-                                }
-                            }
-                        };
 
-                        image_layer_writer.put_image(key, &img).await?;
+                        key_request_accum.add_key(key);
+                        if key_request_accum.size() >= Timeline::MAX_GET_VECTORED_KEYS
+                            || key.next() == range.end
+                        {
+                            let results = self
+                                .get_vectored(
+                                    &key_request_accum.consume_keyspace().ranges,
+                                    lsn,
+                                    ctx,
+                                )
+                                .await?;
+
+                            for (img_key, img) in results {
+                                let img = match img {
+                                    Ok(img) => img,
+                                    Err(err) => {
+                                        // If we fail to reconstruct a VM or FSM page, we can zero the
+                                        // page without losing any actual user data. That seems better
+                                        // than failing repeatedly and getting stuck.
+                                        //
+                                        // We had a bug at one point, where we truncated the FSM and VM
+                                        // in the pageserver, but the Postgres didn't know about that
+                                        // and continued to generate incremental WAL records for pages
+                                        // that didn't exist in the pageserver. Trying to replay those
+                                        // WAL records failed to find the previous image of the page.
+                                        // This special case allows us to recover from that situation.
+                                        // See https://github.com/neondatabase/neon/issues/2601.
+                                        //
+                                        // Unfortunately we cannot do this for the main fork, or for
+                                        // any metadata keys, keys, as that would lead to actual data
+                                        // loss.
+                                        if is_rel_fsm_block_key(img_key)
+                                            || is_rel_vm_block_key(img_key)
+                                        {
+                                            warn!("could not reconstruct FSM or VM key {img_key}, filling with zeros: {err:?}");
+                                            ZERO_PAGE.clone()
+                                        } else {
+                                            return Err(
+                                                CreateImageLayersError::PageReconstructError(err),
+                                            );
+                                        }
+                                    }
+                                };
+
+                                image_layer_writer.put_image(img_key, &img).await?;
+                            }
+                        }
+
                         key = key.next();
                     }
                 }
@@ -3484,7 +3617,7 @@ impl Timeline {
                     // has not so much sense, because largest holes will corresponds field1/field2 changes.
                     // But we are mostly interested to eliminate holes which cause generation of excessive image layers.
                     // That is why it is better to measure size of hole as number of covering image layers.
-                    let coverage_size = layers.image_coverage(&key_range, last_record_lsn)?.len();
+                    let coverage_size = layers.image_coverage(&key_range, last_record_lsn).len();
                     if coverage_size >= min_hole_coverage_size {
                         heap.push(Hole {
                             key_range,
@@ -4110,7 +4243,7 @@ impl Timeline {
             // we cannot remove C, even though it's older than 2500, because
             // the delta layer 2000-3000 depends on it.
             if !layers
-                .image_layer_exists(&l.get_key_range(), &(l.get_lsn_range().end..new_gc_cutoff))?
+                .image_layer_exists(&l.get_key_range(), &(l.get_lsn_range().end..new_gc_cutoff))
             {
                 debug!("keeping {} because it is the latest layer", l.filename());
                 // Collect delta key ranges that need image layers to allow garbage

From 42c17a6fc67c598390a3d4505d911f6280b932f8 Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Tue, 23 Jan 2024 17:21:06 +0100
Subject: [PATCH 37/70] attachment_service: use atomic overwrite to persist
 attachments.json (#6444)

The pagebench integration PR (#6214) is the first to SIGQUIT & then
restart attachment_service.

With many tenants (100), we have found frequent failures on restart in
the CI[^1].

[^1]:
[Allure](https://neon-github-public-dev.s3.amazonaws.com/reports/pr-6214/7615750160/index.html#suites/e26265675583c610f99af77084ae58f1/851ff709578c4452/)

```
2024-01-22T19:07:57.932021Z  INFO request{method=POST path=/attach-hook request_id=2697503c-7b3e-4529-b8c1-d12ef912d3eb}: Request handled, status: 200 OK
2024-01-22T19:07:58.898213Z  INFO Got SIGQUIT. Terminating
2024-01-22T19:08:02.176588Z  INFO version: git-env:d56f31639356ed8e8ce832097f132f27ee19ac8a, launch_timestamp: 2024-01-22 19:08:02.174634554 UTC, build_tag build_tag-env:7615750160, state at /tmp/test_output/test_pageserver_max_throughput_getpage_at_latest_lsn[10-13-30]/repo/attachments.json, listening on 127.0.0.1:15048
thread 'main' panicked at /__w/neon/neon/control_plane/attachment_service/src/persistence.rs:95:17:
Failed to load state from '/tmp/test_output/test_pageserver_max_throughput_getpage_at_latest_lsn[10-13-30]/repo/attachments.json': trailing characters at line 1 column 8957 (maybe your .neon/ dir was written by an older version?)
stack backtrace:
   0: rust_begin_unwind
             at /rustc/82e1608dfa6e0b5569232559e3d385fea5a93112/library/std/src/panicking.rs:645:5
   1: core::panicking::panic_fmt
             at /rustc/82e1608dfa6e0b5569232559e3d385fea5a93112/library/core/src/panicking.rs:72:14
   2: attachment_service::persistence::PersistentState::load_or_new::{{closure}}
             at ./control_plane/attachment_service/src/persistence.rs:95:17
   3: attachment_service::persistence::Persistence::new::{{closure}}
             at ./control_plane/attachment_service/src/persistence.rs:103:56
   4: attachment_service::main::{{closure}}
             at ./control_plane/attachment_service/src/main.rs:69:61
   5: tokio::runtime::park::CachedParkThread::block_on::{{closure}}
             at ./.cargo/registry/src/index.crates.io-6f17d22bba15001f/tokio-1.34.0/src/runtime/park.rs:282:63
   6: tokio::runtime::coop::with_budget
             at ./.cargo/registry/src/index.crates.io-6f17d22bba15001f/tokio-1.34.0/src/runtime/coop.rs:107:5
   7: tokio::runtime::coop::budget
             at ./.cargo/registry/src/index.crates.io-6f17d22bba15001f/tokio-1.34.0/src/runtime/coop.rs:73:5
   8: tokio::runtime::park::CachedParkThread::block_on
             at ./.cargo/registry/src/index.crates.io-6f17d22bba15001f/tokio-1.34.0/src/runtime/park.rs:282:31
   9: tokio::runtime::context::blocking::BlockingRegionGuard::block_on
             at ./.cargo/registry/src/index.crates.io-6f17d22bba15001f/tokio-1.34.0/src/runtime/context/blocking.rs:66:9
  10: tokio::runtime::scheduler::multi_thread::MultiThread::block_on::{{closure}}
             at ./.cargo/registry/src/index.crates.io-6f17d22bba15001f/tokio-1.34.0/src/runtime/scheduler/multi_thread/mod.rs:87:13
  11: tokio::runtime::context::runtime::enter_runtime
             at ./.cargo/registry/src/index.crates.io-6f17d22bba15001f/tokio-1.34.0/src/runtime/context/runtime.rs:65:16
  12: tokio::runtime::scheduler::multi_thread::MultiThread::block_on
             at ./.cargo/registry/src/index.crates.io-6f17d22bba15001f/tokio-1.34.0/src/runtime/scheduler/multi_thread/mod.rs:86:9
  13: tokio::runtime::runtime::Runtime::block_on
             at ./.cargo/registry/src/index.crates.io-6f17d22bba15001f/tokio-1.34.0/src/runtime/runtime.rs:350:50
  14: attachment_service::main
             at ./control_plane/attachment_service/src/main.rs:99:5
  15: core::ops::function::FnOnce::call_once
             at /rustc/82e1608dfa6e0b5569232559e3d385fea5a93112/library/core/src/ops/function.rs:250:5
note: Some details are omitted, run with `RUST_BACKTRACE=full` for a verbose backtrace.
```

The attachment_service handles SIGQUIT by just exiting the process.
In theory, the SIGQUIT could come in while we're writing out the
`attachments.json`.

Now, in above log output, there's a 1 second gap between the last
request completing
and the SIGQUIT coming in. So, there must be some other issue.

But, let's have this change anyways, maybe it helps uncover the real
cause for the test failure.
---
 .../attachment_service/src/persistence.rs     | 13 ++++--
 libs/utils/src/crashsafe.rs                   | 44 ++++++++++++++++++-
 pageserver/src/virtual_file.rs                |  7 +--
 3 files changed, 53 insertions(+), 11 deletions(-)

diff --git a/control_plane/attachment_service/src/persistence.rs b/control_plane/attachment_service/src/persistence.rs
index db4f04d001..b9c79ff916 100644
--- a/control_plane/attachment_service/src/persistence.rs
+++ b/control_plane/attachment_service/src/persistence.rs
@@ -1,5 +1,6 @@
 use std::{collections::HashMap, str::FromStr};
 
+use anyhow::Context;
 use camino::{Utf8Path, Utf8PathBuf};
 use control_plane::{
     attachment_service::{NodeAvailability, NodeSchedulingPolicy},
@@ -41,10 +42,14 @@ struct PendingWrite {
 }
 
 impl PendingWrite {
-    async fn commit(&self) -> anyhow::Result<()> {
-        tokio::fs::write(&self.path, &self.bytes).await?;
-
-        Ok(())
+    async fn commit(self) -> anyhow::Result<()> {
+        tokio::task::spawn_blocking(move || {
+            let tmp_path = utils::crashsafe::path_with_suffix_extension(&self.path, "___new");
+            utils::crashsafe::overwrite(&self.path, &tmp_path, &self.bytes)
+        })
+        .await
+        .context("spawn_blocking")?
+        .context("write file")
     }
 }
 
diff --git a/libs/utils/src/crashsafe.rs b/libs/utils/src/crashsafe.rs
index b089af4a02..0c6855d17b 100644
--- a/libs/utils/src/crashsafe.rs
+++ b/libs/utils/src/crashsafe.rs
@@ -1,7 +1,7 @@
 use std::{
     borrow::Cow,
     fs::{self, File},
-    io,
+    io::{self, Write},
 };
 
 use camino::{Utf8Path, Utf8PathBuf};
@@ -112,6 +112,48 @@ pub async fn fsync_async(path: impl AsRef<Utf8Path>) -> Result<(), std::io::Erro
     tokio::fs::File::open(path.as_ref()).await?.sync_all().await
 }
 
+/// Writes a file to the specified `final_path` in a crash safe fasion
+///
+/// The file is first written to the specified tmp_path, and in a second
+/// step, the tmp path is renamed to the final path. As renames are
+/// atomic, a crash during the write operation will never leave behind a
+/// partially written file.
+///
+/// NB: an async variant of this code exists in Pageserver's VirtualFile.
+pub fn overwrite(
+    final_path: &Utf8Path,
+    tmp_path: &Utf8Path,
+    content: &[u8],
+) -> std::io::Result<()> {
+    let Some(final_path_parent) = final_path.parent() else {
+        return Err(std::io::Error::from_raw_os_error(
+            nix::errno::Errno::EINVAL as i32,
+        ));
+    };
+    std::fs::remove_file(tmp_path).or_else(crate::fs_ext::ignore_not_found)?;
+    let mut file = std::fs::OpenOptions::new()
+        .write(true)
+        // Use `create_new` so that, if we race with ourselves or something else,
+        // we bail out instead of causing damage.
+        .create_new(true)
+        .open(tmp_path)?;
+    file.write_all(content)?;
+    file.sync_all()?;
+    drop(file); // before the rename, that's important!
+                // renames are atomic
+    std::fs::rename(tmp_path, final_path)?;
+    // Only open final path parent dirfd now, so that this operation only
+    // ever holds one VirtualFile fd at a time.  That's important because
+    // the current `find_victim_slot` impl might pick the same slot for both
+    // VirtualFile., and it eventually does a blocking write lock instead of
+    // try_lock.
+    let final_parent_dirfd = std::fs::OpenOptions::new()
+        .read(true)
+        .open(final_path_parent)?;
+    final_parent_dirfd.sync_all()?;
+    Ok(())
+}
+
 #[cfg(test)]
 mod tests {
 
diff --git a/pageserver/src/virtual_file.rs b/pageserver/src/virtual_file.rs
index 9feefd8a32..06f58b5c52 100644
--- a/pageserver/src/virtual_file.rs
+++ b/pageserver/src/virtual_file.rs
@@ -356,12 +356,7 @@ impl VirtualFile {
         Ok(vfile)
     }
 
-    /// Writes a file to the specified `final_path` in a crash safe fasion
-    ///
-    /// The file is first written to the specified tmp_path, and in a second
-    /// step, the tmp path is renamed to the final path. As renames are
-    /// atomic, a crash during the write operation will never leave behind a
-    /// partially written file.
+    /// Async & [`VirtualFile`]-enabled version of [`::utils::crashsafe::overwrite`].
     pub async fn crashsafe_overwrite(
         final_path: &Utf8Path,
         tmp_path: &Utf8Path,

From 001f0d6db7c8e7fd9fb21735b9e42c318acca09b Mon Sep 17 00:00:00 2001
From: Vlad Lazar <vlad@neon.tech>
Date: Tue, 23 Jan 2024 17:07:01 +0000
Subject: [PATCH 38/70] pageserver: fix import failure caused by merge race
 (#6448)

PR #6406 raced with #6372 and broke main.
---
 pageserver/src/basebackup.rs | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pageserver/src/basebackup.rs b/pageserver/src/basebackup.rs
index 781f4119c3..009deff0aa 100644
--- a/pageserver/src/basebackup.rs
+++ b/pageserver/src/basebackup.rs
@@ -13,7 +13,7 @@
 use anyhow::{anyhow, bail, ensure, Context};
 use bytes::{BufMut, Bytes, BytesMut};
 use fail::fail_point;
-use pageserver_api::key::Key;
+use pageserver_api::key::{key_to_slru_block, Key};
 use postgres_ffi::pg_constants;
 use std::fmt::Write as FmtWrite;
 use std::time::SystemTime;
@@ -24,7 +24,7 @@ use tracing::*;
 use tokio_tar::{Builder, EntryType, Header};
 
 use crate::context::RequestContext;
-use crate::pgdatadir_mapping::{key_to_slru_block, Version};
+use crate::pgdatadir_mapping::Version;
 use crate::tenant::Timeline;
 use pageserver_api::reltag::{RelTag, SlruKind};
 

From faf275d4a254ab500d2ce1713c1e8c329ba613eb Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Arpad=20M=C3=BCller?= <arpad-m@users.noreply.github.com>
Date: Tue, 23 Jan 2024 19:22:59 +0100
Subject: [PATCH 39/70] Remove initdb on timeline delete (#6387)

This PR:

* makes `initdb.tar.zst` be deleted by default on timeline deletion
(#6226), mirroring the safekeeper:
https://github.com/neondatabase/neon/pull/6381
* adds a new `preserve_initdb_archive` endpoint for a timeline, to be
used during the disaster recovery process, see reasoning
[here](https://github.com/neondatabase/neon/issues/6226#issuecomment-1894574778)
* makes the creation code look for `initdb-preserved.tar.zst` in
addition to `initdb.tar.zst`.
* makes the tests use the new endpoint

fixes #6226
---
 pageserver/src/http/openapi_spec.yml          | 50 +++++++++++++++++++
 pageserver/src/http/routes.rs                 | 41 +++++++++++++++
 .../src/tenant/remote_timeline_client.rs      | 48 ++++++++++++++++--
 .../tenant/remote_timeline_client/download.rs | 18 +++++--
 .../tenant/remote_timeline_client/upload.rs   | 17 ++++++-
 pageserver/src/tenant/timeline.rs             | 15 ++++++
 test_runner/fixtures/pageserver/http.py       | 11 ++++
 test_runner/regress/test_compatibility.py     | 12 ++++-
 test_runner/regress/test_wal_restore.py       |  3 ++
 9 files changed, 204 insertions(+), 11 deletions(-)

diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml
index 1fbca1086f..a49eef8bb9 100644
--- a/pageserver/src/http/openapi_spec.yml
+++ b/pageserver/src/http/openapi_spec.yml
@@ -877,6 +877,56 @@ paths:
               schema:
                 $ref: "#/components/schemas/ServiceUnavailableError"
 
+  /v1/tenant/{tenant_id}/{timeline_id}/preserve_initdb_archive:
+    parameters:
+      - name: tenant_id
+        in: path
+        required: true
+        schema:
+          type: string
+      - name: timeline_id
+        in: path
+        required: true
+        schema:
+          type: string
+    post:
+      description: |
+        Marks the initdb archive for preservation upon deletion of the timeline or tenant.
+        This is meant to be part of the disaster recovery process.
+      responses:
+        "202":
+          description: Tenant scheduled to load successfully
+        "404":
+          description: No tenant or timeline found for the specified ids
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/Error"
+        "401":
+          description: Unauthorized Error
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/UnauthorizedError"
+        "403":
+          description: Forbidden Error
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/ForbiddenError"
+        "500":
+          description: Generic operation error
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/Error"
+        "503":
+          description: Temporarily unavailable, please retry.
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/ServiceUnavailableError"
+
 
   /v1/tenant/{tenant_id}/synthetic_size:
     parameters:
diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs
index 811232397c..5e09a5aa1a 100644
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -561,6 +561,43 @@ async fn timeline_list_handler(
     json_response(StatusCode::OK, response_data)
 }
 
+async fn timeline_preserve_initdb_handler(
+    request: Request<Body>,
+    _cancel: CancellationToken,
+) -> Result<Response<Body>, ApiError> {
+    let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
+    let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
+    check_permission(&request, Some(tenant_shard_id.tenant_id))?;
+
+    // Part of the process for disaster recovery from safekeeper-stored WAL:
+    // If we don't recover into a new timeline but want to keep the timeline ID,
+    // then the initdb archive is deleted. This endpoint copies it to a different
+    // location where timeline recreation cand find it.
+
+    async {
+        let tenant = mgr::get_tenant(tenant_shard_id, true)?;
+
+        let timeline = tenant
+            .get_timeline(timeline_id, false)
+            .map_err(|e| ApiError::NotFound(e.into()))?;
+
+        timeline
+            .preserve_initdb_archive()
+            .await
+            .context("preserving initdb archive")
+            .map_err(ApiError::InternalServerError)?;
+
+        Ok::<_, ApiError>(())
+    }
+    .instrument(info_span!("timeline_preserve_initdb_archive",
+                tenant_id = %tenant_shard_id.tenant_id,
+                shard_id = %tenant_shard_id.shard_slug(),
+                %timeline_id))
+    .await?;
+
+    json_response(StatusCode::OK, ())
+}
+
 async fn timeline_detail_handler(
     request: Request<Body>,
     _cancel: CancellationToken,
@@ -1943,6 +1980,10 @@ pub fn make_router(
         .post("/v1/tenant/:tenant_id/ignore", |r| {
             api_handler(r, tenant_ignore_handler)
         })
+        .post(
+            "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/preserve_initdb_archive",
+            |r| api_handler(r, timeline_preserve_initdb_handler),
+        )
         .get("/v1/tenant/:tenant_shard_id/timeline/:timeline_id", |r| {
             api_handler(r, timeline_detail_handler)
         })
diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs
index 1b5f861c90..80ff5c9a2d 100644
--- a/pageserver/src/tenant/remote_timeline_client.rs
+++ b/pageserver/src/tenant/remote_timeline_client.rs
@@ -257,6 +257,8 @@ pub(crate) const FAILED_UPLOAD_WARN_THRESHOLD: u32 = 3;
 
 pub(crate) const INITDB_PATH: &str = "initdb.tar.zst";
 
+pub(crate) const INITDB_PRESERVED_PATH: &str = "initdb-preserved.tar.zst";
+
 /// Default buffer size when interfacing with [`tokio::fs::File`].
 pub(crate) const BUFFER_SIZE: usize = 32 * 1024;
 
@@ -1066,6 +1068,28 @@ impl RemoteTimelineClient {
         Ok(())
     }
 
+    pub(crate) async fn preserve_initdb_archive(
+        self: &Arc<Self>,
+        tenant_id: &TenantId,
+        timeline_id: &TimelineId,
+        cancel: &CancellationToken,
+    ) -> anyhow::Result<()> {
+        backoff::retry(
+            || async {
+                upload::preserve_initdb_archive(&self.storage_impl, tenant_id, timeline_id, cancel)
+                    .await
+            },
+            |_e| false,
+            FAILED_DOWNLOAD_WARN_THRESHOLD,
+            FAILED_REMOTE_OP_RETRIES,
+            "preserve_initdb_tar_zst",
+            backoff::Cancel::new(cancel.clone(), || anyhow::anyhow!("Cancelled!")),
+        )
+        .await
+        .context("backing up initdb archive")?;
+        Ok(())
+    }
+
     /// Prerequisites: UploadQueue should be in stopped state and deleted_at should be successfuly set.
     /// The function deletes layer files one by one, then lists the prefix to see if we leaked something
     /// deletes leaked files if any and proceeds with deletion of index file at the end.
@@ -1101,6 +1125,14 @@ impl RemoteTimelineClient {
         let layer_deletion_count = layers.len();
         self.deletion_queue_client.push_immediate(layers).await?;
 
+        // Delete the initdb.tar.zst, which is not always present, but deletion attempts of
+        // inexistant objects are not considered errors.
+        let initdb_path =
+            remote_initdb_archive_path(&self.tenant_shard_id.tenant_id, &self.timeline_id);
+        self.deletion_queue_client
+            .push_immediate(vec![initdb_path])
+            .await?;
+
         // Do not delete index part yet, it is needed for possible retry. If we remove it first
         // and retry will arrive to different pageserver there wont be any traces of it on remote storage
         let timeline_storage_path = remote_timeline_path(&self.tenant_shard_id, &self.timeline_id);
@@ -1148,10 +1180,8 @@ impl RemoteTimelineClient {
                 if p == &latest_index {
                     return false;
                 }
-                if let Some(name) = p.object_name() {
-                    if name == INITDB_PATH {
-                        return false;
-                    }
+                if p.object_name() == Some(INITDB_PRESERVED_PATH) {
+                    return false;
                 }
                 true
             })
@@ -1724,6 +1754,16 @@ pub fn remote_initdb_archive_path(tenant_id: &TenantId, timeline_id: &TimelineId
     .expect("Failed to construct path")
 }
 
+pub fn remote_initdb_preserved_archive_path(
+    tenant_id: &TenantId,
+    timeline_id: &TimelineId,
+) -> RemotePath {
+    RemotePath::from_string(&format!(
+        "tenants/{tenant_id}/{TIMELINES_SEGMENT_NAME}/{timeline_id}/{INITDB_PRESERVED_PATH}"
+    ))
+    .expect("Failed to construct path")
+}
+
 pub fn remote_index_path(
     tenant_shard_id: &TenantShardId,
     timeline_id: &TimelineId,
diff --git a/pageserver/src/tenant/remote_timeline_client/download.rs b/pageserver/src/tenant/remote_timeline_client/download.rs
index d3956163c8..4309c683e2 100644
--- a/pageserver/src/tenant/remote_timeline_client/download.rs
+++ b/pageserver/src/tenant/remote_timeline_client/download.rs
@@ -32,7 +32,8 @@ use utils::id::TimelineId;
 use super::index::{IndexPart, LayerFileMetadata};
 use super::{
     parse_remote_index_path, remote_index_path, remote_initdb_archive_path,
-    FAILED_DOWNLOAD_WARN_THRESHOLD, FAILED_REMOTE_OP_RETRIES, INITDB_PATH,
+    remote_initdb_preserved_archive_path, FAILED_DOWNLOAD_WARN_THRESHOLD, FAILED_REMOTE_OP_RETRIES,
+    INITDB_PATH,
 };
 
 ///
@@ -430,6 +431,9 @@ pub(crate) async fn download_initdb_tar_zst(
 
     let remote_path = remote_initdb_archive_path(&tenant_shard_id.tenant_id, timeline_id);
 
+    let remote_preserved_path =
+        remote_initdb_preserved_archive_path(&tenant_shard_id.tenant_id, timeline_id);
+
     let timeline_path = conf.timelines_path(tenant_shard_id);
 
     if !timeline_path.exists() {
@@ -456,8 +460,16 @@ pub(crate) async fn download_initdb_tar_zst(
                 .with_context(|| format!("tempfile creation {temp_path}"))
                 .map_err(DownloadError::Other)?;
 
-            let download =
-                download_cancellable(&cancel_inner, storage.download(&remote_path)).await?;
+            let download = match download_cancellable(&cancel_inner, storage.download(&remote_path))
+                .await
+            {
+                Ok(dl) => dl,
+                Err(DownloadError::NotFound) => {
+                    download_cancellable(&cancel_inner, storage.download(&remote_preserved_path))
+                        .await?
+                }
+                Err(other) => Err(other)?,
+            };
             let mut download = tokio_util::io::StreamReader::new(download.download_stream);
             let mut writer = tokio::io::BufWriter::with_capacity(8 * 1024, file);
 
diff --git a/pageserver/src/tenant/remote_timeline_client/upload.rs b/pageserver/src/tenant/remote_timeline_client/upload.rs
index 11c6956875..58d95f75c2 100644
--- a/pageserver/src/tenant/remote_timeline_client/upload.rs
+++ b/pageserver/src/tenant/remote_timeline_client/upload.rs
@@ -13,8 +13,8 @@ use super::Generation;
 use crate::{
     config::PageServerConf,
     tenant::remote_timeline_client::{
-        index::IndexPart, remote_index_path, remote_initdb_archive_path, remote_path,
-        upload_cancellable,
+        index::IndexPart, remote_index_path, remote_initdb_archive_path,
+        remote_initdb_preserved_archive_path, remote_path, upload_cancellable,
     },
 };
 use remote_storage::GenericRemoteStorage;
@@ -144,3 +144,16 @@ pub(crate) async fn upload_initdb_dir(
     .await
     .with_context(|| format!("upload initdb dir for '{tenant_id} / {timeline_id}'"))
 }
+
+pub(crate) async fn preserve_initdb_archive(
+    storage: &GenericRemoteStorage,
+    tenant_id: &TenantId,
+    timeline_id: &TimelineId,
+    cancel: &CancellationToken,
+) -> anyhow::Result<()> {
+    let source_path = remote_initdb_archive_path(tenant_id, timeline_id);
+    let dest_path = remote_initdb_preserved_archive_path(tenant_id, timeline_id);
+    upload_cancellable(cancel, storage.copy_object(&source_path, &dest_path))
+        .await
+        .with_context(|| format!("backing up initdb archive for '{tenant_id} / {timeline_id}'"))
+}
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index bac9bf6573..603ae3b83a 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -2973,6 +2973,21 @@ impl Timeline {
         Ok(())
     }
 
+    pub(crate) async fn preserve_initdb_archive(&self) -> anyhow::Result<()> {
+        if let Some(remote_client) = &self.remote_client {
+            remote_client
+                .preserve_initdb_archive(
+                    &self.tenant_shard_id.tenant_id,
+                    &self.timeline_id,
+                    &self.cancel,
+                )
+                .await?;
+        } else {
+            bail!("No remote storage configured, but was asked to backup the initdb archive for {} / {}", self.tenant_shard_id.tenant_id, self.timeline_id);
+        }
+        Ok(())
+    }
+
     // Write out the given frozen in-memory layer as a new L0 delta file. This L0 file will not be tracked
     // in layer map immediately. The caller is responsible to put it into the layer map.
     async fn create_delta_layer(
diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py
index cfa2a2674d..ddf83b56a0 100644
--- a/test_runner/fixtures/pageserver/http.py
+++ b/test_runner/fixtures/pageserver/http.py
@@ -526,6 +526,17 @@ class PageserverHttpClient(requests.Session):
         res_json = res.json()
         assert res_json is None
 
+    def timeline_preserve_initdb_archive(
+        self, tenant_id: Union[TenantId, TenantShardId], timeline_id: TimelineId
+    ):
+        log.info(
+            f"Requesting initdb archive preservation for tenant {tenant_id} and timeline {timeline_id}"
+        )
+        res = self.post(
+            f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/preserve_initdb_archive",
+        )
+        self.verbose_error(res)
+
     def timeline_get_lsn_by_timestamp(
         self,
         tenant_id: Union[TenantId, TenantShardId],
diff --git a/test_runner/regress/test_compatibility.py b/test_runner/regress/test_compatibility.py
index f9d6d0a934..1a1425f069 100644
--- a/test_runner/regress/test_compatibility.py
+++ b/test_runner/regress/test_compatibility.py
@@ -7,11 +7,13 @@ from typing import List, Optional
 
 import pytest
 import toml
+from fixtures.log_helper import log
 from fixtures.neon_fixtures import (
     NeonEnv,
     NeonEnvBuilder,
     PgBin,
 )
+from fixtures.pageserver.http import PageserverApiException
 from fixtures.pageserver.utils import (
     timeline_delete_wait_completed,
     wait_for_last_record_lsn,
@@ -269,14 +271,20 @@ def check_neon_works(env: NeonEnv, test_output_dir: Path, sql_dump_path: Path, r
     timeline_id = env.initial_timeline
     pg_version = env.pg_version
 
-    # Delete all files from local_fs_remote_storage except initdb.tar.zst,
+    try:
+        pageserver_http.timeline_preserve_initdb_archive(tenant_id, timeline_id)
+    except PageserverApiException as e:
+        # Allow the error as we might be running the old pageserver binary
+        log.info(f"Got allowed error: '{e}'")
+
+    # Delete all files from local_fs_remote_storage except initdb-preserved.tar.zst,
     # the file is required for `timeline_create` with `existing_initdb_timeline_id`.
     #
     # TODO: switch to Path.walk() in Python 3.12
     # for dirpath, _dirnames, filenames in (repo_dir / "local_fs_remote_storage").walk():
     for dirpath, _dirnames, filenames in os.walk(repo_dir / "local_fs_remote_storage"):
         for filename in filenames:
-            if filename != "initdb.tar.zst":
+            if filename != "initdb-preserved.tar.zst" and filename != "initdb.tar.zst":
                 (Path(dirpath) / filename).unlink()
 
     timeline_delete_wait_completed(pageserver_http, tenant_id, timeline_id)
diff --git a/test_runner/regress/test_wal_restore.py b/test_runner/regress/test_wal_restore.py
index 7d03f644d1..97db857c74 100644
--- a/test_runner/regress/test_wal_restore.py
+++ b/test_runner/regress/test_wal_restore.py
@@ -137,6 +137,9 @@ def test_wal_restore_http(neon_env_builder: NeonEnvBuilder):
 
     ps_client = env.pageserver.http_client()
 
+    # Mark the initdb archive for preservation
+    ps_client.timeline_preserve_initdb_archive(tenant_id, timeline_id)
+
     # shut down the endpoint and delete the timeline from the pageserver
     endpoint.stop()
 

From 743f6dfb9be416748aed8f43f4ca294f71d9e849 Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Tue, 23 Jan 2024 20:14:32 +0100
Subject: [PATCH 40/70] fix(attachment_service): corrupted attachments.json
 when parallel requests (#6450)

The pagebench integration PR (#6214) issues attachment requests in
parallel.
We observed corrupted attachments.json from time to time, especially in
the test cases with high tenant counts.

The atomic overwrite added in #6444 exposed the root cause cleanly:
the `.commit()` calls of two request handlers could interleave or
be reordered.
See also:
https://github.com/neondatabase/neon/pull/6444#issuecomment-1906392259

This PR makes changes to the `persistence` module to fix above race:
- mpsc queue for PendingWrites
- one writer task performs the writes in mpsc queue order
- request handlers that need to do writes do it using the
  new `mutating_transaction` function.

`mutating_transaction`, while holding the lock, does the modifications,
serializes the post-modification state, and pushes that as a
`PendingWrite` into the mpsc queue.
It then release the lock and `await`s the completion of the write.
The writer tasks executes the `PendingWrites` in queue order.
Once the write has been executed, it wakes the writing tokio task.
---
 Cargo.lock                                    |   1 +
 control_plane/attachment_service/Cargo.toml   |   1 +
 control_plane/attachment_service/src/main.rs  |   2 +-
 .../attachment_service/src/persistence.rs     | 153 ++++++++++--------
 4 files changed, 92 insertions(+), 65 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index 37dba60aee..02a437ccf9 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -286,6 +286,7 @@ dependencies = [
  "pageserver_client",
  "postgres_backend",
  "postgres_connection",
+ "scopeguard",
  "serde",
  "serde_json",
  "thiserror",
diff --git a/control_plane/attachment_service/Cargo.toml b/control_plane/attachment_service/Cargo.toml
index 2e2286dbab..743dd806c4 100644
--- a/control_plane/attachment_service/Cargo.toml
+++ b/control_plane/attachment_service/Cargo.toml
@@ -14,6 +14,7 @@ hyper.workspace = true
 pageserver_api.workspace = true
 pageserver_client.workspace = true
 postgres_connection.workspace = true
+scopeguard.workspace = true
 serde.workspace = true
 serde_json.workspace = true
 thiserror.workspace = true
diff --git a/control_plane/attachment_service/src/main.rs b/control_plane/attachment_service/src/main.rs
index ee2a22ee53..38e51b9a9e 100644
--- a/control_plane/attachment_service/src/main.rs
+++ b/control_plane/attachment_service/src/main.rs
@@ -66,7 +66,7 @@ async fn main() -> anyhow::Result<()> {
         jwt_token: args.jwt_token,
     };
 
-    let persistence = Arc::new(Persistence::new(&args.path).await);
+    let persistence = Arc::new(Persistence::spawn(&args.path).await);
 
     let service = Service::spawn(config, persistence).await?;
 
diff --git a/control_plane/attachment_service/src/persistence.rs b/control_plane/attachment_service/src/persistence.rs
index b9c79ff916..4a3fc8c779 100644
--- a/control_plane/attachment_service/src/persistence.rs
+++ b/control_plane/attachment_service/src/persistence.rs
@@ -1,6 +1,5 @@
 use std::{collections::HashMap, str::FromStr};
 
-use anyhow::Context;
 use camino::{Utf8Path, Utf8PathBuf};
 use control_plane::{
     attachment_service::{NodeAvailability, NodeSchedulingPolicy},
@@ -12,6 +11,7 @@ use pageserver_api::{
 };
 use postgres_connection::parse_host_port;
 use serde::{Deserialize, Serialize};
+use tracing::info;
 use utils::{
     generation::Generation,
     id::{NodeId, TenantId},
@@ -21,50 +21,28 @@ use crate::{node::Node, PlacementPolicy};
 
 /// Placeholder for storage.  This will be replaced with a database client.
 pub struct Persistence {
-    state: std::sync::Mutex<PersistentState>,
+    inner: std::sync::Mutex<Inner>,
+}
+
+struct Inner {
+    state: PersistentState,
+    write_queue_tx: tokio::sync::mpsc::UnboundedSender<PendingWrite>,
 }
 
-// Top level state available to all HTTP handlers
 #[derive(Serialize, Deserialize)]
 struct PersistentState {
     tenants: HashMap<TenantShardId, TenantShardPersistence>,
-
-    #[serde(skip)]
-    path: Utf8PathBuf,
 }
 
-/// A convenience for serializing the state inside a sync lock, and then
-/// writing it to disk outside of the lock.  This will go away when switching
-/// to a database backend.
 struct PendingWrite {
     bytes: Vec<u8>,
-    path: Utf8PathBuf,
-}
-
-impl PendingWrite {
-    async fn commit(self) -> anyhow::Result<()> {
-        tokio::task::spawn_blocking(move || {
-            let tmp_path = utils::crashsafe::path_with_suffix_extension(&self.path, "___new");
-            utils::crashsafe::overwrite(&self.path, &tmp_path, &self.bytes)
-        })
-        .await
-        .context("spawn_blocking")?
-        .context("write file")
-    }
+    done_tx: tokio::sync::oneshot::Sender<()>,
 }
 
 impl PersistentState {
-    fn save(&self) -> PendingWrite {
-        PendingWrite {
-            bytes: serde_json::to_vec(self).expect("Serialization error"),
-            path: self.path.clone(),
-        }
-    }
-
     async fn load(path: &Utf8Path) -> anyhow::Result<Self> {
         let bytes = tokio::fs::read(path).await?;
         let mut decoded = serde_json::from_slice::<Self>(&bytes)?;
-        decoded.path = path.to_owned();
 
         for (tenant_id, tenant) in &mut decoded.tenants {
             // Backward compat: an old attachments.json from before PR #6251, replace
@@ -93,7 +71,6 @@ impl PersistentState {
                 tracing::info!("Will create state file at {}", path);
                 Self {
                     tenants: HashMap::new(),
-                    path: path.to_owned(),
                 }
             }
             Err(e) => {
@@ -104,13 +81,74 @@ impl PersistentState {
 }
 
 impl Persistence {
-    pub async fn new(path: &Utf8Path) -> Self {
+    pub async fn spawn(path: &Utf8Path) -> Self {
+        let (tx, rx) = tokio::sync::mpsc::unbounded_channel();
         let state = PersistentState::load_or_new(path).await;
+        tokio::spawn(Self::writer_task(rx, path.to_owned()));
         Self {
-            state: std::sync::Mutex::new(state),
+            inner: std::sync::Mutex::new(Inner {
+                state,
+                write_queue_tx: tx,
+            }),
         }
     }
 
+    async fn writer_task(
+        mut rx: tokio::sync::mpsc::UnboundedReceiver<PendingWrite>,
+        path: Utf8PathBuf,
+    ) {
+        scopeguard::defer! {
+            info!("persistence writer task exiting");
+        };
+        loop {
+            match rx.recv().await {
+                Some(write) => {
+                    tokio::task::spawn_blocking({
+                        let path = path.clone();
+                        move || {
+                            let tmp_path =
+                                utils::crashsafe::path_with_suffix_extension(&path, "___new");
+                            utils::crashsafe::overwrite(&path, &tmp_path, &write.bytes)
+                        }
+                    })
+                    .await
+                    .expect("spawn_blocking")
+                    .expect("write file");
+                    let _ = write.done_tx.send(()); // receiver may lose interest any time
+                }
+                None => {
+                    return;
+                }
+            }
+        }
+    }
+
+    /// Perform a modification on our [`PersistentState`].
+    /// Return a future that completes once our modification has been persisted.
+    /// The output of the future is the return value of the `txn`` closure.
+    async fn mutating_transaction<F, R>(&self, txn: F) -> R
+    where
+        F: FnOnce(&mut PersistentState) -> R,
+    {
+        let (ret, done_rx) = {
+            let mut inner = self.inner.lock().unwrap();
+            let ret = txn(&mut inner.state);
+            let (done_tx, done_rx) = tokio::sync::oneshot::channel();
+            let write = PendingWrite {
+                bytes: serde_json::to_vec(&inner.state).expect("Serialization error"),
+                done_tx,
+            };
+            inner
+                .write_queue_tx
+                .send(write)
+                .expect("writer task always outlives self");
+            (ret, done_rx)
+        };
+        // the write task can go away once we start .await'ing
+        let _: () = done_rx.await.expect("writer task dead, check logs");
+        ret
+    }
+
     /// When registering a node, persist it so that on next start we will be able to
     /// iterate over known nodes to synchronize their tenant shard states with our observed state.
     pub(crate) async fn insert_node(&self, _node: &Node) -> anyhow::Result<()> {
@@ -154,8 +192,8 @@ impl Persistence {
 
     /// At startup, we populate our map of tenant shards from persistent storage.
     pub(crate) async fn list_tenant_shards(&self) -> anyhow::Result<Vec<TenantShardPersistence>> {
-        let locked = self.state.lock().unwrap();
-        Ok(locked.tenants.values().cloned().collect())
+        let inner = self.inner.lock().unwrap();
+        Ok(inner.state.tenants.values().cloned().collect())
     }
 
     /// Tenants must be persisted before we schedule them for the first time.  This enables us
@@ -164,8 +202,7 @@ impl Persistence {
         &self,
         shards: Vec<TenantShardPersistence>,
     ) -> anyhow::Result<()> {
-        let write = {
-            let mut locked = self.state.lock().unwrap();
+        self.mutating_transaction(|locked| {
             for shard in shards {
                 let tenant_shard_id = TenantShardId {
                     tenant_id: TenantId::from_str(shard.tenant_id.as_str())?,
@@ -175,12 +212,9 @@ impl Persistence {
 
                 locked.tenants.insert(tenant_shard_id, shard);
             }
-            locked.save()
-        };
-
-        write.commit().await?;
-
-        Ok(())
+            Ok(())
+        })
+        .await
     }
 
     /// Reconciler calls this immediately before attaching to a new pageserver, to acquire a unique, monotonically
@@ -191,8 +225,7 @@ impl Persistence {
         tenant_shard_id: TenantShardId,
         node_id: NodeId,
     ) -> anyhow::Result<Generation> {
-        let (write, gen) = {
-            let mut locked = self.state.lock().unwrap();
+        self.mutating_transaction(|locked| {
             let Some(shard) = locked.tenants.get_mut(&tenant_shard_id) else {
                 anyhow::bail!("Tried to increment generation of unknown shard");
             };
@@ -201,45 +234,37 @@ impl Persistence {
             shard.generation_pageserver = Some(node_id);
 
             let gen = Generation::new(shard.generation);
-            (locked.save(), gen)
-        };
-
-        write.commit().await?;
-        Ok(gen)
+            Ok(gen)
+        })
+        .await
     }
 
     pub(crate) async fn detach(&self, tenant_shard_id: TenantShardId) -> anyhow::Result<()> {
-        let write = {
-            let mut locked = self.state.lock().unwrap();
+        self.mutating_transaction(|locked| {
             let Some(shard) = locked.tenants.get_mut(&tenant_shard_id) else {
                 anyhow::bail!("Tried to increment generation of unknown shard");
             };
             shard.generation_pageserver = None;
-            locked.save()
-        };
-        write.commit().await?;
-        Ok(())
+            Ok(())
+        })
+        .await
     }
 
     pub(crate) async fn re_attach(
         &self,
         node_id: NodeId,
     ) -> anyhow::Result<HashMap<TenantShardId, Generation>> {
-        let (write, result) = {
+        self.mutating_transaction(|locked| {
             let mut result = HashMap::new();
-            let mut locked = self.state.lock().unwrap();
             for (tenant_shard_id, shard) in locked.tenants.iter_mut() {
                 if shard.generation_pageserver == Some(node_id) {
                     shard.generation += 1;
                     result.insert(*tenant_shard_id, Generation::new(shard.generation));
                 }
             }
-
-            (locked.save(), result)
-        };
-
-        write.commit().await?;
-        Ok(result)
+            Ok(result)
+        })
+        .await
     }
 
     // TODO: when we start shard splitting, we must durably mark the tenant so that

From 4f51824820a7c06a0aabff6490a7223f9148bcec Mon Sep 17 00:00:00 2001
From: Sasha Krassovsky <sasha@neon.tech>
Date: Wed, 24 Jan 2024 02:29:08 +0000
Subject: [PATCH 41/70] Fix creating publications for all tables

---
 test_runner/regress/test_neon_superuser.py | 34 ++++++++++++++++++++++
 vendor/postgres-v14                        |  2 +-
 vendor/postgres-v15                        |  2 +-
 vendor/postgres-v16                        |  2 +-
 vendor/revisions.json                      |  6 ++--
 5 files changed, 40 insertions(+), 6 deletions(-)
 create mode 100644 test_runner/regress/test_neon_superuser.py

diff --git a/test_runner/regress/test_neon_superuser.py b/test_runner/regress/test_neon_superuser.py
new file mode 100644
index 0000000000..6be7c114cb
--- /dev/null
+++ b/test_runner/regress/test_neon_superuser.py
@@ -0,0 +1,34 @@
+import time
+
+from fixtures.neon_fixtures import NeonEnv
+from fixtures.pg_version import PgVersion
+
+
+def test_neon_superuser(neon_simple_env: NeonEnv, pg_version: PgVersion):
+    env = neon_simple_env
+    env.neon_cli.create_branch("test_neon_superuser", "empty")
+    endpoint = env.endpoints.create("test_neon_superuser")
+    endpoint.respec(skip_pg_catalog_updates=False, features=["migrations"])
+    endpoint.start()
+
+    time.sleep(1)  # Sleep to let migrations run
+
+    with endpoint.cursor() as cur:
+        cur.execute(
+            "CREATE ROLE mr_whiskers WITH PASSWORD 'cat' LOGIN INHERIT CREATEROLE CREATEDB BYPASSRLS REPLICATION IN ROLE neon_superuser"
+        )
+        cur.execute("CREATE DATABASE neondb WITH OWNER mr_whiskers")
+        cur.execute("GRANT ALL PRIVILEGES ON DATABASE neondb TO neon_superuser")
+
+    with endpoint.cursor(dbname="neondb", user="mr_whiskers", password="cat") as cur:
+        cur.execute("SELECT pg_has_role('mr_whiskers', 'neon_superuser', 'member')")
+        assert cur.fetchall()[0][0]
+        cur.execute("SELECT pg_has_role('mr_whiskers', 'neon_superuser', 'usage')")
+        assert cur.fetchall()[0][0]
+
+        if pg_version == PgVersion.V16:
+            cur.execute("SELECT pg_has_role('mr_whiskers', 'neon_superuser', 'set')")
+            assert cur.fetchall()[0][0]
+
+        cur.execute("CREATE PUBLICATION pub FOR ALL TABLES")
+        cur.execute("CREATE ROLE definitely_not_a_superuser WITH PASSWORD 'nope'")
diff --git a/vendor/postgres-v14 b/vendor/postgres-v14
index 8207291128..11e970fe2b 160000
--- a/vendor/postgres-v14
+++ b/vendor/postgres-v14
@@ -1 +1 @@
-Subproject commit 82072911287cabb32018cf92c8425fa1c744def4
+Subproject commit 11e970fe2be56804f0a786ec5fc8141ffefa4ca7
diff --git a/vendor/postgres-v15 b/vendor/postgres-v15
index c1c2272f43..731b4d1609 160000
--- a/vendor/postgres-v15
+++ b/vendor/postgres-v15
@@ -1 +1 @@
-Subproject commit c1c2272f436ed9231f6172f49de219fe71a9280d
+Subproject commit 731b4d1609d6db1c953755810a41e0e67ea3db7b
diff --git a/vendor/postgres-v16 b/vendor/postgres-v16
index 7be4a52d72..cf302768b2 160000
--- a/vendor/postgres-v16
+++ b/vendor/postgres-v16
@@ -1 +1 @@
-Subproject commit 7be4a52d728459b79b59343c57d338c3073059c8
+Subproject commit cf302768b2890569956641e0e5ba112ae1445351
diff --git a/vendor/revisions.json b/vendor/revisions.json
index 3d626cb2bc..c7b33f8c8a 100644
--- a/vendor/revisions.json
+++ b/vendor/revisions.json
@@ -1,5 +1,5 @@
 {
-    "postgres-v16": "7be4a52d728459b79b59343c57d338c3073059c8",
-    "postgres-v15": "c1c2272f436ed9231f6172f49de219fe71a9280d",
-    "postgres-v14": "82072911287cabb32018cf92c8425fa1c744def4"
+    "postgres-v16": "cf302768b2890569956641e0e5ba112ae1445351",
+    "postgres-v15": "731b4d1609d6db1c953755810a41e0e67ea3db7b",
+    "postgres-v14": "11e970fe2be56804f0a786ec5fc8141ffefa4ca7"
 }

From a72af29d12314fe132a91a0d2a8e1fe56fec7878 Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Wed, 24 Jan 2024 11:49:30 +0000
Subject: [PATCH 42/70] control_plane/attachment_service: implement
 PlacementPolicy::Detached (#6458)

## Problem

The API for detaching things wasn't implement yet, but one could hit
this case indirectly from tests when using attach-hook, and find tenants
unexpectedly attached again because their policy remained Single.

## Summary of changes

Add PlacementPolicy::Detached, and:
- add the behavior for it in schedule()
- in tenant_migrate, refuse if the policy is detached
- automatically set this policy in attach-hook if the caller has
specified pageserver=null.
---
 control_plane/attachment_service/src/lib.rs          |  2 ++
 control_plane/attachment_service/src/persistence.rs  |  1 +
 control_plane/attachment_service/src/service.rs      | 12 +++++++++++-
 control_plane/attachment_service/src/tenant_state.rs | 12 ++++++++++++
 4 files changed, 26 insertions(+), 1 deletion(-)

diff --git a/control_plane/attachment_service/src/lib.rs b/control_plane/attachment_service/src/lib.rs
index d8f996952a..e4ca9aa304 100644
--- a/control_plane/attachment_service/src/lib.rs
+++ b/control_plane/attachment_service/src/lib.rs
@@ -17,6 +17,8 @@ enum PlacementPolicy {
     /// Production-ready way to attach a tenant: one attached pageserver and
     /// some number of secondaries.
     Double(usize),
+    /// Do not attach to any pageservers
+    Detached,
 }
 
 #[derive(Ord, PartialOrd, Eq, PartialEq, Copy, Clone)]
diff --git a/control_plane/attachment_service/src/persistence.rs b/control_plane/attachment_service/src/persistence.rs
index 4a3fc8c779..e944a2e9ed 100644
--- a/control_plane/attachment_service/src/persistence.rs
+++ b/control_plane/attachment_service/src/persistence.rs
@@ -245,6 +245,7 @@ impl Persistence {
                 anyhow::bail!("Tried to increment generation of unknown shard");
             };
             shard.generation_pageserver = None;
+            shard.placement_policy = serde_json::to_string(&PlacementPolicy::Detached).unwrap();
             Ok(())
         })
         .await
diff --git a/control_plane/attachment_service/src/service.rs b/control_plane/attachment_service/src/service.rs
index 5b44c097b1..c9ed07ae5f 100644
--- a/control_plane/attachment_service/src/service.rs
+++ b/control_plane/attachment_service/src/service.rs
@@ -381,6 +381,11 @@ impl Service {
 
         if let Some(new_generation) = new_generation {
             tenant_state.generation = new_generation;
+        } else {
+            // This is a detach notification.  We must update placement policy to avoid re-attaching
+            // during background scheduling/reconciliation, or during attachment service restart.
+            assert!(attach_req.node_id.is_none());
+            tenant_state.policy = PlacementPolicy::Detached;
         }
 
         if let Some(attaching_pageserver) = attach_req.node_id.as_ref() {
@@ -870,7 +875,6 @@ impl Service {
             } else {
                 let old_attached = shard.intent.attached;
 
-                shard.intent.attached = Some(migrate_req.node_id);
                 match shard.policy {
                     PlacementPolicy::Single => {
                         shard.intent.secondary.clear();
@@ -884,7 +888,13 @@ impl Service {
                             shard.intent.secondary.push(old_attached);
                         }
                     }
+                    PlacementPolicy::Detached => {
+                        return Err(ApiError::BadRequest(anyhow::anyhow!(
+                            "Cannot migrate a tenant that is PlacementPolicy::Detached: configure it to an attached policy first"
+                        )))
+                    }
                 }
+                shard.intent.attached = Some(migrate_req.node_id);
 
                 tracing::info!("Migrating: new intent {:?}", shard.intent);
                 shard.sequence = shard.sequence.next();
diff --git a/control_plane/attachment_service/src/tenant_state.rs b/control_plane/attachment_service/src/tenant_state.rs
index a907628eff..5290197d84 100644
--- a/control_plane/attachment_service/src/tenant_state.rs
+++ b/control_plane/attachment_service/src/tenant_state.rs
@@ -312,6 +312,18 @@ impl TenantState {
                     modified = true;
                 }
             }
+            Detached => {
+                // Should have no attached or secondary pageservers
+                if self.intent.attached.is_some() {
+                    self.intent.attached = None;
+                    modified = true;
+                }
+
+                if !self.intent.secondary.is_empty() {
+                    self.intent.secondary.clear();
+                    modified = true;
+                }
+            }
         }
 
         if modified {

From 996abc95632848c788ad12f2e3c9240fdd1ed9f4 Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Wed, 24 Jan 2024 12:51:53 +0100
Subject: [PATCH 43/70] pagebench-based GetPage@LSN performance test (#6214)

---
 pageserver/src/tenant.rs                      |   5 +-
 scripts/ps_ec2_setup_instance_store           |   3 +
 test_runner/fixtures/neon_fixtures.py         | 232 ++++++++++++++++--
 .../fixtures/pageserver/many_tenants.py       |  85 +++++++
 .../fixtures/pageserver/remote_storage.py     | 116 +++++++++
 test_runner/fixtures/pageserver/types.py      |   6 +-
 test_runner/fixtures/pageserver/utils.py      |  40 ++-
 test_runner/fixtures/utils.py                 |  33 +++
 test_runner/performance/pageserver/README.md  |  16 ++
 .../performance/pageserver/__init__.py        |   0
 .../pageserver/interactive/__init__.py        |   8 +
 .../interactive/test_many_small_tenants.py    |  79 ++++++
 .../pageserver/pagebench/__init__.py          |  10 +
 ...er_max_throughput_getpage_at_latest_lsn.py | 210 ++++++++++++++++
 test_runner/performance/pageserver/util.py    |  29 +++
 test_runner/regress/test_tenant_detach.py     |   6 +-
 16 files changed, 852 insertions(+), 26 deletions(-)
 create mode 100644 test_runner/fixtures/pageserver/many_tenants.py
 create mode 100644 test_runner/fixtures/pageserver/remote_storage.py
 create mode 100644 test_runner/performance/pageserver/README.md
 create mode 100644 test_runner/performance/pageserver/__init__.py
 create mode 100644 test_runner/performance/pageserver/interactive/__init__.py
 create mode 100644 test_runner/performance/pageserver/interactive/test_many_small_tenants.py
 create mode 100644 test_runner/performance/pageserver/pagebench/__init__.py
 create mode 100644 test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py
 create mode 100644 test_runner/performance/pageserver/util.py

diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index 1d9b91c9ce..d4d4208ac2 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -1017,7 +1017,10 @@ impl Tenant {
         // IndexPart is the source of truth.
         self.clean_up_timelines(&existent_timelines)?;
 
-        failpoint_support::sleep_millis_async!("attach-before-activate", &self.cancel);
+        fail::fail_point!("attach-before-activate", |_| {
+            anyhow::bail!("attach-before-activate");
+        });
+        failpoint_support::sleep_millis_async!("attach-before-activate-sleep", &self.cancel);
 
         info!("Done");
 
diff --git a/scripts/ps_ec2_setup_instance_store b/scripts/ps_ec2_setup_instance_store
index 00d37e5f83..4cca3a9857 100755
--- a/scripts/ps_ec2_setup_instance_store
+++ b/scripts/ps_ec2_setup_instance_store
@@ -39,6 +39,9 @@ SETUP COMPLETE
 To run your local neon.git build on the instance store volume,
 run the following commands from the top of the neon.git checkout
 
+    # raise file descriptor limit of your shell and its child processes
+    sudo prlimit -p $$ --nofile=800000:800000
+
     # test suite run
     export TEST_OUTPUT="$TEST_OUTPUT"
     DEFAULT_PG_VERSION=15 BUILD_TYPE=release ./scripts/pytest test_runner/performance/test_latency.py
diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index fb1925216a..eef8de876e 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -10,16 +10,18 @@ import shutil
 import subprocess
 import tempfile
 import textwrap
+import threading
 import time
 import uuid
 from contextlib import closing, contextmanager
 from dataclasses import dataclass, field
 from datetime import datetime
+from fcntl import LOCK_EX, LOCK_UN, flock
 from functools import cached_property
 from itertools import chain, product
 from pathlib import Path
 from types import TracebackType
-from typing import Any, Dict, Iterator, List, Optional, Tuple, Type, Union, cast
+from typing import Any, Callable, Dict, Iterator, List, Optional, Tuple, Type, Union, cast
 from urllib.parse import urlparse
 
 import asyncpg
@@ -49,7 +51,10 @@ from fixtures.pageserver.allowed_errors import (
 )
 from fixtures.pageserver.http import PageserverHttpClient
 from fixtures.pageserver.types import IndexPartDump
-from fixtures.pageserver.utils import wait_for_last_record_lsn, wait_for_upload
+from fixtures.pageserver.utils import (
+    wait_for_last_record_lsn,
+    wait_for_upload,
+)
 from fixtures.pg_version import PgVersion
 from fixtures.port_distributor import PortDistributor
 from fixtures.remote_storage import (
@@ -424,6 +429,7 @@ class NeonEnvBuilder:
         pg_distrib_dir: Path,
         pg_version: PgVersion,
         test_name: str,
+        top_output_dir: Path,
         test_output_dir: Path,
         test_overlay_dir: Optional[Path] = None,
         pageserver_remote_storage: Optional[RemoteStorage] = None,
@@ -473,6 +479,7 @@ class NeonEnvBuilder:
         self.test_overlay_dir = test_overlay_dir
         self.overlay_mounts_created_by_us: List[Tuple[str, Path]] = []
         self.config_init_force: Optional[str] = None
+        self.top_output_dir = top_output_dir
 
         assert test_name.startswith(
             "test_"
@@ -526,6 +533,64 @@ class NeonEnvBuilder:
 
         return env
 
+    def build_and_use_snapshot(
+        self, global_ident: str, create_env_for_snapshot: Callable[[NeonEnvBuilder], NeonEnv]
+    ) -> NeonEnv:
+        if os.getenv("CI", "false") == "true":
+            log.info("do not use snapshots in ephemeral CI environment")
+            env = create_env_for_snapshot(self)
+            env.stop(immediate=True, ps_assert_metric_no_errors=False)
+            return env
+
+        with shared_snapshot_dir(self.top_output_dir, global_ident) as snapshot_dir:
+            if not snapshot_dir.is_initialized():
+                self._build_and_use_snapshot_impl(snapshot_dir, create_env_for_snapshot)
+                assert snapshot_dir.is_initialized()
+
+            return self.from_repo_dir(snapshot_dir.path)
+
+    def _build_and_use_snapshot_impl(
+        self,
+        snapshot_dir: SnapshotDirLocked,
+        create_env_for_snapshot: Callable[[NeonEnvBuilder], NeonEnv],
+    ):
+        if snapshot_dir.path.exists():
+            shutil.rmtree(snapshot_dir.path)
+
+        if self.test_overlay_dir is not None:
+            # Make repo_dir an overlayfs mount with lowerdir being the empty snapshot_dir.
+            # When we're done filling up repo_dir, tear everything down, unmount the overlayfs, and use
+            # the upperdir as the snapshot. This is equivalent to docker `FROM scratch`.
+            assert not self.repo_dir.exists()
+            assert self.repo_dir.parent.exists()
+            snapshot_dir.path.mkdir()
+            self.overlay_mount("create-snapshot-repo-dir", snapshot_dir.path, self.repo_dir)
+            self.config_init_force = "empty-dir-ok"
+
+        env = create_env_for_snapshot(self)
+        assert self.env is not None
+        assert self.env == env
+
+        # shut down everything for snapshot
+        env.stop(immediate=True, ps_assert_metric_no_errors=True)
+
+        # TODO: all kinds of assertions to ensure the env is unused
+
+        if self.test_overlay_dir is None:
+            log.info("take snapshot by moving repo dir")
+            env.repo_dir.rename(snapshot_dir.path)
+        else:
+            log.info("take snapshot by using overlayfs upperdir")
+            self.overlay_unmount_and_move("create-snapshot-repo-dir", snapshot_dir.path)
+            log.info("remove empty repo_dir (previously mountpoint) for snapshot overlay_mount")
+            env.repo_dir.rmdir()
+            # TODO from here on, we should be able to reset / goto top where snapshot_dir.is_initialized()
+            log.info("make repo_dir an overlayfs mount of the snapshot we just created")
+        assert not env.repo_dir.exists(), "both branches above should remove it"
+        snapshot_dir.set_initialized()
+
+        self.env = None  # so that from_repo_dir works again
+
     def from_repo_dir(
         self,
         repo_dir: Path,
@@ -557,10 +622,15 @@ class NeonEnvBuilder:
             tenants_from_dir = ps_dir / "tenants"
             tenants_to_dir = self.repo_dir / ps_dir.name / "tenants"
 
-            log.info(f"Copying pageserver tenants directory {tenants_from_dir} to {tenants_to_dir}")
             if self.test_overlay_dir is None:
+                log.info(
+                    f"Copying pageserver tenants directory {tenants_from_dir} to {tenants_to_dir}"
+                )
                 shutil.copytree(tenants_from_dir, tenants_to_dir)
             else:
+                log.info(
+                    f"Creating overlayfs mount of pageserver tenants directory {tenants_from_dir} to {tenants_to_dir}"
+                )
                 self.overlay_mount(f"{ps_dir.name}:tenants", tenants_from_dir, tenants_to_dir)
 
         for sk_from_dir in (repo_dir / "safekeepers").glob("sk*"):
@@ -571,10 +641,12 @@ class NeonEnvBuilder:
 
         shutil.rmtree(self.repo_dir / "local_fs_remote_storage", ignore_errors=True)
         if self.test_overlay_dir is None:
+            log.info("Copying local_fs_remote_storage directory from snapshot")
             shutil.copytree(
                 repo_dir / "local_fs_remote_storage", self.repo_dir / "local_fs_remote_storage"
             )
         else:
+            log.info("Creating overlayfs mount of local_fs_remote_storage directory from snapshot")
             self.overlay_mount(
                 "local_fs_remote_storage",
                 repo_dir / "local_fs_remote_storage",
@@ -631,6 +703,54 @@ class NeonEnvBuilder:
         )
         self.overlay_mounts_created_by_us.append((ident, dstdir))
 
+    def _overlay_umount(self, mountpoint: Path):
+        cmd = ["sudo", "umount", str(mountpoint)]
+        assert mountpoint.is_mount()
+        subprocess_capture(
+            self.test_output_dir, cmd, check=True, echo_stderr=True, echo_stdout=True
+        )
+
+    def overlay_unmount_and_move(self, ident: str, dst: Path):
+        """
+        Unmount previously established overlayfs mount at `dstdir` and move the upperdir contents to `dst`.
+        If `dst` is an empty directory, it gets replaced.
+        Caller is responsible for ensuring the unmount will succeed, i.e., that there aren't any nested mounts.
+
+        Raises exception if self.test_overlay_dir is None
+        """
+        assert self.test_overlay_dir is not None
+        # not mutating state yet, make checks
+        ident_state_dir = self.test_overlay_dir / ident
+        assert ident_state_dir.is_dir()
+        upper = ident_state_dir / "upper"
+        work = ident_state_dir / "work"
+        assert upper.is_dir()
+        assert work.is_dir()
+        assert (
+            self.test_overlay_dir not in dst.parents
+        ), "otherwise workdir cleanup below wouldn't work"
+        # find index, still not mutating state
+        idxmap = {
+            existing_ident: idx
+            for idx, (existing_ident, _) in enumerate(self.overlay_mounts_created_by_us)
+        }
+        idx = idxmap.get(ident)
+        if idx is None:
+            raise RuntimeError(f"cannot find mount for ident {ident}")
+
+        if dst.is_dir():
+            dst.rmdir()  # raises exception if not empty, which is what we want
+
+        _, mountpoint = self.overlay_mounts_created_by_us.pop(idx)
+        self._overlay_umount(mountpoint)
+        upper.rename(dst)
+        # we moved the upperdir, clean up workdir and then its parent ident_state_dir
+        cmd = ["sudo", "rm", "-rf", str(work)]
+        subprocess_capture(
+            self.test_output_dir, cmd, check=True, echo_stderr=True, echo_stdout=True
+        )
+        ident_state_dir.rmdir()  # should be empty since we moved `upper` out
+
     def overlay_cleanup_teardown(self):
         """
         Unmount the overlayfs mounts created by `self.overlay_mount()`.
@@ -641,13 +761,10 @@ class NeonEnvBuilder:
         while len(self.overlay_mounts_created_by_us) > 0:
             (ident, mountpoint) = self.overlay_mounts_created_by_us.pop()
             ident_state_dir = self.test_overlay_dir / ident
-            cmd = ["sudo", "umount", str(mountpoint)]
             log.info(
-                f"Unmounting overlayfs mount created during setup for ident {ident} at {mountpoint}: {cmd}"
-            )
-            subprocess_capture(
-                self.test_output_dir, cmd, check=True, echo_stderr=True, echo_stdout=True
+                f"Unmounting overlayfs mount created during setup for ident {ident} at {mountpoint}"
             )
+            self._overlay_umount(mountpoint)
             log.info(
                 f"Cleaning up overlayfs state dir (owned by root user) for ident {ident} at {ident_state_dir}"
             )
@@ -725,8 +842,15 @@ class NeonEnvBuilder:
         if self.preserve_database_files:
             return
 
+        overlayfs_mounts = {mountpoint for _, mountpoint in self.overlay_mounts_created_by_us}
+
         directories_to_clean: List[Path] = []
         for test_entry in Path(self.repo_dir).glob("**/*"):
+            if test_entry in overlayfs_mounts:
+                continue
+            for parent in test_entry.parents:
+                if parent in overlayfs_mounts:
+                    continue
             if test_entry.is_file():
                 test_file = test_entry
                 if ATTACHMENT_NAME_REGEX.fullmatch(test_file.name):
@@ -775,13 +899,6 @@ class NeonEnvBuilder:
                     log.error(f"Error during remote storage scrub: {e}")
                     cleanup_error = e
 
-            try:
-                self.overlay_cleanup_teardown()
-            except Exception as e:
-                log.error(f"Error cleaning up overlay state: {e}")
-                if cleanup_error is not None:
-                    cleanup_error = e
-
             try:
                 self.cleanup_remote_storage()
             except Exception as e:
@@ -802,6 +919,13 @@ class NeonEnvBuilder:
             for pageserver in self.env.pageservers:
                 pageserver.assert_no_errors()
 
+        try:
+            self.overlay_cleanup_teardown()
+        except Exception as e:
+            log.error(f"Error cleaning up overlay state: {e}")
+            if cleanup_error is not None:
+                cleanup_error = e
+
 
 class NeonEnv:
     """
@@ -1082,6 +1206,7 @@ def _shared_simple_env(
         shutil.rmtree(repo_dir, ignore_errors=True)
 
     with NeonEnvBuilder(
+        top_output_dir=top_output_dir,
         repo_dir=repo_dir,
         port_distributor=port_distributor,
         broker=default_broker,
@@ -1130,6 +1255,7 @@ def neon_env_builder(
     run_id: uuid.UUID,
     request: FixtureRequest,
     test_overlay_dir: Path,
+    top_output_dir: Path,
 ) -> Iterator[NeonEnvBuilder]:
     """
     Fixture to create a Neon environment for test.
@@ -1149,6 +1275,7 @@ def neon_env_builder(
 
     # Return the builder to the caller
     with NeonEnvBuilder(
+        top_output_dir=top_output_dir,
         repo_dir=Path(repo_dir),
         port_distributor=port_distributor,
         mock_s3_server=mock_s3_server,
@@ -3487,6 +3614,10 @@ def get_test_overlay_dir(request: FixtureRequest, top_output_dir: Path) -> Path:
     return _get_test_dir(request, top_output_dir, "overlay-")
 
 
+def get_shared_snapshot_dir_path(top_output_dir: Path, snapshot_name: str) -> Path:
+    return top_output_dir / "shared-snapshots" / snapshot_name
+
+
 def get_test_repo_dir(request: FixtureRequest, top_output_dir: Path) -> Path:
     return get_test_output_dir(request, top_output_dir) / "repo"
 
@@ -3533,6 +3664,75 @@ def test_output_dir(
     allure_attach_from_dir(test_dir)
 
 
+class FileAndThreadLock:
+    def __init__(self, path: Path):
+        self.path = path
+        self.thread_lock = threading.Lock()
+        self.fd: Optional[int] = None
+
+    def __enter__(self):
+        self.fd = os.open(self.path, os.O_CREAT | os.O_WRONLY)
+        # lock thread lock before file lock so that there's no race
+        # around flocking / funlocking the file lock
+        self.thread_lock.acquire()
+        flock(self.fd, LOCK_EX)
+
+    def __exit__(self, exc_type, exc_value, exc_traceback):
+        assert self.fd is not None
+        assert self.thread_lock.locked()  # ... by us
+        flock(self.fd, LOCK_UN)
+        self.thread_lock.release()
+        os.close(self.fd)
+        self.fd = None
+
+
+class SnapshotDirLocked:
+    def __init__(self, parent: SnapshotDir):
+        self._parent = parent
+
+    def is_initialized(self):
+        # TODO: in the future, take a `tag` as argument and store it in the marker in set_initialized.
+        # Then, in this function, compare marker file contents with the tag to invalidate the snapshot if the tag changed.
+        return self._parent._marker_file_path.exists()
+
+    def set_initialized(self):
+        self._parent._marker_file_path.write_text("")
+
+    @property
+    def path(self) -> Path:
+        return self._parent._path / "snapshot"
+
+
+class SnapshotDir:
+    _path: Path
+
+    def __init__(self, path: Path):
+        self._path = path
+        assert self._path.is_dir()
+        self._lock = FileAndThreadLock(self._lock_file_path)
+
+    @property
+    def _lock_file_path(self) -> Path:
+        return self._path / "initializing.flock"
+
+    @property
+    def _marker_file_path(self) -> Path:
+        return self._path / "initialized.marker"
+
+    def __enter__(self) -> SnapshotDirLocked:
+        self._lock.__enter__()
+        return SnapshotDirLocked(self)
+
+    def __exit__(self, exc_type, exc_value, exc_traceback):
+        self._lock.__exit__(exc_type, exc_value, exc_traceback)
+
+
+def shared_snapshot_dir(top_output_dir, ident: str) -> SnapshotDir:
+    snapshot_dir_path = get_shared_snapshot_dir_path(top_output_dir, ident)
+    snapshot_dir_path.mkdir(exist_ok=True, parents=True)
+    return SnapshotDir(snapshot_dir_path)
+
+
 @pytest.fixture(scope="function")
 def test_overlay_dir(request: FixtureRequest, top_output_dir: Path) -> Optional[Path]:
     """
@@ -3542,7 +3742,7 @@ def test_overlay_dir(request: FixtureRequest, top_output_dir: Path) -> Optional[
     The procedure cleans up after previous runs that were aborted (e.g. due to Ctrl-C, OOM kills, etc).
     """
 
-    if os.getenv("NEON_ENV_BUILDER_FROM_REPO_DIR_USE_OVERLAYFS") is None:
+    if os.getenv("NEON_ENV_BUILDER_USE_OVERLAYFS_FOR_SNAPSHOTS") is None:
         return None
 
     overlay_dir = get_test_overlay_dir(request, top_output_dir)
diff --git a/test_runner/fixtures/pageserver/many_tenants.py b/test_runner/fixtures/pageserver/many_tenants.py
new file mode 100644
index 0000000000..bbb4ccee5b
--- /dev/null
+++ b/test_runner/fixtures/pageserver/many_tenants.py
@@ -0,0 +1,85 @@
+import concurrent.futures
+import time
+from typing import Any, Callable, Dict, Tuple
+
+import fixtures.pageserver.remote_storage
+from fixtures.log_helper import log
+from fixtures.neon_fixtures import (
+    NeonEnv,
+    NeonEnvBuilder,
+)
+from fixtures.pageserver.utils import (
+    wait_until_tenant_state,
+)
+from fixtures.remote_storage import LocalFsStorage, RemoteStorageKind
+from fixtures.types import TenantId, TimelineId
+
+
+def single_timeline(
+    neon_env_builder: NeonEnvBuilder,
+    setup_template: Callable[[NeonEnv], Tuple[TenantId, TimelineId, Dict[str, Any]]],
+    ncopies: int,
+) -> NeonEnv:
+    """
+    Create `ncopies` duplicates of a template tenant that has a single timeline.
+    """
+    neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS)
+
+    env = neon_env_builder.init_start()
+
+    remote_storage = env.pageserver_remote_storage
+    assert isinstance(remote_storage, LocalFsStorage)
+
+    ps_http = env.pageserver.http_client()
+    # clean up the useless default tenant
+    ps_http.tenant_delete(env.initial_tenant)
+
+    log.info("invoking callback to create template tenant")
+    template_tenant, template_timeline, template_config = setup_template(env)
+    log.info(
+        f"template tenant is template_tenant={template_tenant} template_timeline={template_timeline}"
+    )
+
+    log.info("detach template tenant form pageserver")
+    env.pageserver.tenant_detach(template_tenant)
+    env.pageserver.allowed_errors.append(
+        # tenant detach causes this because the underlying attach-hook removes the tenant from attachment_service entirely
+        ".*Dropped remote consistent LSN updates.*",
+    )
+
+    log.info(f"duplicating template tenant {ncopies} times in S3")
+    tenants = fixtures.pageserver.remote_storage.duplicate_tenant(env, template_tenant, ncopies)
+
+    log.info("attach duplicated tenants to pageserver")
+    # In theory we could just attach all the tenants, force on-demand downloads via mgmt API, and be done.
+    # However, on-demand downloads are quite slow ATM.
+    # => do the on-demand downloads in Python.
+    assert ps_http.tenant_list() == []
+    # make the attach fail after it created enough on-disk state to retry loading
+    # the tenant next startup, but before it can start background loops that would start download
+    ps_http.configure_failpoints(("attach-before-activate", "return"))
+    env.pageserver.allowed_errors.append(
+        ".*attach failed, setting tenant state to Broken: attach-before-activate.*"
+    )
+
+    def attach_broken(tenant):
+        env.pageserver.tenant_attach(
+            tenant,
+            config=template_config.copy(),
+        )
+        time.sleep(0.1)
+        wait_until_tenant_state(ps_http, tenant, "Broken", 10)
+
+    with concurrent.futures.ThreadPoolExecutor(max_workers=22) as executor:
+        executor.map(attach_broken, tenants)
+
+    env.pageserver.stop(
+        immediate=True
+    )  # clears the failpoint as a side-effect; immediate to avoid hitting neon_local's timeout
+    tenant_timelines = list(map(lambda tenant: (tenant, template_timeline), tenants))
+    log.info("python-side on-demand download the layer files into local tenant dir")
+    fixtures.pageserver.remote_storage.copy_all_remote_layer_files_to_local_tenant_dir(
+        env, tenant_timelines
+    )
+
+    return env
diff --git a/test_runner/fixtures/pageserver/remote_storage.py b/test_runner/fixtures/pageserver/remote_storage.py
new file mode 100644
index 0000000000..e6cd9b4614
--- /dev/null
+++ b/test_runner/fixtures/pageserver/remote_storage.py
@@ -0,0 +1,116 @@
+import concurrent.futures
+import os
+import queue
+import shutil
+import threading
+from pathlib import Path
+from typing import Any, List, Tuple
+
+from fixtures.neon_fixtures import NeonEnv, Pagectl
+from fixtures.pageserver.types import (
+    InvalidFileName,
+    parse_layer_file_name,
+)
+from fixtures.remote_storage import LocalFsStorage
+from fixtures.types import TenantId, TimelineId
+
+
+def duplicate_one_tenant(env: NeonEnv, template_tenant: TenantId, new_tenant: TenantId):
+    remote_storage = env.pageserver_remote_storage
+    assert isinstance(remote_storage, LocalFsStorage)
+
+    src_timelines_dir: Path = remote_storage.tenant_path(template_tenant) / "timelines"
+    assert src_timelines_dir.is_dir(), f"{src_timelines_dir} is not a directory"
+
+    assert isinstance(remote_storage, LocalFsStorage)
+    dst_timelines_dir: Path = remote_storage.tenant_path(new_tenant) / "timelines"
+    dst_timelines_dir.parent.mkdir(parents=False, exist_ok=False)
+    dst_timelines_dir.mkdir(parents=False, exist_ok=False)
+
+    for tl in src_timelines_dir.iterdir():
+        src_tl_dir = src_timelines_dir / tl.name
+        assert src_tl_dir.is_dir(), f"{src_tl_dir} is not a directory"
+        dst_tl_dir = dst_timelines_dir / tl.name
+        dst_tl_dir.mkdir(parents=False, exist_ok=False)
+        for file in tl.iterdir():
+            shutil.copy2(file, dst_tl_dir)
+            if "__" in file.name:
+                Pagectl(env).raw_cli(
+                    [
+                        "layer",
+                        "rewrite-summary",
+                        str(dst_tl_dir / file.name),
+                        "--new-tenant-id",
+                        str(new_tenant),
+                    ]
+                )
+            else:
+                # index_part etc need no patching
+                pass
+    return None
+
+
+def duplicate_tenant(env: NeonEnv, template_tenant: TenantId, ncopies: int) -> List[TenantId]:
+    assert isinstance(env.pageserver_remote_storage, LocalFsStorage)
+
+    def work(tenant_id):
+        duplicate_one_tenant(env, template_tenant, tenant_id)
+
+    new_tenants: List[TenantId] = [TenantId.generate() for _ in range(0, ncopies)]
+    with concurrent.futures.ThreadPoolExecutor(max_workers=8) as executor:
+        executor.map(work, new_tenants)
+    return new_tenants
+
+
+def local_layer_name_from_remote_name(remote_name: str) -> str:
+    try:
+        return parse_layer_file_name(remote_name).to_str()
+    except InvalidFileName as e:
+        comps = remote_name.rsplit("-", 1)
+        if len(comps) == 1:
+            raise InvalidFileName("no generation suffix found") from e
+        else:
+            assert len(comps) == 2
+            layer_file_name, _generation = comps
+            try:
+                return parse_layer_file_name(layer_file_name).to_str()
+            except InvalidFileName:
+                raise
+
+
+def copy_all_remote_layer_files_to_local_tenant_dir(
+    env: NeonEnv, tenant_timelines: List[Tuple[TenantId, TimelineId]]
+):
+    remote_storage = env.pageserver_remote_storage
+    assert isinstance(remote_storage, LocalFsStorage)
+    work: queue.Queue[Any] = queue.Queue()
+    for tenant, timeline in tenant_timelines:
+        remote_timeline_path = remote_storage.timeline_path(tenant, timeline)
+        local_timeline_path = env.pageserver.timeline_dir(tenant, timeline)
+        local_timeline_path.mkdir(parents=True, exist_ok=True)
+        downloads = {}
+        for remote_layer in remote_timeline_path.glob("*__*"):
+            local_name = local_layer_name_from_remote_name(remote_layer.name)
+            assert local_name not in downloads, "remote storage must have had split brain"
+            downloads[local_name] = remote_layer
+        for local_name, remote_path in downloads.items():
+            work.put((remote_path, local_timeline_path / local_name))
+
+    def copy_layer_worker(queue):
+        while True:
+            item = queue.get()
+            if item is None:
+                return
+            remote_path, local_path = item
+            # not copy2, so it looks like a recent download, in case that's relevant to e.g. eviction
+            shutil.copy(remote_path, local_path, follow_symlinks=False)
+
+    workers = []
+    n_threads = os.cpu_count() or 1
+    for _ in range(0, n_threads):
+        w = threading.Thread(target=copy_layer_worker, args=[work])
+        workers.append(w)
+        w.start()
+        work.put(None)
+    for w in workers:
+        w.join()
diff --git a/test_runner/fixtures/pageserver/types.py b/test_runner/fixtures/pageserver/types.py
index b3c1174b35..72fa30a2f2 100644
--- a/test_runner/fixtures/pageserver/types.py
+++ b/test_runner/fixtures/pageserver/types.py
@@ -31,10 +31,10 @@ class DeltaLayerFileName:
     key_start: Key
     key_end: Key
 
-    def is_l0(self):
+    def is_l0(self) -> bool:
         return self.key_start == KEY_MIN and self.key_end == KEY_MAX
 
-    def to_str(self):
+    def to_str(self) -> str:
         ret = f"{self.key_start.as_int():036X}-{self.key_end.as_int():036X}__{self.lsn_start.as_int():016X}-{self.lsn_end.as_int():016X}"
         assert self == parse_layer_file_name(ret)
         return ret
@@ -107,7 +107,7 @@ def parse_layer_file_name(file_name: str) -> LayerFileName:
     except InvalidFileName:
         pass
 
-    raise ValueError()
+    raise InvalidFileName("neither image nor delta layer")
 
 
 def is_future_layer(layer_file_name: LayerFileName, disk_consistent_lsn: Lsn):
diff --git a/test_runner/fixtures/pageserver/utils.py b/test_runner/fixtures/pageserver/utils.py
index a6c4b8e930..972e0b714d 100644
--- a/test_runner/fixtures/pageserver/utils.py
+++ b/test_runner/fixtures/pageserver/utils.py
@@ -63,6 +63,14 @@ def wait_for_upload(
     )
 
 
+def _tenant_in_expected_state(tenant_info: Dict[str, Any], expected_state: str):
+    if tenant_info["state"]["slug"] == expected_state:
+        return True
+    if tenant_info["state"]["slug"] == "Broken":
+        raise RuntimeError(f"tenant became Broken, not {expected_state}")
+    return False
+
+
 def wait_until_tenant_state(
     pageserver_http: PageserverHttpClient,
     tenant_id: TenantId,
@@ -80,10 +88,8 @@ def wait_until_tenant_state(
             log.debug(f"Tenant {tenant_id} state retrieval failure: {e}")
         else:
             log.debug(f"Tenant {tenant_id} data: {tenant}")
-            if tenant["state"]["slug"] == expected_state:
+            if _tenant_in_expected_state(tenant, expected_state):
                 return tenant
-            if tenant["state"]["slug"] == "Broken":
-                raise RuntimeError(f"tenant became Broken, not {expected_state}")
 
         time.sleep(period)
 
@@ -92,6 +98,34 @@ def wait_until_tenant_state(
     )
 
 
+def wait_until_all_tenants_state(
+    pageserver_http: PageserverHttpClient,
+    expected_state: str,
+    iterations: int,
+    period: float = 1.0,
+    http_error_ok: bool = True,
+):
+    """
+    Like wait_until_tenant_state, but checks all tenants.
+    """
+    for _ in range(iterations):
+        try:
+            tenants = pageserver_http.tenant_list()
+        except Exception as e:
+            if http_error_ok:
+                log.debug(f"Failed to list tenants: {e}")
+            else:
+                raise
+        else:
+            if all(map(lambda tenant: _tenant_in_expected_state(tenant, expected_state), tenants)):
+                return
+        time.sleep(period)
+
+    raise Exception(
+        f"Not all tenants became active {expected_state} within {iterations * period} seconds"
+    )
+
+
 def wait_until_timeline_state(
     pageserver_http: PageserverHttpClient,
     tenant_id: Union[TenantId, TenantShardId],
diff --git a/test_runner/fixtures/utils.py b/test_runner/fixtures/utils.py
index cda788b2a4..91f33e1196 100644
--- a/test_runner/fixtures/utils.py
+++ b/test_runner/fixtures/utils.py
@@ -397,3 +397,36 @@ def run_pg_bench_small(pg_bin: "PgBin", connstr: str):
     }
     """
     pg_bin.run(["pgbench", "-i", "-I dtGvp", "-s1", connstr])
+
+
+def humantime_to_ms(humantime: str) -> float:
+    """
+    Converts Rust humantime's output string to milliseconds.
+
+    humantime_to_ms("1h 1ms 406us") -> 3600001.406
+    """
+
+    unit_multiplier_map = {
+        "ns": 1e-6,
+        "us": 1e-3,
+        "ms": 1,
+        "s": 1e3,
+        "m": 1e3 * 60,
+        "h": 1e3 * 60 * 60,
+    }
+    matcher = re.compile(rf"^(\d+)({'|'.join(unit_multiplier_map.keys())})$")
+    total_ms = 0.0
+
+    if humantime == "0":
+        return total_ms
+
+    for item in humantime.split():
+        if (match := matcher.search(item)) is not None:
+            n, unit = match.groups()
+            total_ms += int(n) * unit_multiplier_map[unit]
+        else:
+            raise ValueError(
+                f"can't parse '{item}' (from string '{humantime}'), known units are {', '.join(unit_multiplier_map.keys())}."
+            )
+
+    return round(total_ms, 3)
diff --git a/test_runner/performance/pageserver/README.md b/test_runner/performance/pageserver/README.md
new file mode 100644
index 0000000000..fdd09cd946
--- /dev/null
+++ b/test_runner/performance/pageserver/README.md
@@ -0,0 +1,16 @@
+How to reproduce benchmark results / run these benchmarks interactively.
+
+1. Get an EC2 instance with Instance Store. Use the same instance type as used for the benchmark run.
+2. Mount the Instance Store => `neon.git/scripts/ps_ec2_setup_instance_store`
+3. Use a pytest command line (see other READMEs further up in the pytest hierarchy).
+
+For tests that take a long time to set up / consume a lot of storage space,
+we use the test suite's repo_dir snapshotting functionality (`from_repo_dir`).
+It supports mounting snapshots using overlayfs, which improves iteration time.
+
+Here's a full command line.
+
+```
+RUST_BACKTRACE=1 NEON_ENV_BUILDER_USE_OVERLAYFS_FOR_SNAPSHOTS=1 DEFAULT_PG_VERSION=15 BUILD_TYPE=release \
+    ./scripts/pytest test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py
+````
diff --git a/test_runner/performance/pageserver/__init__.py b/test_runner/performance/pageserver/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/test_runner/performance/pageserver/interactive/__init__.py b/test_runner/performance/pageserver/interactive/__init__.py
new file mode 100644
index 0000000000..29644c240e
--- /dev/null
+++ b/test_runner/performance/pageserver/interactive/__init__.py
@@ -0,0 +1,8 @@
+"""
+Tests that aren't really tests or benchmarks.
+
+They're intended for the case where we want to standardize & automate setup,
+but then debug a performance problem interactively.
+It's kind of an abuse of the test framework, but, it's our only tool right
+now to automate a complex test bench setup.
+"""
diff --git a/test_runner/performance/pageserver/interactive/test_many_small_tenants.py b/test_runner/performance/pageserver/interactive/test_many_small_tenants.py
new file mode 100644
index 0000000000..3fb28ace46
--- /dev/null
+++ b/test_runner/performance/pageserver/interactive/test_many_small_tenants.py
@@ -0,0 +1,79 @@
+import os
+import pdb
+
+import fixtures.pageserver.many_tenants as many_tenants
+import pytest
+from fixtures.neon_fixtures import (
+    NeonEnv,
+    NeonEnvBuilder,
+    PgBin,
+    last_flush_lsn_upload,
+)
+
+from performance.pageserver.util import ensure_pageserver_ready_for_benchmarking
+
+"""
+Usage:
+DEFAULT_PG_VERSION=15 BUILD_TYPE=debug NEON_ENV_BUILDER_USE_OVERLAYFS_FOR_SNAPSHOTS=1 INTERACTIVE=true \
+    ./scripts/pytest --timeout 0 test_runner/performance/pageserver/interactive/test_many_small_tenants.py
+"""
+
+
+@pytest.mark.skipif(
+    os.environ.get("INTERACTIVE", "false") != "true",
+    reason="test is for interactive use only",
+)
+def test_many_small_tenants(
+    neon_env_builder: NeonEnvBuilder,
+    pg_bin: PgBin,
+):
+    _env = setup_env(neon_env_builder, 2)  # vary this to the desired number of tenants
+    _pg_bin = pg_bin
+
+    # drop into pdb so that we can debug pageserver interactively, use pdb here
+    # For example, to interactively examine pageserver startup behavior, call
+    #   _env.pageserver.stop(immediate=True)
+    #   _env.pageserver.start()
+    # from the pdb shell.
+    pdb.set_trace()
+
+
+def setup_env(
+    neon_env_builder: NeonEnvBuilder,
+    n_tenants: int,
+) -> NeonEnv:
+    def setup_template(env: NeonEnv):
+        # create our template tenant
+        config = {
+            "gc_period": "0s",
+            "checkpoint_timeout": "10 years",
+            "compaction_period": "20 s",
+            "compaction_threshold": 10,
+            "compaction_target_size": 134217728,
+            "checkpoint_distance": 268435456,
+            "image_creation_threshold": 3,
+        }
+        template_tenant, template_timeline = env.neon_cli.create_tenant(set_default=True)
+        env.pageserver.tenant_detach(template_tenant)
+        env.pageserver.allowed_errors.append(
+            # tenant detach causes this because the underlying attach-hook removes the tenant from attachment_service entirely
+            ".*Dropped remote consistent LSN updates.*",
+        )
+        env.pageserver.tenant_attach(template_tenant, config)
+        ep = env.endpoints.create_start("main", tenant_id=template_tenant)
+        ep.safe_psql("create table foo(b text)")
+        for _ in range(0, 8):
+            ep.safe_psql("insert into foo(b) values ('some text')")
+            last_flush_lsn_upload(env, ep, template_tenant, template_timeline)
+        ep.stop_and_destroy()
+        return (template_tenant, template_timeline, config)
+
+    def doit(neon_env_builder: NeonEnvBuilder) -> NeonEnv:
+        return many_tenants.single_timeline(neon_env_builder, setup_template, n_tenants)
+
+    env = neon_env_builder.build_and_use_snapshot(f"many-small-tenants-{n_tenants}", doit)
+
+    env.start()
+    ensure_pageserver_ready_for_benchmarking(env, n_tenants)
+
+    return env
diff --git a/test_runner/performance/pageserver/pagebench/__init__.py b/test_runner/performance/pageserver/pagebench/__init__.py
new file mode 100644
index 0000000000..9f5e45c0a0
--- /dev/null
+++ b/test_runner/performance/pageserver/pagebench/__init__.py
@@ -0,0 +1,10 @@
+"""
+Pagebench-based performance regression tests.
+
+The defining characteristic of tests in this sub-directory is that they
+are component-level tests, i.e., they exercise pageserver directly using `pagebench`
+instead of benchmarking the full stack.
+
+See https://github.com/neondatabase/neon/issues/5771
+for the context in which this was developed.
+"""
diff --git a/test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py b/test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py
new file mode 100644
index 0000000000..f1f5511453
--- /dev/null
+++ b/test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py
@@ -0,0 +1,210 @@
+import json
+from pathlib import Path
+from typing import Any, Dict, Tuple
+
+import fixtures.pageserver.many_tenants as many_tenants
+import pytest
+from fixtures.benchmark_fixture import MetricReport, NeonBenchmarker
+from fixtures.log_helper import log
+from fixtures.neon_fixtures import (
+    NeonEnv,
+    NeonEnvBuilder,
+    PgBin,
+    wait_for_last_flush_lsn,
+)
+from fixtures.utils import get_scale_for_db, humantime_to_ms
+
+from performance.pageserver.util import ensure_pageserver_ready_for_benchmarking
+
+
+# For reference, the space usage of the snapshots:
+# admin@ip-172-31-13-23:[~/neon-main]: sudo du -hs /instance_store/test_output/shared-snapshots
+# 137G    /instance_store/test_output/shared-snapshots
+# admin@ip-172-31-13-23:[~/neon-main]: sudo du -hs /instance_store/test_output/shared-snapshots/*
+# 1.8G    /instance_store/test_output/shared-snapshots/max_throughput_latest_lsn-1-13
+# 1.1G    /instance_store/test_output/shared-snapshots/max_throughput_latest_lsn-1-6
+# 8.5G    /instance_store/test_output/shared-snapshots/max_throughput_latest_lsn-10-13
+# 5.1G    /instance_store/test_output/shared-snapshots/max_throughput_latest_lsn-10-6
+# 76G     /instance_store/test_output/shared-snapshots/max_throughput_latest_lsn-100-13
+# 46G     /instance_store/test_output/shared-snapshots/max_throughput_latest_lsn-100-6
+@pytest.mark.parametrize("duration", [30])
+@pytest.mark.parametrize("pgbench_scale", [get_scale_for_db(s) for s in [100, 200]])
+@pytest.mark.parametrize("n_tenants", [1, 10, 100])
+@pytest.mark.timeout(
+    10000
+)  # TODO: this value is just "a really high number"; have this per instance type
+def test_pageserver_max_throughput_getpage_at_latest_lsn(
+    neon_env_builder: NeonEnvBuilder,
+    zenbenchmark: NeonBenchmarker,
+    pg_bin: PgBin,
+    n_tenants: int,
+    pgbench_scale: int,
+    duration: int,
+):
+    def record(metric, **kwargs):
+        zenbenchmark.record(
+            metric_name=f"pageserver_max_throughput_getpage_at_latest_lsn.{metric}", **kwargs
+        )
+
+    params: Dict[str, Tuple[Any, Dict[str, Any]]] = {}
+
+    # params from fixtures
+    params.update(
+        {
+            "n_tenants": (n_tenants, {"unit": ""}),
+            "pgbench_scale": (pgbench_scale, {"unit": ""}),
+            "duration": (duration, {"unit": "s"}),
+        }
+    )
+
+    # configure cache sizes like in prod
+    page_cache_size = 16384
+    max_file_descriptors = 500000
+    neon_env_builder.pageserver_config_override = (
+        f"page_cache_size={page_cache_size}; max_file_descriptors={max_file_descriptors}"
+    )
+    params.update(
+        {
+            "pageserver_config_override.page_cache_size": (
+                page_cache_size * 8192,
+                {"unit": "byte"},
+            ),
+            "pageserver_config_override.max_file_descriptors": (max_file_descriptors, {"unit": ""}),
+        }
+    )
+
+    for param, (value, kwargs) in params.items():
+        record(param, metric_value=value, report=MetricReport.TEST_PARAM, **kwargs)
+    env = setup_pageserver_with_pgbench_tenants(neon_env_builder, pg_bin, n_tenants, pgbench_scale)
+    run_benchmark_max_throughput_latest_lsn(env, pg_bin, record, duration)
+
+
+def run_benchmark_max_throughput_latest_lsn(
+    env: NeonEnv, pg_bin: PgBin, record, duration_secs: int
+):
+    """
+    Benchmark `env.pageserver` for max throughput @ latest LSN and record results in `zenbenchmark`.
+    """
+
+    ps_http = env.pageserver.http_client()
+    cmd = [
+        str(env.neon_binpath / "pagebench"),
+        "get-page-latest-lsn",
+        "--mgmt-api-endpoint",
+        ps_http.base_url,
+        "--page-service-connstring",
+        env.pageserver.connstr(password=None),
+        "--runtime",
+        f"{duration_secs}s",
+        # don't specify the targets explicitly, let pagebench auto-discover them
+    ]
+    log.info(f"command: {' '.join(cmd)}")
+    basepath = pg_bin.run_capture(cmd, with_command_header=False)
+    results_path = Path(basepath + ".stdout")
+    log.info(f"Benchmark results at: {results_path}")
+
+    with open(results_path, "r") as f:
+        results = json.load(f)
+    log.info(f"Results:\n{json.dumps(results, sort_keys=True, indent=2)}")
+
+    total = results["total"]
+
+    metric = "request_count"
+    record(
+        metric,
+        metric_value=total[metric],
+        unit="",
+        report=MetricReport.HIGHER_IS_BETTER,
+    )
+
+    metric = "latency_mean"
+    record(
+        metric,
+        metric_value=humantime_to_ms(total[metric]),
+        unit="ms",
+        report=MetricReport.LOWER_IS_BETTER,
+    )
+
+    metric = "latency_percentiles"
+    for k, v in total[metric].items():
+        record(
+            f"{metric}.{k}",
+            metric_value=humantime_to_ms(v),
+            unit="ms",
+            report=MetricReport.LOWER_IS_BETTER,
+        )
+
+
+def setup_pageserver_with_pgbench_tenants(
+    neon_env_builder: NeonEnvBuilder,
+    pg_bin: PgBin,
+    n_tenants: int,
+    scale: int,
+) -> NeonEnv:
+    """
+    Utility function to set up a pageserver with a given number of identical tenants.
+    Each tenant is a pgbench tenant, initialize to a certain scale, and treated afterwards
+    with a repeat application of (pgbench simple-update workload, checkpoint, compact).
+    """
+
+    def setup_template(env: NeonEnv):
+        # use a config that makes production of on-disk state timing-insensitive
+        # as we ingest data into the tenant.
+        config = {
+            "gc_period": "0s",  # disable periodic gc
+            "checkpoint_timeout": "10 years",
+            "compaction_period": "0s",  # disable periodic compaction
+            "compaction_threshold": 10,
+            "compaction_target_size": 134217728,
+            "checkpoint_distance": 268435456,
+            "image_creation_threshold": 3,
+        }
+        template_tenant, template_timeline = env.neon_cli.create_tenant(set_default=True)
+        env.pageserver.tenant_detach(template_tenant)
+        env.pageserver.allowed_errors.append(
+            # tenant detach causes this because the underlying attach-hook removes the tenant from attachment_service entirely
+            ".*Dropped remote consistent LSN updates.*",
+        )
+        env.pageserver.tenant_attach(template_tenant, config)
+        ps_http = env.pageserver.http_client()
+        with env.endpoints.create_start("main", tenant_id=template_tenant) as ep:
+            pg_bin.run_capture(["pgbench", "-i", f"-s{scale}", "-I", "dtGvp", ep.connstr()])
+            wait_for_last_flush_lsn(env, ep, template_tenant, template_timeline)
+            ps_http.timeline_checkpoint(template_tenant, template_timeline)
+            ps_http.timeline_compact(template_tenant, template_timeline)
+            for _ in range(
+                0, 17
+            ):  # some prime number to avoid potential resonances with the "_threshold" variables from the config
+                # the L0s produced by this appear to have size ~5MiB
+                num_txns = 10_000
+                pg_bin.run_capture(
+                    ["pgbench", "-N", "-c1", "--transactions", f"{num_txns}", ep.connstr()]
+                )
+                wait_for_last_flush_lsn(env, ep, template_tenant, template_timeline)
+                ps_http.timeline_checkpoint(template_tenant, template_timeline)
+                ps_http.timeline_compact(template_tenant, template_timeline)
+        # for reference, the output at scale=6 looked like so (306M total)
+        # ls -sh test_output/shared-snapshots/max_throughput_latest_lsn-2-6/snapshot/pageserver_1/tenants/35c30b88ea16a7a09f82d9c6a115551b/timelines/da902b378eebe83dc8a4e81cd3dc1c59
+        # total 306M
+        # 188M 000000000000000000000000000000000000-030000000000000000000000000000000003__000000000149F060-0000000009E75829
+        # 4.5M 000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000009E75829-000000000A21E919
+        #  33M 000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__000000000A21E919-000000000C20CB71
+        #  36M 000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__000000000C20CB71-000000000E470791
+        #  16M 000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__000000000E470791-000000000F34AEF1
+        # 8.2M 000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__000000000F34AEF1-000000000FABA8A9
+        # 6.0M 000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__000000000FABA8A9-000000000FFE0639
+        # 6.1M 000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__000000000FFE0639-000000001051D799
+        # 4.7M 000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__000000001051D799-0000000010908F19
+        # 4.6M 000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000010908F19-0000000010CD3021
+
+        return (template_tenant, template_timeline, config)
+
+    def doit(neon_env_builder: NeonEnvBuilder) -> NeonEnv:
+        return many_tenants.single_timeline(neon_env_builder, setup_template, n_tenants)
+
+    env = neon_env_builder.build_and_use_snapshot(
+        f"max_throughput_latest_lsn-{n_tenants}-{scale}", doit
+    )
+    env.start()
+    ensure_pageserver_ready_for_benchmarking(env, n_tenants)
+    return env
diff --git a/test_runner/performance/pageserver/util.py b/test_runner/performance/pageserver/util.py
new file mode 100644
index 0000000000..45eb652362
--- /dev/null
+++ b/test_runner/performance/pageserver/util.py
@@ -0,0 +1,29 @@
+"""
+Utilities used by all code in this sub-directory
+"""
+
+from fixtures.log_helper import log
+from fixtures.neon_fixtures import NeonEnv
+from fixtures.pageserver.utils import wait_until_all_tenants_state
+
+
+def ensure_pageserver_ready_for_benchmarking(env: NeonEnv, n_tenants: int):
+    """
+    Helper function.
+    """
+    ps_http = env.pageserver.http_client()
+
+    log.info("wait for all tenants to become active")
+    wait_until_all_tenants_state(
+        ps_http, "Active", iterations=n_tenants, period=1, http_error_ok=False
+    )
+
+    # ensure all layers are resident for predictiable performance
+    tenants = [info["id"] for info in ps_http.tenant_list()]
+    for tenant in tenants:
+        for timeline in ps_http.tenant_status(tenant)["timelines"]:
+            info = ps_http.layer_map_info(tenant, timeline)
+            for layer in info.historic_layers:
+                assert not layer.remote
+
+    log.info("ready")
diff --git a/test_runner/regress/test_tenant_detach.py b/test_runner/regress/test_tenant_detach.py
index d548e63cc1..8d5ef4e3c4 100644
--- a/test_runner/regress/test_tenant_detach.py
+++ b/test_runner/regress/test_tenant_detach.py
@@ -482,7 +482,7 @@ def test_detach_while_attaching(
     pageserver_http.tenant_detach(tenant_id)
 
     # And re-attach
-    pageserver_http.configure_failpoints([("attach-before-activate", "return(5000)")])
+    pageserver_http.configure_failpoints([("attach-before-activate-sleep", "return(5000)")])
 
     env.pageserver.tenant_attach(tenant_id)
 
@@ -681,7 +681,7 @@ def test_detach_while_activating(
     pageserver_http.tenant_detach(tenant_id)
 
     # And re-attach, but stop attach task_mgr task from completing
-    pageserver_http.configure_failpoints([("attach-before-activate", "return(600000)")])
+    pageserver_http.configure_failpoints([("attach-before-activate-sleep", "return(600000)")])
     env.pageserver.tenant_attach(tenant_id)
 
     # The tenant is in the Activating state.  This should not block us from
@@ -695,7 +695,7 @@ def test_detach_while_activating(
     ), "Only ignored tenant should be missing"
 
     # Subsequently attaching it again should still work
-    pageserver_http.configure_failpoints([("attach-before-activate", "off")])
+    pageserver_http.configure_failpoints([("attach-before-activate-sleep", "off")])
     env.pageserver.tenant_attach(tenant_id)
     wait_until_tenant_state(pageserver_http, tenant_id, "Active", 5)
 

From d820aa1d08c8a0e7f3fb2891640fa6ef6e357394 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Arpad=20M=C3=BCller?= <arpad-m@users.noreply.github.com>
Date: Wed, 24 Jan 2024 13:06:05 +0100
Subject: [PATCH 44/70] Disable initdb cancellation (#6451)

## Problem

The initdb cancellation added in #5921 is not sufficient to reliably
abort the entire initdb process. Initdb also spawns children. The tests
added by #6310 (#6385) and #6436 now do initdb cancellations on a more
regular basis.

In #6385, I attempted to issue `killpg` (after giving it a new process
group ID) to kill not just the initdb but all its spawned subprocesses,
but this didn't work. Initdb doesn't take *that* long in the end either,
so we just wait until it concludes.

## Summary of changes

* revert initdb cancellation support added in #5921
* still return `Err(Cancelled)` upon cancellation, but this is just to
not have to remove the cancellation infrastructure
* fixes to the `test_tenant_delete_races_timeline_creation` test to make
it reliably pass

Fixes #6385
---
 pageserver/src/tenant.rs                  | 37 +++++++++++------------
 test_runner/fixtures/pageserver/utils.py  | 18 ++++++++++-
 test_runner/regress/test_tenant_delete.py | 17 ++++++++---
 3 files changed, 46 insertions(+), 26 deletions(-)

diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index d4d4208ac2..31af54d146 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -91,7 +91,6 @@ use std::fs;
 use std::fs::File;
 use std::io;
 use std::ops::Bound::Included;
-use std::process::Stdio;
 use std::sync::atomic::AtomicU64;
 use std::sync::atomic::Ordering;
 use std::sync::Arc;
@@ -3762,27 +3761,25 @@ async fn run_initdb(
         .env_clear()
         .env("LD_LIBRARY_PATH", &initdb_lib_dir)
         .env("DYLD_LIBRARY_PATH", &initdb_lib_dir)
-        .stdout(Stdio::piped())
-        .stderr(Stdio::piped())
-        // If the `select!` below doesn't finish the `wait_with_output`,
-        // let the task get `wait()`ed for asynchronously by tokio.
-        // This means there is a slim chance we can go over the INIT_DB_SEMAPHORE.
-        // TODO: fix for this is non-trivial, see
-        // https://github.com/neondatabase/neon/pull/5921#pullrequestreview-1750858021
-        //
-        .kill_on_drop(true)
         .spawn()?;
 
-    tokio::select! {
-        initdb_output = initdb_command.wait_with_output() => {
-            let initdb_output = initdb_output?;
-            if !initdb_output.status.success() {
-                return Err(InitdbError::Failed(initdb_output.status, initdb_output.stderr));
-            }
-        }
-        _ = cancel.cancelled() => {
-            return Err(InitdbError::Cancelled);
-        }
+    // Ideally we'd select here with the cancellation token, but the problem is that
+    // we can't safely terminate initdb: it launches processes of its own, and killing
+    // initdb doesn't kill them. After we return from this function, we want the target
+    // directory to be able to be cleaned up.
+    // See https://github.com/neondatabase/neon/issues/6385
+    let initdb_output = initdb_command.wait_with_output().await?;
+    if !initdb_output.status.success() {
+        return Err(InitdbError::Failed(
+            initdb_output.status,
+            initdb_output.stderr,
+        ));
+    }
+
+    // This isn't true cancellation support, see above. Still return an error to
+    // excercise the cancellation code path.
+    if cancel.is_cancelled() {
+        return Err(InitdbError::Cancelled);
     }
 
     Ok(())
diff --git a/test_runner/fixtures/pageserver/utils.py b/test_runner/fixtures/pageserver/utils.py
index 972e0b714d..6b2651e447 100644
--- a/test_runner/fixtures/pageserver/utils.py
+++ b/test_runner/fixtures/pageserver/utils.py
@@ -371,8 +371,24 @@ def tenant_delete_wait_completed(
     pageserver_http: PageserverHttpClient,
     tenant_id: TenantId,
     iterations: int,
+    ignore_errors: bool = False,
 ):
-    pageserver_http.tenant_delete(tenant_id=tenant_id)
+    if not ignore_errors:
+        pageserver_http.tenant_delete(tenant_id=tenant_id)
+    else:
+        interval = 0.5
+
+        def delete_request_sent():
+            try:
+                pageserver_http.tenant_delete(tenant_id=tenant_id)
+            except PageserverApiException as e:
+                log.debug(e)
+                if e.status_code == 404:
+                    return
+            except Exception as e:
+                log.debug(e)
+
+        wait_until(iterations, interval=interval, func=delete_request_sent)
     wait_tenant_status_404(pageserver_http, tenant_id=tenant_id, iterations=iterations)
 
 
diff --git a/test_runner/regress/test_tenant_delete.py b/test_runner/regress/test_tenant_delete.py
index 7a5b1c0fc2..bafc5711a1 100644
--- a/test_runner/regress/test_tenant_delete.py
+++ b/test_runner/regress/test_tenant_delete.py
@@ -578,6 +578,9 @@ def test_tenant_delete_races_timeline_creation(
         ".*POST.*Cancelled request finished with an error: InternalServerError\\(.*ancelled"
     )
 
+    # This can occur sometimes.
+    CONFLICT_MESSAGE = ".*Precondition failed: Invalid state Stopping. Expected Active or Broken.*"
+
     env.pageserver.allowed_errors.extend(
         [
             # lucky race with stopping from flushing a layer we fail to schedule any uploads
@@ -586,6 +589,9 @@ def test_tenant_delete_races_timeline_creation(
             ".*POST.*/timeline.* request was dropped before completing",
             # Timeline creation runs into this error
             CANCELLED_ERROR,
+            # Timeline deletion can run into this error during deletion
+            CONFLICT_MESSAGE,
+            ".*tenant_delete_handler.*still waiting, taking longer than expected.*",
         ]
     )
 
@@ -621,7 +627,10 @@ def test_tenant_delete_races_timeline_creation(
     ps_http.configure_failpoints((DELETE_BEFORE_CLEANUP_FAILPOINT, "pause"))
 
     def tenant_delete():
-        ps_http.tenant_delete(tenant_id)
+        def tenant_delete_inner():
+            ps_http.tenant_delete(tenant_id)
+
+        wait_until(100, 0.5, tenant_delete_inner)
 
     Thread(target=tenant_delete).start()
 
@@ -638,10 +647,8 @@ def test_tenant_delete_races_timeline_creation(
     ps_http.configure_failpoints((BEFORE_INITDB_UPLOAD_FAILPOINT, "off"))
 
     iterations = poll_for_remote_storage_iterations(remote_storage_kind)
-    try:
-        tenant_delete_wait_completed(ps_http, tenant_id, iterations)
-    except PageserverApiException:
-        pass
+
+    tenant_delete_wait_completed(ps_http, tenant_id, iterations, ignore_errors=True)
 
     # Physical deletion should have happened
     assert_prefix_empty(

From a0a3ba85e7e2c086cc5c4dcf358a31faf926ee80 Mon Sep 17 00:00:00 2001
From: Joonas Koivunen <joonas@neon.tech>
Date: Wed, 24 Jan 2024 17:47:17 +0200
Subject: [PATCH 45/70] fix(page_service): walredo logging problem (#6460)

Fixes: #6459 by formatting full causes of an error to log, while keeping
the top level string for end-user.

Changes user visible error detail from:

```
-DETAIL:  page server returned error: Read error: Failed to reconstruct a page image:
+DETAIL:  page server returned error: Read error
```

However on pageserver logs:

```
-ERROR page_service_conn_main{...}: error reading relation or page version: Read error: Failed to reconstruct a page image:
+ERROR page_service_conn_main{...}: error reading relation or page version: Read error: reconstruct a page image: launch walredo process: spawn process: Permission denied (os error 13)
```
---
 pageserver/src/page_service.rs    | 13 ++++++++-----
 pageserver/src/tenant/timeline.rs |  2 +-
 2 files changed, 9 insertions(+), 6 deletions(-)

diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs
index d9dba137e4..a8a3487b4e 100644
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
@@ -322,8 +322,8 @@ enum PageStreamError {
     Shutdown,
 
     /// Something went wrong reading a page: this likely indicates a pageserver bug
-    #[error("Read error: {0}")]
-    Read(PageReconstructError),
+    #[error("Read error")]
+    Read(#[source] PageReconstructError),
 
     /// Ran out of time waiting for an LSN
     #[error("LSN timeout: {0}")]
@@ -332,11 +332,11 @@ enum PageStreamError {
     /// The entity required to serve the request (tenant or timeline) is not found,
     /// or is not found in a suitable state to serve a request.
     #[error("Not found: {0}")]
-    NotFound(std::borrow::Cow<'static, str>),
+    NotFound(Cow<'static, str>),
 
     /// Request asked for something that doesn't make sense, like an invalid LSN
     #[error("Bad request: {0}")]
-    BadRequest(std::borrow::Cow<'static, str>),
+    BadRequest(Cow<'static, str>),
 }
 
 impl From<PageReconstructError> for PageStreamError {
@@ -667,7 +667,10 @@ impl PageServerHandler {
                         // print the all details to the log with {:#}, but for the client the
                         // error message is enough.  Do not log if shutting down, as the anyhow::Error
                         // here includes cancellation which is not an error.
-                        span.in_scope(|| error!("error reading relation or page version: {:#}", e));
+                        let full = utils::error::report_compact_sources(&e);
+                        span.in_scope(|| {
+                            error!("error reading relation or page version: {full:#}")
+                        });
                         PagestreamBeMessage::Error(PagestreamErrorResponse {
                             message: e.to_string(),
                         })
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index 603ae3b83a..666dae94e2 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -4388,7 +4388,7 @@ impl Timeline {
                     .walredo_mgr
                     .request_redo(key, request_lsn, data.img, data.records, self.pg_version)
                     .await
-                    .context("Failed to reconstruct a page image:")
+                    .context("reconstruct a page image")
                 {
                     Ok(img) => img,
                     Err(e) => return Err(PageReconstructError::WalRedo(e)),

From 210700d0d953b5f8045ab1ac17c064d65865f735 Mon Sep 17 00:00:00 2001
From: Conrad Ludgate <conrad@neon.tech>
Date: Wed, 24 Jan 2024 16:38:10 +0000
Subject: [PATCH 46/70] proxy: add newtype wrappers for string based IDs
 (#6445)

## Problem

too many string based IDs. easy to mix up ID types.

## Summary of changes

Add a bunch of `SmolStr` wrappers that provide convenience methods but
are type safe
---
 Cargo.lock                            |  4 +-
 proxy/src/auth/backend.rs             | 19 ++++---
 proxy/src/auth/credentials.rs         | 59 +++++++++++----------
 proxy/src/auth/password_hack.rs       |  5 +-
 proxy/src/cache/project_info.rs       | 64 +++++++++++-----------
 proxy/src/console/messages.rs         | 11 ++--
 proxy/src/console/provider.rs         | 11 ++--
 proxy/src/console/provider/neon.rs    |  5 +-
 proxy/src/context.rs                  | 17 +++---
 proxy/src/lib.rs                      | 76 +++++++++++++++++++++++++++
 proxy/src/proxy.rs                    | 12 +++--
 proxy/src/rate_limiter/limiter.rs     | 15 +++---
 proxy/src/redis/notifications.rs      |  9 ++--
 proxy/src/serverless/conn_pool.rs     | 20 ++++---
 proxy/src/serverless/sql_over_http.rs |  7 +--
 proxy/src/usage_metrics.rs            |  7 ++-
 16 files changed, 215 insertions(+), 126 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index 02a437ccf9..f2f31192f0 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -5113,9 +5113,9 @@ checksum = "62bb4feee49fdd9f707ef802e22365a35de4b7b299de4763d44bfea899442ff9"
 
 [[package]]
 name = "smol_str"
-version = "0.2.0"
+version = "0.2.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "74212e6bbe9a4352329b2f68ba3130c15a3f26fe88ff22dbdc6cdd58fa85e99c"
+checksum = "e6845563ada680337a52d43bb0b29f396f2d911616f6573012645b9e3d048a49"
 dependencies = [
  "serde",
 ]
diff --git a/proxy/src/auth/backend.rs b/proxy/src/auth/backend.rs
index 1e03510119..b1634906c9 100644
--- a/proxy/src/auth/backend.rs
+++ b/proxy/src/auth/backend.rs
@@ -3,7 +3,6 @@ mod hacks;
 mod link;
 
 pub use link::LinkAuthError;
-use smol_str::SmolStr;
 use tokio_postgres::config::AuthKeys;
 
 use crate::auth::credentials::check_peer_addr_is_in_list;
@@ -16,7 +15,6 @@ use crate::context::RequestMonitoring;
 use crate::proxy::connect_compute::handle_try_wake;
 use crate::proxy::retry::retry_after;
 use crate::proxy::NeonOptions;
-use crate::scram;
 use crate::stream::Stream;
 use crate::{
     auth::{self, ComputeUserInfoMaybeEndpoint},
@@ -28,6 +26,7 @@ use crate::{
     },
     stream, url,
 };
+use crate::{scram, EndpointCacheKey, EndpointId, RoleName};
 use futures::TryFutureExt;
 use std::borrow::Cow;
 use std::ops::ControlFlow;
@@ -130,19 +129,19 @@ pub struct ComputeCredentials<T> {
 
 #[derive(Debug, Clone)]
 pub struct ComputeUserInfoNoEndpoint {
-    pub user: SmolStr,
+    pub user: RoleName,
     pub options: NeonOptions,
 }
 
 #[derive(Debug, Clone)]
 pub struct ComputeUserInfo {
-    pub endpoint: SmolStr,
-    pub user: SmolStr,
+    pub endpoint: EndpointId,
+    pub user: RoleName,
     pub options: NeonOptions,
 }
 
 impl ComputeUserInfo {
-    pub fn endpoint_cache_key(&self) -> SmolStr {
+    pub fn endpoint_cache_key(&self) -> EndpointCacheKey {
         self.options.get_cache_key(&self.endpoint)
     }
 }
@@ -158,7 +157,7 @@ impl TryFrom<ComputeUserInfoMaybeEndpoint> for ComputeUserInfo {
     type Error = ComputeUserInfoNoEndpoint;
 
     fn try_from(user_info: ComputeUserInfoMaybeEndpoint) -> Result<Self, Self::Error> {
-        match user_info.project {
+        match user_info.endpoint_id {
             None => Err(ComputeUserInfoNoEndpoint {
                 user: user_info.user,
                 options: user_info.options,
@@ -317,11 +316,11 @@ async fn auth_and_wake_compute(
 
 impl<'a> BackendType<'a, ComputeUserInfoMaybeEndpoint> {
     /// Get compute endpoint name from the credentials.
-    pub fn get_endpoint(&self) -> Option<SmolStr> {
+    pub fn get_endpoint(&self) -> Option<EndpointId> {
         use BackendType::*;
 
         match self {
-            Console(_, user_info) => user_info.project.clone(),
+            Console(_, user_info) => user_info.endpoint_id.clone(),
             Link(_) => Some("link".into()),
             #[cfg(test)]
             Test(_) => Some("test".into()),
@@ -355,7 +354,7 @@ impl<'a> BackendType<'a, ComputeUserInfoMaybeEndpoint> {
             Console(api, user_info) => {
                 info!(
                     user = &*user_info.user,
-                    project = user_info.project(),
+                    project = user_info.endpoint(),
                     "performing authentication using the console"
                 );
 
diff --git a/proxy/src/auth/credentials.rs b/proxy/src/auth/credentials.rs
index 342fd6fce9..bdb79f2517 100644
--- a/proxy/src/auth/credentials.rs
+++ b/proxy/src/auth/credentials.rs
@@ -2,7 +2,7 @@
 
 use crate::{
     auth::password_hack::parse_endpoint_param, context::RequestMonitoring, error::UserFacingError,
-    metrics::NUM_CONNECTION_ACCEPTED_BY_SNI, proxy::NeonOptions,
+    metrics::NUM_CONNECTION_ACCEPTED_BY_SNI, proxy::NeonOptions, EndpointId, RoleName,
 };
 use itertools::Itertools;
 use pq_proto::StartupMessageParams;
@@ -21,7 +21,10 @@ pub enum ComputeUserInfoParseError {
          SNI ('{}') and project option ('{}').",
         .domain, .option,
     )]
-    InconsistentProjectNames { domain: SmolStr, option: SmolStr },
+    InconsistentProjectNames {
+        domain: EndpointId,
+        option: EndpointId,
+    },
 
     #[error(
         "Common name inferred from SNI ('{}') is not known",
@@ -30,7 +33,7 @@ pub enum ComputeUserInfoParseError {
     UnknownCommonName { cn: String },
 
     #[error("Project name ('{0}') must contain only alphanumeric characters and hyphen.")]
-    MalformedProjectName(SmolStr),
+    MalformedProjectName(EndpointId),
 }
 
 impl UserFacingError for ComputeUserInfoParseError {}
@@ -39,17 +42,15 @@ impl UserFacingError for ComputeUserInfoParseError {}
 /// Note that we don't store any kind of client key or password here.
 #[derive(Debug, Clone, PartialEq, Eq)]
 pub struct ComputeUserInfoMaybeEndpoint {
-    pub user: SmolStr,
-    // TODO: this is a severe misnomer! We should think of a new name ASAP.
-    pub project: Option<SmolStr>,
-
+    pub user: RoleName,
+    pub endpoint_id: Option<EndpointId>,
     pub options: NeonOptions,
 }
 
 impl ComputeUserInfoMaybeEndpoint {
     #[inline]
-    pub fn project(&self) -> Option<&str> {
-        self.project.as_deref()
+    pub fn endpoint(&self) -> Option<&str> {
+        self.endpoint_id.as_deref()
     }
 }
 
@@ -79,15 +80,15 @@ impl ComputeUserInfoMaybeEndpoint {
 
         // Some parameters are stored in the startup message.
         let get_param = |key| params.get(key).ok_or(MissingKey(key));
-        let user: SmolStr = get_param("user")?.into();
+        let user: RoleName = get_param("user")?.into();
 
         // record the values if we have them
         ctx.set_application(params.get("application_name").map(SmolStr::from));
         ctx.set_user(user.clone());
-        ctx.set_endpoint_id(sni.map(SmolStr::from));
+        ctx.set_endpoint_id(sni.map(EndpointId::from));
 
         // Project name might be passed via PG's command-line options.
-        let project_option = params
+        let endpoint_option = params
             .options_raw()
             .and_then(|options| {
                 // We support both `project` (deprecated) and `endpoint` options for backward compatibility.
@@ -100,9 +101,9 @@ impl ComputeUserInfoMaybeEndpoint {
             })
             .map(|name| name.into());
 
-        let project_from_domain = if let Some(sni_str) = sni {
+        let endpoint_from_domain = if let Some(sni_str) = sni {
             if let Some(cn) = common_names {
-                Some(SmolStr::from(endpoint_sni(sni_str, cn)?))
+                Some(EndpointId::from(endpoint_sni(sni_str, cn)?))
             } else {
                 None
             }
@@ -110,7 +111,7 @@ impl ComputeUserInfoMaybeEndpoint {
             None
         };
 
-        let project = match (project_option, project_from_domain) {
+        let endpoint = match (endpoint_option, endpoint_from_domain) {
             // Invariant: if we have both project name variants, they should match.
             (Some(option), Some(domain)) if option != domain => {
                 Some(Err(InconsistentProjectNames { domain, option }))
@@ -123,13 +124,13 @@ impl ComputeUserInfoMaybeEndpoint {
         }
         .transpose()?;
 
-        info!(%user, project = project.as_deref(), "credentials");
+        info!(%user, project = endpoint.as_deref(), "credentials");
         if sni.is_some() {
             info!("Connection with sni");
             NUM_CONNECTION_ACCEPTED_BY_SNI
                 .with_label_values(&["sni"])
                 .inc();
-        } else if project.is_some() {
+        } else if endpoint.is_some() {
             NUM_CONNECTION_ACCEPTED_BY_SNI
                 .with_label_values(&["no_sni"])
                 .inc();
@@ -145,7 +146,7 @@ impl ComputeUserInfoMaybeEndpoint {
 
         Ok(Self {
             user,
-            project,
+            endpoint_id: endpoint.map(EndpointId::from),
             options,
         })
     }
@@ -238,7 +239,7 @@ mod tests {
         let mut ctx = RequestMonitoring::test();
         let user_info = ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, None, None)?;
         assert_eq!(user_info.user, "john_doe");
-        assert_eq!(user_info.project, None);
+        assert_eq!(user_info.endpoint_id, None);
 
         Ok(())
     }
@@ -253,7 +254,7 @@ mod tests {
         let mut ctx = RequestMonitoring::test();
         let user_info = ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, None, None)?;
         assert_eq!(user_info.user, "john_doe");
-        assert_eq!(user_info.project, None);
+        assert_eq!(user_info.endpoint_id, None);
 
         Ok(())
     }
@@ -269,7 +270,7 @@ mod tests {
         let user_info =
             ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, sni, common_names.as_ref())?;
         assert_eq!(user_info.user, "john_doe");
-        assert_eq!(user_info.project.as_deref(), Some("foo"));
+        assert_eq!(user_info.endpoint_id.as_deref(), Some("foo"));
         assert_eq!(user_info.options.get_cache_key("foo"), "foo");
 
         Ok(())
@@ -285,7 +286,7 @@ mod tests {
         let mut ctx = RequestMonitoring::test();
         let user_info = ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, None, None)?;
         assert_eq!(user_info.user, "john_doe");
-        assert_eq!(user_info.project.as_deref(), Some("bar"));
+        assert_eq!(user_info.endpoint_id.as_deref(), Some("bar"));
 
         Ok(())
     }
@@ -300,7 +301,7 @@ mod tests {
         let mut ctx = RequestMonitoring::test();
         let user_info = ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, None, None)?;
         assert_eq!(user_info.user, "john_doe");
-        assert_eq!(user_info.project.as_deref(), Some("bar"));
+        assert_eq!(user_info.endpoint_id.as_deref(), Some("bar"));
 
         Ok(())
     }
@@ -318,7 +319,7 @@ mod tests {
         let mut ctx = RequestMonitoring::test();
         let user_info = ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, None, None)?;
         assert_eq!(user_info.user, "john_doe");
-        assert!(user_info.project.is_none());
+        assert!(user_info.endpoint_id.is_none());
 
         Ok(())
     }
@@ -333,7 +334,7 @@ mod tests {
         let mut ctx = RequestMonitoring::test();
         let user_info = ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, None, None)?;
         assert_eq!(user_info.user, "john_doe");
-        assert!(user_info.project.is_none());
+        assert!(user_info.endpoint_id.is_none());
 
         Ok(())
     }
@@ -349,7 +350,7 @@ mod tests {
         let user_info =
             ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, sni, common_names.as_ref())?;
         assert_eq!(user_info.user, "john_doe");
-        assert_eq!(user_info.project.as_deref(), Some("baz"));
+        assert_eq!(user_info.endpoint_id.as_deref(), Some("baz"));
 
         Ok(())
     }
@@ -363,14 +364,14 @@ mod tests {
         let mut ctx = RequestMonitoring::test();
         let user_info =
             ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, sni, common_names.as_ref())?;
-        assert_eq!(user_info.project.as_deref(), Some("p1"));
+        assert_eq!(user_info.endpoint_id.as_deref(), Some("p1"));
 
         let common_names = Some(["a.com".into(), "b.com".into()].into());
         let sni = Some("p1.b.com");
         let mut ctx = RequestMonitoring::test();
         let user_info =
             ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, sni, common_names.as_ref())?;
-        assert_eq!(user_info.project.as_deref(), Some("p1"));
+        assert_eq!(user_info.endpoint_id.as_deref(), Some("p1"));
 
         Ok(())
     }
@@ -427,7 +428,7 @@ mod tests {
         let mut ctx = RequestMonitoring::test();
         let user_info =
             ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, sni, common_names.as_ref())?;
-        assert_eq!(user_info.project.as_deref(), Some("project"));
+        assert_eq!(user_info.endpoint_id.as_deref(), Some("project"));
         assert_eq!(
             user_info.options.get_cache_key("project"),
             "project endpoint_type:read_write lsn:0/2"
diff --git a/proxy/src/auth/password_hack.rs b/proxy/src/auth/password_hack.rs
index 372b0764ee..2ddf46fe25 100644
--- a/proxy/src/auth/password_hack.rs
+++ b/proxy/src/auth/password_hack.rs
@@ -4,10 +4,11 @@
 //! UPDATE (Mon Aug  8 13:20:34 UTC 2022): the payload format has been simplified.
 
 use bstr::ByteSlice;
-use smol_str::SmolStr;
+
+use crate::EndpointId;
 
 pub struct PasswordHackPayload {
-    pub endpoint: SmolStr,
+    pub endpoint: EndpointId,
     pub password: Vec<u8>,
 }
 
diff --git a/proxy/src/cache/project_info.rs b/proxy/src/cache/project_info.rs
index fa3d5d0e31..6f37868a8c 100644
--- a/proxy/src/cache/project_info.rs
+++ b/proxy/src/cache/project_info.rs
@@ -11,13 +11,16 @@ use smol_str::SmolStr;
 use tokio::time::Instant;
 use tracing::{debug, info};
 
-use crate::{auth::IpPattern, config::ProjectInfoCacheOptions, console::AuthSecret};
+use crate::{
+    auth::IpPattern, config::ProjectInfoCacheOptions, console::AuthSecret, EndpointId, ProjectId,
+    RoleName,
+};
 
 use super::{Cache, Cached};
 
 pub trait ProjectInfoCache {
-    fn invalidate_allowed_ips_for_project(&self, project_id: &SmolStr);
-    fn invalidate_role_secret_for_project(&self, project_id: &SmolStr, role_name: &SmolStr);
+    fn invalidate_allowed_ips_for_project(&self, project_id: &ProjectId);
+    fn invalidate_role_secret_for_project(&self, project_id: &ProjectId, role_name: &RoleName);
     fn enable_ttl(&self);
     fn disable_ttl(&self);
 }
@@ -44,7 +47,7 @@ impl<T> From<T> for Entry<T> {
 
 #[derive(Default)]
 struct EndpointInfo {
-    secret: std::collections::HashMap<SmolStr, Entry<Option<AuthSecret>>>,
+    secret: std::collections::HashMap<RoleName, Entry<Option<AuthSecret>>>,
     allowed_ips: Option<Entry<Arc<Vec<IpPattern>>>>,
 }
 
@@ -57,7 +60,7 @@ impl EndpointInfo {
     }
     pub fn get_role_secret(
         &self,
-        role_name: &SmolStr,
+        role_name: &RoleName,
         valid_since: Instant,
         ignore_cache_since: Option<Instant>,
     ) -> Option<(Option<AuthSecret>, bool)> {
@@ -90,7 +93,7 @@ impl EndpointInfo {
     pub fn invalidate_allowed_ips(&mut self) {
         self.allowed_ips = None;
     }
-    pub fn invalidate_role_secret(&mut self, role_name: &SmolStr) {
+    pub fn invalidate_role_secret(&mut self, role_name: &RoleName) {
         self.secret.remove(role_name);
     }
 }
@@ -103,9 +106,9 @@ impl EndpointInfo {
 /// One may ask, why the data is stored per project, when on the user request there is only data about the endpoint available?
 /// On the cplane side updates are done per project (or per branch), so it's easier to invalidate the whole project cache.
 pub struct ProjectInfoCacheImpl {
-    cache: DashMap<SmolStr, EndpointInfo>,
+    cache: DashMap<EndpointId, EndpointInfo>,
 
-    project2ep: DashMap<SmolStr, HashSet<SmolStr>>,
+    project2ep: DashMap<ProjectId, HashSet<EndpointId>>,
     config: ProjectInfoCacheOptions,
 
     start_time: Instant,
@@ -113,7 +116,7 @@ pub struct ProjectInfoCacheImpl {
 }
 
 impl ProjectInfoCache for ProjectInfoCacheImpl {
-    fn invalidate_allowed_ips_for_project(&self, project_id: &SmolStr) {
+    fn invalidate_allowed_ips_for_project(&self, project_id: &ProjectId) {
         info!("invalidating allowed ips for project `{}`", project_id);
         let endpoints = self
             .project2ep
@@ -126,7 +129,7 @@ impl ProjectInfoCache for ProjectInfoCacheImpl {
             }
         }
     }
-    fn invalidate_role_secret_for_project(&self, project_id: &SmolStr, role_name: &SmolStr) {
+    fn invalidate_role_secret_for_project(&self, project_id: &ProjectId, role_name: &RoleName) {
         info!(
             "invalidating role secret for project_id `{}` and role_name `{}`",
             project_id, role_name
@@ -167,8 +170,8 @@ impl ProjectInfoCacheImpl {
 
     pub fn get_role_secret(
         &self,
-        endpoint_id: &SmolStr,
-        role_name: &SmolStr,
+        endpoint_id: &EndpointId,
+        role_name: &RoleName,
     ) -> Option<Cached<&Self, Option<AuthSecret>>> {
         let (valid_since, ignore_cache_since) = self.get_cache_times();
         let endpoint_info = self.cache.get(endpoint_id)?;
@@ -188,7 +191,7 @@ impl ProjectInfoCacheImpl {
     }
     pub fn get_allowed_ips(
         &self,
-        endpoint_id: &SmolStr,
+        endpoint_id: &EndpointId,
     ) -> Option<Cached<&Self, Arc<Vec<IpPattern>>>> {
         let (valid_since, ignore_cache_since) = self.get_cache_times();
         let endpoint_info = self.cache.get(endpoint_id)?;
@@ -205,9 +208,9 @@ impl ProjectInfoCacheImpl {
     }
     pub fn insert_role_secret(
         &self,
-        project_id: &SmolStr,
-        endpoint_id: &SmolStr,
-        role_name: &SmolStr,
+        project_id: &ProjectId,
+        endpoint_id: &EndpointId,
+        role_name: &RoleName,
         secret: Option<AuthSecret>,
     ) {
         if self.cache.len() >= self.config.size {
@@ -222,8 +225,8 @@ impl ProjectInfoCacheImpl {
     }
     pub fn insert_allowed_ips(
         &self,
-        project_id: &SmolStr,
-        endpoint_id: &SmolStr,
+        project_id: &ProjectId,
+        endpoint_id: &EndpointId,
         allowed_ips: Arc<Vec<IpPattern>>,
     ) {
         if self.cache.len() >= self.config.size {
@@ -236,7 +239,7 @@ impl ProjectInfoCacheImpl {
             .or_default()
             .allowed_ips = Some(allowed_ips.into());
     }
-    fn inser_project2endpoint(&self, project_id: &SmolStr, endpoint_id: &SmolStr) {
+    fn inser_project2endpoint(&self, project_id: &ProjectId, endpoint_id: &EndpointId) {
         if let Some(mut endpoints) = self.project2ep.get_mut(project_id) {
             endpoints.insert(endpoint_id.clone());
         } else {
@@ -297,18 +300,18 @@ impl ProjectInfoCacheImpl {
 /// This is used to invalidate cache entries.
 pub struct CachedLookupInfo {
     /// Search by this key.
-    endpoint_id: SmolStr,
+    endpoint_id: EndpointId,
     lookup_type: LookupType,
 }
 
 impl CachedLookupInfo {
-    pub(self) fn new_role_secret(endpoint_id: SmolStr, role_name: SmolStr) -> Self {
+    pub(self) fn new_role_secret(endpoint_id: EndpointId, role_name: RoleName) -> Self {
         Self {
             endpoint_id,
             lookup_type: LookupType::RoleSecret(role_name),
         }
     }
-    pub(self) fn new_allowed_ips(endpoint_id: SmolStr) -> Self {
+    pub(self) fn new_allowed_ips(endpoint_id: EndpointId) -> Self {
         Self {
             endpoint_id,
             lookup_type: LookupType::AllowedIps,
@@ -317,7 +320,7 @@ impl CachedLookupInfo {
 }
 
 enum LookupType {
-    RoleSecret(SmolStr),
+    RoleSecret(RoleName),
     AllowedIps,
 }
 
@@ -348,7 +351,6 @@ impl Cache for ProjectInfoCacheImpl {
 mod tests {
     use super::*;
     use crate::{console::AuthSecret, scram::ServerSecret};
-    use smol_str::SmolStr;
     use std::{sync::Arc, time::Duration};
 
     #[tokio::test]
@@ -362,8 +364,8 @@ mod tests {
         });
         let project_id = "project".into();
         let endpoint_id = "endpoint".into();
-        let user1: SmolStr = "user1".into();
-        let user2: SmolStr = "user2".into();
+        let user1: RoleName = "user1".into();
+        let user2: RoleName = "user2".into();
         let secret1 = Some(AuthSecret::Scram(ServerSecret::mock(
             user1.as_str(),
             [1; 32],
@@ -385,7 +387,7 @@ mod tests {
         assert_eq!(cached.value, secret2);
 
         // Shouldn't add more than 2 roles.
-        let user3: SmolStr = "user3".into();
+        let user3: RoleName = "user3".into();
         let secret3 = Some(AuthSecret::Scram(ServerSecret::mock(
             user3.as_str(),
             [3; 32],
@@ -420,8 +422,8 @@ mod tests {
 
         let project_id = "project".into();
         let endpoint_id = "endpoint".into();
-        let user1: SmolStr = "user1".into();
-        let user2: SmolStr = "user2".into();
+        let user1: RoleName = "user1".into();
+        let user2: RoleName = "user2".into();
         let secret1 = Some(AuthSecret::Scram(ServerSecret::mock(
             user1.as_str(),
             [1; 32],
@@ -475,8 +477,8 @@ mod tests {
 
         let project_id = "project".into();
         let endpoint_id = "endpoint".into();
-        let user1: SmolStr = "user1".into();
-        let user2: SmolStr = "user2".into();
+        let user1: RoleName = "user1".into();
+        let user2: RoleName = "user2".into();
         let secret1 = Some(AuthSecret::Scram(ServerSecret::mock(
             user1.as_str(),
             [1; 32],
diff --git a/proxy/src/console/messages.rs b/proxy/src/console/messages.rs
index 1cfa2d6192..6ef9bcf4eb 100644
--- a/proxy/src/console/messages.rs
+++ b/proxy/src/console/messages.rs
@@ -1,9 +1,10 @@
 use serde::Deserialize;
-use smol_str::SmolStr;
 use std::fmt;
 
 use crate::auth::IpPattern;
 
+use crate::{BranchId, EndpointId, ProjectId};
+
 /// Generic error response with human-readable description.
 /// Note that we can't always present it to user as is.
 #[derive(Debug, Deserialize)]
@@ -17,7 +18,7 @@ pub struct ConsoleError {
 pub struct GetRoleSecret {
     pub role_secret: Box<str>,
     pub allowed_ips: Option<Vec<IpPattern>>,
-    pub project_id: Option<Box<str>>,
+    pub project_id: Option<ProjectId>,
 }
 
 // Manually implement debug to omit sensitive info.
@@ -94,9 +95,9 @@ impl fmt::Debug for DatabaseInfo {
 /// Also known as `ProxyMetricsAuxInfo` in the console.
 #[derive(Debug, Deserialize, Clone, Default)]
 pub struct MetricsAuxInfo {
-    pub endpoint_id: SmolStr,
-    pub project_id: SmolStr,
-    pub branch_id: SmolStr,
+    pub endpoint_id: EndpointId,
+    pub project_id: ProjectId,
+    pub branch_id: BranchId,
 }
 
 impl MetricsAuxInfo {
diff --git a/proxy/src/console/provider.rs b/proxy/src/console/provider.rs
index 53c394f52f..a6dfbd79db 100644
--- a/proxy/src/console/provider.rs
+++ b/proxy/src/console/provider.rs
@@ -9,11 +9,10 @@ use crate::{
     compute,
     config::{CacheOptions, ProjectInfoCacheOptions},
     context::RequestMonitoring,
-    scram,
+    scram, EndpointCacheKey, ProjectId,
 };
 use async_trait::async_trait;
 use dashmap::DashMap;
-use smol_str::SmolStr;
 use std::{sync::Arc, time::Duration};
 use tokio::sync::{OwnedSemaphorePermit, Semaphore};
 use tokio::time::Instant;
@@ -214,7 +213,7 @@ pub struct AuthInfo {
     /// List of IP addresses allowed for the autorization.
     pub allowed_ips: Vec<IpPattern>,
     /// Project ID. This is used for cache invalidation.
-    pub project_id: Option<SmolStr>,
+    pub project_id: Option<ProjectId>,
 }
 
 /// Info for establishing a connection to a compute node.
@@ -233,7 +232,7 @@ pub struct NodeInfo {
     pub allow_self_signed_compute: bool,
 }
 
-pub type NodeInfoCache = TimedLru<SmolStr, NodeInfo>;
+pub type NodeInfoCache = TimedLru<EndpointCacheKey, NodeInfo>;
 pub type CachedNodeInfo = Cached<&'static NodeInfoCache>;
 pub type CachedRoleSecret = Cached<&'static ProjectInfoCacheImpl, Option<AuthSecret>>;
 pub type CachedAllowedIps = Cached<&'static ProjectInfoCacheImpl, Arc<Vec<IpPattern>>>;
@@ -345,7 +344,7 @@ impl ApiCaches {
 /// Various caches for [`console`](super).
 pub struct ApiLocks {
     name: &'static str,
-    node_locks: DashMap<SmolStr, Arc<Semaphore>>,
+    node_locks: DashMap<EndpointCacheKey, Arc<Semaphore>>,
     permits: usize,
     timeout: Duration,
     registered: prometheus::IntCounter,
@@ -413,7 +412,7 @@ impl ApiLocks {
 
     pub async fn get_wake_compute_permit(
         &self,
-        key: &SmolStr,
+        key: &EndpointCacheKey,
     ) -> Result<WakeComputePermit, errors::WakeComputeError> {
         if self.permits == 0 {
             return Ok(WakeComputePermit { permit: None });
diff --git a/proxy/src/console/provider/neon.rs b/proxy/src/console/provider/neon.rs
index 6574e079d5..33618faed8 100644
--- a/proxy/src/console/provider/neon.rs
+++ b/proxy/src/console/provider/neon.rs
@@ -14,7 +14,6 @@ use crate::{
 };
 use async_trait::async_trait;
 use futures::TryFutureExt;
-use smol_str::SmolStr;
 use std::sync::Arc;
 use tokio::time::Instant;
 use tokio_postgres::config::SslMode;
@@ -98,7 +97,7 @@ impl Api {
             Ok(AuthInfo {
                 secret,
                 allowed_ips,
-                project_id: body.project_id.map(SmolStr::from),
+                project_id: body.project_id,
             })
         }
         .map_err(crate::error::log_error)
@@ -239,7 +238,7 @@ impl super::Api for Api {
         // for some time (highly depends on the console's scale-to-zero policy);
         // The connection info remains the same during that period of time,
         // which means that we might cache it to reduce the load and latency.
-        if let Some(cached) = self.caches.node_info.get(&*key) {
+        if let Some(cached) = self.caches.node_info.get(&key) {
             info!(key = &*key, "found cached compute node info");
             return Ok(cached);
         }
diff --git a/proxy/src/context.rs b/proxy/src/context.rs
index 8a1aa4aec9..9e2ea10031 100644
--- a/proxy/src/context.rs
+++ b/proxy/src/context.rs
@@ -7,7 +7,10 @@ use std::net::IpAddr;
 use tokio::sync::mpsc;
 use uuid::Uuid;
 
-use crate::{console::messages::MetricsAuxInfo, error::ErrorKind, metrics::LatencyTimer};
+use crate::{
+    console::messages::MetricsAuxInfo, error::ErrorKind, metrics::LatencyTimer, BranchId,
+    EndpointId, ProjectId, RoleName,
+};
 
 pub mod parquet;
 
@@ -26,10 +29,10 @@ pub struct RequestMonitoring {
     region: &'static str,
 
     // filled in as they are discovered
-    project: Option<SmolStr>,
-    branch: Option<SmolStr>,
-    endpoint_id: Option<SmolStr>,
-    user: Option<SmolStr>,
+    project: Option<ProjectId>,
+    branch: Option<BranchId>,
+    endpoint_id: Option<EndpointId>,
+    user: Option<RoleName>,
     application: Option<SmolStr>,
     error_kind: Option<ErrorKind>,
     success: bool,
@@ -86,7 +89,7 @@ impl RequestMonitoring {
         self.project = Some(x.project_id);
     }
 
-    pub fn set_endpoint_id(&mut self, endpoint_id: Option<SmolStr>) {
+    pub fn set_endpoint_id(&mut self, endpoint_id: Option<EndpointId>) {
         self.endpoint_id = endpoint_id.or_else(|| self.endpoint_id.clone());
     }
 
@@ -94,7 +97,7 @@ impl RequestMonitoring {
         self.application = app.or_else(|| self.application.clone());
     }
 
-    pub fn set_user(&mut self, user: SmolStr) {
+    pub fn set_user(&mut self, user: RoleName) {
         self.user = Some(user);
     }
 
diff --git a/proxy/src/lib.rs b/proxy/src/lib.rs
index a22b2459b8..a9e4a38302 100644
--- a/proxy/src/lib.rs
+++ b/proxy/src/lib.rs
@@ -62,3 +62,79 @@ pub async fn handle_signals(token: CancellationToken) -> anyhow::Result<Infallib
 pub fn flatten_err<T>(r: Result<anyhow::Result<T>, JoinError>) -> anyhow::Result<T> {
     r.context("join error").and_then(|x| x)
 }
+
+macro_rules! smol_str_wrapper {
+    ($name:ident) => {
+        #[derive(Clone, Debug, PartialEq, Eq, PartialOrd, Ord, Hash, Default)]
+        pub struct $name(smol_str::SmolStr);
+
+        impl $name {
+            pub fn as_str(&self) -> &str {
+                self.0.as_str()
+            }
+        }
+
+        impl std::fmt::Display for $name {
+            fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+                self.0.fmt(f)
+            }
+        }
+
+        impl<T> std::cmp::PartialEq<T> for $name
+        where
+            smol_str::SmolStr: std::cmp::PartialEq<T>,
+        {
+            fn eq(&self, other: &T) -> bool {
+                self.0.eq(other)
+            }
+        }
+
+        impl<T> From<T> for $name
+        where
+            smol_str::SmolStr: From<T>,
+        {
+            fn from(x: T) -> Self {
+                Self(x.into())
+            }
+        }
+
+        impl AsRef<str> for $name {
+            fn as_ref(&self) -> &str {
+                self.0.as_ref()
+            }
+        }
+
+        impl std::ops::Deref for $name {
+            type Target = str;
+            fn deref(&self) -> &str {
+                &*self.0
+            }
+        }
+
+        impl<'de> serde::de::Deserialize<'de> for $name {
+            fn deserialize<D: serde::de::Deserializer<'de>>(d: D) -> Result<Self, D::Error> {
+                <smol_str::SmolStr as serde::de::Deserialize<'de>>::deserialize(d).map(Self)
+            }
+        }
+
+        impl serde::Serialize for $name {
+            fn serialize<S: serde::Serializer>(&self, s: S) -> Result<S::Ok, S::Error> {
+                self.0.serialize(s)
+            }
+        }
+    };
+}
+
+// 90% of role name strings are 20 characters or less.
+smol_str_wrapper!(RoleName);
+// 50% of endpoint strings are 23 characters or less.
+smol_str_wrapper!(EndpointId);
+// 50% of branch strings are 23 characters or less.
+smol_str_wrapper!(BranchId);
+// 90% of project strings are 23 characters or less.
+smol_str_wrapper!(ProjectId);
+
+// will usually equal endpoint ID
+smol_str_wrapper!(EndpointCacheKey);
+
+smol_str_wrapper!(DbName);
diff --git a/proxy/src/proxy.rs b/proxy/src/proxy.rs
index 635d157383..087cc7f7a9 100644
--- a/proxy/src/proxy.rs
+++ b/proxy/src/proxy.rs
@@ -19,6 +19,7 @@ use crate::{
     rate_limiter::EndpointRateLimiter,
     stream::{PqStream, Stream},
     usage_metrics::{Ids, USAGE_METRICS},
+    EndpointCacheKey,
 };
 use anyhow::{bail, Context};
 use futures::TryFutureExt;
@@ -26,7 +27,7 @@ use itertools::Itertools;
 use once_cell::sync::OnceCell;
 use pq_proto::{BeMessage as Be, FeStartupPacket, StartupMessageParams};
 use regex::Regex;
-use smol_str::SmolStr;
+use smol_str::{format_smolstr, SmolStr};
 use std::sync::Arc;
 use tokio::io::{AsyncRead, AsyncWrite, AsyncWriteExt};
 use tokio_util::sync::CancellationToken;
@@ -516,20 +517,21 @@ impl NeonOptions {
         Self(options)
     }
 
-    pub fn get_cache_key(&self, prefix: &str) -> SmolStr {
+    pub fn get_cache_key(&self, prefix: &str) -> EndpointCacheKey {
         // prefix + format!(" {k}:{v}")
         // kinda jank because SmolStr is immutable
         std::iter::once(prefix)
             .chain(self.0.iter().flat_map(|(k, v)| [" ", &**k, ":", &**v]))
-            .collect()
+            .collect::<SmolStr>()
+            .into()
     }
 
     /// <https://swagger.io/docs/specification/serialization/> DeepObject format
     /// `paramName[prop1]=value1&paramName[prop2]=value2&...`
-    pub fn to_deep_object(&self) -> Vec<(String, SmolStr)> {
+    pub fn to_deep_object(&self) -> Vec<(SmolStr, SmolStr)> {
         self.0
             .iter()
-            .map(|(k, v)| (format!("options[{}]", k), v.clone()))
+            .map(|(k, v)| (format_smolstr!("options[{}]", k), v.clone()))
             .collect()
     }
 }
diff --git a/proxy/src/rate_limiter/limiter.rs b/proxy/src/rate_limiter/limiter.rs
index a190b2cf8f..cbae72711c 100644
--- a/proxy/src/rate_limiter/limiter.rs
+++ b/proxy/src/rate_limiter/limiter.rs
@@ -11,11 +11,12 @@ use anyhow::bail;
 use dashmap::DashMap;
 use itertools::Itertools;
 use rand::{rngs::StdRng, Rng, SeedableRng};
-use smol_str::SmolStr;
 use tokio::sync::{Mutex as AsyncMutex, Semaphore, SemaphorePermit};
 use tokio::time::{timeout, Duration, Instant};
 use tracing::info;
 
+use crate::EndpointId;
+
 use super::{
     limit_algorithm::{LimitAlgorithm, Sample},
     RateLimiterConfig,
@@ -33,7 +34,7 @@ use super::{
 // does not look very nice (`SSL SYSCALL error: Undefined error: 0`), so for now
 // I went with a more expensive way that yields user-friendlier error messages.
 pub struct EndpointRateLimiter<Rand = StdRng, Hasher = RandomState> {
-    map: DashMap<SmolStr, Vec<RateBucket>, Hasher>,
+    map: DashMap<EndpointId, Vec<RateBucket>, Hasher>,
     info: &'static [RateBucketInfo],
     access_count: AtomicUsize,
     rand: Mutex<Rand>,
@@ -146,7 +147,7 @@ impl<R: Rng, S: BuildHasher + Clone> EndpointRateLimiter<R, S> {
     }
 
     /// Check that number of connections to the endpoint is below `max_rps` rps.
-    pub fn check(&self, endpoint: SmolStr) -> bool {
+    pub fn check(&self, endpoint: EndpointId) -> bool {
         // do a partial GC every 2k requests. This cleans up ~ 1/64th of the map.
         // worst case memory usage is about:
         //    = 2 * 2048 * 64 * (48B + 72B)
@@ -493,11 +494,13 @@ mod tests {
     use futures::{task::noop_waker_ref, Future};
     use rand::SeedableRng;
     use rustc_hash::FxHasher;
-    use smol_str::SmolStr;
     use tokio::time;
 
     use super::{EndpointRateLimiter, Limiter, Outcome};
-    use crate::rate_limiter::{RateBucketInfo, RateLimitAlgorithm};
+    use crate::{
+        rate_limiter::{RateBucketInfo, RateLimitAlgorithm},
+        EndpointId,
+    };
 
     #[tokio::test]
     async fn it_works() {
@@ -654,7 +657,7 @@ mod tests {
         RateBucketInfo::validate(&mut rates).unwrap();
         let limiter = EndpointRateLimiter::new(Vec::leak(rates));
 
-        let endpoint = SmolStr::from("ep-my-endpoint-1234");
+        let endpoint = EndpointId::from("ep-my-endpoint-1234");
 
         time::pause();
 
diff --git a/proxy/src/redis/notifications.rs b/proxy/src/redis/notifications.rs
index d28dcbd1a7..9cd70b109b 100644
--- a/proxy/src/redis/notifications.rs
+++ b/proxy/src/redis/notifications.rs
@@ -3,9 +3,8 @@ use std::{convert::Infallible, sync::Arc};
 use futures::StreamExt;
 use redis::aio::PubSub;
 use serde::Deserialize;
-use smol_str::SmolStr;
 
-use crate::cache::project_info::ProjectInfoCache;
+use crate::{cache::project_info::ProjectInfoCache, ProjectId, RoleName};
 
 const CHANNEL_NAME: &str = "neondb-proxy-ws-updates";
 const RECONNECT_TIMEOUT: std::time::Duration = std::time::Duration::from_secs(20);
@@ -46,12 +45,12 @@ enum Notification {
 }
 #[derive(Clone, Debug, Deserialize, Eq, PartialEq)]
 struct AllowedIpsUpdate {
-    project_id: SmolStr,
+    project_id: ProjectId,
 }
 #[derive(Clone, Debug, Deserialize, Eq, PartialEq)]
 struct PasswordUpdate {
-    project_id: SmolStr,
-    role_name: SmolStr,
+    project_id: ProjectId,
+    role_name: RoleName,
 }
 fn deserialize_json_string<'de, D, T>(deserializer: D) -> Result<T, D::Error>
 where
diff --git a/proxy/src/serverless/conn_pool.rs b/proxy/src/serverless/conn_pool.rs
index c07cc2816e..5a7279ae63 100644
--- a/proxy/src/serverless/conn_pool.rs
+++ b/proxy/src/serverless/conn_pool.rs
@@ -31,6 +31,7 @@ use crate::{
     metrics::NUM_DB_CONNECTIONS_GAUGE,
     proxy::connect_compute::ConnectMechanism,
     usage_metrics::{Ids, MetricCounter, USAGE_METRICS},
+    DbName, EndpointCacheKey, RoleName,
 };
 use crate::{compute, config};
 
@@ -42,17 +43,17 @@ pub const APP_NAME: SmolStr = SmolStr::new_inline("/sql_over_http");
 #[derive(Debug, Clone)]
 pub struct ConnInfo {
     pub user_info: ComputeUserInfo,
-    pub dbname: SmolStr,
+    pub dbname: DbName,
     pub password: SmolStr,
 }
 
 impl ConnInfo {
     // hm, change to hasher to avoid cloning?
-    pub fn db_and_user(&self) -> (SmolStr, SmolStr) {
+    pub fn db_and_user(&self) -> (DbName, RoleName) {
         (self.dbname.clone(), self.user_info.user.clone())
     }
 
-    pub fn endpoint_cache_key(&self) -> SmolStr {
+    pub fn endpoint_cache_key(&self) -> EndpointCacheKey {
         self.user_info.endpoint_cache_key()
     }
 }
@@ -79,14 +80,14 @@ struct ConnPoolEntry {
 // Per-endpoint connection pool, (dbname, username) -> DbUserConnPool
 // Number of open connections is limited by the `max_conns_per_endpoint`.
 pub struct EndpointConnPool {
-    pools: HashMap<(SmolStr, SmolStr), DbUserConnPool>,
+    pools: HashMap<(DbName, RoleName), DbUserConnPool>,
     total_conns: usize,
     max_conns: usize,
     _guard: IntCounterPairGuard,
 }
 
 impl EndpointConnPool {
-    fn get_conn_entry(&mut self, db_user: (SmolStr, SmolStr)) -> Option<ConnPoolEntry> {
+    fn get_conn_entry(&mut self, db_user: (DbName, RoleName)) -> Option<ConnPoolEntry> {
         let Self {
             pools, total_conns, ..
         } = self;
@@ -95,7 +96,7 @@ impl EndpointConnPool {
             .and_then(|pool_entries| pool_entries.get_conn_entry(total_conns))
     }
 
-    fn remove_client(&mut self, db_user: (SmolStr, SmolStr), conn_id: uuid::Uuid) -> bool {
+    fn remove_client(&mut self, db_user: (DbName, RoleName), conn_id: uuid::Uuid) -> bool {
         let Self {
             pools, total_conns, ..
         } = self;
@@ -196,7 +197,7 @@ pub struct GlobalConnPool {
     //
     // That should be a fairly conteded map, so return reference to the per-endpoint
     // pool as early as possible and release the lock.
-    global_pool: DashMap<SmolStr, Arc<RwLock<EndpointConnPool>>>,
+    global_pool: DashMap<EndpointCacheKey, Arc<RwLock<EndpointConnPool>>>,
 
     /// Number of endpoint-connection pools
     ///
@@ -440,7 +441,10 @@ impl GlobalConnPool {
         Ok(Client::new(new_client, conn_info, endpoint_pool).await)
     }
 
-    fn get_or_create_endpoint_pool(&self, endpoint: &SmolStr) -> Arc<RwLock<EndpointConnPool>> {
+    fn get_or_create_endpoint_pool(
+        &self,
+        endpoint: &EndpointCacheKey,
+    ) -> Arc<RwLock<EndpointConnPool>> {
         // fast path
         if let Some(pool) = self.global_pool.get(endpoint) {
             return pool.clone();
diff --git a/proxy/src/serverless/sql_over_http.rs b/proxy/src/serverless/sql_over_http.rs
index 9b32ae7f25..f108ab34ab 100644
--- a/proxy/src/serverless/sql_over_http.rs
+++ b/proxy/src/serverless/sql_over_http.rs
@@ -13,7 +13,6 @@ use hyper::{Body, HeaderMap, Request};
 use serde_json::json;
 use serde_json::Map;
 use serde_json::Value;
-use smol_str::SmolStr;
 use tokio_postgres::error::DbError;
 use tokio_postgres::error::ErrorPosition;
 use tokio_postgres::types::Kind;
@@ -36,6 +35,8 @@ use crate::config::TlsConfig;
 use crate::context::RequestMonitoring;
 use crate::metrics::NUM_CONNECTION_REQUESTS_GAUGE;
 use crate::proxy::NeonOptions;
+use crate::EndpointId;
+use crate::RoleName;
 
 use super::conn_pool::ConnInfo;
 use super::conn_pool::GlobalConnPool;
@@ -155,7 +156,7 @@ fn get_conn_info(
         .next()
         .ok_or(anyhow::anyhow!("invalid database name"))?;
 
-    let username = SmolStr::from(connection_url.username());
+    let username = RoleName::from(connection_url.username());
     if username.is_empty() {
         return Err(anyhow::anyhow!("missing username"));
     }
@@ -189,7 +190,7 @@ fn get_conn_info(
 
     let endpoint = endpoint_sni(hostname, &tls.common_names)?;
 
-    let endpoint: SmolStr = endpoint.into();
+    let endpoint: EndpointId = endpoint.into();
     ctx.set_endpoint_id(Some(endpoint.clone()));
 
     let pairs = connection_url.query_pairs();
diff --git a/proxy/src/usage_metrics.rs b/proxy/src/usage_metrics.rs
index 789a4c680c..d75aedf89b 100644
--- a/proxy/src/usage_metrics.rs
+++ b/proxy/src/usage_metrics.rs
@@ -1,12 +1,11 @@
 //! Periodically collect proxy consumption metrics
 //! and push them to a HTTP endpoint.
-use crate::{config::MetricCollectionConfig, http};
+use crate::{config::MetricCollectionConfig, http, BranchId, EndpointId};
 use chrono::{DateTime, Utc};
 use consumption_metrics::{idempotency_key, Event, EventChunk, EventType, CHUNK_SIZE};
 use dashmap::{mapref::entry::Entry, DashMap};
 use once_cell::sync::Lazy;
 use serde::{Deserialize, Serialize};
-use smol_str::SmolStr;
 use std::{
     convert::Infallible,
     sync::{
@@ -30,8 +29,8 @@ const DEFAULT_HTTP_REPORTING_TIMEOUT: Duration = Duration::from_secs(60);
 /// because we enrich the event with project_id in the control-plane endpoint.
 #[derive(Eq, Hash, PartialEq, Serialize, Deserialize, Debug, Clone)]
 pub struct Ids {
-    pub endpoint_id: SmolStr,
-    pub branch_id: SmolStr,
+    pub endpoint_id: EndpointId,
+    pub branch_id: BranchId,
 }
 
 #[derive(Debug)]

From 8cb8c8d7b50d2b4079f04ef45575b4f96b2c3889 Mon Sep 17 00:00:00 2001
From: Arthur Petukhovsky <petuhovskiy@yandex.ru>
Date: Wed, 24 Jan 2024 19:48:56 +0300
Subject: [PATCH 47/70] Allow remove_wal.rs to run on inactive timelines
 (#6462)

Temporary enable it on staging to help with
https://github.com/neondatabase/neon/issues/6403
Can be also deployed to prod if will work well on staging.
---
 safekeeper/src/remove_wal.rs | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/safekeeper/src/remove_wal.rs b/safekeeper/src/remove_wal.rs
index 87c3ecaf60..9dce06a886 100644
--- a/safekeeper/src/remove_wal.rs
+++ b/safekeeper/src/remove_wal.rs
@@ -7,6 +7,8 @@ use tracing::*;
 
 use crate::{GlobalTimelines, SafeKeeperConf};
 
+const ALLOW_INACTIVE_TIMELINES: bool = true;
+
 pub async fn task_main(conf: SafeKeeperConf) -> anyhow::Result<()> {
     let wal_removal_interval = Duration::from_millis(5000);
     loop {
@@ -15,10 +17,13 @@ pub async fn task_main(conf: SafeKeeperConf) -> anyhow::Result<()> {
 
         let tlis = GlobalTimelines::get_all();
         for tli in &tlis {
-            if !tli.is_active().await {
+            let is_active = tli.is_active().await;
+            if is_active {
+                active_timelines += 1;
+            }
+            if !ALLOW_INACTIVE_TIMELINES && !is_active {
                 continue;
             }
-            active_timelines += 1;
             let ttid = tli.ttid;
             async {
                 if let Err(e) = tli.maybe_persist_control_file().await {

From b92be77e1962c7de8a9bf06b4b899700bb4ba4d0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Arpad=20M=C3=BCller?= <arpad-m@users.noreply.github.com>
Date: Wed, 24 Jan 2024 21:27:54 +0100
Subject: [PATCH 48/70] Make RemoteStorage not use async_trait (#6464)

Makes the `RemoteStorage` trait not be based on `async_trait` any more.

To avoid recursion in async (not supported by Rust), we made
`GenericRemoteStorage` generic on the "Unreliable" variant. That allows
us to have the unreliable wrapper never contain/call itself.

related earlier work: #6305
---
 libs/remote_storage/src/azure_blob.rs        |  1 -
 libs/remote_storage/src/lib.rs               |  9 +++++----
 libs/remote_storage/src/local_fs.rs          |  1 -
 libs/remote_storage/src/s3_bucket.rs         |  1 -
 libs/remote_storage/src/simulate_failures.rs | 20 ++++++++++++++++----
 5 files changed, 21 insertions(+), 11 deletions(-)

diff --git a/libs/remote_storage/src/azure_blob.rs b/libs/remote_storage/src/azure_blob.rs
index 7895a21f66..e3edfd08a5 100644
--- a/libs/remote_storage/src/azure_blob.rs
+++ b/libs/remote_storage/src/azure_blob.rs
@@ -183,7 +183,6 @@ fn to_download_error(error: azure_core::Error) -> DownloadError {
     }
 }
 
-#[async_trait::async_trait]
 impl RemoteStorage for AzureBlobStorage {
     async fn list(
         &self,
diff --git a/libs/remote_storage/src/lib.rs b/libs/remote_storage/src/lib.rs
index 942d0016b0..f247fcdc38 100644
--- a/libs/remote_storage/src/lib.rs
+++ b/libs/remote_storage/src/lib.rs
@@ -142,7 +142,7 @@ pub struct Listing {
 /// Storage (potentially remote) API to manage its state.
 /// This storage tries to be unaware of any layered repository context,
 /// providing basic CRUD operations for storage files.
-#[async_trait::async_trait]
+#[allow(async_fn_in_trait)]
 pub trait RemoteStorage: Send + Sync + 'static {
     /// Lists all top level subdirectories for a given prefix
     /// Note: here we assume that if the prefix is passed it was obtained via remote_object_id
@@ -262,14 +262,15 @@ impl std::error::Error for DownloadError {}
 /// Every storage, currently supported.
 /// Serves as a simple way to pass around the [`RemoteStorage`] without dealing with generics.
 #[derive(Clone)]
-pub enum GenericRemoteStorage {
+// Require Clone for `Other` due to https://github.com/rust-lang/rust/issues/26925
+pub enum GenericRemoteStorage<Other: Clone = Arc<UnreliableWrapper>> {
     LocalFs(LocalFs),
     AwsS3(Arc<S3Bucket>),
     AzureBlob(Arc<AzureBlobStorage>),
-    Unreliable(Arc<UnreliableWrapper>),
+    Unreliable(Other),
 }
 
-impl GenericRemoteStorage {
+impl<Other: RemoteStorage> GenericRemoteStorage<Arc<Other>> {
     pub async fn list(
         &self,
         prefix: Option<&RemotePath>,
diff --git a/libs/remote_storage/src/local_fs.rs b/libs/remote_storage/src/local_fs.rs
index bf8b6b5dde..5f5bc1a9fa 100644
--- a/libs/remote_storage/src/local_fs.rs
+++ b/libs/remote_storage/src/local_fs.rs
@@ -157,7 +157,6 @@ impl LocalFs {
     }
 }
 
-#[async_trait::async_trait]
 impl RemoteStorage for LocalFs {
     async fn list(
         &self,
diff --git a/libs/remote_storage/src/s3_bucket.rs b/libs/remote_storage/src/s3_bucket.rs
index d7b41edaaf..e72cf28228 100644
--- a/libs/remote_storage/src/s3_bucket.rs
+++ b/libs/remote_storage/src/s3_bucket.rs
@@ -373,7 +373,6 @@ impl<S: Stream<Item = std::io::Result<Bytes>>> Stream for TimedDownload<S> {
     }
 }
 
-#[async_trait::async_trait]
 impl RemoteStorage for S3Bucket {
     async fn list(
         &self,
diff --git a/libs/remote_storage/src/simulate_failures.rs b/libs/remote_storage/src/simulate_failures.rs
index 7f5adcea30..90d90163a7 100644
--- a/libs/remote_storage/src/simulate_failures.rs
+++ b/libs/remote_storage/src/simulate_failures.rs
@@ -3,16 +3,17 @@
 //! testing purposes.
 use bytes::Bytes;
 use futures::stream::Stream;
-use std::collections::hash_map::Entry;
 use std::collections::HashMap;
 use std::sync::Mutex;
+use std::{collections::hash_map::Entry, sync::Arc};
 
 use crate::{
-    Download, DownloadError, Listing, ListingMode, RemotePath, RemoteStorage, StorageMetadata,
+    Download, DownloadError, GenericRemoteStorage, Listing, ListingMode, RemotePath, RemoteStorage,
+    StorageMetadata,
 };
 
 pub struct UnreliableWrapper {
-    inner: crate::GenericRemoteStorage,
+    inner: GenericRemoteStorage<Arc<VoidStorage>>,
 
     // This many attempts of each operation will fail, then we let it succeed.
     attempts_to_fail: u64,
@@ -34,6 +35,15 @@ enum RemoteOp {
 impl UnreliableWrapper {
     pub fn new(inner: crate::GenericRemoteStorage, attempts_to_fail: u64) -> Self {
         assert!(attempts_to_fail > 0);
+        let inner = match inner {
+            GenericRemoteStorage::AwsS3(s) => GenericRemoteStorage::AwsS3(s),
+            GenericRemoteStorage::AzureBlob(s) => GenericRemoteStorage::AzureBlob(s),
+            GenericRemoteStorage::LocalFs(s) => GenericRemoteStorage::LocalFs(s),
+            // We could also make this a no-op, as in, extract the inner of the passed generic remote storage
+            GenericRemoteStorage::Unreliable(_s) => {
+                panic!("Can't wrap unreliable wrapper unreliably")
+            }
+        };
         UnreliableWrapper {
             inner,
             attempts_to_fail,
@@ -84,7 +94,9 @@ impl UnreliableWrapper {
     }
 }
 
-#[async_trait::async_trait]
+// We never construct this, so the type is not important, just has to not be UnreliableWrapper and impl RemoteStorage.
+type VoidStorage = crate::LocalFs;
+
 impl RemoteStorage for UnreliableWrapper {
     async fn list_prefixes(
         &self,

From c9b1657e4ca88f494c0272c8ee259758dadc2953 Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Thu, 25 Jan 2024 12:35:52 +0000
Subject: [PATCH 49/70] pageserver: fixes for creation operations overlapping
 with shutdown/startup (#6436)

## Problem

For #6423, creating a reproducer turned out to be very easy, as an
extension to test_ondemand_activation.

However, before I had diagnosed the issue, I was starting with a more
brute force approach of running creation API calls in the background
while restarting a pageserver, and that shows up a bunch of other
interesting issues.

In this PR:
- Add the reproducer for #6423 by extending `test_ondemand_activation`
(confirmed that this test fails if I revert the fix from
https://github.com/neondatabase/neon/pull/6430)
- In timeline creation, return 503 responses when we get an error and
the tenant's cancellation token is set: this covers the cases where we
get an anyhow::Error from something during timeline creation as a result
of shutdown.
- While waiting for tenants to become active during creation, don't
.map_err() the result to a 500: instead let the `From` impl map the
result to something appropriate (this includes mapping shutdown to 503)
- During tenant creation, we were calling `Tenant::load_local` because
no Preload object is provided. This is usually harmless because the
tenant dir is empty, but if there are some half-created timelines in
there, bad things can happen. Propagate the SpawnMode into
Tenant::attach, so that it can properly skip _any_ attempt to load
timelines if creating.
- When we call upsert_location, there's a SpawnMode that tells us
whether to load from remote storage or not. But if the operation is a
retry and we already have the tenant, it is not correct to skip loading
from remote storage: there might be a timeline there. This isn't
strictly a correctness issue as long as the caller behaves correctly
(does not assume that any timelines are persistent until the creation is
acked), but it's a more defensive position.
- If we shut down while the task in Tenant::attach is running, it can
end up spawning rogue tasks. Fix this by holding a GateGuard through
here, and in upsert_location shutting down a tenant after calling
tenant_spawn if we can't insert it into tenants_map. This fixes the
expected behavior that after shutdown_all_tenants returns, no tenant
tasks are running.
- Add `test_create_churn_during_restart`, which runs tenant & timeline
creations across pageserver restarts.
- Update a couple of tests that covered cancellation, to reflect the
cleaner errors we now return.
---
 libs/utils/src/http/error.rs                  |   4 +-
 pageserver/src/http/routes.rs                 |  19 +-
 pageserver/src/tenant.rs                      |  35 +++-
 pageserver/src/tenant/delete.rs               |   5 +-
 pageserver/src/tenant/mgr.rs                  | 178 ++++++++++++++----
 test_runner/fixtures/pageserver/http.py       |  11 +-
 .../regress/test_pageserver_secondary.py      |  15 +-
 test_runner/regress/test_tenant_delete.py     |  14 +-
 test_runner/regress/test_tenants.py           |  81 +++++++-
 test_runner/regress/test_timeline_size.py     |  22 ++-
 10 files changed, 307 insertions(+), 77 deletions(-)

diff --git a/libs/utils/src/http/error.rs b/libs/utils/src/http/error.rs
index 3e9281ac81..d55823b0b7 100644
--- a/libs/utils/src/http/error.rs
+++ b/libs/utils/src/http/error.rs
@@ -131,7 +131,9 @@ pub fn api_error_handler(api_error: ApiError) -> Response<Body> {
         ApiError::ResourceUnavailable(_) => info!("Error processing HTTP request: {api_error:#}"),
         ApiError::NotFound(_) => info!("Error processing HTTP request: {api_error:#}"),
         ApiError::InternalServerError(_) => error!("Error processing HTTP request: {api_error:?}"),
-        _ => error!("Error processing HTTP request: {api_error:#}"),
+        ApiError::ShuttingDown => info!("Shut down while processing HTTP request"),
+        ApiError::Timeout(_) => info!("Timeout while processing HTTP request: {api_error:#}"),
+        _ => info!("Error processing HTTP request: {api_error:#}"),
     }
 
     api_error.into_response()
diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs
index 5e09a5aa1a..aa56806246 100644
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -187,6 +187,7 @@ impl From<TenantSlotUpsertError> for ApiError {
         match e {
             InternalError(e) => ApiError::InternalServerError(anyhow::anyhow!("{e}")),
             MapState(e) => e.into(),
+            ShuttingDown(_) => ApiError::ShuttingDown,
         }
     }
 }
@@ -495,6 +496,10 @@ async fn timeline_create_handler(
                     .map_err(ApiError::InternalServerError)?;
                 json_response(StatusCode::CREATED, timeline_info)
             }
+            Err(_) if tenant.cancel.is_cancelled() => {
+                // In case we get some ugly error type during shutdown, cast it into a clean 503.
+                json_response(StatusCode::SERVICE_UNAVAILABLE, HttpErrorBody::from_msg("Tenant shutting down".to_string()))
+            }
             Err(tenant::CreateTimelineError::Conflict | tenant::CreateTimelineError::AlreadyCreating) => {
                 json_response(StatusCode::CONFLICT, ())
             }
@@ -1257,19 +1262,9 @@ async fn tenant_create_handler(
     };
     // We created the tenant. Existing API semantics are that the tenant
     // is Active when this function returns.
-    if let res @ Err(_) = new_tenant
+    new_tenant
         .wait_to_become_active(ACTIVE_TENANT_TIMEOUT)
-        .await
-    {
-        // This shouldn't happen because we just created the tenant directory
-        // in upsert_location, and there aren't any remote timelines
-        // to load, so, nothing can really fail during load.
-        // Don't do cleanup because we don't know how we got here.
-        // The tenant will likely be in `Broken` state and subsequent
-        // calls will fail.
-        res.context("created tenant failed to become active")
-            .map_err(ApiError::InternalServerError)?;
-    }
+        .await?;
 
     json_response(
         StatusCode::CREATED,
diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index 31af54d146..7bb5881aab 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -627,9 +627,15 @@ impl Tenant {
             deletion_queue_client,
         ));
 
+        // The attach task will carry a GateGuard, so that shutdown() reliably waits for it to drop out if
+        // we shut down while attaching.
+        let Ok(attach_gate_guard) = tenant.gate.enter() else {
+            // We just created the Tenant: nothing else can have shut it down yet
+            unreachable!();
+        };
+
         // Do all the hard work in the background
         let tenant_clone = Arc::clone(&tenant);
-
         let ctx = ctx.detached_child(TaskKind::Attach, DownloadBehavior::Warn);
         task_mgr::spawn(
             &tokio::runtime::Handle::current(),
@@ -639,6 +645,8 @@ impl Tenant {
             "attach tenant",
             false,
             async move {
+                let _gate_guard = attach_gate_guard;
+
                 // Is this tenant being spawned as part of process startup?
                 let starting_up = init_order.is_some();
                 scopeguard::defer! {
@@ -813,7 +821,7 @@ impl Tenant {
                     SpawnMode::Create => None,
                     SpawnMode::Normal => {Some(TENANT.attach.start_timer())}
                 };
-                match tenant_clone.attach(preload, &ctx).await {
+                match tenant_clone.attach(preload, mode, &ctx).await {
                     Ok(()) => {
                         info!("attach finished, activating");
                         if let Some(t)=  attach_timer {t.observe_duration();}
@@ -900,15 +908,20 @@ impl Tenant {
     async fn attach(
         self: &Arc<Tenant>,
         preload: Option<TenantPreload>,
+        mode: SpawnMode,
         ctx: &RequestContext,
     ) -> anyhow::Result<()> {
         span::debug_assert_current_span_has_tenant_id();
 
         failpoint_support::sleep_millis_async!("before-attaching-tenant");
 
-        let preload = match preload {
-            Some(p) => p,
-            None => {
+        let preload = match (preload, mode) {
+            (Some(p), _) => p,
+            (None, SpawnMode::Create) => TenantPreload {
+                deleting: false,
+                timelines: HashMap::new(),
+            },
+            (None, SpawnMode::Normal) => {
                 // Deprecated dev mode: load from local disk state instead of remote storage
                 // https://github.com/neondatabase/neon/issues/5624
                 return self.load_local(ctx).await;
@@ -1683,9 +1696,13 @@ impl Tenant {
         ctx: &RequestContext,
     ) -> Result<Arc<Timeline>, CreateTimelineError> {
         if !self.is_active() {
-            return Err(CreateTimelineError::Other(anyhow::anyhow!(
-                "Cannot create timelines on inactive tenant"
-            )));
+            if matches!(self.current_state(), TenantState::Stopping { .. }) {
+                return Err(CreateTimelineError::ShuttingDown);
+            } else {
+                return Err(CreateTimelineError::Other(anyhow::anyhow!(
+                    "Cannot create timelines on inactive tenant"
+                )));
+            }
         }
 
         let _gate = self
@@ -4035,7 +4052,7 @@ pub(crate) mod harness {
                         .instrument(info_span!("try_load_preload", tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug()))
                         .await?;
                     tenant
-                        .attach(Some(preload), ctx)
+                        .attach(Some(preload), SpawnMode::Normal, ctx)
                         .instrument(info_span!("try_load", tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug()))
                         .await?;
                 }
diff --git a/pageserver/src/tenant/delete.rs b/pageserver/src/tenant/delete.rs
index ecffd4e6c1..97de0cdcf9 100644
--- a/pageserver/src/tenant/delete.rs
+++ b/pageserver/src/tenant/delete.rs
@@ -409,7 +409,10 @@ impl DeleteTenantFlow {
             .await
             .expect("cant be stopping or broken");
 
-        tenant.attach(preload, ctx).await.context("attach")?;
+        tenant
+            .attach(preload, super::SpawnMode::Normal, ctx)
+            .await
+            .context("attach")?;
 
         Self::background(
             guard,
diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs
index 84c7a20247..32535e0134 100644
--- a/pageserver/src/tenant/mgr.rs
+++ b/pageserver/src/tenant/mgr.rs
@@ -7,6 +7,7 @@ use pageserver_api::models::ShardParameters;
 use pageserver_api::shard::{ShardCount, ShardIdentity, ShardNumber, TenantShardId};
 use rand::{distributions::Alphanumeric, Rng};
 use std::borrow::Cow;
+use std::cmp::Ordering;
 use std::collections::{BTreeMap, HashMap};
 use std::ops::Deref;
 use std::sync::Arc;
@@ -32,7 +33,8 @@ use crate::deletion_queue::DeletionQueueClient;
 use crate::metrics::{TENANT, TENANT_MANAGER as METRICS};
 use crate::task_mgr::{self, TaskKind};
 use crate::tenant::config::{
-    AttachedLocationConfig, AttachmentMode, LocationConf, LocationMode, TenantConfOpt,
+    AttachedLocationConfig, AttachmentMode, LocationConf, LocationMode, SecondaryLocationConfig,
+    TenantConfOpt,
 };
 use crate::tenant::delete::DeleteTenantFlow;
 use crate::tenant::span::debug_assert_current_span_has_tenant_id;
@@ -466,6 +468,26 @@ pub async fn init_tenant_mgr(
             // We have a generation map: treat it as the authority for whether
             // this tenant is really attached.
             if let Some(gen) = generations.get(&tenant_shard_id) {
+                if let LocationMode::Attached(attached) = &location_conf.mode {
+                    if attached.generation > *gen {
+                        tracing::error!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(),
+                            "Control plane gave decreasing generation ({gen:?}) in re-attach response for tenant that was attached in generation {:?}, demoting to secondary",
+                            attached.generation
+                        );
+
+                        // We cannot safely attach this tenant given a bogus generation number, but let's avoid throwing away
+                        // local disk content: demote to secondary rather than detaching.
+                        tenants.insert(
+                            tenant_shard_id,
+                            TenantSlot::Secondary(SecondaryTenant::new(
+                                tenant_shard_id,
+                                location_conf.shard,
+                                location_conf.tenant_conf,
+                                &SecondaryLocationConfig { warm: false },
+                            )),
+                        );
+                    }
+                }
                 *gen
             } else {
                 match &location_conf.mode {
@@ -721,7 +743,7 @@ async fn shutdown_all_tenants0(tenants: &std::sync::RwLock<TenantsMap>) {
         tokio::select! {
             Some(joined) = join_set.join_next() => {
                 match joined {
-                    Ok(()) => {}
+                    Ok(()) => {},
                     Err(join_error) if join_error.is_cancelled() => {
                         unreachable!("we are not cancelling any of the tasks");
                     }
@@ -882,7 +904,7 @@ impl TenantManager {
         tenant_shard_id: TenantShardId,
         new_location_config: LocationConf,
         flush: Option<Duration>,
-        spawn_mode: SpawnMode,
+        mut spawn_mode: SpawnMode,
         ctx: &RequestContext,
     ) -> Result<Option<Arc<Tenant>>, UpsertLocationError> {
         debug_assert_current_span_has_tenant_id();
@@ -902,19 +924,29 @@ impl TenantManager {
                 tenant_map_peek_slot(&locked, &tenant_shard_id, TenantSlotPeekMode::Write)?;
             match (&new_location_config.mode, peek_slot) {
                 (LocationMode::Attached(attach_conf), Some(TenantSlot::Attached(tenant))) => {
-                    if attach_conf.generation == tenant.generation {
-                        // A transition from Attached to Attached in the same generation, we may
-                        // take our fast path and just provide the updated configuration
-                        // to the tenant.
-                        tenant.set_new_location_config(
-                            AttachedTenantConf::try_from(new_location_config.clone())
-                                .map_err(UpsertLocationError::BadRequest)?,
-                        );
+                    match attach_conf.generation.cmp(&tenant.generation) {
+                        Ordering::Equal => {
+                            // A transition from Attached to Attached in the same generation, we may
+                            // take our fast path and just provide the updated configuration
+                            // to the tenant.
+                            tenant.set_new_location_config(
+                                AttachedTenantConf::try_from(new_location_config.clone())
+                                    .map_err(UpsertLocationError::BadRequest)?,
+                            );
 
-                        Some(FastPathModified::Attached(tenant.clone()))
-                    } else {
-                        // Different generations, fall through to general case
-                        None
+                            Some(FastPathModified::Attached(tenant.clone()))
+                        }
+                        Ordering::Less => {
+                            return Err(UpsertLocationError::BadRequest(anyhow::anyhow!(
+                                "Generation {:?} is less than existing {:?}",
+                                attach_conf.generation,
+                                tenant.generation
+                            )));
+                        }
+                        Ordering::Greater => {
+                            // Generation advanced, fall through to general case of replacing `Tenant` object
+                            None
+                        }
                     }
                 }
                 (
@@ -1019,6 +1051,12 @@ impl TenantManager {
                     }
                 }
                 slot_guard.drop_old_value().expect("We just shut it down");
+
+                // Edge case: if we were called with SpawnMode::Create, but a Tenant already existed, then
+                // the caller thinks they're creating but the tenant already existed.  We must switch to
+                // Normal mode so that when starting this Tenant we properly probe remote storage for timelines,
+                // rather than assuming it to be empty.
+                spawn_mode = SpawnMode::Normal;
             }
             Some(TenantSlot::Secondary(state)) => {
                 info!("Shutting down secondary tenant");
@@ -1102,14 +1140,46 @@ impl TenantManager {
             None
         };
 
-        slot_guard.upsert(new_slot).map_err(|e| match e {
-            TenantSlotUpsertError::InternalError(e) => {
-                UpsertLocationError::Other(anyhow::anyhow!(e))
+        match slot_guard.upsert(new_slot) {
+            Err(TenantSlotUpsertError::InternalError(e)) => {
+                Err(UpsertLocationError::Other(anyhow::anyhow!(e)))
             }
-            TenantSlotUpsertError::MapState(e) => UpsertLocationError::Unavailable(e),
-        })?;
+            Err(TenantSlotUpsertError::MapState(e)) => Err(UpsertLocationError::Unavailable(e)),
+            Err(TenantSlotUpsertError::ShuttingDown((new_slot, _completion))) => {
+                // If we just called tenant_spawn() on a new tenant, and can't insert it into our map, then
+                // we must not leak it: this would violate the invariant that after shutdown_all_tenants, all tenants
+                // are shutdown.
+                //
+                // We must shut it down inline here.
+                match new_slot {
+                    TenantSlot::InProgress(_) => {
+                        // Unreachable because we never insert an InProgress
+                        unreachable!()
+                    }
+                    TenantSlot::Attached(tenant) => {
+                        let (_guard, progress) = utils::completion::channel();
+                        info!("Shutting down just-spawned tenant, because tenant manager is shut down");
+                        match tenant.shutdown(progress, false).await {
+                            Ok(()) => {
+                                info!("Finished shutting down just-spawned tenant");
+                            }
+                            Err(barrier) => {
+                                info!("Shutdown already in progress, waiting for it to complete");
+                                barrier.wait().await;
+                            }
+                        }
+                    }
+                    TenantSlot::Secondary(secondary_tenant) => {
+                        secondary_tenant.shutdown().await;
+                    }
+                }
 
-        Ok(attached_tenant)
+                Err(UpsertLocationError::Unavailable(
+                    TenantMapError::ShuttingDown,
+                ))
+            }
+            Ok(()) => Ok(attached_tenant),
+        }
     }
 
     /// Resetting a tenant is equivalent to detaching it, then attaching it again with the same
@@ -1728,14 +1798,31 @@ pub(crate) enum TenantSlotError {
 
 /// Superset of TenantMapError: issues that can occur when using a SlotGuard
 /// to insert a new value.
-#[derive(Debug, thiserror::Error)]
-pub enum TenantSlotUpsertError {
+#[derive(thiserror::Error)]
+pub(crate) enum TenantSlotUpsertError {
     /// An error where the slot is in an unexpected state, indicating a code bug
     #[error("Internal error updating Tenant")]
     InternalError(Cow<'static, str>),
 
     #[error(transparent)]
-    MapState(#[from] TenantMapError),
+    MapState(TenantMapError),
+
+    // If we encounter TenantManager shutdown during upsert, we must carry the Completion
+    // from the SlotGuard, so that the caller can hold it while they clean up: otherwise
+    // TenantManager shutdown might race ahead before we're done cleaning up any Tenant that
+    // was protected by the SlotGuard.
+    #[error("Shutting down")]
+    ShuttingDown((TenantSlot, utils::completion::Completion)),
+}
+
+impl std::fmt::Debug for TenantSlotUpsertError {
+    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
+        match self {
+            Self::InternalError(reason) => write!(f, "Internal Error {reason}"),
+            Self::MapState(map_error) => write!(f, "Tenant map state: {map_error:?}"),
+            Self::ShuttingDown(_completion) => write!(f, "Tenant map shutting down"),
+        }
+    }
 }
 
 #[derive(Debug, thiserror::Error)]
@@ -1784,7 +1871,7 @@ pub struct SlotGuard {
 
     /// [`TenantSlot::InProgress`] carries the corresponding Barrier: it will
     /// release any waiters as soon as this SlotGuard is dropped.
-    _completion: utils::completion::Completion,
+    completion: utils::completion::Completion,
 }
 
 impl SlotGuard {
@@ -1797,7 +1884,7 @@ impl SlotGuard {
             tenant_shard_id,
             old_value,
             upserted: false,
-            _completion: completion,
+            completion,
         }
     }
 
@@ -1830,9 +1917,16 @@ impl SlotGuard {
             }
 
             let m = match &mut *locked {
-                TenantsMap::Initializing => return Err(TenantMapError::StillInitializing.into()),
+                TenantsMap::Initializing => {
+                    return Err(TenantSlotUpsertError::MapState(
+                        TenantMapError::StillInitializing,
+                    ))
+                }
                 TenantsMap::ShuttingDown(_) => {
-                    return Err(TenantMapError::ShuttingDown.into());
+                    return Err(TenantSlotUpsertError::ShuttingDown((
+                        new_value,
+                        self.completion.clone(),
+                    )));
                 }
                 TenantsMap::Open(m) => m,
             };
@@ -1880,7 +1974,9 @@ impl SlotGuard {
                 Err(TenantSlotUpsertError::InternalError(_)) => {
                     // We already logged the error, nothing else we can do.
                 }
-                Err(TenantSlotUpsertError::MapState(_)) => {
+                Err(
+                    TenantSlotUpsertError::MapState(_) | TenantSlotUpsertError::ShuttingDown(_),
+                ) => {
                     // If the map is shutting down, we need not replace anything
                 }
                 Ok(()) => {}
@@ -1978,18 +2074,22 @@ fn tenant_map_peek_slot<'a>(
     tenant_shard_id: &TenantShardId,
     mode: TenantSlotPeekMode,
 ) -> Result<Option<&'a TenantSlot>, TenantMapError> {
-    let m = match tenants.deref() {
-        TenantsMap::Initializing => return Err(TenantMapError::StillInitializing),
+    match tenants.deref() {
+        TenantsMap::Initializing => Err(TenantMapError::StillInitializing),
         TenantsMap::ShuttingDown(m) => match mode {
-            TenantSlotPeekMode::Read => m,
-            TenantSlotPeekMode::Write => {
-                return Err(TenantMapError::ShuttingDown);
-            }
+            TenantSlotPeekMode::Read => Ok(Some(
+                // When reading in ShuttingDown state, we must translate None results
+                // into a ShuttingDown error, because absence of a tenant shard ID in the map
+                // isn't a reliable indicator of the tenant being gone: it might have been
+                // InProgress when shutdown started, and cleaned up from that state such
+                // that it's now no longer in the map.  Callers will have to wait until
+                // we next start up to get a proper answer.  This avoids incorrect 404 API responses.
+                m.get(tenant_shard_id).ok_or(TenantMapError::ShuttingDown)?,
+            )),
+            TenantSlotPeekMode::Write => Err(TenantMapError::ShuttingDown),
         },
-        TenantsMap::Open(m) => m,
-    };
-
-    Ok(m.get(tenant_shard_id))
+        TenantsMap::Open(m) => Ok(m.get(tenant_shard_id)),
+    }
 }
 
 enum TenantSlotAcquireMode {
diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py
index ddf83b56a0..340cc9e9e3 100644
--- a/test_runner/fixtures/pageserver/http.py
+++ b/test_runner/fixtures/pageserver/http.py
@@ -20,6 +20,7 @@ from fixtures.utils import Fn
 class PageserverApiException(Exception):
     def __init__(self, message, status_code: int):
         super().__init__(message)
+        self.message = message
         self.status_code = status_code
 
 
@@ -261,12 +262,18 @@ class PageserverHttpClient(requests.Session):
         )
         self.verbose_error(res)
 
-    def tenant_detach(self, tenant_id: TenantId, detach_ignored=False):
+    def tenant_detach(self, tenant_id: TenantId, detach_ignored=False, timeout_secs=None):
         params = {}
         if detach_ignored:
             params["detach_ignored"] = "true"
 
-        res = self.post(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/detach", params=params)
+        kwargs = {}
+        if timeout_secs is not None:
+            kwargs["timeout"] = timeout_secs
+
+        res = self.post(
+            f"http://localhost:{self.port}/v1/tenant/{tenant_id}/detach", params=params, **kwargs
+        )
         self.verbose_error(res)
 
     def tenant_reset(self, tenant_id: Union[TenantId, TenantShardId], drop_cache: bool):
diff --git a/test_runner/regress/test_pageserver_secondary.py b/test_runner/regress/test_pageserver_secondary.py
index 521b96779a..293152dd62 100644
--- a/test_runner/regress/test_pageserver_secondary.py
+++ b/test_runner/regress/test_pageserver_secondary.py
@@ -5,7 +5,10 @@ from typing import Any, Dict, Optional
 import pytest
 from fixtures.log_helper import log
 from fixtures.neon_fixtures import NeonEnvBuilder, NeonPageserver, S3Scrubber
-from fixtures.pageserver.utils import assert_prefix_empty, tenant_delete_wait_completed
+from fixtures.pageserver.utils import (
+    assert_prefix_empty,
+    tenant_delete_wait_completed,
+)
 from fixtures.remote_storage import LocalFsStorage, RemoteStorageKind
 from fixtures.types import TenantId, TimelineId
 from fixtures.utils import wait_until
@@ -135,6 +138,16 @@ def test_location_conf_churn(neon_env_builder: NeonEnvBuilder, seed: int):
             pageserver.stop()
             pageserver.start()
             if last_state_ps[0].startswith("Attached") and latest_attached == pageserver.id:
+                # /re-attach call will bump generation: track that in our state in case we do an
+                # "attach in same generation" operation later
+                assert last_state_ps[1] is not None  # latest_attached == pageserfer.id implies this
+                # The re-attach API increments generation by exactly one.
+                new_generation = last_state_ps[1] + 1
+                last_state[pageserver.id] = (last_state_ps[0], new_generation)
+                tenants = pageserver.http_client().tenant_list()
+                assert len(tenants) == 1
+                assert tenants[0]["generation"] == new_generation
+
                 log.info("Entering postgres...")
                 workload.churn_rows(rng.randint(128, 256), pageserver.id)
                 workload.validate(pageserver.id)
diff --git a/test_runner/regress/test_tenant_delete.py b/test_runner/regress/test_tenant_delete.py
index bafc5711a1..b4e5a550f3 100644
--- a/test_runner/regress/test_tenant_delete.py
+++ b/test_runner/regress/test_tenant_delete.py
@@ -411,9 +411,7 @@ def test_long_timeline_create_cancelled_by_tenant_delete(neon_env_builder: NeonE
     pageserver_http.configure_failpoints((failpoint, "pause"))
 
     def hit_pausable_failpoint_and_later_fail():
-        with pytest.raises(
-            PageserverApiException, match="new timeline \\S+ has invalid disk_consistent_lsn"
-        ):
+        with pytest.raises(PageserverApiException, match="NotFound: tenant"):
             pageserver_http.timeline_create(
                 env.pg_version, env.initial_tenant, env.initial_timeline
             )
@@ -443,8 +441,8 @@ def test_long_timeline_create_cancelled_by_tenant_delete(neon_env_builder: NeonE
     try:
         wait_until(10, 1, has_hit_failpoint)
 
-        # it should start ok, sync up with the stuck creation, then fail because disk_consistent_lsn was not updated
-        # then deletion should fail and set the tenant broken
+        # it should start ok, sync up with the stuck creation, then hang waiting for the timeline
+        # to shut down.
         deletion = Thread(target=start_deletion)
         deletion.start()
 
@@ -573,9 +571,11 @@ def test_tenant_delete_races_timeline_creation(
     ps_http = env.pageserver.http_client()
     tenant_id = env.initial_tenant
 
-    # Sometimes it ends with "InternalServerError(Cancelled", sometimes with "InternalServerError(Operation was cancelled"
+    # When timeline creation is cancelled by tenant deletion, it is during Tenant::shutdown(), and
+    # acting on a shutdown tenant generates a 503 response (if caller retried they would later) get
+    # a 404 after the tenant is fully deleted.
     CANCELLED_ERROR = (
-        ".*POST.*Cancelled request finished with an error: InternalServerError\\(.*ancelled"
+        ".*POST.*Cancelled request finished successfully status=503 Service Unavailable"
     )
 
     # This can occur sometimes.
diff --git a/test_runner/regress/test_tenants.py b/test_runner/regress/test_tenants.py
index 2ee2d8125a..5164bda470 100644
--- a/test_runner/regress/test_tenants.py
+++ b/test_runner/regress/test_tenants.py
@@ -1,3 +1,4 @@
+import concurrent.futures
 import os
 import time
 from contextlib import closing
@@ -7,6 +8,7 @@ from pathlib import Path
 from typing import List
 
 import pytest
+import requests
 from fixtures.log_helper import log
 from fixtures.metrics import (
     PAGESERVER_GLOBAL_METRICS,
@@ -17,7 +19,9 @@ from fixtures.neon_fixtures import (
     NeonEnv,
     NeonEnvBuilder,
 )
-from fixtures.pageserver.utils import timeline_delete_wait_completed
+from fixtures.pageserver.http import PageserverApiException
+from fixtures.pageserver.utils import timeline_delete_wait_completed, wait_until_tenant_active
+from fixtures.pg_version import PgVersion
 from fixtures.remote_storage import RemoteStorageKind
 from fixtures.types import Lsn, TenantId
 from fixtures.utils import wait_until
@@ -341,3 +345,78 @@ def test_pageserver_with_empty_tenants(neon_env_builder: NeonEnvBuilder):
     assert (
         tenant_active_count == 1
     ), f"Tenant {tenant_with_empty_timelines} should have metric as active"
+
+
+def test_create_churn_during_restart(neon_env_builder: NeonEnvBuilder):
+    """
+    Probabilistic stress test for the pageserver's handling of tenant requests
+    across a restart. This is intended to catch things like:
+    - Bad response codes during shutdown (e.g. returning 500 instead of 503)
+    - Issues where a tenant is still starting up while we receive a request for it
+    - Issues with interrupting/resuming tenant/timeline creation in shutdown
+    """
+    env = neon_env_builder.init_configs()
+    env.start()
+    tenant_id: TenantId = env.initial_tenant
+    timeline_id = env.initial_timeline
+
+    # Multiple creation requests which race will generate this error
+    env.pageserver.allowed_errors.append(".*Conflict: Tenant is already being modified.*")
+
+    # Tenant creation requests which arrive out of order will generate complaints about
+    # generation nubmers out of order.
+    env.pageserver.allowed_errors.append(".*Generation .+ is less than existing .+")
+
+    # Our multiple creation requests will advance generation quickly, and when we skip
+    # a generation number we can generate these warnings
+    env.pageserver.allowed_errors.append(".*Dropped remote consistent LSN updates for tenant .+")
+
+    # Timeline::flush_and_shutdown cannot tell if it is hitting a failure because of
+    # an incomplete attach, or some other problem.  In the field this should be rare,
+    # so we allow it to log at WARN, even if it is occasionally a false positive.
+    env.pageserver.allowed_errors.append(".*failed to freeze and flush.*")
+
+    # When we shut down a tenant during a timeline creation, initdb is not cancelled, we wait
+    # for it to complete (since https://github.com/neondatabase/neon/pull/6451).  This means
+    # that shutdown can be delayed by >=1s on debug builds where initdb takes a long time to run.
+    env.pageserver.allowed_errors.append(".*still waiting, taking longer than expected... gate.*")
+
+    def create_bg(delay_ms):
+        time.sleep(delay_ms / 1000.0)
+        try:
+            env.pageserver.tenant_create(tenant_id=tenant_id)
+            env.pageserver.http_client().timeline_create(
+                PgVersion.NOT_SET, tenant_id, new_timeline_id=timeline_id
+            )
+        except PageserverApiException as e:
+            if e.status_code == 409:
+                log.info(f"delay_ms={delay_ms} 409")
+                pass
+            elif e.status_code == 400:
+                if "is less than existing" in e.message:
+                    # We send creation requests very close together in time: it is expected that these
+                    # race, and sometimes chigher-generation'd requests arrive first.  The pageserver rightly
+                    # rejects any attempt to make a generation number go backwards.
+                    pass
+                else:
+                    raise
+            else:
+                raise
+        except requests.exceptions.ConnectionError:
+            # Our requests might arrive during shutdown and be cut off at the transport level
+            pass
+
+    for _ in range(0, 10):
+        with concurrent.futures.ThreadPoolExecutor() as executor:
+            futs = []
+            for delay_ms in (0, 1, 10, 50, 100, 200, 500, 800):
+                f = executor.submit(create_bg, delay_ms)
+                futs.append(f)
+            env.pageserver.stop()
+            env.pageserver.start()
+
+            for f in futs:
+                f.result(timeout=10)
+
+    # The tenant should end up active
+    wait_until_tenant_active(env.pageserver.http_client(), tenant_id, iterations=10, period=1)
diff --git a/test_runner/regress/test_timeline_size.py b/test_runner/regress/test_timeline_size.py
index 2e58a413e4..4c5cb32caa 100644
--- a/test_runner/regress/test_timeline_size.py
+++ b/test_runner/regress/test_timeline_size.py
@@ -868,7 +868,7 @@ def test_ondemand_activation(neon_env_builder: NeonEnvBuilder):
     )
     assert pageserver_http.get_metric_value("pageserver_tenant_startup_complete_total") == n_tenants
 
-    # Check that tenant deletion proactively wakes tenants: this is done separately to the main
+    # Check that tenant deletion/detach proactively wakes tenants: this is done separately to the main
     # body of the test because it will disrupt tenant counts
     env.pageserver.stop()
     env.pageserver.start(
@@ -876,9 +876,22 @@ def test_ondemand_activation(neon_env_builder: NeonEnvBuilder):
     )
 
     wait_until(10, 1, at_least_one_active)
-    delete_tenant_id = list(
+
+    detach_tenant_id = list(
         [(tid, s) for (tid, s) in get_tenant_states().items() if s == "Attaching"]
     )[0][0]
+    delete_tenant_id = list(
+        [(tid, s) for (tid, s) in get_tenant_states().items() if s == "Attaching"]
+    )[1][0]
+
+    # Detaching a stuck tenant should proceed promptly
+    # (reproducer for https://github.com/neondatabase/neon/pull/6430)
+    env.pageserver.http_client().tenant_detach(detach_tenant_id, timeout_secs=10)
+    tenant_ids.remove(detach_tenant_id)
+    # FIXME: currently the mechanism for cancelling attach is to set state to broken, which is reported spuriously at error level
+    env.pageserver.allowed_errors.append(
+        ".*attach failed, setting tenant state to Broken: Shut down while Attaching"
+    )
 
     # Deleting a stuck tenant should prompt it to go active
     with concurrent.futures.ThreadPoolExecutor() as executor:
@@ -912,9 +925,10 @@ def test_ondemand_activation(neon_env_builder: NeonEnvBuilder):
         wait_tenant_status_404(pageserver_http, tenant_id=delete_tenant_id, iterations=40)
         tenant_ids.remove(delete_tenant_id)
 
-    # Check that all the stuck tenants proceed to active (apart from the one that deletes)
+    # Check that all the stuck tenants proceed to active (apart from the one that deletes, and the one
+    # we detached)
     wait_until(10, 1, all_active)
-    assert len(get_tenant_states()) == n_tenants - 1
+    assert len(get_tenant_states()) == n_tenants - 2
 
 
 def test_timeline_logical_size_task_priority(neon_env_builder: NeonEnvBuilder):

From 463b6a26b5bc1b73898d7bb75a53e3d448def97d Mon Sep 17 00:00:00 2001
From: Joonas Koivunen <joonas@neon.tech>
Date: Thu, 25 Jan 2024 15:38:28 +0200
Subject: [PATCH 50/70] test: show relative order eviction with "fast growing
 tenant" (#6377)

Refactor out test_disk_usage_eviction tenant creation and add a custom
case with 4 tenants, 3 made with pgbench scale=1 and 1 made with pgbench
scale=4.

Because the tenants are created in order of scales [1, 1, 1, 4] this is
simple enough to demonstrate the problem with using absolute access
times, because on a disk usage based eviction run we will
disproportionally target the *first* scale=1 tenant(s), and the later
larger tenant does not lose anything.

This test is not enough to show the difference between `relative_equal`
and `relative_spare` (the fudge factor); much larger scale will be
needed for "the large tenant", but that will make debug mode tests
slower.

Cc: #5304
---
 test_runner/fixtures/neon_fixtures.py         |   4 +-
 .../regress/test_disk_usage_eviction.py       | 182 +++++++++++++-----
 2 files changed, 142 insertions(+), 44 deletions(-)

diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index eef8de876e..fd5e77671b 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -1095,7 +1095,9 @@ class NeonEnv:
         assert that there is only one. Tests with multiple pageservers should always use
         get_pageserver with an explicit ID.
         """
-        assert len(self.pageservers) == 1
+        assert (
+            len(self.pageservers) == 1
+        ), "env.pageserver must only be used with single pageserver NeonEnv"
         return self.pageservers[0]
 
     def get_pageserver(self, id: Optional[int]) -> NeonPageserver:
diff --git a/test_runner/regress/test_disk_usage_eviction.py b/test_runner/regress/test_disk_usage_eviction.py
index 70c3b77516..6a4f0edbea 100644
--- a/test_runner/regress/test_disk_usage_eviction.py
+++ b/test_runner/regress/test_disk_usage_eviction.py
@@ -2,7 +2,7 @@ import enum
 import time
 from collections import Counter
 from dataclasses import dataclass
-from typing import Any, Dict, Tuple
+from typing import Any, Dict, Iterable, Tuple
 
 import pytest
 import toml
@@ -121,17 +121,7 @@ class EvictionEnv:
         }
 
     def count_layers_per_tenant(self, pageserver: NeonPageserver) -> Dict[TenantId, int]:
-        ret: Counter[TenantId] = Counter()
-
-        for tenant_id, timeline_id in self.timelines:
-            timeline_dir = pageserver.timeline_dir(tenant_id, timeline_id)
-            assert timeline_dir.exists()
-            for file in timeline_dir.iterdir():
-                if "__" not in file.name:
-                    continue
-                ret[tenant_id] += 1
-
-        return dict(ret)
+        return count_layers_per_tenant(pageserver, self.timelines)
 
     def warm_up_tenant(self, tenant_id: TenantId):
         """
@@ -199,6 +189,22 @@ class EvictionEnv:
         wait_until(10, 1, statvfs_called)
 
 
+def count_layers_per_tenant(
+    pageserver: NeonPageserver, timelines: Iterable[Tuple[TenantId, TimelineId]]
+) -> Dict[TenantId, int]:
+    ret: Counter[TenantId] = Counter()
+
+    for tenant_id, timeline_id in timelines:
+        timeline_dir = pageserver.timeline_dir(tenant_id, timeline_id)
+        assert timeline_dir.exists()
+        for file in timeline_dir.iterdir():
+            if "__" not in file.name:
+                continue
+            ret[tenant_id] += 1
+
+    return dict(ret)
+
+
 def human_bytes(amt: float) -> str:
     suffixes = ["", "Ki", "Mi", "Gi"]
 
@@ -243,21 +249,7 @@ def _eviction_env(
 
     timelines = []
     for scale in pgbench_scales:
-        tenant_id, timeline_id = env.neon_cli.create_tenant(
-            conf={
-                "gc_period": "0s",
-                "compaction_period": "0s",
-                "checkpoint_distance": f"{layer_size}",
-                "image_creation_threshold": "100",
-                "compaction_target_size": f"{layer_size}",
-            }
-        )
-
-        with env.endpoints.create_start("main", tenant_id=tenant_id) as endpoint:
-            pg_bin.run(["pgbench", "-i", f"-s{scale}", endpoint.connstr()])
-            wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id)
-
-        timelines.append((tenant_id, timeline_id))
+        timelines.append(pgbench_init_tenant(layer_size, scale, env, pg_bin))
 
     # stop the safekeepers to avoid on-demand downloads caused by
     # initial logical size calculation triggered by walreceiver connection status
@@ -266,25 +258,13 @@ def _eviction_env(
 
     # after stopping the safekeepers, we know that no new WAL will be coming in
     for tenant_id, timeline_id in timelines:
-        pageserver_http = env.get_tenant_pageserver(tenant_id).http_client()
-
-        pageserver_http.timeline_checkpoint(tenant_id, timeline_id)
-        wait_for_upload_queue_empty(pageserver_http, tenant_id, timeline_id)
-        tl_info = pageserver_http.timeline_detail(tenant_id, timeline_id)
-        assert tl_info["last_record_lsn"] == tl_info["disk_consistent_lsn"]
-        assert tl_info["disk_consistent_lsn"] == tl_info["remote_consistent_lsn"]
-        pgbench_init_lsns[tenant_id] = Lsn(tl_info["last_record_lsn"])
-
-        layers = pageserver_http.layer_map_info(tenant_id, timeline_id)
-        log.info(f"{layers}")
-        assert (
-            len(layers.historic_layers) >= 10
-        ), "evictions happen at layer granularity, but we often assert at byte-granularity"
+        pgbench_init_lsns[tenant_id] = finish_tenant_creation(env, tenant_id, timeline_id, 10)
 
     eviction_env = EvictionEnv(
         timelines=timelines,
         neon_env=env,
-        pageserver_http=pageserver_http,
+        # this last tenant http client works for num_pageservers=1
+        pageserver_http=env.get_tenant_pageserver(timelines[-1][0]).http_client(),
         layer_size=layer_size,
         pg_bin=pg_bin,
         pgbench_init_lsns=pgbench_init_lsns,
@@ -293,6 +273,49 @@ def _eviction_env(
     return eviction_env
 
 
+def pgbench_init_tenant(
+    layer_size: int, scale: int, env: NeonEnv, pg_bin: PgBin
+) -> Tuple[TenantId, TimelineId]:
+    tenant_id, timeline_id = env.neon_cli.create_tenant(
+        conf={
+            "gc_period": "0s",
+            "compaction_period": "0s",
+            "checkpoint_distance": f"{layer_size}",
+            "image_creation_threshold": "100",
+            "compaction_target_size": f"{layer_size}",
+        }
+    )
+
+    with env.endpoints.create_start("main", tenant_id=tenant_id) as endpoint:
+        pg_bin.run(["pgbench", "-i", f"-s{scale}", endpoint.connstr()])
+        wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id)
+
+    return (tenant_id, timeline_id)
+
+
+def finish_tenant_creation(
+    env: NeonEnv,
+    tenant_id: TenantId,
+    timeline_id: TimelineId,
+    min_expected_layers: int,
+) -> Lsn:
+    pageserver_http = env.get_tenant_pageserver(tenant_id).http_client()
+    pageserver_http.timeline_checkpoint(tenant_id, timeline_id)
+    wait_for_upload_queue_empty(pageserver_http, tenant_id, timeline_id)
+    tl_info = pageserver_http.timeline_detail(tenant_id, timeline_id)
+    assert tl_info["last_record_lsn"] == tl_info["disk_consistent_lsn"]
+    assert tl_info["disk_consistent_lsn"] == tl_info["remote_consistent_lsn"]
+    pgbench_init_lsn = Lsn(tl_info["last_record_lsn"])
+
+    layers = pageserver_http.layer_map_info(tenant_id, timeline_id)
+    # log.info(f"{layers}")
+    assert (
+        len(layers.historic_layers) >= min_expected_layers
+    ), "evictions happen at layer granularity, but we often assert at byte-granularity"
+
+    return pgbench_init_lsn
+
+
 @pytest.fixture
 def eviction_env(request, neon_env_builder: NeonEnvBuilder, pg_bin: PgBin) -> EvictionEnv:
     return _eviction_env(request, neon_env_builder, pg_bin, num_pageservers=1)
@@ -598,9 +621,82 @@ def test_partial_evict_tenant(eviction_env: EvictionEnv, order: EvictionOrder):
             assert abs_diff < 0.05
 
 
+@pytest.mark.parametrize(
+    "order",
+    [
+        EvictionOrder.ABSOLUTE_ORDER,
+        EvictionOrder.RELATIVE_ORDER_EQUAL,
+        EvictionOrder.RELATIVE_ORDER_SPARE,
+    ],
+)
+def test_fast_growing_tenant(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin, order: EvictionOrder):
+    """
+    Create in order first smaller tenants and finally a single larger tenant.
+    Assert that with relative order modes, the disk usage based eviction is
+    more fair towards the smaller tenants.
+    """
+    env = neon_env_builder.init_configs()
+    env.start()
+    env.pageserver.allowed_errors.append(r".* running disk usage based eviction due to pressure.*")
+
+    # initial_tenant and initial_timeline do not exist
+
+    # create N tenants the same fashion as EvictionEnv
+    layer_size = 5 * 1024**2
+    timelines = []
+    for scale in [1, 1, 1, 4]:
+        timelines.append((pgbench_init_tenant(layer_size, scale, env, pg_bin), scale))
+
+    env.neon_cli.safekeeper_stop()
+
+    for (tenant_id, timeline_id), scale in timelines:
+        min_expected_layers = 4 if scale == 1 else 10
+        finish_tenant_creation(env, tenant_id, timeline_id, min_expected_layers)
+
+    tenant_layers = count_layers_per_tenant(env.pageserver, map(lambda x: x[0], timelines))
+    (total_on_disk, _, _) = poor_mans_du(env, map(lambda x: x[0], timelines), env.pageserver, False)
+
+    # cut 10 percent
+    response = env.pageserver.http_client().disk_usage_eviction_run(
+        {"evict_bytes": total_on_disk // 10, "eviction_order": order.config()}
+    )
+    log.info(f"{response}")
+
+    after_tenant_layers = count_layers_per_tenant(env.pageserver, map(lambda x: x[0], timelines))
+
+    ratios = []
+    for i, ((tenant_id, _timeline_id), _scale) in enumerate(timelines):
+        # we expect the oldest to suffer most
+        originally, after = tenant_layers[tenant_id], after_tenant_layers[tenant_id]
+        log.info(f"{i + 1}th tenant went from {originally} -> {after}")
+        ratio = after / originally
+        ratios.append(ratio)
+
+    assert (
+        len(ratios) == 4
+    ), "rest of the assertions expect 3 + 1 timelines, ratios, scales, all in order"
+    log.info(f"{ratios}")
+
+    if order == EvictionOrder.ABSOLUTE_ORDER:
+        # first tenant loses most
+        assert ratios[0] <= ratios[1], "first should lose the most"
+        assert ratios[1] < ratios[2], "second should lose some"
+        assert ratios[1] < 1.0
+        assert ratios[2] <= ratios[3], "third might not lose"
+        assert ratios[3] == 1.0, "tenant created last does not lose"
+    elif order == EvictionOrder.RELATIVE_ORDER_EQUAL:
+        assert all([x for x in ratios if x < 1.0]), "all tenants lose layers"
+    elif order == EvictionOrder.RELATIVE_ORDER_SPARE:
+        # with different layer sizes and pg versions, there are different combinations
+        assert len([x for x in ratios if x < 1.0]) >= 2, "require 2..4 tenants to lose layers"
+        assert ratios[3] < 1.0, "largest tenant always loses layers"
+    else:
+        raise RuntimeError(f"unimplemented {order}")
+
+
 def poor_mans_du(
     env: NeonEnv,
-    timelines: list[Tuple[TenantId, TimelineId]],
+    timelines: Iterable[Tuple[TenantId, TimelineId]],
     pageserver: NeonPageserver,
     verbose: bool = False,
 ) -> Tuple[int, int, int]:

From 19ed23070878bd24cbd9c0eceb10fd815b0abc72 Mon Sep 17 00:00:00 2001
From: Konstantin Knizhnik <knizhnik@garret.ru>
Date: Thu, 25 Jan 2024 15:53:31 +0200
Subject: [PATCH 51/70] Add support for PS sharding in compute (#6205)

refer #5508

replaces #5837

## Problem

This PR implements sharding support at compute side. Relations are
splinted in stripes and `get_page` requests are redirected to the
particular shard where stripe is located. All other requests (i.e. get
relation or database size) are always send to shard 0.

## Summary of changes

Support of sharding at compute side include three things:
1. Make it possible to specify and change in runtime connection to more
retain one page server
2. Send `get_page` request to the particular shard (determined by hash
of page key)
3. Support multiple servers in prefetch ring requests

## Checklist before requesting a review

- [ ] I have performed a self-review of my code.
- [ ] If it is a core feature, I have added thorough tests.
- [ ] Do we need to implement analytics? if so did you add the relevant
metrics to the dashboard?
- [ ] If this PR requires public announcement, mark it with
/release-notes label and add several sentences in this section.

## Checklist before merging

- [ ] Do not forget to reformat commit message to not include the above
checklist

---------

Co-authored-by: Konstantin Knizhnik <knizhnik@neon.tech>
Co-authored-by: John Spray <john@neon.tech>
Co-authored-by: Heikki Linnakangas <heikki@neon.tech>
---
 pgxn/neon/libpagestore.c     | 433 ++++++++++++++++++++++++-----------
 pgxn/neon/pagestore_client.h |  17 +-
 pgxn/neon/pagestore_smgr.c   |  80 ++++++-
 3 files changed, 381 insertions(+), 149 deletions(-)

diff --git a/pgxn/neon/libpagestore.c b/pgxn/neon/libpagestore.c
index 894c9729f2..0eb1acbfb0 100644
--- a/pgxn/neon/libpagestore.c
+++ b/pgxn/neon/libpagestore.c
@@ -15,6 +15,7 @@
 #include "postgres.h"
 
 #include "access/xlog.h"
+#include "common/hashfn.h"
 #include "fmgr.h"
 #include "libpq-fe.h"
 #include "libpq/libpq.h"
@@ -38,17 +39,6 @@
 #define MIN_RECONNECT_INTERVAL_USEC 1000
 #define MAX_RECONNECT_INTERVAL_USEC 1000000
 
-bool		connected = false;
-PGconn	   *pageserver_conn = NULL;
-
-/*
- * WaitEventSet containing:
- * - WL_SOCKET_READABLE on pageserver_conn,
- * - WL_LATCH_SET on MyLatch, and
- * - WL_EXIT_ON_PM_DEATH.
- */
-WaitEventSet *pageserver_conn_wes = NULL;
-
 /* GUCs */
 char	   *neon_timeline;
 char	   *neon_tenant;
@@ -59,17 +49,25 @@ char	   *neon_auth_token;
 int			readahead_buffer_size = 128;
 int			flush_every_n_requests = 8;
 
-static int n_reconnect_attempts = 0;
-static int max_reconnect_attempts = 60;
+static int	n_reconnect_attempts = 0;
+static int	max_reconnect_attempts = 60;
+static int	stripe_size;
 
-#define MAX_PAGESERVER_CONNSTRING_SIZE 256
+typedef struct
+{
+	char		connstring[MAX_SHARDS][MAX_PAGESERVER_CONNSTRING_SIZE];
+	size_t		num_shards;
+} ShardMap;
 
 /*
+ * PagestoreShmemState is kept in shared memory. It contains the connection
+ * strings for each shard.
+ *
  * The "neon.pageserver_connstring" GUC is marked with the PGC_SIGHUP option,
  * allowing it to be changed using pg_reload_conf(). The control plane can
  * update the connection string if the pageserver crashes, is relocated, or
- * new shards are added. A copy of the current value of the GUC is kept in
- * shared memory, updated by the postmaster, because regular backends don't
+ * new shards are added. A parsed copy of the current value of the GUC is kept
+ * in shared memory, updated by the postmaster, because regular backends don't
  * reload the config during query execution, but we might need to re-establish
  * the pageserver connection with the new connection string even in the middle
  * of a query.
@@ -84,7 +82,7 @@ typedef struct
 {
 	pg_atomic_uint64 begin_update_counter;
 	pg_atomic_uint64 end_update_counter;
-	char		pageserver_connstring[MAX_PAGESERVER_CONNSTRING_SIZE];
+	ShardMap	shard_map;
 } PagestoreShmemState;
 
 #if PG_VERSION_NUM >= 150000
@@ -94,10 +92,25 @@ static void walproposer_shmem_request(void);
 static shmem_startup_hook_type prev_shmem_startup_hook;
 static PagestoreShmemState *pagestore_shared;
 static uint64 pagestore_local_counter = 0;
-static char local_pageserver_connstring[MAX_PAGESERVER_CONNSTRING_SIZE];
 
-static bool pageserver_flush(void);
-static void pageserver_disconnect(void);
+/* This backend's per-shard connections */
+typedef struct
+{
+	PGconn	   *conn;
+
+	/*---
+	 * WaitEventSet containing:
+	 * - WL_SOCKET_READABLE on 'conn'
+	 * - WL_LATCH_SET on MyLatch, and
+	 * - WL_EXIT_ON_PM_DEATH.
+	 */
+	WaitEventSet *wes;
+} PageServer;
+
+static PageServer page_servers[MAX_SHARDS];
+
+static bool pageserver_flush(shardno_t shard_no);
+static void pageserver_disconnect(shardno_t shard_no);
 
 static bool
 PagestoreShmemIsValid(void)
@@ -105,89 +118,213 @@ PagestoreShmemIsValid(void)
 	return pagestore_shared && UsedShmemSegAddr;
 }
 
+/*
+ * Parse a comma-separated list of connection strings into a ShardMap.
+ *
+ * If 'result' is NULL, just checks that the input is valid. If the input is
+ * not valid, returns false. The contents of *result are undefined in
+ * that case, and must not be relied on.
+ */
+static bool
+ParseShardMap(const char *connstr, ShardMap *result)
+{
+	const char *p;
+	int			nshards = 0;
+
+	if (result)
+		memset(result, 0, sizeof(ShardMap));
+
+	p = connstr;
+	nshards = 0;
+	for (;;)
+	{
+		const char *sep;
+		size_t		connstr_len;
+
+		sep = strchr(p, ',');
+		connstr_len = sep != NULL ? sep - p : strlen(p);
+
+		if (connstr_len == 0 && sep == NULL)
+			break;				/* ignore trailing comma */
+
+		if (nshards >= MAX_SHARDS)
+		{
+			neon_log(LOG, "Too many shards");
+			return false;
+		}
+		if (connstr_len >= MAX_PAGESERVER_CONNSTRING_SIZE)
+		{
+			neon_log(LOG, "Connection string too long");
+			return false;
+		}
+		if (result)
+		{
+			memcpy(result->connstring[nshards], p, connstr_len);
+			result->connstring[nshards][connstr_len] = '\0';
+		}
+		nshards++;
+
+		if (sep == NULL)
+			break;
+		p = sep + 1;
+	}
+	if (result)
+		result->num_shards = nshards;
+
+	return true;
+}
+
 static bool
 CheckPageserverConnstring(char **newval, void **extra, GucSource source)
 {
-	return strlen(*newval) < MAX_PAGESERVER_CONNSTRING_SIZE;
+	char	   *p = *newval;
+
+	return ParseShardMap(p, NULL);
 }
 
 static void
 AssignPageserverConnstring(const char *newval, void *extra)
 {
+	ShardMap	shard_map;
+
 	/*
 	 * Only postmaster updates the copy in shared memory.
 	 */
 	if (!PagestoreShmemIsValid() || IsUnderPostmaster)
 		return;
 
-	pg_atomic_add_fetch_u64(&pagestore_shared->begin_update_counter, 1);
-	pg_write_barrier();
-	strlcpy(pagestore_shared->pageserver_connstring, newval, MAX_PAGESERVER_CONNSTRING_SIZE);
-	pg_write_barrier();
-	pg_atomic_add_fetch_u64(&pagestore_shared->end_update_counter, 1);
-}
-
-static bool
-CheckConnstringUpdated(void)
-{
-	if (!PagestoreShmemIsValid())
-		return false;
-	return pagestore_local_counter < pg_atomic_read_u64(&pagestore_shared->begin_update_counter);
+	if (!ParseShardMap(newval, &shard_map))
+	{
+		/*
+		 * shouldn't happen, because we already checked the value in
+		 * CheckPageserverConnstring
+		 */
+		elog(ERROR, "could not parse shard map");
+	}
+
+	if (memcmp(&pagestore_shared->shard_map, &shard_map, sizeof(ShardMap)) != 0)
+	{
+		pg_atomic_add_fetch_u64(&pagestore_shared->begin_update_counter, 1);
+		pg_write_barrier();
+		memcpy(&pagestore_shared->shard_map, &shard_map, sizeof(ShardMap));
+		pg_write_barrier();
+		pg_atomic_add_fetch_u64(&pagestore_shared->end_update_counter, 1);
+	}
+	else
+	{
+		/* no change */
+	}
 }
 
+/*
+ * Get the current number of shards, and/or the connection string for a
+ * particular shard from the shard map in shared memory.
+ *
+ * If num_shards_p is not NULL, it is set to the current number of shards.
+ *
+ * If connstr_p is not NULL, the connection string for 'shard_no' is copied to
+ * it. It must point to a buffer at least MAX_PAGESERVER_CONNSTRING_SIZE bytes
+ * long.
+ *
+ * As a side-effect, if the shard map in shared memory had changed since the
+ * last call, terminates all existing connections to all pageservers.
+ */
 static void
-ReloadConnstring(void)
+load_shard_map(shardno_t shard_no, char *connstr_p, shardno_t *num_shards_p)
 {
 	uint64		begin_update_counter;
 	uint64		end_update_counter;
-
-	if (!PagestoreShmemIsValid())
-		return;
+	ShardMap   *shard_map = &pagestore_shared->shard_map;
+	shardno_t	num_shards;
 
 	/*
-	 * Copy the current settnig from shared to local memory. Postmaster can
-	 * update the value concurrently, in which case we would copy a garbled
-	 * mix of the old and new values. We will detect it because the counter's
-	 * won't match, and retry. But it's important that we don't do anything
-	 * within the retry-loop that would depend on the string having valid
-	 * contents.
+	 * Postmaster can update the shared memory values concurrently, in which
+	 * case we would copy a garbled mix of the old and new values. We will
+	 * detect it because the counter's won't match, and retry. But it's
+	 * important that we don't do anything within the retry-loop that would
+	 * depend on the string having valid contents.
 	 */
 	do
 	{
 		begin_update_counter = pg_atomic_read_u64(&pagestore_shared->begin_update_counter);
 		end_update_counter = pg_atomic_read_u64(&pagestore_shared->end_update_counter);
-		pg_read_barrier();
 
-		strlcpy(local_pageserver_connstring, pagestore_shared->pageserver_connstring, sizeof(local_pageserver_connstring));
-		pg_read_barrier();
+		num_shards = shard_map->num_shards;
+		if (connstr_p && shard_no < MAX_SHARDS)
+			strlcpy(connstr_p, shard_map->connstring[shard_no], MAX_PAGESERVER_CONNSTRING_SIZE);
+		pg_memory_barrier();
 	}
 	while (begin_update_counter != end_update_counter
 		   || begin_update_counter != pg_atomic_read_u64(&pagestore_shared->begin_update_counter)
 		   || end_update_counter != pg_atomic_read_u64(&pagestore_shared->end_update_counter));
 
-	pagestore_local_counter = end_update_counter;
+	if (connstr_p && shard_no >= num_shards)
+		neon_log(ERROR, "Shard %d is greater or equal than number of shards %d",
+				 shard_no, num_shards);
+
+	/*
+	 * If any of the connection strings changed, reset all connections.
+	 */
+	if (pagestore_local_counter != end_update_counter)
+	{
+		for (shardno_t i = 0; i < MAX_SHARDS; i++)
+		{
+			if (page_servers[i].conn)
+				pageserver_disconnect(i);
+		}
+		pagestore_local_counter = end_update_counter;
+	}
+
+	if (num_shards_p)
+		*num_shards_p = num_shards;
+}
+
+#define MB (1024*1024)
+
+shardno_t
+get_shard_number(BufferTag *tag)
+{
+	shardno_t	n_shards;
+	uint32		hash;
+
+	load_shard_map(0, NULL, &n_shards);
+
+#if PG_MAJORVERSION_NUM < 16
+	hash = murmurhash32(tag->rnode.relNode);
+	hash = hash_combine(hash, murmurhash32(tag->blockNum / stripe_size));
+#else
+	hash = murmurhash32(tag->relNumber);
+	hash = hash_combine(hash, murmurhash32(tag->blockNum / stripe_size));
+#endif
+
+	return hash % n_shards;
 }
 
 static bool
-pageserver_connect(int elevel)
+pageserver_connect(shardno_t shard_no, int elevel)
 {
 	char	   *query;
 	int			ret;
 	const char *keywords[3];
 	const char *values[3];
 	int			n;
+	PGconn	   *conn;
+	WaitEventSet *wes;
+	char		connstr[MAX_PAGESERVER_CONNSTRING_SIZE];
 
 	static TimestampTz last_connect_time = 0;
 	static uint64_t delay_us = MIN_RECONNECT_INTERVAL_USEC;
 	TimestampTz now;
 	uint64_t	us_since_last_connect;
 
-	Assert(!connected);
+	Assert(page_servers[shard_no].conn == NULL);
 
-	if (CheckConnstringUpdated())
-	{
-		ReloadConnstring();
-	}
+	/*
+	 * Get the connection string for this shard. If the shard map has been
+	 * updated since we last looked, this will also disconnect any existing
+	 * pageserver connections as a side effect.
+	 */
+	load_shard_map(shard_no, connstr, NULL);
 
 	now = GetCurrentTimestamp();
 	us_since_last_connect = now - last_connect_time;
@@ -223,76 +360,84 @@ pageserver_connect(int elevel)
 		n++;
 	}
 	keywords[n] = "dbname";
-	values[n] = local_pageserver_connstring;
+	values[n] = connstr;
 	n++;
 	keywords[n] = NULL;
 	values[n] = NULL;
 	n++;
-	pageserver_conn = PQconnectdbParams(keywords, values, 1);
+	conn = PQconnectdbParams(keywords, values, 1);
 
-	if (PQstatus(pageserver_conn) == CONNECTION_BAD)
+	if (PQstatus(conn) == CONNECTION_BAD)
 	{
-		char	   *msg = pchomp(PQerrorMessage(pageserver_conn));
+		char	   *msg = pchomp(PQerrorMessage(conn));
 
-		PQfinish(pageserver_conn);
-		pageserver_conn = NULL;
+		PQfinish(conn);
 
 		ereport(elevel,
 				(errcode(ERRCODE_SQLCLIENT_UNABLE_TO_ESTABLISH_SQLCONNECTION),
-				 errmsg(NEON_TAG "could not establish connection to pageserver"),
+				 errmsg(NEON_TAG "[shard %d] could not establish connection to pageserver", shard_no),
 				 errdetail_internal("%s", msg)));
+		pfree(msg);
 		return false;
 	}
-
 	query = psprintf("pagestream %s %s", neon_tenant, neon_timeline);
-	ret = PQsendQuery(pageserver_conn, query);
+	ret = PQsendQuery(conn, query);
+	pfree(query);
 	if (ret != 1)
 	{
-		PQfinish(pageserver_conn);
-		pageserver_conn = NULL;
-		neon_log(elevel, "could not send pagestream command to pageserver");
+		PQfinish(conn);
+		neon_shard_log(shard_no, elevel, "could not send pagestream command to pageserver");
 		return false;
 	}
 
-	pageserver_conn_wes = CreateWaitEventSet(TopMemoryContext, 3);
-	AddWaitEventToSet(pageserver_conn_wes, WL_LATCH_SET, PGINVALID_SOCKET,
+	wes = CreateWaitEventSet(TopMemoryContext, 3);
+	AddWaitEventToSet(wes, WL_LATCH_SET, PGINVALID_SOCKET,
 					  MyLatch, NULL);
-	AddWaitEventToSet(pageserver_conn_wes, WL_EXIT_ON_PM_DEATH, PGINVALID_SOCKET,
+	AddWaitEventToSet(wes, WL_EXIT_ON_PM_DEATH, PGINVALID_SOCKET,
 					  NULL, NULL);
-	AddWaitEventToSet(pageserver_conn_wes, WL_SOCKET_READABLE, PQsocket(pageserver_conn), NULL, NULL);
+	AddWaitEventToSet(wes, WL_SOCKET_READABLE, PQsocket(conn), NULL, NULL);
 
-	while (PQisBusy(pageserver_conn))
+	PG_TRY();
 	{
-		WaitEvent	event;
-
-		/* Sleep until there's something to do */
-		(void) WaitEventSetWait(pageserver_conn_wes, -1L, &event, 1, PG_WAIT_EXTENSION);
-		ResetLatch(MyLatch);
-
-		CHECK_FOR_INTERRUPTS();
-
-		/* Data available in socket? */
-		if (event.events & WL_SOCKET_READABLE)
+		while (PQisBusy(conn))
 		{
-			if (!PQconsumeInput(pageserver_conn))
+			WaitEvent	event;
+
+			/* Sleep until there's something to do */
+			(void) WaitEventSetWait(wes, -1L, &event, 1, PG_WAIT_EXTENSION);
+			ResetLatch(MyLatch);
+
+			CHECK_FOR_INTERRUPTS();
+
+			/* Data available in socket? */
+			if (event.events & WL_SOCKET_READABLE)
 			{
-				char	   *msg = pchomp(PQerrorMessage(pageserver_conn));
+				if (!PQconsumeInput(conn))
+				{
+					char	   *msg = pchomp(PQerrorMessage(conn));
 
-				PQfinish(pageserver_conn);
-				pageserver_conn = NULL;
-				FreeWaitEventSet(pageserver_conn_wes);
-				pageserver_conn_wes = NULL;
+					PQfinish(conn);
+					FreeWaitEventSet(wes);
 
-				neon_log(elevel, "could not complete handshake with pageserver: %s",
-						 msg);
-				return false;
+					neon_shard_log(shard_no, elevel, "could not complete handshake with pageserver: %s",
+								   msg);
+					return false;
+				}
 			}
 		}
 	}
+	PG_CATCH();
+	{
+		PQfinish(conn);
+		FreeWaitEventSet(wes);
+		PG_RE_THROW();
+	}
+	PG_END_TRY();
 
-	neon_log(LOG, "libpagestore: connected to '%s'", page_server_connstring);
+	neon_shard_log(shard_no, LOG, "libpagestore: connected to '%s'", connstr);
+	page_servers[shard_no].conn = conn;
+	page_servers[shard_no].wes = wes;
 
-	connected = true;
 	return true;
 }
 
@@ -300,9 +445,10 @@ pageserver_connect(int elevel)
  * A wrapper around PQgetCopyData that checks for interrupts while sleeping.
  */
 static int
-call_PQgetCopyData(char **buffer)
+call_PQgetCopyData(shardno_t shard_no, char **buffer)
 {
 	int			ret;
+	PGconn	   *pageserver_conn = page_servers[shard_no].conn;
 
 retry:
 	ret = PQgetCopyData(pageserver_conn, buffer, 1 /* async */ );
@@ -312,7 +458,7 @@ retry:
 		WaitEvent	event;
 
 		/* Sleep until there's something to do */
-		(void) WaitEventSetWait(pageserver_conn_wes, -1L, &event, 1, PG_WAIT_EXTENSION);
+		(void) WaitEventSetWait(page_servers[shard_no].wes, -1L, &event, 1, PG_WAIT_EXTENSION);
 		ResetLatch(MyLatch);
 
 		CHECK_FOR_INTERRUPTS();
@@ -324,7 +470,7 @@ retry:
 			{
 				char	   *msg = pchomp(PQerrorMessage(pageserver_conn));
 
-				neon_log(LOG, "could not get response from pageserver: %s", msg);
+				neon_shard_log(shard_no, LOG, "could not get response from pageserver: %s", msg);
 				pfree(msg);
 				return -1;
 			}
@@ -338,7 +484,7 @@ retry:
 
 
 static void
-pageserver_disconnect(void)
+pageserver_disconnect(shardno_t shard_no)
 {
 	/*
 	 * If anything goes wrong while we were sending a request, it's not clear
@@ -347,38 +493,38 @@ pageserver_disconnect(void)
 	 * time later after we have already sent a new unrelated request. Close
 	 * the connection to avoid getting confused.
 	 */
-	if (connected)
+	if (page_servers[shard_no].conn)
 	{
-		neon_log(LOG, "dropping connection to page server due to error");
-		PQfinish(pageserver_conn);
-		pageserver_conn = NULL;
-		connected = false;
+		neon_shard_log(shard_no, LOG, "dropping connection to page server due to error");
+		PQfinish(page_servers[shard_no].conn);
+		page_servers[shard_no].conn = NULL;
 
+		/*
+		 * If the connection to any pageserver is lost, we throw away the
+		 * whole prefetch queue, even for other pageservers. It should not
+		 * cause big problems, because connection loss is supposed to be a
+		 * rare event.
+		 */
 		prefetch_on_ps_disconnect();
 	}
-	if (pageserver_conn_wes != NULL)
+	if (page_servers[shard_no].wes != NULL)
 	{
-		FreeWaitEventSet(pageserver_conn_wes);
-		pageserver_conn_wes = NULL;
+		FreeWaitEventSet(page_servers[shard_no].wes);
+		page_servers[shard_no].wes = NULL;
 	}
 }
 
 static bool
-pageserver_send(NeonRequest *request)
+pageserver_send(shardno_t shard_no, NeonRequest *request)
 {
 	StringInfoData req_buff;
-
-	if (CheckConnstringUpdated())
-	{
-		pageserver_disconnect();
-		ReloadConnstring();
-	}
+	PGconn	   *pageserver_conn = page_servers[shard_no].conn;
 
 	/* If the connection was lost for some reason, reconnect */
-	if (connected && PQstatus(pageserver_conn) == CONNECTION_BAD)
+	if (pageserver_conn && PQstatus(pageserver_conn) == CONNECTION_BAD)
 	{
-		neon_log(LOG, "pageserver_send disconnect bad connection");
-		pageserver_disconnect();
+		neon_shard_log(shard_no, LOG, "pageserver_send disconnect bad connection");
+		pageserver_disconnect(shard_no);
 	}
 
 	req_buff = nm_pack_request(request);
@@ -392,9 +538,9 @@ pageserver_send(NeonRequest *request)
 	 * https://github.com/neondatabase/neon/issues/1138 So try to reestablish
 	 * connection in case of failure.
 	 */
-	if (!connected)
+	if (!page_servers[shard_no].conn)
 	{
-		while (!pageserver_connect(n_reconnect_attempts < max_reconnect_attempts ? LOG : ERROR))
+		while (!pageserver_connect(shard_no, n_reconnect_attempts < max_reconnect_attempts ? LOG : ERROR))
 		{
 			HandleMainLoopInterrupts();
 			n_reconnect_attempts += 1;
@@ -402,6 +548,8 @@ pageserver_send(NeonRequest *request)
 		n_reconnect_attempts = 0;
 	}
 
+	pageserver_conn = page_servers[shard_no].conn;
+
 	/*
 	 * Send request.
 	 *
@@ -414,8 +562,8 @@ pageserver_send(NeonRequest *request)
 	{
 		char	   *msg = pchomp(PQerrorMessage(pageserver_conn));
 
-		pageserver_disconnect();
-		neon_log(LOG, "pageserver_send disconnect because failed to send page request (try to reconnect): %s", msg);
+		pageserver_disconnect(shard_no);
+		neon_shard_log(shard_no, LOG, "pageserver_send disconnect because failed to send page request (try to reconnect): %s", msg);
 		pfree(msg);
 		pfree(req_buff.data);
 		return false;
@@ -427,19 +575,20 @@ pageserver_send(NeonRequest *request)
 	{
 		char	   *msg = nm_to_string((NeonMessage *) request);
 
-		neon_log(PageStoreTrace, "sent request: %s", msg);
+		neon_shard_log(shard_no, PageStoreTrace, "sent request: %s", msg);
 		pfree(msg);
 	}
 	return true;
 }
 
 static NeonResponse *
-pageserver_receive(void)
+pageserver_receive(shardno_t shard_no)
 {
 	StringInfoData resp_buff;
 	NeonResponse *resp;
+	PGconn	   *pageserver_conn = page_servers[shard_no].conn;
 
-	if (!connected)
+	if (!pageserver_conn)
 		return NULL;
 
 	PG_TRY();
@@ -447,7 +596,7 @@ pageserver_receive(void)
 		/* read response */
 		int			rc;
 
-		rc = call_PQgetCopyData(&resp_buff.data);
+		rc = call_PQgetCopyData(shard_no, &resp_buff.data);
 		if (rc >= 0)
 		{
 			resp_buff.len = rc;
@@ -459,33 +608,33 @@ pageserver_receive(void)
 			{
 				char	   *msg = nm_to_string((NeonMessage *) resp);
 
-				neon_log(PageStoreTrace, "got response: %s", msg);
+				neon_shard_log(shard_no, PageStoreTrace, "got response: %s", msg);
 				pfree(msg);
 			}
 		}
 		else if (rc == -1)
 		{
-			neon_log(LOG, "pageserver_receive disconnect because call_PQgetCopyData returns -1: %s", pchomp(PQerrorMessage(pageserver_conn)));
-			pageserver_disconnect();
+			neon_shard_log(shard_no, LOG, "pageserver_receive disconnect because call_PQgetCopyData returns -1: %s", pchomp(PQerrorMessage(pageserver_conn)));
+			pageserver_disconnect(shard_no);
 			resp = NULL;
 		}
 		else if (rc == -2)
 		{
 			char	   *msg = pchomp(PQerrorMessage(pageserver_conn));
 
-			pageserver_disconnect();
-			neon_log(ERROR, "pageserver_receive disconnect because could not read COPY data: %s", msg);
+			pageserver_disconnect(shard_no);
+			neon_shard_log(shard_no, ERROR, "pageserver_receive disconnect because could not read COPY data: %s", msg);
 		}
 		else
 		{
-			pageserver_disconnect();
-			neon_log(ERROR, "pageserver_receive disconnect because unexpected PQgetCopyData return value: %d", rc);
+			pageserver_disconnect(shard_no);
+			neon_shard_log(shard_no, ERROR, "pageserver_receive disconnect because unexpected PQgetCopyData return value: %d", rc);
 		}
 	}
 	PG_CATCH();
 	{
-		neon_log(LOG, "pageserver_receive disconnect due to caught exception");
-		pageserver_disconnect();
+		neon_shard_log(shard_no, LOG, "pageserver_receive disconnect due to caught exception");
+		pageserver_disconnect(shard_no);
 		PG_RE_THROW();
 	}
 	PG_END_TRY();
@@ -495,11 +644,13 @@ pageserver_receive(void)
 
 
 static bool
-pageserver_flush(void)
+pageserver_flush(shardno_t shard_no)
 {
-	if (!connected)
+	PGconn	   *pageserver_conn = page_servers[shard_no].conn;
+
+	if (!pageserver_conn)
 	{
-		neon_log(WARNING, "Tried to flush while disconnected");
+		neon_shard_log(shard_no, WARNING, "Tried to flush while disconnected");
 	}
 	else
 	{
@@ -507,8 +658,8 @@ pageserver_flush(void)
 		{
 			char	   *msg = pchomp(PQerrorMessage(pageserver_conn));
 
-			pageserver_disconnect();
-			neon_log(LOG, "pageserver_flush disconnect because failed to flush page requests: %s", msg);
+			pageserver_disconnect(shard_no);
+			neon_shard_log(shard_no, LOG, "pageserver_flush disconnect because failed to flush page requests: %s", msg);
 			pfree(msg);
 			return false;
 		}
@@ -550,6 +701,7 @@ PagestoreShmemInit(void)
 	{
 		pg_atomic_init_u64(&pagestore_shared->begin_update_counter, 0);
 		pg_atomic_init_u64(&pagestore_shared->end_update_counter, 0);
+		memset(&pagestore_shared->shard_map, 0, sizeof(ShardMap));
 		AssignPageserverConnstring(page_server_connstring, NULL);
 	}
 	LWLockRelease(AddinShmemInitLock);
@@ -624,6 +776,15 @@ pg_init_libpagestore(void)
 							   0,	/* no flags required */
 							   check_neon_id, NULL, NULL);
 
+	DefineCustomIntVariable("neon.stripe_size",
+							"sharding stripe size",
+							NULL,
+							&stripe_size,
+							32768, 1, INT_MAX,
+							PGC_SIGHUP,
+							GUC_UNIT_BLOCKS,
+							NULL, NULL, NULL);
+
 	DefineCustomIntVariable("neon.max_cluster_size",
 							"cluster size limit",
 							NULL,
diff --git a/pgxn/neon/pagestore_client.h b/pgxn/neon/pagestore_client.h
index 3fcaab0bee..8c02f357bc 100644
--- a/pgxn/neon/pagestore_client.h
+++ b/pgxn/neon/pagestore_client.h
@@ -20,9 +20,13 @@
 #include "lib/stringinfo.h"
 #include "libpq/pqformat.h"
 #include "storage/block.h"
+#include "storage/buf_internals.h"
 #include "storage/smgr.h"
 #include "utils/memutils.h"
 
+#define MAX_SHARDS 128
+#define MAX_PAGESERVER_CONNSTRING_SIZE 256
+
 typedef enum
 {
 	/* pagestore_client -> pagestore */
@@ -51,6 +55,9 @@ typedef struct
 #define neon_log(tag, fmt, ...) ereport(tag,                                  \
 										(errmsg(NEON_TAG fmt, ##__VA_ARGS__), \
 										 errhidestmt(true), errhidecontext(true), errposition(0), internalerrposition(0)))
+#define neon_shard_log(shard_no, tag, fmt, ...) ereport(tag,	\
+														(errmsg(NEON_TAG "[shard %d] " fmt, shard_no, ##__VA_ARGS__), \
+														 errhidestmt(true), errhidecontext(true), errposition(0), internalerrposition(0)))
 
 /*
  * supertype of all the Neon*Request structs below
@@ -141,11 +148,13 @@ extern char *nm_to_string(NeonMessage *msg);
  * API
  */
 
+typedef unsigned shardno_t;
+
 typedef struct
 {
-	bool		(*send) (NeonRequest *request);
-	NeonResponse *(*receive) (void);
-	bool		(*flush) (void);
+	bool		(*send) (shardno_t  shard_no, NeonRequest * request);
+	NeonResponse *(*receive) (shardno_t shard_no);
+	bool		(*flush) (shardno_t shard_no);
 } page_server_api;
 
 extern void prefetch_on_ps_disconnect(void);
@@ -159,6 +168,8 @@ extern char *neon_timeline;
 extern char *neon_tenant;
 extern int32 max_cluster_size;
 
+extern shardno_t get_shard_number(BufferTag* tag);
+
 extern const f_smgr *smgr_neon(BackendId backend, NRelFileInfo rinfo);
 extern void smgr_init_neon(void);
 extern void readahead_buffer_resize(int newsize, void *extra);
diff --git a/pgxn/neon/pagestore_smgr.c b/pgxn/neon/pagestore_smgr.c
index 0db093e5a7..1fa802e6f4 100644
--- a/pgxn/neon/pagestore_smgr.c
+++ b/pgxn/neon/pagestore_smgr.c
@@ -172,6 +172,7 @@ typedef struct PrefetchRequest
 	XLogRecPtr	actual_request_lsn;
 	NeonResponse *response;		/* may be null */
 	PrefetchStatus status;
+	shardno_t   shard_no;
 	uint64		my_ring_index;
 } PrefetchRequest;
 
@@ -239,10 +240,17 @@ typedef struct PrefetchState
 								 * also unused */
 
 	/* the buffers */
-	prfh_hash  *prf_hash;
+	prfh_hash	*prf_hash;
+	int			max_shard_no;
+	/* Mark shards involved in prefetch */
+	uint8		shard_bitmap[(MAX_SHARDS + 7)/8];
 	PrefetchRequest prf_buffer[];	/* prefetch buffers */
 } PrefetchState;
 
+#define BITMAP_ISSET(bm, bit) ((bm)[(bit) >> 3] & (1 << ((bit) & 7)))
+#define BITMAP_SET(bm, bit) (bm)[(bit) >> 3] |= (1 << ((bit) & 7))
+#define BITMAP_CLR(bm, bit) (bm)[(bit) >> 3] &= ~(1 << ((bit) & 7))
+
 static PrefetchState *MyPState;
 
 #define GetPrfSlot(ring_index) ( \
@@ -327,6 +335,7 @@ compact_prefetch_buffers(void)
 		Assert(target_slot->status == PRFS_UNUSED);
 
 		target_slot->buftag = source_slot->buftag;
+		target_slot->shard_no = source_slot->shard_no;
 		target_slot->status = source_slot->status;
 		target_slot->response = source_slot->response;
 		target_slot->effective_request_lsn = source_slot->effective_request_lsn;
@@ -494,6 +503,23 @@ prefetch_cleanup_trailing_unused(void)
 	}
 }
 
+
+static bool
+prefetch_flush_requests(void)
+{
+	for (shardno_t shard_no = 0; shard_no < MyPState->max_shard_no; shard_no++)
+	{
+		if (BITMAP_ISSET(MyPState->shard_bitmap, shard_no))
+		{
+			if (!page_server->flush(shard_no))
+				return false;
+			BITMAP_CLR(MyPState->shard_bitmap, shard_no);
+		}
+	}
+	MyPState->max_shard_no = 0;
+	return true;
+}
+
 /*
  * Wait for slot of ring_index to have received its response.
  * The caller is responsible for making sure the request buffer is flushed.
@@ -509,7 +535,7 @@ prefetch_wait_for(uint64 ring_index)
 	if (MyPState->ring_flush <= ring_index &&
 		MyPState->ring_unused > MyPState->ring_flush)
 	{
-		if (!page_server->flush())
+		if (!prefetch_flush_requests())
 			return false;
 		MyPState->ring_flush = MyPState->ring_unused;
 	}
@@ -547,7 +573,7 @@ prefetch_read(PrefetchRequest *slot)
 	Assert(slot->my_ring_index == MyPState->ring_receive);
 
 	old = MemoryContextSwitchTo(MyPState->errctx);
-	response = (NeonResponse *) page_server->receive();
+	response = (NeonResponse *) page_server->receive(slot->shard_no);
 	MemoryContextSwitchTo(old);
 	if (response)
 	{
@@ -704,12 +730,14 @@ prefetch_do_request(PrefetchRequest *slot, bool *force_latest, XLogRecPtr *force
 	Assert(slot->response == NULL);
 	Assert(slot->my_ring_index == MyPState->ring_unused);
 
-	while (!page_server->send((NeonRequest *) &request));
+	while (!page_server->send(slot->shard_no, (NeonRequest *) &request));
 
 	/* update prefetch state */
 	MyPState->n_requests_inflight += 1;
 	MyPState->n_unused -= 1;
 	MyPState->ring_unused += 1;
+	BITMAP_SET(MyPState->shard_bitmap, slot->shard_no);
+	MyPState->max_shard_no = Max(slot->shard_no+1, MyPState->max_shard_no);
 
 	/* update slot state */
 	slot->status = PRFS_REQUESTED;
@@ -880,6 +908,7 @@ Retry:
 	 * function reads the buffer tag from the slot.
 	 */
 	slot->buftag = tag;
+	slot->shard_no = get_shard_number(&tag);
 	slot->my_ring_index = ring_index;
 
 	prefetch_do_request(slot, force_latest, force_lsn);
@@ -890,7 +919,7 @@ Retry:
 	if (flush_every_n_requests > 0 &&
 		MyPState->ring_unused - MyPState->ring_flush >= flush_every_n_requests)
 	{
-		if (!page_server->flush())
+		if (!prefetch_flush_requests())
 		{
 			/*
 			 * Prefetch set is reset in case of error, so we should try to
@@ -908,13 +937,44 @@ static NeonResponse *
 page_server_request(void const *req)
 {
 	NeonResponse *resp;
+	BufferTag tag = {0};
+	shardno_t shard_no;
+
+	switch (((NeonRequest *) req)->tag)
+	{
+		case T_NeonExistsRequest:
+			CopyNRelFileInfoToBufTag(tag, ((NeonExistsRequest *) req)->rinfo);
+			break;
+		case T_NeonNblocksRequest:
+			CopyNRelFileInfoToBufTag(tag, ((NeonNblocksRequest *) req)->rinfo);
+			break;
+		case T_NeonDbSizeRequest:
+			NInfoGetDbOid(BufTagGetNRelFileInfo(tag)) = ((NeonDbSizeRequest *) req)->dbNode;
+			break;
+		case T_NeonGetPageRequest:
+			CopyNRelFileInfoToBufTag(tag, ((NeonGetPageRequest *) req)->rinfo);
+			tag.blockNum = ((NeonGetPageRequest *) req)->blkno;
+			break;
+		default:
+			neon_log(ERROR, "Unexpected request tag: %d", ((NeonRequest *) req)->tag);
+	}
+	shard_no = get_shard_number(&tag);
+
+
+	/*
+	 * Current sharding model assumes that all metadata is present only at shard 0.
+	 * We still need to call get_shard_no() to check if shard map is up-to-date.
+	 */
+	if (((NeonRequest *) req)->tag != T_NeonGetPageRequest || ((NeonGetPageRequest *) req)->forknum != MAIN_FORKNUM)
+	{
+		shard_no = 0;
+	}
 
 	do
 	{
-		while (!page_server->send((NeonRequest *) req) || !page_server->flush());
-		MyPState->ring_flush = MyPState->ring_unused;
+		while (!page_server->send(shard_no, (NeonRequest *) req) || !page_server->flush(shard_no));
 		consume_prefetch_responses();
-		resp = page_server->receive();
+		resp = page_server->receive(shard_no);
 	} while (resp == NULL);
 	return resp;
 
@@ -2098,8 +2158,8 @@ neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
 		case T_NeonErrorResponse:
 			ereport(ERROR,
 					(errcode(ERRCODE_IO_ERROR),
-					 errmsg(NEON_TAG "could not read block %u in rel %u/%u/%u.%u from page server at lsn %X/%08X",
-							blkno,
+					 errmsg(NEON_TAG "[shard %d] could not read block %u in rel %u/%u/%u.%u from page server at lsn %X/%08X",
+							slot->shard_no, blkno,
 							RelFileInfoFmt(rinfo),
 							forkNum,
 							(uint32) (request_lsn >> 32), (uint32) request_lsn),

From 8dee9908f83fdebea1dfd36304272bdbe684ad5c Mon Sep 17 00:00:00 2001
From: Joonas Koivunen <joonas@neon.tech>
Date: Thu, 25 Jan 2024 18:45:17 +0200
Subject: [PATCH 52/70] fix(compaction_task): wrong log levels (#6442)

Filter what we log on compaction task. Per discussion in last triage
call, fixing these by introducing and inspecting the root cause within
anyhow::Error instead of rolling out proper conversions.

Fixes: #6365
Fixes: #6367
---
 pageserver/src/tenant/tasks.rs        | 60 ++++++++++++++++++++++++++-
 pageserver/src/tenant/timeline.rs     | 16 ++++++-
 pageserver/src/tenant/upload_queue.rs | 35 ++++++++++++----
 3 files changed, 100 insertions(+), 11 deletions(-)

diff --git a/pageserver/src/tenant/tasks.rs b/pageserver/src/tenant/tasks.rs
index 2b2fcc7711..5f39c46a84 100644
--- a/pageserver/src/tenant/tasks.rs
+++ b/pageserver/src/tenant/tasks.rs
@@ -9,6 +9,7 @@ use crate::context::{DownloadBehavior, RequestContext};
 use crate::metrics::TENANT_TASK_EVENTS;
 use crate::task_mgr;
 use crate::task_mgr::{TaskKind, BACKGROUND_RUNTIME};
+use crate::tenant::timeline::CompactionError;
 use crate::tenant::{Tenant, TenantState};
 use tokio_util::sync::CancellationToken;
 use tracing::*;
@@ -181,8 +182,11 @@ async fn compaction_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
                     );
                     error_run_count += 1;
                     let wait_duration = Duration::from_secs_f64(wait_duration);
-                    error!(
-                        "Compaction failed {error_run_count} times, retrying in {wait_duration:?}: {e:?}",
+                    log_compaction_error(
+                        &e,
+                        error_run_count,
+                        &wait_duration,
+                        cancel.is_cancelled(),
                     );
                     wait_duration
                 } else {
@@ -210,6 +214,58 @@ async fn compaction_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
     TENANT_TASK_EVENTS.with_label_values(&["stop"]).inc();
 }
 
+fn log_compaction_error(
+    e: &CompactionError,
+    error_run_count: u32,
+    sleep_duration: &std::time::Duration,
+    task_cancelled: bool,
+) {
+    use crate::tenant::upload_queue::NotInitialized;
+    use crate::tenant::PageReconstructError;
+    use CompactionError::*;
+
+    enum LooksLike {
+        Info,
+        Error,
+    }
+
+    let decision = match e {
+        ShuttingDown => None,
+        _ if task_cancelled => Some(LooksLike::Info),
+        Other(e) => {
+            let root_cause = e.root_cause();
+
+            let is_stopping = {
+                let upload_queue = root_cause
+                    .downcast_ref::<NotInitialized>()
+                    .is_some_and(|e| e.is_stopping());
+
+                let timeline = root_cause
+                    .downcast_ref::<PageReconstructError>()
+                    .is_some_and(|e| e.is_stopping());
+
+                upload_queue || timeline
+            };
+
+            if is_stopping {
+                Some(LooksLike::Info)
+            } else {
+                Some(LooksLike::Error)
+            }
+        }
+    };
+
+    match decision {
+        Some(LooksLike::Info) => info!(
+            "Compaction failed {error_run_count} times, retrying in {sleep_duration:?}: {e:#}",
+        ),
+        Some(LooksLike::Error) => error!(
+            "Compaction failed {error_run_count} times, retrying in {sleep_duration:?}: {e:?}",
+        ),
+        None => {}
+    }
+}
+
 ///
 /// GC task's main loop
 ///
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index 666dae94e2..c21fe94d01 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -392,8 +392,7 @@ pub(crate) enum PageReconstructError {
     #[error("Ancestor LSN wait error: {0}")]
     AncestorLsnTimeout(#[from] WaitLsnError),
 
-    /// The operation was cancelled
-    #[error("Cancelled")]
+    #[error("timeline shutting down")]
     Cancelled,
 
     /// The ancestor of this is being stopped
@@ -405,6 +404,19 @@ pub(crate) enum PageReconstructError {
     WalRedo(anyhow::Error),
 }
 
+impl PageReconstructError {
+    /// Returns true if this error indicates a tenant/timeline shutdown alike situation
+    pub(crate) fn is_stopping(&self) -> bool {
+        use PageReconstructError::*;
+        match self {
+            Other(_) => false,
+            AncestorLsnTimeout(_) => false,
+            Cancelled | AncestorStopping(_) => true,
+            WalRedo(_) => false,
+        }
+    }
+}
+
 #[derive(thiserror::Error, Debug)]
 enum CreateImageLayersError {
     #[error("timeline shutting down")]
diff --git a/pageserver/src/tenant/upload_queue.rs b/pageserver/src/tenant/upload_queue.rs
index 32f14f40c5..0b61bc0a10 100644
--- a/pageserver/src/tenant/upload_queue.rs
+++ b/pageserver/src/tenant/upload_queue.rs
@@ -126,6 +126,27 @@ pub(super) struct UploadQueueStopped {
     pub(super) deleted_at: SetDeletedFlagProgress,
 }
 
+#[derive(thiserror::Error, Debug)]
+pub(crate) enum NotInitialized {
+    #[error("queue is in state Uninitialized")]
+    Uninitialized,
+    #[error("queue is in state Stopping")]
+    Stopped,
+    #[error("queue is shutting down")]
+    ShuttingDown,
+}
+
+impl NotInitialized {
+    pub(crate) fn is_stopping(&self) -> bool {
+        use NotInitialized::*;
+        match self {
+            Uninitialized => false,
+            Stopped => true,
+            ShuttingDown => true,
+        }
+    }
+}
+
 impl UploadQueue {
     pub(crate) fn initialize_empty_remote(
         &mut self,
@@ -214,17 +235,17 @@ impl UploadQueue {
     }
 
     pub(crate) fn initialized_mut(&mut self) -> anyhow::Result<&mut UploadQueueInitialized> {
+        use UploadQueue::*;
         match self {
-            UploadQueue::Uninitialized | UploadQueue::Stopped(_) => {
-                anyhow::bail!("queue is in state {}", self.as_str())
-            }
-            UploadQueue::Initialized(x) => {
-                if !x.shutting_down {
-                    Ok(x)
+            Uninitialized => Err(NotInitialized::Uninitialized.into()),
+            Initialized(x) => {
+                if x.shutting_down {
+                    Err(NotInitialized::ShuttingDown.into())
                 } else {
-                    anyhow::bail!("queue is shutting down")
+                    Ok(x)
                 }
             }
+            Stopped(_) => Err(NotInitialized::Stopped.into()),
         }
     }
 

From d52b81340f77f37a2cad3237cfb553d5257d634e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Arpad=20M=C3=BCller?= <arpad-m@users.noreply.github.com>
Date: Thu, 25 Jan 2024 18:23:18 +0100
Subject: [PATCH 53/70] S3 based recovery (#6155)

Adds a new `time_travel_recover` function to the `RemoteStorage` trait
that allows time travel like functionality for S3 buckets, regardless of
their content (it is not even pageserver related). It takes a different
approach from [this
post](https://aws.amazon.com/blogs/storage/point-in-time-restore-for-amazon-s3-buckets/)
that is more complicated.

It takes as input a prefix a target timestamp, and a limit timestamp:

* executes [`ListObjectVersions`](https://docs.aws.amazon.com/AmazonS3/latest/API/API_ListObjectVersions.html)
* obtains the latest version that comes before the target timestamp
* copies that latest version to the same prefix
* if there is versions newer than the limit timestamp, it doesn't do
anything for the file

The limit timestamp is meant to be some timestamp before the start of
the recovery operation and after any changes that one wants to revert.
For example, it might be the time point after a tenant was detached from
all involved pageservers. The limiting mechanism ensures that the
operation is idempotent and can be retried without causing additional
writes/copies.

The approach fulfills all the requirements laid out in 8233, and is a
recoverable operation. Nothing is deleted permanently, only new entries
added to the version log.

I also enable [nextest retries](https://nexte.st/book/retries.html) to
help with some general S3 flakiness (on top of low level retries).

Part of https://github.com/neondatabase/cloud/issues/8233
---
 .github/workflows/build_and_test.yml         |   1 +
 libs/remote_storage/src/azure_blob.rs        |  16 +
 libs/remote_storage/src/lib.rs               |  38 +++
 libs/remote_storage/src/local_fs.rs          |  15 +-
 libs/remote_storage/src/s3_bucket.rs         | 297 +++++++++++++++----
 libs/remote_storage/src/s3_bucket/metrics.rs |   8 +-
 libs/remote_storage/src/simulate_failures.rs |  16 +
 libs/remote_storage/tests/test_real_s3.rs    | 123 +++++++-
 8 files changed, 452 insertions(+), 62 deletions(-)

diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml
index 2b88f09b3d..fb88d4da96 100644
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -21,6 +21,7 @@ env:
   COPT: '-Werror'
   AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }}
   AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }}
+  NEXTEST_RETRIES: 3
 
 jobs:
   check-permissions:
diff --git a/libs/remote_storage/src/azure_blob.rs b/libs/remote_storage/src/azure_blob.rs
index e3edfd08a5..abab32470b 100644
--- a/libs/remote_storage/src/azure_blob.rs
+++ b/libs/remote_storage/src/azure_blob.rs
@@ -8,6 +8,7 @@ use std::pin::Pin;
 use std::str::FromStr;
 use std::sync::Arc;
 use std::time::Duration;
+use std::time::SystemTime;
 
 use super::REMOTE_STORAGE_PREFIX_SEPARATOR;
 use anyhow::Result;
@@ -23,6 +24,7 @@ use futures::stream::Stream;
 use futures_util::StreamExt;
 use http_types::{StatusCode, Url};
 use tokio::time::Instant;
+use tokio_util::sync::CancellationToken;
 use tracing::debug;
 
 use crate::s3_bucket::RequestKind;
@@ -370,6 +372,20 @@ impl RemoteStorage for AzureBlobStorage {
             copy_status = status;
         }
     }
+
+    async fn time_travel_recover(
+        &self,
+        _prefix: Option<&RemotePath>,
+        _timestamp: SystemTime,
+        _done_if_after: SystemTime,
+        _cancel: CancellationToken,
+    ) -> anyhow::Result<()> {
+        // TODO use Azure point in time recovery feature for this
+        // https://learn.microsoft.com/en-us/azure/storage/blobs/point-in-time-restore-overview
+        Err(anyhow::anyhow!(
+            "time travel recovery for azure blob storage is not implemented"
+        ))
+    }
 }
 
 pin_project_lite::pin_project! {
diff --git a/libs/remote_storage/src/lib.rs b/libs/remote_storage/src/lib.rs
index f247fcdc38..bf9c51ad1a 100644
--- a/libs/remote_storage/src/lib.rs
+++ b/libs/remote_storage/src/lib.rs
@@ -25,6 +25,7 @@ use bytes::Bytes;
 use futures::stream::Stream;
 use serde::{Deserialize, Serialize};
 use tokio::sync::Semaphore;
+use tokio_util::sync::CancellationToken;
 use toml_edit::Item;
 use tracing::info;
 
@@ -210,6 +211,15 @@ pub trait RemoteStorage: Send + Sync + 'static {
 
     /// Copy a remote object inside a bucket from one path to another.
     async fn copy(&self, from: &RemotePath, to: &RemotePath) -> anyhow::Result<()>;
+
+    /// Resets the content of everything with the given prefix to the given state
+    async fn time_travel_recover(
+        &self,
+        prefix: Option<&RemotePath>,
+        timestamp: SystemTime,
+        done_if_after: SystemTime,
+        cancel: CancellationToken,
+    ) -> anyhow::Result<()>;
 }
 
 pub type DownloadStream = Pin<Box<dyn Stream<Item = std::io::Result<Bytes>> + Unpin + Send + Sync>>;
@@ -387,6 +397,33 @@ impl<Other: RemoteStorage> GenericRemoteStorage<Arc<Other>> {
             Self::Unreliable(s) => s.copy(from, to).await,
         }
     }
+
+    pub async fn time_travel_recover(
+        &self,
+        prefix: Option<&RemotePath>,
+        timestamp: SystemTime,
+        done_if_after: SystemTime,
+        cancel: CancellationToken,
+    ) -> anyhow::Result<()> {
+        match self {
+            Self::LocalFs(s) => {
+                s.time_travel_recover(prefix, timestamp, done_if_after, cancel)
+                    .await
+            }
+            Self::AwsS3(s) => {
+                s.time_travel_recover(prefix, timestamp, done_if_after, cancel)
+                    .await
+            }
+            Self::AzureBlob(s) => {
+                s.time_travel_recover(prefix, timestamp, done_if_after, cancel)
+                    .await
+            }
+            Self::Unreliable(s) => {
+                s.time_travel_recover(prefix, timestamp, done_if_after, cancel)
+                    .await
+            }
+        }
+    }
 }
 
 impl GenericRemoteStorage {
@@ -674,6 +711,7 @@ impl ConcurrencyLimiter {
             RequestKind::List => &self.read,
             RequestKind::Delete => &self.write,
             RequestKind::Copy => &self.write,
+            RequestKind::TimeTravel => &self.write,
         }
     }
 
diff --git a/libs/remote_storage/src/local_fs.rs b/libs/remote_storage/src/local_fs.rs
index 5f5bc1a9fa..34a6658a69 100644
--- a/libs/remote_storage/src/local_fs.rs
+++ b/libs/remote_storage/src/local_fs.rs
@@ -4,7 +4,7 @@
 //! This storage used in tests, but can also be used in cases when a certain persistent
 //! volume is mounted to the local FS.
 
-use std::{borrow::Cow, future::Future, io::ErrorKind, pin::Pin};
+use std::{borrow::Cow, future::Future, io::ErrorKind, pin::Pin, time::SystemTime};
 
 use anyhow::{bail, ensure, Context};
 use bytes::Bytes;
@@ -14,7 +14,7 @@ use tokio::{
     fs,
     io::{self, AsyncReadExt, AsyncSeekExt, AsyncWriteExt},
 };
-use tokio_util::io::ReaderStream;
+use tokio_util::{io::ReaderStream, sync::CancellationToken};
 use tracing::*;
 use utils::{crashsafe::path_with_suffix_extension, fs_ext::is_directory_empty};
 
@@ -422,6 +422,17 @@ impl RemoteStorage for LocalFs {
         })?;
         Ok(())
     }
+
+    #[allow(clippy::diverging_sub_expression)]
+    async fn time_travel_recover(
+        &self,
+        _prefix: Option<&RemotePath>,
+        _timestamp: SystemTime,
+        _done_if_after: SystemTime,
+        _cancel: CancellationToken,
+    ) -> anyhow::Result<()> {
+        unimplemented!()
+    }
 }
 
 fn storage_metadata_path(original_path: &Utf8Path) -> Utf8PathBuf {
diff --git a/libs/remote_storage/src/s3_bucket.rs b/libs/remote_storage/src/s3_bucket.rs
index e72cf28228..4909b8522b 100644
--- a/libs/remote_storage/src/s3_bucket.rs
+++ b/libs/remote_storage/src/s3_bucket.rs
@@ -6,12 +6,14 @@
 
 use std::{
     borrow::Cow,
+    collections::HashMap,
     pin::Pin,
     sync::Arc,
     task::{Context, Poll},
+    time::SystemTime,
 };
 
-use anyhow::Context as _;
+use anyhow::{anyhow, Context as _};
 use aws_config::{
     environment::credentials::EnvironmentVariableCredentialsProvider,
     imds::credentials::ImdsCredentialsProvider,
@@ -27,17 +29,19 @@ use aws_sdk_s3::{
     config::{AsyncSleep, Builder, IdentityCache, Region, SharedAsyncSleep},
     error::SdkError,
     operation::get_object::GetObjectError,
-    types::{Delete, ObjectIdentifier},
+    types::{Delete, DeleteMarkerEntry, ObjectIdentifier, ObjectVersion},
     Client,
 };
 use aws_smithy_async::rt::sleep::TokioSleep;
 
-use aws_smithy_types::body::SdkBody;
 use aws_smithy_types::byte_stream::ByteStream;
+use aws_smithy_types::{body::SdkBody, DateTime};
 use bytes::Bytes;
 use futures::stream::Stream;
 use hyper::Body;
 use scopeguard::ScopeGuard;
+use tokio_util::sync::CancellationToken;
+use utils::backoff;
 
 use super::StorageMetadata;
 use crate::{
@@ -270,6 +274,59 @@ impl S3Bucket {
             }
         }
     }
+
+    async fn delete_oids(
+        &self,
+        kind: RequestKind,
+        delete_objects: &[ObjectIdentifier],
+    ) -> anyhow::Result<()> {
+        for chunk in delete_objects.chunks(MAX_KEYS_PER_DELETE) {
+            let started_at = start_measuring_requests(kind);
+
+            let resp = self
+                .client
+                .delete_objects()
+                .bucket(self.bucket_name.clone())
+                .delete(
+                    Delete::builder()
+                        .set_objects(Some(chunk.to_vec()))
+                        .build()?,
+                )
+                .send()
+                .await;
+
+            let started_at = ScopeGuard::into_inner(started_at);
+            metrics::BUCKET_METRICS
+                .req_seconds
+                .observe_elapsed(kind, &resp, started_at);
+
+            let resp = resp?;
+            metrics::BUCKET_METRICS
+                .deleted_objects_total
+                .inc_by(chunk.len() as u64);
+            if let Some(errors) = resp.errors {
+                // Log a bounded number of the errors within the response:
+                // these requests can carry 1000 keys so logging each one
+                // would be too verbose, especially as errors may lead us
+                // to retry repeatedly.
+                const LOG_UP_TO_N_ERRORS: usize = 10;
+                for e in errors.iter().take(LOG_UP_TO_N_ERRORS) {
+                    tracing::warn!(
+                        "DeleteObjects key {} failed: {}: {}",
+                        e.key.as_ref().map(Cow::from).unwrap_or("".into()),
+                        e.code.as_ref().map(Cow::from).unwrap_or("".into()),
+                        e.message.as_ref().map(Cow::from).unwrap_or("".into())
+                    );
+                }
+
+                return Err(anyhow::format_err!(
+                    "Failed to delete {} objects",
+                    errors.len()
+                ));
+            }
+        }
+        Ok(())
+    }
 }
 
 pin_project_lite::pin_project! {
@@ -568,64 +625,168 @@ impl RemoteStorage for S3Bucket {
             delete_objects.push(obj_id);
         }
 
-        for chunk in delete_objects.chunks(MAX_KEYS_PER_DELETE) {
-            let started_at = start_measuring_requests(kind);
-
-            let resp = self
-                .client
-                .delete_objects()
-                .bucket(self.bucket_name.clone())
-                .delete(
-                    Delete::builder()
-                        .set_objects(Some(chunk.to_vec()))
-                        .build()?,
-                )
-                .send()
-                .await;
-
-            let started_at = ScopeGuard::into_inner(started_at);
-            metrics::BUCKET_METRICS
-                .req_seconds
-                .observe_elapsed(kind, &resp, started_at);
-
-            match resp {
-                Ok(resp) => {
-                    metrics::BUCKET_METRICS
-                        .deleted_objects_total
-                        .inc_by(chunk.len() as u64);
-                    if let Some(errors) = resp.errors {
-                        // Log a bounded number of the errors within the response:
-                        // these requests can carry 1000 keys so logging each one
-                        // would be too verbose, especially as errors may lead us
-                        // to retry repeatedly.
-                        const LOG_UP_TO_N_ERRORS: usize = 10;
-                        for e in errors.iter().take(LOG_UP_TO_N_ERRORS) {
-                            tracing::warn!(
-                                "DeleteObjects key {} failed: {}: {}",
-                                e.key.as_ref().map(Cow::from).unwrap_or("".into()),
-                                e.code.as_ref().map(Cow::from).unwrap_or("".into()),
-                                e.message.as_ref().map(Cow::from).unwrap_or("".into())
-                            );
-                        }
-
-                        return Err(anyhow::format_err!(
-                            "Failed to delete {} objects",
-                            errors.len()
-                        ));
-                    }
-                }
-                Err(e) => {
-                    return Err(e.into());
-                }
-            }
-        }
-        Ok(())
+        self.delete_oids(kind, &delete_objects).await
     }
 
     async fn delete(&self, path: &RemotePath) -> anyhow::Result<()> {
         let paths = std::array::from_ref(path);
         self.delete_objects(paths).await
     }
+
+    async fn time_travel_recover(
+        &self,
+        prefix: Option<&RemotePath>,
+        timestamp: SystemTime,
+        done_if_after: SystemTime,
+        cancel: CancellationToken,
+    ) -> anyhow::Result<()> {
+        let kind = RequestKind::TimeTravel;
+        let _guard = self.permit(kind).await;
+
+        let timestamp = DateTime::from(timestamp);
+        let done_if_after = DateTime::from(done_if_after);
+
+        tracing::trace!("Target time: {timestamp:?}, done_if_after {done_if_after:?}");
+
+        // get the passed prefix or if it is not set use prefix_in_bucket value
+        let prefix = prefix
+            .map(|p| self.relative_path_to_s3_object(p))
+            .or_else(|| self.prefix_in_bucket.clone());
+
+        let warn_threshold = 3;
+        let max_retries = 10;
+        let is_permanent = |_e: &_| false;
+
+        let list = backoff::retry(
+            || async {
+                Ok(self
+                    .client
+                    .list_object_versions()
+                    .bucket(self.bucket_name.clone())
+                    .set_prefix(prefix.clone())
+                    .send()
+                    .await?)
+            },
+            is_permanent,
+            warn_threshold,
+            max_retries,
+            "listing object versions for time_travel_recover",
+            backoff::Cancel::new(cancel.clone(), || anyhow!("Cancelled")),
+        )
+        .await?;
+
+        if list.is_truncated().unwrap_or_default() {
+            anyhow::bail!("Received truncated ListObjectVersions response for prefix={prefix:?}");
+        }
+
+        let mut versions_deletes = list
+            .versions()
+            .iter()
+            .map(VerOrDelete::Version)
+            .chain(list.delete_markers().iter().map(VerOrDelete::DeleteMarker))
+            .collect::<Vec<_>>();
+
+        versions_deletes.sort_by_key(|vd| (vd.key(), vd.last_modified()));
+
+        let mut vds_for_key = HashMap::<_, Vec<_>>::new();
+
+        for vd in versions_deletes {
+            let last_modified = vd.last_modified();
+            let version_id = vd.version_id();
+            let key = vd.key();
+            let (Some(last_modified), Some(version_id), Some(key)) =
+                (last_modified, version_id, key)
+            else {
+                anyhow::bail!(
+                    "One (or more) of last_modified, key, and id is None. \
+                    Is versioning enabled in the bucket? last_modified={:?} key={:?} version_id={:?}",
+                    last_modified, key, version_id,
+                );
+            };
+            if version_id == "null" {
+                anyhow::bail!("Received ListVersions response for key={key} with version_id='null', \
+                    indicating either disabled versioning, or legacy objects with null version id values");
+            }
+            tracing::trace!(
+                "Parsing version key={key} version_id={version_id} is_delete={}",
+                matches!(vd, VerOrDelete::DeleteMarker(_))
+            );
+
+            vds_for_key
+                .entry(key)
+                .or_default()
+                .push((vd, last_modified, version_id));
+        }
+        for (key, versions) in vds_for_key {
+            let (last_vd, last_last_modified, _version_id) = versions.last().unwrap();
+            if last_last_modified > &&done_if_after {
+                tracing::trace!("Key {key} has version later than done_if_after, skipping");
+                continue;
+            }
+            // the version we want to restore to.
+            let version_to_restore_to =
+                match versions.binary_search_by_key(&timestamp, |tpl| *tpl.1) {
+                    Ok(v) => v,
+                    Err(e) => e,
+                };
+            if version_to_restore_to == versions.len() {
+                tracing::trace!("Key {key} has no changes since timestamp, skipping");
+                continue;
+            }
+            let mut do_delete = false;
+            if version_to_restore_to == 0 {
+                // All versions more recent, so the key didn't exist at the specified time point.
+                tracing::trace!(
+                    "All {} versions more recent for {key}, deleting",
+                    versions.len()
+                );
+                do_delete = true;
+            } else {
+                match &versions[version_to_restore_to - 1] {
+                    (VerOrDelete::Version(_), _last_modified, version_id) => {
+                        tracing::trace!("Copying old version {version_id} for {key}...");
+                        // Restore the state to the last version by copying
+                        let source_id =
+                            format!("{}/{key}?versionId={version_id}", self.bucket_name);
+
+                        backoff::retry(
+                            || async {
+                                Ok(self
+                                    .client
+                                    .copy_object()
+                                    .bucket(self.bucket_name.clone())
+                                    .key(key)
+                                    .copy_source(&source_id)
+                                    .send()
+                                    .await?)
+                            },
+                            is_permanent,
+                            warn_threshold,
+                            max_retries,
+                            "listing object versions for time_travel_recover",
+                            backoff::Cancel::new(cancel.clone(), || anyhow!("Cancelled")),
+                        )
+                        .await?;
+                    }
+                    (VerOrDelete::DeleteMarker(_), _last_modified, _version_id) => {
+                        do_delete = true;
+                    }
+                }
+            };
+            if do_delete {
+                if matches!(last_vd, VerOrDelete::DeleteMarker(_)) {
+                    // Key has since been deleted (but there was some history), no need to do anything
+                    tracing::trace!("Key {key} already deleted, skipping.");
+                } else {
+                    tracing::trace!("Deleting {key}...");
+
+                    let oid = ObjectIdentifier::builder().key(key.to_owned()).build()?;
+                    self.delete_oids(kind, &[oid]).await?;
+                }
+            }
+        }
+        Ok(())
+    }
 }
 
 /// On drop (cancellation) count towards [`metrics::BucketMetrics::cancelled_waits`].
@@ -650,6 +811,32 @@ fn start_measuring_requests(
     })
 }
 
+enum VerOrDelete<'a> {
+    Version(&'a ObjectVersion),
+    DeleteMarker(&'a DeleteMarkerEntry),
+}
+
+impl<'a> VerOrDelete<'a> {
+    fn last_modified(&self) -> Option<&'a DateTime> {
+        match self {
+            VerOrDelete::Version(v) => v.last_modified(),
+            VerOrDelete::DeleteMarker(v) => v.last_modified(),
+        }
+    }
+    fn version_id(&self) -> Option<&'a str> {
+        match self {
+            VerOrDelete::Version(v) => v.version_id(),
+            VerOrDelete::DeleteMarker(v) => v.version_id(),
+        }
+    }
+    fn key(&self) -> Option<&'a str> {
+        match self {
+            VerOrDelete::Version(v) => v.key(),
+            VerOrDelete::DeleteMarker(v) => v.key(),
+        }
+    }
+}
+
 #[cfg(test)]
 mod tests {
     use camino::Utf8Path;
diff --git a/libs/remote_storage/src/s3_bucket/metrics.rs b/libs/remote_storage/src/s3_bucket/metrics.rs
index 21dde14906..beca755920 100644
--- a/libs/remote_storage/src/s3_bucket/metrics.rs
+++ b/libs/remote_storage/src/s3_bucket/metrics.rs
@@ -12,6 +12,7 @@ pub(crate) enum RequestKind {
     Delete = 2,
     List = 3,
     Copy = 4,
+    TimeTravel = 5,
 }
 
 use RequestKind::*;
@@ -24,6 +25,7 @@ impl RequestKind {
             Delete => "delete_object",
             List => "list_objects",
             Copy => "copy_object",
+            TimeTravel => "time_travel_recover",
         }
     }
     const fn as_index(&self) -> usize {
@@ -31,7 +33,7 @@ impl RequestKind {
     }
 }
 
-pub(super) struct RequestTyped<C>([C; 5]);
+pub(super) struct RequestTyped<C>([C; 6]);
 
 impl<C> RequestTyped<C> {
     pub(super) fn get(&self, kind: RequestKind) -> &C {
@@ -40,8 +42,8 @@ impl<C> RequestTyped<C> {
 
     fn build_with(mut f: impl FnMut(RequestKind) -> C) -> Self {
         use RequestKind::*;
-        let mut it = [Get, Put, Delete, List, Copy].into_iter();
-        let arr = std::array::from_fn::<C, 5, _>(|index| {
+        let mut it = [Get, Put, Delete, List, Copy, TimeTravel].into_iter();
+        let arr = std::array::from_fn::<C, 6, _>(|index| {
             let next = it.next().unwrap();
             assert_eq!(index, next.as_index());
             f(next)
diff --git a/libs/remote_storage/src/simulate_failures.rs b/libs/remote_storage/src/simulate_failures.rs
index 90d90163a7..fc4c4b315b 100644
--- a/libs/remote_storage/src/simulate_failures.rs
+++ b/libs/remote_storage/src/simulate_failures.rs
@@ -5,7 +5,9 @@ use bytes::Bytes;
 use futures::stream::Stream;
 use std::collections::HashMap;
 use std::sync::Mutex;
+use std::time::SystemTime;
 use std::{collections::hash_map::Entry, sync::Arc};
+use tokio_util::sync::CancellationToken;
 
 use crate::{
     Download, DownloadError, GenericRemoteStorage, Listing, ListingMode, RemotePath, RemoteStorage,
@@ -30,6 +32,7 @@ enum RemoteOp {
     Download(RemotePath),
     Delete(RemotePath),
     DeleteObjects(Vec<RemotePath>),
+    TimeTravelRecover(Option<RemotePath>),
 }
 
 impl UnreliableWrapper {
@@ -181,4 +184,17 @@ impl RemoteStorage for UnreliableWrapper {
         self.attempt(RemoteOp::Upload(to.clone()))?;
         self.inner.copy_object(from, to).await
     }
+
+    async fn time_travel_recover(
+        &self,
+        prefix: Option<&RemotePath>,
+        timestamp: SystemTime,
+        done_if_after: SystemTime,
+        cancel: CancellationToken,
+    ) -> anyhow::Result<()> {
+        self.attempt(RemoteOp::TimeTravelRecover(prefix.map(|p| p.to_owned())))?;
+        self.inner
+            .time_travel_recover(prefix, timestamp, done_if_after, cancel)
+            .await
+    }
 }
diff --git a/libs/remote_storage/tests/test_real_s3.rs b/libs/remote_storage/tests/test_real_s3.rs
index 4a999d115e..9e1b989e4d 100644
--- a/libs/remote_storage/tests/test_real_s3.rs
+++ b/libs/remote_storage/tests/test_real_s3.rs
@@ -1,15 +1,19 @@
-use std::collections::HashSet;
 use std::env;
 use std::num::NonZeroUsize;
 use std::ops::ControlFlow;
 use std::sync::Arc;
-use std::time::UNIX_EPOCH;
+use std::time::{Duration, UNIX_EPOCH};
+use std::{collections::HashSet, time::SystemTime};
 
+use crate::common::{download_to_vec, upload_stream};
 use anyhow::Context;
+use camino::Utf8Path;
 use remote_storage::{
     GenericRemoteStorage, RemotePath, RemoteStorageConfig, RemoteStorageKind, S3Config,
 };
+use test_context::test_context;
 use test_context::AsyncTestContext;
+use tokio_util::sync::CancellationToken;
 use tracing::info;
 
 mod common;
@@ -23,6 +27,121 @@ const ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME: &str = "ENABLE_REAL_S3_REMOTE_
 
 const BASE_PREFIX: &str = "test";
 
+#[test_context(MaybeEnabledStorage)]
+#[tokio::test]
+async fn s3_time_travel_recovery_works(ctx: &mut MaybeEnabledStorage) -> anyhow::Result<()> {
+    let ctx = match ctx {
+        MaybeEnabledStorage::Enabled(ctx) => ctx,
+        MaybeEnabledStorage::Disabled => return Ok(()),
+    };
+    // Our test depends on discrepancies in the clock between S3 and the environment the tests
+    // run in. Therefore, wait a little bit before and after. The alternative would be
+    // to take the time from S3 response headers.
+    const WAIT_TIME: Duration = Duration::from_millis(3_000);
+
+    async fn time_point() -> SystemTime {
+        tokio::time::sleep(WAIT_TIME).await;
+        let ret = SystemTime::now();
+        tokio::time::sleep(WAIT_TIME).await;
+        ret
+    }
+
+    async fn list_files(client: &Arc<GenericRemoteStorage>) -> anyhow::Result<HashSet<RemotePath>> {
+        Ok(client
+            .list_files(None)
+            .await
+            .context("list root files failure")?
+            .into_iter()
+            .collect::<HashSet<_>>())
+    }
+
+    let path1 = RemotePath::new(Utf8Path::new(format!("{}/path1", ctx.base_prefix).as_str()))
+        .with_context(|| "RemotePath conversion")?;
+
+    let path2 = RemotePath::new(Utf8Path::new(format!("{}/path2", ctx.base_prefix).as_str()))
+        .with_context(|| "RemotePath conversion")?;
+
+    let path3 = RemotePath::new(Utf8Path::new(format!("{}/path3", ctx.base_prefix).as_str()))
+        .with_context(|| "RemotePath conversion")?;
+
+    let (data, len) = upload_stream("remote blob data1".as_bytes().into());
+    ctx.client.upload(data, len, &path1, None).await?;
+
+    let t0_files = list_files(&ctx.client).await?;
+    let t0 = time_point().await;
+    println!("at t0: {t0_files:?}");
+
+    let old_data = "remote blob data2";
+    let (data, len) = upload_stream(old_data.as_bytes().into());
+    ctx.client.upload(data, len, &path2, None).await?;
+
+    let t1_files = list_files(&ctx.client).await?;
+    let t1 = time_point().await;
+    println!("at t1: {t1_files:?}");
+
+    // A little check to ensure that our clock is not too far off from the S3 clock
+    {
+        let dl = ctx.client.download(&path2).await?;
+        let last_modified = dl.last_modified.unwrap();
+        let half_wt = WAIT_TIME.mul_f32(0.5);
+        let t0_hwt = t0 + half_wt;
+        let t1_hwt = t1 - half_wt;
+        if !(t0_hwt..=t1_hwt).contains(&last_modified) {
+            panic!("last_modified={last_modified:?} is not between t0_hwt={t0_hwt:?} and t1_hwt={t1_hwt:?}. \
+                This likely means a large lock discrepancy between S3 and the local clock.");
+        }
+    }
+
+    let (data, len) = upload_stream("remote blob data3".as_bytes().into());
+    ctx.client.upload(data, len, &path3, None).await?;
+
+    let new_data = "new remote blob data2";
+    let (data, len) = upload_stream(new_data.as_bytes().into());
+    ctx.client.upload(data, len, &path2, None).await?;
+
+    ctx.client.delete(&path1).await?;
+
+    let t2_files = list_files(&ctx.client).await?;
+    let t2 = time_point().await;
+    println!("at t2: {t2_files:?}");
+
+    // No changes after recovery to t2 (no-op)
+    let t_final = time_point().await;
+    ctx.client
+        .time_travel_recover(None, t2, t_final, CancellationToken::new())
+        .await?;
+    let t2_files_recovered = list_files(&ctx.client).await?;
+    println!("after recovery to t2: {t2_files_recovered:?}");
+    assert_eq!(t2_files, t2_files_recovered);
+    let path2_recovered_t2 = download_to_vec(ctx.client.download(&path2).await?).await?;
+    assert_eq!(path2_recovered_t2, new_data.as_bytes());
+
+    // after recovery to t1: path1 is back, path2 has the old content
+    let t_final = time_point().await;
+    ctx.client
+        .time_travel_recover(None, t1, t_final, CancellationToken::new())
+        .await?;
+    let t1_files_recovered = list_files(&ctx.client).await?;
+    println!("after recovery to t1: {t1_files_recovered:?}");
+    assert_eq!(t1_files, t1_files_recovered);
+    let path2_recovered_t1 = download_to_vec(ctx.client.download(&path2).await?).await?;
+    assert_eq!(path2_recovered_t1, old_data.as_bytes());
+
+    // after recovery to t0: everything is gone except for path1
+    let t_final = time_point().await;
+    ctx.client
+        .time_travel_recover(None, t0, t_final, CancellationToken::new())
+        .await?;
+    let t0_files_recovered = list_files(&ctx.client).await?;
+    println!("after recovery to t0: {t0_files_recovered:?}");
+    assert_eq!(t0_files, t0_files_recovered);
+
+    // cleanup
+    ctx.client.delete_objects(&[path1, path2, path3]).await?;
+
+    Ok(())
+}
+
 struct EnabledS3 {
     client: Arc<GenericRemoteStorage>,
     base_prefix: &'static str,

From fd4cce9417484a9cede46a43e14cffb2c32748be Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Thu, 25 Jan 2024 19:17:53 +0100
Subject: [PATCH 54/70] test_pageserver_max_throughput_getpage_at_latest_lsn:
 remove n_tenants=100 combination (#6477)

Need to fix the neon_local timeouts first
(https://github.com/neondatabase/neon/issues/6473)
and also not run them on every merge, but only nightly:
https://github.com/neondatabase/neon/issues/6476
---
 .../test_pageserver_max_throughput_getpage_at_latest_lsn.py     | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py b/test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py
index f1f5511453..1ed7e577b9 100644
--- a/test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py
+++ b/test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py
@@ -29,7 +29,7 @@ from performance.pageserver.util import ensure_pageserver_ready_for_benchmarking
 # 46G     /instance_store/test_output/shared-snapshots/max_throughput_latest_lsn-100-6
 @pytest.mark.parametrize("duration", [30])
 @pytest.mark.parametrize("pgbench_scale", [get_scale_for_db(s) for s in [100, 200]])
-@pytest.mark.parametrize("n_tenants", [1, 10, 100])
+@pytest.mark.parametrize("n_tenants", [1, 10])
 @pytest.mark.timeout(
     10000
 )  # TODO: this value is just "a really high number"; have this per instance type

From 689ad72e92955c8e7f2c3e640b0ae6299fbb9276 Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Thu, 25 Jan 2024 19:20:02 +0100
Subject: [PATCH 55/70] fix(neon_local): leaks child process if it fails to
 start & pass checks (#6474)

refs https://github.com/neondatabase/neon/issues/6473

Before this PR, if process_started() didn't return Ok(true) until we
ran out of retries, we'd return an error but leave the process running.

Try it by adding a 20s sleep to the pageserver `main()`, e.g., right
before we claim the pidfile.

Without this PR, output looks like so:

```
(.venv) cs@devvm-mbp:[~/src/neon-work-2]: ./target/debug/neon_local start
Starting neon broker at 127.0.0.1:50051.
storage_broker started, pid: 2710939
.
attachment_service started, pid: 2710949
Starting pageserver node 1 at '127.0.0.1:64000' in ".neon/pageserver_1".....
pageserver has not started yet, continuing to wait.....
pageserver 1 start failed: pageserver did not start in 10 seconds
No process is holding the pidfile. The process must have already exited. Leave in place to avoid race conditions: ".neon/pageserver_1/pageserver.pid"
No process is holding the pidfile. The process must have already exited. Leave in place to avoid race conditions: ".neon/safekeepers/sk1/safekeeper.pid"
Stopping storage_broker with pid 2710939 immediately.......
storage_broker has not stopped yet, continuing to wait.....
neon broker stop failed: storage_broker with pid 2710939 did not stop in 10 seconds
Stopping attachment_service with pid 2710949 immediately.......
attachment_service has not stopped yet, continuing to wait.....
attachment service stop failed: attachment_service with pid 2710949 did not stop in 10 seconds
```

and we leak the pageserver process

```
(.venv) cs@devvm-mbp:[~/src/neon-work-2]: ps aux | grep pageserver
cs       2710959  0.0  0.2 2377960 47616 pts/4   Sl   14:36   0:00 /home/cs/src/neon-work-2/target/debug/pageserver -D .neon/pageserver_1 -c id=1 -c pg_distrib_dir='/home/cs/src/neon-work-2/pg_install' -c http_auth_type='Trust' -c pg_auth_type='Trust' -c listen_http_addr='127.0.0.1:9898' -c listen_pg_addr='127.0.0.1:64000' -c broker_endpoint='http://127.0.0.1:50051/' -c control_plane_api='http://127.0.0.1:1234/' -c remote_storage={local_path='../local_fs_remote_storage/pageserver'}
```

After this PR, there is no leaked process.
---
 Cargo.lock                              |  1 +
 control_plane/Cargo.toml                |  1 +
 control_plane/src/attachment_service.rs |  5 ++--
 control_plane/src/background_process.rs | 33 +++++++++++++++++--------
 control_plane/src/pageserver.rs         |  6 ++---
 control_plane/src/safekeeper.rs         |  3 +--
 6 files changed, 32 insertions(+), 17 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index f2f31192f0..f0e8b6a0ed 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1342,6 +1342,7 @@ dependencies = [
  "regex",
  "reqwest",
  "safekeeper_api",
+ "scopeguard",
  "serde",
  "serde_json",
  "serde_with",
diff --git a/control_plane/Cargo.toml b/control_plane/Cargo.toml
index 898ad05add..75e5dcb7f8 100644
--- a/control_plane/Cargo.toml
+++ b/control_plane/Cargo.toml
@@ -19,6 +19,7 @@ hex.workspace = true
 hyper.workspace = true
 regex.workspace = true
 reqwest = { workspace = true, features = ["blocking", "json"] }
+scopeguard.workspace = true
 serde.workspace = true
 serde_json.workspace = true
 serde_with.workspace = true
diff --git a/control_plane/src/attachment_service.rs b/control_plane/src/attachment_service.rs
index 0a353d8b12..2d43c46270 100644
--- a/control_plane/src/attachment_service.rs
+++ b/control_plane/src/attachment_service.rs
@@ -9,7 +9,7 @@ use pageserver_client::mgmt_api::ResponseErrorMessageExt;
 use postgres_backend::AuthType;
 use postgres_connection::parse_host_port;
 use serde::{de::DeserializeOwned, Deserialize, Serialize};
-use std::{path::PathBuf, process::Child, str::FromStr};
+use std::{path::PathBuf, str::FromStr};
 use tracing::instrument;
 use utils::{
     auth::{Claims, Scope},
@@ -220,7 +220,7 @@ impl AttachmentService {
             .expect("non-Unicode path")
     }
 
-    pub async fn start(&self) -> anyhow::Result<Child> {
+    pub async fn start(&self) -> anyhow::Result<()> {
         let path_str = self.path.to_string_lossy();
 
         let mut args = vec!["-l", &self.listen, "-p", &path_str]
@@ -254,6 +254,7 @@ impl AttachmentService {
         )
         .await;
 
+        // TODO: shouldn't we bail if we fail to spawn the process?
         for ps_conf in &self.env.pageservers {
             let (pg_host, pg_port) =
                 parse_host_port(&ps_conf.listen_pg_addr).expect("Unable to parse listen_pg_addr");
diff --git a/control_plane/src/background_process.rs b/control_plane/src/background_process.rs
index 20fa3af9b8..3ffb8734d0 100644
--- a/control_plane/src/background_process.rs
+++ b/control_plane/src/background_process.rs
@@ -17,7 +17,7 @@ use std::io::Write;
 use std::os::unix::prelude::AsRawFd;
 use std::os::unix::process::CommandExt;
 use std::path::Path;
-use std::process::{Child, Command};
+use std::process::Command;
 use std::time::Duration;
 use std::{fs, io, thread};
 
@@ -60,7 +60,7 @@ pub async fn start_process<F, Fut, AI, A, EI>(
     envs: EI,
     initial_pid_file: InitialPidFile,
     process_status_check: F,
-) -> anyhow::Result<Child>
+) -> anyhow::Result<()>
 where
     F: Fn() -> Fut,
     Fut: std::future::Future<Output = anyhow::Result<bool>>,
@@ -98,7 +98,7 @@ where
         InitialPidFile::Expect(path) => path,
     };
 
-    let mut spawned_process = filled_cmd.spawn().with_context(|| {
+    let spawned_process = filled_cmd.spawn().with_context(|| {
         format!("Could not spawn {process_name}, see console output and log files for details.")
     })?;
     let pid = spawned_process.id();
@@ -106,12 +106,26 @@ where
         i32::try_from(pid)
             .with_context(|| format!("Subprocess {process_name} has invalid pid {pid}"))?,
     );
+    // set up a scopeguard to kill & wait for the child in case we panic or bail below
+    let spawned_process = scopeguard::guard(spawned_process, |mut spawned_process| {
+        println!("SIGKILL & wait the started process");
+        (|| {
+            // TODO: use another signal that can be caught by the child so it can clean up any children it spawned (e..g, walredo).
+            spawned_process.kill().context("SIGKILL child")?;
+            spawned_process.wait().context("wait() for child process")?;
+            anyhow::Ok(())
+        })()
+        .with_context(|| format!("scopeguard kill&wait child {process_name:?}"))
+        .unwrap();
+    });
 
     for retries in 0..RETRIES {
         match process_started(pid, pid_file_to_check, &process_status_check).await {
             Ok(true) => {
-                println!("\n{process_name} started, pid: {pid}");
-                return Ok(spawned_process);
+                println!("\n{process_name} started and passed status check, pid: {pid}");
+                // leak the child process, it'll outlive this neon_local invocation
+                drop(scopeguard::ScopeGuard::into_inner(spawned_process));
+                return Ok(());
             }
             Ok(false) => {
                 if retries == NOTICE_AFTER_RETRIES {
@@ -126,16 +140,15 @@ where
                 thread::sleep(Duration::from_millis(RETRY_INTERVAL_MILLIS));
             }
             Err(e) => {
-                println!("{process_name} failed to start: {e:#}");
-                if let Err(e) = spawned_process.kill() {
-                    println!("Could not stop {process_name} subprocess: {e:#}")
-                };
+                println!("error starting process {process_name:?}: {e:#}");
                 return Err(e);
             }
         }
     }
     println!();
-    anyhow::bail!("{process_name} did not start in {RETRY_UNTIL_SECS} seconds");
+    anyhow::bail!(
+        "{process_name} did not start+pass status checks within {RETRY_UNTIL_SECS} seconds"
+    );
 }
 
 /// Stops the process, using the pid file given. Returns Ok also if the process is already not running.
diff --git a/control_plane/src/pageserver.rs b/control_plane/src/pageserver.rs
index 18ccf6bd98..1db21c9a37 100644
--- a/control_plane/src/pageserver.rs
+++ b/control_plane/src/pageserver.rs
@@ -11,7 +11,7 @@ use std::io;
 use std::io::Write;
 use std::num::NonZeroU64;
 use std::path::PathBuf;
-use std::process::{Child, Command};
+use std::process::Command;
 use std::time::Duration;
 
 use anyhow::{bail, Context};
@@ -161,7 +161,7 @@ impl PageServerNode {
             .expect("non-Unicode path")
     }
 
-    pub async fn start(&self, config_overrides: &[&str]) -> anyhow::Result<Child> {
+    pub async fn start(&self, config_overrides: &[&str]) -> anyhow::Result<()> {
         self.start_node(config_overrides, false).await
     }
 
@@ -207,7 +207,7 @@ impl PageServerNode {
         &self,
         config_overrides: &[&str],
         update_config: bool,
-    ) -> anyhow::Result<Child> {
+    ) -> anyhow::Result<()> {
         // TODO: using a thread here because start_process() is not async but we need to call check_status()
         let datadir = self.repo_path();
         print!(
diff --git a/control_plane/src/safekeeper.rs b/control_plane/src/safekeeper.rs
index 4026ef0eb9..6ac71dfe51 100644
--- a/control_plane/src/safekeeper.rs
+++ b/control_plane/src/safekeeper.rs
@@ -7,7 +7,6 @@
 //! ```
 use std::io::Write;
 use std::path::PathBuf;
-use std::process::Child;
 use std::{io, result};
 
 use anyhow::Context;
@@ -104,7 +103,7 @@ impl SafekeeperNode {
             .expect("non-Unicode path")
     }
 
-    pub async fn start(&self, extra_opts: Vec<String>) -> anyhow::Result<Child> {
+    pub async fn start(&self, extra_opts: Vec<String>) -> anyhow::Result<()> {
         print!(
             "Starting safekeeper at '{}' in '{}'",
             self.pg_connection_config.raw_address(),

From d36623ad74c1d4a974d95cff00a8f463ba615254 Mon Sep 17 00:00:00 2001
From: Alexander Bayandin <alexander@neon.tech>
Date: Thu, 25 Jan 2024 19:25:29 +0000
Subject: [PATCH 56/70] CI: cancel old e2e-tests on new commits (#6463)

## Problem

Triggered `e2e-tests` job is not cancelled along with other jobs in a PR
if the PR get new commits. We can improve the situation by setting
`concurrency_group` for the remote workflow
(https://github.com/neondatabase/cloud/pull/9622 adds
`concurrency_group` group input to the remote workflow).

Ref https://neondb.slack.com/archives/C059ZC138NR/p1706087124297569

Cloud's part added in https://github.com/neondatabase/cloud/pull/9622

## Summary of changes
- Set `concurrency_group` parameter when triggering `e2e-tests`
- At the beginning of a CI pipeline, trigger Cloud's
`cancel-previous-in-concurrency-group.yml` workflow which cancels
previously triggered e2e-tests
---
 .github/workflows/build_and_test.yml | 19 ++++++++++++++++++-
 1 file changed, 18 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml
index fb88d4da96..643d24696d 100644
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -22,6 +22,8 @@ env:
   AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }}
   AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }}
   NEXTEST_RETRIES: 3
+  # A concurrency group that we use for e2e-tests runs, matches `concurrency.group` above with `github.repository` as a prefix
+  E2E_CONCURRENCY_GROUP: ${{ github.repository }}-${{ github.workflow }}-${{ github.ref_name }}-${{ github.ref_name == 'main' && github.sha || 'anysha' }}
 
 jobs:
   check-permissions:
@@ -45,6 +47,20 @@ jobs:
 
         exit 1
 
+  cancel-previous-e2e-tests:
+    needs: [ check-permissions ]
+    if: github.event_name == 'pull_request'
+    runs-on: ubuntu-latest
+
+    steps:
+      - name: Cancel previous e2e-tests runs for this PR
+        env:
+          GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }}
+        run: |
+          gh workflow --repo neondatabase/cloud \
+            run cancel-previous-in-concurrency-group.yml \
+              --field concurrency_group="${{ env.E2E_CONCURRENCY_GROUP }}"
+
   tag:
     needs: [ check-permissions ]
     runs-on: [ self-hosted, gen3, small ]
@@ -696,7 +712,8 @@ jobs:
                 \"commit_hash\": \"$COMMIT_SHA\",
                 \"remote_repo\": \"${{ github.repository }}\",
                 \"storage_image_tag\": \"${{ needs.tag.outputs.build-tag }}\",
-                \"compute_image_tag\": \"${{ needs.tag.outputs.build-tag }}\"
+                \"compute_image_tag\": \"${{ needs.tag.outputs.build-tag }}\",
+                \"concurrency_group\": \"${{ env.E2E_CONCURRENCY_GROUP }}\"
               }
             }"
 

From 918b03b3b01412bdd1b0f1a5afb90a3c0522a96b Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Fri, 26 Jan 2024 09:25:07 +0100
Subject: [PATCH 57/70] integrate tokio-epoll-uring as alternative VirtualFile
 IO engine (#5824)

---
 .github/workflows/build_and_test.yml          |  23 +-
 Cargo.lock                                    |  43 +-
 Cargo.toml                                    |   1 +
 pageserver/Cargo.toml                         |   1 +
 pageserver/ctl/src/layer_map_analyzer.rs      |   4 +-
 pageserver/ctl/src/layers.rs                  |   4 +-
 pageserver/ctl/src/main.rs                    |   2 +-
 pageserver/src/bin/pageserver.rs              |   2 +-
 pageserver/src/config.rs                      |  26 ++
 pageserver/src/lib.rs                         |   1 +
 pageserver/src/metrics.rs                     |  15 +
 pageserver/src/tenant/block_io.rs             |  22 +-
 pageserver/src/tenant/ephemeral_file.rs       |  18 +-
 .../src/tenant/storage_layer/delta_layer.rs   |   4 +-
 .../src/tenant/storage_layer/image_layer.rs   |  18 +-
 pageserver/src/virtual_file.rs                | 428 +++++++++++++++---
 pageserver/src/virtual_file/io_engine.rs      | 114 +++++
 pageserver/src/virtual_file/open_options.rs   | 138 ++++++
 scripts/flaky_tests.py                        |  14 +-
 test_runner/fixtures/neon_fixtures.py         |  12 +
 test_runner/fixtures/parametrize.py           |  22 +-
 21 files changed, 794 insertions(+), 118 deletions(-)
 create mode 100644 pageserver/src/virtual_file/io_engine.rs
 create mode 100644 pageserver/src/virtual_file/open_options.rs

diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml
index 643d24696d..7445501f00 100644
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -203,7 +203,11 @@ jobs:
     runs-on: [ self-hosted, gen3, large ]
     container:
       image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }}
-      options: --init
+      # Raise locked memory limit for tokio-epoll-uring.
+      # On 5.10 LTS kernels < 5.10.162 (and generally mainline kernels < 5.12),
+      # io_uring will account the memory of the CQ and SQ as locked.
+      # More details: https://github.com/neondatabase/neon/issues/6373#issuecomment-1905814391
+      options: --init --shm-size=512mb --ulimit memlock=67108864:67108864
     strategy:
       fail-fast: false
       matrix:
@@ -358,7 +362,9 @@ jobs:
 
       - name: Run rust tests
         run: |
-          ${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_FEATURES
+          for io_engine in std-fs tokio-epoll-uring ; do
+            NEON_PAGESERVER_UNIT_TEST_VIRTUAL_FILE_IOENGINE=$io_engine ${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_FEATURES
+          done
 
           # Run separate tests for real S3
           export ENABLE_REAL_S3_REMOTE_STORAGE=nonempty
@@ -436,8 +442,8 @@ jobs:
     runs-on: [ self-hosted, gen3, large ]
     container:
       image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }}
-      # Default shared memory is 64mb
-      options: --init --shm-size=512mb
+      # for changed limits, see comments on `options:` earlier in this file
+      options: --init --shm-size=512mb --ulimit memlock=67108864:67108864
     strategy:
       fail-fast: false
       matrix:
@@ -465,6 +471,7 @@ jobs:
           TEST_RESULT_CONNSTR: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}
           CHECK_ONDISK_DATA_COMPATIBILITY: nonempty
           BUILD_TAG: ${{ needs.tag.outputs.build-tag }}
+          PAGESERVER_VIRTUAL_FILE_IO_ENGINE: tokio-epoll-uring
 
       - name: Merge and upload coverage data
         if: matrix.build_type == 'debug' && matrix.pg_version == 'v14'
@@ -475,12 +482,13 @@ jobs:
     runs-on: [ self-hosted, gen3, small ]
     container:
       image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }}
-      # Default shared memory is 64mb
-      options: --init --shm-size=512mb
+      # for changed limits, see comments on `options:` earlier in this file
+      options: --init --shm-size=512mb --ulimit memlock=67108864:67108864
     if: github.ref_name == 'main' || contains(github.event.pull_request.labels.*.name, 'run-benchmarks')
     strategy:
       fail-fast: false
       matrix:
+        # the amount of groups (N) should be reflected in `extra_params: --splits N ...`
         pytest_split_group: [ 1, 2, 3, 4 ]
         build_type: [ release ]
     steps:
@@ -494,11 +502,12 @@ jobs:
           test_selection: performance
           run_in_parallel: false
           save_perf_report: ${{ github.ref_name == 'main' }}
-          extra_params: --splits ${{ strategy.job-total }} --group ${{ matrix.pytest_split_group }}
+          extra_params: --splits 4 --group ${{ matrix.pytest_split_group }}
         env:
           VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
           PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
           TEST_RESULT_CONNSTR: "${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}"
+          PAGESERVER_VIRTUAL_FILE_IO_ENGINE: tokio-epoll-uring
       # XXX: no coverage data handling here, since benchmarks are run on release builds,
       # while coverage is currently collected for the debug ones
 
diff --git a/Cargo.lock b/Cargo.lock
index f0e8b6a0ed..6e91363de8 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -2564,6 +2564,16 @@ dependencies = [
  "windows-sys 0.48.0",
 ]
 
+[[package]]
+name = "io-uring"
+version = "0.6.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "460648e47a07a43110fbfa2e0b14afb2be920093c31e5dccc50e49568e099762"
+dependencies = [
+ "bitflags 1.3.2",
+ "libc",
+]
+
 [[package]]
 name = "ipnet"
 version = "2.9.0"
@@ -3362,6 +3372,7 @@ dependencies = [
  "tenant_size_model",
  "thiserror",
  "tokio",
+ "tokio-epoll-uring",
  "tokio-io-timeout",
  "tokio-postgres",
  "tokio-stream",
@@ -5383,18 +5394,18 @@ dependencies = [
 
 [[package]]
 name = "thiserror"
-version = "1.0.40"
+version = "1.0.47"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "978c9a314bd8dc99be594bc3c175faaa9794be04a5a5e153caba6915336cebac"
+checksum = "97a802ec30afc17eee47b2855fc72e0c4cd62be9b4efe6591edde0ec5bd68d8f"
 dependencies = [
  "thiserror-impl",
 ]
 
 [[package]]
 name = "thiserror-impl"
-version = "1.0.40"
+version = "1.0.47"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f9456a42c5b0d803c8cd86e73dd7cc9edd429499f37a3550d286d5e86720569f"
+checksum = "6bb623b56e39ab7dcd4b1b98bb6c8f8d907ed255b18de254088016b27a8ee19b"
 dependencies = [
  "proc-macro2",
  "quote",
@@ -5518,6 +5529,21 @@ dependencies = [
  "windows-sys 0.48.0",
 ]
 
+[[package]]
+name = "tokio-epoll-uring"
+version = "0.1.0"
+source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=main#0dd3a2f8bf3239d34a19719ef1a74146c093126f"
+dependencies = [
+ "futures",
+ "once_cell",
+ "scopeguard",
+ "thiserror",
+ "tokio",
+ "tokio-util",
+ "tracing",
+ "uring-common",
+]
+
 [[package]]
 name = "tokio-io-timeout"
 version = "1.2.0"
@@ -6027,6 +6053,15 @@ dependencies = [
  "webpki-roots 0.23.1",
 ]
 
+[[package]]
+name = "uring-common"
+version = "0.1.0"
+source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=main#0dd3a2f8bf3239d34a19719ef1a74146c093126f"
+dependencies = [
+ "io-uring",
+ "libc",
+]
+
 [[package]]
 name = "url"
 version = "2.3.1"
diff --git a/Cargo.toml b/Cargo.toml
index eefd1cb114..8afab02b15 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -151,6 +151,7 @@ test-context = "0.1"
 thiserror = "1.0"
 tls-listener = { version = "0.7", features = ["rustls", "hyper-h1"] }
 tokio = { version = "1.17", features = ["macros"] }
+tokio-epoll-uring = { git = "https://github.com/neondatabase/tokio-epoll-uring.git" , branch = "main" }
 tokio-io-timeout = "1.2.0"
 tokio-postgres-rustls = "0.10.0"
 tokio-rustls = "0.24"
diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml
index 980fbab22e..e44501d1ed 100644
--- a/pageserver/Cargo.toml
+++ b/pageserver/Cargo.toml
@@ -61,6 +61,7 @@ sync_wrapper.workspace = true
 tokio-tar.workspace = true
 thiserror.workspace = true
 tokio = { workspace = true, features = ["process", "sync", "fs", "rt", "io-util", "time"] }
+tokio-epoll-uring.workspace = true
 tokio-io-timeout.workspace = true
 tokio-postgres.workspace = true
 tokio-stream.workspace = true
diff --git a/pageserver/ctl/src/layer_map_analyzer.rs b/pageserver/ctl/src/layer_map_analyzer.rs
index 15d4eb09e0..eb5c3f15cf 100644
--- a/pageserver/ctl/src/layer_map_analyzer.rs
+++ b/pageserver/ctl/src/layer_map_analyzer.rs
@@ -18,7 +18,7 @@ use pageserver::tenant::block_io::FileBlockReader;
 use pageserver::tenant::disk_btree::{DiskBtreeReader, VisitDirection};
 use pageserver::tenant::storage_layer::delta_layer::{Summary, DELTA_KEY_SIZE};
 use pageserver::tenant::storage_layer::range_overlaps;
-use pageserver::virtual_file::VirtualFile;
+use pageserver::virtual_file::{self, VirtualFile};
 
 use utils::{bin_ser::BeSer, lsn::Lsn};
 
@@ -142,7 +142,7 @@ pub(crate) async fn main(cmd: &AnalyzeLayerMapCmd) -> Result<()> {
     let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error);
 
     // Initialize virtual_file (file desriptor cache) and page cache which are needed to access layer persistent B-Tree.
-    pageserver::virtual_file::init(10);
+    pageserver::virtual_file::init(10, virtual_file::IoEngineKind::StdFs);
     pageserver::page_cache::init(100);
 
     let mut total_delta_layers = 0usize;
diff --git a/pageserver/ctl/src/layers.rs b/pageserver/ctl/src/layers.rs
index ebf4a4bec3..dbbcfedac0 100644
--- a/pageserver/ctl/src/layers.rs
+++ b/pageserver/ctl/src/layers.rs
@@ -59,7 +59,7 @@ pub(crate) enum LayerCmd {
 
 async fn read_delta_file(path: impl AsRef<Path>, ctx: &RequestContext) -> Result<()> {
     let path = Utf8Path::from_path(path.as_ref()).expect("non-Unicode path");
-    virtual_file::init(10);
+    virtual_file::init(10, virtual_file::IoEngineKind::StdFs);
     page_cache::init(100);
     let file = FileBlockReader::new(VirtualFile::open(path).await?);
     let summary_blk = file.read_blk(0, ctx).await?;
@@ -187,7 +187,7 @@ pub(crate) async fn main(cmd: &LayerCmd) -> Result<()> {
             new_tenant_id,
             new_timeline_id,
         } => {
-            pageserver::virtual_file::init(10);
+            pageserver::virtual_file::init(10, virtual_file::IoEngineKind::StdFs);
             pageserver::page_cache::init(100);
 
             let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error);
diff --git a/pageserver/ctl/src/main.rs b/pageserver/ctl/src/main.rs
index fb42d6d2f1..3c90933fe9 100644
--- a/pageserver/ctl/src/main.rs
+++ b/pageserver/ctl/src/main.rs
@@ -123,7 +123,7 @@ fn read_pg_control_file(control_file_path: &Utf8Path) -> anyhow::Result<()> {
 
 async fn print_layerfile(path: &Utf8Path) -> anyhow::Result<()> {
     // Basic initialization of things that don't change after startup
-    virtual_file::init(10);
+    virtual_file::init(10, virtual_file::IoEngineKind::StdFs);
     page_cache::init(100);
     let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error);
     dump_layerfile_from_path(path, true, &ctx).await
diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs
index 15e3359c06..84de76e55e 100644
--- a/pageserver/src/bin/pageserver.rs
+++ b/pageserver/src/bin/pageserver.rs
@@ -130,7 +130,7 @@ fn main() -> anyhow::Result<()> {
     let scenario = failpoint_support::init();
 
     // Basic initialization of things that don't change after startup
-    virtual_file::init(conf.max_file_descriptors);
+    virtual_file::init(conf.max_file_descriptors, conf.virtual_file_io_engine);
     page_cache::init(conf.page_cache_size);
 
     start_pageserver(launch_ts, conf).context("Failed to start pageserver")?;
diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs
index 52277d7f24..1989bef817 100644
--- a/pageserver/src/config.rs
+++ b/pageserver/src/config.rs
@@ -36,6 +36,7 @@ use crate::tenant::config::TenantConfOpt;
 use crate::tenant::{
     TENANTS_SEGMENT_NAME, TENANT_DELETED_MARKER_FILE_NAME, TIMELINES_SEGMENT_NAME,
 };
+use crate::virtual_file;
 use crate::{
     IGNORED_TENANT_FILE_NAME, METADATA_FILE_NAME, TENANT_CONFIG_NAME, TENANT_HEATMAP_BASENAME,
     TENANT_LOCATION_CONFIG_NAME, TIMELINE_DELETE_MARK_SUFFIX, TIMELINE_UNINIT_MARK_SUFFIX,
@@ -43,6 +44,8 @@ use crate::{
 
 use self::defaults::DEFAULT_CONCURRENT_TENANT_WARMUP;
 
+use self::defaults::DEFAULT_VIRTUAL_FILE_IO_ENGINE;
+
 pub mod defaults {
     use crate::tenant::config::defaults::*;
     use const_format::formatcp;
@@ -79,6 +82,8 @@ pub mod defaults {
 
     pub const DEFAULT_INGEST_BATCH_SIZE: u64 = 100;
 
+    pub const DEFAULT_VIRTUAL_FILE_IO_ENGINE: &str = "std-fs";
+
     ///
     /// Default built-in configuration file.
     ///
@@ -114,6 +119,8 @@ pub mod defaults {
 
 #ingest_batch_size = {DEFAULT_INGEST_BATCH_SIZE}
 
+#virtual_file_io_engine = '{DEFAULT_VIRTUAL_FILE_IO_ENGINE}'
+
 [tenant_config]
 #checkpoint_distance = {DEFAULT_CHECKPOINT_DISTANCE} # in bytes
 #checkpoint_timeout = {DEFAULT_CHECKPOINT_TIMEOUT}
@@ -247,6 +254,8 @@ pub struct PageServerConf {
 
     /// Maximum number of WAL records to be ingested and committed at the same time
     pub ingest_batch_size: u64,
+
+    pub virtual_file_io_engine: virtual_file::IoEngineKind,
 }
 
 /// We do not want to store this in a PageServerConf because the latter may be logged
@@ -331,6 +340,8 @@ struct PageServerConfigBuilder {
     secondary_download_concurrency: BuilderValue<usize>,
 
     ingest_batch_size: BuilderValue<u64>,
+
+    virtual_file_io_engine: BuilderValue<virtual_file::IoEngineKind>,
 }
 
 impl Default for PageServerConfigBuilder {
@@ -406,6 +417,8 @@ impl Default for PageServerConfigBuilder {
             secondary_download_concurrency: Set(DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY),
 
             ingest_batch_size: Set(DEFAULT_INGEST_BATCH_SIZE),
+
+            virtual_file_io_engine: Set(DEFAULT_VIRTUAL_FILE_IO_ENGINE.parse().unwrap()),
         }
     }
 }
@@ -562,6 +575,10 @@ impl PageServerConfigBuilder {
         self.ingest_batch_size = BuilderValue::Set(ingest_batch_size)
     }
 
+    pub fn virtual_file_io_engine(&mut self, value: virtual_file::IoEngineKind) {
+        self.virtual_file_io_engine = BuilderValue::Set(value);
+    }
+
     pub fn build(self) -> anyhow::Result<PageServerConf> {
         let concurrent_tenant_warmup = self
             .concurrent_tenant_warmup
@@ -669,6 +686,9 @@ impl PageServerConfigBuilder {
             ingest_batch_size: self
                 .ingest_batch_size
                 .ok_or(anyhow!("missing ingest_batch_size"))?,
+            virtual_file_io_engine: self
+                .virtual_file_io_engine
+                .ok_or(anyhow!("missing virtual_file_io_engine"))?,
         })
     }
 }
@@ -920,6 +940,9 @@ impl PageServerConf {
                     builder.secondary_download_concurrency(parse_toml_u64(key, item)? as usize)
                 },
                 "ingest_batch_size" => builder.ingest_batch_size(parse_toml_u64(key, item)?),
+                "virtual_file_io_engine" => {
+                    builder.virtual_file_io_engine(parse_toml_from_str("virtual_file_io_engine", item)?)
+                }
                 _ => bail!("unrecognized pageserver option '{key}'"),
             }
         }
@@ -993,6 +1016,7 @@ impl PageServerConf {
             heatmap_upload_concurrency: defaults::DEFAULT_HEATMAP_UPLOAD_CONCURRENCY,
             secondary_download_concurrency: defaults::DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY,
             ingest_batch_size: defaults::DEFAULT_INGEST_BATCH_SIZE,
+            virtual_file_io_engine: DEFAULT_VIRTUAL_FILE_IO_ENGINE.parse().unwrap(),
         }
     }
 }
@@ -1225,6 +1249,7 @@ background_task_maximum_delay = '334 s'
                 heatmap_upload_concurrency: defaults::DEFAULT_HEATMAP_UPLOAD_CONCURRENCY,
                 secondary_download_concurrency: defaults::DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY,
                 ingest_batch_size: defaults::DEFAULT_INGEST_BATCH_SIZE,
+                virtual_file_io_engine: DEFAULT_VIRTUAL_FILE_IO_ENGINE.parse().unwrap(),
             },
             "Correct defaults should be used when no config values are provided"
         );
@@ -1288,6 +1313,7 @@ background_task_maximum_delay = '334 s'
                 heatmap_upload_concurrency: defaults::DEFAULT_HEATMAP_UPLOAD_CONCURRENCY,
                 secondary_download_concurrency: defaults::DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY,
                 ingest_batch_size: 100,
+                virtual_file_io_engine: DEFAULT_VIRTUAL_FILE_IO_ENGINE.parse().unwrap(),
             },
             "Should be able to parse all basic config values correctly"
         );
diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs
index 26070e0cc1..bcde1166b7 100644
--- a/pageserver/src/lib.rs
+++ b/pageserver/src/lib.rs
@@ -1,3 +1,4 @@
+#![recursion_limit = "300"]
 #![deny(clippy::undocumented_unsafe_blocks)]
 
 mod auth;
diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs
index 993685db6e..2cfa77f1c5 100644
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -932,6 +932,7 @@ pub(crate) static STORAGE_IO_SIZE: Lazy<IntGaugeVec> = Lazy::new(|| {
     .expect("failed to define a metric")
 });
 
+#[cfg(not(test))]
 pub(crate) mod virtual_file_descriptor_cache {
     use super::*;
 
@@ -951,6 +952,20 @@ pub(crate) mod virtual_file_descriptor_cache {
     // ```
 }
 
+#[cfg(not(test))]
+pub(crate) mod virtual_file_io_engine {
+    use super::*;
+
+    pub(crate) static KIND: Lazy<UIntGaugeVec> = Lazy::new(|| {
+        register_uint_gauge_vec!(
+            "pageserver_virtual_file_io_engine_kind",
+            "The configured io engine for VirtualFile",
+            &["kind"],
+        )
+        .unwrap()
+    });
+}
+
 #[derive(Debug)]
 struct GlobalAndPerTimelineHistogram {
     global: Histogram,
diff --git a/pageserver/src/tenant/block_io.rs b/pageserver/src/tenant/block_io.rs
index 0617017528..1b6bccc120 100644
--- a/pageserver/src/tenant/block_io.rs
+++ b/pageserver/src/tenant/block_io.rs
@@ -5,10 +5,10 @@
 use super::ephemeral_file::EphemeralFile;
 use super::storage_layer::delta_layer::{Adapter, DeltaLayerInner};
 use crate::context::RequestContext;
-use crate::page_cache::{self, PageReadGuard, ReadBufResult, PAGE_SZ};
+use crate::page_cache::{self, PageReadGuard, PageWriteGuard, ReadBufResult, PAGE_SZ};
 use crate::virtual_file::VirtualFile;
 use bytes::Bytes;
-use std::ops::{Deref, DerefMut};
+use std::ops::Deref;
 
 /// This is implemented by anything that can read 8 kB (PAGE_SZ)
 /// blocks, using the page cache
@@ -39,6 +39,8 @@ pub enum BlockLease<'a> {
     EphemeralFileMutableTail(&'a [u8; PAGE_SZ]),
     #[cfg(test)]
     Arc(std::sync::Arc<[u8; PAGE_SZ]>),
+    #[cfg(test)]
+    Vec(Vec<u8>),
 }
 
 impl From<PageReadGuard<'static>> for BlockLease<'static> {
@@ -63,6 +65,10 @@ impl<'a> Deref for BlockLease<'a> {
             BlockLease::EphemeralFileMutableTail(v) => v,
             #[cfg(test)]
             BlockLease::Arc(v) => v.deref(),
+            #[cfg(test)]
+            BlockLease::Vec(v) => {
+                TryFrom::try_from(&v[..]).expect("caller must ensure that v has PAGE_SZ")
+            }
         }
     }
 }
@@ -169,10 +175,14 @@ impl FileBlockReader {
     }
 
     /// Read a page from the underlying file into given buffer.
-    async fn fill_buffer(&self, buf: &mut [u8], blkno: u32) -> Result<(), std::io::Error> {
+    async fn fill_buffer(
+        &self,
+        buf: PageWriteGuard<'static>,
+        blkno: u32,
+    ) -> Result<PageWriteGuard<'static>, std::io::Error> {
         assert!(buf.len() == PAGE_SZ);
         self.file
-            .read_exact_at(buf, blkno as u64 * PAGE_SZ as u64)
+            .read_exact_at_page(buf, blkno as u64 * PAGE_SZ as u64)
             .await
     }
     /// Read a block.
@@ -196,9 +206,9 @@ impl FileBlockReader {
                 )
             })? {
             ReadBufResult::Found(guard) => Ok(guard.into()),
-            ReadBufResult::NotFound(mut write_guard) => {
+            ReadBufResult::NotFound(write_guard) => {
                 // Read the page from disk into the buffer
-                self.fill_buffer(write_guard.deref_mut(), blknum).await?;
+                let write_guard = self.fill_buffer(write_guard, blknum).await?;
                 Ok(write_guard.mark_valid().into())
             }
         }
diff --git a/pageserver/src/tenant/ephemeral_file.rs b/pageserver/src/tenant/ephemeral_file.rs
index 591eacd104..6b8cd77d78 100644
--- a/pageserver/src/tenant/ephemeral_file.rs
+++ b/pageserver/src/tenant/ephemeral_file.rs
@@ -5,11 +5,11 @@ use crate::config::PageServerConf;
 use crate::context::RequestContext;
 use crate::page_cache::{self, PAGE_SZ};
 use crate::tenant::block_io::{BlockCursor, BlockLease, BlockReader};
-use crate::virtual_file::VirtualFile;
+use crate::virtual_file::{self, VirtualFile};
 use camino::Utf8PathBuf;
 use pageserver_api::shard::TenantShardId;
 use std::cmp::min;
-use std::fs::OpenOptions;
+
 use std::io::{self, ErrorKind};
 use std::ops::DerefMut;
 use std::sync::atomic::AtomicU64;
@@ -47,7 +47,10 @@ impl EphemeralFile {
 
         let file = VirtualFile::open_with_options(
             &filename,
-            OpenOptions::new().read(true).write(true).create(true),
+            virtual_file::OpenOptions::new()
+                .read(true)
+                .write(true)
+                .create(true),
         )
         .await?;
 
@@ -89,11 +92,10 @@ impl EphemeralFile {
                 page_cache::ReadBufResult::Found(guard) => {
                     return Ok(BlockLease::PageReadGuard(guard))
                 }
-                page_cache::ReadBufResult::NotFound(mut write_guard) => {
-                    let buf: &mut [u8] = write_guard.deref_mut();
-                    debug_assert_eq!(buf.len(), PAGE_SZ);
-                    self.file
-                        .read_exact_at(&mut buf[..], blknum as u64 * PAGE_SZ as u64)
+                page_cache::ReadBufResult::NotFound(write_guard) => {
+                    let write_guard = self
+                        .file
+                        .read_exact_at_page(write_guard, blknum as u64 * PAGE_SZ as u64)
                         .await?;
                     let read_guard = write_guard.mark_valid();
                     return Ok(BlockLease::PageReadGuard(read_guard));
diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs
index 4ded6d6a8d..3a445ef71e 100644
--- a/pageserver/src/tenant/storage_layer/delta_layer.rs
+++ b/pageserver/src/tenant/storage_layer/delta_layer.rs
@@ -36,7 +36,7 @@ use crate::tenant::block_io::{BlockBuf, BlockCursor, BlockLease, BlockReader, Fi
 use crate::tenant::disk_btree::{DiskBtreeBuilder, DiskBtreeReader, VisitDirection};
 use crate::tenant::storage_layer::{Layer, ValueReconstructResult, ValueReconstructState};
 use crate::tenant::Timeline;
-use crate::virtual_file::VirtualFile;
+use crate::virtual_file::{self, VirtualFile};
 use crate::{walrecord, TEMP_FILE_SUFFIX};
 use crate::{DELTA_FILE_MAGIC, STORAGE_FORMAT_VERSION};
 use anyhow::{bail, ensure, Context, Result};
@@ -649,7 +649,7 @@ impl DeltaLayer {
     {
         let file = VirtualFile::open_with_options(
             path,
-            &*std::fs::OpenOptions::new().read(true).write(true),
+            virtual_file::OpenOptions::new().read(true).write(true),
         )
         .await
         .with_context(|| format!("Failed to open file '{}'", path))?;
diff --git a/pageserver/src/tenant/storage_layer/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs
index f03c7642eb..c62e6aed51 100644
--- a/pageserver/src/tenant/storage_layer/image_layer.rs
+++ b/pageserver/src/tenant/storage_layer/image_layer.rs
@@ -34,7 +34,7 @@ use crate::tenant::storage_layer::{
     LayerAccessStats, ValueReconstructResult, ValueReconstructState,
 };
 use crate::tenant::Timeline;
-use crate::virtual_file::VirtualFile;
+use crate::virtual_file::{self, VirtualFile};
 use crate::{IMAGE_FILE_MAGIC, STORAGE_FORMAT_VERSION, TEMP_FILE_SUFFIX};
 use anyhow::{bail, ensure, Context, Result};
 use bytes::Bytes;
@@ -327,7 +327,7 @@ impl ImageLayer {
     {
         let file = VirtualFile::open_with_options(
             path,
-            &*std::fs::OpenOptions::new().read(true).write(true),
+            virtual_file::OpenOptions::new().read(true).write(true),
         )
         .await
         .with_context(|| format!("Failed to open file '{}'", path))?;
@@ -492,11 +492,15 @@ impl ImageLayerWriterInner {
             },
         );
         info!("new image layer {path}");
-        let mut file = VirtualFile::open_with_options(
-            &path,
-            std::fs::OpenOptions::new().write(true).create_new(true),
-        )
-        .await?;
+        let mut file = {
+            VirtualFile::open_with_options(
+                &path,
+                virtual_file::OpenOptions::new()
+                    .write(true)
+                    .create_new(true),
+            )
+            .await?
+        };
         // make room for the header block
         file.seek(SeekFrom::Start(PAGE_SZ as u64)).await?;
         let blob_writer = BlobWriter::new(file, PAGE_SZ as u64);
diff --git a/pageserver/src/virtual_file.rs b/pageserver/src/virtual_file.rs
index 06f58b5c52..d200a4ba5e 100644
--- a/pageserver/src/virtual_file.rs
+++ b/pageserver/src/virtual_file.rs
@@ -11,18 +11,28 @@
 //! src/backend/storage/file/fd.c
 //!
 use crate::metrics::{StorageIoOperation, STORAGE_IO_SIZE, STORAGE_IO_TIME_METRIC};
+
+use crate::page_cache::PageWriteGuard;
 use crate::tenant::TENANTS_SEGMENT_NAME;
 use camino::{Utf8Path, Utf8PathBuf};
 use once_cell::sync::OnceCell;
 use pageserver_api::shard::TenantShardId;
-use std::fs::{self, File, OpenOptions};
+use std::fs::{self, File};
 use std::io::{Error, ErrorKind, Seek, SeekFrom};
+use tokio_epoll_uring::IoBufMut;
+
+use std::os::fd::{AsRawFd, FromRawFd, IntoRawFd, OwnedFd, RawFd};
 use std::os::unix::fs::FileExt;
 use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering};
 use tokio::sync::{RwLock, RwLockReadGuard, RwLockWriteGuard};
 use tokio::time::Instant;
 use utils::fs_ext;
 
+mod io_engine;
+mod open_options;
+pub use io_engine::IoEngineKind;
+pub(crate) use open_options::*;
+
 ///
 /// A virtual file descriptor. You can use this just like std::fs::File, but internally
 /// the underlying file is closed if the system is low on file descriptors,
@@ -106,7 +116,38 @@ struct SlotInner {
     tag: u64,
 
     /// the underlying file
-    file: Option<File>,
+    file: Option<OwnedFd>,
+}
+
+/// Impl of [`tokio_epoll_uring::IoBuf`] and [`tokio_epoll_uring::IoBufMut`] for [`PageWriteGuard`].
+struct PageWriteGuardBuf {
+    page: PageWriteGuard<'static>,
+    init_up_to: usize,
+}
+// Safety: the [`PageWriteGuard`] gives us exclusive ownership of the page cache slot,
+// and the location remains stable even if [`Self`] or the [`PageWriteGuard`] is moved.
+unsafe impl tokio_epoll_uring::IoBuf for PageWriteGuardBuf {
+    fn stable_ptr(&self) -> *const u8 {
+        self.page.as_ptr()
+    }
+    fn bytes_init(&self) -> usize {
+        self.init_up_to
+    }
+    fn bytes_total(&self) -> usize {
+        self.page.len()
+    }
+}
+// Safety: see above, plus: the ownership of [`PageWriteGuard`] means exclusive access,
+// hence it's safe to hand out the `stable_mut_ptr()`.
+unsafe impl tokio_epoll_uring::IoBufMut for PageWriteGuardBuf {
+    fn stable_mut_ptr(&mut self) -> *mut u8 {
+        self.page.as_mut_ptr()
+    }
+
+    unsafe fn set_init(&mut self, pos: usize) {
+        assert!(pos <= self.page.len());
+        self.init_up_to = pos;
+    }
 }
 
 impl OpenFiles {
@@ -274,6 +315,10 @@ macro_rules! with_file {
         let $ident = $this.lock_file().await?;
         observe_duration!($op, $($body)*)
     }};
+    ($this:expr, $op:expr, | mut $ident:ident | $($body:tt)*) => {{
+        let mut $ident = $this.lock_file().await?;
+        observe_duration!($op, $($body)*)
+    }};
 }
 
 impl VirtualFile {
@@ -326,7 +371,9 @@ impl VirtualFile {
         // NB: there is also StorageIoOperation::OpenAfterReplace which is for the case
         // where our caller doesn't get to use the returned VirtualFile before its
         // slot gets re-used by someone else.
-        let file = observe_duration!(StorageIoOperation::Open, open_options.open(path))?;
+        let file = observe_duration!(StorageIoOperation::Open, {
+            open_options.open(path.as_std_path()).await?
+        });
 
         // Strip all options other than read and write.
         //
@@ -395,15 +442,13 @@ impl VirtualFile {
 
     /// Call File::sync_all() on the underlying File.
     pub async fn sync_all(&self) -> Result<(), Error> {
-        with_file!(self, StorageIoOperation::Fsync, |file| file
-            .as_ref()
-            .sync_all())
+        with_file!(self, StorageIoOperation::Fsync, |file_guard| file_guard
+            .with_std_file(|std_file| std_file.sync_all()))
     }
 
     pub async fn metadata(&self) -> Result<fs::Metadata, Error> {
-        with_file!(self, StorageIoOperation::Metadata, |file| file
-            .as_ref()
-            .metadata())
+        with_file!(self, StorageIoOperation::Metadata, |file_guard| file_guard
+            .with_std_file(|std_file| std_file.metadata()))
     }
 
     /// Helper function internal to `VirtualFile` that looks up the underlying File,
@@ -412,7 +457,7 @@ impl VirtualFile {
     ///
     /// We are doing it via a macro as Rust doesn't support async closures that
     /// take on parameters with lifetimes.
-    async fn lock_file(&self) -> Result<FileGuard<'_>, Error> {
+    async fn lock_file(&self) -> Result<FileGuard, Error> {
         let open_files = get_open_files();
 
         let mut handle_guard = {
@@ -458,10 +503,9 @@ impl VirtualFile {
         // NB: we use StorageIoOperation::OpenAferReplace for this to distinguish this
         // case from StorageIoOperation::Open. This helps with identifying thrashing
         // of the virtual file descriptor cache.
-        let file = observe_duration!(
-            StorageIoOperation::OpenAfterReplace,
-            self.open_options.open(&self.path)
-        )?;
+        let file = observe_duration!(StorageIoOperation::OpenAfterReplace, {
+            self.open_options.open(self.path.as_std_path()).await?
+        });
 
         // Store the File in the slot and update the handle in the VirtualFile
         // to point to it.
@@ -486,9 +530,8 @@ impl VirtualFile {
                 self.pos = offset;
             }
             SeekFrom::End(offset) => {
-                self.pos = with_file!(self, StorageIoOperation::Seek, |file| file
-                    .as_ref()
-                    .seek(SeekFrom::End(offset)))?
+                self.pos = with_file!(self, StorageIoOperation::Seek, |mut file_guard| file_guard
+                    .with_std_file_mut(|std_file| std_file.seek(SeekFrom::End(offset))))?
             }
             SeekFrom::Current(offset) => {
                 let pos = self.pos as i128 + offset as i128;
@@ -507,25 +550,28 @@ impl VirtualFile {
         Ok(self.pos)
     }
 
-    // Copied from https://doc.rust-lang.org/1.72.0/src/std/os/unix/fs.rs.html#117-135
-    pub async fn read_exact_at(&self, mut buf: &mut [u8], mut offset: u64) -> Result<(), Error> {
-        while !buf.is_empty() {
-            match self.read_at(buf, offset).await {
-                Ok(0) => {
-                    return Err(Error::new(
-                        std::io::ErrorKind::UnexpectedEof,
-                        "failed to fill whole buffer",
-                    ))
-                }
-                Ok(n) => {
-                    buf = &mut buf[n..];
-                    offset += n as u64;
-                }
-                Err(ref e) if e.kind() == std::io::ErrorKind::Interrupted => {}
-                Err(e) => return Err(e),
-            }
-        }
-        Ok(())
+    pub async fn read_exact_at<B>(&self, buf: B, offset: u64) -> Result<B, Error>
+    where
+        B: IoBufMut + Send,
+    {
+        let (buf, res) =
+            read_exact_at_impl(buf, offset, |buf, offset| self.read_at(buf, offset)).await;
+        res.map(|()| buf)
+    }
+
+    /// Like [`Self::read_exact_at`] but for [`PageWriteGuard`].
+    pub async fn read_exact_at_page(
+        &self,
+        page: PageWriteGuard<'static>,
+        offset: u64,
+    ) -> Result<PageWriteGuard<'static>, Error> {
+        let buf = PageWriteGuardBuf {
+            page,
+            init_up_to: 0,
+        };
+        let res = self.read_exact_at(buf, offset).await;
+        res.map(|PageWriteGuardBuf { page, .. }| page)
+            .map_err(|e| Error::new(ErrorKind::Other, e))
     }
 
     // Copied from https://doc.rust-lang.org/1.72.0/src/std/os/unix/fs.rs.html#219-235
@@ -575,22 +621,35 @@ impl VirtualFile {
         Ok(n)
     }
 
-    pub async fn read_at(&self, buf: &mut [u8], offset: u64) -> Result<usize, Error> {
-        let result = with_file!(self, StorageIoOperation::Read, |file| file
-            .as_ref()
-            .read_at(buf, offset));
-        if let Ok(size) = result {
-            STORAGE_IO_SIZE
-                .with_label_values(&["read", &self.tenant_id, &self.shard_id, &self.timeline_id])
-                .add(size as i64);
-        }
-        result
+    pub(crate) async fn read_at<B>(&self, buf: B, offset: u64) -> (B, Result<usize, Error>)
+    where
+        B: tokio_epoll_uring::BoundedBufMut + Send,
+    {
+        let file_guard = match self.lock_file().await {
+            Ok(file_guard) => file_guard,
+            Err(e) => return (buf, Err(e)),
+        };
+
+        observe_duration!(StorageIoOperation::Read, {
+            let ((_file_guard, buf), res) = io_engine::get().read_at(file_guard, offset, buf).await;
+            if let Ok(size) = res {
+                STORAGE_IO_SIZE
+                    .with_label_values(&[
+                        "read",
+                        &self.tenant_id,
+                        &self.shard_id,
+                        &self.timeline_id,
+                    ])
+                    .add(size as i64);
+            }
+            (buf, res)
+        })
     }
 
     async fn write_at(&self, buf: &[u8], offset: u64) -> Result<usize, Error> {
-        let result = with_file!(self, StorageIoOperation::Write, |file| file
-            .as_ref()
-            .write_at(buf, offset));
+        let result = with_file!(self, StorageIoOperation::Write, |file_guard| {
+            file_guard.with_std_file(|std_file| std_file.write_at(buf, offset))
+        });
         if let Ok(size) = result {
             STORAGE_IO_SIZE
                 .with_label_values(&["write", &self.tenant_id, &self.shard_id, &self.timeline_id])
@@ -600,18 +659,241 @@ impl VirtualFile {
     }
 }
 
-struct FileGuard<'a> {
-    slot_guard: RwLockReadGuard<'a, SlotInner>,
+// Adapted from https://doc.rust-lang.org/1.72.0/src/std/os/unix/fs.rs.html#117-135
+pub async fn read_exact_at_impl<B, F, Fut>(
+    buf: B,
+    mut offset: u64,
+    mut read_at: F,
+) -> (B, std::io::Result<()>)
+where
+    B: IoBufMut + Send,
+    F: FnMut(tokio_epoll_uring::Slice<B>, u64) -> Fut,
+    Fut: std::future::Future<Output = (tokio_epoll_uring::Slice<B>, std::io::Result<usize>)>,
+{
+    use tokio_epoll_uring::BoundedBuf;
+    let mut buf: tokio_epoll_uring::Slice<B> = buf.slice_full(); // includes all the uninitialized memory
+    while buf.bytes_total() != 0 {
+        let res;
+        (buf, res) = read_at(buf, offset).await;
+        match res {
+            Ok(0) => break,
+            Ok(n) => {
+                buf = buf.slice(n..);
+                offset += n as u64;
+            }
+            Err(ref e) if e.kind() == std::io::ErrorKind::Interrupted => {}
+            Err(e) => return (buf.into_inner(), Err(e)),
+        }
+    }
+    // NB: don't use `buf.is_empty()` here; it is from the
+    // `impl Deref for Slice { Target = [u8] }`; the the &[u8]
+    // returned by it only covers the initialized portion of `buf`.
+    // Whereas we're interested in ensuring that we filled the entire
+    // buffer that the user passed in.
+    if buf.bytes_total() != 0 {
+        (
+            buf.into_inner(),
+            Err(std::io::Error::new(
+                std::io::ErrorKind::UnexpectedEof,
+                "failed to fill whole buffer",
+            )),
+        )
+    } else {
+        assert_eq!(buf.len(), buf.bytes_total());
+        (buf.into_inner(), Ok(()))
+    }
 }
 
-impl<'a> AsRef<File> for FileGuard<'a> {
-    fn as_ref(&self) -> &File {
+#[cfg(test)]
+mod test_read_exact_at_impl {
+
+    use std::{collections::VecDeque, sync::Arc};
+
+    use tokio_epoll_uring::{BoundedBuf, BoundedBufMut};
+
+    use super::read_exact_at_impl;
+
+    struct Expectation {
+        offset: u64,
+        bytes_total: usize,
+        result: std::io::Result<Vec<u8>>,
+    }
+    struct MockReadAt {
+        expectations: VecDeque<Expectation>,
+    }
+
+    impl MockReadAt {
+        async fn read_at(
+            &mut self,
+            mut buf: tokio_epoll_uring::Slice<Vec<u8>>,
+            offset: u64,
+        ) -> (tokio_epoll_uring::Slice<Vec<u8>>, std::io::Result<usize>) {
+            let exp = self
+                .expectations
+                .pop_front()
+                .expect("read_at called but we have no expectations left");
+            assert_eq!(exp.offset, offset);
+            assert_eq!(exp.bytes_total, buf.bytes_total());
+            match exp.result {
+                Ok(bytes) => {
+                    assert!(bytes.len() <= buf.bytes_total());
+                    buf.put_slice(&bytes);
+                    (buf, Ok(bytes.len()))
+                }
+                Err(e) => (buf, Err(e)),
+            }
+        }
+    }
+
+    impl Drop for MockReadAt {
+        fn drop(&mut self) {
+            assert_eq!(self.expectations.len(), 0);
+        }
+    }
+
+    #[tokio::test]
+    async fn test_basic() {
+        let buf = Vec::with_capacity(5);
+        let mock_read_at = Arc::new(tokio::sync::Mutex::new(MockReadAt {
+            expectations: VecDeque::from(vec![Expectation {
+                offset: 0,
+                bytes_total: 5,
+                result: Ok(vec![b'a', b'b', b'c', b'd', b'e']),
+            }]),
+        }));
+        let (buf, res) = read_exact_at_impl(buf, 0, |buf, offset| {
+            let mock_read_at = Arc::clone(&mock_read_at);
+            async move { mock_read_at.lock().await.read_at(buf, offset).await }
+        })
+        .await;
+        assert!(res.is_ok());
+        assert_eq!(buf, vec![b'a', b'b', b'c', b'd', b'e']);
+    }
+
+    #[tokio::test]
+    async fn test_empty_buf_issues_no_syscall() {
+        let buf = Vec::new();
+        let mock_read_at = Arc::new(tokio::sync::Mutex::new(MockReadAt {
+            expectations: VecDeque::new(),
+        }));
+        let (_buf, res) = read_exact_at_impl(buf, 0, |buf, offset| {
+            let mock_read_at = Arc::clone(&mock_read_at);
+            async move { mock_read_at.lock().await.read_at(buf, offset).await }
+        })
+        .await;
+        assert!(res.is_ok());
+    }
+
+    #[tokio::test]
+    async fn test_two_read_at_calls_needed_until_buf_filled() {
+        let buf = Vec::with_capacity(4);
+        let mock_read_at = Arc::new(tokio::sync::Mutex::new(MockReadAt {
+            expectations: VecDeque::from(vec![
+                Expectation {
+                    offset: 0,
+                    bytes_total: 4,
+                    result: Ok(vec![b'a', b'b']),
+                },
+                Expectation {
+                    offset: 2,
+                    bytes_total: 2,
+                    result: Ok(vec![b'c', b'd']),
+                },
+            ]),
+        }));
+        let (buf, res) = read_exact_at_impl(buf, 0, |buf, offset| {
+            let mock_read_at = Arc::clone(&mock_read_at);
+            async move { mock_read_at.lock().await.read_at(buf, offset).await }
+        })
+        .await;
+        assert!(res.is_ok());
+        assert_eq!(buf, vec![b'a', b'b', b'c', b'd']);
+    }
+
+    #[tokio::test]
+    async fn test_eof_before_buffer_full() {
+        let buf = Vec::with_capacity(3);
+        let mock_read_at = Arc::new(tokio::sync::Mutex::new(MockReadAt {
+            expectations: VecDeque::from(vec![
+                Expectation {
+                    offset: 0,
+                    bytes_total: 3,
+                    result: Ok(vec![b'a']),
+                },
+                Expectation {
+                    offset: 1,
+                    bytes_total: 2,
+                    result: Ok(vec![b'b']),
+                },
+                Expectation {
+                    offset: 2,
+                    bytes_total: 1,
+                    result: Ok(vec![]),
+                },
+            ]),
+        }));
+        let (_buf, res) = read_exact_at_impl(buf, 0, |buf, offset| {
+            let mock_read_at = Arc::clone(&mock_read_at);
+            async move { mock_read_at.lock().await.read_at(buf, offset).await }
+        })
+        .await;
+        let Err(err) = res else {
+            panic!("should return an error");
+        };
+        assert_eq!(err.kind(), std::io::ErrorKind::UnexpectedEof);
+        assert_eq!(format!("{err}"), "failed to fill whole buffer");
+        // buffer contents on error are unspecified
+    }
+}
+
+struct FileGuard {
+    slot_guard: RwLockReadGuard<'static, SlotInner>,
+}
+
+impl AsRef<OwnedFd> for FileGuard {
+    fn as_ref(&self) -> &OwnedFd {
         // This unwrap is safe because we only create `FileGuard`s
         // if we know that the file is Some.
         self.slot_guard.file.as_ref().unwrap()
     }
 }
 
+impl FileGuard {
+    /// Soft deprecation: we'll move VirtualFile to async APIs and remove this function eventually.
+    fn with_std_file<F, R>(&self, with: F) -> R
+    where
+        F: FnOnce(&File) -> R,
+    {
+        // SAFETY:
+        // - lifetime of the fd: `file` doesn't outlive the OwnedFd stored in `self`.
+        // - `&` usage below: `self` is `&`, hence Rust typesystem guarantees there are is no `&mut`
+        let file = unsafe { File::from_raw_fd(self.as_ref().as_raw_fd()) };
+        let res = with(&file);
+        let _ = file.into_raw_fd();
+        res
+    }
+    /// Soft deprecation: we'll move VirtualFile to async APIs and remove this function eventually.
+    fn with_std_file_mut<F, R>(&mut self, with: F) -> R
+    where
+        F: FnOnce(&mut File) -> R,
+    {
+        // SAFETY:
+        // - lifetime of the fd: `file` doesn't outlive the OwnedFd stored in `self`.
+        // - &mut usage below: `self` is `&mut`, hence this call is the only task/thread that has control over the underlying fd
+        let mut file = unsafe { File::from_raw_fd(self.as_ref().as_raw_fd()) };
+        let res = with(&mut file);
+        let _ = file.into_raw_fd();
+        res
+    }
+}
+
+impl tokio_epoll_uring::IoFd for FileGuard {
+    unsafe fn as_fd(&self) -> RawFd {
+        let owned_fd: &OwnedFd = self.as_ref();
+        owned_fd.as_raw_fd()
+    }
+}
+
 #[cfg(test)]
 impl VirtualFile {
     pub(crate) async fn read_blk(
@@ -619,16 +901,19 @@ impl VirtualFile {
         blknum: u32,
     ) -> Result<crate::tenant::block_io::BlockLease<'_>, std::io::Error> {
         use crate::page_cache::PAGE_SZ;
-        let mut buf = [0; PAGE_SZ];
-        self.read_exact_at(&mut buf, blknum as u64 * (PAGE_SZ as u64))
+        let buf = vec![0; PAGE_SZ];
+        let buf = self
+            .read_exact_at(buf, blknum as u64 * (PAGE_SZ as u64))
             .await?;
-        Ok(std::sync::Arc::new(buf).into())
+        Ok(crate::tenant::block_io::BlockLease::Vec(buf))
     }
 
     async fn read_to_end(&mut self, buf: &mut Vec<u8>) -> Result<(), Error> {
+        let mut tmp = vec![0; 128];
         loop {
-            let mut tmp = [0; 128];
-            match self.read_at(&mut tmp, self.pos).await {
+            let res;
+            (tmp, res) = self.read_at(tmp, self.pos).await;
+            match res {
                 Ok(0) => return Ok(()),
                 Ok(n) => {
                     self.pos += n as u64;
@@ -704,10 +989,12 @@ impl OpenFiles {
 /// Initialize the virtual file module. This must be called once at page
 /// server startup.
 ///
-pub fn init(num_slots: usize) {
+#[cfg(not(test))]
+pub fn init(num_slots: usize, engine: IoEngineKind) {
     if OPEN_FILES.set(OpenFiles::new(num_slots)).is_err() {
         panic!("virtual_file::init called twice");
     }
+    io_engine::init(engine);
     crate::metrics::virtual_file_descriptor_cache::SIZE_MAX.set(num_slots as u64);
 }
 
@@ -752,10 +1039,10 @@ mod tests {
     }
 
     impl MaybeVirtualFile {
-        async fn read_exact_at(&self, buf: &mut [u8], offset: u64) -> Result<(), Error> {
+        async fn read_exact_at(&self, mut buf: Vec<u8>, offset: u64) -> Result<Vec<u8>, Error> {
             match self {
                 MaybeVirtualFile::VirtualFile(file) => file.read_exact_at(buf, offset).await,
-                MaybeVirtualFile::File(file) => file.read_exact_at(buf, offset),
+                MaybeVirtualFile::File(file) => file.read_exact_at(&mut buf, offset).map(|()| buf),
             }
         }
         async fn write_all_at(&self, buf: &[u8], offset: u64) -> Result<(), Error> {
@@ -797,14 +1084,14 @@ mod tests {
 
         // Helper function to slurp a portion of a file into a string
         async fn read_string_at(&mut self, pos: u64, len: usize) -> Result<String, Error> {
-            let mut buf = vec![0; len];
-            self.read_exact_at(&mut buf, pos).await?;
+            let buf = vec![0; len];
+            let buf = self.read_exact_at(buf, pos).await?;
             Ok(String::from_utf8(buf).unwrap())
         }
     }
 
     #[tokio::test]
-    async fn test_virtual_files() -> Result<(), Error> {
+    async fn test_virtual_files() -> anyhow::Result<()> {
         // The real work is done in the test_files() helper function. This
         // allows us to run the same set of tests against a native File, and
         // VirtualFile. We trust the native Files and wouldn't need to test them,
@@ -820,14 +1107,17 @@ mod tests {
     }
 
     #[tokio::test]
-    async fn test_physical_files() -> Result<(), Error> {
+    async fn test_physical_files() -> anyhow::Result<()> {
         test_files("physical_files", |path, open_options| async move {
-            Ok(MaybeVirtualFile::File(open_options.open(path)?))
+            Ok(MaybeVirtualFile::File({
+                let owned_fd = open_options.open(path.as_std_path()).await?;
+                File::from(owned_fd)
+            }))
         })
         .await
     }
 
-    async fn test_files<OF, FT>(testname: &str, openfunc: OF) -> Result<(), Error>
+    async fn test_files<OF, FT>(testname: &str, openfunc: OF) -> anyhow::Result<()>
     where
         OF: Fn(Utf8PathBuf, OpenOptions) -> FT,
         FT: Future<Output = Result<MaybeVirtualFile, std::io::Error>>,
@@ -971,11 +1261,11 @@ mod tests {
         for _threadno in 0..THREADS {
             let files = files.clone();
             let hdl = rt.spawn(async move {
-                let mut buf = [0u8; SIZE];
+                let mut buf = vec![0u8; SIZE];
                 let mut rng = rand::rngs::OsRng;
                 for _ in 1..1000 {
                     let f = &files[rng.gen_range(0..files.len())];
-                    f.read_exact_at(&mut buf, 0).await.unwrap();
+                    buf = f.read_exact_at(buf, 0).await.unwrap();
                     assert!(buf == SAMPLE);
                 }
             });
diff --git a/pageserver/src/virtual_file/io_engine.rs b/pageserver/src/virtual_file/io_engine.rs
new file mode 100644
index 0000000000..f7b46fe653
--- /dev/null
+++ b/pageserver/src/virtual_file/io_engine.rs
@@ -0,0 +1,114 @@
+//! [`super::VirtualFile`] supports different IO engines.
+//!
+//! The [`IoEngineKind`] enum identifies them.
+//!
+//! The choice of IO engine is global.
+//! Initialize using [`init`].
+//!
+//! Then use [`get`] and  [`super::OpenOptions`].
+
+#[derive(
+    Copy,
+    Clone,
+    PartialEq,
+    Eq,
+    Hash,
+    strum_macros::EnumString,
+    strum_macros::Display,
+    serde_with::DeserializeFromStr,
+    serde_with::SerializeDisplay,
+    Debug,
+)]
+#[strum(serialize_all = "kebab-case")]
+pub enum IoEngineKind {
+    StdFs,
+    #[cfg(target_os = "linux")]
+    TokioEpollUring,
+}
+
+static IO_ENGINE: once_cell::sync::OnceCell<IoEngineKind> = once_cell::sync::OnceCell::new();
+
+#[cfg(not(test))]
+pub(super) fn init(engine: IoEngineKind) {
+    if IO_ENGINE.set(engine).is_err() {
+        panic!("called twice");
+    }
+    crate::metrics::virtual_file_io_engine::KIND
+        .with_label_values(&[&format!("{engine}")])
+        .set(1);
+}
+
+pub(super) fn get() -> &'static IoEngineKind {
+    #[cfg(test)]
+    {
+        let env_var_name = "NEON_PAGESERVER_UNIT_TEST_VIRTUAL_FILE_IOENGINE";
+        IO_ENGINE.get_or_init(|| match std::env::var(env_var_name) {
+            Ok(v) => match v.parse::<IoEngineKind>() {
+                Ok(engine_kind) => engine_kind,
+                Err(e) => {
+                    panic!("invalid VirtualFile io engine for env var {env_var_name}: {e:#}: {v:?}")
+                }
+            },
+            Err(std::env::VarError::NotPresent) => {
+                crate::config::defaults::DEFAULT_VIRTUAL_FILE_IO_ENGINE
+                    .parse()
+                    .unwrap()
+            }
+            Err(std::env::VarError::NotUnicode(_)) => {
+                panic!("env var {env_var_name} is not unicode");
+            }
+        })
+    }
+    #[cfg(not(test))]
+    IO_ENGINE.get().unwrap()
+}
+
+use std::os::unix::prelude::FileExt;
+
+use super::FileGuard;
+
+impl IoEngineKind {
+    pub(super) async fn read_at<B>(
+        &self,
+        file_guard: FileGuard,
+        offset: u64,
+        mut buf: B,
+    ) -> ((FileGuard, B), std::io::Result<usize>)
+    where
+        B: tokio_epoll_uring::BoundedBufMut + Send,
+    {
+        match self {
+            IoEngineKind::StdFs => {
+                // SAFETY: `dst` only lives at most as long as this match arm, during which buf remains valid memory.
+                let dst = unsafe {
+                    std::slice::from_raw_parts_mut(buf.stable_mut_ptr(), buf.bytes_total())
+                };
+                let res = file_guard.with_std_file(|std_file| std_file.read_at(dst, offset));
+                if let Ok(nbytes) = &res {
+                    assert!(*nbytes <= buf.bytes_total());
+                    // SAFETY: see above assertion
+                    unsafe {
+                        buf.set_init(*nbytes);
+                    }
+                }
+                #[allow(dropping_references)]
+                drop(dst);
+                ((file_guard, buf), res)
+            }
+            #[cfg(target_os = "linux")]
+            IoEngineKind::TokioEpollUring => {
+                let system = tokio_epoll_uring::thread_local_system().await;
+                let (resources, res) = system.read(file_guard, offset, buf).await;
+                (
+                    resources,
+                    res.map_err(|e| match e {
+                        tokio_epoll_uring::Error::Op(e) => e,
+                        tokio_epoll_uring::Error::System(system) => {
+                            std::io::Error::new(std::io::ErrorKind::Other, system)
+                        }
+                    }),
+                )
+            }
+        }
+    }
+}
diff --git a/pageserver/src/virtual_file/open_options.rs b/pageserver/src/virtual_file/open_options.rs
new file mode 100644
index 0000000000..1e5ffe15cc
--- /dev/null
+++ b/pageserver/src/virtual_file/open_options.rs
@@ -0,0 +1,138 @@
+//! Enum-dispatch to the `OpenOptions` type of the respective [`super::IoEngineKind`];
+
+use super::IoEngineKind;
+use std::{os::fd::OwnedFd, path::Path};
+
+#[derive(Debug, Clone)]
+pub enum OpenOptions {
+    StdFs(std::fs::OpenOptions),
+    #[cfg(target_os = "linux")]
+    TokioEpollUring(tokio_epoll_uring::ops::open_at::OpenOptions),
+}
+
+impl Default for OpenOptions {
+    fn default() -> Self {
+        match super::io_engine::get() {
+            IoEngineKind::StdFs => Self::StdFs(std::fs::OpenOptions::new()),
+            #[cfg(target_os = "linux")]
+            IoEngineKind::TokioEpollUring => {
+                Self::TokioEpollUring(tokio_epoll_uring::ops::open_at::OpenOptions::new())
+            }
+        }
+    }
+}
+
+impl OpenOptions {
+    pub fn new() -> OpenOptions {
+        Self::default()
+    }
+
+    pub fn read(&mut self, read: bool) -> &mut OpenOptions {
+        match self {
+            OpenOptions::StdFs(x) => {
+                let _ = x.read(read);
+            }
+            #[cfg(target_os = "linux")]
+            OpenOptions::TokioEpollUring(x) => {
+                let _ = x.read(read);
+            }
+        }
+        self
+    }
+
+    pub fn write(&mut self, write: bool) -> &mut OpenOptions {
+        match self {
+            OpenOptions::StdFs(x) => {
+                let _ = x.write(write);
+            }
+            #[cfg(target_os = "linux")]
+            OpenOptions::TokioEpollUring(x) => {
+                let _ = x.write(write);
+            }
+        }
+        self
+    }
+
+    pub fn create(&mut self, create: bool) -> &mut OpenOptions {
+        match self {
+            OpenOptions::StdFs(x) => {
+                let _ = x.create(create);
+            }
+            #[cfg(target_os = "linux")]
+            OpenOptions::TokioEpollUring(x) => {
+                let _ = x.create(create);
+            }
+        }
+        self
+    }
+
+    pub fn create_new(&mut self, create_new: bool) -> &mut OpenOptions {
+        match self {
+            OpenOptions::StdFs(x) => {
+                let _ = x.create_new(create_new);
+            }
+            #[cfg(target_os = "linux")]
+            OpenOptions::TokioEpollUring(x) => {
+                let _ = x.create_new(create_new);
+            }
+        }
+        self
+    }
+
+    pub fn truncate(&mut self, truncate: bool) -> &mut OpenOptions {
+        match self {
+            OpenOptions::StdFs(x) => {
+                let _ = x.truncate(truncate);
+            }
+            #[cfg(target_os = "linux")]
+            OpenOptions::TokioEpollUring(x) => {
+                let _ = x.truncate(truncate);
+            }
+        }
+        self
+    }
+
+    pub(in crate::virtual_file) async fn open(&self, path: &Path) -> std::io::Result<OwnedFd> {
+        match self {
+            OpenOptions::StdFs(x) => x.open(path).map(|file| file.into()),
+            #[cfg(target_os = "linux")]
+            OpenOptions::TokioEpollUring(x) => {
+                let system = tokio_epoll_uring::thread_local_system().await;
+                system.open(path, x).await.map_err(|e| match e {
+                    tokio_epoll_uring::Error::Op(e) => e,
+                    tokio_epoll_uring::Error::System(system) => {
+                        std::io::Error::new(std::io::ErrorKind::Other, system)
+                    }
+                })
+            }
+        }
+    }
+}
+
+impl std::os::unix::prelude::OpenOptionsExt for OpenOptions {
+    fn mode(&mut self, mode: u32) -> &mut OpenOptions {
+        match self {
+            OpenOptions::StdFs(x) => {
+                let _ = x.mode(mode);
+            }
+            #[cfg(target_os = "linux")]
+            OpenOptions::TokioEpollUring(x) => {
+                let _ = x.mode(mode);
+            }
+        }
+        self
+    }
+
+    fn custom_flags(&mut self, flags: i32) -> &mut OpenOptions {
+        match self {
+            OpenOptions::StdFs(x) => {
+                let _ = x.custom_flags(flags);
+            }
+            #[cfg(target_os = "linux")]
+            OpenOptions::TokioEpollUring(x) => {
+                let _ = x.custom_flags(flags);
+            }
+        }
+        self
+    }
+}
diff --git a/scripts/flaky_tests.py b/scripts/flaky_tests.py
index b07e4bea9b..61a97f520d 100755
--- a/scripts/flaky_tests.py
+++ b/scripts/flaky_tests.py
@@ -3,6 +3,7 @@
 import argparse
 import json
 import logging
+import os
 from collections import defaultdict
 from typing import DefaultDict, Dict
 
@@ -45,6 +46,15 @@ def main(args: argparse.Namespace):
         logging.error("cannot fetch flaky tests from the DB due to an error", exc)
         rows = []
 
+    # If a test run has non-default PAGESERVER_VIRTUAL_FILE_IO_ENGINE (i.e. not empty, not std-fs),
+    # use it to parametrize test name along with build_type and pg_version
+    #
+    # See test_runner/fixtures/parametrize.py for details
+    if (io_engine := os.getenv("PAGESERVER_VIRTUAL_FILE_IO_ENGINE", "")) not in ("", "std-fs"):
+        pageserver_virtual_file_io_engine_parameter = f"-{io_engine}"
+    else:
+        pageserver_virtual_file_io_engine_parameter = ""
+
     for row in rows:
         # We don't want to automatically rerun tests in a performance suite
         if row["parent_suite"] != "test_runner.regress":
@@ -53,10 +63,10 @@ def main(args: argparse.Namespace):
         if row["name"].endswith("]"):
             parametrized_test = row["name"].replace(
                 "[",
-                f"[{build_type}-pg{pg_version}-",
+                f"[{build_type}-pg{pg_version}{pageserver_virtual_file_io_engine_parameter}-",
             )
         else:
-            parametrized_test = f"{row['name']}[{build_type}-pg{pg_version}]"
+            parametrized_test = f"{row['name']}[{build_type}-pg{pg_version}{pageserver_virtual_file_io_engine_parameter}]"
 
         res[row["parent_suite"]][row["suite"]][parametrized_test] = True
 
diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index fd5e77671b..142c97d5c3 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -446,6 +446,7 @@ class NeonEnvBuilder:
         preserve_database_files: bool = False,
         initial_tenant: Optional[TenantId] = None,
         initial_timeline: Optional[TimelineId] = None,
+        pageserver_virtual_file_io_engine: Optional[str] = None,
     ):
         self.repo_dir = repo_dir
         self.rust_log_override = rust_log_override
@@ -481,6 +482,8 @@ class NeonEnvBuilder:
         self.config_init_force: Optional[str] = None
         self.top_output_dir = top_output_dir
 
+        self.pageserver_virtual_file_io_engine: Optional[str] = pageserver_virtual_file_io_engine
+
         assert test_name.startswith(
             "test_"
         ), "Unexpectedly instantiated from outside a test function"
@@ -995,6 +998,8 @@ class NeonEnv:
             self, config.auth_enabled
         )
 
+        self.pageserver_virtual_file_io_engine = config.pageserver_virtual_file_io_engine
+
         # Create a config file corresponding to the options
         cfg: Dict[str, Any] = {
             "default_tenant_id": str(self.initial_tenant),
@@ -1026,6 +1031,9 @@ class NeonEnv:
                 "pg_auth_type": pg_auth_type,
                 "http_auth_type": http_auth_type,
             }
+            if self.pageserver_virtual_file_io_engine is not None:
+                ps_cfg["virtual_file_io_engine"] = self.pageserver_virtual_file_io_engine
+
             # Create a corresponding NeonPageserver object
             self.pageservers.append(
                 NeonPageserver(
@@ -1191,6 +1199,7 @@ def _shared_simple_env(
     neon_binpath: Path,
     pg_distrib_dir: Path,
     pg_version: PgVersion,
+    pageserver_virtual_file_io_engine: str,
 ) -> Iterator[NeonEnv]:
     """
     # Internal fixture backing the `neon_simple_env` fixture. If TEST_SHARED_FIXTURES
@@ -1220,6 +1229,7 @@ def _shared_simple_env(
         preserve_database_files=pytestconfig.getoption("--preserve-database-files"),
         test_name=request.node.name,
         test_output_dir=test_output_dir,
+        pageserver_virtual_file_io_engine=pageserver_virtual_file_io_engine,
     ) as builder:
         env = builder.init_start()
 
@@ -1258,6 +1268,7 @@ def neon_env_builder(
     request: FixtureRequest,
     test_overlay_dir: Path,
     top_output_dir: Path,
+    pageserver_virtual_file_io_engine: str,
 ) -> Iterator[NeonEnvBuilder]:
     """
     Fixture to create a Neon environment for test.
@@ -1287,6 +1298,7 @@ def neon_env_builder(
         broker=default_broker,
         run_id=run_id,
         preserve_database_files=pytestconfig.getoption("--preserve-database-files"),
+        pageserver_virtual_file_io_engine=pageserver_virtual_file_io_engine,
         test_name=request.node.name,
         test_output_dir=test_output_dir,
         test_overlay_dir=test_overlay_dir,
diff --git a/test_runner/fixtures/parametrize.py b/test_runner/fixtures/parametrize.py
index 53350138dd..d8ac92abb6 100644
--- a/test_runner/fixtures/parametrize.py
+++ b/test_runner/fixtures/parametrize.py
@@ -8,7 +8,7 @@ from _pytest.python import Metafunc
 from fixtures.pg_version import PgVersion
 
 """
-Dynamically parametrize tests by Postgres version and build type (debug/release/remote)
+Dynamically parametrize tests by Postgres version, build type (debug/release/remote), and possibly by other parameters
 """
 
 
@@ -31,11 +31,12 @@ def build_type(request: FixtureRequest) -> Optional[str]:
     return None
 
 
-def pytest_generate_tests(metafunc: Metafunc):
-    # Do not parametrize performance tests yet, we need to prepare grafana charts first
-    if "test_runner/performance" in metafunc.definition._nodeid:
-        return
+@pytest.fixture(scope="function", autouse=True)
+def pageserver_virtual_file_io_engine(request: FixtureRequest) -> Optional[str]:
+    return None
 
+
+def pytest_generate_tests(metafunc: Metafunc):
     if (v := os.environ.get("DEFAULT_PG_VERSION")) is None:
         pg_versions = [version for version in PgVersion if version != PgVersion.NOT_SET]
     else:
@@ -46,5 +47,12 @@ def pytest_generate_tests(metafunc: Metafunc):
     else:
         build_types = [bt.lower()]
 
-    metafunc.parametrize("build_type", build_types)
-    metafunc.parametrize("pg_version", pg_versions, ids=map(lambda v: f"pg{v}", pg_versions))
+    # Do not parametrize performance tests yet by Postgres version or build type, we need to prepare grafana charts first
+    if "test_runner/performance" not in metafunc.definition._nodeid:
+        metafunc.parametrize("build_type", build_types)
+        metafunc.parametrize("pg_version", pg_versions, ids=map(lambda v: f"pg{v}", pg_versions))
+
+    # A hacky way to parametrize tests only for `pageserver_virtual_file_io_engine=tokio-epoll-uring`
+    # And do not change test name for default `pageserver_virtual_file_io_engine=std-fs` to keep tests statistics
+    if (io_engine := os.environ.get("PAGESERVER_VIRTUAL_FILE_IO_ENGINE", "")) not in ("", "std-fs"):
+        metafunc.parametrize("pageserver_virtual_file_io_engine", [io_engine])

From 12e9b2a909d2382567b79fa4b27a879f0009b5ff Mon Sep 17 00:00:00 2001
From: Vadim Kharitonov <vadim2404@users.noreply.github.com>
Date: Fri, 26 Jan 2024 10:56:11 +0100
Subject: [PATCH 58/70] Update plv8 (#6465)

---
 Dockerfile.compute-node | 25 +++++++++----------------
 1 file changed, 9 insertions(+), 16 deletions(-)

diff --git a/Dockerfile.compute-node b/Dockerfile.compute-node
index a5c1f3157d..2414c089dc 100644
--- a/Dockerfile.compute-node
+++ b/Dockerfile.compute-node
@@ -144,30 +144,23 @@ RUN wget https://github.com/pgRouting/pgrouting/archive/v3.4.2.tar.gz -O pgrouti
 FROM build-deps AS plv8-build
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
 
-ARG PG_VERSION
 RUN apt update && \
     apt install -y ninja-build python3-dev libncurses5 binutils clang
 
-RUN case "${PG_VERSION}" in \
-      "v14" | "v15") \
-        export PLV8_VERSION=3.1.5 \
-        export PLV8_CHECKSUM=1e108d5df639e4c189e1c5bdfa2432a521c126ca89e7e5a969d46899ca7bf106 \
-        ;; \
-      "v16") \
-        export PLV8_VERSION=3.1.8 \
-        export PLV8_CHECKSUM=92b10c7db39afdae97ff748c9ec54713826af222c459084ad002571b79eb3f49 \
-        ;; \
-      *) \
-        echo "Export the valid PG_VERSION variable" && exit 1 \
-        ;; \
-    esac && \
-    wget https://github.com/plv8/plv8/archive/refs/tags/v${PLV8_VERSION}.tar.gz -O plv8.tar.gz && \
-    echo "${PLV8_CHECKSUM} plv8.tar.gz" | sha256sum --check && \
+RUN wget https://github.com/plv8/plv8/archive/refs/tags/v3.1.10.tar.gz -O plv8.tar.gz && \
+    echo "7096c3290928561f0d4901b7a52794295dc47f6303102fae3f8e42dd575ad97d plv8.tar.gz" | sha256sum --check && \
     mkdir plv8-src && cd plv8-src && tar xvzf ../plv8.tar.gz --strip-components=1 -C . && \
+    # generate and copy upgrade scripts
+    mkdir -p upgrade && ./generate_upgrade.sh 3.1.10 && \
+    cp upgrade/* /usr/local/pgsql/share/extension/ && \
     export PATH="/usr/local/pgsql/bin:$PATH" && \
     make DOCKER=1 -j $(getconf _NPROCESSORS_ONLN) install && \
     rm -rf /plv8-* && \
     find /usr/local/pgsql/ -name "plv8-*.so" | xargs strip && \
+    # don't break computes with installed old version of plv8
+    cd /usr/local/pgsql/lib/ && \
+    ln -s plv8-3.1.10.so plv8-3.1.5.so && \
+    ln -s plv8-3.1.10.so plv8-3.1.8.so && \
     echo 'trusted = true' >> /usr/local/pgsql/share/extension/plv8.control && \
     echo 'trusted = true' >> /usr/local/pgsql/share/extension/plcoffee.control && \
     echo 'trusted = true' >> /usr/local/pgsql/share/extension/plls.control

From 26c55b025582e6fbe373a2c5cb489127a2014c62 Mon Sep 17 00:00:00 2001
From: Alexander Bayandin <alexander@neon.tech>
Date: Fri, 26 Jan 2024 12:39:20 +0000
Subject: [PATCH 59/70] Compute: fix rdkit extension build (#6488)

## Problem

`rdkit` extension build started to fail because of the changed checksum
of the Comic Neue font:

```
Downloading https://fonts.google.com/download?family=Comic%20Neue...
CMake Error at Code/cmake/Modules/RDKitUtils.cmake:257 (MESSAGE):
  The md5 checksum for /rdkit-src/Code/GraphMol/MolDraw2D/Comic_Neue.zip is
  incorrect; expected: 850b0df852f1cda4970887b540f8f333, found:
  b7fd0df73ad4637504432d72a0accb8f
```

https://github.com/neondatabase/neon/actions/runs/7666530536/job/20895534826

Ref https://neondb.slack.com/archives/C059ZC138NR/p1706265392422469

## Summary of changes
- Disable comic fonts for `rdkit` extension
---
 Dockerfile.compute-node | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/Dockerfile.compute-node b/Dockerfile.compute-node
index 2414c089dc..299c4097e8 100644
--- a/Dockerfile.compute-node
+++ b/Dockerfile.compute-node
@@ -52,7 +52,7 @@ RUN cd postgres && \
     # We need to grant EXECUTE on pg_stat_statements_reset() to neon_superuser.
     # In vanilla postgres this function is limited to Postgres role superuser.
     # In neon we have neon_superuser role that is not a superuser but replaces superuser in some cases.
-    # We could add the additional grant statements to the postgres repository but it would be hard to maintain, 
+    # We could add the additional grant statements to the postgres repository but it would be hard to maintain,
     # whenever we need to pick up a new postgres version and we want to limit the changes in our postgres fork,
     # so we do it here.
     old_list="pg_stat_statements--1.0--1.1.sql pg_stat_statements--1.1--1.2.sql pg_stat_statements--1.2--1.3.sql pg_stat_statements--1.3--1.4.sql pg_stat_statements--1.4--1.5.sql pg_stat_statements--1.4.sql pg_stat_statements--1.5--1.6.sql"; \
@@ -63,14 +63,14 @@ RUN cd postgres && \
             echo 'GRANT EXECUTE ON FUNCTION pg_stat_statements_reset() TO neon_superuser;' >> $file; \
         fi; \
     done; \
-    # the second loop is for pg_stat_statement extension versions >= 1.7, 
+    # the second loop is for pg_stat_statement extension versions >= 1.7,
     # where pg_stat_statement_reset() got 3 additional arguments
     for file in /usr/local/pgsql/share/extension/pg_stat_statements--*.sql; do \
         filename=$(basename "$file"); \
         if ! echo "$old_list" | grep -q -F "$filename"; then \
             echo 'GRANT EXECUTE ON FUNCTION pg_stat_statements_reset(Oid, Oid, bigint) TO neon_superuser;' >> $file; \
         fi; \
-    done      
+    done
 
 #########################################################################################
 #
@@ -546,6 +546,7 @@ RUN wget https://github.com/rdkit/rdkit/archive/refs/tags/Release_2023_03_3.tar.
         -D PostgreSQL_TYPE_INCLUDE_DIR=`pg_config --includedir-server` \
         -D PostgreSQL_LIBRARY_DIR=`pg_config --libdir` \
         -D RDK_INSTALL_INTREE=OFF \
+        -D RDK_INSTALL_COMIC_FONTS=OFF \
         -D CMAKE_BUILD_TYPE=Release \
         . && \
     make -j $(getconf _NPROCESSORS_ONLN) && \

From 5b34d5f561f4565dd22dd5e9be58a9844c6a5476 Mon Sep 17 00:00:00 2001
From: Vlad Lazar <vlad@neon.tech>
Date: Fri, 26 Jan 2024 13:40:03 +0000
Subject: [PATCH 60/70] pageserver: add vectored get latency histogram (#6461)

This patch introduces a new set of grafana metrics for a histogram:
pageserver_get_vectored_seconds_bucket{task_kind="Compaction|PageRequestHandler"}.

While it has a `task_kind` label, only compaction and SLRU fetches are
tracked. This reduces the increase in cardinality to 24.

The metric should allow us to isolate performance regressions while the
vectorized get is being implemented. Once the implementation is
complete, it'll also allow us to quantify the improvements.
---
 pageserver/src/metrics.rs         | 37 +++++++++++++++++++++++++++++++
 pageserver/src/tenant/timeline.rs |  4 ++++
 2 files changed, 41 insertions(+)

diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs
index 2cfa77f1c5..9b3679e3c2 100644
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -150,6 +150,43 @@ pub(crate) static MATERIALIZED_PAGE_CACHE_HIT: Lazy<IntCounter> = Lazy::new(|| {
     .expect("failed to define a metric")
 });
 
+pub(crate) struct GetVectoredLatency {
+    map: EnumMap<TaskKind, Option<Histogram>>,
+}
+
+impl GetVectoredLatency {
+    // Only these task types perform vectored gets. Filter all other tasks out to reduce total
+    // cardinality of the metric.
+    const TRACKED_TASK_KINDS: [TaskKind; 2] = [TaskKind::Compaction, TaskKind::PageRequestHandler];
+
+    pub(crate) fn for_task_kind(&self, task_kind: TaskKind) -> Option<&Histogram> {
+        self.map[task_kind].as_ref()
+    }
+}
+
+pub(crate) static GET_VECTORED_LATENCY: Lazy<GetVectoredLatency> = Lazy::new(|| {
+    let inner = register_histogram_vec!(
+        "pageserver_get_vectored_seconds",
+        "Time spent in get_vectored",
+        &["task_kind"],
+        CRITICAL_OP_BUCKETS.into(),
+    )
+    .expect("failed to define a metric");
+
+    GetVectoredLatency {
+        map: EnumMap::from_array(std::array::from_fn(|task_kind_idx| {
+            let task_kind = <TaskKind as enum_map::Enum>::from_usize(task_kind_idx);
+
+            if GetVectoredLatency::TRACKED_TASK_KINDS.contains(&task_kind) {
+                let task_kind = task_kind.into();
+                Some(inner.with_label_values(&[task_kind]))
+            } else {
+                None
+            }
+        })),
+    }
+});
+
 pub(crate) struct PageCacheMetricsForTaskKind {
     pub read_accesses_materialized_page: IntCounter,
     pub read_accesses_immutable: IntCounter,
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index c21fe94d01..70c6ee2042 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -678,6 +678,10 @@ impl Timeline {
             return Err(GetVectoredError::Oversized(key_count));
         }
 
+        let _timer = crate::metrics::GET_VECTORED_LATENCY
+            .for_task_kind(ctx.task_kind())
+            .map(|t| t.start_timer());
+
         let mut values = BTreeMap::new();
         for range in key_ranges {
             let mut key = range.start;

From 55b7cde665294e4dfcfd0898c26f42c6c6b88d57 Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Fri, 26 Jan 2024 14:40:47 +0000
Subject: [PATCH 61/70] tests: add basic coverage for sharding (#6380)

## Problem

The support for sharding in the pageserver was written before
https://github.com/neondatabase/neon/pull/6205 landed, so when it landed
we couldn't directly test sharding.

## Summary of changes

- Add `test_sharding_smoke` which tests the basics of creating a
sharding tenant, creating a timeline within it, checking that data
within it is distributed.
- Add modes to pg_regress tests for running with 4 shards as well as
with 1.
---
 pageserver/src/walingest.rs                   | 18 +++-
 test_runner/fixtures/workload.py              | 13 ++-
 .../regress/test_pageserver_restart.py        | 24 ++++--
 test_runner/regress/test_pg_regress.py        | 53 ++++++++----
 test_runner/regress/test_sharding.py          | 85 +++++++++++++++++++
 5 files changed, 170 insertions(+), 23 deletions(-)
 create mode 100644 test_runner/regress/test_sharding.py

diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs
index 3183608862..5a6f9a590f 100644
--- a/pageserver/src/walingest.rs
+++ b/pageserver/src/walingest.rs
@@ -1033,7 +1033,23 @@ impl WalIngest {
             // Copy content
             debug!("copying rel {} to {}, {} blocks", src_rel, dst_rel, nblocks);
             for blknum in 0..nblocks {
-                debug!("copying block {} from {} to {}", blknum, src_rel, dst_rel);
+                // Sharding:
+                //  - src and dst are always on the same shard, because they differ only by dbNode, and
+                //    dbNode is not included in the hash inputs for sharding.
+                //  - This WAL command is replayed on all shards, but each shard only copies the blocks
+                //    that belong to it.
+                let src_key = rel_block_to_key(src_rel, blknum);
+                if !self.shard.is_key_local(&src_key) {
+                    debug!(
+                        "Skipping non-local key {} during XLOG_DBASE_CREATE",
+                        src_key
+                    );
+                    continue;
+                }
+                debug!(
+                    "copying block {} from {} ({}) to {}",
+                    blknum, src_rel, src_key, dst_rel
+                );
 
                 let content = modification
                     .tline
diff --git a/test_runner/fixtures/workload.py b/test_runner/fixtures/workload.py
index 30def1194d..f29a6cbf3c 100644
--- a/test_runner/fixtures/workload.py
+++ b/test_runner/fixtures/workload.py
@@ -21,12 +21,21 @@ class Workload:
     - reads, checking we get the right data (`validate`)
     """
 
-    def __init__(self, env: NeonEnv, tenant_id: TenantId, timeline_id: TimelineId):
+    def __init__(
+        self,
+        env: NeonEnv,
+        tenant_id: TenantId,
+        timeline_id: TimelineId,
+        branch_name: Optional[str] = None,
+    ):
         self.env = env
         self.tenant_id = tenant_id
         self.timeline_id = timeline_id
         self.table = "foo"
 
+        # By default, use the default branch name for initial tenant in NeonEnv
+        self.branch_name = branch_name or "main"
+
         self.expect_rows = 0
         self.churn_cursor = 0
 
@@ -35,7 +44,7 @@ class Workload:
     def endpoint(self, pageserver_id: Optional[int] = None) -> Endpoint:
         if self._endpoint is None:
             self._endpoint = self.env.endpoints.create(
-                "main",
+                self.branch_name,
                 tenant_id=self.tenant_id,
                 pageserver_id=pageserver_id,
                 endpoint_id="ep-workload",
diff --git a/test_runner/regress/test_pageserver_restart.py b/test_runner/regress/test_pageserver_restart.py
index c4499196b5..753898f747 100644
--- a/test_runner/regress/test_pageserver_restart.py
+++ b/test_runner/regress/test_pageserver_restart.py
@@ -1,4 +1,6 @@
+import random
 from contextlib import closing
+from typing import Optional
 
 import pytest
 from fixtures.log_helper import log
@@ -141,18 +143,24 @@ def test_pageserver_restart(neon_env_builder: NeonEnvBuilder):
 # Test that repeatedly kills and restarts the page server, while the
 # safekeeper and compute node keep running.
 @pytest.mark.timeout(540)
-def test_pageserver_chaos(neon_env_builder: NeonEnvBuilder, build_type: str):
+@pytest.mark.parametrize("shard_count", [None, 4])
+def test_pageserver_chaos(
+    neon_env_builder: NeonEnvBuilder, build_type: str, shard_count: Optional[int]
+):
     if build_type == "debug":
         pytest.skip("times out in debug builds")
 
     neon_env_builder.enable_pageserver_remote_storage(s3_storage())
     neon_env_builder.enable_scrub_on_exit()
+    if shard_count is not None:
+        neon_env_builder.num_pageservers = shard_count
 
-    env = neon_env_builder.init_start()
+    env = neon_env_builder.init_start(initial_tenant_shard_count=shard_count)
 
     # these can happen, if we shutdown at a good time. to be fixed as part of #5172.
     message = ".*duplicated L1 layer layer=.*"
-    env.pageserver.allowed_errors.append(message)
+    for ps in env.pageservers:
+        ps.allowed_errors.append(message)
 
     # Use a tiny checkpoint distance, to create a lot of layers quickly.
     # That allows us to stress the compaction and layer flushing logic more.
@@ -192,13 +200,19 @@ def test_pageserver_chaos(neon_env_builder: NeonEnvBuilder, build_type: str):
             log.info(f"shared_buffers is {row[0]}, table size {row[1]}")
             assert int(row[0]) < int(row[1])
 
+    # We run "random" kills using a fixed seed, to improve reproducibility if a test
+    # failure is related to a particular order of operations.
+    seed = 0xDEADBEEF
+    rng = random.Random(seed)
+
     # Update the whole table, then immediately kill and restart the pageserver
     for i in range(1, 15):
         endpoint.safe_psql("UPDATE foo set updates = updates + 1")
 
         # This kills the pageserver immediately, to simulate a crash
-        env.pageserver.stop(immediate=True)
-        env.pageserver.start()
+        to_kill = rng.choice(env.pageservers)
+        to_kill.stop(immediate=True)
+        to_kill.start()
 
         # Check that all the updates are visible
         num_updates = endpoint.safe_psql("SELECT sum(updates) FROM foo")[0][0]
diff --git a/test_runner/regress/test_pg_regress.py b/test_runner/regress/test_pg_regress.py
index f26d04e2f3..e4219ec7a6 100644
--- a/test_runner/regress/test_pg_regress.py
+++ b/test_runner/regress/test_pg_regress.py
@@ -2,25 +2,40 @@
 # This file runs pg_regress-based tests.
 #
 from pathlib import Path
+from typing import Optional
 
-from fixtures.neon_fixtures import NeonEnv, check_restored_datadir_content
+import pytest
+from fixtures.neon_fixtures import (
+    NeonEnvBuilder,
+    check_restored_datadir_content,
+)
+from fixtures.remote_storage import s3_storage
 
 
 # Run the main PostgreSQL regression tests, in src/test/regress.
 #
+@pytest.mark.parametrize("shard_count", [None, 4])
 def test_pg_regress(
-    neon_simple_env: NeonEnv,
+    neon_env_builder: NeonEnvBuilder,
     test_output_dir: Path,
     pg_bin,
     capsys,
     base_dir: Path,
     pg_distrib_dir: Path,
+    shard_count: Optional[int],
 ):
-    env = neon_simple_env
+    """
+    :param shard_count: if None, create an unsharded tenant.  Otherwise create a tenant with this
+                        many shards.
+    """
+    if shard_count is not None:
+        neon_env_builder.num_pageservers = shard_count
+    neon_env_builder.enable_pageserver_remote_storage(s3_storage())
+    neon_env_builder.enable_scrub_on_exit()
+    env = neon_env_builder.init_start(initial_tenant_shard_count=shard_count)
 
-    env.neon_cli.create_branch("test_pg_regress", "empty")
     # Connect to postgres and create a database called "regression".
-    endpoint = env.endpoints.create_start("test_pg_regress")
+    endpoint = env.endpoints.create_start("main")
     endpoint.safe_psql("CREATE DATABASE regression")
 
     # Create some local directories for pg_regress to run in.
@@ -61,22 +76,25 @@ def test_pg_regress(
 
 # Run the PostgreSQL "isolation" tests, in src/test/isolation.
 #
+@pytest.mark.parametrize("shard_count", [None, 4])
 def test_isolation(
-    neon_simple_env: NeonEnv,
+    neon_env_builder: NeonEnvBuilder,
     test_output_dir: Path,
     pg_bin,
     capsys,
     base_dir: Path,
     pg_distrib_dir: Path,
+    shard_count: Optional[int],
 ):
-    env = neon_simple_env
+    if shard_count is not None:
+        neon_env_builder.num_pageservers = shard_count
+    neon_env_builder.enable_pageserver_remote_storage(s3_storage())
+    neon_env_builder.enable_scrub_on_exit()
+    env = neon_env_builder.init_start(initial_tenant_shard_count=shard_count)
 
-    env.neon_cli.create_branch("test_isolation", "empty")
     # Connect to postgres and create a database called "regression".
     # isolation tests use prepared transactions, so enable them
-    endpoint = env.endpoints.create_start(
-        "test_isolation", config_lines=["max_prepared_transactions=100"]
-    )
+    endpoint = env.endpoints.create_start("main", config_lines=["max_prepared_transactions=100"])
     endpoint.safe_psql("CREATE DATABASE isolation_regression")
 
     # Create some local directories for pg_isolation_regress to run in.
@@ -114,19 +132,24 @@ def test_isolation(
 
 # Run extra Neon-specific pg_regress-based tests. The tests and their
 # schedule file are in the sql_regress/ directory.
+@pytest.mark.parametrize("shard_count", [None, 4])
 def test_sql_regress(
-    neon_simple_env: NeonEnv,
+    neon_env_builder: NeonEnvBuilder,
     test_output_dir: Path,
     pg_bin,
     capsys,
     base_dir: Path,
     pg_distrib_dir: Path,
+    shard_count: Optional[int],
 ):
-    env = neon_simple_env
+    if shard_count is not None:
+        neon_env_builder.num_pageservers = shard_count
+    neon_env_builder.enable_pageserver_remote_storage(s3_storage())
+    neon_env_builder.enable_scrub_on_exit()
+    env = neon_env_builder.init_start(initial_tenant_shard_count=shard_count)
 
-    env.neon_cli.create_branch("test_sql_regress", "empty")
     # Connect to postgres and create a database called "regression".
-    endpoint = env.endpoints.create_start("test_sql_regress")
+    endpoint = env.endpoints.create_start("main")
     endpoint.safe_psql("CREATE DATABASE regression")
 
     # Create some local directories for pg_regress to run in.
diff --git a/test_runner/regress/test_sharding.py b/test_runner/regress/test_sharding.py
new file mode 100644
index 0000000000..c16bfc2ec6
--- /dev/null
+++ b/test_runner/regress/test_sharding.py
@@ -0,0 +1,85 @@
+from fixtures.log_helper import log
+from fixtures.neon_fixtures import (
+    NeonEnvBuilder,
+)
+from fixtures.remote_storage import s3_storage
+from fixtures.types import TimelineId
+from fixtures.workload import Workload
+
+
+def test_sharding_smoke(
+    neon_env_builder: NeonEnvBuilder,
+):
+    """
+    Test the basic lifecycle of a sharded tenant:
+     - ingested data gets split up
+     - page service reads
+     - timeline creation and deletion
+     - splits
+    """
+
+    shard_count = 4
+    neon_env_builder.num_pageservers = shard_count
+
+    # 1MiB stripes: enable getting some meaningful data distribution without
+    # writing large quantities of data in this test.  The stripe size is given
+    # in number of 8KiB pages.
+    stripe_size = 128
+
+    # Use S3-compatible remote storage so that we can scrub: this test validates
+    # that the scrubber doesn't barf when it sees a sharded tenant.
+    neon_env_builder.enable_pageserver_remote_storage(s3_storage())
+    neon_env_builder.enable_scrub_on_exit()
+
+    neon_env_builder.preserve_database_files = True
+
+    env = neon_env_builder.init_start(
+        initial_tenant_shard_count=shard_count, initial_tenant_shard_stripe_size=stripe_size
+    )
+    tenant_id = env.initial_tenant
+
+    pageservers = dict((int(p.id), p) for p in env.pageservers)
+    shards = env.attachment_service.locate(tenant_id)
+
+    def get_sizes():
+        sizes = {}
+        for shard in shards:
+            node_id = int(shard["node_id"])
+            pageserver = pageservers[node_id]
+            sizes[node_id] = pageserver.http_client().tenant_status(shard["shard_id"])[
+                "current_physical_size"
+            ]
+        log.info(f"sizes = {sizes}")
+        return sizes
+
+    # Test that timeline creation works on a sharded tenant
+    timeline_b = env.neon_cli.create_branch("branch_b", tenant_id=tenant_id)
+
+    # Test that we can write data to a sharded tenant
+    workload = Workload(env, tenant_id, timeline_b, branch_name="branch_b")
+    workload.init()
+
+    sizes_before = get_sizes()
+    workload.write_rows(256)
+
+    # Test that we can read data back from a sharded tenant
+    workload.validate()
+
+    # Validate that the data is spread across pageservers
+    sizes_after = get_sizes()
+    # Our sizes increased when we wrote data
+    assert sum(sizes_after.values()) > sum(sizes_before.values())
+    # That increase is present on all shards
+    assert all(sizes_after[ps.id] > sizes_before[ps.id] for ps in env.pageservers)
+
+    # Validate that timeline list API works properly on all shards
+    for shard in shards:
+        node_id = int(shard["node_id"])
+        pageserver = pageservers[node_id]
+        timelines = set(
+            TimelineId(tl["timeline_id"])
+            for tl in pageserver.http_client().timeline_list(shard["shard_id"])
+        )
+        assert timelines == {env.initial_timeline, timeline_b}
+
+    # TODO: test timeline deletion and tenant deletion (depends on change in attachment_service)

From 4c245b0f5abda55083bd9e4a87375185cc1f3528 Mon Sep 17 00:00:00 2001
From: Alexander Bayandin <alexander@neon.tech>
Date: Fri, 26 Jan 2024 16:12:49 +0000
Subject: [PATCH 62/70] update_build_tools_image.yml: Push build-tools image to
 Docker Hub (#6481)

## Problem

- `docker.io/neondatabase/build-tools:pinned` image is frequently
outdated on Docker Hub because there's no automated way to update it.
- `update_build_tools_image.yml` workflow contains legacy roll-back
logic, which is not required anymore because it updates only a single
image.

## Summary of changes
- Make `update_build_tools_image.yml` workflow push images to both ECR
and Docker Hub
- Remove unneeded roll-back logic
---
 .../workflows/build_and_push_docker_image.yml |  25 +++-
 .../workflows/update_build_tools_image.yml    | 122 +++++-------------
 2 files changed, 53 insertions(+), 94 deletions(-)

diff --git a/.github/workflows/build_and_push_docker_image.yml b/.github/workflows/build_and_push_docker_image.yml
index e401b2f418..892e21114b 100644
--- a/.github/workflows/build_and_push_docker_image.yml
+++ b/.github/workflows/build_and_push_docker_image.yml
@@ -69,7 +69,15 @@ jobs:
         run: echo "{\"credsStore\":\"ecr-login\"}" > /kaniko/.docker/config.json
 
       - name: Kaniko build
-        run: /kaniko/executor --reproducible --snapshotMode=redo --skip-unused-stages --dockerfile ${{ inputs.dockerfile-path }} --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache  --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }}-amd64
+        run: |
+          /kaniko/executor \
+            --reproducible \
+            --snapshotMode=redo \
+            --skip-unused-stages \
+            --dockerfile ${{ inputs.dockerfile-path }} \
+            --cache=true \
+            --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache \
+            --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }}-amd64
 
   kaniko-arm:
     if: needs.check-if-build-tools-dockerfile-changed.outputs.docker_file_changed == 'true'
@@ -85,7 +93,15 @@ jobs:
         run: echo "{\"credsStore\":\"ecr-login\"}" > /kaniko/.docker/config.json
 
       - name: Kaniko build
-        run: /kaniko/executor --reproducible --snapshotMode=redo --skip-unused-stages --dockerfile ${{ inputs.dockerfile-path }} --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }}-arm64
+        run: |
+          /kaniko/executor \
+            --reproducible \
+            --snapshotMode=redo \
+            --skip-unused-stages \
+            --dockerfile ${{ inputs.dockerfile-path }} \
+            --cache=true \
+            --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache \
+            --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }}-arm64
 
   manifest:
     if: needs.check-if-build-tools-dockerfile-changed.outputs.docker_file_changed == 'true'
@@ -99,7 +115,10 @@ jobs:
 
     steps:
       - name: Create manifest
-        run: docker manifest create 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }} --amend 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }}-amd64 --amend 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }}-arm64
+        run: |
+          docker manifest create 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }} \
+                         --amend 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }}-amd64 \
+                         --amend 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }}-arm64
 
       - name: Push manifest
         run: docker manifest push 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }}
diff --git a/.github/workflows/update_build_tools_image.yml b/.github/workflows/update_build_tools_image.yml
index 88bab797b7..900724fc60 100644
--- a/.github/workflows/update_build_tools_image.yml
+++ b/.github/workflows/update_build_tools_image.yml
@@ -20,111 +20,51 @@ defaults:
   run:
     shell: bash -euo pipefail {0}
 
-env:
-  AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }}
-  AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }}
-
 permissions: {}
 
 jobs:
   tag-image:
     runs-on: [ self-hosted, gen3, small ]
-    container: golang:1.19-bullseye
 
     env:
-      IMAGE: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools
-      FROM_TAG: ${{ inputs.from-tag }}
-      TO_TAG: ${{ inputs.to-tag }}
-    outputs:
-      next-digest-buildtools: ${{ steps.next-digest.outputs.next-digest-buildtools }}
-      prev-digest-buildtools: ${{ steps.prev-digest.outputs.prev-digest-buildtools }}
-
-    steps:
-      - name: Install Crane & ECR helper
-        run: |
-          go install github.com/google/go-containerregistry/cmd/crane@a54d64203cffcbf94146e04069aae4a97f228ee2 # v0.16.1
-          go install github.com/awslabs/amazon-ecr-credential-helper/ecr-login/cli/docker-credential-ecr-login@adf1bafd791ae7d4ff098108b1e91f36a4da5404 # v0.7.1
-
-      - name: Configure ECR login
-        run: |
-          mkdir /github/home/.docker/
-          echo "{\"credsStore\":\"ecr-login\"}" > /github/home/.docker/config.json
-
-      - name: Get source image digest
-        id: next-digest
-        run: |
-          NEXT_DIGEST=$(crane digest ${IMAGE}:${FROM_TAG} || true)
-          if [ -z "${NEXT_DIGEST}" ]; then
-            echo >&2 "Image ${IMAGE}:${FROM_TAG} does not exist"
-            exit 1
-          fi
-
-          echo "Current ${IMAGE}@${FROM_TAG} image is ${IMAGE}@${NEXT_DIGEST}"
-          echo "next-digest-buildtools=$NEXT_DIGEST" >> $GITHUB_OUTPUT
-
-      - name: Get destination image digest (if already exists)
-        id: prev-digest
-        run: |
-          PREV_DIGEST=$(crane digest ${IMAGE}:${TO_TAG} || true)
-          if [ -z "${PREV_DIGEST}" ]; then
-            echo >&2 "Image ${IMAGE}:${TO_TAG} does not exist (it's ok)"
-          else
-            echo >&2 "Current ${IMAGE}@${TO_TAG} image is ${IMAGE}@${PREV_DIGEST}"
-
-            echo "prev-digest-buildtools=$PREV_DIGEST" >> $GITHUB_OUTPUT
-          fi
-
-      - name: Tag image
-        run: |
-          crane tag "${IMAGE}:${FROM_TAG}" "${TO_TAG}"
-
-  rollback-tag-image:
-    needs:  tag-image
-    if: ${{ !success() }}
-
-    runs-on: [ self-hosted, gen3, small ]
-    container: golang:1.19-bullseye
-
-    env:
-      IMAGE: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools
+      ECR_IMAGE: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools
+      DOCKER_HUB_IMAGE: docker.io/neondatabase/build-tools
       FROM_TAG: ${{ inputs.from-tag }}
       TO_TAG: ${{ inputs.to-tag }}
 
     steps:
-      - name: Install Crane & ECR helper
+      # Use custom DOCKER_CONFIG directory to avoid conflicts with default settings
+      # The default value is ~/.docker
+      - name: Set custom docker config directory
         run: |
-          go install github.com/google/go-containerregistry/cmd/crane@a54d64203cffcbf94146e04069aae4a97f228ee2 # v0.16.1
-          go install github.com/awslabs/amazon-ecr-credential-helper/ecr-login/cli/docker-credential-ecr-login@adf1bafd791ae7d4ff098108b1e91f36a4da5404 # v0.7.1
+          mkdir -p .docker-custom
+          echo DOCKER_CONFIG=$(pwd)/.docker-custom >> $GITHUB_ENV
 
-      - name: Configure ECR login
+      - uses: docker/login-action@v2
+        with:
+          username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
+          password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
+
+      - uses: docker/login-action@v2
+        with:
+          registry: 369495373322.dkr.ecr.eu-central-1.amazonaws.com
+          username: ${{ secrets.AWS_ACCESS_KEY_DEV }}
+          password: ${{ secrets.AWS_SECRET_KEY_DEV }}
+
+      - uses: actions/setup-go@v5
+        with:
+          go-version: '1.21'
+
+      - name: Install crane
         run: |
-          mkdir /github/home/.docker/
-          echo "{\"credsStore\":\"ecr-login\"}" > /github/home/.docker/config.json
+          go install github.com/google/go-containerregistry/cmd/crane@a0658aa1d0cc7a7f1bcc4a3af9155335b6943f40 # v0.18.0
 
-      - name: Restore previous tag if needed
+      - name: Copy images
         run: |
-          NEXT_DIGEST="${{ needs.tag-image.outputs.next-digest-buildtools }}"
-          PREV_DIGEST="${{ needs.tag-image.outputs.prev-digest-buildtools }}"
+          crane copy "${ECR_IMAGE}:${FROM_TAG}" "${ECR_IMAGE}:${TO_TAG}"
+          crane copy "${ECR_IMAGE}:${FROM_TAG}" "${DOCKER_HUB_IMAGE}:${TO_TAG}"
 
-          if [ -z "${NEXT_DIGEST}" ]; then
-            echo >&2 "Image ${IMAGE}:${FROM_TAG} does not exist, nothing to rollback"
-            exit 0
-          fi
-
-          if [ -z "${PREV_DIGEST}" ]; then
-            # I guess we should delete the tag here/untag the image, but crane does not support it
-            # - https://github.com/google/go-containerregistry/issues/999
-
-            echo >&2 "Image ${IMAGE}:${TO_TAG} did not exist, but it was created by the job, no need to rollback"
-
-            exit 0
-          fi
-
-          CURRENT_DIGEST=$(crane digest "${IMAGE}:${TO_TAG}")
-          if [ "${CURRENT_DIGEST}" == "${NEXT_DIGEST}" ]; then
-            crane tag "${IMAGE}@${PREV_DIGEST}" "${TO_TAG}"
-
-            echo >&2 "Successfully restored ${TO_TAG} tag from ${IMAGE}@${CURRENT_DIGEST} to ${IMAGE}@${PREV_DIGEST}"
-          else
-            echo >&2 "Image ${IMAGE}:${TO_TAG}@${CURRENT_DIGEST} is not required to be restored"
-          fi
+      - name: Remove custom docker config directory
+        if: always()
+        run: |
+          rm -rf .docker-custom

From dcc7610ad67c4a1d9f00c884a044f33c0b4d1de0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Arpad=20M=C3=BCller?= <arpad-m@users.noreply.github.com>
Date: Fri, 26 Jan 2024 17:43:56 +0100
Subject: [PATCH 63/70] Do backoff::retry in s3 timetravel test (#6493)

The top level retries weren't enough, probably because we do so many
network requests. Fine grained retries ensure that there is higher
potential for the entire test to succeed.

To demonstrate this, consider the following example: let's assume that
each request has 5% chance of failing and we do 10 requests. Then
chances of success without any retries is 0.95^10 = 0.6. With 3 top
level retries it is 1-0.4^3 = 0.936. With 3 fine grained retries it is
(1-0.05^3)^10 = 0.9988 (roundings implicit). So chances of failure are
6.4% for the top level retry vs 0.12% for the fine grained retry.

Follow-up of #6155
---
 libs/remote_storage/tests/test_real_s3.rs | 62 ++++++++++++++++++-----
 1 file changed, 49 insertions(+), 13 deletions(-)

diff --git a/libs/remote_storage/tests/test_real_s3.rs b/libs/remote_storage/tests/test_real_s3.rs
index 9e1b989e4d..679be66bf7 100644
--- a/libs/remote_storage/tests/test_real_s3.rs
+++ b/libs/remote_storage/tests/test_real_s3.rs
@@ -1,4 +1,5 @@
 use std::env;
+use std::fmt::{Debug, Display};
 use std::num::NonZeroUsize;
 use std::ops::ControlFlow;
 use std::sync::Arc;
@@ -8,6 +9,7 @@ use std::{collections::HashSet, time::SystemTime};
 use crate::common::{download_to_vec, upload_stream};
 use anyhow::Context;
 use camino::Utf8Path;
+use futures_util::Future;
 use remote_storage::{
     GenericRemoteStorage, RemotePath, RemoteStorageConfig, RemoteStorageKind, S3Config,
 };
@@ -22,6 +24,7 @@ mod common;
 mod tests_s3;
 
 use common::{cleanup, ensure_logging_ready, upload_remote_data, upload_simple_remote_data};
+use utils::backoff;
 
 const ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME: &str = "ENABLE_REAL_S3_REMOTE_STORAGE";
 
@@ -39,6 +42,25 @@ async fn s3_time_travel_recovery_works(ctx: &mut MaybeEnabledStorage) -> anyhow:
     // to take the time from S3 response headers.
     const WAIT_TIME: Duration = Duration::from_millis(3_000);
 
+    async fn retry<T, O, F, E>(op: O) -> Result<T, E>
+    where
+        E: Display + Debug + 'static,
+        O: FnMut() -> F,
+        F: Future<Output = Result<T, E>>,
+    {
+        let warn_threshold = 3;
+        let max_retries = 10;
+        backoff::retry(
+            op,
+            |_e| false,
+            warn_threshold,
+            max_retries,
+            "test retry",
+            backoff::Cancel::new(CancellationToken::new(), || unreachable!()),
+        )
+        .await
+    }
+
     async fn time_point() -> SystemTime {
         tokio::time::sleep(WAIT_TIME).await;
         let ret = SystemTime::now();
@@ -47,8 +69,7 @@ async fn s3_time_travel_recovery_works(ctx: &mut MaybeEnabledStorage) -> anyhow:
     }
 
     async fn list_files(client: &Arc<GenericRemoteStorage>) -> anyhow::Result<HashSet<RemotePath>> {
-        Ok(client
-            .list_files(None)
+        Ok(retry(|| client.list_files(None))
             .await
             .context("list root files failure")?
             .into_iter()
@@ -64,16 +85,23 @@ async fn s3_time_travel_recovery_works(ctx: &mut MaybeEnabledStorage) -> anyhow:
     let path3 = RemotePath::new(Utf8Path::new(format!("{}/path3", ctx.base_prefix).as_str()))
         .with_context(|| "RemotePath conversion")?;
 
-    let (data, len) = upload_stream("remote blob data1".as_bytes().into());
-    ctx.client.upload(data, len, &path1, None).await?;
+    retry(|| {
+        let (data, len) = upload_stream("remote blob data1".as_bytes().into());
+        ctx.client.upload(data, len, &path1, None)
+    })
+    .await?;
 
     let t0_files = list_files(&ctx.client).await?;
     let t0 = time_point().await;
     println!("at t0: {t0_files:?}");
 
     let old_data = "remote blob data2";
-    let (data, len) = upload_stream(old_data.as_bytes().into());
-    ctx.client.upload(data, len, &path2, None).await?;
+
+    retry(|| {
+        let (data, len) = upload_stream(old_data.as_bytes().into());
+        ctx.client.upload(data, len, &path2, None)
+    })
+    .await?;
 
     let t1_files = list_files(&ctx.client).await?;
     let t1 = time_point().await;
@@ -81,7 +109,7 @@ async fn s3_time_travel_recovery_works(ctx: &mut MaybeEnabledStorage) -> anyhow:
 
     // A little check to ensure that our clock is not too far off from the S3 clock
     {
-        let dl = ctx.client.download(&path2).await?;
+        let dl = retry(|| ctx.client.download(&path2)).await?;
         let last_modified = dl.last_modified.unwrap();
         let half_wt = WAIT_TIME.mul_f32(0.5);
         let t0_hwt = t0 + half_wt;
@@ -92,15 +120,21 @@ async fn s3_time_travel_recovery_works(ctx: &mut MaybeEnabledStorage) -> anyhow:
         }
     }
 
-    let (data, len) = upload_stream("remote blob data3".as_bytes().into());
-    ctx.client.upload(data, len, &path3, None).await?;
+    retry(|| {
+        let (data, len) = upload_stream("remote blob data3".as_bytes().into());
+        ctx.client.upload(data, len, &path3, None)
+    })
+    .await?;
 
     let new_data = "new remote blob data2";
-    let (data, len) = upload_stream(new_data.as_bytes().into());
-    ctx.client.upload(data, len, &path2, None).await?;
 
-    ctx.client.delete(&path1).await?;
+    retry(|| {
+        let (data, len) = upload_stream(new_data.as_bytes().into());
+        ctx.client.upload(data, len, &path2, None)
+    })
+    .await?;
 
+    retry(|| ctx.client.delete(&path1)).await?;
     let t2_files = list_files(&ctx.client).await?;
     let t2 = time_point().await;
     println!("at t2: {t2_files:?}");
@@ -137,7 +171,9 @@ async fn s3_time_travel_recovery_works(ctx: &mut MaybeEnabledStorage) -> anyhow:
     assert_eq!(t0_files, t0_files_recovered);
 
     // cleanup
-    ctx.client.delete_objects(&[path1, path2, path3]).await?;
+
+    let paths = &[path1, path2, path3];
+    retry(|| ctx.client.delete_objects(paths)).await?;
 
     Ok(())
 }

From 58f6cb649e42ff9f2fb82efda8dec7dd3f947434 Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Fri, 26 Jan 2024 17:20:44 +0000
Subject: [PATCH 64/70] control_plane: database persistence for
 attachment_service (#6468)

## Problem

Spun off from https://github.com/neondatabase/neon/pull/6394 -- this PR
is just the persistence parts and the changes that enable it to work
nicely


## Summary of changes

- Revert #6444 and #6450
- In neon_local, start a vanilla postgres instance for the attachment
service to use.
- Adopt `diesel` crate for database access in attachment service. This
uses raw SQL migrations as the source of truth for the schema, so it's a
soft dependency: we can switch libraries pretty easily.
- Rewrite persistence.rs to use postgres (via diesel) instead of JSON.
- Preserve JSON read+write at startup and shutdown: this enables using
the JSON format in compatibility tests, so that we don't have to commit
to our DB schema yet.
- In neon_local, run database creation + migrations before starting
attachment service
- Run the initial reconciliation in Service::spawn in the background, so
that the pageserver + attachment service don't get stuck waiting for
each other to start, when restarting both together in a test.
---
 Cargo.lock                                    |  81 ++-
 control_plane/Cargo.toml                      |   2 +
 control_plane/attachment_service/Cargo.toml   |   3 +-
 .../attachment_service/migrations/.keep       |   0
 .../down.sql                                  |   6 +
 .../up.sql                                    |  36 ++
 .../down.sql                                  |   1 +
 .../up.sql                                    |  12 +
 .../2024-01-07-212945_create_nodes/down.sql   |   1 +
 .../2024-01-07-212945_create_nodes/up.sql     |  10 +
 control_plane/attachment_service/src/http.rs  |  86 ++-
 control_plane/attachment_service/src/lib.rs   |   1 +
 control_plane/attachment_service/src/main.rs  |  46 +-
 control_plane/attachment_service/src/node.rs  |  13 +
 .../attachment_service/src/persistence.rs     | 526 +++++++++++-------
 .../attachment_service/src/schema.rs          |  27 +
 .../attachment_service/src/service.rs         | 294 ++++++----
 control_plane/src/attachment_service.rs       | 300 ++++++++--
 control_plane/src/bin/neon_local.rs           |  48 +-
 control_plane/src/local_env.rs                |   6 +-
 control_plane/src/pageserver.rs               |  27 +-
 diesel.toml                                   |   9 +
 libs/utils/src/crashsafe.rs                   |  44 +-
 pageserver/src/virtual_file.rs                |   7 +-
 test_runner/fixtures/neon_fixtures.py         |  38 +-
 test_runner/regress/test_compatibility.py     |   7 +
 .../regress/test_pageserver_generations.py    |   3 +-
 workspace_hack/Cargo.toml                     |   5 +-
 28 files changed, 1168 insertions(+), 471 deletions(-)
 create mode 100644 control_plane/attachment_service/migrations/.keep
 create mode 100644 control_plane/attachment_service/migrations/00000000000000_diesel_initial_setup/down.sql
 create mode 100644 control_plane/attachment_service/migrations/00000000000000_diesel_initial_setup/up.sql
 create mode 100644 control_plane/attachment_service/migrations/2024-01-07-211257_create_tenant_shards/down.sql
 create mode 100644 control_plane/attachment_service/migrations/2024-01-07-211257_create_tenant_shards/up.sql
 create mode 100644 control_plane/attachment_service/migrations/2024-01-07-212945_create_nodes/down.sql
 create mode 100644 control_plane/attachment_service/migrations/2024-01-07-212945_create_nodes/up.sql
 create mode 100644 control_plane/attachment_service/src/schema.rs
 create mode 100644 diesel.toml

diff --git a/Cargo.lock b/Cargo.lock
index 6e91363de8..f0bcfb762a 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -278,6 +278,7 @@ dependencies = [
  "camino",
  "clap",
  "control_plane",
+ "diesel",
  "futures",
  "git-version",
  "hyper",
@@ -286,7 +287,6 @@ dependencies = [
  "pageserver_client",
  "postgres_backend",
  "postgres_connection",
- "scopeguard",
  "serde",
  "serde_json",
  "thiserror",
@@ -1328,6 +1328,8 @@ dependencies = [
  "clap",
  "comfy-table",
  "compute_api",
+ "diesel",
+ "diesel_migrations",
  "futures",
  "git-version",
  "hex",
@@ -1638,6 +1640,52 @@ dependencies = [
  "rusticata-macros",
 ]
 
+[[package]]
+name = "diesel"
+version = "2.1.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "62c6fcf842f17f8c78ecf7c81d75c5ce84436b41ee07e03f490fbb5f5a8731d8"
+dependencies = [
+ "bitflags 2.4.1",
+ "byteorder",
+ "diesel_derives",
+ "itoa",
+ "pq-sys",
+ "serde_json",
+]
+
+[[package]]
+name = "diesel_derives"
+version = "2.1.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ef8337737574f55a468005a83499da720f20c65586241ffea339db9ecdfd2b44"
+dependencies = [
+ "diesel_table_macro_syntax",
+ "proc-macro2",
+ "quote",
+ "syn 2.0.32",
+]
+
+[[package]]
+name = "diesel_migrations"
+version = "2.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6036b3f0120c5961381b570ee20a02432d7e2d27ea60de9578799cf9156914ac"
+dependencies = [
+ "diesel",
+ "migrations_internals",
+ "migrations_macros",
+]
+
+[[package]]
+name = "diesel_table_macro_syntax"
+version = "0.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "fc5557efc453706fed5e4fa85006fe9817c224c3f480a34c7e5959fd700921c5"
+dependencies = [
+ "syn 2.0.32",
+]
+
 [[package]]
 name = "digest"
 version = "0.10.7"
@@ -2787,6 +2835,27 @@ dependencies = [
  "workspace_hack",
 ]
 
+[[package]]
+name = "migrations_internals"
+version = "2.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0f23f71580015254b020e856feac3df5878c2c7a8812297edd6c0a485ac9dada"
+dependencies = [
+ "serde",
+ "toml",
+]
+
+[[package]]
+name = "migrations_macros"
+version = "2.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "cce3325ac70e67bbab5bd837a31cae01f1a6db64e0e744a33cb03a543469ef08"
+dependencies = [
+ "migrations_internals",
+ "proc-macro2",
+ "quote",
+]
+
 [[package]]
 name = "mime"
 version = "0.3.17"
@@ -3795,6 +3864,15 @@ version = "0.2.17"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "5b40af805b3121feab8a3c29f04d8ad262fa8e0561883e7653e024ae4479e6de"
 
+[[package]]
+name = "pq-sys"
+version = "0.4.8"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "31c0052426df997c0cbd30789eb44ca097e3541717a7b8fa36b1c464ee7edebd"
+dependencies = [
+ "vcpkg",
+]
+
 [[package]]
 name = "pq_proto"
 version = "0.1.0"
@@ -6623,6 +6701,7 @@ dependencies = [
  "clap",
  "clap_builder",
  "crossbeam-utils",
+ "diesel",
  "either",
  "fail",
  "futures-channel",
diff --git a/control_plane/Cargo.toml b/control_plane/Cargo.toml
index 75e5dcb7f8..09c171f1d3 100644
--- a/control_plane/Cargo.toml
+++ b/control_plane/Cargo.toml
@@ -10,6 +10,8 @@ async-trait.workspace = true
 camino.workspace = true
 clap.workspace = true
 comfy-table.workspace = true
+diesel = { version = "2.1.4", features = ["postgres"]}
+diesel_migrations = { version = "2.1.0", features = ["postgres"]}
 futures.workspace = true
 git-version.workspace = true
 nix.workspace = true
diff --git a/control_plane/attachment_service/Cargo.toml b/control_plane/attachment_service/Cargo.toml
index 743dd806c4..6fc21810bc 100644
--- a/control_plane/attachment_service/Cargo.toml
+++ b/control_plane/attachment_service/Cargo.toml
@@ -14,7 +14,6 @@ hyper.workspace = true
 pageserver_api.workspace = true
 pageserver_client.workspace = true
 postgres_connection.workspace = true
-scopeguard.workspace = true
 serde.workspace = true
 serde_json.workspace = true
 thiserror.workspace = true
@@ -26,6 +25,8 @@ tracing.workspace = true
 # a parsing function when loading pageservers from neon_local LocalEnv
 postgres_backend.workspace = true
 
+diesel = { version = "2.1.4", features = ["serde_json", "postgres"] }
+
 utils = { path = "../../libs/utils/" }
 metrics = { path = "../../libs/metrics/" }
 control_plane = { path = ".." }
diff --git a/control_plane/attachment_service/migrations/.keep b/control_plane/attachment_service/migrations/.keep
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/control_plane/attachment_service/migrations/00000000000000_diesel_initial_setup/down.sql b/control_plane/attachment_service/migrations/00000000000000_diesel_initial_setup/down.sql
new file mode 100644
index 0000000000..a9f5260911
--- /dev/null
+++ b/control_plane/attachment_service/migrations/00000000000000_diesel_initial_setup/down.sql
@@ -0,0 +1,6 @@
+-- This file was automatically created by Diesel to setup helper functions
+-- and other internal bookkeeping. This file is safe to edit, any future
+-- changes will be added to existing projects as new migrations.
+
+DROP FUNCTION IF EXISTS diesel_manage_updated_at(_tbl regclass);
+DROP FUNCTION IF EXISTS diesel_set_updated_at();
diff --git a/control_plane/attachment_service/migrations/00000000000000_diesel_initial_setup/up.sql b/control_plane/attachment_service/migrations/00000000000000_diesel_initial_setup/up.sql
new file mode 100644
index 0000000000..d68895b1a7
--- /dev/null
+++ b/control_plane/attachment_service/migrations/00000000000000_diesel_initial_setup/up.sql
@@ -0,0 +1,36 @@
+-- This file was automatically created by Diesel to setup helper functions
+-- and other internal bookkeeping. This file is safe to edit, any future
+-- changes will be added to existing projects as new migrations.
+
+
+
+
+-- Sets up a trigger for the given table to automatically set a column called
+-- `updated_at` whenever the row is modified (unless `updated_at` was included
+-- in the modified columns)
+--
+-- # Example
+--
+-- ```sql
+-- CREATE TABLE users (id SERIAL PRIMARY KEY, updated_at TIMESTAMP NOT NULL DEFAULT NOW());
+--
+-- SELECT diesel_manage_updated_at('users');
+-- ```
+CREATE OR REPLACE FUNCTION diesel_manage_updated_at(_tbl regclass) RETURNS VOID AS $$
+BEGIN
+    EXECUTE format('CREATE TRIGGER set_updated_at BEFORE UPDATE ON %s
+                    FOR EACH ROW EXECUTE PROCEDURE diesel_set_updated_at()', _tbl);
+END;
+$$ LANGUAGE plpgsql;
+
+CREATE OR REPLACE FUNCTION diesel_set_updated_at() RETURNS trigger AS $$
+BEGIN
+    IF (
+        NEW IS DISTINCT FROM OLD AND
+        NEW.updated_at IS NOT DISTINCT FROM OLD.updated_at
+    ) THEN
+        NEW.updated_at := current_timestamp;
+    END IF;
+    RETURN NEW;
+END;
+$$ LANGUAGE plpgsql;
diff --git a/control_plane/attachment_service/migrations/2024-01-07-211257_create_tenant_shards/down.sql b/control_plane/attachment_service/migrations/2024-01-07-211257_create_tenant_shards/down.sql
new file mode 100644
index 0000000000..b875b91c00
--- /dev/null
+++ b/control_plane/attachment_service/migrations/2024-01-07-211257_create_tenant_shards/down.sql
@@ -0,0 +1 @@
+DROP TABLE tenant_shards;
diff --git a/control_plane/attachment_service/migrations/2024-01-07-211257_create_tenant_shards/up.sql b/control_plane/attachment_service/migrations/2024-01-07-211257_create_tenant_shards/up.sql
new file mode 100644
index 0000000000..585dbc79a0
--- /dev/null
+++ b/control_plane/attachment_service/migrations/2024-01-07-211257_create_tenant_shards/up.sql
@@ -0,0 +1,12 @@
+CREATE TABLE tenant_shards (
+  tenant_id VARCHAR NOT NULL,
+  shard_number INTEGER NOT NULL,
+  shard_count INTEGER NOT NULL,
+  PRIMARY KEY(tenant_id, shard_number, shard_count),
+  shard_stripe_size INTEGER NOT NULL,
+  generation INTEGER NOT NULL,
+  generation_pageserver BIGINT NOT NULL,
+  placement_policy VARCHAR NOT NULL,
+  -- config is JSON encoded, opaque to the database.
+  config TEXT NOT NULL
+);
\ No newline at end of file
diff --git a/control_plane/attachment_service/migrations/2024-01-07-212945_create_nodes/down.sql b/control_plane/attachment_service/migrations/2024-01-07-212945_create_nodes/down.sql
new file mode 100644
index 0000000000..ec303bc8cf
--- /dev/null
+++ b/control_plane/attachment_service/migrations/2024-01-07-212945_create_nodes/down.sql
@@ -0,0 +1 @@
+DROP TABLE nodes;
diff --git a/control_plane/attachment_service/migrations/2024-01-07-212945_create_nodes/up.sql b/control_plane/attachment_service/migrations/2024-01-07-212945_create_nodes/up.sql
new file mode 100644
index 0000000000..9be0880fa4
--- /dev/null
+++ b/control_plane/attachment_service/migrations/2024-01-07-212945_create_nodes/up.sql
@@ -0,0 +1,10 @@
+CREATE TABLE nodes (
+  node_id BIGINT PRIMARY KEY NOT NULL,
+
+  scheduling_policy VARCHAR NOT NULL,
+
+  listen_http_addr VARCHAR NOT NULL,
+  listen_http_port INTEGER NOT NULL,
+  listen_pg_addr VARCHAR NOT NULL,
+  listen_pg_port INTEGER NOT NULL
+);
\ No newline at end of file
diff --git a/control_plane/attachment_service/src/http.rs b/control_plane/attachment_service/src/http.rs
index 30f6dd66ee..81f21a8e7a 100644
--- a/control_plane/attachment_service/src/http.rs
+++ b/control_plane/attachment_service/src/http.rs
@@ -1,5 +1,5 @@
 use crate::reconciler::ReconcileError;
-use crate::service::Service;
+use crate::service::{Service, STARTUP_RECONCILE_TIMEOUT};
 use hyper::{Body, Request, Response};
 use hyper::{StatusCode, Uri};
 use pageserver_api::models::{TenantCreateRequest, TimelineCreateRequest};
@@ -104,34 +104,34 @@ async fn handle_inspect(mut req: Request<Body>) -> Result<Response<Body>, ApiErr
     json_response(StatusCode::OK, state.service.inspect(inspect_req))
 }
 
-async fn handle_tenant_create(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
+async fn handle_tenant_create(
+    service: Arc<Service>,
+    mut req: Request<Body>,
+) -> Result<Response<Body>, ApiError> {
     let create_req = json_request::<TenantCreateRequest>(&mut req).await?;
-    let state = get_state(&req);
-    json_response(
-        StatusCode::OK,
-        state.service.tenant_create(create_req).await?,
-    )
+    json_response(StatusCode::OK, service.tenant_create(create_req).await?)
 }
 
-async fn handle_tenant_timeline_create(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
+async fn handle_tenant_timeline_create(
+    service: Arc<Service>,
+    mut req: Request<Body>,
+) -> Result<Response<Body>, ApiError> {
     let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
     let create_req = json_request::<TimelineCreateRequest>(&mut req).await?;
-
-    let state = get_state(&req);
     json_response(
         StatusCode::OK,
-        state
-            .service
+        service
             .tenant_timeline_create(tenant_id, create_req)
             .await?,
     )
 }
 
-async fn handle_tenant_locate(req: Request<Body>) -> Result<Response<Body>, ApiError> {
+async fn handle_tenant_locate(
+    service: Arc<Service>,
+    req: Request<Body>,
+) -> Result<Response<Body>, ApiError> {
     let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
-    let state = get_state(&req);
-
-    json_response(StatusCode::OK, state.service.tenant_locate(tenant_id)?)
+    json_response(StatusCode::OK, service.tenant_locate(tenant_id)?)
 }
 
 async fn handle_node_register(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
@@ -154,14 +154,15 @@ async fn handle_node_configure(mut req: Request<Body>) -> Result<Response<Body>,
     json_response(StatusCode::OK, state.service.node_configure(config_req)?)
 }
 
-async fn handle_tenant_shard_migrate(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
+async fn handle_tenant_shard_migrate(
+    service: Arc<Service>,
+    mut req: Request<Body>,
+) -> Result<Response<Body>, ApiError> {
     let tenant_shard_id: TenantShardId = parse_request_param(&req, "tenant_shard_id")?;
     let migrate_req = json_request::<TenantShardMigrateRequest>(&mut req).await?;
-    let state = get_state(&req);
     json_response(
         StatusCode::OK,
-        state
-            .service
+        service
             .tenant_shard_migrate(tenant_shard_id, migrate_req)
             .await?,
     )
@@ -178,6 +179,35 @@ impl From<ReconcileError> for ApiError {
     }
 }
 
+/// Common wrapper for request handlers that call into Service and will operate on tenants: they must only
+/// be allowed to run if Service has finished its initial reconciliation.
+async fn tenant_service_handler<R, H>(request: Request<Body>, handler: H) -> R::Output
+where
+    R: std::future::Future<Output = Result<Response<Body>, ApiError>> + Send + 'static,
+    H: FnOnce(Arc<Service>, Request<Body>) -> R + Send + Sync + 'static,
+{
+    let state = get_state(&request);
+    let service = state.service.clone();
+
+    let startup_complete = service.startup_complete.clone();
+    if tokio::time::timeout(STARTUP_RECONCILE_TIMEOUT, startup_complete.wait())
+        .await
+        .is_err()
+    {
+        // This shouldn't happen: it is the responsibilty of [`Service::startup_reconcile`] to use appropriate
+        // timeouts around its remote calls, to bound its runtime.
+        return Err(ApiError::Timeout(
+            "Timed out waiting for service readiness".into(),
+        ));
+    }
+
+    request_span(
+        request,
+        |request| async move { handler(service, request).await },
+    )
+    .await
+}
+
 pub fn make_router(
     service: Arc<Service>,
     auth: Option<Arc<SwappableJwtAuth>>,
@@ -205,14 +235,20 @@ pub fn make_router(
         .put("/node/:node_id/config", |r| {
             request_span(r, handle_node_configure)
         })
-        .post("/tenant", |r| request_span(r, handle_tenant_create))
-        .post("/tenant/:tenant_id/timeline", |r| {
-            request_span(r, handle_tenant_timeline_create)
+        .post("/v1/tenant", |r| {
+            tenant_service_handler(r, handle_tenant_create)
+        })
+        .post("/v1/tenant/:tenant_id/timeline", |r| {
+            tenant_service_handler(r, handle_tenant_timeline_create)
         })
         .get("/tenant/:tenant_id/locate", |r| {
-            request_span(r, handle_tenant_locate)
+            tenant_service_handler(r, handle_tenant_locate)
         })
         .put("/tenant/:tenant_shard_id/migrate", |r| {
-            request_span(r, handle_tenant_shard_migrate)
+            tenant_service_handler(r, handle_tenant_shard_migrate)
         })
+        // Path aliases for tests_forward_compatibility
+        // TODO: remove these in future PR
+        .post("/re-attach", |r| request_span(r, handle_re_attach))
+        .post("/validate", |r| request_span(r, handle_validate))
 }
diff --git a/control_plane/attachment_service/src/lib.rs b/control_plane/attachment_service/src/lib.rs
index e4ca9aa304..082afb4157 100644
--- a/control_plane/attachment_service/src/lib.rs
+++ b/control_plane/attachment_service/src/lib.rs
@@ -7,6 +7,7 @@ mod node;
 pub mod persistence;
 mod reconciler;
 mod scheduler;
+mod schema;
 pub mod service;
 mod tenant_state;
 
diff --git a/control_plane/attachment_service/src/main.rs b/control_plane/attachment_service/src/main.rs
index 38e51b9a9e..05a3895dfa 100644
--- a/control_plane/attachment_service/src/main.rs
+++ b/control_plane/attachment_service/src/main.rs
@@ -12,9 +12,9 @@ use camino::Utf8PathBuf;
 use clap::Parser;
 use metrics::launch_timestamp::LaunchTimestamp;
 use std::sync::Arc;
+use tokio::signal::unix::SignalKind;
 use utils::auth::{JwtAuth, SwappableJwtAuth};
 use utils::logging::{self, LogFormat};
-use utils::signals::{ShutdownSignals, Signal};
 
 use utils::{project_build_tag, project_git_version, tcp_listener};
 
@@ -40,6 +40,10 @@ struct Cli {
     /// Path to the .json file to store state (will be created if it doesn't exist)
     #[arg(short, long)]
     path: Utf8PathBuf,
+
+    /// URL to connect to postgres, like postgresql://localhost:1234/attachment_service
+    #[arg(long)]
+    database_url: String,
 }
 
 #[tokio::main]
@@ -66,9 +70,14 @@ async fn main() -> anyhow::Result<()> {
         jwt_token: args.jwt_token,
     };
 
-    let persistence = Arc::new(Persistence::spawn(&args.path).await);
+    let json_path = if args.path.as_os_str().is_empty() {
+        None
+    } else {
+        Some(args.path)
+    };
+    let persistence = Arc::new(Persistence::new(args.database_url, json_path.clone()));
 
-    let service = Service::spawn(config, persistence).await?;
+    let service = Service::spawn(config, persistence.clone()).await?;
 
     let http_listener = tcp_listener::bind(args.listen)?;
 
@@ -81,20 +90,31 @@ async fn main() -> anyhow::Result<()> {
     let router = make_router(service, auth)
         .build()
         .map_err(|err| anyhow!(err))?;
-    let service = utils::http::RouterService::new(router).unwrap();
-    let server = hyper::Server::from_tcp(http_listener)?.serve(service);
+    let router_service = utils::http::RouterService::new(router).unwrap();
+    let server = hyper::Server::from_tcp(http_listener)?.serve(router_service);
 
     tracing::info!("Serving on {0}", args.listen);
 
     tokio::task::spawn(server);
 
-    ShutdownSignals::handle(|signal| match signal {
-        Signal::Interrupt | Signal::Terminate | Signal::Quit => {
-            tracing::info!("Got {}. Terminating", signal.name());
-            // We're just a test helper: no graceful shutdown.
-            std::process::exit(0);
-        }
-    })?;
+    // Wait until we receive a signal
+    let mut sigint = tokio::signal::unix::signal(SignalKind::interrupt())?;
+    let mut sigquit = tokio::signal::unix::signal(SignalKind::quit())?;
+    let mut sigterm = tokio::signal::unix::signal(SignalKind::terminate())?;
+    tokio::select! {
+        _ = sigint.recv() => {},
+        _ = sigterm.recv() => {},
+        _ = sigquit.recv() => {},
+    }
+    tracing::info!("Terminating on signal");
 
-    Ok(())
+    if json_path.is_some() {
+        // Write out a JSON dump on shutdown: this is used in compat tests to avoid passing
+        // full postgres dumps around.
+        if let Err(e) = persistence.write_tenants_json().await {
+            tracing::error!("Failed to write JSON on shutdown: {e}")
+        }
+    }
+
+    std::process::exit(0);
 }
diff --git a/control_plane/attachment_service/src/node.rs b/control_plane/attachment_service/src/node.rs
index efd3f8f49b..47f61702d8 100644
--- a/control_plane/attachment_service/src/node.rs
+++ b/control_plane/attachment_service/src/node.rs
@@ -1,6 +1,8 @@
 use control_plane::attachment_service::{NodeAvailability, NodeSchedulingPolicy};
 use utils::id::NodeId;
 
+use crate::persistence::NodePersistence;
+
 #[derive(Clone)]
 pub(crate) struct Node {
     pub(crate) id: NodeId,
@@ -34,4 +36,15 @@ impl Node {
             NodeSchedulingPolicy::Pause => false,
         }
     }
+
+    pub(crate) fn to_persistent(&self) -> NodePersistence {
+        NodePersistence {
+            node_id: self.id.0 as i64,
+            scheduling_policy: self.scheduling.into(),
+            listen_http_addr: self.listen_http_addr.clone(),
+            listen_http_port: self.listen_http_port as i32,
+            listen_pg_addr: self.listen_pg_addr.clone(),
+            listen_pg_port: self.listen_pg_port as i32,
+        }
+    }
 }
diff --git a/control_plane/attachment_service/src/persistence.rs b/control_plane/attachment_service/src/persistence.rs
index e944a2e9ed..b27bd2bf2e 100644
--- a/control_plane/attachment_service/src/persistence.rs
+++ b/control_plane/attachment_service/src/persistence.rs
@@ -1,182 +1,161 @@
-use std::{collections::HashMap, str::FromStr};
+use std::collections::HashMap;
+use std::str::FromStr;
 
-use camino::{Utf8Path, Utf8PathBuf};
-use control_plane::{
-    attachment_service::{NodeAvailability, NodeSchedulingPolicy},
-    local_env::LocalEnv,
-};
-use pageserver_api::{
-    models::TenantConfig,
-    shard::{ShardCount, ShardNumber, TenantShardId},
-};
+use camino::Utf8Path;
+use camino::Utf8PathBuf;
+use control_plane::attachment_service::{NodeAvailability, NodeSchedulingPolicy};
+use diesel::pg::PgConnection;
+use diesel::prelude::*;
+use diesel::Connection;
+use pageserver_api::models::TenantConfig;
+use pageserver_api::shard::{ShardCount, ShardNumber, TenantShardId};
 use postgres_connection::parse_host_port;
 use serde::{Deserialize, Serialize};
-use tracing::info;
-use utils::{
-    generation::Generation,
-    id::{NodeId, TenantId},
-};
+use utils::generation::Generation;
+use utils::id::{NodeId, TenantId};
 
-use crate::{node::Node, PlacementPolicy};
+use crate::node::Node;
+use crate::PlacementPolicy;
 
-/// Placeholder for storage.  This will be replaced with a database client.
+/// ## What do we store?
+///
+/// The attachment service does not store most of its state durably.
+///
+/// The essential things to store durably are:
+/// - generation numbers, as these must always advance monotonically to ensure data safety.
+/// - Tenant's PlacementPolicy and TenantConfig, as the source of truth for these is something external.
+/// - Node's scheduling policies, as the source of truth for these is something external.
+///
+/// Other things we store durably as an implementation detail:
+/// - Node's host/port: this could be avoided it we made nodes emit a self-registering heartbeat,
+///   but it is operationally simpler to make this service the authority for which nodes
+///   it talks to.
+///
+/// ## Performance/efficiency
+///
+/// The attachment service does not go via the database for most things: there are
+/// a couple of places where we must, and where efficiency matters:
+/// - Incrementing generation numbers: the Reconciler has to wait for this to complete
+///   before it can attach a tenant, so this acts as a bound on how fast things like
+///   failover can happen.
+/// - Pageserver re-attach: we will increment many shards' generations when this happens,
+///   so it is important to avoid e.g. issuing O(N) queries.
+///
+/// Database calls relating to nodes have low performance requirements, as they are very rarely
+/// updated, and reads of nodes are always from memory, not the database.  We only require that
+/// we can UPDATE a node's scheduling mode reasonably quickly to mark a bad node offline.
 pub struct Persistence {
-    inner: std::sync::Mutex<Inner>,
-}
-
-struct Inner {
-    state: PersistentState,
-    write_queue_tx: tokio::sync::mpsc::UnboundedSender<PendingWrite>,
+    database_url: String,
+
+    // In test environments, we support loading+saving a JSON file.  This is temporary, for the benefit of
+    // test_compatibility.py, so that we don't have to commit to making the database contents fully backward/forward
+    // compatible just yet.
+    json_path: Option<Utf8PathBuf>,
 }
 
+/// Legacy format, for use in JSON compat objects in test environment
 #[derive(Serialize, Deserialize)]
-struct PersistentState {
+struct JsonPersistence {
     tenants: HashMap<TenantShardId, TenantShardPersistence>,
 }
 
-struct PendingWrite {
-    bytes: Vec<u8>,
-    done_tx: tokio::sync::oneshot::Sender<()>,
+#[derive(thiserror::Error, Debug)]
+pub(crate) enum DatabaseError {
+    #[error(transparent)]
+    Query(#[from] diesel::result::Error),
+    #[error(transparent)]
+    Connection(#[from] diesel::result::ConnectionError),
+    #[error("Logical error: {0}")]
+    Logical(String),
 }
 
-impl PersistentState {
-    async fn load(path: &Utf8Path) -> anyhow::Result<Self> {
-        let bytes = tokio::fs::read(path).await?;
-        let mut decoded = serde_json::from_slice::<Self>(&bytes)?;
-
-        for (tenant_id, tenant) in &mut decoded.tenants {
-            // Backward compat: an old attachments.json from before PR #6251, replace
-            // empty strings with proper defaults.
-            if tenant.tenant_id.is_empty() {
-                tenant.tenant_id = format!("{}", tenant_id);
-                tenant.config = serde_json::to_string(&TenantConfig::default())?;
-                tenant.placement_policy = serde_json::to_string(&PlacementPolicy::default())?;
-            }
-        }
-
-        Ok(decoded)
-    }
-
-    async fn load_or_new(path: &Utf8Path) -> Self {
-        match Self::load(path).await {
-            Ok(s) => {
-                tracing::info!("Loaded state file at {}", path);
-                s
-            }
-            Err(e)
-                if e.downcast_ref::<std::io::Error>()
-                    .map(|e| e.kind() == std::io::ErrorKind::NotFound)
-                    .unwrap_or(false) =>
-            {
-                tracing::info!("Will create state file at {}", path);
-                Self {
-                    tenants: HashMap::new(),
-                }
-            }
-            Err(e) => {
-                panic!("Failed to load state from '{}': {e:#} (maybe your .neon/ dir was written by an older version?)", path)
-            }
-        }
-    }
-}
+pub(crate) type DatabaseResult<T> = Result<T, DatabaseError>;
 
 impl Persistence {
-    pub async fn spawn(path: &Utf8Path) -> Self {
-        let (tx, rx) = tokio::sync::mpsc::unbounded_channel();
-        let state = PersistentState::load_or_new(path).await;
-        tokio::spawn(Self::writer_task(rx, path.to_owned()));
+    pub fn new(database_url: String, json_path: Option<Utf8PathBuf>) -> Self {
         Self {
-            inner: std::sync::Mutex::new(Inner {
-                state,
-                write_queue_tx: tx,
-            }),
+            database_url,
+            json_path,
         }
     }
 
-    async fn writer_task(
-        mut rx: tokio::sync::mpsc::UnboundedReceiver<PendingWrite>,
-        path: Utf8PathBuf,
-    ) {
-        scopeguard::defer! {
-            info!("persistence writer task exiting");
-        };
-        loop {
-            match rx.recv().await {
-                Some(write) => {
-                    tokio::task::spawn_blocking({
-                        let path = path.clone();
-                        move || {
-                            let tmp_path =
-                                utils::crashsafe::path_with_suffix_extension(&path, "___new");
-                            utils::crashsafe::overwrite(&path, &tmp_path, &write.bytes)
-                        }
-                    })
-                    .await
-                    .expect("spawn_blocking")
-                    .expect("write file");
-                    let _ = write.done_tx.send(()); // receiver may lose interest any time
-                }
-                None => {
-                    return;
-                }
-            }
-        }
-    }
-
-    /// Perform a modification on our [`PersistentState`].
-    /// Return a future that completes once our modification has been persisted.
-    /// The output of the future is the return value of the `txn`` closure.
-    async fn mutating_transaction<F, R>(&self, txn: F) -> R
+    /// Call the provided function in a tokio blocking thread, with a Diesel database connection.
+    async fn with_conn<F, R>(&self, func: F) -> DatabaseResult<R>
     where
-        F: FnOnce(&mut PersistentState) -> R,
+        F: Fn(&mut PgConnection) -> DatabaseResult<R> + Send + 'static,
+        R: Send + 'static,
     {
-        let (ret, done_rx) = {
-            let mut inner = self.inner.lock().unwrap();
-            let ret = txn(&mut inner.state);
-            let (done_tx, done_rx) = tokio::sync::oneshot::channel();
-            let write = PendingWrite {
-                bytes: serde_json::to_vec(&inner.state).expect("Serialization error"),
-                done_tx,
-            };
-            inner
-                .write_queue_tx
-                .send(write)
-                .expect("writer task always outlives self");
-            (ret, done_rx)
-        };
-        // the write task can go away once we start .await'ing
-        let _: () = done_rx.await.expect("writer task dead, check logs");
-        ret
+        let database_url = self.database_url.clone();
+        tokio::task::spawn_blocking(move || -> DatabaseResult<R> {
+            // TODO: connection pooling, such as via diesel::r2d2
+            let mut conn = PgConnection::establish(&database_url)?;
+            func(&mut conn)
+        })
+        .await
+        .expect("Task panic")
     }
 
-    /// When registering a node, persist it so that on next start we will be able to
-    /// iterate over known nodes to synchronize their tenant shard states with our observed state.
-    pub(crate) async fn insert_node(&self, _node: &Node) -> anyhow::Result<()> {
-        // TODO: node persitence will come with database backend
-        Ok(())
+    /// When a node is first registered, persist it before using it for anything
+    pub(crate) async fn insert_node(&self, node: &Node) -> DatabaseResult<()> {
+        let np = node.to_persistent();
+        self.with_conn(move |conn| -> DatabaseResult<()> {
+            diesel::insert_into(crate::schema::nodes::table)
+                .values(&np)
+                .execute(conn)?;
+            Ok(())
+        })
+        .await
     }
 
-    /// At startup, we populate the service's list of nodes, and use this list to call into
-    /// each node to do an initial reconciliation of the state of the world with our in-memory
-    /// observed state.
-    pub(crate) async fn list_nodes(&self) -> anyhow::Result<Vec<Node>> {
-        let env = LocalEnv::load_config()?;
-        // TODO: node persitence will come with database backend
+    /// At startup, populate the list of nodes which our shards may be placed on
+    pub(crate) async fn list_nodes(&self) -> DatabaseResult<Vec<Node>> {
+        let nodes: Vec<Node> = self
+            .with_conn(move |conn| -> DatabaseResult<_> {
+                Ok(crate::schema::nodes::table
+                    .load::<NodePersistence>(conn)?
+                    .into_iter()
+                    .map(|n| Node {
+                        id: NodeId(n.node_id as u64),
+                        // At startup we consider a node offline until proven otherwise.
+                        availability: NodeAvailability::Offline,
+                        scheduling: NodeSchedulingPolicy::from_str(&n.scheduling_policy)
+                            .expect("Bad scheduling policy in DB"),
+                        listen_http_addr: n.listen_http_addr,
+                        listen_http_port: n.listen_http_port as u16,
+                        listen_pg_addr: n.listen_pg_addr,
+                        listen_pg_port: n.listen_pg_port as u16,
+                    })
+                    .collect::<Vec<Node>>())
+            })
+            .await?;
 
-        // XXX hack: enable test_backward_compatibility to work by populating our list of
+        if nodes.is_empty() {
+            return self.list_nodes_local_env().await;
+        }
+
+        tracing::info!("list_nodes: loaded {} nodes", nodes.len());
+
+        Ok(nodes)
+    }
+
+    /// Shim for automated compatibility tests: load nodes from LocalEnv instead of database
+    pub(crate) async fn list_nodes_local_env(&self) -> DatabaseResult<Vec<Node>> {
+        // Enable test_backward_compatibility to work by populating our list of
         // nodes from LocalEnv when it is not present in persistent storage.  Otherwise at
         // first startup in the compat test, we may have shards but no nodes.
-        let mut result = Vec::new();
+        use control_plane::local_env::LocalEnv;
+        let env = LocalEnv::load_config().map_err(|e| DatabaseError::Logical(format!("{e}")))?;
         tracing::info!(
-            "Loaded {} pageserver nodes from LocalEnv",
+            "Loading {} pageserver nodes from LocalEnv",
             env.pageservers.len()
         );
+        let mut nodes = Vec::new();
         for ps_conf in env.pageservers {
             let (pg_host, pg_port) =
                 parse_host_port(&ps_conf.listen_pg_addr).expect("Unable to parse listen_pg_addr");
             let (http_host, http_port) = parse_host_port(&ps_conf.listen_http_addr)
                 .expect("Unable to parse listen_http_addr");
-            result.push(Node {
+            let node = Node {
                 id: ps_conf.id,
                 listen_pg_addr: pg_host.to_string(),
                 listen_pg_port: pg_port.unwrap_or(5432),
@@ -184,16 +163,96 @@ impl Persistence {
                 listen_http_port: http_port.unwrap_or(80),
                 availability: NodeAvailability::Active,
                 scheduling: NodeSchedulingPolicy::Active,
-            });
+            };
+
+            // Synchronize database with what we learn from LocalEnv
+            self.insert_node(&node).await?;
+
+            nodes.push(node);
         }
 
-        Ok(result)
+        Ok(nodes)
     }
 
-    /// At startup, we populate our map of tenant shards from persistent storage.
-    pub(crate) async fn list_tenant_shards(&self) -> anyhow::Result<Vec<TenantShardPersistence>> {
-        let inner = self.inner.lock().unwrap();
-        Ok(inner.state.tenants.values().cloned().collect())
+    /// At startup, load the high level state for shards, such as their config + policy.  This will
+    /// be enriched at runtime with state discovered on pageservers.
+    pub(crate) async fn list_tenant_shards(&self) -> DatabaseResult<Vec<TenantShardPersistence>> {
+        let loaded = self
+            .with_conn(move |conn| -> DatabaseResult<_> {
+                Ok(crate::schema::tenant_shards::table.load::<TenantShardPersistence>(conn)?)
+            })
+            .await?;
+
+        if loaded.is_empty() {
+            if let Some(path) = &self.json_path {
+                if tokio::fs::try_exists(path)
+                    .await
+                    .map_err(|e| DatabaseError::Logical(format!("Error stat'ing JSON file: {e}")))?
+                {
+                    tracing::info!("Importing from legacy JSON format at {path}");
+                    return self.list_tenant_shards_json(path).await;
+                }
+            }
+        }
+        Ok(loaded)
+    }
+
+    /// Shim for automated compatibility tests: load tenants from a JSON file instead of database
+    pub(crate) async fn list_tenant_shards_json(
+        &self,
+        path: &Utf8Path,
+    ) -> DatabaseResult<Vec<TenantShardPersistence>> {
+        let bytes = tokio::fs::read(path)
+            .await
+            .map_err(|e| DatabaseError::Logical(format!("Failed to load JSON: {e}")))?;
+
+        let mut decoded = serde_json::from_slice::<JsonPersistence>(&bytes)
+            .map_err(|e| DatabaseError::Logical(format!("Deserialization error: {e}")))?;
+        for (tenant_id, tenant) in &mut decoded.tenants {
+            // Backward compat: an old attachments.json from before PR #6251, replace
+            // empty strings with proper defaults.
+            if tenant.tenant_id.is_empty() {
+                tenant.tenant_id = tenant_id.to_string();
+                tenant.config = serde_json::to_string(&TenantConfig::default())
+                    .map_err(|e| DatabaseError::Logical(format!("Serialization error: {e}")))?;
+                tenant.placement_policy = serde_json::to_string(&PlacementPolicy::default())
+                    .map_err(|e| DatabaseError::Logical(format!("Serialization error: {e}")))?;
+            }
+        }
+
+        let tenants: Vec<TenantShardPersistence> = decoded.tenants.into_values().collect();
+
+        // Synchronize database with what is in the JSON file
+        self.insert_tenant_shards(tenants.clone()).await?;
+
+        Ok(tenants)
+    }
+
+    /// For use in testing environments, where we dump out JSON on shutdown.
+    pub async fn write_tenants_json(&self) -> anyhow::Result<()> {
+        let Some(path) = &self.json_path else {
+            anyhow::bail!("Cannot write JSON if path isn't set (test environment bug)");
+        };
+        tracing::info!("Writing state to {path}...");
+        let tenants = self.list_tenant_shards().await?;
+        let mut tenants_map = HashMap::new();
+        for tsp in tenants {
+            let tenant_shard_id = TenantShardId {
+                tenant_id: TenantId::from_str(tsp.tenant_id.as_str())?,
+                shard_number: ShardNumber(tsp.shard_number as u8),
+                shard_count: ShardCount(tsp.shard_count as u8),
+            };
+
+            tenants_map.insert(tenant_shard_id, tsp);
+        }
+        let json = serde_json::to_string(&JsonPersistence {
+            tenants: tenants_map,
+        })?;
+
+        tokio::fs::write(path, &json).await?;
+        tracing::info!("Wrote {} bytes to {path}...", json.len());
+
+        Ok(())
     }
 
     /// Tenants must be persisted before we schedule them for the first time.  This enables us
@@ -201,22 +260,79 @@ impl Persistence {
     pub(crate) async fn insert_tenant_shards(
         &self,
         shards: Vec<TenantShardPersistence>,
-    ) -> anyhow::Result<()> {
-        self.mutating_transaction(|locked| {
-            for shard in shards {
-                let tenant_shard_id = TenantShardId {
-                    tenant_id: TenantId::from_str(shard.tenant_id.as_str())?,
-                    shard_number: ShardNumber(shard.shard_number as u8),
-                    shard_count: ShardCount(shard.shard_count as u8),
-                };
-
-                locked.tenants.insert(tenant_shard_id, shard);
-            }
+    ) -> DatabaseResult<()> {
+        use crate::schema::tenant_shards::dsl::*;
+        self.with_conn(move |conn| -> DatabaseResult<()> {
+            conn.transaction(|conn| -> QueryResult<()> {
+                for tenant in &shards {
+                    diesel::insert_into(tenant_shards)
+                        .values(tenant)
+                        .execute(conn)?;
+                }
+                Ok(())
+            })?;
             Ok(())
         })
         .await
     }
 
+    /// Ordering: call this _after_ deleting the tenant on pageservers, but _before_ dropping state for
+    /// the tenant from memory on this server.
+    #[allow(unused)]
+    pub(crate) async fn delete_tenant(&self, del_tenant_id: TenantId) -> DatabaseResult<()> {
+        use crate::schema::tenant_shards::dsl::*;
+        self.with_conn(move |conn| -> DatabaseResult<()> {
+            diesel::delete(tenant_shards)
+                .filter(tenant_id.eq(del_tenant_id.to_string()))
+                .execute(conn)?;
+
+            Ok(())
+        })
+        .await
+    }
+
+    /// When a tenant invokes the /re-attach API, this function is responsible for doing an efficient
+    /// batched increment of the generations of all tenants whose generation_pageserver is equal to
+    /// the node that called /re-attach.
+    #[tracing::instrument(skip_all, fields(node_id))]
+    pub(crate) async fn re_attach(
+        &self,
+        node_id: NodeId,
+    ) -> DatabaseResult<HashMap<TenantShardId, Generation>> {
+        use crate::schema::tenant_shards::dsl::*;
+        let updated = self
+            .with_conn(move |conn| {
+                let rows_updated = diesel::update(tenant_shards)
+                    .filter(generation_pageserver.eq(node_id.0 as i64))
+                    .set(generation.eq(generation + 1))
+                    .execute(conn)?;
+
+                tracing::info!("Incremented {} tenants' generations", rows_updated);
+
+                // TODO: UPDATE+SELECT in one query
+
+                let updated = tenant_shards
+                    .filter(generation_pageserver.eq(node_id.0 as i64))
+                    .select(TenantShardPersistence::as_select())
+                    .load(conn)?;
+                Ok(updated)
+            })
+            .await?;
+
+        let mut result = HashMap::new();
+        for tsp in updated {
+            let tenant_shard_id = TenantShardId {
+                tenant_id: TenantId::from_str(tsp.tenant_id.as_str())
+                    .map_err(|e| DatabaseError::Logical(format!("Malformed tenant id: {e}")))?,
+                shard_number: ShardNumber(tsp.shard_number as u8),
+                shard_count: ShardCount(tsp.shard_count as u8),
+            };
+            result.insert(tenant_shard_id, Generation::new(tsp.generation as u32));
+        }
+
+        Ok(result)
+    }
+
     /// Reconciler calls this immediately before attaching to a new pageserver, to acquire a unique, monotonically
     /// advancing generation number.  We also store the NodeId for which the generation was issued, so that in
     /// [`Self::re_attach`] we can do a bulk UPDATE on the generations for that node.
@@ -225,47 +341,46 @@ impl Persistence {
         tenant_shard_id: TenantShardId,
         node_id: NodeId,
     ) -> anyhow::Result<Generation> {
-        self.mutating_transaction(|locked| {
-            let Some(shard) = locked.tenants.get_mut(&tenant_shard_id) else {
-                anyhow::bail!("Tried to increment generation of unknown shard");
-            };
+        use crate::schema::tenant_shards::dsl::*;
+        let updated = self
+            .with_conn(move |conn| {
+                let updated = diesel::update(tenant_shards)
+                    .filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string()))
+                    .filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32))
+                    .filter(shard_count.eq(tenant_shard_id.shard_count.0 as i32))
+                    .set((
+                        generation.eq(generation + 1),
+                        generation_pageserver.eq(node_id.0 as i64),
+                    ))
+                    // TODO: only returning() the generation column
+                    .returning(TenantShardPersistence::as_returning())
+                    .get_result(conn)?;
 
-            shard.generation += 1;
-            shard.generation_pageserver = Some(node_id);
+                Ok(updated)
+            })
+            .await?;
 
-            let gen = Generation::new(shard.generation);
-            Ok(gen)
-        })
-        .await
+        Ok(Generation::new(updated.generation as u32))
     }
 
     pub(crate) async fn detach(&self, tenant_shard_id: TenantShardId) -> anyhow::Result<()> {
-        self.mutating_transaction(|locked| {
-            let Some(shard) = locked.tenants.get_mut(&tenant_shard_id) else {
-                anyhow::bail!("Tried to increment generation of unknown shard");
-            };
-            shard.generation_pageserver = None;
-            shard.placement_policy = serde_json::to_string(&PlacementPolicy::Detached).unwrap();
-            Ok(())
-        })
-        .await
-    }
+        use crate::schema::tenant_shards::dsl::*;
+        self.with_conn(move |conn| {
+            let updated = diesel::update(tenant_shards)
+                .filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string()))
+                .filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32))
+                .filter(shard_count.eq(tenant_shard_id.shard_count.0 as i32))
+                .set((
+                    generation_pageserver.eq(i64::MAX),
+                    placement_policy.eq(serde_json::to_string(&PlacementPolicy::Detached).unwrap()),
+                ))
+                .execute(conn)?;
 
-    pub(crate) async fn re_attach(
-        &self,
-        node_id: NodeId,
-    ) -> anyhow::Result<HashMap<TenantShardId, Generation>> {
-        self.mutating_transaction(|locked| {
-            let mut result = HashMap::new();
-            for (tenant_shard_id, shard) in locked.tenants.iter_mut() {
-                if shard.generation_pageserver == Some(node_id) {
-                    shard.generation += 1;
-                    result.insert(*tenant_shard_id, Generation::new(shard.generation));
-                }
-            }
-            Ok(result)
+            Ok(updated)
         })
-        .await
+        .await?;
+
+        Ok(())
     }
 
     // TODO: when we start shard splitting, we must durably mark the tenant so that
@@ -285,7 +400,8 @@ impl Persistence {
 }
 
 /// Parts of [`crate::tenant_state::TenantState`] that are stored durably
-#[derive(Serialize, Deserialize, Clone)]
+#[derive(Queryable, Selectable, Insertable, Serialize, Deserialize, Clone)]
+#[diesel(table_name = crate::schema::tenant_shards)]
 pub(crate) struct TenantShardPersistence {
     #[serde(default)]
     pub(crate) tenant_id: String,
@@ -296,16 +412,28 @@ pub(crate) struct TenantShardPersistence {
     #[serde(default)]
     pub(crate) shard_stripe_size: i32,
 
-    // Currently attached pageserver
-    #[serde(rename = "pageserver")]
-    pub(crate) generation_pageserver: Option<NodeId>,
-
     // Latest generation number: next time we attach, increment this
     // and use the incremented number when attaching
-    pub(crate) generation: u32,
+    pub(crate) generation: i32,
+
+    // Currently attached pageserver
+    #[serde(rename = "pageserver")]
+    pub(crate) generation_pageserver: i64,
 
     #[serde(default)]
     pub(crate) placement_policy: String,
     #[serde(default)]
     pub(crate) config: String,
 }
+
+/// Parts of [`crate::node::Node`] that are stored durably
+#[derive(Serialize, Deserialize, Queryable, Selectable, Insertable)]
+#[diesel(table_name = crate::schema::nodes)]
+pub(crate) struct NodePersistence {
+    pub(crate) node_id: i64,
+    pub(crate) scheduling_policy: String,
+    pub(crate) listen_http_addr: String,
+    pub(crate) listen_http_port: i32,
+    pub(crate) listen_pg_addr: String,
+    pub(crate) listen_pg_port: i32,
+}
diff --git a/control_plane/attachment_service/src/schema.rs b/control_plane/attachment_service/src/schema.rs
new file mode 100644
index 0000000000..de80fc8f64
--- /dev/null
+++ b/control_plane/attachment_service/src/schema.rs
@@ -0,0 +1,27 @@
+// @generated automatically by Diesel CLI.
+
+diesel::table! {
+    nodes (node_id) {
+        node_id -> Int8,
+        scheduling_policy -> Varchar,
+        listen_http_addr -> Varchar,
+        listen_http_port -> Int4,
+        listen_pg_addr -> Varchar,
+        listen_pg_port -> Int4,
+    }
+}
+
+diesel::table! {
+    tenant_shards (tenant_id, shard_number, shard_count) {
+        tenant_id -> Varchar,
+        shard_number -> Int4,
+        shard_count -> Int4,
+        shard_stripe_size -> Int4,
+        generation -> Int4,
+        generation_pageserver -> Int8,
+        placement_policy -> Varchar,
+        config -> Text,
+    }
+}
+
+diesel::allow_tables_to_appear_in_same_query!(nodes, tenant_shards,);
diff --git a/control_plane/attachment_service/src/service.rs b/control_plane/attachment_service/src/service.rs
index c9ed07ae5f..ec56dc8ad4 100644
--- a/control_plane/attachment_service/src/service.rs
+++ b/control_plane/attachment_service/src/service.rs
@@ -11,6 +11,7 @@ use control_plane::attachment_service::{
     TenantCreateResponseShard, TenantLocateResponse, TenantLocateResponseShard,
     TenantShardMigrateRequest, TenantShardMigrateResponse,
 };
+use diesel::result::DatabaseErrorKind;
 use hyper::StatusCode;
 use pageserver_api::{
     control_api::{
@@ -26,6 +27,7 @@ use pageserver_api::{
 };
 use pageserver_client::mgmt_api;
 use utils::{
+    completion::Barrier,
     generation::Generation,
     http::error::ApiError,
     id::{NodeId, TenantId},
@@ -35,7 +37,7 @@ use utils::{
 use crate::{
     compute_hook::ComputeHook,
     node::Node,
-    persistence::{Persistence, TenantShardPersistence},
+    persistence::{DatabaseError, Persistence, TenantShardPersistence},
     scheduler::Scheduler,
     tenant_state::{
         IntentState, ObservedState, ObservedStateLocation, ReconcileResult, ReconcileWaitError,
@@ -46,6 +48,10 @@ use crate::{
 
 const RECONCILE_TIMEOUT: Duration = Duration::from_secs(30);
 
+/// How long [`Service::startup_reconcile`] is allowed to take before it should give
+/// up on unresponsive pageservers and proceed.
+pub(crate) const STARTUP_RECONCILE_TIMEOUT: Duration = Duration::from_secs(30);
+
 // Top level state available to all HTTP handlers
 struct ServiceState {
     tenants: BTreeMap<TenantShardId, TenantState>,
@@ -79,10 +85,27 @@ pub struct Config {
     pub jwt_token: Option<String>,
 }
 
+impl From<DatabaseError> for ApiError {
+    fn from(err: DatabaseError) -> ApiError {
+        match err {
+            DatabaseError::Query(e) => ApiError::InternalServerError(e.into()),
+            // FIXME: ApiError doesn't have an Unavailable variant, but ShuttingDown maps to 503.
+            DatabaseError::Connection(_e) => ApiError::ShuttingDown,
+            DatabaseError::Logical(reason) => {
+                ApiError::InternalServerError(anyhow::anyhow!(reason))
+            }
+        }
+    }
+}
+
 pub struct Service {
     inner: Arc<std::sync::RwLock<ServiceState>>,
     config: Config,
     persistence: Arc<Persistence>,
+
+    /// This waits for initial reconciliation with pageservers to complete.  Until this barrier
+    /// passes, it isn't safe to do any actions that mutate tenants.
+    pub(crate) startup_complete: Barrier,
 }
 
 impl From<ReconcileWaitError> for ApiError {
@@ -96,77 +119,32 @@ impl From<ReconcileWaitError> for ApiError {
 }
 
 impl Service {
-    pub async fn spawn(config: Config, persistence: Arc<Persistence>) -> anyhow::Result<Arc<Self>> {
-        let (result_tx, mut result_rx) = tokio::sync::mpsc::unbounded_channel();
-
-        tracing::info!("Loading nodes from database...");
-        let mut nodes = persistence.list_nodes().await?;
-        tracing::info!("Loaded {} nodes from database.", nodes.len());
-
-        tracing::info!("Loading shards from database...");
-        let tenant_shard_persistence = persistence.list_tenant_shards().await?;
-        tracing::info!(
-            "Loaded {} shards from database.",
-            tenant_shard_persistence.len()
-        );
-
-        let mut tenants = BTreeMap::new();
-
-        for tsp in tenant_shard_persistence {
-            let tenant_shard_id = TenantShardId {
-                tenant_id: TenantId::from_str(tsp.tenant_id.as_str())?,
-                shard_number: ShardNumber(tsp.shard_number as u8),
-                shard_count: ShardCount(tsp.shard_count as u8),
-            };
-            let shard_identity = if tsp.shard_count == 0 {
-                ShardIdentity::unsharded()
-            } else {
-                ShardIdentity::new(
-                    ShardNumber(tsp.shard_number as u8),
-                    ShardCount(tsp.shard_count as u8),
-                    ShardStripeSize(tsp.shard_stripe_size as u32),
-                )?
-            };
-            let new_tenant = TenantState {
-                tenant_shard_id,
-                shard: shard_identity,
-                sequence: Sequence::initial(),
-                // Note that we load generation, but don't care about generation_pageserver.  We will either end up finding
-                // our existing attached location and it will match generation_pageserver, or we will attach somewhere new
-                // and update generation_pageserver in the process.
-                generation: Generation::new(tsp.generation),
-                policy: serde_json::from_str(&tsp.placement_policy).unwrap(),
-                intent: IntentState::new(),
-                observed: ObservedState::new(),
-                config: serde_json::from_str(&tsp.config).unwrap(),
-                reconciler: None,
-                waiter: Arc::new(SeqWait::new(Sequence::initial())),
-                error_waiter: Arc::new(SeqWait::new(Sequence::initial())),
-                last_error: Arc::default(),
-            };
-
-            tenants.insert(tenant_shard_id, new_tenant);
-        }
+    pub fn get_config(&self) -> &Config {
+        &self.config
+    }
 
+    /// TODO: don't allow other API calls until this is done, don't start doing any background housekeeping
+    /// until this is done.
+    async fn startup_reconcile(&self) {
         // For all tenant shards, a vector of observed states on nodes (where None means
         // indeterminate, same as in [`ObservedStateLocation`])
         let mut observed = HashMap::new();
 
+        let nodes = {
+            let locked = self.inner.read().unwrap();
+            locked.nodes.clone()
+        };
+
         // TODO: issue these requests concurrently
-        for node in &mut nodes {
-            let client = mgmt_api::Client::new(node.base_url(), config.jwt_token.as_deref());
+        for node in nodes.values() {
+            let client = mgmt_api::Client::new(node.base_url(), self.config.jwt_token.as_deref());
 
             tracing::info!("Scanning shards on node {}...", node.id);
             match client.list_location_config().await {
                 Err(e) => {
                     tracing::warn!("Could not contact pageserver {} ({e})", node.id);
-                    // TODO: be more tolerant, apply a generous 5-10 second timeout
-                    // TODO: setting a node to Offline is a dramatic thing to do, and can
-                    // prevent neon_local from starting up (it starts this service before
-                    // any pageservers are  running).  It may make sense to give nodes
-                    // a Pending state to accomodate this situation, and allow (but deprioritize)
-                    // scheduling on Pending nodes.
-                    //node.availability = NodeAvailability::Offline;
+                    // TODO: be more tolerant, apply a generous 5-10 second timeout with retries, in case
+                    // pageserver is being restarted at the same time as we are
                 }
                 Ok(listing) => {
                     tracing::info!(
@@ -174,7 +152,6 @@ impl Service {
                         listing.tenant_shards.len(),
                         node.id
                     );
-                    node.availability = NodeAvailability::Active;
 
                     for (tenant_shard_id, conf_opt) in listing.tenant_shards {
                         observed.insert(tenant_shard_id, (node.id, conf_opt));
@@ -186,41 +163,46 @@ impl Service {
         let mut cleanup = Vec::new();
 
         // Populate intent and observed states for all tenants, based on reported state on pageservers
-        for (tenant_shard_id, (node_id, observed_loc)) in observed {
-            let Some(tenant_state) = tenants.get_mut(&tenant_shard_id) else {
-                cleanup.push((tenant_shard_id, node_id));
-                continue;
-            };
+        let shard_count = {
+            let mut locked = self.inner.write().unwrap();
+            for (tenant_shard_id, (node_id, observed_loc)) in observed {
+                let Some(tenant_state) = locked.tenants.get_mut(&tenant_shard_id) else {
+                    cleanup.push((tenant_shard_id, node_id));
+                    continue;
+                };
 
-            tenant_state
-                .observed
-                .locations
-                .insert(node_id, ObservedStateLocation { conf: observed_loc });
-        }
-
-        // State of nodes is now frozen, transform to a HashMap.
-        let mut nodes: HashMap<NodeId, Node> = nodes.into_iter().map(|n| (n.id, n)).collect();
-
-        // Populate each tenant's intent state
-        let mut scheduler = Scheduler::new(&tenants, &nodes);
-        for (tenant_shard_id, tenant_state) in tenants.iter_mut() {
-            tenant_state.intent_from_observed();
-            if let Err(e) = tenant_state.schedule(&mut scheduler) {
-                // Non-fatal error: we are unable to properly schedule the tenant, perhaps because
-                // not enough pageservers are available.  The tenant may well still be available
-                // to clients.
-                tracing::error!("Failed to schedule tenant {tenant_shard_id} at startup: {e}");
+                tenant_state
+                    .observed
+                    .locations
+                    .insert(node_id, ObservedStateLocation { conf: observed_loc });
             }
-        }
+
+            // Populate each tenant's intent state
+            let mut scheduler = Scheduler::new(&locked.tenants, &nodes);
+            for (tenant_shard_id, tenant_state) in locked.tenants.iter_mut() {
+                tenant_state.intent_from_observed();
+                if let Err(e) = tenant_state.schedule(&mut scheduler) {
+                    // Non-fatal error: we are unable to properly schedule the tenant, perhaps because
+                    // not enough pageservers are available.  The tenant may well still be available
+                    // to clients.
+                    tracing::error!("Failed to schedule tenant {tenant_shard_id} at startup: {e}");
+                }
+            }
+
+            locked.tenants.len()
+        };
+
+        // TODO: if any tenant's intent now differs from its loaded generation_pageserver, we should clear that
+        // generation_pageserver in the database.
 
         // Clean up any tenants that were found on pageservers but are not known to us.
         for (tenant_shard_id, node_id) in cleanup {
             // A node reported a tenant_shard_id which is unknown to us: detach it.
             let node = nodes
-                .get_mut(&node_id)
+                .get(&node_id)
                 .expect("Always exists: only known nodes are scanned");
 
-            let client = mgmt_api::Client::new(node.base_url(), config.jwt_token.as_deref());
+            let client = mgmt_api::Client::new(node.base_url(), self.config.jwt_token.as_deref());
             match client
                 .location_config(
                     tenant_shard_id,
@@ -252,13 +234,80 @@ impl Service {
             }
         }
 
-        let shard_count = tenants.len();
+        // Finally, now that the service is up and running, launch reconcile operations for any tenants
+        // which require it: under normal circumstances this should only include tenants that were in some
+        // transient state before we restarted.
+        let reconcile_tasks = self.reconcile_all();
+        tracing::info!("Startup complete, spawned {reconcile_tasks} reconciliation tasks ({shard_count} shards total)");
+    }
+
+    pub async fn spawn(config: Config, persistence: Arc<Persistence>) -> anyhow::Result<Arc<Self>> {
+        let (result_tx, mut result_rx) = tokio::sync::mpsc::unbounded_channel();
+
+        tracing::info!("Loading nodes from database...");
+        let nodes = persistence.list_nodes().await?;
+        let nodes: HashMap<NodeId, Node> = nodes.into_iter().map(|n| (n.id, n)).collect();
+        tracing::info!("Loaded {} nodes from database.", nodes.len());
+
+        tracing::info!("Loading shards from database...");
+        let tenant_shard_persistence = persistence.list_tenant_shards().await?;
+        tracing::info!(
+            "Loaded {} shards from database.",
+            tenant_shard_persistence.len()
+        );
+
+        let mut tenants = BTreeMap::new();
+
+        for tsp in tenant_shard_persistence {
+            let tenant_shard_id = TenantShardId {
+                tenant_id: TenantId::from_str(tsp.tenant_id.as_str())?,
+                shard_number: ShardNumber(tsp.shard_number as u8),
+                shard_count: ShardCount(tsp.shard_count as u8),
+            };
+            let shard_identity = if tsp.shard_count == 0 {
+                ShardIdentity::unsharded()
+            } else {
+                ShardIdentity::new(
+                    ShardNumber(tsp.shard_number as u8),
+                    ShardCount(tsp.shard_count as u8),
+                    ShardStripeSize(tsp.shard_stripe_size as u32),
+                )?
+            };
+
+            // We will populate intent properly later in [`Self::startup_reconcile`], initially populate
+            // it with what we can infer: the node for which a generation was most recently issued.
+            let mut intent = IntentState::new();
+            if tsp.generation_pageserver != i64::MAX {
+                intent.attached = Some(NodeId(tsp.generation_pageserver as u64))
+            }
+
+            let new_tenant = TenantState {
+                tenant_shard_id,
+                shard: shard_identity,
+                sequence: Sequence::initial(),
+                generation: Generation::new(tsp.generation as u32),
+                policy: serde_json::from_str(&tsp.placement_policy).unwrap(),
+                intent,
+                observed: ObservedState::new(),
+                config: serde_json::from_str(&tsp.config).unwrap(),
+                reconciler: None,
+                waiter: Arc::new(SeqWait::new(Sequence::initial())),
+                error_waiter: Arc::new(SeqWait::new(Sequence::initial())),
+                last_error: Arc::default(),
+            };
+
+            tenants.insert(tenant_shard_id, new_tenant);
+        }
+
+        let (startup_completion, startup_complete) = utils::completion::channel();
+
         let this = Arc::new(Self {
             inner: Arc::new(std::sync::RwLock::new(ServiceState::new(
                 result_tx, nodes, tenants,
             ))),
             config,
             persistence,
+            startup_complete,
         });
 
         let result_task_this = this.clone();
@@ -316,11 +365,13 @@ impl Service {
             }
         });
 
-        // Finally, now that the service is up and running, launch reconcile operations for any tenants
-        // which require it: under normal circumstances this should only include tenants that were in some
-        // transient state before we restarted.
-        let reconcile_tasks = this.reconcile_all();
-        tracing::info!("Startup complete, spawned {reconcile_tasks} reconciliation tasks ({shard_count} shards total)");
+        let startup_reconcile_this = this.clone();
+        tokio::task::spawn(async move {
+            // Block the [`Service::startup_complete`] barrier until we're done
+            let _completion = startup_completion;
+
+            startup_reconcile_this.startup_reconcile().await
+        });
 
         Ok(this)
     }
@@ -336,7 +387,6 @@ impl Service {
             let locked = self.inner.write().unwrap();
             !locked.tenants.contains_key(&attach_req.tenant_shard_id)
         };
-
         if insert {
             let tsp = TenantShardPersistence {
                 tenant_id: attach_req.tenant_shard_id.tenant_id.to_string(),
@@ -344,22 +394,39 @@ impl Service {
                 shard_count: attach_req.tenant_shard_id.shard_count.0 as i32,
                 shard_stripe_size: 0,
                 generation: 0,
-                generation_pageserver: None,
+                generation_pageserver: i64::MAX,
                 placement_policy: serde_json::to_string(&PlacementPolicy::default()).unwrap(),
                 config: serde_json::to_string(&TenantConfig::default()).unwrap(),
             };
 
-            self.persistence.insert_tenant_shards(vec![tsp]).await?;
+            match self.persistence.insert_tenant_shards(vec![tsp]).await {
+                Err(e) => match e {
+                    DatabaseError::Query(diesel::result::Error::DatabaseError(
+                        DatabaseErrorKind::UniqueViolation,
+                        _,
+                    )) => {
+                        tracing::info!(
+                            "Raced with another request to insert tenant {}",
+                            attach_req.tenant_shard_id
+                        )
+                    }
+                    _ => return Err(e.into()),
+                },
+                Ok(()) => {
+                    tracing::info!("Inserted shard {} in database", attach_req.tenant_shard_id);
 
-            let mut locked = self.inner.write().unwrap();
-            locked.tenants.insert(
-                attach_req.tenant_shard_id,
-                TenantState::new(
-                    attach_req.tenant_shard_id,
-                    ShardIdentity::unsharded(),
-                    PlacementPolicy::Single,
-                ),
-            );
+                    let mut locked = self.inner.write().unwrap();
+                    locked.tenants.insert(
+                        attach_req.tenant_shard_id,
+                        TenantState::new(
+                            attach_req.tenant_shard_id,
+                            ShardIdentity::unsharded(),
+                            PlacementPolicy::Single,
+                        ),
+                    );
+                    tracing::info!("Inserted shard {} in memory", attach_req.tenant_shard_id);
+                }
+            }
         }
 
         let new_generation = if let Some(req_node_id) = attach_req.node_id {
@@ -506,6 +573,14 @@ impl Service {
                     id: req_tenant.id,
                     valid,
                 });
+            } else {
+                // After tenant deletion, we may approve any validation.  This avoids
+                // spurious warnings on the pageserver if it has pending LSN updates
+                // at the point a deletion happens.
+                response.tenants.push(ValidateResponseTenant {
+                    id: req_tenant.id,
+                    valid: true,
+                });
             }
         }
         response
@@ -561,7 +636,7 @@ impl Service {
                 shard_count: tenant_shard_id.shard_count.0 as i32,
                 shard_stripe_size: create_req.shard_parameters.stripe_size.0 as i32,
                 generation: 0,
-                generation_pageserver: None,
+                generation_pageserver: i64::MAX,
                 placement_policy: serde_json::to_string(&placement_policy).unwrap(),
                 config: serde_json::to_string(&create_req.config).unwrap(),
             })
@@ -967,10 +1042,7 @@ impl Service {
             availability: NodeAvailability::Active,
         };
         // TODO: idempotency if the node already exists in the database
-        self.persistence
-            .insert_node(&new_node)
-            .await
-            .map_err(ApiError::InternalServerError)?;
+        self.persistence.insert_node(&new_node).await?;
 
         let mut locked = self.inner.write().unwrap();
         let mut new_nodes = (*locked.nodes).clone();
diff --git a/control_plane/src/attachment_service.rs b/control_plane/src/attachment_service.rs
index 2d43c46270..6602aa9a73 100644
--- a/control_plane/src/attachment_service.rs
+++ b/control_plane/src/attachment_service.rs
@@ -1,5 +1,11 @@
 use crate::{background_process, local_env::LocalEnv};
-use camino::Utf8PathBuf;
+use camino::{Utf8Path, Utf8PathBuf};
+use diesel::{
+    backend::Backend,
+    query_builder::{AstPass, QueryFragment, QueryId},
+    Connection, PgConnection, QueryResult, RunQueryDsl,
+};
+use diesel_migrations::{HarnessWithOutput, MigrationHarness};
 use hyper::Method;
 use pageserver_api::{
     models::{ShardParameters, TenantCreateRequest, TimelineCreateRequest, TimelineInfo},
@@ -7,9 +13,9 @@ use pageserver_api::{
 };
 use pageserver_client::mgmt_api::ResponseErrorMessageExt;
 use postgres_backend::AuthType;
-use postgres_connection::parse_host_port;
 use serde::{de::DeserializeOwned, Deserialize, Serialize};
-use std::{path::PathBuf, str::FromStr};
+use std::{env, str::FromStr};
+use tokio::process::Command;
 use tracing::instrument;
 use utils::{
     auth::{Claims, Scope},
@@ -19,14 +25,17 @@ use utils::{
 pub struct AttachmentService {
     env: LocalEnv,
     listen: String,
-    path: PathBuf,
+    path: Utf8PathBuf,
     jwt_token: Option<String>,
     public_key_path: Option<Utf8PathBuf>,
+    postgres_port: u16,
     client: reqwest::Client,
 }
 
 const COMMAND: &str = "attachment_service";
 
+const ATTACHMENT_SERVICE_POSTGRES_VERSION: u32 = 16;
+
 #[derive(Serialize, Deserialize)]
 pub struct AttachHookRequest {
     pub tenant_shard_id: TenantShardId,
@@ -169,7 +178,9 @@ pub struct TenantShardMigrateResponse {}
 
 impl AttachmentService {
     pub fn from_env(env: &LocalEnv) -> Self {
-        let path = env.base_data_dir.join("attachments.json");
+        let path = Utf8PathBuf::from_path_buf(env.base_data_dir.clone())
+            .unwrap()
+            .join("attachments.json");
 
         // Makes no sense to construct this if pageservers aren't going to use it: assume
         // pageservers have control plane API set
@@ -181,6 +192,13 @@ impl AttachmentService {
             listen_url.port().unwrap()
         );
 
+        // Convention: NeonEnv in python tests reserves the next port after the control_plane_api
+        // port, for use by our captive postgres.
+        let postgres_port = listen_url
+            .port()
+            .expect("Control plane API setting should always have a port")
+            + 1;
+
         // Assume all pageservers have symmetric auth configuration: this service
         // expects to use one JWT token to talk to all of them.
         let ps_conf = env
@@ -209,6 +227,7 @@ impl AttachmentService {
             listen,
             jwt_token,
             public_key_path,
+            postgres_port,
             client: reqwest::ClientBuilder::new()
                 .build()
                 .expect("Failed to construct http client"),
@@ -220,13 +239,214 @@ impl AttachmentService {
             .expect("non-Unicode path")
     }
 
-    pub async fn start(&self) -> anyhow::Result<()> {
-        let path_str = self.path.to_string_lossy();
+    /// PIDFile for the postgres instance used to store attachment service state
+    fn postgres_pid_file(&self) -> Utf8PathBuf {
+        Utf8PathBuf::from_path_buf(
+            self.env
+                .base_data_dir
+                .join("attachment_service_postgres.pid"),
+        )
+        .expect("non-Unicode path")
+    }
 
-        let mut args = vec!["-l", &self.listen, "-p", &path_str]
-            .into_iter()
-            .map(|s| s.to_string())
-            .collect::<Vec<_>>();
+    /// In order to access database migrations, we need to find the Neon source tree
+    async fn find_source_root(&self) -> anyhow::Result<Utf8PathBuf> {
+        // We assume that either prd or our binary is in the source tree. The former is usually
+        // true for automated test runners, the latter is usually true for developer workstations. Often
+        // both are true, which is fine.
+        let candidate_start_points = [
+            // Current working directory
+            Utf8PathBuf::from_path_buf(std::env::current_dir()?).unwrap(),
+            // Directory containing the binary we're running inside
+            Utf8PathBuf::from_path_buf(env::current_exe()?.parent().unwrap().to_owned()).unwrap(),
+        ];
+
+        // For each candidate start point, search through ancestors looking for a neon.git source tree root
+        for start_point in &candidate_start_points {
+            // Start from the build dir: assumes we are running out of a built neon source tree
+            for path in start_point.ancestors() {
+                // A crude approximation: the root of the source tree is whatever contains a "control_plane"
+                // subdirectory.
+                let control_plane = path.join("control_plane");
+                if tokio::fs::try_exists(&control_plane).await? {
+                    return Ok(path.to_owned());
+                }
+            }
+        }
+
+        // Fall-through
+        Err(anyhow::anyhow!(
+            "Could not find control_plane src dir, after searching ancestors of {candidate_start_points:?}"
+        ))
+    }
+
+    /// Find the directory containing postgres binaries, such as `initdb` and `pg_ctl`
+    ///
+    /// This usually uses ATTACHMENT_SERVICE_POSTGRES_VERSION of postgres, but will fall back
+    /// to other versions if that one isn't found.  Some automated tests create circumstances
+    /// where only one version is available in pg_distrib_dir, such as `test_remote_extensions`.
+    pub async fn get_pg_bin_dir(&self) -> anyhow::Result<Utf8PathBuf> {
+        let prefer_versions = [ATTACHMENT_SERVICE_POSTGRES_VERSION, 15, 14];
+
+        for v in prefer_versions {
+            let path = Utf8PathBuf::from_path_buf(self.env.pg_bin_dir(v)?).unwrap();
+            if tokio::fs::try_exists(&path).await? {
+                return Ok(path);
+            }
+        }
+
+        // Fall through
+        anyhow::bail!(
+            "Postgres binaries not found in {}",
+            self.env.pg_distrib_dir.display()
+        );
+    }
+
+    /// Readiness check for our postgres process
+    async fn pg_isready(&self, pg_bin_dir: &Utf8Path) -> anyhow::Result<bool> {
+        let bin_path = pg_bin_dir.join("pg_isready");
+        let args = ["-h", "localhost", "-p", &format!("{}", self.postgres_port)];
+        let exitcode = Command::new(bin_path).args(args).spawn()?.wait().await?;
+
+        Ok(exitcode.success())
+    }
+
+    /// Create our database if it doesn't exist, and run migrations.
+    ///
+    /// This function is equivalent to the `diesel setup` command in the diesel CLI.  We implement
+    /// the same steps by hand to avoid imposing a dependency on installing diesel-cli for developers
+    /// who just want to run `cargo neon_local` without knowing about diesel.
+    ///
+    /// Returns the database url
+    pub async fn setup_database(&self) -> anyhow::Result<String> {
+        let database_url = format!(
+            "postgresql://localhost:{}/attachment_service",
+            self.postgres_port
+        );
+        println!("Running attachment service database setup...");
+        fn change_database_of_url(database_url: &str, default_database: &str) -> (String, String) {
+            let base = ::url::Url::parse(database_url).unwrap();
+            let database = base.path_segments().unwrap().last().unwrap().to_owned();
+            let mut new_url = base.join(default_database).unwrap();
+            new_url.set_query(base.query());
+            (database, new_url.into())
+        }
+
+        #[derive(Debug, Clone)]
+        pub struct CreateDatabaseStatement {
+            db_name: String,
+        }
+
+        impl CreateDatabaseStatement {
+            pub fn new(db_name: &str) -> Self {
+                CreateDatabaseStatement {
+                    db_name: db_name.to_owned(),
+                }
+            }
+        }
+
+        impl<DB: Backend> QueryFragment<DB> for CreateDatabaseStatement {
+            fn walk_ast<'b>(&'b self, mut out: AstPass<'_, 'b, DB>) -> QueryResult<()> {
+                out.push_sql("CREATE DATABASE ");
+                out.push_identifier(&self.db_name)?;
+                Ok(())
+            }
+        }
+
+        impl<Conn> RunQueryDsl<Conn> for CreateDatabaseStatement {}
+
+        impl QueryId for CreateDatabaseStatement {
+            type QueryId = ();
+
+            const HAS_STATIC_QUERY_ID: bool = false;
+        }
+        if PgConnection::establish(&database_url).is_err() {
+            let (database, postgres_url) = change_database_of_url(&database_url, "postgres");
+            println!("Creating database: {database}");
+            let mut conn = PgConnection::establish(&postgres_url)?;
+            CreateDatabaseStatement::new(&database).execute(&mut conn)?;
+        }
+        let mut conn = PgConnection::establish(&database_url)?;
+
+        let migrations_dir = self
+            .find_source_root()
+            .await?
+            .join("control_plane/attachment_service/migrations");
+
+        let migrations = diesel_migrations::FileBasedMigrations::from_path(migrations_dir)?;
+        println!("Running migrations in {}", migrations.path().display());
+        HarnessWithOutput::write_to_stdout(&mut conn)
+            .run_pending_migrations(migrations)
+            .map(|_| ())
+            .map_err(|e| anyhow::anyhow!(e))?;
+
+        println!("Migrations complete");
+
+        Ok(database_url)
+    }
+
+    pub async fn start(&self) -> anyhow::Result<()> {
+        // Start a vanilla Postgres process used by the attachment service for persistence.
+        let pg_data_path = Utf8PathBuf::from_path_buf(self.env.base_data_dir.clone())
+            .unwrap()
+            .join("attachment_service_db");
+        let pg_bin_dir = self.get_pg_bin_dir().await?;
+        let pg_log_path = pg_data_path.join("postgres.log");
+
+        if !tokio::fs::try_exists(&pg_data_path).await? {
+            // Initialize empty database
+            let initdb_path = pg_bin_dir.join("initdb");
+            let mut child = Command::new(&initdb_path)
+                .args(["-D", pg_data_path.as_ref()])
+                .spawn()
+                .expect("Failed to spawn initdb");
+            let status = child.wait().await?;
+            if !status.success() {
+                anyhow::bail!("initdb failed with status {status}");
+            }
+
+            tokio::fs::write(
+                &pg_data_path.join("postgresql.conf"),
+                format!("port = {}", self.postgres_port),
+            )
+            .await?;
+        };
+
+        println!("Starting attachment service database...");
+        let db_start_args = [
+            "-w",
+            "-D",
+            pg_data_path.as_ref(),
+            "-l",
+            pg_log_path.as_ref(),
+            "start",
+        ];
+
+        background_process::start_process(
+            "attachment_service_db",
+            &self.env.base_data_dir,
+            pg_bin_dir.join("pg_ctl").as_std_path(),
+            db_start_args,
+            [],
+            background_process::InitialPidFile::Create(self.postgres_pid_file()),
+            || self.pg_isready(&pg_bin_dir),
+        )
+        .await?;
+
+        // Run migrations on every startup, in case something changed.
+        let database_url = self.setup_database().await?;
+
+        let mut args = vec![
+            "-l",
+            &self.listen,
+            "-p",
+            self.path.as_ref(),
+            "--database-url",
+            &database_url,
+        ]
+        .into_iter()
+        .map(|s| s.to_string())
+        .collect::<Vec<_>>();
         if let Some(jwt_token) = &self.jwt_token {
             args.push(format!("--jwt-token={jwt_token}"));
         }
@@ -235,7 +455,7 @@ impl AttachmentService {
             args.push(format!("--public-key={public_key_path}"));
         }
 
-        let result = background_process::start_process(
+        background_process::start_process(
             COMMAND,
             &self.env.base_data_dir,
             &self.env.attachment_service_bin(),
@@ -252,30 +472,46 @@ impl AttachmentService {
                 }
             },
         )
-        .await;
+        .await?;
 
-        // TODO: shouldn't we bail if we fail to spawn the process?
-        for ps_conf in &self.env.pageservers {
-            let (pg_host, pg_port) =
-                parse_host_port(&ps_conf.listen_pg_addr).expect("Unable to parse listen_pg_addr");
-            let (http_host, http_port) = parse_host_port(&ps_conf.listen_http_addr)
-                .expect("Unable to parse listen_http_addr");
-            self.node_register(NodeRegisterRequest {
-                node_id: ps_conf.id,
-                listen_pg_addr: pg_host.to_string(),
-                listen_pg_port: pg_port.unwrap_or(5432),
-                listen_http_addr: http_host.to_string(),
-                listen_http_port: http_port.unwrap_or(80),
-            })
+        Ok(())
+    }
+
+    pub async fn stop(&self, immediate: bool) -> anyhow::Result<()> {
+        background_process::stop_process(immediate, COMMAND, &self.pid_file())?;
+
+        let pg_data_path = self.env.base_data_dir.join("attachment_service_db");
+        let pg_bin_dir = self.get_pg_bin_dir().await?;
+
+        println!("Stopping attachment service database...");
+        let pg_stop_args = ["-D", &pg_data_path.to_string_lossy(), "stop"];
+        let stop_status = Command::new(pg_bin_dir.join("pg_ctl"))
+            .args(pg_stop_args)
+            .spawn()?
+            .wait()
             .await?;
+        if !stop_status.success() {
+            let pg_status_args = ["-D", &pg_data_path.to_string_lossy(), "status"];
+            let status_exitcode = Command::new(pg_bin_dir.join("pg_ctl"))
+                .args(pg_status_args)
+                .spawn()?
+                .wait()
+                .await?;
+
+            // pg_ctl status returns this exit code if postgres is not running: in this case it is
+            // fine that stop failed.  Otherwise it is an error that stop failed.
+            const PG_STATUS_NOT_RUNNING: i32 = 3;
+            if Some(PG_STATUS_NOT_RUNNING) == status_exitcode.code() {
+                println!("Attachment service data base is already stopped");
+                return Ok(());
+            } else {
+                anyhow::bail!("Failed to stop attachment service database: {stop_status}")
+            }
         }
 
-        result
+        Ok(())
     }
 
-    pub fn stop(&self, immediate: bool) -> anyhow::Result<()> {
-        background_process::stop_process(immediate, COMMAND, &self.pid_file())
-    }
     /// Simple HTTP request wrapper for calling into attachment service
     async fn dispatch<RQ, RS>(
         &self,
@@ -357,7 +593,7 @@ impl AttachmentService {
         &self,
         req: TenantCreateRequest,
     ) -> anyhow::Result<TenantCreateResponse> {
-        self.dispatch(Method::POST, "tenant".to_string(), Some(req))
+        self.dispatch(Method::POST, "v1/tenant".to_string(), Some(req))
             .await
     }
 
@@ -414,7 +650,7 @@ impl AttachmentService {
     ) -> anyhow::Result<TimelineInfo> {
         self.dispatch(
             Method::POST,
-            format!("tenant/{tenant_id}/timeline"),
+            format!("v1/tenant/{tenant_id}/timeline"),
             Some(req),
         )
         .await
diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs
index 279c47398f..a5242e3dc7 100644
--- a/control_plane/src/bin/neon_local.rs
+++ b/control_plane/src/bin/neon_local.rs
@@ -135,7 +135,7 @@ fn main() -> Result<()> {
             "tenant" => rt.block_on(handle_tenant(sub_args, &mut env)),
             "timeline" => rt.block_on(handle_timeline(sub_args, &mut env)),
             "start" => rt.block_on(handle_start_all(sub_args, &env)),
-            "stop" => handle_stop_all(sub_args, &env),
+            "stop" => rt.block_on(handle_stop_all(sub_args, &env)),
             "pageserver" => rt.block_on(handle_pageserver(sub_args, &env)),
             "attachment_service" => rt.block_on(handle_attachment_service(sub_args, &env)),
             "safekeeper" => rt.block_on(handle_safekeeper(sub_args, &env)),
@@ -1056,8 +1056,9 @@ fn get_pageserver(env: &local_env::LocalEnv, args: &ArgMatches) -> Result<PageSe
 async fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
     match sub_match.subcommand() {
         Some(("start", subcommand_args)) => {
+            let register = subcommand_args.get_one::<bool>("register").unwrap_or(&true);
             if let Err(e) = get_pageserver(env, subcommand_args)?
-                .start(&pageserver_config_overrides(subcommand_args))
+                .start(&pageserver_config_overrides(subcommand_args), *register)
                 .await
             {
                 eprintln!("pageserver start failed: {e}");
@@ -1086,24 +1087,7 @@ async fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) ->
             }
 
             if let Err(e) = pageserver
-                .start(&pageserver_config_overrides(subcommand_args))
-                .await
-            {
-                eprintln!("pageserver start failed: {e}");
-                exit(1);
-            }
-        }
-
-        Some(("migrate", subcommand_args)) => {
-            let pageserver = get_pageserver(env, subcommand_args)?;
-            //TODO what shutdown strategy should we use here?
-            if let Err(e) = pageserver.stop(false) {
-                eprintln!("pageserver stop failed: {}", e);
-                exit(1);
-            }
-
-            if let Err(e) = pageserver
-                .start(&pageserver_config_overrides(subcommand_args))
+                .start(&pageserver_config_overrides(subcommand_args), false)
                 .await
             {
                 eprintln!("pageserver start failed: {e}");
@@ -1161,7 +1145,7 @@ async fn handle_attachment_service(
                 .map(|s| s.as_str())
                 == Some("immediate");
 
-            if let Err(e) = svc.stop(immediate) {
+            if let Err(e) = svc.stop(immediate).await {
                 eprintln!("stop failed: {}", e);
                 exit(1);
             }
@@ -1257,7 +1241,7 @@ async fn handle_start_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) ->
         let attachment_service = AttachmentService::from_env(env);
         if let Err(e) = attachment_service.start().await {
             eprintln!("attachment_service start failed: {:#}", e);
-            try_stop_all(env, true);
+            try_stop_all(env, true).await;
             exit(1);
         }
     }
@@ -1265,11 +1249,11 @@ async fn handle_start_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) ->
     for ps_conf in &env.pageservers {
         let pageserver = PageServerNode::from_env(env, ps_conf);
         if let Err(e) = pageserver
-            .start(&pageserver_config_overrides(sub_match))
+            .start(&pageserver_config_overrides(sub_match), true)
             .await
         {
             eprintln!("pageserver {} start failed: {:#}", ps_conf.id, e);
-            try_stop_all(env, true);
+            try_stop_all(env, true).await;
             exit(1);
         }
     }
@@ -1278,23 +1262,23 @@ async fn handle_start_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) ->
         let safekeeper = SafekeeperNode::from_env(env, node);
         if let Err(e) = safekeeper.start(vec![]).await {
             eprintln!("safekeeper {} start failed: {:#}", safekeeper.id, e);
-            try_stop_all(env, false);
+            try_stop_all(env, false).await;
             exit(1);
         }
     }
     Ok(())
 }
 
-fn handle_stop_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
+async fn handle_stop_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
     let immediate =
         sub_match.get_one::<String>("stop-mode").map(|s| s.as_str()) == Some("immediate");
 
-    try_stop_all(env, immediate);
+    try_stop_all(env, immediate).await;
 
     Ok(())
 }
 
-fn try_stop_all(env: &local_env::LocalEnv, immediate: bool) {
+async fn try_stop_all(env: &local_env::LocalEnv, immediate: bool) {
     // Stop all endpoints
     match ComputeControlPlane::load(env.clone()) {
         Ok(cplane) => {
@@ -1329,7 +1313,7 @@ fn try_stop_all(env: &local_env::LocalEnv, immediate: bool) {
 
     if env.control_plane_api.is_some() {
         let attachment_service = AttachmentService::from_env(env);
-        if let Err(e) = attachment_service.stop(immediate) {
+        if let Err(e) = attachment_service.stop(immediate).await {
             eprintln!("attachment service stop failed: {e:#}");
         }
     }
@@ -1549,7 +1533,11 @@ fn cli() -> Command {
                 .subcommand(Command::new("status"))
                 .subcommand(Command::new("start")
                     .about("Start local pageserver")
-                    .arg(pageserver_config_args.clone())
+                    .arg(pageserver_config_args.clone()).arg(Arg::new("register")
+                    .long("register")
+                    .default_value("true").required(false)
+                    .value_parser(value_parser!(bool))
+                    .value_name("register"))
                 )
                 .subcommand(Command::new("stop")
                     .about("Stop local pageserver")
diff --git a/control_plane/src/local_env.rs b/control_plane/src/local_env.rs
index 4460fdd3a6..aefef47da7 100644
--- a/control_plane/src/local_env.rs
+++ b/control_plane/src/local_env.rs
@@ -223,7 +223,11 @@ impl LocalEnv {
     }
 
     pub fn attachment_service_bin(&self) -> PathBuf {
-        self.neon_distrib_dir.join("attachment_service")
+        // Irrespective of configuration, attachment service binary is always
+        // run from the same location as neon_local.  This means that for compatibility
+        // tests that run old pageserver/safekeeper, they still run latest attachment service.
+        let neon_local_bin_dir = env::current_exe().unwrap().parent().unwrap().to_owned();
+        neon_local_bin_dir.join("attachment_service")
     }
 
     pub fn safekeeper_bin(&self) -> PathBuf {
diff --git a/control_plane/src/pageserver.rs b/control_plane/src/pageserver.rs
index 1db21c9a37..540d1185a2 100644
--- a/control_plane/src/pageserver.rs
+++ b/control_plane/src/pageserver.rs
@@ -30,6 +30,7 @@ use utils::{
     lsn::Lsn,
 };
 
+use crate::attachment_service::{AttachmentService, NodeRegisterRequest};
 use crate::local_env::PageServerConf;
 use crate::{background_process, local_env::LocalEnv};
 
@@ -161,8 +162,8 @@ impl PageServerNode {
             .expect("non-Unicode path")
     }
 
-    pub async fn start(&self, config_overrides: &[&str]) -> anyhow::Result<()> {
-        self.start_node(config_overrides, false).await
+    pub async fn start(&self, config_overrides: &[&str], register: bool) -> anyhow::Result<()> {
+        self.start_node(config_overrides, false, register).await
     }
 
     fn pageserver_init(&self, config_overrides: &[&str]) -> anyhow::Result<()> {
@@ -207,6 +208,7 @@ impl PageServerNode {
         &self,
         config_overrides: &[&str],
         update_config: bool,
+        register: bool,
     ) -> anyhow::Result<()> {
         // TODO: using a thread here because start_process() is not async but we need to call check_status()
         let datadir = self.repo_path();
@@ -244,7 +246,26 @@ impl PageServerNode {
                 }
             },
         )
-        .await
+        .await?;
+
+        if register {
+            let attachment_service = AttachmentService::from_env(&self.env);
+            let (pg_host, pg_port) =
+                parse_host_port(&self.conf.listen_pg_addr).expect("Unable to parse listen_pg_addr");
+            let (http_host, http_port) = parse_host_port(&self.conf.listen_http_addr)
+                .expect("Unable to parse listen_http_addr");
+            attachment_service
+                .node_register(NodeRegisterRequest {
+                    node_id: self.conf.id,
+                    listen_pg_addr: pg_host.to_string(),
+                    listen_pg_port: pg_port.unwrap_or(5432),
+                    listen_http_addr: http_host.to_string(),
+                    listen_http_port: http_port.unwrap_or(80),
+                })
+                .await?;
+        }
+
+        Ok(())
     }
 
     fn pageserver_basic_args<'a>(
diff --git a/diesel.toml b/diesel.toml
new file mode 100644
index 0000000000..30ed4444d7
--- /dev/null
+++ b/diesel.toml
@@ -0,0 +1,9 @@
+# For documentation on how to configure this file,
+# see https://diesel.rs/guides/configuring-diesel-cli
+
+[print_schema]
+file = "control_plane/attachment_service/src/schema.rs"
+custom_type_derives = ["diesel::query_builder::QueryId"]
+
+[migrations_directory]
+dir = "control_plane/attachment_service/migrations"
diff --git a/libs/utils/src/crashsafe.rs b/libs/utils/src/crashsafe.rs
index 0c6855d17b..b089af4a02 100644
--- a/libs/utils/src/crashsafe.rs
+++ b/libs/utils/src/crashsafe.rs
@@ -1,7 +1,7 @@
 use std::{
     borrow::Cow,
     fs::{self, File},
-    io::{self, Write},
+    io,
 };
 
 use camino::{Utf8Path, Utf8PathBuf};
@@ -112,48 +112,6 @@ pub async fn fsync_async(path: impl AsRef<Utf8Path>) -> Result<(), std::io::Erro
     tokio::fs::File::open(path.as_ref()).await?.sync_all().await
 }
 
-/// Writes a file to the specified `final_path` in a crash safe fasion
-///
-/// The file is first written to the specified tmp_path, and in a second
-/// step, the tmp path is renamed to the final path. As renames are
-/// atomic, a crash during the write operation will never leave behind a
-/// partially written file.
-///
-/// NB: an async variant of this code exists in Pageserver's VirtualFile.
-pub fn overwrite(
-    final_path: &Utf8Path,
-    tmp_path: &Utf8Path,
-    content: &[u8],
-) -> std::io::Result<()> {
-    let Some(final_path_parent) = final_path.parent() else {
-        return Err(std::io::Error::from_raw_os_error(
-            nix::errno::Errno::EINVAL as i32,
-        ));
-    };
-    std::fs::remove_file(tmp_path).or_else(crate::fs_ext::ignore_not_found)?;
-    let mut file = std::fs::OpenOptions::new()
-        .write(true)
-        // Use `create_new` so that, if we race with ourselves or something else,
-        // we bail out instead of causing damage.
-        .create_new(true)
-        .open(tmp_path)?;
-    file.write_all(content)?;
-    file.sync_all()?;
-    drop(file); // before the rename, that's important!
-                // renames are atomic
-    std::fs::rename(tmp_path, final_path)?;
-    // Only open final path parent dirfd now, so that this operation only
-    // ever holds one VirtualFile fd at a time.  That's important because
-    // the current `find_victim_slot` impl might pick the same slot for both
-    // VirtualFile., and it eventually does a blocking write lock instead of
-    // try_lock.
-    let final_parent_dirfd = std::fs::OpenOptions::new()
-        .read(true)
-        .open(final_path_parent)?;
-    final_parent_dirfd.sync_all()?;
-    Ok(())
-}
-
 #[cfg(test)]
 mod tests {
 
diff --git a/pageserver/src/virtual_file.rs b/pageserver/src/virtual_file.rs
index d200a4ba5e..066f06c88f 100644
--- a/pageserver/src/virtual_file.rs
+++ b/pageserver/src/virtual_file.rs
@@ -403,7 +403,12 @@ impl VirtualFile {
         Ok(vfile)
     }
 
-    /// Async & [`VirtualFile`]-enabled version of [`::utils::crashsafe::overwrite`].
+    /// Writes a file to the specified `final_path` in a crash safe fasion
+    ///
+    /// The file is first written to the specified tmp_path, and in a second
+    /// step, the tmp path is renamed to the final path. As renames are
+    /// atomic, a crash during the write operation will never leave behind a
+    /// partially written file.
     pub async fn crashsafe_overwrite(
         final_path: &Utf8Path,
         tmp_path: &Utf8Path,
diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index 142c97d5c3..bbabfeedf6 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -2,6 +2,7 @@ from __future__ import annotations
 
 import abc
 import asyncio
+import concurrent.futures
 import filecmp
 import json
 import os
@@ -993,6 +994,11 @@ class NeonEnv:
         self.initial_timeline = config.initial_timeline
 
         attachment_service_port = self.port_distributor.get_port()
+        # Reserve the next port after attachment service for use by its postgres: this
+        # will assert out if the next port wasn't free.
+        attachment_service_pg_port = self.port_distributor.get_port()
+        assert attachment_service_pg_port == attachment_service_port + 1
+
         self.control_plane_api: str = f"http://127.0.0.1:{attachment_service_port}"
         self.attachment_service: NeonAttachmentService = NeonAttachmentService(
             self, config.auth_enabled
@@ -1071,16 +1077,27 @@ class NeonEnv:
         self.neon_cli.init(cfg, force=config.config_init_force)
 
     def start(self):
-        # Start up broker, pageserver and all safekeepers
-        self.broker.try_start()
-
+        # Attachment service starts first, so that pageserver /re-attach calls don't
+        # bounce through retries on startup
         self.attachment_service.start()
 
-        for pageserver in self.pageservers:
-            pageserver.start()
+        # Start up broker, pageserver and all safekeepers
+        futs = []
+        with concurrent.futures.ThreadPoolExecutor(
+            max_workers=2 + len(self.pageservers) + len(self.safekeepers)
+        ) as executor:
+            futs.append(
+                executor.submit(lambda: self.broker.try_start() or None)
+            )  # The `or None` is for the linter
 
-        for safekeeper in self.safekeepers:
-            safekeeper.start()
+            for pageserver in self.pageservers:
+                futs.append(executor.submit(lambda ps=pageserver: ps.start()))
+
+            for safekeeper in self.safekeepers:
+                futs.append(executor.submit(lambda sk=safekeeper: sk.start()))
+
+        for f in futs:
+            f.result()
 
     def stop(self, immediate=False, ps_assert_metric_no_errors=False):
         """
@@ -1652,8 +1669,10 @@ class NeonCli(AbstractNeonCli):
         id: int,
         overrides: Tuple[str, ...] = (),
         extra_env_vars: Optional[Dict[str, str]] = None,
+        register: bool = True,
     ) -> "subprocess.CompletedProcess[str]":
-        start_args = ["pageserver", "start", f"--id={id}", *overrides]
+        register_str = "true" if register else "false"
+        start_args = ["pageserver", "start", f"--id={id}", *overrides, f"--register={register_str}"]
         storage = self.env.pageserver_remote_storage
         append_pageserver_param_overrides(
             params_to_update=start_args,
@@ -2080,6 +2099,7 @@ class NeonPageserver(PgProtocol):
         self,
         overrides: Tuple[str, ...] = (),
         extra_env_vars: Optional[Dict[str, str]] = None,
+        register: bool = True,
     ) -> "NeonPageserver":
         """
         Start the page server.
@@ -2089,7 +2109,7 @@ class NeonPageserver(PgProtocol):
         assert self.running is False
 
         self.env.neon_cli.pageserver_start(
-            self.id, overrides=overrides, extra_env_vars=extra_env_vars
+            self.id, overrides=overrides, extra_env_vars=extra_env_vars, register=register
         )
         self.running = True
         return self
diff --git a/test_runner/regress/test_compatibility.py b/test_runner/regress/test_compatibility.py
index 1a1425f069..d5d70951be 100644
--- a/test_runner/regress/test_compatibility.py
+++ b/test_runner/regress/test_compatibility.py
@@ -138,6 +138,7 @@ def test_create_snapshot(
     for sk in env.safekeepers:
         sk.stop()
     env.pageserver.stop()
+    env.attachment_service.stop()
 
     # Directory `compatibility_snapshot_dir` is uploaded to S3 in a workflow, keep the name in sync with it
     compatibility_snapshot_dir = (
@@ -226,11 +227,17 @@ def test_forward_compatibility(
 
     try:
         neon_env_builder.num_safekeepers = 3
+        neon_local_binpath = neon_env_builder.neon_binpath
         env = neon_env_builder.from_repo_dir(
             compatibility_snapshot_dir / "repo",
             neon_binpath=compatibility_neon_bin,
             pg_distrib_dir=compatibility_postgres_distrib_dir,
         )
+
+        # Use current neon_local even though we're using old binaries for
+        # everything else: our test code is written for latest CLI args.
+        env.neon_local_binpath = neon_local_binpath
+
         neon_env_builder.start()
 
         check_neon_works(
diff --git a/test_runner/regress/test_pageserver_generations.py b/test_runner/regress/test_pageserver_generations.py
index 63f6130af5..725ed63d1c 100644
--- a/test_runner/regress/test_pageserver_generations.py
+++ b/test_runner/regress/test_pageserver_generations.py
@@ -499,7 +499,8 @@ def test_emergency_mode(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
     # and serve clients.
     env.pageserver.stop()  # Non-immediate: implicitly checking that shutdown doesn't hang waiting for CP
     env.pageserver.start(
-        overrides=("--pageserver-config-override=control_plane_emergency_mode=true",)
+        overrides=("--pageserver-config-override=control_plane_emergency_mode=true",),
+        register=False,
     )
 
     # The pageserver should provide service to clients
diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml
index b72e0f3c26..9d0f9bfcee 100644
--- a/workspace_hack/Cargo.toml
+++ b/workspace_hack/Cargo.toml
@@ -29,6 +29,7 @@ chrono = { version = "0.4", default-features = false, features = ["clock", "serd
 clap = { version = "4", features = ["derive", "string"] }
 clap_builder = { version = "4", default-features = false, features = ["color", "help", "std", "string", "suggestions", "usage"] }
 crossbeam-utils = { version = "0.8" }
+diesel = { version = "2", features = ["postgres", "serde_json"] }
 either = { version = "1" }
 fail = { version = "0.5", default-features = false, features = ["failpoints"] }
 futures-channel = { version = "0.3", features = ["sink"] }
@@ -108,8 +109,10 @@ regex-automata = { version = "0.4", default-features = false, features = ["dfa-o
 regex-syntax = { version = "0.8" }
 serde = { version = "1", features = ["alloc", "derive"] }
 syn-dff4ba8e3ae991db = { package = "syn", version = "1", features = ["extra-traits", "full", "visit"] }
-syn-f595c2ba2a3f28df = { package = "syn", version = "2", features = ["extra-traits", "full", "visit", "visit-mut"] }
+syn-f595c2ba2a3f28df = { package = "syn", version = "2", features = ["extra-traits", "fold", "full", "visit", "visit-mut"] }
 time-macros = { version = "0.2", default-features = false, features = ["formatting", "parsing", "serde"] }
+toml_datetime = { version = "0.6", default-features = false, features = ["serde"] }
+toml_edit = { version = "0.19", features = ["serde"] }
 zstd = { version = "0.13" }
 zstd-safe = { version = "7", default-features = false, features = ["arrays", "legacy", "std", "zdict_builder"] }
 zstd-sys = { version = "2", default-features = false, features = ["legacy", "std", "zdict_builder"] }

From 3a36a0a2272dac6e3e2774f6e6b2a8e326d8df6c Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Fri, 26 Jan 2024 19:23:53 +0100
Subject: [PATCH 65/70] fix(test suite): some tests leak child processes
 (#6497)

---
 control_plane/src/endpoint.rs              | 19 +++++++++++++++++--
 test_runner/regress/test_import.py         |  2 ++
 test_runner/regress/test_neon_local_cli.py |  2 ++
 3 files changed, 21 insertions(+), 2 deletions(-)

diff --git a/control_plane/src/endpoint.rs b/control_plane/src/endpoint.rs
index d3b0366d31..dcad22b992 100644
--- a/control_plane/src/endpoint.rs
+++ b/control_plane/src/endpoint.rs
@@ -438,7 +438,7 @@ impl Endpoint {
     }
 
     fn wait_for_compute_ctl_to_exit(&self, send_sigterm: bool) -> Result<()> {
-        // TODO use background_process::stop_process instead
+        // TODO use background_process::stop_process instead: https://github.com/neondatabase/neon/pull/6482
         let pidfile_path = self.endpoint_path().join("compute_ctl.pid");
         let pid: u32 = std::fs::read_to_string(pidfile_path)?.parse()?;
         let pid = nix::unistd::Pid::from_raw(pid as i32);
@@ -583,9 +583,21 @@ impl Endpoint {
         }
 
         let child = cmd.spawn()?;
+        // set up a scopeguard to kill & wait for the child in case we panic or bail below
+        let child = scopeguard::guard(child, |mut child| {
+            println!("SIGKILL & wait the started process");
+            (|| {
+                // TODO: use another signal that can be caught by the child so it can clean up any children it spawned
+                child.kill().context("SIGKILL child")?;
+                child.wait().context("wait() for child process")?;
+                anyhow::Ok(())
+            })()
+            .with_context(|| format!("scopeguard kill&wait child {child:?}"))
+            .unwrap();
+        });
 
         // Write down the pid so we can wait for it when we want to stop
-        // TODO use background_process::start_process instead
+        // TODO use background_process::start_process instead: https://github.com/neondatabase/neon/pull/6482
         let pid = child.id();
         let pidfile_path = self.endpoint_path().join("compute_ctl.pid");
         std::fs::write(pidfile_path, pid.to_string())?;
@@ -634,6 +646,9 @@ impl Endpoint {
             std::thread::sleep(ATTEMPT_INTERVAL);
         }
 
+        // disarm the scopeguard, let the child outlive this function (and neon_local invoction)
+        drop(scopeguard::ScopeGuard::into_inner(child));
+
         Ok(())
     }
 
diff --git a/test_runner/regress/test_import.py b/test_runner/regress/test_import.py
index faedf5d944..3519cbbaab 100644
--- a/test_runner/regress/test_import.py
+++ b/test_runner/regress/test_import.py
@@ -163,6 +163,8 @@ def test_import_from_vanilla(test_output_dir, pg_bin, vanilla_pg, neon_env_build
     endpoint = env.endpoints.create_start(endpoint_id, tenant_id=tenant)
     assert endpoint.safe_psql("select count(*) from t") == [(300000,)]
 
+    vanilla_pg.stop()
+
 
 def test_import_from_pageserver_small(
     pg_bin: PgBin, neon_env_builder: NeonEnvBuilder, test_output_dir: Path
diff --git a/test_runner/regress/test_neon_local_cli.py b/test_runner/regress/test_neon_local_cli.py
index 46b72fbca5..8edba49b8a 100644
--- a/test_runner/regress/test_neon_local_cli.py
+++ b/test_runner/regress/test_neon_local_cli.py
@@ -59,3 +59,5 @@ def test_neon_two_primary_endpoints_fail(
     env.neon_cli.endpoint_stop("ep1")
     # ep1 is stopped so create ep2 will succeed
     env.neon_cli.endpoint_start("ep2")
+    # cleanup
+    env.neon_cli.endpoint_stop("ep2")

From e34166a28fdd2b20b7d84c254a75c3a7819fe5b7 Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Fri, 26 Jan 2024 22:48:34 +0100
Subject: [PATCH 66/70] CI: switch back to std-fs io engine for soak time
 before next release (#6492)

PR #5824 introduced the concept of io engines in pageserver and
implemented `tokio-epoll-uring` in addition to our current method,
`std-fs`.

We used `tokio-epoll-uring` in CI for a day to get more exposure to
the code.  Now it's time to switch CI back so that we test with `std-fs`
as well, because that's what we're (still) using in production.
---
 .github/workflows/build_and_test.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml
index 7445501f00..84edc4fbc9 100644
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -471,7 +471,7 @@ jobs:
           TEST_RESULT_CONNSTR: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}
           CHECK_ONDISK_DATA_COMPATIBILITY: nonempty
           BUILD_TAG: ${{ needs.tag.outputs.build-tag }}
-          PAGESERVER_VIRTUAL_FILE_IO_ENGINE: tokio-epoll-uring
+          PAGESERVER_VIRTUAL_FILE_IO_ENGINE: std-fs
 
       - name: Merge and upload coverage data
         if: matrix.build_type == 'debug' && matrix.pg_version == 'v14'

From 734755eaca42e9a70da0764a380c0e5b2447325e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Arpad=20M=C3=BCller?= <arpad-m@users.noreply.github.com>
Date: Sat, 27 Jan 2024 05:16:11 +0100
Subject: [PATCH 67/70] Enable nextest retries for the arm build (#6496)

Also make the NEXTEST_RETRIES declaration more local.

Requested in https://github.com/neondatabase/neon/pull/6493#issuecomment-1912110202
---
 .github/workflows/build_and_test.yml    |  3 ++-
 .github/workflows/neon_extra_builds.yml | 14 ++++++++------
 2 files changed, 10 insertions(+), 7 deletions(-)

diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml
index 84edc4fbc9..12ed70c372 100644
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -21,7 +21,6 @@ env:
   COPT: '-Werror'
   AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }}
   AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }}
-  NEXTEST_RETRIES: 3
   # A concurrency group that we use for e2e-tests runs, matches `concurrency.group` above with `github.repository` as a prefix
   E2E_CONCURRENCY_GROUP: ${{ github.repository }}-${{ github.workflow }}-${{ github.ref_name }}-${{ github.ref_name == 'main' && github.sha || 'anysha' }}
 
@@ -361,6 +360,8 @@ jobs:
           ${cov_prefix} mold -run cargo build $CARGO_FLAGS $CARGO_FEATURES --bins --tests
 
       - name: Run rust tests
+        env:
+          NEXTEST_RETRIES: 3
         run: |
           for io_engine in std-fs tokio-epoll-uring ; do
             NEON_PAGESERVER_UNIT_TEST_VIRTUAL_FILE_IOENGINE=$io_engine ${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_FEATURES
diff --git a/.github/workflows/neon_extra_builds.yml b/.github/workflows/neon_extra_builds.yml
index c6c2b7386a..f8fb62d3f8 100644
--- a/.github/workflows/neon_extra_builds.yml
+++ b/.github/workflows/neon_extra_builds.yml
@@ -124,12 +124,12 @@ jobs:
       # Hence keeping target/ (and general cache size) smaller
       BUILD_TYPE: release
       CARGO_FEATURES: --features testing
-      CARGO_FLAGS: --locked --release
+      CARGO_FLAGS: --release
       AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }}
       AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }}
 
     container:
-      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
+      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned
       options: --init
 
     steps:
@@ -210,18 +210,20 @@ jobs:
 
       - name: Run cargo build
         run: |
-          mold -run cargo build $CARGO_FLAGS $CARGO_FEATURES --bins --tests
+          mold -run cargo build --locked $CARGO_FLAGS $CARGO_FEATURES --bins --tests
 
       - name: Run cargo test
+        env:
+          NEXTEST_RETRIES: 3
         run: |
-          cargo test $CARGO_FLAGS $CARGO_FEATURES
+          cargo nextest run $CARGO_FEATURES
 
           # Run separate tests for real S3
           export ENABLE_REAL_S3_REMOTE_STORAGE=nonempty
           export REMOTE_STORAGE_S3_BUCKET=neon-github-ci-tests
           export REMOTE_STORAGE_S3_REGION=eu-central-1
           # Avoid `$CARGO_FEATURES` since there's no `testing` feature in the e2e tests now
-          cargo test $CARGO_FLAGS --package remote_storage --test test_real_s3
+          cargo nextest run --package remote_storage --test test_real_s3
 
           # Run separate tests for real Azure Blob Storage
           # XXX: replace region with `eu-central-1`-like region
@@ -231,7 +233,7 @@ jobs:
           export REMOTE_STORAGE_AZURE_CONTAINER="${{ vars.REMOTE_STORAGE_AZURE_CONTAINER }}"
           export REMOTE_STORAGE_AZURE_REGION="${{ vars.REMOTE_STORAGE_AZURE_REGION }}"
           # Avoid `$CARGO_FEATURES` since there's no `testing` feature in the e2e tests now
-          cargo test $CARGO_FLAGS --package remote_storage --test test_real_azure
+          cargo nextest run --package remote_storage --test test_real_azure
 
   check-codestyle-rust-arm:
     timeout-minutes: 90

From 3a8243043234500d3f4cd64270150e3295c9d167 Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Sun, 28 Jan 2024 00:15:11 +0100
Subject: [PATCH 68/70] fixup(#6492): also switch the benchmarks that runs on
 merge-to-main back to std-fs (#6501)

---
 .github/workflows/build_and_test.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml
index 12ed70c372..147d5cae2d 100644
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -508,7 +508,7 @@ jobs:
           VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
           PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
           TEST_RESULT_CONNSTR: "${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}"
-          PAGESERVER_VIRTUAL_FILE_IO_ENGINE: tokio-epoll-uring
+          PAGESERVER_VIRTUAL_FILE_IO_ENGINE: std-fs
       # XXX: no coverage data handling here, since benchmarks are run on release builds,
       # while coverage is currently collected for the debug ones
 

From 8253cf1931a53128a9a4f5fdd71add6f90dd2a60 Mon Sep 17 00:00:00 2001
From: Anna Khanova <32508607+khanova@users.noreply.github.com>
Date: Sun, 28 Jan 2024 22:27:14 +0100
Subject: [PATCH 69/70] proxy: Relax endpoint check (#6503)

## Problem

http-over-sql allowes host to be in format api.aws.... however it's not
the case for the websocket flow.

## Summary of changes

Relax endpoint check for the ws serverless connections.
---
 proxy/src/auth/credentials.rs         | 20 ++++++++++++--------
 proxy/src/serverless.rs               |  2 ++
 proxy/src/serverless/sql_over_http.rs | 11 ++++-------
 3 files changed, 18 insertions(+), 15 deletions(-)

diff --git a/proxy/src/auth/credentials.rs b/proxy/src/auth/credentials.rs
index bdb79f2517..5bf7667a1f 100644
--- a/proxy/src/auth/credentials.rs
+++ b/proxy/src/auth/credentials.rs
@@ -2,7 +2,8 @@
 
 use crate::{
     auth::password_hack::parse_endpoint_param, context::RequestMonitoring, error::UserFacingError,
-    metrics::NUM_CONNECTION_ACCEPTED_BY_SNI, proxy::NeonOptions, EndpointId, RoleName,
+    metrics::NUM_CONNECTION_ACCEPTED_BY_SNI, proxy::NeonOptions, serverless::SERVERLESS_DRIVER_SNI,
+    EndpointId, RoleName,
 };
 use itertools::Itertools;
 use pq_proto::StartupMessageParams;
@@ -54,10 +55,10 @@ impl ComputeUserInfoMaybeEndpoint {
     }
 }
 
-pub fn endpoint_sni<'a>(
-    sni: &'a str,
+pub fn endpoint_sni(
+    sni: &str,
     common_names: &HashSet<String>,
-) -> Result<&'a str, ComputeUserInfoParseError> {
+) -> Result<Option<EndpointId>, ComputeUserInfoParseError> {
     let Some((subdomain, common_name)) = sni.split_once('.') else {
         return Err(ComputeUserInfoParseError::UnknownCommonName { cn: sni.into() });
     };
@@ -66,7 +67,10 @@ pub fn endpoint_sni<'a>(
             cn: common_name.into(),
         });
     }
-    Ok(subdomain)
+    if subdomain == SERVERLESS_DRIVER_SNI {
+        return Ok(None);
+    }
+    Ok(Some(EndpointId::from(subdomain)))
 }
 
 impl ComputeUserInfoMaybeEndpoint {
@@ -85,7 +89,6 @@ impl ComputeUserInfoMaybeEndpoint {
         // record the values if we have them
         ctx.set_application(params.get("application_name").map(SmolStr::from));
         ctx.set_user(user.clone());
-        ctx.set_endpoint_id(sni.map(EndpointId::from));
 
         // Project name might be passed via PG's command-line options.
         let endpoint_option = params
@@ -103,7 +106,7 @@ impl ComputeUserInfoMaybeEndpoint {
 
         let endpoint_from_domain = if let Some(sni_str) = sni {
             if let Some(cn) = common_names {
-                Some(EndpointId::from(endpoint_sni(sni_str, cn)?))
+                endpoint_sni(sni_str, cn)?
             } else {
                 None
             }
@@ -117,12 +120,13 @@ impl ComputeUserInfoMaybeEndpoint {
                 Some(Err(InconsistentProjectNames { domain, option }))
             }
             // Invariant: project name may not contain certain characters.
-            (a, b) => a.or(b).map(|name| match project_name_valid(&name) {
+            (a, b) => a.or(b).map(|name| match project_name_valid(name.as_ref()) {
                 false => Err(MalformedProjectName(name)),
                 true => Ok(name),
             }),
         }
         .transpose()?;
+        ctx.set_endpoint_id(endpoint.clone());
 
         info!(%user, project = endpoint.as_deref(), "credentials");
         if sni.is_some() {
diff --git a/proxy/src/serverless.rs b/proxy/src/serverless.rs
index 8af008394a..dfef4ccdfa 100644
--- a/proxy/src/serverless.rs
+++ b/proxy/src/serverless.rs
@@ -41,6 +41,8 @@ use tokio_util::sync::CancellationToken;
 use tracing::{error, info, info_span, warn, Instrument};
 use utils::http::{error::ApiError, json::json_response};
 
+pub const SERVERLESS_DRIVER_SNI: &str = "api";
+
 pub async fn task_main(
     config: &'static ProxyConfig,
     ws_listener: TcpListener,
diff --git a/proxy/src/serverless/sql_over_http.rs b/proxy/src/serverless/sql_over_http.rs
index f108ab34ab..1e2ddaa2ff 100644
--- a/proxy/src/serverless/sql_over_http.rs
+++ b/proxy/src/serverless/sql_over_http.rs
@@ -1,6 +1,7 @@
 use std::sync::Arc;
 
 use anyhow::bail;
+use anyhow::Context;
 use futures::pin_mut;
 use futures::StreamExt;
 use hyper::body::HttpBody;
@@ -35,11 +36,11 @@ use crate::config::TlsConfig;
 use crate::context::RequestMonitoring;
 use crate::metrics::NUM_CONNECTION_REQUESTS_GAUGE;
 use crate::proxy::NeonOptions;
-use crate::EndpointId;
 use crate::RoleName;
 
 use super::conn_pool::ConnInfo;
 use super::conn_pool::GlobalConnPool;
+use super::SERVERLESS_DRIVER_SNI;
 
 #[derive(serde::Deserialize)]
 struct QueryData {
@@ -61,7 +62,6 @@ enum Payload {
 
 const MAX_RESPONSE_SIZE: usize = 10 * 1024 * 1024; // 10 MiB
 const MAX_REQUEST_SIZE: u64 = 10 * 1024 * 1024; // 10 MiB
-const SERVERLESS_DRIVER_SNI_HOSTNAME_FIRST_PART: &str = "api";
 
 static RAW_TEXT_OUTPUT: HeaderName = HeaderName::from_static("neon-raw-text-output");
 static ARRAY_MODE: HeaderName = HeaderName::from_static("neon-array-mode");
@@ -188,9 +188,7 @@ fn get_conn_info(
         }
     }
 
-    let endpoint = endpoint_sni(hostname, &tls.common_names)?;
-
-    let endpoint: EndpointId = endpoint.into();
+    let endpoint = endpoint_sni(hostname, &tls.common_names)?.context("malformed endpoint")?;
     ctx.set_endpoint_id(Some(endpoint.clone()));
 
     let pairs = connection_url.query_pairs();
@@ -227,8 +225,7 @@ fn check_matches(sni_hostname: &str, hostname: &str) -> Result<bool, anyhow::Err
     let (_, hostname_rest) = hostname
         .split_once('.')
         .ok_or_else(|| anyhow::anyhow!("Unexpected hostname format."))?;
-    Ok(sni_hostname_rest == hostname_rest
-        && sni_hostname_first == SERVERLESS_DRIVER_SNI_HOSTNAME_FIRST_PART)
+    Ok(sni_hostname_rest == hostname_rest && sni_hostname_first == SERVERLESS_DRIVER_SNI)
 }
 
 // TODO: return different http error codes

From c1148dc9acf938d912888ecb0a4e76ed40e21ef8 Mon Sep 17 00:00:00 2001
From: Konstantin Knizhnik <knizhnik@garret.ru>
Date: Mon, 29 Jan 2024 07:39:16 +0200
Subject: [PATCH 70/70] Fix calculation of maximal multixact in
 ingest_multixact_create_record (#6502)

## Problem

See https://neondb.slack.com/archives/C06F5UJH601/p1706373716661439

## Summary of changes

Use None instead of 0 as initial accumulator value for calculating
maximal multixact XID.

## Checklist before requesting a review

- [ ] I have performed a self-review of my code.
- [ ] If it is a core feature, I have added thorough tests.
- [ ] Do we need to implement analytics? if so did you add the relevant
metrics to the dashboard?
- [ ] If this PR requires public announcement, mark it with
/release-notes label and add several sentences in this section.

## Checklist before merging

- [ ] Do not forget to reformat commit message to not include the above
checklist

---------

Co-authored-by: Konstantin Knizhnik <knizhnik@neon.tech>
Co-authored-by: Heikki Linnakangas <heikki@neon.tech>
---
 pageserver/src/walingest.rs          | 18 ++++++++++++------
 test_runner/regress/test_next_xid.py | 12 +++++++++++-
 2 files changed, 23 insertions(+), 7 deletions(-)

diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs
index 5a6f9a590f..93d1dcab35 100644
--- a/pageserver/src/walingest.rs
+++ b/pageserver/src/walingest.rs
@@ -1363,16 +1363,22 @@ impl WalIngest {
             self.checkpoint.nextMultiOffset = xlrec.moff + xlrec.nmembers;
             self.checkpoint_modified = true;
         }
-        let max_mbr_xid = xlrec.members.iter().fold(0u32, |acc, mbr| {
-            if mbr.xid.wrapping_sub(acc) as i32 > 0 {
-                mbr.xid
+        let max_mbr_xid = xlrec.members.iter().fold(None, |acc, mbr| {
+            if let Some(max_xid) = acc {
+                if mbr.xid.wrapping_sub(max_xid) as i32 > 0 {
+                    Some(mbr.xid)
+                } else {
+                    acc
+                }
             } else {
-                acc
+                Some(mbr.xid)
             }
         });
 
-        if self.checkpoint.update_next_xid(max_mbr_xid) {
-            self.checkpoint_modified = true;
+        if let Some(max_xid) = max_mbr_xid {
+            if self.checkpoint.update_next_xid(max_xid) {
+                self.checkpoint_modified = true;
+            }
         }
         Ok(())
     }
diff --git a/test_runner/regress/test_next_xid.py b/test_runner/regress/test_next_xid.py
index da2580dbf9..e880445c4d 100644
--- a/test_runner/regress/test_next_xid.py
+++ b/test_runner/regress/test_next_xid.py
@@ -203,6 +203,16 @@ def test_import_at_2bil(
         $$;
         """
     )
+
+    # Also create a multi-XID with members past the 2 billion mark
+    conn2 = endpoint.connect()
+    cur2 = conn2.cursor()
+    cur.execute("INSERT INTO t VALUES ('x')")
+    cur.execute("BEGIN; select * from t WHERE t = 'x' FOR SHARE;")
+    cur2.execute("BEGIN; select * from t WHERE t = 'x' FOR SHARE;")
+    cur.execute("COMMIT")
+    cur2.execute("COMMIT")
+
     # A checkpoint writes a WAL record with xl_xid=0. Many other WAL
     # records would have the same effect.
     cur.execute("checkpoint")
@@ -217,4 +227,4 @@ def test_import_at_2bil(
     conn = endpoint.connect()
     cur = conn.cursor()
     cur.execute("SELECT count(*) from t")
-    assert cur.fetchone() == (10000 + 1,)
+    assert cur.fetchone() == (10000 + 1 + 1,)