From ebddda5b7f85587998df00dbf7dc88679459b494 Mon Sep 17 00:00:00 2001 From: vladov Date: Thu, 5 Sep 2024 08:06:57 -0700 Subject: [PATCH 01/29] Fix precedence issue causing yielding loop to never yield. (#8922) There is a bug in `yielding_loop` that causes it to never yield. ## Summary of changes Fixed the bug. `i + 1 % interval == 0` will always evaluate to `i + 1 == 0` which is false ([Playground](https://play.rust-lang.org/?version=stable&mode=debug&edition=2021&gist=68e6ca393a02113cb7720115c2842e75)). This function is called in 2 places [here](https://github.com/neondatabase/neon/blob/99fa1c36004d710c65a47ffefaf66b4b5c6b4ce1/pageserver/src/tenant/secondary/scheduler.rs#L389) and [here](https://github.com/neondatabase/neon/blob/99fa1c36004d710c65a47ffefaf66b4b5c6b4ce1/pageserver/src/tenant/secondary/heatmap_uploader.rs#L152) with `interval == 1000` in both cases. This may change the performance of the system since now we are yielding to tokio. Also, this may expose undefined behavior since it is now possible for tasks to be moved between threads/whatever tokio does to tasks. However, this was the intention of the author of the code. --- libs/utils/src/yielding_loop.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libs/utils/src/yielding_loop.rs b/libs/utils/src/yielding_loop.rs index 963279eb4c..41c4cee45d 100644 --- a/libs/utils/src/yielding_loop.rs +++ b/libs/utils/src/yielding_loop.rs @@ -23,7 +23,7 @@ where for (i, item) in iter.enumerate() { visitor(item); - if i + 1 % interval == 0 { + if (i + 1) % interval == 0 { tokio::task::yield_now().await; if cancel.is_cancelled() { return Err(YieldingLoopError::Cancelled); From fd12dd942f61a0a22016fa219f4b3a87c81dc0b0 Mon Sep 17 00:00:00 2001 From: Stefan Radig Date: Thu, 5 Sep 2024 17:48:51 +0200 Subject: [PATCH 02/29] Add installation instructions for m4 on mac (#8929) ## Problem Building on MacOS failed due to missing m4. Although a window was popping up claiming to install m4, this was not helping. ## Summary of changes Add instructions to install m4 using brew and link it (thanks to Folke for helping). --- README.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/README.md b/README.md index 735edef0fc..b54956f773 100644 --- a/README.md +++ b/README.md @@ -64,6 +64,12 @@ brew install protobuf openssl flex bison icu4c pkg-config echo 'export PATH="$(brew --prefix openssl)/bin:$PATH"' >> ~/.zshrc ``` +If you get errors about missing `m4` you may have to install it manually: +``` +brew install m4 +brew link --force m4 +``` + 2. 
[Install Rust](https://www.rust-lang.org/tools/install) ``` # recommended approach from https://www.rust-lang.org/tools/install From 04f99a87bfee4da41df2bd5724e73b3646c2bf3e Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Thu, 5 Sep 2024 19:14:21 +0100 Subject: [PATCH 03/29] storcon: make pageserver AZ id mandatory (#8856) ## Problem https://github.com/neondatabase/neon/pull/8852 introduced a new nullable column for the `nodes` table: `availability_zone_id` ## Summary of changes * Make neon local and the test suite always provide an az id * Make the az id field in the ps registration request mandatory * Migrate the column to non-nullable and adjust in memory state accordingly * Remove the code that was used to populate the az id for pre-existing nodes --- Dockerfile | 1 + control_plane/storcon_cli/src/main.rs | 2 +- libs/pageserver_api/src/controller_api.rs | 2 +- pageserver/src/control_plane_client.rs | 24 +++++++--- .../down.sql | 1 + .../up.sql | 1 + storage_controller/src/node.rs | 23 +++------- storage_controller/src/persistence.rs | 28 +----------- storage_controller/src/scheduler.rs | 2 +- storage_controller/src/schema.rs | 2 +- storage_controller/src/service.rs | 44 ++----------------- test_runner/fixtures/neon_fixtures.py | 3 ++ 12 files changed, 41 insertions(+), 92 deletions(-) create mode 100644 storage_controller/migrations/2024-08-28-150530_pageserver_az_not_null/down.sql create mode 100644 storage_controller/migrations/2024-08-28-150530_pageserver_az_not_null/up.sql diff --git a/Dockerfile b/Dockerfile index d3d12330c6..1efedfa9bc 100644 --- a/Dockerfile +++ b/Dockerfile @@ -87,6 +87,7 @@ RUN mkdir -p /data/.neon/ && \ "pg_distrib_dir='/usr/local/'\n" \ "listen_pg_addr='0.0.0.0:6400'\n" \ "listen_http_addr='0.0.0.0:9898'\n" \ + "availability_zone='local'\n" \ > /data/.neon/pageserver.toml && \ chown -R neon:neon /data/.neon diff --git a/control_plane/storcon_cli/src/main.rs b/control_plane/storcon_cli/src/main.rs index 5cce6cf3ae..2a81a3d825 100644 --- a/control_plane/storcon_cli/src/main.rs +++ b/control_plane/storcon_cli/src/main.rs @@ -336,7 +336,7 @@ async fn main() -> anyhow::Result<()> { listen_pg_port, listen_http_addr, listen_http_port, - availability_zone_id: Some(availability_zone_id), + availability_zone_id, }), ) .await?; diff --git a/libs/pageserver_api/src/controller_api.rs b/libs/pageserver_api/src/controller_api.rs index 345abd69b6..6fb5a9a139 100644 --- a/libs/pageserver_api/src/controller_api.rs +++ b/libs/pageserver_api/src/controller_api.rs @@ -57,7 +57,7 @@ pub struct NodeRegisterRequest { pub listen_http_addr: String, pub listen_http_port: u16, - pub availability_zone_id: Option, + pub availability_zone_id: String, } #[derive(Serialize, Deserialize)] diff --git a/pageserver/src/control_plane_client.rs b/pageserver/src/control_plane_client.rs index 56a536c387..f6d1c35a8c 100644 --- a/pageserver/src/control_plane_client.rs +++ b/pageserver/src/control_plane_client.rs @@ -141,10 +141,24 @@ impl ControlPlaneGenerationsApi for ControlPlaneClient { m.other ); - let az_id = m - .other - .get("availability_zone_id") - .and_then(|jv| jv.as_str().map(|str| str.to_owned())); + let az_id = { + let az_id_from_metadata = m + .other + .get("availability_zone_id") + .and_then(|jv| jv.as_str().map(|str| str.to_owned())); + + match az_id_from_metadata { + Some(az_id) => Some(az_id), + None => { + tracing::warn!("metadata.json does not contain an 'availability_zone_id' field"); + conf.availability_zone.clone() + } + } + }; + + if az_id.is_none() { + panic!("Availablity 
zone id could not be inferred from metadata.json or pageserver config"); + } Some(NodeRegisterRequest { node_id: conf.id, @@ -152,7 +166,7 @@ impl ControlPlaneGenerationsApi for ControlPlaneClient { listen_pg_port: m.postgres_port, listen_http_addr: m.http_host, listen_http_port: m.http_port, - availability_zone_id: az_id, + availability_zone_id: az_id.expect("Checked above"), }) } Err(e) => { diff --git a/storage_controller/migrations/2024-08-28-150530_pageserver_az_not_null/down.sql b/storage_controller/migrations/2024-08-28-150530_pageserver_az_not_null/down.sql new file mode 100644 index 0000000000..4fcb928533 --- /dev/null +++ b/storage_controller/migrations/2024-08-28-150530_pageserver_az_not_null/down.sql @@ -0,0 +1 @@ +ALTER TABLE nodes ALTER availability_zone_id DROP NOT NULL; diff --git a/storage_controller/migrations/2024-08-28-150530_pageserver_az_not_null/up.sql b/storage_controller/migrations/2024-08-28-150530_pageserver_az_not_null/up.sql new file mode 100644 index 0000000000..c5b4534087 --- /dev/null +++ b/storage_controller/migrations/2024-08-28-150530_pageserver_az_not_null/up.sql @@ -0,0 +1 @@ +ALTER TABLE nodes ALTER availability_zone_id SET NOT NULL; diff --git a/storage_controller/src/node.rs b/storage_controller/src/node.rs index 73cecc491d..cb9ce10d23 100644 --- a/storage_controller/src/node.rs +++ b/storage_controller/src/node.rs @@ -36,7 +36,7 @@ pub(crate) struct Node { listen_pg_addr: String, listen_pg_port: u16, - availability_zone_id: Option, + availability_zone_id: String, // This cancellation token means "stop any RPCs in flight to this node, and don't start // any more". It is not related to process shutdown. @@ -63,8 +63,9 @@ impl Node { self.id } - pub(crate) fn get_availability_zone_id(&self) -> Option<&str> { - self.availability_zone_id.as_deref() + #[allow(unused)] + pub(crate) fn get_availability_zone_id(&self) -> &str { + self.availability_zone_id.as_str() } pub(crate) fn get_scheduling(&self) -> NodeSchedulingPolicy { @@ -78,22 +79,12 @@ impl Node { /// Does this registration request match `self`? This is used when deciding whether a registration /// request should be allowed to update an existing record with the same node ID. 
pub(crate) fn registration_match(&self, register_req: &NodeRegisterRequest) -> bool { - let az_ids_match = { - match ( - self.availability_zone_id.as_deref(), - register_req.availability_zone_id.as_deref(), - ) { - (Some(current_az), Some(register_req_az)) => current_az == register_req_az, - _ => true, - } - }; - - az_ids_match - && self.id == register_req.node_id + self.id == register_req.node_id && self.listen_http_addr == register_req.listen_http_addr && self.listen_http_port == register_req.listen_http_port && self.listen_pg_addr == register_req.listen_pg_addr && self.listen_pg_port == register_req.listen_pg_port + && self.availability_zone_id == register_req.availability_zone_id } /// For a shard located on this node, populate a response object @@ -190,7 +181,7 @@ impl Node { listen_http_port: u16, listen_pg_addr: String, listen_pg_port: u16, - availability_zone_id: Option, + availability_zone_id: String, ) -> Self { Self { id, diff --git a/storage_controller/src/persistence.rs b/storage_controller/src/persistence.rs index e801289752..6df05ebd13 100644 --- a/storage_controller/src/persistence.rs +++ b/storage_controller/src/persistence.rs @@ -105,7 +105,6 @@ pub(crate) enum DatabaseOperation { ListMetadataHealthOutdated, GetLeader, UpdateLeader, - SetNodeAzId, } #[must_use] @@ -325,31 +324,6 @@ impl Persistence { } } - pub(crate) async fn set_node_availability_zone_id( - &self, - input_node_id: NodeId, - input_az_id: String, - ) -> DatabaseResult<()> { - use crate::schema::nodes::dsl::*; - let updated = self - .with_measured_conn(DatabaseOperation::SetNodeAzId, move |conn| { - let updated = diesel::update(nodes) - .filter(node_id.eq(input_node_id.0 as i64)) - .set((availability_zone_id.eq(input_az_id.clone()),)) - .execute(conn)?; - Ok(updated) - }) - .await?; - - if updated != 1 { - Err(DatabaseError::Logical(format!( - "Node {node_id:?} not found for setting az id", - ))) - } else { - Ok(()) - } - } - /// At startup, load the high level state for shards, such as their config + policy. This will /// be enriched at runtime with state discovered on pageservers. pub(crate) async fn list_tenant_shards(&self) -> DatabaseResult> { @@ -1110,7 +1084,7 @@ pub(crate) struct NodePersistence { pub(crate) listen_http_port: i32, pub(crate) listen_pg_addr: String, pub(crate) listen_pg_port: i32, - pub(crate) availability_zone_id: Option, + pub(crate) availability_zone_id: String, } /// Tenant metadata health status that are stored durably. diff --git a/storage_controller/src/scheduler.rs b/storage_controller/src/scheduler.rs index ef4da6861c..deb5f27226 100644 --- a/storage_controller/src/scheduler.rs +++ b/storage_controller/src/scheduler.rs @@ -528,7 +528,7 @@ pub(crate) mod test_utils { 80 + i as u16, format!("pghost-{i}"), 5432 + i as u16, - None, + "test-az".to_string(), ); node.set_availability(NodeAvailability::Active(test_utilization::simple(0, 0))); assert!(node.is_available()); diff --git a/storage_controller/src/schema.rs b/storage_controller/src/schema.rs index e0f515daea..93ab774b5f 100644 --- a/storage_controller/src/schema.rs +++ b/storage_controller/src/schema.rs @@ -25,7 +25,7 @@ diesel::table! 
{ listen_http_port -> Int4, listen_pg_addr -> Varchar, listen_pg_port -> Int4, - availability_zone_id -> Nullable, + availability_zone_id -> Varchar, } } diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index ca416095bb..2911cd5ac4 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -1264,7 +1264,7 @@ impl Service { 123, "".to_string(), 123, - None, + "test_az".to_string(), ); scheduler.node_upsert(&node); @@ -4825,15 +4825,8 @@ impl Service { ) .await; - if register_req.availability_zone_id.is_none() { - tracing::warn!( - "Node {} registering without specific availability zone id", - register_req.node_id - ); - } - enum RegistrationStatus { - Matched(Node), + Matched, Mismatched, New, } @@ -4842,7 +4835,7 @@ impl Service { let locked = self.inner.read().unwrap(); if let Some(node) = locked.nodes.get(®ister_req.node_id) { if node.registration_match(®ister_req) { - RegistrationStatus::Matched(node.clone()) + RegistrationStatus::Matched } else { RegistrationStatus::Mismatched } @@ -4852,41 +4845,12 @@ impl Service { }; match registration_status { - RegistrationStatus::Matched(node) => { + RegistrationStatus::Matched => { tracing::info!( "Node {} re-registered with matching address", register_req.node_id ); - if node.get_availability_zone_id().is_none() { - if let Some(az_id) = register_req.availability_zone_id.clone() { - tracing::info!("Extracting availability zone id from registration request for node {}: {}", - register_req.node_id, az_id); - - // Persist to the database and update in memory state. See comment below - // on ordering. - self.persistence - .set_node_availability_zone_id(register_req.node_id, az_id) - .await?; - let node_with_az = Node::new( - register_req.node_id, - register_req.listen_http_addr, - register_req.listen_http_port, - register_req.listen_pg_addr, - register_req.listen_pg_port, - register_req.availability_zone_id, - ); - - let mut locked = self.inner.write().unwrap(); - let mut new_nodes = (*locked.nodes).clone(); - - locked.scheduler.node_upsert(&node_with_az); - new_nodes.insert(register_req.node_id, node_with_az); - - locked.nodes = Arc::new(new_nodes); - } - } - return Ok(()); } RegistrationStatus::Mismatched => { diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 2df45a7e0e..0c692ceb69 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -758,6 +758,9 @@ class NeonEnvBuilder: patch_script = "" for ps in self.env.pageservers: patch_script += f"UPDATE nodes SET listen_http_port={ps.service_port.http}, listen_pg_port={ps.service_port.pg} WHERE node_id = '{ps.id}';" + # This is a temporary to get the backward compat test happy + # since the compat snapshot was generated with an older version of neon local + patch_script += f"UPDATE nodes SET availability_zone_id='{ps.az_id}' WHERE node_id = '{ps.id}' AND availability_zone_id IS NULL;" patch_script_path.write_text(patch_script) # Update the config with info about tenants and timelines From cf11c8ab6aa234b59354425116da98d58fa1826d Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Fri, 6 Sep 2024 10:52:29 +0200 Subject: [PATCH 04/29] update svg_fmt to 0.4.3 (#8930) Audited ``` diff -r -u ~/.cargo/registry/src/index.crates.io-6f17d22bba15001f/svg_fmt-0.4.{2,3} ``` fixes https://github.com/neondatabase/neon/issues/7763 --- Cargo.lock | 5 +++-- Cargo.toml | 3 +-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/Cargo.lock b/Cargo.lock 
index 91917d5351..3f2787f15b 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -6094,8 +6094,9 @@ checksum = "81cdd64d312baedb58e21336b31bc043b77e01cc99033ce76ef539f78e965ebc" [[package]] name = "svg_fmt" -version = "0.4.2" -source = "git+https://github.com/nical/rust_debug?rev=28a7d96eecff2f28e75b1ea09f2d499a60d0e3b4#28a7d96eecff2f28e75b1ea09f2d499a60d0e3b4" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "20e16a0f46cf5fd675563ef54f26e83e20f2366bcf027bcb3cc3ed2b98aaf2ca" [[package]] name = "syn" diff --git a/Cargo.toml b/Cargo.toml index 4fea3e8d80..2415337110 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -161,8 +161,7 @@ socket2 = "0.5" strum = "0.24" strum_macros = "0.24" "subtle" = "2.5.0" -# Our PR https://github.com/nical/rust_debug/pull/4 has been merged but no new version released yet -svg_fmt = { git = "https://github.com/nical/rust_debug", rev = "28a7d96eecff2f28e75b1ea09f2d499a60d0e3b4" } +svg_fmt = "0.4.3" sync_wrapper = "0.1.2" tar = "0.4" task-local-extensions = "0.1.4" From 06e840b884c242550e2a5ad0e72bfa762bce1709 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Fri, 6 Sep 2024 10:58:48 +0200 Subject: [PATCH 05/29] compact_level0_phase1: ignore access mode config, always do streaming-kmerge without validation (#8934) refs https://github.com/neondatabase/neon/issues/8184 PR https://github.com/neondatabase/infra/pull/1905 enabled streaming-kmerge without validation everywhere. It rolls into prod sooner or in the same release as this PR. --- libs/pageserver_api/src/config.rs | 43 +----- pageserver/src/bin/pageserver.rs | 1 - pageserver/src/config.rs | 26 ++-- pageserver/src/tenant/timeline/compaction.rs | 139 +------------------ 4 files changed, 22 insertions(+), 187 deletions(-) diff --git a/libs/pageserver_api/src/config.rs b/libs/pageserver_api/src/config.rs index b2662c562a..1194ee93ef 100644 --- a/libs/pageserver_api/src/config.rs +++ b/libs/pageserver_api/src/config.rs @@ -104,7 +104,9 @@ pub struct ConfigToml { pub image_compression: ImageCompressionAlgorithm, pub ephemeral_bytes_per_memory_kb: usize, pub l0_flush: Option, - pub compact_level0_phase1_value_access: CompactL0Phase1ValueAccess, + #[serde(skip_serializing)] + // TODO(https://github.com/neondatabase/neon/issues/8184): remove after this field is removed from all pageserver.toml's + pub compact_level0_phase1_value_access: serde::de::IgnoredAny, pub virtual_file_direct_io: crate::models::virtual_file::DirectIoMode, pub io_buffer_alignment: usize, } @@ -209,43 +211,6 @@ pub enum GetImpl { #[serde(transparent)] pub struct MaxVectoredReadBytes(pub NonZeroUsize); -#[derive(Debug, PartialEq, Eq, Clone, serde::Deserialize, serde::Serialize)] -#[serde(tag = "mode", rename_all = "kebab-case", deny_unknown_fields)] -pub enum CompactL0Phase1ValueAccess { - /// The old way. - PageCachedBlobIo, - /// The new way. - StreamingKmerge { - /// If set, we run both the old way and the new way, validate that - /// they are identical (=> [`CompactL0BypassPageCacheValidation`]), - /// and if the validation fails, - /// - in tests: fail them with a panic or - /// - in prod, log a rate-limited warning and use the old way's results. - /// - /// If not set, we only run the new way and trust its results. - validate: Option, - }, -} - -/// See [`CompactL0Phase1ValueAccess::StreamingKmerge`]. 
-#[derive(Debug, PartialEq, Eq, Clone, serde::Deserialize, serde::Serialize)] -#[serde(rename_all = "kebab-case")] -pub enum CompactL0BypassPageCacheValidation { - /// Validate that the series of (key, lsn) pairs are the same. - KeyLsn, - /// Validate that the entire output of old and new way is identical. - KeyLsnValue, -} - -impl Default for CompactL0Phase1ValueAccess { - fn default() -> Self { - CompactL0Phase1ValueAccess::StreamingKmerge { - // TODO(https://github.com/neondatabase/neon/issues/8184): change to None once confident - validate: Some(CompactL0BypassPageCacheValidation::KeyLsnValue), - } - } -} - /// A tenant's calcuated configuration, which is the result of merging a /// tenant's TenantConfOpt with the global TenantConf from PageServerConf. /// @@ -452,7 +417,7 @@ impl Default for ConfigToml { image_compression: (DEFAULT_IMAGE_COMPRESSION), ephemeral_bytes_per_memory_kb: (DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB), l0_flush: None, - compact_level0_phase1_value_access: CompactL0Phase1ValueAccess::default(), + compact_level0_phase1_value_access: Default::default(), virtual_file_direct_io: crate::models::virtual_file::DirectIoMode::default(), io_buffer_alignment: DEFAULT_IO_BUFFER_ALIGNMENT, diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index 2c60e8d7d1..59194ab4bd 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -125,7 +125,6 @@ fn main() -> anyhow::Result<()> { // after setting up logging, log the effective IO engine choice and read path implementations info!(?conf.virtual_file_io_engine, "starting with virtual_file IO engine"); info!(?conf.virtual_file_direct_io, "starting with virtual_file Direct IO settings"); - info!(?conf.compact_level0_phase1_value_access, "starting with setting for compact_level0_phase1_value_access"); info!(?conf.io_buffer_alignment, "starting with setting for IO buffer alignment"); // The tenants directory contains all the pageserver local disk state. diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index c159b66905..4e68e276d3 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -174,10 +174,6 @@ pub struct PageServerConf { pub l0_flush: crate::l0_flush::L0FlushConfig, - /// This flag is temporary and will be removed after gradual rollout. - /// See . - pub compact_level0_phase1_value_access: pageserver_api::config::CompactL0Phase1ValueAccess, - /// Direct IO settings pub virtual_file_direct_io: virtual_file::DirectIoMode, @@ -338,7 +334,7 @@ impl PageServerConf { max_vectored_read_bytes, image_compression, ephemeral_bytes_per_memory_kb, - compact_level0_phase1_value_access, + compact_level0_phase1_value_access: _, l0_flush, virtual_file_direct_io, concurrent_tenant_warmup, @@ -383,7 +379,6 @@ impl PageServerConf { max_vectored_read_bytes, image_compression, ephemeral_bytes_per_memory_kb, - compact_level0_phase1_value_access, virtual_file_direct_io, io_buffer_alignment, @@ -561,6 +556,16 @@ mod tests { .expect("parse_and_validate"); } + #[test] + fn test_compactl0_phase1_access_mode_is_ignored_silently() { + let input = indoc::indoc! {r#" + [compact_level0_phase1_value_access] + mode = "streaming-kmerge" + validate = "key-lsn-value" + "#}; + toml_edit::de::from_str::(input).unwrap(); + } + /// If there's a typo in the pageserver config, we'd rather catch that typo /// and fail pageserver startup than silently ignoring the typo, leaving whoever /// made it in the believe that their config change is effective. 
@@ -637,14 +642,5 @@ mod tests { // some_invalid_field = 23 // "#} // ); - - test!( - compact_level0_phase1_value_access, - indoc! {r#" - [compact_level0_phase1_value_access] - mode = "streaming-kmerge" - some_invalid_field = 23 - "#} - ); } } diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index 6b9c8386f7..a87b502cd6 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -19,7 +19,6 @@ use bytes::Bytes; use enumset::EnumSet; use fail::fail_point; use itertools::Itertools; -use pageserver_api::config::{CompactL0BypassPageCacheValidation, CompactL0Phase1ValueAccess}; use pageserver_api::key::KEY_SIZE; use pageserver_api::keyspace::ShardedRange; use pageserver_api::shard::{ShardCount, ShardIdentity, TenantShardId}; @@ -912,137 +911,13 @@ impl Timeline { // we're compacting, in key, LSN order. // If there's both a Value::Image and Value::WalRecord for the same (key,lsn), // then the Value::Image is ordered before Value::WalRecord. - // - // TODO(https://github.com/neondatabase/neon/issues/8184): remove the page cached blob_io - // option and validation code once we've reached confidence. - enum AllValuesIter<'a> { - PageCachedBlobIo { - all_keys_iter: VecIter<'a>, - }, - StreamingKmergeBypassingPageCache { - merge_iter: MergeIterator<'a>, - }, - ValidatingStreamingKmergeBypassingPageCache { - mode: CompactL0BypassPageCacheValidation, - merge_iter: MergeIterator<'a>, - all_keys_iter: VecIter<'a>, - }, - } - type VecIter<'a> = std::slice::Iter<'a, DeltaEntry<'a>>; // TODO: distinguished lifetimes - impl AllValuesIter<'_> { - async fn next_all_keys_iter( - iter: &mut VecIter<'_>, - ctx: &RequestContext, - ) -> anyhow::Result> { - let Some(DeltaEntry { - key, - lsn, - val: value_ref, - .. - }) = iter.next() - else { - return Ok(None); - }; - let value = value_ref.load(ctx).await?; - Ok(Some((*key, *lsn, value))) - } - async fn next( - &mut self, - ctx: &RequestContext, - ) -> anyhow::Result> { - match self { - AllValuesIter::PageCachedBlobIo { all_keys_iter: iter } => { - Self::next_all_keys_iter(iter, ctx).await - } - AllValuesIter::StreamingKmergeBypassingPageCache { merge_iter } => merge_iter.next().await, - AllValuesIter::ValidatingStreamingKmergeBypassingPageCache { mode, merge_iter, all_keys_iter } => async { - // advance both iterators - let all_keys_iter_item = Self::next_all_keys_iter(all_keys_iter, ctx).await; - let merge_iter_item = merge_iter.next().await; - // compare results & log warnings as needed - macro_rules! 
rate_limited_warn { - ($($arg:tt)*) => {{ - if cfg!(debug_assertions) || cfg!(feature = "testing") { - warn!($($arg)*); - panic!("CompactL0BypassPageCacheValidation failure, check logs"); - } - use once_cell::sync::Lazy; - use utils::rate_limit::RateLimit; - use std::sync::Mutex; - use std::time::Duration; - static LOGGED: Lazy> = - Lazy::new(|| Mutex::new(RateLimit::new(Duration::from_secs(10)))); - let mut rate_limit = LOGGED.lock().unwrap(); - rate_limit.call(|| { - warn!($($arg)*); - }); - }} - } - match (&all_keys_iter_item, &merge_iter_item) { - (Err(_), Err(_)) => { - // don't bother asserting equivality of the errors - } - (Err(all_keys), Ok(merge)) => { - rate_limited_warn!(?merge, "all_keys_iter returned an error where merge did not: {all_keys:?}"); - }, - (Ok(all_keys), Err(merge)) => { - rate_limited_warn!(?all_keys, "merge returned an error where all_keys_iter did not: {merge:?}"); - }, - (Ok(None), Ok(None)) => { } - (Ok(Some(all_keys)), Ok(None)) => { - rate_limited_warn!(?all_keys, "merge returned None where all_keys_iter returned Some"); - } - (Ok(None), Ok(Some(merge))) => { - rate_limited_warn!(?merge, "all_keys_iter returned None where merge returned Some"); - } - (Ok(Some((all_keys_key, all_keys_lsn, all_keys_value))), Ok(Some((merge_key, merge_lsn, merge_value)))) => { - match mode { - // TODO: in this mode, we still load the value from disk for both iterators, even though we only need the all_keys_iter one - CompactL0BypassPageCacheValidation::KeyLsn => { - let all_keys = (all_keys_key, all_keys_lsn); - let merge = (merge_key, merge_lsn); - if all_keys != merge { - rate_limited_warn!(?all_keys, ?merge, "merge returned a different (Key,LSN) than all_keys_iter"); - } - } - CompactL0BypassPageCacheValidation::KeyLsnValue => { - let all_keys = (all_keys_key, all_keys_lsn, all_keys_value); - let merge = (merge_key, merge_lsn, merge_value); - if all_keys != merge { - rate_limited_warn!(?all_keys, ?merge, "merge returned a different (Key,LSN,Value) than all_keys_iter"); - } - } - } - } - } - // in case of mismatch, trust the legacy all_keys_iter_item - all_keys_iter_item - }.instrument(info_span!("next")).await - } - } - } - let mut all_values_iter = match &self.conf.compact_level0_phase1_value_access { - CompactL0Phase1ValueAccess::PageCachedBlobIo => AllValuesIter::PageCachedBlobIo { - all_keys_iter: all_keys.iter(), - }, - CompactL0Phase1ValueAccess::StreamingKmerge { validate } => { - let merge_iter = { - let mut deltas = Vec::with_capacity(deltas_to_compact.len()); - for l in deltas_to_compact.iter() { - let l = l.get_as_delta(ctx).await.map_err(CompactionError::Other)?; - deltas.push(l); - } - MergeIterator::create(&deltas, &[], ctx) - }; - match validate { - None => AllValuesIter::StreamingKmergeBypassingPageCache { merge_iter }, - Some(validate) => AllValuesIter::ValidatingStreamingKmergeBypassingPageCache { - mode: validate.clone(), - merge_iter, - all_keys_iter: all_keys.iter(), - }, - } + let mut all_values_iter = { + let mut deltas = Vec::with_capacity(deltas_to_compact.len()); + for l in deltas_to_compact.iter() { + let l = l.get_as_delta(ctx).await.map_err(CompactionError::Other)?; + deltas.push(l); } + MergeIterator::create(&deltas, &[], ctx) }; // This iterator walks through all keys and is needed to calculate size used by each key @@ -1119,7 +994,7 @@ impl Timeline { let mut keys = 0; while let Some((key, lsn, value)) = all_values_iter - .next(ctx) + .next() .await .map_err(CompactionError::Other)? 
{ From a1323231bc65539f55eb1bfd341fb65d06d0ed22 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Fri, 6 Sep 2024 12:40:19 +0200 Subject: [PATCH 06/29] Update Rust to 1.81.0 (#8939) We keep the practice of keeping the compiler up to date, pointing to the latest release. This is done by many other projects in the Rust ecosystem as well. [Release notes](https://github.com/rust-lang/rust/blob/master/RELEASES.md#version-1810-2024-09-05). Prior update was in #8667 and #8518 --- Cargo.lock | 30 +++++-------------- Cargo.toml | 2 +- Dockerfile.build-tools | 2 +- libs/postgres_ffi/build.rs | 2 +- libs/walproposer/build.rs | 21 ++++++++++--- libs/walproposer/src/api_bindings.rs | 10 +++---- .../tenant/remote_timeline_client/download.rs | 3 +- proxy/src/console/provider/neon.rs | 5 +--- rust-toolchain.toml | 2 +- safekeeper/src/send_wal.rs | 5 ++-- workspace_hack/Cargo.toml | 6 ++-- 11 files changed, 42 insertions(+), 46 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 3f2787f15b..634af67198 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -915,25 +915,22 @@ dependencies = [ [[package]] name = "bindgen" -version = "0.65.1" +version = "0.70.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cfdf7b466f9a4903edc73f95d6d2bcd5baf8ae620638762244d3f60143643cc5" +checksum = "f49d8fed880d473ea71efb9bf597651e77201bdd4893efe54c9e5d65ae04ce6f" dependencies = [ - "bitflags 1.3.2", + "bitflags 2.4.1", "cexpr", "clang-sys", - "lazy_static", - "lazycell", + "itertools 0.12.1", "log", - "peeking_take_while", - "prettyplease 0.2.6", + "prettyplease 0.2.17", "proc-macro2", "quote", "regex", "rustc-hash", "shlex", "syn 2.0.52", - "which", ] [[package]] @@ -2949,12 +2946,6 @@ dependencies = [ "spin 0.5.2", ] -[[package]] -name = "lazycell" -version = "1.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "830d08ce1d1d941e6b30645f1a0eb5643013d835ce3779a5fc208261dbe10f55" - [[package]] name = "libc" version = "0.2.150" @@ -3977,12 +3968,6 @@ dependencies = [ "sha2", ] -[[package]] -name = "peeking_take_while" -version = "0.1.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "19b17cddbe7ec3f8bc800887bab5e717348c95ea2ca0b1bf0837fb964dc67099" - [[package]] name = "pem" version = "3.0.3" @@ -4280,9 +4265,9 @@ dependencies = [ [[package]] name = "prettyplease" -version = "0.2.6" +version = "0.2.17" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3b69d39aab54d069e7f2fe8cb970493e7834601ca2d8c65fd7bbd183578080d1" +checksum = "8d3928fb5db768cb86f891ff014f0144589297e3c6a1aba6ed7cecfdace270c7" dependencies = [ "proc-macro2", "syn 2.0.52", @@ -7628,6 +7613,7 @@ dependencies = [ "hyper 0.14.26", "indexmap 1.9.3", "itertools 0.10.5", + "itertools 0.12.1", "lazy_static", "libc", "log", diff --git a/Cargo.toml b/Cargo.toml index 2415337110..5045ee0d4d 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -64,7 +64,7 @@ aws-types = "1.2.0" axum = { version = "0.6.20", features = ["ws"] } base64 = "0.13.0" bincode = "1.3" -bindgen = "0.65" +bindgen = "0.70" bit_field = "0.10.2" bstr = "1.0" byteorder = "1.4" diff --git a/Dockerfile.build-tools b/Dockerfile.build-tools index d6beb61369..a9cbed85fb 100644 --- a/Dockerfile.build-tools +++ b/Dockerfile.build-tools @@ -192,7 +192,7 @@ WORKDIR /home/nonroot # Rust # Please keep the version of llvm (installed above) in sync with rust llvm (`rustc --version --verbose | grep LLVM`) -ENV RUSTC_VERSION=1.80.1 +ENV RUSTC_VERSION=1.81.0 ENV 
RUSTUP_HOME="/home/nonroot/.rustup" ENV PATH="/home/nonroot/.cargo/bin:${PATH}" ARG RUSTFILT_VERSION=0.2.1 diff --git a/libs/postgres_ffi/build.rs b/libs/postgres_ffi/build.rs index 370d9e9a6f..d3e3ce648f 100644 --- a/libs/postgres_ffi/build.rs +++ b/libs/postgres_ffi/build.rs @@ -14,7 +14,7 @@ impl ParseCallbacks for PostgresFfiCallbacks { fn include_file(&self, filename: &str) { // This does the equivalent of passing bindgen::CargoCallbacks // to the builder .parse_callbacks() method. - let cargo_callbacks = bindgen::CargoCallbacks; + let cargo_callbacks = bindgen::CargoCallbacks::new(); cargo_callbacks.include_file(filename) } diff --git a/libs/walproposer/build.rs b/libs/walproposer/build.rs index 7bb077062b..28547f52bf 100644 --- a/libs/walproposer/build.rs +++ b/libs/walproposer/build.rs @@ -4,7 +4,6 @@ use std::{env, path::PathBuf, process::Command}; use anyhow::{anyhow, Context}; -use bindgen::CargoCallbacks; fn main() -> anyhow::Result<()> { // Tell cargo to invalidate the built crate whenever the wrapper changes @@ -64,16 +63,25 @@ fn main() -> anyhow::Result<()> { .map_err(|s| anyhow!("Bad postgres server path {s:?}"))? }; + let unwind_abi_functions = [ + "log_internal", + "recovery_download", + "start_streaming", + "finish_sync_safekeepers", + "wait_event_set", + "WalProposerStart", + ]; + // The bindgen::Builder is the main entry point // to bindgen, and lets you build up options for // the resulting bindings. - let bindings = bindgen::Builder::default() + let mut builder = bindgen::Builder::default() // The input header we would like to generate // bindings for. .header("bindgen_deps.h") // Tell cargo to invalidate the built crate whenever any of the // included header files changed. - .parse_callbacks(Box::new(CargoCallbacks)) + .parse_callbacks(Box::new(bindgen::CargoCallbacks::new())) .allowlist_type("WalProposer") .allowlist_type("WalProposerConfig") .allowlist_type("walproposer_api") @@ -105,7 +113,12 @@ fn main() -> anyhow::Result<()> { .allowlist_var("WL_SOCKET_MASK") .clang_arg("-DWALPROPOSER_LIB") .clang_arg(format!("-I{pgxn_neon}")) - .clang_arg(format!("-I{inc_server_path}")) + .clang_arg(format!("-I{inc_server_path}")); + + for name in unwind_abi_functions { + builder = builder.override_abi(bindgen::Abi::CUnwind, name); + } + let bindings = builder // Finish the builder and generate the bindings. .generate() // Unwrap the Result and panic on failure. 
diff --git a/libs/walproposer/src/api_bindings.rs b/libs/walproposer/src/api_bindings.rs index bbc3663402..2fbea3fe45 100644 --- a/libs/walproposer/src/api_bindings.rs +++ b/libs/walproposer/src/api_bindings.rs @@ -33,7 +33,7 @@ extern "C" fn get_shmem_state(wp: *mut WalProposer) -> *mut WalproposerShmemStat } } -extern "C" fn start_streaming(wp: *mut WalProposer, startpos: XLogRecPtr) { +extern "C-unwind" fn start_streaming(wp: *mut WalProposer, startpos: XLogRecPtr) { unsafe { let callback_data = (*(*wp).config).callback_data; let api = callback_data as *mut Box; @@ -187,7 +187,7 @@ extern "C" fn conn_blocking_write( } } -extern "C" fn recovery_download(wp: *mut WalProposer, sk: *mut Safekeeper) -> bool { +extern "C-unwind" fn recovery_download(wp: *mut WalProposer, sk: *mut Safekeeper) -> bool { unsafe { let callback_data = (*(*(*sk).wp).config).callback_data; let api = callback_data as *mut Box; @@ -272,7 +272,7 @@ extern "C" fn rm_safekeeper_event_set(sk: *mut Safekeeper) { } } -extern "C" fn wait_event_set( +extern "C-unwind" fn wait_event_set( wp: *mut WalProposer, timeout: ::std::os::raw::c_long, event_sk: *mut *mut Safekeeper, @@ -324,7 +324,7 @@ extern "C" fn get_redo_start_lsn(wp: *mut WalProposer) -> XLogRecPtr { } } -extern "C" fn finish_sync_safekeepers(wp: *mut WalProposer, lsn: XLogRecPtr) { +extern "C-unwind" fn finish_sync_safekeepers(wp: *mut WalProposer, lsn: XLogRecPtr) { unsafe { let callback_data = (*(*wp).config).callback_data; let api = callback_data as *mut Box; @@ -340,7 +340,7 @@ extern "C" fn process_safekeeper_feedback(wp: *mut WalProposer, sk: *mut Safekee } } -extern "C" fn log_internal( +extern "C-unwind" fn log_internal( wp: *mut WalProposer, level: ::std::os::raw::c_int, line: *const ::std::os::raw::c_char, diff --git a/pageserver/src/tenant/remote_timeline_client/download.rs b/pageserver/src/tenant/remote_timeline_client/download.rs index d9725ad756..9fbe2f0da5 100644 --- a/pageserver/src/tenant/remote_timeline_client/download.rs +++ b/pageserver/src/tenant/remote_timeline_client/download.rs @@ -548,7 +548,7 @@ pub(crate) async fn download_initdb_tar_zst( cancel, ) .await - .map_err(|e| { + .inspect_err(|_e| { // Do a best-effort attempt at deleting the temporary file upon encountering an error. // We don't have async here nor do we want to pile on any extra errors. if let Err(e) = std::fs::remove_file(&temp_path) { @@ -556,7 +556,6 @@ pub(crate) async fn download_initdb_tar_zst( warn!("error deleting temporary file {temp_path}: {e}"); } } - e })?; Ok((temp_path, file)) diff --git a/proxy/src/console/provider/neon.rs b/proxy/src/console/provider/neon.rs index 33eda72e65..b004bf4ecf 100644 --- a/proxy/src/console/provider/neon.rs +++ b/proxy/src/console/provider/neon.rs @@ -38,10 +38,7 @@ impl Api { locks: &'static ApiLocks, wake_compute_endpoint_rate_limiter: Arc, ) -> Self { - let jwt = match std::env::var("NEON_PROXY_TO_CONTROLPLANE_TOKEN") { - Ok(v) => v, - Err(_) => String::new(), - }; + let jwt = std::env::var("NEON_PROXY_TO_CONTROLPLANE_TOKEN").unwrap_or_default(); Self { endpoint, caches, diff --git a/rust-toolchain.toml b/rust-toolchain.toml index 368b8d300a..e78c4d6790 100644 --- a/rust-toolchain.toml +++ b/rust-toolchain.toml @@ -1,5 +1,5 @@ [toolchain] -channel = "1.80.1" +channel = "1.81.0" profile = "default" # The default profile includes rustc, rust-std, cargo, rust-docs, rustfmt and clippy. 
# https://rust-lang.github.io/rustup/concepts/profiles.html diff --git a/safekeeper/src/send_wal.rs b/safekeeper/src/send_wal.rs index 90b1604adb..6d677f405a 100644 --- a/safekeeper/src/send_wal.rs +++ b/safekeeper/src/send_wal.rs @@ -758,9 +758,8 @@ impl ReplyReader { // pq_sendint32(&reply_message, xmin); // pq_sendint32(&reply_message, xmin_epoch); // So it is two big endian 32-bit words in low endian order! - hs_feedback.xmin = (hs_feedback.xmin >> 32) | (hs_feedback.xmin << 32); - hs_feedback.catalog_xmin = - (hs_feedback.catalog_xmin >> 32) | (hs_feedback.catalog_xmin << 32); + hs_feedback.xmin = hs_feedback.xmin.rotate_left(32); + hs_feedback.catalog_xmin = hs_feedback.catalog_xmin.rotate_left(32); self.ws_guard .walsenders .record_hs_feedback(self.ws_guard.id, &hs_feedback); diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml index 20693ad63d..3d2fa8c214 100644 --- a/workspace_hack/Cargo.toml +++ b/workspace_hack/Cargo.toml @@ -47,7 +47,8 @@ hex = { version = "0.4", features = ["serde"] } hmac = { version = "0.12", default-features = false, features = ["reset"] } hyper = { version = "0.14", features = ["full"] } indexmap = { version = "1", default-features = false, features = ["std"] } -itertools = { version = "0.10" } +itertools-5ef9efb8ec2df382 = { package = "itertools", version = "0.12", default-features = false, features = ["use_std"] } +itertools-93f6ce9d446188ac = { package = "itertools", version = "0.10" } lazy_static = { version = "1", default-features = false, features = ["spin_no_std"] } libc = { version = "0.2", features = ["extra_traits", "use_std"] } log = { version = "0.4", default-features = false, features = ["std"] } @@ -101,7 +102,8 @@ either = { version = "1" } getrandom = { version = "0.2", default-features = false, features = ["std"] } hashbrown = { version = "0.14", features = ["raw"] } indexmap = { version = "1", default-features = false, features = ["std"] } -itertools = { version = "0.10" } +itertools-5ef9efb8ec2df382 = { package = "itertools", version = "0.12", default-features = false, features = ["use_std"] } +itertools-93f6ce9d446188ac = { package = "itertools", version = "0.10" } lazy_static = { version = "1", default-features = false, features = ["spin_no_std"] } libc = { version = "0.2", features = ["extra_traits", "use_std"] } log = { version = "0.4", default-features = false, features = ["std"] } From e86fef05ddbc276170ec29d035d86d03e3ad4ec2 Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Fri, 6 Sep 2024 13:11:17 +0100 Subject: [PATCH 07/29] storcon: track preferred AZ for each tenant shard (#8937) ## Problem We want to do AZ aware scheduling, but don't have enough metadata. ## Summary of changes Introduce a `preferred_az_id` concept for each managed tenant shard. In a future PR, the scheduler will use this as a soft preference. The idea is to try and keep the shard attachments within the same AZ. Under the assumption that the compute was placed in the correct AZ, this reduces the chances of cross AZ trafic from between compute and PS. In terms of code changes we: 1. Add a new nullable `preferred_az_id` column to the `tenant_shards` table. Also include an in-memory counterpart. 2. Populate the preferred az on tenant creation and shard splits. 3. Add an endpoint which allows to bulk-set preferred AZs. (3) gives us the migration path. I'll write a script which queries the cplane db in the region and sets the preferred az of all shards with an active compute to the AZ of said compute. 
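For illustration, the bulk-set call in (3) boils down to a single admin-scoped `PUT /control/v1/preferred_azs` request whose JSON body is a flat map of tenant shard id to AZ id, and whose response lists the shards that were actually updated. A minimal sketch of such a migration script follows; the controller URL, token, shard ids and AZ names are placeholders, not values from this change:

```python
# Sketch only: controller URL, token, shard ids and AZ names are placeholders.
import requests

STORCON_URL = "http://127.0.0.1:1234"  # placeholder storage controller address
ADMIN_TOKEN = "<admin-scope jwt>"      # placeholder admin token

# tenant shard id -> preferred AZ, e.g. derived from the cplane database
preferred_azs = {
    "1f359dd625e519a1a4e8d7509690f6fc-0002": "us-east-2b",
    "1f359dd625e519a1a4e8d7509690f6fc-0102": "us-east-2b",
}

resp = requests.put(
    f"{STORCON_URL}/control/v1/preferred_azs",
    headers={"Authorization": f"Bearer {ADMIN_TOKEN}"},
    json=preferred_azs,
)
resp.raise_for_status()
# The controller replies with the shard ids it updated both in memory and in the database.
print("updated:", resp.json()["updated"])
```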
For shards without an active compute, I'll use the AZ of the currently attached pageserver since this is what cplane uses now to schedule computes. --- libs/pageserver_api/src/controller_api.rs | 15 +- .../down.sql | 1 + .../up.sql | 1 + storage_controller/src/http.rs | 21 +- storage_controller/src/persistence.rs | 33 ++ storage_controller/src/schema.rs | 1 + storage_controller/src/service.rs | 327 +++++++++++++----- storage_controller/src/tenant_shard.rs | 15 + test_runner/fixtures/neon_fixtures.py | 13 +- .../regress/test_storage_controller.py | 52 +++ 10 files changed, 384 insertions(+), 95 deletions(-) create mode 100644 storage_controller/migrations/2024-09-05-104500_tenant_shard_preferred_az/down.sql create mode 100644 storage_controller/migrations/2024-09-05-104500_tenant_shard_preferred_az/up.sql diff --git a/libs/pageserver_api/src/controller_api.rs b/libs/pageserver_api/src/controller_api.rs index 6fb5a9a139..94104af002 100644 --- a/libs/pageserver_api/src/controller_api.rs +++ b/libs/pageserver_api/src/controller_api.rs @@ -1,4 +1,4 @@ -use std::collections::HashSet; +use std::collections::{HashMap, HashSet}; use std::str::FromStr; use std::time::{Duration, Instant}; @@ -74,6 +74,17 @@ pub struct TenantPolicyRequest { pub scheduling: Option, } +#[derive(Serialize, Deserialize)] +pub struct ShardsPreferredAzsRequest { + #[serde(flatten)] + pub preferred_az_ids: HashMap, +} + +#[derive(Serialize, Deserialize)] +pub struct ShardsPreferredAzsResponse { + pub updated: Vec, +} + #[derive(Serialize, Deserialize, Debug)] pub struct TenantLocateResponseShard { pub shard_id: TenantShardId, @@ -132,6 +143,8 @@ pub struct TenantDescribeResponseShard { pub is_splitting: bool, pub scheduling_policy: ShardSchedulingPolicy, + + pub preferred_az_id: Option, } /// Explicitly migrating a particular shard is a low level operation diff --git a/storage_controller/migrations/2024-09-05-104500_tenant_shard_preferred_az/down.sql b/storage_controller/migrations/2024-09-05-104500_tenant_shard_preferred_az/down.sql new file mode 100644 index 0000000000..127972a2e4 --- /dev/null +++ b/storage_controller/migrations/2024-09-05-104500_tenant_shard_preferred_az/down.sql @@ -0,0 +1 @@ +ALTER TABLE tenant_shards DROP preferred_az_id; diff --git a/storage_controller/migrations/2024-09-05-104500_tenant_shard_preferred_az/up.sql b/storage_controller/migrations/2024-09-05-104500_tenant_shard_preferred_az/up.sql new file mode 100644 index 0000000000..641a54feb2 --- /dev/null +++ b/storage_controller/migrations/2024-09-05-104500_tenant_shard_preferred_az/up.sql @@ -0,0 +1 @@ +ALTER TABLE tenant_shards ADD preferred_az_id VARCHAR; diff --git a/storage_controller/src/http.rs b/storage_controller/src/http.rs index 32882c201a..5d4d0460be 100644 --- a/storage_controller/src/http.rs +++ b/storage_controller/src/http.rs @@ -14,7 +14,7 @@ use metrics::{BuildInfo, NeonMetrics}; use pageserver_api::controller_api::{ MetadataHealthListOutdatedRequest, MetadataHealthListOutdatedResponse, MetadataHealthListUnhealthyResponse, MetadataHealthUpdateRequest, MetadataHealthUpdateResponse, - TenantCreateRequest, + ShardsPreferredAzsRequest, TenantCreateRequest, }; use pageserver_api::models::{ TenantConfigRequest, TenantLocationConfigRequest, TenantShardSplitRequest, @@ -688,6 +688,18 @@ async fn handle_tenant_update_policy(mut req: Request) -> Result) -> Result, ApiError> { + check_permissions(&req, Scope::Admin)?; + + let azs_req = json_request::(&mut req).await?; + let state = get_state(&req); + + json_response( + StatusCode::OK, + 
state.service.update_shards_preferred_azs(azs_req).await?, + ) +} + async fn handle_step_down(req: Request) -> Result, ApiError> { check_permissions(&req, Scope::Admin)?; @@ -1174,6 +1186,13 @@ pub fn make_router( RequestName("control_v1_tenant_policy"), ) }) + .put("/control/v1/preferred_azs", |r| { + named_request_span( + r, + handle_update_preferred_azs, + RequestName("control_v1_preferred_azs"), + ) + }) .put("/control/v1/step_down", |r| { named_request_span(r, handle_step_down, RequestName("control_v1_step_down")) }) diff --git a/storage_controller/src/persistence.rs b/storage_controller/src/persistence.rs index 6df05ebd13..1dc1040d96 100644 --- a/storage_controller/src/persistence.rs +++ b/storage_controller/src/persistence.rs @@ -105,6 +105,7 @@ pub(crate) enum DatabaseOperation { ListMetadataHealthOutdated, GetLeader, UpdateLeader, + SetPreferredAzs, } #[must_use] @@ -664,6 +665,33 @@ impl Persistence { Ok(()) } + pub(crate) async fn set_tenant_shard_preferred_azs( + &self, + preferred_azs: Vec<(TenantShardId, String)>, + ) -> DatabaseResult> { + use crate::schema::tenant_shards::dsl::*; + + self.with_measured_conn(DatabaseOperation::SetPreferredAzs, move |conn| { + let mut shards_updated = Vec::default(); + + for (tenant_shard_id, preferred_az) in preferred_azs.iter() { + let updated = diesel::update(tenant_shards) + .filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string())) + .filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32)) + .filter(shard_count.eq(tenant_shard_id.shard_count.literal() as i32)) + .set(preferred_az_id.eq(preferred_az)) + .execute(conn)?; + + if updated == 1 { + shards_updated.push((*tenant_shard_id, preferred_az.clone())); + } + } + + Ok(shards_updated) + }) + .await + } + pub(crate) async fn detach(&self, tenant_shard_id: TenantShardId) -> anyhow::Result<()> { use crate::schema::tenant_shards::dsl::*; self.with_measured_conn(DatabaseOperation::Detach, move |conn| { @@ -1050,6 +1078,11 @@ pub(crate) struct TenantShardPersistence { pub(crate) config: String, #[serde(default)] pub(crate) scheduling_policy: String, + + // Hint that we should attempt to schedule this tenant shard the given + // availability zone in order to minimise the chances of cross-AZ communication + // with compute. + pub(crate) preferred_az_id: Option, } impl TenantShardPersistence { diff --git a/storage_controller/src/schema.rs b/storage_controller/src/schema.rs index 93ab774b5f..1717a9369d 100644 --- a/storage_controller/src/schema.rs +++ b/storage_controller/src/schema.rs @@ -41,6 +41,7 @@ diesel::table! 
{ splitting -> Int2, config -> Text, scheduling_policy -> Varchar, + preferred_az_id -> Nullable, } } diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index 2911cd5ac4..324f864291 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -25,7 +25,7 @@ use crate::{ ShardGenerationState, TenantFilter, }, reconciler::{ReconcileError, ReconcileUnits, ReconcilerConfig, ReconcilerConfigBuilder}, - scheduler::{MaySchedule, ScheduleContext, ScheduleMode}, + scheduler::{MaySchedule, ScheduleContext, ScheduleError, ScheduleMode}, tenant_shard::{ MigrateAttachment, ReconcileNeeded, ReconcilerStatus, ScheduleOptimization, ScheduleOptimizationAction, @@ -41,10 +41,11 @@ use itertools::Itertools; use pageserver_api::{ controller_api::{ MetadataHealthRecord, MetadataHealthUpdateRequest, NodeAvailability, NodeRegisterRequest, - NodeSchedulingPolicy, PlacementPolicy, ShardSchedulingPolicy, TenantCreateRequest, - TenantCreateResponse, TenantCreateResponseShard, TenantDescribeResponse, - TenantDescribeResponseShard, TenantLocateResponse, TenantPolicyRequest, - TenantShardMigrateRequest, TenantShardMigrateResponse, + NodeSchedulingPolicy, PlacementPolicy, ShardSchedulingPolicy, ShardsPreferredAzsRequest, + ShardsPreferredAzsResponse, TenantCreateRequest, TenantCreateResponse, + TenantCreateResponseShard, TenantDescribeResponse, TenantDescribeResponseShard, + TenantLocateResponse, TenantPolicyRequest, TenantShardMigrateRequest, + TenantShardMigrateResponse, }, models::{ SecondaryProgress, TenantConfigRequest, TimelineArchivalConfigRequest, @@ -353,6 +354,12 @@ impl From for ApiError { } } +enum InitialShardScheduleOutcome { + Scheduled(TenantCreateResponseShard), + NotScheduled, + ShardScheduleError(ScheduleError), +} + pub struct Service { inner: Arc>, config: Config, @@ -1452,6 +1459,7 @@ impl Service { splitting: SplitState::default(), scheduling_policy: serde_json::to_string(&ShardSchedulingPolicy::default()) .unwrap(), + preferred_az_id: None, }; match self.persistence.insert_tenant_shards(vec![tsp]).await { @@ -2023,6 +2031,7 @@ impl Service { splitting: SplitState::default(), scheduling_policy: serde_json::to_string(&ShardSchedulingPolicy::default()) .unwrap(), + preferred_az_id: None, }) .collect(); @@ -2046,99 +2055,87 @@ impl Service { }; let mut schedule_context = ScheduleContext::default(); + let mut schedule_error = None; + let mut response_shards = Vec::new(); + for tenant_shard_id in create_ids { + tracing::info!("Creating shard {tenant_shard_id}..."); - let (waiters, response_shards) = { + let outcome = self + .do_initial_shard_scheduling( + tenant_shard_id, + initial_generation, + &create_req.shard_parameters, + create_req.config.clone(), + placement_policy.clone(), + &mut schedule_context, + ) + .await; + + match outcome { + InitialShardScheduleOutcome::Scheduled(resp) => response_shards.push(resp), + InitialShardScheduleOutcome::NotScheduled => {} + InitialShardScheduleOutcome::ShardScheduleError(err) => { + schedule_error = Some(err); + } + } + } + + let preferred_azs = { + let locked = self.inner.read().unwrap(); + response_shards + .iter() + .filter_map(|resp| { + let az_id = locked + .nodes + .get(&resp.node_id) + .map(|n| n.get_availability_zone_id().to_string())?; + + Some((resp.shard_id, az_id)) + }) + .collect::>() + }; + + // Note that we persist the preferred AZ for the new shards separately. 
+ // In theory, we could "peek" the scheduler to determine where the shard will + // land, but the subsequent "real" call into the scheduler might select a different + // node. Hence, we do this awkward update to keep things consistent. + let updated = self + .persistence + .set_tenant_shard_preferred_azs(preferred_azs) + .await + .map_err(|err| { + ApiError::InternalServerError(anyhow::anyhow!( + "Failed to persist preferred az ids: {err}" + )) + })?; + + { let mut locked = self.inner.write().unwrap(); - let (nodes, tenants, scheduler) = locked.parts_mut(); - - let mut response_shards = Vec::new(); - let mut schcedule_error = None; - - for tenant_shard_id in create_ids { - tracing::info!("Creating shard {tenant_shard_id}..."); - - use std::collections::btree_map::Entry; - match tenants.entry(tenant_shard_id) { - Entry::Occupied(mut entry) => { - tracing::info!( - "Tenant shard {tenant_shard_id} already exists while creating" - ); - - // TODO: schedule() should take an anti-affinity expression that pushes - // attached and secondary locations (independently) away frorm those - // pageservers also holding a shard for this tenant. - - entry - .get_mut() - .schedule(scheduler, &mut schedule_context) - .map_err(|e| { - ApiError::Conflict(format!( - "Failed to schedule shard {tenant_shard_id}: {e}" - )) - })?; - - if let Some(node_id) = entry.get().intent.get_attached() { - let generation = entry - .get() - .generation - .expect("Generation is set when in attached mode"); - response_shards.push(TenantCreateResponseShard { - shard_id: tenant_shard_id, - node_id: *node_id, - generation: generation.into().unwrap(), - }); - } - - continue; - } - Entry::Vacant(entry) => { - let state = entry.insert(TenantShard::new( - tenant_shard_id, - ShardIdentity::from_params( - tenant_shard_id.shard_number, - &create_req.shard_parameters, - ), - placement_policy.clone(), - )); - - state.generation = initial_generation; - state.config = create_req.config.clone(); - if let Err(e) = state.schedule(scheduler, &mut schedule_context) { - schcedule_error = Some(e); - } - - // Only include shards in result if we are attaching: the purpose - // of the response is to tell the caller where the shards are attached. - if let Some(node_id) = state.intent.get_attached() { - let generation = state - .generation - .expect("Generation is set when in attached mode"); - response_shards.push(TenantCreateResponseShard { - shard_id: tenant_shard_id, - node_id: *node_id, - generation: generation.into().unwrap(), - }); - } - } - }; + for (tid, az_id) in updated { + if let Some(shard) = locked.tenants.get_mut(&tid) { + shard.set_preferred_az(az_id); + } } + } - // If we failed to schedule shards, then they are still created in the controller, - // but we return an error to the requester to avoid a silent failure when someone - // tries to e.g. create a tenant whose placement policy requires more nodes than - // are present in the system. We do this here rather than in the above loop, to - // avoid situations where we only create a subset of shards in the tenant. - if let Some(e) = schcedule_error { - return Err(ApiError::Conflict(format!( - "Failed to schedule shard(s): {e}" - ))); - } + // If we failed to schedule shards, then they are still created in the controller, + // but we return an error to the requester to avoid a silent failure when someone + // tries to e.g. create a tenant whose placement policy requires more nodes than + // are present in the system. 
We do this here rather than in the above loop, to + // avoid situations where we only create a subset of shards in the tenant. + if let Some(e) = schedule_error { + return Err(ApiError::Conflict(format!( + "Failed to schedule shard(s): {e}" + ))); + } - let waiters = tenants + let waiters = { + let mut locked = self.inner.write().unwrap(); + let (nodes, tenants, _scheduler) = locked.parts_mut(); + tenants .range_mut(TenantShardId::tenant_range(tenant_id)) .filter_map(|(_shard_id, shard)| self.maybe_reconcile_shard(shard, nodes)) - .collect::>(); - (waiters, response_shards) + .collect::>() }; Ok(( @@ -2149,6 +2146,78 @@ impl Service { )) } + /// Helper for tenant creation that does the scheduling for an individual shard. Covers both the + /// case of a new tenant and a pre-existing one. + async fn do_initial_shard_scheduling( + &self, + tenant_shard_id: TenantShardId, + initial_generation: Option, + shard_params: &ShardParameters, + config: TenantConfig, + placement_policy: PlacementPolicy, + schedule_context: &mut ScheduleContext, + ) -> InitialShardScheduleOutcome { + let mut locked = self.inner.write().unwrap(); + let (_nodes, tenants, scheduler) = locked.parts_mut(); + + use std::collections::btree_map::Entry; + match tenants.entry(tenant_shard_id) { + Entry::Occupied(mut entry) => { + tracing::info!("Tenant shard {tenant_shard_id} already exists while creating"); + + // TODO: schedule() should take an anti-affinity expression that pushes + // attached and secondary locations (independently) away frorm those + // pageservers also holding a shard for this tenant. + + if let Err(err) = entry.get_mut().schedule(scheduler, schedule_context) { + return InitialShardScheduleOutcome::ShardScheduleError(err); + } + + if let Some(node_id) = entry.get().intent.get_attached() { + let generation = entry + .get() + .generation + .expect("Generation is set when in attached mode"); + InitialShardScheduleOutcome::Scheduled(TenantCreateResponseShard { + shard_id: tenant_shard_id, + node_id: *node_id, + generation: generation.into().unwrap(), + }) + } else { + InitialShardScheduleOutcome::NotScheduled + } + } + Entry::Vacant(entry) => { + let state = entry.insert(TenantShard::new( + tenant_shard_id, + ShardIdentity::from_params(tenant_shard_id.shard_number, shard_params), + placement_policy, + )); + + state.generation = initial_generation; + state.config = config; + if let Err(e) = state.schedule(scheduler, schedule_context) { + return InitialShardScheduleOutcome::ShardScheduleError(e); + } + + // Only include shards in result if we are attaching: the purpose + // of the response is to tell the caller where the shards are attached. + if let Some(node_id) = state.intent.get_attached() { + let generation = state + .generation + .expect("Generation is set when in attached mode"); + InitialShardScheduleOutcome::Scheduled(TenantCreateResponseShard { + shard_id: tenant_shard_id, + node_id: *node_id, + generation: generation.into().unwrap(), + }) + } else { + InitialShardScheduleOutcome::NotScheduled + } + } + } + } + /// Helper for functions that reconcile a number of shards, and would like to do a timeout-bounded /// wait for reconciliation to complete before responding. 
async fn await_waiters( @@ -3511,6 +3580,7 @@ impl Service { is_pending_compute_notification: shard.pending_compute_notification, is_splitting: matches!(shard.splitting, SplitState::Splitting), scheduling_policy: *shard.get_scheduling_policy(), + preferred_az_id: shard.preferred_az().map(ToString::to_string), }) } @@ -4214,9 +4284,10 @@ impl Service { config: serde_json::to_string(&config).unwrap(), splitting: SplitState::Splitting, - // Scheduling policies do not carry through to children + // Scheduling policies and preferred AZ do not carry through to children scheduling_policy: serde_json::to_string(&ShardSchedulingPolicy::default()) .unwrap(), + preferred_az_id: None, }); } @@ -4336,6 +4407,47 @@ impl Service { let (response, child_locations, waiters) = self.tenant_shard_split_commit_inmem(tenant_id, new_shard_count, new_stripe_size); + // Now that we have scheduled the child shards, attempt to set their preferred AZ + // to that of the pageserver they've been attached on. + let preferred_azs = { + let locked = self.inner.read().unwrap(); + child_locations + .iter() + .filter_map(|(tid, node_id, _stripe_size)| { + let az_id = locked + .nodes + .get(node_id) + .map(|n| n.get_availability_zone_id().to_string())?; + + Some((*tid, az_id)) + }) + .collect::>() + }; + + let updated = self + .persistence + .set_tenant_shard_preferred_azs(preferred_azs) + .await + .map_err(|err| { + ApiError::InternalServerError(anyhow::anyhow!( + "Failed to persist preferred az ids: {err}" + )) + }); + + match updated { + Ok(updated) => { + let mut locked = self.inner.write().unwrap(); + for (tid, az_id) in updated { + if let Some(shard) = locked.tenants.get_mut(&tid) { + shard.set_preferred_az(az_id); + } + } + } + Err(err) => { + tracing::warn!("Failed to persist preferred AZs after split: {err}"); + } + } + // Send compute notifications for all the new shards let mut failed_notifications = Vec::new(); for (child_id, child_ps, stripe_size) in child_locations { @@ -6497,4 +6609,35 @@ impl Service { ) -> Result<(), DatabaseError> { self.persistence.safekeeper_upsert(record).await } + + pub(crate) async fn update_shards_preferred_azs( + &self, + req: ShardsPreferredAzsRequest, + ) -> Result { + let preferred_azs = req.preferred_az_ids.into_iter().collect::>(); + let updated = self + .persistence + .set_tenant_shard_preferred_azs(preferred_azs) + .await + .map_err(|err| { + ApiError::InternalServerError(anyhow::anyhow!( + "Failed to persist preferred AZs: {err}" + )) + })?; + + let mut updated_in_mem_and_db = Vec::default(); + + let mut locked = self.inner.write().unwrap(); + for (tid, az_id) in updated { + let shard = locked.tenants.get_mut(&tid); + if let Some(shard) = shard { + shard.set_preferred_az(az_id); + updated_in_mem_and_db.push(tid); + } + } + + Ok(ShardsPreferredAzsResponse { + updated: updated_in_mem_and_db, + }) + } } diff --git a/storage_controller/src/tenant_shard.rs b/storage_controller/src/tenant_shard.rs index 30723a3b36..cdb0633e2b 100644 --- a/storage_controller/src/tenant_shard.rs +++ b/storage_controller/src/tenant_shard.rs @@ -140,6 +140,10 @@ pub(crate) struct TenantShard { // Support/debug tool: if something is going wrong or flapping with scheduling, this may // be set to a non-active state to avoid making changes while the issue is fixed. scheduling_policy: ShardSchedulingPolicy, + + // We should attempt to schedule this shard in the provided AZ to + // decrease chances of cross-AZ compute. 
+ preferred_az_id: Option, } #[derive(Default, Clone, Debug, Serialize)] @@ -463,6 +467,7 @@ impl TenantShard { last_error: Arc::default(), pending_compute_notification: false, scheduling_policy: ShardSchedulingPolicy::default(), + preferred_az_id: None, } } @@ -1297,6 +1302,7 @@ impl TenantShard { pending_compute_notification: false, delayed_reconcile: false, scheduling_policy: serde_json::from_str(&tsp.scheduling_policy).unwrap(), + preferred_az_id: tsp.preferred_az_id, }) } @@ -1312,8 +1318,17 @@ impl TenantShard { config: serde_json::to_string(&self.config).unwrap(), splitting: SplitState::default(), scheduling_policy: serde_json::to_string(&self.scheduling_policy).unwrap(), + preferred_az_id: self.preferred_az_id.clone(), } } + + pub(crate) fn preferred_az(&self) -> Option<&str> { + self.preferred_az_id.as_deref() + } + + pub(crate) fn set_preferred_az(&mut self, preferred_az_id: String) { + self.preferred_az_id = Some(preferred_az_id); + } } #[cfg(test)] diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 0c692ceb69..18fbbde637 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -2560,7 +2560,7 @@ class NeonStorageController(MetricsGetter, LogUtils): def tenant_describe(self, tenant_id: TenantId): """ - :return: list of {"shard_id": "", "node_id": int, "listen_pg_addr": str, "listen_pg_port": int, "listen_http_addr: str, "listen_http_port: int} + :return: list of {"shard_id": "", "node_id": int, "listen_pg_addr": str, "listen_pg_port": int, "listen_http_addr: str, "listen_http_port: int, preferred_az_id: str} """ response = self.request( "GET", @@ -2886,6 +2886,17 @@ class NeonStorageController(MetricsGetter, LogUtils): return None raise e + def set_preferred_azs(self, preferred_azs: dict[TenantShardId, str]) -> list[TenantShardId]: + response = self.request( + "PUT", + f"{self.api}/control/v1/preferred_azs", + headers=self.headers(TokenScope.ADMIN), + json={str(tid): az for tid, az in preferred_azs.items()}, + ) + + response.raise_for_status() + return [TenantShardId.parse(tid) for tid in response.json()["updated"]] + def __enter__(self) -> "NeonStorageController": return self diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py index 8da42294b0..92cd74eba5 100644 --- a/test_runner/regress/test_storage_controller.py +++ b/test_runner/regress/test_storage_controller.py @@ -2512,3 +2512,55 @@ def eq_safekeeper_records(a: dict[str, Any], b: dict[str, Any]) -> bool: del d[key] return compared[0] == compared[1] + + +@run_only_on_default_postgres("this is like a 'unit test' against storcon db") +def test_shard_preferred_azs(neon_env_builder: NeonEnvBuilder): + def assign_az(ps_cfg): + az = f"az-{ps_cfg['id']}" + ps_cfg["availability_zone"] = az + + neon_env_builder.pageserver_config_override = assign_az + neon_env_builder.num_pageservers = 2 + env = neon_env_builder.init_configs() + env.start() + + tids = [TenantId.generate() for _ in range(0, 3)] + for tid in tids: + env.storage_controller.tenant_create(tid) + + shards = env.storage_controller.tenant_describe(tid)["shards"] + assert len(shards) == 1 + attached_to = shards[0]["node_attached"] + expected_az = env.get_pageserver(attached_to).az_id + + assert shards[0]["preferred_az_id"] == expected_az + + updated = env.storage_controller.set_preferred_azs( + {TenantShardId(tid, 0, 0): "foo" for tid in tids} + ) + + assert set(updated) == set([TenantShardId(tid, 0, 0) for tid in tids]) + + for tid 
in tids: + shards = env.storage_controller.tenant_describe(tid)["shards"] + assert len(shards) == 1 + assert shards[0]["preferred_az_id"] == "foo" + + # Generate a layer to avoid shard split handling on ps from tripping + # up on debug assert. + timeline_id = TimelineId.generate() + env.neon_cli.create_timeline("bar", tids[0], timeline_id) + + workload = Workload(env, tids[0], timeline_id, branch_name="bar") + workload.init() + workload.write_rows(256) + workload.validate() + + env.storage_controller.tenant_shard_split(tids[0], shard_count=2) + shards = env.storage_controller.tenant_describe(tids[0])["shards"] + assert len(shards) == 2 + for shard in shards: + attached_to = shard["node_attached"] + expected_az = env.get_pageserver(attached_to).az_id + assert shard["preferred_az_id"] == expected_az From cbcd4058edb7a2c2bb3bfe1a6fc1ffb0d820b870 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Fri, 6 Sep 2024 14:33:52 +0200 Subject: [PATCH 08/29] Fix 1.82 clippy lint too_long_first_doc_paragraph (#8941) Addresses the 1.82 beta clippy lint `too_long_first_doc_paragraph` by adding newlines to the first sentence if it is short enough, and making a short first sentence if there is the need. --- compute_tools/src/pg_helpers.rs | 7 ++++--- libs/metrics/src/lib.rs | 1 + libs/pageserver_api/src/controller_api.rs | 2 ++ libs/pageserver_api/src/models.rs | 10 +++++++--- libs/postgres_backend/src/lib.rs | 6 ++++-- libs/postgres_connection/src/lib.rs | 1 + libs/remote_storage/src/lib.rs | 6 +++++- libs/tenant_size_model/src/lib.rs | 7 ++++--- libs/utils/src/circuit_breaker.rs | 6 ++++-- libs/utils/src/id.rs | 6 ++++-- libs/utils/src/lock_file.rs | 4 +++- libs/utils/src/pageserver_feedback.rs | 1 + libs/utils/src/poison.rs | 2 ++ libs/utils/src/shard.rs | 9 +++++---- libs/utils/src/simple_rcu.rs | 7 +++---- libs/utils/src/sync/heavier_once_cell.rs | 4 +++- libs/utils/src/vec_map.rs | 1 + libs/utils/src/yielding_loop.rs | 7 ++++--- pageserver/src/config.rs | 2 ++ pageserver/src/context.rs | 10 ++++++---- pageserver/src/pgdatadir_mapping.rs | 8 +++++--- pageserver/src/tenant.rs | 9 +++++---- pageserver/src/tenant/metadata.rs | 9 +++++---- pageserver/src/tenant/mgr.rs | 12 +++++++----- pageserver/src/tenant/remote_timeline_client.rs | 2 ++ .../src/tenant/remote_timeline_client/index.rs | 1 + pageserver/src/tenant/storage_layer.rs | 9 +++++---- pageserver/src/tenant/storage_layer/delta_layer.rs | 9 +++++---- pageserver/src/tenant/storage_layer/image_layer.rs | 8 +++++--- pageserver/src/tenant/storage_layer/layer_desc.rs | 6 ++++-- pageserver/src/tenant/storage_layer/layer_name.rs | 5 +++-- .../src/tenant/storage_layer/merge_iterator.rs | 8 +++++--- .../src/tenant/storage_layer/split_writer.rs | 14 ++++++++------ pageserver/src/tenant/vectored_blob_io.rs | 6 ++++-- pageserver/src/virtual_file.rs | 5 +++-- pageserver/src/walredo.rs | 11 +++++------ proxy/src/stream.rs | 1 + safekeeper/src/pull_timeline.rs | 1 + safekeeper/src/receive_wal.rs | 6 ++++-- safekeeper/src/state.rs | 8 +++++--- safekeeper/src/timeline.rs | 1 + safekeeper/src/timeline_eviction.rs | 8 +++++--- safekeeper/src/timeline_guard.rs | 4 +++- safekeeper/src/timeline_manager.rs | 1 + safekeeper/src/timelines_set.rs | 3 ++- safekeeper/src/wal_backup_partial.rs | 6 ++++-- safekeeper/src/wal_service.rs | 1 + storage_controller/src/service.rs | 4 +++- storage_scrubber/src/garbage.rs | 7 ++++--- storage_scrubber/src/metadata_stream.rs | 4 +++- storage_scrubber/src/pageserver_physical_gc.rs | 7 ++++--- 51 files changed, 180 
insertions(+), 103 deletions(-) diff --git a/compute_tools/src/pg_helpers.rs b/compute_tools/src/pg_helpers.rs index 863fa9468f..b2dc265864 100644 --- a/compute_tools/src/pg_helpers.rs +++ b/compute_tools/src/pg_helpers.rs @@ -22,9 +22,10 @@ use compute_api::spec::{Database, GenericOption, GenericOptions, PgIdent, Role}; const POSTGRES_WAIT_TIMEOUT: Duration = Duration::from_millis(60 * 1000); // milliseconds -/// Escape a string for including it in a SQL literal. Wrapping the result -/// with `E'{}'` or `'{}'` is not required, as it returns a ready-to-use -/// SQL string literal, e.g. `'db'''` or `E'db\\'`. +/// Escape a string for including it in a SQL literal. +/// +/// Wrapping the result with `E'{}'` or `'{}'` is not required, +/// as it returns a ready-to-use SQL string literal, e.g. `'db'''` or `E'db\\'`. /// See /// for the original implementation. pub fn escape_literal(s: &str) -> String { diff --git a/libs/metrics/src/lib.rs b/libs/metrics/src/lib.rs index df000cd0fb..cd4526c089 100644 --- a/libs/metrics/src/lib.rs +++ b/libs/metrics/src/lib.rs @@ -68,6 +68,7 @@ macro_rules! register_uint_gauge { static INTERNAL_REGISTRY: Lazy = Lazy::new(Registry::new); /// Register a collector in the internal registry. MUST be called before the first call to `gather()`. +/// /// Otherwise, we can have a deadlock in the `gather()` call, trying to register a new collector /// while holding the lock. pub fn register_internal(c: Box) -> prometheus::Result<()> { diff --git a/libs/pageserver_api/src/controller_api.rs b/libs/pageserver_api/src/controller_api.rs index 94104af002..5c8dcbf571 100644 --- a/libs/pageserver_api/src/controller_api.rs +++ b/libs/pageserver_api/src/controller_api.rs @@ -147,6 +147,8 @@ pub struct TenantDescribeResponseShard { pub preferred_az_id: Option, } +/// Migration request for a given tenant shard to a given node. +/// /// Explicitly migrating a particular shard is a low level operation /// TODO: higher level "Reschedule tenant" operation where the request /// specifies some constraints, e.g. asking it to get off particular node(s) diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index d13d04eb1b..ffe79c8350 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -305,8 +305,10 @@ pub struct TenantConfig { pub lsn_lease_length_for_ts: Option, } -/// The policy for the aux file storage. It can be switched through `switch_aux_file_policy` -/// tenant config. When the first aux file written, the policy will be persisted in the +/// The policy for the aux file storage. +/// +/// It can be switched through `switch_aux_file_policy` tenant config. +/// When the first aux file written, the policy will be persisted in the /// `index_part.json` file and has a limited migration path. /// /// Currently, we only allow the following migration path: @@ -896,7 +898,9 @@ pub struct WalRedoManagerStatus { pub process: Option, } -/// The progress of a secondary tenant is mostly useful when doing a long running download: e.g. initiating +/// The progress of a secondary tenant. +/// +/// It is mostly useful when doing a long running download: e.g. initiating /// a download job, timing out while waiting for it to run, and then inspecting this status to understand /// what's happening. 
#[derive(Default, Debug, Serialize, Deserialize, Clone)] diff --git a/libs/postgres_backend/src/lib.rs b/libs/postgres_backend/src/lib.rs index 7c7c6535b3..600f1d728c 100644 --- a/libs/postgres_backend/src/lib.rs +++ b/libs/postgres_backend/src/lib.rs @@ -69,8 +69,10 @@ impl QueryError { } /// Returns true if the given error is a normal consequence of a network issue, -/// or the client closing the connection. These errors can happen during normal -/// operations, and don't indicate a bug in our code. +/// or the client closing the connection. +/// +/// These errors can happen during normal operations, +/// and don't indicate a bug in our code. pub fn is_expected_io_error(e: &io::Error) -> bool { use io::ErrorKind::*; matches!( diff --git a/libs/postgres_connection/src/lib.rs b/libs/postgres_connection/src/lib.rs index 9f57f3d507..ddf9f7b610 100644 --- a/libs/postgres_connection/src/lib.rs +++ b/libs/postgres_connection/src/lib.rs @@ -7,6 +7,7 @@ use std::fmt; use url::Host; /// Parses a string of format either `host:port` or `host` into a corresponding pair. +/// /// The `host` part should be a correct `url::Host`, while `port` (if present) should be /// a valid decimal u16 of digits only. pub fn parse_host_port>(host_port: S) -> Result<(Host, Option), anyhow::Error> { diff --git a/libs/remote_storage/src/lib.rs b/libs/remote_storage/src/lib.rs index cc1d3e0ae4..b5b69c9faf 100644 --- a/libs/remote_storage/src/lib.rs +++ b/libs/remote_storage/src/lib.rs @@ -45,6 +45,8 @@ pub use azure_core::Etag; pub use error::{DownloadError, TimeTravelError, TimeoutOrCancel}; +/// Default concurrency limit for S3 operations +/// /// Currently, sync happens with AWS S3, that has two limits on requests per second: /// ~200 RPS for IAM services /// @@ -300,7 +302,9 @@ pub trait RemoteStorage: Send + Sync + 'static { ) -> Result<(), TimeTravelError>; } -/// DownloadStream is sensitive to the timeout and cancellation used with the original +/// Data part of an ongoing [`Download`]. +/// +/// `DownloadStream` is sensitive to the timeout and cancellation used with the original /// [`RemoteStorage::download`] request. The type yields `std::io::Result` to be compatible /// with `tokio::io::copy_buf`. // This has 'static because safekeepers do not use cancellation tokens (yet) diff --git a/libs/tenant_size_model/src/lib.rs b/libs/tenant_size_model/src/lib.rs index a3e12cf0e3..974a498404 100644 --- a/libs/tenant_size_model/src/lib.rs +++ b/libs/tenant_size_model/src/lib.rs @@ -5,9 +5,10 @@ mod calculation; pub mod svg; -/// StorageModel is the input to the synthetic size calculation. It represents -/// a tree of timelines, with just the information that's needed for the -/// calculation. This doesn't track timeline names or where each timeline +/// StorageModel is the input to the synthetic size calculation. +/// +/// It represents a tree of timelines, with just the information that's needed +/// for the calculation. This doesn't track timeline names or where each timeline /// begins and ends, for example. Instead, it consists of "points of interest" /// on the timelines. 
A point of interest could be the timeline start or end point, /// the oldest point on a timeline that needs to be retained because of PITR diff --git a/libs/utils/src/circuit_breaker.rs b/libs/utils/src/circuit_breaker.rs index 720ea39d4f..e1ddfd8650 100644 --- a/libs/utils/src/circuit_breaker.rs +++ b/libs/utils/src/circuit_breaker.rs @@ -5,8 +5,10 @@ use std::{ use metrics::IntCounter; -/// Circuit breakers are for operations that are expensive and fallible: if they fail repeatedly, -/// we will stop attempting them for some period of time, to avoid denial-of-service from retries, and +/// Circuit breakers are for operations that are expensive and fallible. +/// +/// If a circuit breaker fails repeatedly, we will stop attempting it for some +/// period of time, to avoid denial-of-service from retries, and /// to mitigate the log spam from repeated failures. pub struct CircuitBreaker { /// An identifier that enables us to log useful errors when a circuit is broken diff --git a/libs/utils/src/id.rs b/libs/utils/src/id.rs index db468e3054..2cda899b15 100644 --- a/libs/utils/src/id.rs +++ b/libs/utils/src/id.rs @@ -249,8 +249,10 @@ macro_rules! id_newtype { }; } -/// Neon timeline IDs are different from PostgreSQL timeline -/// IDs. They serve a similar purpose though: they differentiate +/// Neon timeline ID. +/// +/// They are different from PostgreSQL timeline +/// IDs, but serve a similar purpose: they differentiate /// between different "histories" of the same cluster. However, /// PostgreSQL timeline IDs are a bit cumbersome, because they are only /// 32-bits wide, and they must be in ascending order in any given diff --git a/libs/utils/src/lock_file.rs b/libs/utils/src/lock_file.rs index 59c66ca757..3a2ed3e830 100644 --- a/libs/utils/src/lock_file.rs +++ b/libs/utils/src/lock_file.rs @@ -100,7 +100,9 @@ pub enum LockFileRead { } /// Open & try to lock the lock file at the given `path`, returning a [handle][`LockFileRead`] to -/// inspect its content. It is not an `Err(...)` if the file does not exist or is already locked. +/// inspect its content. +/// +/// It is not an `Err(...)` if the file does not exist or is already locked. /// Check the [`LockFileRead`] variants for details. pub fn read_and_hold_lock_file(path: &Utf8Path) -> anyhow::Result { let res = fs::OpenOptions::new().read(true).open(path); diff --git a/libs/utils/src/pageserver_feedback.rs b/libs/utils/src/pageserver_feedback.rs index 3ddfa44f41..dede65e699 100644 --- a/libs/utils/src/pageserver_feedback.rs +++ b/libs/utils/src/pageserver_feedback.rs @@ -8,6 +8,7 @@ use tracing::{trace, warn}; use crate::lsn::Lsn; /// Feedback pageserver sends to safekeeper and safekeeper resends to compute. +/// /// Serialized in custom flexible key/value format. In replication protocol, it /// is marked with NEON_STATUS_UPDATE_TAG_BYTE to differentiate from postgres /// Standby status update / Hot standby feedback messages. diff --git a/libs/utils/src/poison.rs b/libs/utils/src/poison.rs index 27378c69fc..c3e2fba20c 100644 --- a/libs/utils/src/poison.rs +++ b/libs/utils/src/poison.rs @@ -65,6 +65,8 @@ impl Poison { } } +/// Armed pointer to a [`Poison`]. +/// /// Use [`Self::data`] and [`Self::data_mut`] to access the wrapped state. /// Once modifications are done, use [`Self::disarm`]. 
/// If [`Guard`] gets dropped instead of calling [`Self::disarm`], the state is poisoned diff --git a/libs/utils/src/shard.rs b/libs/utils/src/shard.rs index f6b430657e..d146010b41 100644 --- a/libs/utils/src/shard.rs +++ b/libs/utils/src/shard.rs @@ -13,10 +13,11 @@ pub struct ShardNumber(pub u8); #[derive(Ord, PartialOrd, Eq, PartialEq, Clone, Copy, Serialize, Deserialize, Debug, Hash)] pub struct ShardCount(pub u8); -/// Combination of ShardNumber and ShardCount. For use within the context of a particular tenant, -/// when we need to know which shard we're dealing with, but do not need to know the full -/// ShardIdentity (because we won't be doing any page->shard mapping), and do not need to know -/// the fully qualified TenantShardId. +/// Combination of ShardNumber and ShardCount. +/// +/// For use within the context of a particular tenant, when we need to know which shard we're +/// dealing with, but do not need to know the full ShardIdentity (because we won't be doing +/// any page->shard mapping), and do not need to know the fully qualified TenantShardId. #[derive(Eq, PartialEq, PartialOrd, Ord, Clone, Copy, Hash)] pub struct ShardIndex { pub shard_number: ShardNumber, diff --git a/libs/utils/src/simple_rcu.rs b/libs/utils/src/simple_rcu.rs index ecc5353be3..01750b2aef 100644 --- a/libs/utils/src/simple_rcu.rs +++ b/libs/utils/src/simple_rcu.rs @@ -49,12 +49,11 @@ use std::sync::{RwLock, RwLockWriteGuard}; use tokio::sync::watch; -/// /// Rcu allows multiple readers to read and hold onto a value without blocking -/// (for very long). Storing to the Rcu updates the value, making new readers -/// immediately see the new value, but it also waits for all current readers to -/// finish. +/// (for very long). /// +/// Storing to the Rcu updates the value, making new readers immediately see +/// the new value, but it also waits for all current readers to finish. pub struct Rcu { inner: RwLock>, } diff --git a/libs/utils/src/sync/heavier_once_cell.rs b/libs/utils/src/sync/heavier_once_cell.rs index 1abd3d9861..dc711fb028 100644 --- a/libs/utils/src/sync/heavier_once_cell.rs +++ b/libs/utils/src/sync/heavier_once_cell.rs @@ -5,7 +5,9 @@ use std::sync::{ use tokio::sync::Semaphore; /// Custom design like [`tokio::sync::OnceCell`] but using [`OwnedSemaphorePermit`] instead of -/// `SemaphorePermit`, allowing use of `take` which does not require holding an outer mutex guard +/// `SemaphorePermit`. +/// +/// Allows use of `take` which does not require holding an outer mutex guard /// for the duration of initialization. /// /// Has no unsafe, builds upon [`tokio::sync::Semaphore`] and [`std::sync::Mutex`]. diff --git a/libs/utils/src/vec_map.rs b/libs/utils/src/vec_map.rs index 18b2af14f1..5f0028bacd 100644 --- a/libs/utils/src/vec_map.rs +++ b/libs/utils/src/vec_map.rs @@ -7,6 +7,7 @@ pub enum VecMapOrdering { } /// Ordered map datastructure implemented in a Vec. +/// /// Append only - can only add keys that are larger than the /// current max key. /// Ordering can be adjusted using [`VecMapOrdering`] diff --git a/libs/utils/src/yielding_loop.rs b/libs/utils/src/yielding_loop.rs index 41c4cee45d..68274f0631 100644 --- a/libs/utils/src/yielding_loop.rs +++ b/libs/utils/src/yielding_loop.rs @@ -6,9 +6,10 @@ pub enum YieldingLoopError { Cancelled, } -/// Helper for long synchronous loops, e.g. over all tenants in the system. Periodically -/// yields to avoid blocking the executor, and after resuming checks the provided -/// cancellation token to drop out promptly on shutdown. 
+/// Helper for long synchronous loops, e.g. over all tenants in the system. +/// +/// Periodically yields to avoid blocking the executor, and after resuming +/// checks the provided cancellation token to drop out promptly on shutdown. #[inline(always)] pub async fn yielding_loop( interval: usize, diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index 4e68e276d3..29a98855d3 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -180,6 +180,8 @@ pub struct PageServerConf { pub io_buffer_alignment: usize, } +/// Token for authentication to safekeepers +/// /// We do not want to store this in a PageServerConf because the latter may be logged /// and/or serialized at a whim, while the token is secret. Currently this token is the /// same for accessing all tenants/timelines, but may become per-tenant/per-timeline in diff --git a/pageserver/src/context.rs b/pageserver/src/context.rs index 012cb8d96f..7afcf52cf2 100644 --- a/pageserver/src/context.rs +++ b/pageserver/src/context.rs @@ -1,7 +1,9 @@ -//! This module defines `RequestContext`, a structure that we use throughout -//! the pageserver to propagate high-level context from places -//! that _originate_ activity down to the shared code paths at the -//! heart of the pageserver. It's inspired by Golang's `context.Context`. +//! Defines [`RequestContext`]. +//! +//! It is a structure that we use throughout the pageserver to propagate +//! high-level context from places that _originate_ activity down to the +//! shared code paths at the heart of the pageserver. It's inspired by +//! Golang's `context.Context`. //! //! For example, in `Timeline::get(page_nr, lsn)` we need to answer the following questions: //! 1. What high-level activity ([`TaskKind`]) needs this page? diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index d28a214265..808d4b666e 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -1021,9 +1021,10 @@ impl Timeline { } /// DatadirModification represents an operation to ingest an atomic set of -/// updates to the repository. It is created by the 'begin_record' -/// function. It is called for each WAL record, so that all the modifications -/// by a one WAL record appear atomic. +/// updates to the repository. +/// +/// It is created by the 'begin_record' function. It is called for each WAL +/// record, so that all the modifications by a one WAL record appear atomic. pub struct DatadirModification<'a> { /// The timeline this modification applies to. You can access this to /// read the state, but note that any pending updates are *not* reflected @@ -2048,6 +2049,7 @@ impl<'a> DatadirModification<'a> { /// This struct facilitates accessing either a committed key from the timeline at a /// specific LSN, or the latest uncommitted key from a pending modification. +/// /// During WAL ingestion, the records from multiple LSNs may be batched in the same /// modification before being flushed to the timeline. Hence, the routines in WalIngest /// need to look up the keys in the modification first before looking them up in the diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index fb30857ddf..fd2520a42e 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -1,8 +1,9 @@ +//! Timeline repository implementation that keeps old data in layer files, and +//! the recent changes in ephemeral files. //! -//! Timeline repository implementation that keeps old data in files on disk, and -//! 
the recent changes in memory. See tenant/*_layer.rs files. -//! The functions here are responsible for locating the correct layer for the -//! get/put call, walking back the timeline branching history as needed. +//! See tenant/*_layer.rs files. The functions here are responsible for locating +//! the correct layer for the get/put call, walking back the timeline branching +//! history as needed. //! //! The files are stored in the .neon/tenants//timelines/ //! directory. See docs/pageserver-storage.md for how the files are managed. diff --git a/pageserver/src/tenant/metadata.rs b/pageserver/src/tenant/metadata.rs index 190316df42..24440d4b35 100644 --- a/pageserver/src/tenant/metadata.rs +++ b/pageserver/src/tenant/metadata.rs @@ -1,7 +1,8 @@ -//! Describes the legacy now hopefully no longer modified per-timeline metadata stored in -//! `index_part.json` managed by [`remote_timeline_client`]. For many tenants and their timelines, -//! this struct and it's original serialization format is still needed because they were written a -//! long time ago. +//! Describes the legacy now hopefully no longer modified per-timeline metadata. +//! +//! It is stored in `index_part.json` managed by [`remote_timeline_client`]. For many tenants and +//! their timelines, this struct and its original serialization format is still needed because +//! they were written a long time ago. //! //! Instead of changing and adding versioning to this, just change [`IndexPart`] with soft json //! versioning. diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index 4e6ea0c8f9..2104f41531 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -282,9 +282,10 @@ impl BackgroundPurges { static TENANTS: Lazy> = Lazy::new(|| std::sync::RwLock::new(TenantsMap::Initializing)); -/// The TenantManager is responsible for storing and mutating the collection of all tenants -/// that this pageserver process has state for. Every Tenant and SecondaryTenant instance -/// lives inside the TenantManager. +/// Responsible for storing and mutating the collection of all tenants +/// that this pageserver has state for. +/// +/// Every Tenant and SecondaryTenant instance lives inside the TenantManager. /// /// The most important role of the TenantManager is to prevent conflicts: e.g. trying to attach /// the same tenant twice concurrently, or trying to configure the same tenant into secondary @@ -2346,8 +2347,9 @@ pub enum TenantMapError { ShuttingDown, } -/// Guards a particular tenant_id's content in the TenantsMap. While this -/// structure exists, the TenantsMap will contain a [`TenantSlot::InProgress`] +/// Guards a particular tenant_id's content in the TenantsMap. +/// +/// While this structure exists, the TenantsMap will contain a [`TenantSlot::InProgress`] /// for this tenant, which acts as a marker for any operations targeting /// this tenant to retry later, or wait for the InProgress state to end. 
/// diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs index 71b766e4c7..1f9ae40af5 100644 --- a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -2184,6 +2184,8 @@ pub fn remote_timeline_path( remote_timelines_path(tenant_shard_id).join(Utf8Path::new(&timeline_id.to_string())) } +/// Obtains the path of the given Layer in the remote +/// /// Note that the shard component of a remote layer path is _not_ always the same /// as in the TenantShardId of the caller: tenants may reference layers from a different /// ShardIndex. Use the ShardIndex from the layer's metadata. diff --git a/pageserver/src/tenant/remote_timeline_client/index.rs b/pageserver/src/tenant/remote_timeline_client/index.rs index 757fb9d032..c51ff54919 100644 --- a/pageserver/src/tenant/remote_timeline_client/index.rs +++ b/pageserver/src/tenant/remote_timeline_client/index.rs @@ -1,4 +1,5 @@ //! In-memory index to track the tenant files on the remote storage. +//! //! Able to restore itself from the storage index parts, that are located in every timeline's remote directory and contain all data about //! remote timeline layers and its metadata. diff --git a/pageserver/src/tenant/storage_layer.rs b/pageserver/src/tenant/storage_layer.rs index a1202ad507..dac6b2f893 100644 --- a/pageserver/src/tenant/storage_layer.rs +++ b/pageserver/src/tenant/storage_layer.rs @@ -434,10 +434,11 @@ impl ReadableLayer { } } -/// Layers contain a hint indicating whether they are likely to be used for reads. This is a hint rather -/// than an authoritative value, so that we do not have to update it synchronously when changing the visibility -/// of layers (for example when creating a branch that makes some previously covered layers visible). It should -/// be used for cache management but not for correctness-critical checks. +/// Layers contain a hint indicating whether they are likely to be used for reads. +/// +/// This is a hint rather than an authoritative value, so that we do not have to update it synchronously +/// when changing the visibility of layers (for example when creating a branch that makes some previously +/// covered layers visible). It should be used for cache management but not for correctness-critical checks. #[derive(Debug, Clone, PartialEq, Eq)] pub enum LayerVisibilityHint { /// A Visible layer might be read while serving a read, because there is not an image layer between it diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs index 6a2cd94232..34f1b15138 100644 --- a/pageserver/src/tenant/storage_layer/delta_layer.rs +++ b/pageserver/src/tenant/storage_layer/delta_layer.rs @@ -136,10 +136,11 @@ impl Summary { // Flag indicating that this version initialize the page const WILL_INIT: u64 = 1; -/// Struct representing reference to BLOB in layers. Reference contains BLOB -/// offset, and for WAL records it also contains `will_init` flag. The flag -/// helps to determine the range of records that needs to be applied, without -/// reading/deserializing records themselves. +/// Struct representing reference to BLOB in layers. +/// +/// Reference contains BLOB offset, and for WAL records it also contains +/// `will_init` flag. The flag helps to determine the range of records +/// that needs to be applied, without reading/deserializing records themselves. 
#[derive(Debug, Serialize, Deserialize, Copy, Clone)] pub struct BlobRef(pub u64); diff --git a/pageserver/src/tenant/storage_layer/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs index 77ce1ae670..875e223c9c 100644 --- a/pageserver/src/tenant/storage_layer/image_layer.rs +++ b/pageserver/src/tenant/storage_layer/image_layer.rs @@ -1,7 +1,9 @@ //! An ImageLayer represents an image or a snapshot of a key-range at -//! one particular LSN. It contains an image of all key-value pairs -//! in its key-range. Any key that falls into the image layer's range -//! but does not exist in the layer, does not exist. +//! one particular LSN. +//! +//! It contains an image of all key-value pairs in its key-range. Any key +//! that falls into the image layer's range but does not exist in the layer, +//! does not exist. //! //! An image layer is stored in a file on disk. The file is stored in //! timelines/ directory. Currently, there are no diff --git a/pageserver/src/tenant/storage_layer/layer_desc.rs b/pageserver/src/tenant/storage_layer/layer_desc.rs index cbd18e650f..e90ff3c4b2 100644 --- a/pageserver/src/tenant/storage_layer/layer_desc.rs +++ b/pageserver/src/tenant/storage_layer/layer_desc.rs @@ -12,8 +12,10 @@ use serde::{Deserialize, Serialize}; #[cfg(test)] use utils::id::TenantId; -/// A unique identifier of a persistent layer. This is different from `LayerDescriptor`, which is only used in the -/// benchmarks. This struct contains all necessary information to find the image / delta layer. It also provides +/// A unique identifier of a persistent layer. +/// +/// This is different from `LayerDescriptor`, which is only used in the benchmarks. +/// This struct contains all necessary information to find the image / delta layer. It also provides /// a unified way to generate layer information like file name. #[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize, Hash)] pub struct PersistentLayerDesc { diff --git a/pageserver/src/tenant/storage_layer/layer_name.rs b/pageserver/src/tenant/storage_layer/layer_name.rs index 47ae556279..ffe7ca5f3e 100644 --- a/pageserver/src/tenant/storage_layer/layer_name.rs +++ b/pageserver/src/tenant/storage_layer/layer_name.rs @@ -217,8 +217,9 @@ impl fmt::Display for ImageLayerName { } } -/// LayerName is the logical identity of a layer within a LayerMap at a moment in time. The -/// LayerName is not a unique filename, as the same LayerName may have multiple physical incarnations +/// LayerName is the logical identity of a layer within a LayerMap at a moment in time. +/// +/// The LayerName is not a unique filename, as the same LayerName may have multiple physical incarnations /// over time (e.g. across shard splits or compression). The physical filenames of layers in local /// storage and object names in remote storage consist of the LayerName plus some extra qualifiers /// that uniquely identify the physical incarnation of a layer (see [crate::tenant::remote_timeline_client::remote_layer_path]) diff --git a/pageserver/src/tenant/storage_layer/merge_iterator.rs b/pageserver/src/tenant/storage_layer/merge_iterator.rs index d2c341e5ce..0831fd9530 100644 --- a/pageserver/src/tenant/storage_layer/merge_iterator.rs +++ b/pageserver/src/tenant/storage_layer/merge_iterator.rs @@ -226,9 +226,11 @@ impl<'a> IteratorWrapper<'a> { } } -/// A merge iterator over delta/image layer iterators. When duplicated records are -/// found, the iterator will not perform any deduplication, and the caller should handle -/// these situation. 
By saying duplicated records, there are many possibilities: +/// A merge iterator over delta/image layer iterators. +/// +/// When duplicated records are found, the iterator will not perform any +/// deduplication, and the caller should handle these situation. By saying +/// duplicated records, there are many possibilities: /// /// * Two same delta at the same LSN. /// * Two same image at the same LSN. diff --git a/pageserver/src/tenant/storage_layer/split_writer.rs b/pageserver/src/tenant/storage_layer/split_writer.rs index e8deb0a1e5..7c1ac863bf 100644 --- a/pageserver/src/tenant/storage_layer/split_writer.rs +++ b/pageserver/src/tenant/storage_layer/split_writer.rs @@ -34,9 +34,10 @@ impl SplitWriterResult { } } -/// An image writer that takes images and produces multiple image layers. The interface does not -/// guarantee atomicity (i.e., if the image layer generation fails, there might be leftover files -/// to be cleaned up) +/// An image writer that takes images and produces multiple image layers. +/// +/// The interface does not guarantee atomicity (i.e., if the image layer generation +/// fails, there might be leftover files to be cleaned up) #[must_use] pub struct SplitImageLayerWriter { inner: ImageLayerWriter, @@ -193,9 +194,10 @@ impl SplitImageLayerWriter { } } -/// A delta writer that takes key-lsn-values and produces multiple delta layers. The interface does not -/// guarantee atomicity (i.e., if the delta layer generation fails, there might be leftover files -/// to be cleaned up). +/// A delta writer that takes key-lsn-values and produces multiple delta layers. +/// +/// The interface does not guarantee atomicity (i.e., if the delta layer generation fails, +/// there might be leftover files to be cleaned up). /// /// Note that if updates of a single key exceed the target size limit, all of the updates will be batched /// into a single file. This behavior might change in the future. For reference, the legacy compaction algorithm diff --git a/pageserver/src/tenant/vectored_blob_io.rs b/pageserver/src/tenant/vectored_blob_io.rs index 4d51dc442d..553edf6d8b 100644 --- a/pageserver/src/tenant/vectored_blob_io.rs +++ b/pageserver/src/tenant/vectored_blob_io.rs @@ -593,8 +593,10 @@ impl<'a> VectoredBlobReader<'a> { } } -/// Read planner used in [`crate::tenant::storage_layer::image_layer::ImageLayerIterator`]. It provides a streaming API for -/// getting read blobs. It returns a batch when `handle` gets called and when the current key would just exceed the read_size and +/// Read planner used in [`crate::tenant::storage_layer::image_layer::ImageLayerIterator`]. +/// +/// It provides a streaming API for getting read blobs. It returns a batch when +/// `handle` gets called and when the current key would just exceed the read_size and /// max_cnt constraints. pub struct StreamingVectoredReadPlanner { read_builder: Option, diff --git a/pageserver/src/virtual_file.rs b/pageserver/src/virtual_file.rs index ed6ff86c10..57856eea80 100644 --- a/pageserver/src/virtual_file.rs +++ b/pageserver/src/virtual_file.rs @@ -1,6 +1,7 @@ -//! //! VirtualFile is like a normal File, but it's not bound directly to -//! a file descriptor. Instead, the file is opened when it's read from, +//! a file descriptor. +//! +//! Instead, the file is opened when it's read from, //! and if too many files are open globally in the system, least-recently //! used ones are closed. //! 
diff --git a/pageserver/src/walredo.rs b/pageserver/src/walredo.rs index 82585f9ed8..a36955fa21 100644 --- a/pageserver/src/walredo.rs +++ b/pageserver/src/walredo.rs @@ -43,13 +43,12 @@ use utils::lsn::Lsn; use utils::sync::gate::GateError; use utils::sync::heavier_once_cell; +/// The real implementation that uses a Postgres process to +/// perform WAL replay. /// -/// This is the real implementation that uses a Postgres process to -/// perform WAL replay. Only one thread can use the process at a time, -/// that is controlled by the Mutex. In the future, we might want to -/// launch a pool of processes to allow concurrent replay of multiple -/// records. -/// +/// Only one thread can use the process at a time, that is controlled by the +/// Mutex. In the future, we might want to launch a pool of processes to allow +/// concurrent replay of multiple records. pub struct PostgresRedoManager { tenant_shard_id: TenantShardId, conf: &'static PageServerConf, diff --git a/proxy/src/stream.rs b/proxy/src/stream.rs index 332dc27787..c14dd18afe 100644 --- a/proxy/src/stream.rs +++ b/proxy/src/stream.rs @@ -14,6 +14,7 @@ use tokio::io::{AsyncRead, AsyncWrite, ReadBuf}; use tokio_rustls::server::TlsStream; /// Stream wrapper which implements libpq's protocol. +/// /// NOTE: This object deliberately doesn't implement [`AsyncRead`] /// or [`AsyncWrite`] to prevent subtle errors (e.g. trying /// to pass random malformed bytes through the connection). diff --git a/safekeeper/src/pull_timeline.rs b/safekeeper/src/pull_timeline.rs index 600a6bd8f0..64585f5edc 100644 --- a/safekeeper/src/pull_timeline.rs +++ b/safekeeper/src/pull_timeline.rs @@ -484,6 +484,7 @@ pub async fn validate_temp_timeline( } /// Move timeline from a temp directory to the main storage, and load it to the global map. +/// /// This operation is done under a lock to prevent bugs if several concurrent requests are /// trying to load the same timeline. Note that it doesn't guard against creating the /// timeline with the same ttid, but no one should be doing this anyway. diff --git a/safekeeper/src/receive_wal.rs b/safekeeper/src/receive_wal.rs index ab8c76dc17..e35f806e90 100644 --- a/safekeeper/src/receive_wal.rs +++ b/safekeeper/src/receive_wal.rs @@ -448,8 +448,10 @@ async fn network_write( const KEEPALIVE_INTERVAL: Duration = Duration::from_secs(1); /// Encapsulates a task which takes messages from msg_rx, processes and pushes -/// replies to reply_tx; reading from socket and writing to disk in parallel is -/// beneficial for performance, this struct provides writing to disk part. +/// replies to reply_tx. +/// +/// Reading from socket and writing to disk in parallel is beneficial for +/// performance, this struct provides the writing to disk part. pub struct WalAcceptor { tli: WalResidentTimeline, msg_rx: Receiver, diff --git a/safekeeper/src/state.rs b/safekeeper/src/state.rs index dca6414082..97eeae3638 100644 --- a/safekeeper/src/state.rs +++ b/safekeeper/src/state.rs @@ -147,9 +147,11 @@ pub struct TimelineMemState { pub proposer_uuid: PgUuid, } -/// Safekeeper persistent state plus in memory layer, to avoid frequent fsyncs -/// when we update fields like commit_lsn which don't need immediate -/// persistence. Provides transactional like API to atomically update the state. +/// Safekeeper persistent state plus in memory layer. +/// +/// Allows us to avoid frequent fsyncs when we update fields like commit_lsn +/// which don't need immediate persistence. Provides transactional like API +/// to atomically update the state. 
/// /// Implements Deref into *persistent* part. pub struct TimelineState { diff --git a/safekeeper/src/timeline.rs b/safekeeper/src/timeline.rs index 95ee925e1a..6fd5de0ad6 100644 --- a/safekeeper/src/timeline.rs +++ b/safekeeper/src/timeline.rs @@ -169,6 +169,7 @@ impl<'a> Drop for WriteGuardSharedState<'a> { } /// This structure is stored in shared state and represents the state of the timeline. +/// /// Usually it holds SafeKeeper, but it also supports offloaded timeline state. In this /// case, SafeKeeper is not available (because WAL is not present on disk) and all /// operations can be done only with control file. diff --git a/safekeeper/src/timeline_eviction.rs b/safekeeper/src/timeline_eviction.rs index 5d0567575c..5aa4921a92 100644 --- a/safekeeper/src/timeline_eviction.rs +++ b/safekeeper/src/timeline_eviction.rs @@ -1,6 +1,8 @@ -//! Code related to evicting WAL files to remote storage. The actual upload is done by the -//! partial WAL backup code. This file has code to delete and re-download WAL files, -//! cross-validate with partial WAL backup if local file is still present. +//! Code related to evicting WAL files to remote storage. +//! +//! The actual upload is done by the partial WAL backup code. This file has +//! code to delete and re-download WAL files, cross-validate with partial WAL +//! backup if local file is still present. use anyhow::Context; use camino::Utf8PathBuf; diff --git a/safekeeper/src/timeline_guard.rs b/safekeeper/src/timeline_guard.rs index dbdf46412d..1ddac573d2 100644 --- a/safekeeper/src/timeline_guard.rs +++ b/safekeeper/src/timeline_guard.rs @@ -1,4 +1,6 @@ -//! Timeline residence guard is needed to ensure that WAL segments are present on disk, +//! Timeline residence guard +//! +//! It is needed to ensure that WAL segments are present on disk, //! as long as the code is holding the guard. This file implements guard logic, to issue //! and drop guards, and to notify the manager when the guard is dropped. diff --git a/safekeeper/src/timeline_manager.rs b/safekeeper/src/timeline_manager.rs index f997f48454..6be75479db 100644 --- a/safekeeper/src/timeline_manager.rs +++ b/safekeeper/src/timeline_manager.rs @@ -1,4 +1,5 @@ //! The timeline manager task is responsible for managing the timeline's background tasks. +//! //! It is spawned alongside each timeline and exits when the timeline is deleted. //! It watches for changes in the timeline state and decides when to spawn or kill background tasks. //! It also can manage some reactive state, like should the timeline be active for broker pushes or not. diff --git a/safekeeper/src/timelines_set.rs b/safekeeper/src/timelines_set.rs index d6eea79f82..096e348295 100644 --- a/safekeeper/src/timelines_set.rs +++ b/safekeeper/src/timelines_set.rs @@ -60,7 +60,8 @@ impl TimelinesSet { } } -/// Guard is used to add or remove timeline from the set. +/// Guard is used to add or remove timelines from the set. +/// /// If the timeline present in set, it will be removed from it on drop. /// Note: do not use more than one guard for the same timeline, it caches the presence state. /// It is designed to be used in the manager task only. diff --git a/safekeeper/src/wal_backup_partial.rs b/safekeeper/src/wal_backup_partial.rs index 4050a82fff..bddfca50e4 100644 --- a/safekeeper/src/wal_backup_partial.rs +++ b/safekeeper/src/wal_backup_partial.rs @@ -1,6 +1,8 @@ //! Safekeeper timeline has a background task which is subscribed to `commit_lsn` -//! and `flush_lsn` updates. 
After the partial segment was updated (`flush_lsn` -//! was changed), the segment will be uploaded to S3 in about 15 minutes. +//! and `flush_lsn` updates. +//! +//! After the partial segment was updated (`flush_lsn` was changed), the segment +//! will be uploaded to S3 within the configured `partial_backup_timeout`. //! //! The filename format for partial segments is //! `Segment_Term_Flush_Commit_skNN.partial`, where: diff --git a/safekeeper/src/wal_service.rs b/safekeeper/src/wal_service.rs index 16f7748eb4..1ab54d4cce 100644 --- a/safekeeper/src/wal_service.rs +++ b/safekeeper/src/wal_service.rs @@ -17,6 +17,7 @@ use crate::SafeKeeperConf; use postgres_backend::{AuthType, PostgresBackend}; /// Accept incoming TCP connections and spawn them into a background thread. +/// /// allowed_auth_scope is either SafekeeperData (wide JWT tokens giving access /// to any tenant are allowed) or Tenant (only tokens giving access to specific /// tenant are allowed). Doesn't matter if auth is disabled in conf. diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index 324f864291..e7eae647df 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -117,7 +117,9 @@ pub(crate) const STARTUP_RECONCILE_TIMEOUT: Duration = Duration::from_secs(30); pub const MAX_OFFLINE_INTERVAL_DEFAULT: Duration = Duration::from_secs(30); /// How long a node may be unresponsive to heartbeats during start up before we declare it -/// offline. This is much more lenient than [`MAX_OFFLINE_INTERVAL_DEFAULT`] since the pageserver's +/// offline. +/// +/// This is much more lenient than [`MAX_OFFLINE_INTERVAL_DEFAULT`] since the pageserver's /// handling of the re-attach response may take a long time and blocks heartbeats from /// being handled on the pageserver side. pub const MAX_WARMING_UP_INTERVAL_DEFAULT: Duration = Duration::from_secs(300); diff --git a/storage_scrubber/src/garbage.rs b/storage_scrubber/src/garbage.rs index 3e22960f8d..d53611ed6e 100644 --- a/storage_scrubber/src/garbage.rs +++ b/storage_scrubber/src/garbage.rs @@ -1,6 +1,7 @@ -//! Functionality for finding and purging garbage, as in "garbage collection". Garbage means -//! S3 objects which are either not referenced by any metadata, or are referenced by a -//! control plane tenant/timeline in a deleted state. +//! Functionality for finding and purging garbage, as in "garbage collection". +//! +//! Garbage means S3 objects which are either not referenced by any metadata, +//! or are referenced by a control plane tenant/timeline in a deleted state. use std::{ collections::{HashMap, HashSet}, diff --git a/storage_scrubber/src/metadata_stream.rs b/storage_scrubber/src/metadata_stream.rs index 10d77937f1..f896cff2d5 100644 --- a/storage_scrubber/src/metadata_stream.rs +++ b/storage_scrubber/src/metadata_stream.rs @@ -74,7 +74,9 @@ pub async fn stream_tenant_shards<'a>( } /// Given a `TenantShardId`, output a stream of the timelines within that tenant, discovered -/// using a listing. The listing is done before the stream is built, so that this +/// using a listing. +/// +/// The listing is done before the stream is built, so that this /// function can be used to generate concurrency on a stream using buffer_unordered. 
pub async fn stream_tenant_timelines<'a>( remote_client: &'a GenericRemoteStorage, diff --git a/storage_scrubber/src/pageserver_physical_gc.rs b/storage_scrubber/src/pageserver_physical_gc.rs index 88681e38c2..c96d9cad3b 100644 --- a/storage_scrubber/src/pageserver_physical_gc.rs +++ b/storage_scrubber/src/pageserver_physical_gc.rs @@ -440,9 +440,10 @@ async fn gc_ancestor( Ok(()) } -/// Physical garbage collection: removing unused S3 objects. This is distinct from the garbage collection -/// done inside the pageserver, which operates at a higher level (keys, layers). This type of garbage collection -/// is about removing: +/// Physical garbage collection: removing unused S3 objects. +/// +/// This is distinct from the garbage collection done inside the pageserver, which operates at a higher level +/// (keys, layers). This type of garbage collection is about removing: /// - Objects that were uploaded but never referenced in the remote index (e.g. because of a shutdown between /// uploading a layer and uploading an index) /// - Index objects from historic generations From e287f36a058221b7c804b4b0f440933962eb3deb Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Fri, 6 Sep 2024 15:23:57 +0300 Subject: [PATCH 09/29] safekeeper: fix endpoint restart immediately after xlog switch. Check that truncation point is not from the future by comparing it with write_record_lsn, not write_lsn, and explain that xlog switch changes their normal order. ref https://github.com/neondatabase/neon/issues/8911 --- safekeeper/src/safekeeper.rs | 3 ++- safekeeper/src/wal_storage.rs | 23 ++++++++++++++++++----- test_runner/regress/test_wal_acceptor.py | 18 ++++++++++++++++++ 3 files changed, 38 insertions(+), 6 deletions(-) diff --git a/safekeeper/src/safekeeper.rs b/safekeeper/src/safekeeper.rs index dbe0034de2..b3e006ab05 100644 --- a/safekeeper/src/safekeeper.rs +++ b/safekeeper/src/safekeeper.rs @@ -938,8 +938,9 @@ where } trace!( - "processed AppendRequest of len {}, end_lsn={:?}, commit_lsn={:?}, truncate_lsn={:?}, flushed={:?}", + "processed AppendRequest of len {}, begin_lsn={}, end_lsn={:?}, commit_lsn={:?}, truncate_lsn={:?}, flushed={:?}", msg.wal_data.len(), + msg.h.begin_lsn, msg.h.end_lsn, msg.h.commit_lsn, msg.h.truncate_lsn, diff --git a/safekeeper/src/wal_storage.rs b/safekeeper/src/wal_storage.rs index 89c2e98a94..c477fe5c7b 100644 --- a/safekeeper/src/wal_storage.rs +++ b/safekeeper/src/wal_storage.rs @@ -98,7 +98,19 @@ pub struct PhysicalStorage { /// Also can be ahead of record_lsn, if happen to be in the middle of a WAL record. write_lsn: Lsn, - /// The LSN of the last WAL record written to disk. Still can be not fully flushed. + /// The LSN of the last WAL record written to disk. Still can be not fully + /// flushed. + /// + /// Note: Normally it (and flush_record_lsn) is <= write_lsn, but after xlog + /// switch ingest the reverse is true because we don't bump write_lsn up to + /// the next segment: WAL stream from the compute doesn't have the gap and + /// for simplicity / as a sanity check we disallow any non-sequential + /// writes, so write zeros as is. + /// + /// Similar effect is in theory possible due to LSN alignment: if record + /// ends at *2, decoder will report end lsn as *8 even though we haven't + /// written these zeros yet. In practice compute likely never sends + /// non-aligned chunks of data. write_record_lsn: Lsn, /// The LSN of the last WAL record flushed to disk. 
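// Illustrative sketch of the ordering described in the comment added above, using
// plain u64 values in place of `Lsn`; all LSNs here are made-up examples, not taken
// from the patch itself.
fn main() {
    const WAL_SEGMENT_SIZE: u64 = 16 * 1024 * 1024;

    // Normal ingest: the end of the last complete record never runs ahead of the
    // bytes actually written to disk.
    let write_lsn: u64 = 0x2000;
    let write_record_lsn: u64 = 0x1ff8;
    assert!(write_record_lsn <= write_lsn);

    // Hypothetical xlog switch ingested early in the segment: the decoder reports
    // the record end at the next segment boundary, while write_lsn is not bumped
    // past the data actually received from the compute, so the usual order reverses.
    let write_lsn: u64 = 0x3000;
    let write_record_lsn: u64 = WAL_SEGMENT_SIZE;
    assert!(write_record_lsn > write_lsn);

    // Hence the guard in the next hunk compares the requested truncation point
    // against write_record_lsn rather than write_lsn: after an endpoint restart
    // right at the switch, the truncation point can equal the new segment start,
    // which is ahead of write_lsn but not of write_record_lsn.
    let end_pos: u64 = WAL_SEGMENT_SIZE;
    assert!(end_pos <= write_record_lsn);
}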
@@ -440,11 +452,12 @@ impl Storage for PhysicalStorage { .with_label_values(&["truncate_wal"]) .start_timer(); - // Streaming must not create a hole, so truncate cannot be called on non-written lsn - if self.write_lsn != Lsn(0) && end_pos > self.write_lsn { + // Streaming must not create a hole, so truncate cannot be called on + // non-written lsn. + if self.write_record_lsn != Lsn(0) && end_pos > self.write_record_lsn { bail!( - "truncate_wal called on non-written WAL, write_lsn={}, end_pos={}", - self.write_lsn, + "truncate_wal called on non-written WAL, write_record_lsn={}, end_pos={}", + self.write_record_lsn, end_pos ); } diff --git a/test_runner/regress/test_wal_acceptor.py b/test_runner/regress/test_wal_acceptor.py index 3785651aed..5672e836ee 100644 --- a/test_runner/regress/test_wal_acceptor.py +++ b/test_runner/regress/test_wal_acceptor.py @@ -1057,6 +1057,24 @@ def test_restart_endpoint(neon_env_builder: NeonEnvBuilder): endpoint.start() +# Try restarting endpoint immediately after xlog switch. +# https://github.com/neondatabase/neon/issues/8911 +def test_restart_endpoint_after_switch_wal(neon_env_builder: NeonEnvBuilder): + env = neon_env_builder.init_start() + + endpoint = env.endpoints.create_start("main") + + endpoint.safe_psql("create table t (i int)") + + endpoint.safe_psql("SELECT pg_switch_wal()") + + # we want immediate shutdown to have endpoint restart on xlog switch record, + # so prevent shutdown checkpoint. + endpoint.stop(mode="immediate") + endpoint = env.endpoints.create_start("main") + endpoint.safe_psql("SELECT 'works'") + + # Context manager which logs passed time on exit. class DurationLogger: def __init__(self, desc): From af6f63617e7421fca62ad2bf7ebfe2f0de66a793 Mon Sep 17 00:00:00 2001 From: Folke Behrens Date: Fri, 6 Sep 2024 17:13:30 +0200 Subject: [PATCH 10/29] proxy: clean up code and lints for 1.81 and 1.82 (#8945) --- proxy/src/cache/timed_lru.rs | 2 +- proxy/src/lib.rs | 25 +++++++++++++++---------- proxy/src/scram/exchange.rs | 2 ++ proxy/src/serverless/sql_over_http.rs | 9 +++------ 4 files changed, 21 insertions(+), 17 deletions(-) diff --git a/proxy/src/cache/timed_lru.rs b/proxy/src/cache/timed_lru.rs index 8bb482f7c6..5b08d74696 100644 --- a/proxy/src/cache/timed_lru.rs +++ b/proxy/src/cache/timed_lru.rs @@ -16,7 +16,7 @@ use tracing::debug; // On the other hand, `hashlink` has good download stats and appears to be maintained. use hashlink::{linked_hash_map::RawEntryMut, LruCache}; -use super::{common::Cached, *}; +use super::{common::Cached, timed_lru, Cache}; /// An implementation of timed LRU cache with fixed capacity. /// Key properties: diff --git a/proxy/src/lib.rs b/proxy/src/lib.rs index 8d7e586b3d..923d6ae288 100644 --- a/proxy/src/lib.rs +++ b/proxy/src/lib.rs @@ -44,16 +44,14 @@ clippy::items_after_statements, )] // List of temporarily allowed lints. -// TODO: Switch to except() once stable with 1.81. // TODO: fix code and reduce list or move to permanent list above. 
-#![allow( +#![expect( clippy::cargo_common_metadata, clippy::cast_possible_truncation, clippy::cast_possible_wrap, clippy::cast_precision_loss, clippy::cast_sign_loss, clippy::doc_markdown, - clippy::implicit_hasher, clippy::inline_always, clippy::match_same_arms, clippy::match_wild_err_arm, @@ -61,21 +59,28 @@ clippy::missing_panics_doc, clippy::module_name_repetitions, clippy::needless_pass_by_value, - clippy::needless_raw_string_hashes, clippy::redundant_closure_for_method_calls, - clippy::return_self_not_must_use, clippy::similar_names, clippy::single_match_else, clippy::struct_excessive_bools, clippy::struct_field_names, clippy::too_many_lines, - clippy::unreadable_literal, - clippy::unused_async, - clippy::unused_self, - clippy::wildcard_imports + clippy::unused_self +)] +#![cfg_attr( + any(test, feature = "testing"), + allow( + clippy::needless_raw_string_hashes, + clippy::unreadable_literal, + clippy::unused_async, + ) )] // List of temporarily allowed lints to unblock beta/nightly. -#![allow(unknown_lints, clippy::manual_inspect)] +#![allow( + unknown_lints, + // TODO: 1.82: Add `use` where necessary and remove from this list. + impl_trait_overcaptures, +)] use std::{convert::Infallible, future::Future}; diff --git a/proxy/src/scram/exchange.rs b/proxy/src/scram/exchange.rs index 786cbcaa19..afb5604666 100644 --- a/proxy/src/scram/exchange.rs +++ b/proxy/src/scram/exchange.rs @@ -217,6 +217,7 @@ impl sasl::Mechanism for Exchange<'_> { self.state = ExchangeState::SaltSent(sent); Ok(Step::Continue(self, msg)) } + #[allow(unreachable_patterns)] // TODO: 1.82: simply drop this match Step::Success(x, _) => match x {}, Step::Failure(msg) => Ok(Step::Failure(msg)), } @@ -224,6 +225,7 @@ impl sasl::Mechanism for Exchange<'_> { ExchangeState::SaltSent(sent) => { match sent.transition(self.secret, &self.tls_server_end_point, input)? { Step::Success(keys, msg) => Ok(Step::Success(keys, msg)), + #[allow(unreachable_patterns)] // TODO: 1.82: simply drop this match Step::Continue(x, _) => match x {}, Step::Failure(msg) => Ok(Step::Failure(msg)), } diff --git a/proxy/src/serverless/sql_over_http.rs b/proxy/src/serverless/sql_over_http.rs index 5b36f5e91d..2188edc8c5 100644 --- a/proxy/src/serverless/sql_over_http.rs +++ b/proxy/src/serverless/sql_over_http.rs @@ -745,22 +745,20 @@ impl BatchQueryData { builder = builder.deferrable(true); } - let transaction = builder.start().await.map_err(|e| { + let transaction = builder.start().await.inspect_err(|_| { // if we cannot start a transaction, we should return immediately // and not return to the pool. 
connection is clearly broken discard.discard(); - e })?; let json_output = match query_batch(cancel.child_token(), &transaction, self, parsed_headers).await { Ok(json_output) => { info!("commit"); - let status = transaction.commit().await.map_err(|e| { + let status = transaction.commit().await.inspect_err(|_| { // if we cannot commit - for now don't return connection to pool // TODO: get a query status from the error discard.discard(); - e })?; discard.check_idle(status); json_output @@ -776,11 +774,10 @@ impl BatchQueryData { } Err(err) => { info!("rollback"); - let status = transaction.rollback().await.map_err(|e| { + let status = transaction.rollback().await.inspect_err(|_| { // if we cannot rollback - for now don't return connection to pool // TODO: get a query status from the error discard.discard(); - e })?; discard.check_idle(status); return Err(err); From 11cf16e3f363ce027e53b1834a77858d50daee0d Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Mon, 19 Aug 2024 14:42:07 +0300 Subject: [PATCH 11/29] safekeeper: add term_bump endpoint. When walproposer observes now higher term it restarts instead of crashing whole compute with PANIC; this avoids compute crash after term_bump call. After successfull election we're still checking last_log_term of the highest given vote to ensure basebackup is good, and PANIC otherwise. It will be used for migration per 035-safekeeper-dynamic-membership-change.md and https://github.com/neondatabase/docs/pull/21 ref https://github.com/neondatabase/neon/issues/8700 --- libs/safekeeper_api/src/models.rs | 13 +++++++++ pgxn/neon/walproposer.c | 24 ++++++++++----- safekeeper/src/auth.rs | 3 ++ safekeeper/src/http/routes.rs | 28 +++++++++++++++++- safekeeper/src/state.rs | 26 +++++++++++++++-- safekeeper/src/timeline.rs | 10 +++++++ test_runner/fixtures/safekeeper/http.py | 29 +++++++++++++++++++ test_runner/regress/test_wal_acceptor.py | 37 ++++++++++++++++++++++++ 8 files changed, 159 insertions(+), 11 deletions(-) diff --git a/libs/safekeeper_api/src/models.rs b/libs/safekeeper_api/src/models.rs index 2fbc333075..28666d197a 100644 --- a/libs/safekeeper_api/src/models.rs +++ b/libs/safekeeper_api/src/models.rs @@ -60,3 +60,16 @@ pub struct TimelineCopyRequest { pub target_timeline_id: TimelineId, pub until_lsn: Lsn, } + +#[derive(Debug, Clone, Deserialize, Serialize)] +pub struct TimelineTermBumpRequest { + /// bump to + pub term: Option, +} + +#[derive(Debug, Clone, Deserialize, Serialize)] +pub struct TimelineTermBumpResponse { + // before the request + pub previous_term: u64, + pub current_term: u64, +} diff --git a/pgxn/neon/walproposer.c b/pgxn/neon/walproposer.c index c53257923a..c1914421ec 100644 --- a/pgxn/neon/walproposer.c +++ b/pgxn/neon/walproposer.c @@ -1038,9 +1038,12 @@ DetermineEpochStartLsn(WalProposer *wp) if (SkipXLogPageHeader(wp, wp->propEpochStartLsn) != wp->api.get_redo_start_lsn(wp)) { /* - * However, allow to proceed if previously elected leader was me; - * plain restart of walproposer not intervened by concurrent - * compute (who could generate WAL) is ok. + * However, allow to proceed if last_log_term on the node which gave + * the highest vote (i.e. point where we are going to start writing) + * actually had been won by me; plain restart of walproposer not + * intervened by concurrent compute which wrote WAL is ok. + * + * This avoids compute crash after manual term_bump. 
*/ if (!((dth->n_entries >= 1) && (dth->entries[dth->n_entries - 1].term == pg_atomic_read_u64(&walprop_shared->mineLastElectedTerm)))) @@ -1442,12 +1445,17 @@ RecvAppendResponses(Safekeeper *sk) if (sk->appendResponse.term > wp->propTerm) { /* - * Another compute with higher term is running. Panic to restart - * PG as we likely need to retake basebackup. However, don't dump - * core as this is kinda expected scenario. + * + * Term has changed to higher one, probably another compute is + * running. If this is the case we could PANIC as well because + * likely it inserted some data and our basebackup is unsuitable + * anymore. However, we also bump term manually (term_bump endpoint) + * on safekeepers for migration purposes, in this case we do want + * compute to stay alive. So restart walproposer with FATAL instead + * of panicking; if basebackup is spoiled next election will notice + * this. */ - disable_core_dump(); - wp_log(PANIC, "WAL acceptor %s:%s with term " INT64_FORMAT " rejected our request, our term " INT64_FORMAT ", meaning another compute is running at the same time, and it conflicts with us", + wp_log(FATAL, "WAL acceptor %s:%s with term " INT64_FORMAT " rejected our request, our term " INT64_FORMAT ", meaning another compute is running at the same time, and it conflicts with us", sk->host, sk->port, sk->appendResponse.term, wp->propTerm); } diff --git a/safekeeper/src/auth.rs b/safekeeper/src/auth.rs index b8bc3f3e06..c5c9393c00 100644 --- a/safekeeper/src/auth.rs +++ b/safekeeper/src/auth.rs @@ -1,6 +1,9 @@ use utils::auth::{AuthError, Claims, Scope}; use utils::id::TenantId; +/// If tenant_id is provided, allow if token (claims) is for this tenant or +/// whole safekeeper scope (SafekeeperData). Else, allow only if token is +/// SafekeeperData. pub fn check_permission(claims: &Claims, tenant_id: Option) -> Result<(), AuthError> { match (&claims.scope, tenant_id) { (Scope::Tenant, None) => Err(AuthError( diff --git a/safekeeper/src/http/routes.rs b/safekeeper/src/http/routes.rs index 9b7424a818..e482edea55 100644 --- a/safekeeper/src/http/routes.rs +++ b/safekeeper/src/http/routes.rs @@ -18,8 +18,8 @@ use utils::http::endpoint::{prometheus_metrics_handler, request_span, ChannelWri use utils::http::request::parse_query_param; use postgres_ffi::WAL_SEGMENT_SIZE; -use safekeeper_api::models::TimelineCreateRequest; use safekeeper_api::models::{SkTimelineInfo, TimelineCopyRequest}; +use safekeeper_api::models::{TimelineCreateRequest, TimelineTermBumpRequest}; use utils::{ auth::SwappableJwtAuth, http::{ @@ -408,6 +408,28 @@ async fn timeline_backup_partial_reset(request: Request) -> Result, +) -> Result, ApiError> { + let ttid = TenantTimelineId::new( + parse_request_param(&request, "tenant_id")?, + parse_request_param(&request, "timeline_id")?, + ); + check_permission(&request, Some(ttid.tenant_id))?; + + let request_data: TimelineTermBumpRequest = json_request(&mut request).await?; + + let tli = GlobalTimelines::get(ttid).map_err(ApiError::from)?; + let response = tli + .term_bump(request_data.term) + .await + .map_err(ApiError::InternalServerError)?; + + json_response(StatusCode::OK, response) +} + /// Used only in tests to hand craft required data. 
async fn record_safekeeper_info(mut request: Request) -> Result, ApiError> { let ttid = TenantTimelineId::new( @@ -630,6 +652,10 @@ pub fn make_router(conf: SafeKeeperConf) -> RouterBuilder "/v1/tenant/:tenant_id/timeline/:timeline_id/backup_partial_reset", |r| request_span(r, timeline_backup_partial_reset), ) + .post( + "/v1/tenant/:tenant_id/timeline/:timeline_id/term_bump", + |r| request_span(r, timeline_term_bump_handler), + ) .post("/v1/record_safekeeper_info/:tenant_id/:timeline_id", |r| { request_span(r, record_safekeeper_info) }) diff --git a/safekeeper/src/state.rs b/safekeeper/src/state.rs index 97eeae3638..8ae749ded5 100644 --- a/safekeeper/src/state.rs +++ b/safekeeper/src/state.rs @@ -1,9 +1,10 @@ //! Defines per timeline data stored persistently (SafeKeeperPersistentState) //! and its wrapper with in memory layer (SafekeeperState). -use std::ops::Deref; +use std::{cmp::max, ops::Deref}; use anyhow::Result; +use safekeeper_api::models::TimelineTermBumpResponse; use serde::{Deserialize, Serialize}; use utils::{ id::{NodeId, TenantId, TenantTimelineId, TimelineId}, @@ -12,7 +13,7 @@ use utils::{ use crate::{ control_file, - safekeeper::{AcceptorState, PersistedPeerInfo, PgUuid, ServerInfo, TermHistory}, + safekeeper::{AcceptorState, PersistedPeerInfo, PgUuid, ServerInfo, Term, TermHistory}, wal_backup_partial::{self}, }; @@ -211,6 +212,27 @@ where let s = self.start_change(); self.finish_change(&s).await } + + /// Make term at least as `to`. If `to` is None, increment current one. This + /// is not in safekeeper.rs because we want to be able to do it even if + /// timeline is offloaded. + pub async fn term_bump(&mut self, to: Option) -> Result { + let before = self.acceptor_state.term; + let mut state = self.start_change(); + let new = match to { + Some(to) => max(state.acceptor_state.term, to), + None => state.acceptor_state.term + 1, + }; + if new > state.acceptor_state.term { + state.acceptor_state.term = new; + self.finish_change(&state).await?; + } + let after = self.acceptor_state.term; + Ok(TimelineTermBumpResponse { + previous_term: before, + current_term: after, + }) + } } impl Deref for TimelineState diff --git a/safekeeper/src/timeline.rs b/safekeeper/src/timeline.rs index 6fd5de0ad6..fb98534768 100644 --- a/safekeeper/src/timeline.rs +++ b/safekeeper/src/timeline.rs @@ -4,6 +4,7 @@ use anyhow::{anyhow, bail, Result}; use camino::Utf8PathBuf; use remote_storage::RemotePath; +use safekeeper_api::models::TimelineTermBumpResponse; use serde::{Deserialize, Serialize}; use tokio::fs::{self}; use tokio_util::sync::CancellationToken; @@ -215,6 +216,10 @@ impl StateSK { .get_last_log_term(self.flush_lsn()) } + pub async fn term_bump(&mut self, to: Option) -> Result { + self.state_mut().term_bump(to).await + } + /// Close open WAL files to release FDs. fn close_wal_store(&mut self) { if let StateSK::Loaded(sk) = self { @@ -854,6 +859,11 @@ impl Timeline { Ok(res) } + pub async fn term_bump(self: &Arc, to: Option) -> Result { + let mut state = self.write_shared_state().await; + state.sk.term_bump(to).await + } + /// Get the timeline guard for reading/writing WAL files. /// If WAL files are not present on disk (evicted), they will be automatically /// downloaded from remote storage. 
This is done in the manager task, which is diff --git a/test_runner/fixtures/safekeeper/http.py b/test_runner/fixtures/safekeeper/http.py index 9bf03554e7..96c84d1616 100644 --- a/test_runner/fixtures/safekeeper/http.py +++ b/test_runner/fixtures/safekeeper/http.py @@ -50,6 +50,19 @@ class SafekeeperMetrics(Metrics): ).value +@dataclass +class TermBumpResponse: + previous_term: int + current_term: int + + @classmethod + def from_json(cls, d: Dict[str, Any]) -> "TermBumpResponse": + return TermBumpResponse( + previous_term=d["previous_term"], + current_term=d["current_term"], + ) + + class SafekeeperHttpClient(requests.Session, MetricsGetter): HTTPError = requests.HTTPError @@ -252,6 +265,22 @@ class SafekeeperHttpClient(requests.Session, MetricsGetter): res.raise_for_status() return res.json() + def term_bump( + self, + tenant_id: TenantId, + timeline_id: TimelineId, + term: Optional[int], + ) -> TermBumpResponse: + body = {} + if term is not None: + body["term"] = term + res = self.post( + f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/term_bump", + json=body, + ) + res.raise_for_status() + return TermBumpResponse.from_json(res.json()) + def record_safekeeper_info(self, tenant_id: TenantId, timeline_id: TimelineId, body): res = self.post( f"http://localhost:{self.port}/v1/record_safekeeper_info/{tenant_id}/{timeline_id}", diff --git a/test_runner/regress/test_wal_acceptor.py b/test_runner/regress/test_wal_acceptor.py index 5672e836ee..50fac441c0 100644 --- a/test_runner/regress/test_wal_acceptor.py +++ b/test_runner/regress/test_wal_acceptor.py @@ -2194,6 +2194,43 @@ def test_patch_control_file(neon_env_builder: NeonEnvBuilder): assert res["timelines"][0]["control_file"]["timeline_start_lsn"] == "0/1" +def test_term_bump(neon_env_builder: NeonEnvBuilder): + neon_env_builder.num_safekeepers = 1 + env = neon_env_builder.init_start() + + tenant_id = env.initial_tenant + timeline_id = env.initial_timeline + + endpoint = env.endpoints.create_start("main") + # initialize safekeeper + endpoint.safe_psql("create table t(key int, value text)") + + http_cli = env.safekeepers[0].http_client() + + # check that bump up to specific term works + curr_term = http_cli.timeline_status(tenant_id, timeline_id).term + bump_to = curr_term + 3 + res = http_cli.term_bump(tenant_id, timeline_id, bump_to) + log.info(f"bump to {bump_to} res: {res}") + assert res.current_term >= bump_to + + # check that bump to none increments current term + res = http_cli.term_bump(tenant_id, timeline_id, None) + log.info(f"bump to None res: {res}") + assert res.current_term > bump_to + assert res.current_term > res.previous_term + + # check that bumping doesn't work downward + res = http_cli.term_bump(tenant_id, timeline_id, 2) + log.info(f"bump to 2 res: {res}") + assert res.current_term > bump_to + assert res.current_term == res.previous_term + + # check that this doesn't kill endpoint because last WAL flush was his and + # thus its basebackup is still good + endpoint.safe_psql("insert into t values (1, 'payload')") + + # Test disables periodic pushes from safekeeper to the broker and checks that # pageserver can still discover safekeepers with discovery requests. 
def test_broker_discovery(neon_env_builder: NeonEnvBuilder): From 8eab7009c11ebc03f09b2f3916e642664a4b9f88 Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Fri, 6 Sep 2024 16:54:45 +0300 Subject: [PATCH 12/29] safekeeper: do pid file lock before id init --- safekeeper/src/bin/safekeeper.rs | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/safekeeper/src/bin/safekeeper.rs b/safekeeper/src/bin/safekeeper.rs index 41c2d3fe08..644d5e6eaf 100644 --- a/safekeeper/src/bin/safekeeper.rs +++ b/safekeeper/src/bin/safekeeper.rs @@ -261,6 +261,15 @@ async fn main() -> anyhow::Result<()> { // Change into the data directory. std::env::set_current_dir(&workdir)?; + // Prevent running multiple safekeepers on the same directory + let lock_file_path = workdir.join(PID_FILE_NAME); + let lock_file = + pid_file::claim_for_current_process(&lock_file_path).context("claim pid file")?; + info!("claimed pid file at {lock_file_path:?}"); + // ensure that the lock file is held even if the main thread of the process is panics + // we need to release the lock file only when the current process is gone + std::mem::forget(lock_file); + // Set or read our ID. let id = set_id(&workdir, args.id.map(NodeId))?; if args.init { @@ -364,16 +373,6 @@ async fn main() -> anyhow::Result<()> { type JoinTaskRes = Result, JoinError>; async fn start_safekeeper(conf: SafeKeeperConf) -> Result<()> { - // Prevent running multiple safekeepers on the same directory - let lock_file_path = conf.workdir.join(PID_FILE_NAME); - let lock_file = - pid_file::claim_for_current_process(&lock_file_path).context("claim pid file")?; - info!("claimed pid file at {lock_file_path:?}"); - - // ensure that the lock file is held even if the main thread of the process is panics - // we need to release the lock file only when the current process is gone - std::mem::forget(lock_file); - info!("starting safekeeper WAL service on {}", conf.listen_pg_addr); let pg_listener = tcp_listener::bind(conf.listen_pg_addr.clone()).map_err(|e| { error!("failed to bind to address {}: {}", conf.listen_pg_addr, e); From c1a51416dbfadbf05cc352168ebc6fc4a83c6f59 Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Fri, 6 Sep 2024 17:40:21 +0300 Subject: [PATCH 13/29] safekeeper: fsync filesystem on start. We can't really rely on files contents after boot without fsync'ing them. --- libs/utils/src/crashsafe.rs | 21 +++++++++++++++++++++ pageserver/src/bin/pageserver.rs | 19 ++----------------- safekeeper/src/bin/safekeeper.rs | 12 +++++++++++- safekeeper/src/wal_storage.rs | 3 +-- 4 files changed, 35 insertions(+), 20 deletions(-) diff --git a/libs/utils/src/crashsafe.rs b/libs/utils/src/crashsafe.rs index 756b19138c..946fedf6e5 100644 --- a/libs/utils/src/crashsafe.rs +++ b/libs/utils/src/crashsafe.rs @@ -1,9 +1,11 @@ +use std::os::fd::AsRawFd; use std::{ borrow::Cow, fs::{self, File}, io::{self, Write}, }; +use anyhow::Context; use camino::{Utf8Path, Utf8PathBuf}; /// Similar to [`std::fs::create_dir`], except we fsync the @@ -203,6 +205,25 @@ pub fn overwrite( Ok(()) } +/// Syncs the filesystem for the given file descriptor. +pub fn syncfs(fd: impl AsRawFd) -> anyhow::Result<()> { + // Linux guarantees durability for syncfs. + // POSIX doesn't have syncfs, and further does not actually guarantee durability of sync(). + #[cfg(target_os = "linux")] + { + nix::unistd::syncfs(fd.as_raw_fd()).context("syncfs")?; + } + #[cfg(target_os = "macos")] + { + // macOS is not a production platform for Neon, don't even bother. 
+ } + #[cfg(not(any(target_os = "linux", target_os = "macos")))] + { + compile_error!("Unsupported OS"); + } + Ok(()) +} + #[cfg(test)] mod tests { diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index 59194ab4bd..d15a0e47a4 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -37,6 +37,7 @@ use pageserver::{ virtual_file, }; use postgres_backend::AuthType; +use utils::crashsafe::syncfs; use utils::failpoint_support; use utils::logging::TracingErrorLayerEnablement; use utils::{ @@ -155,23 +156,7 @@ fn main() -> anyhow::Result<()> { }; let started = Instant::now(); - // Linux guarantees durability for syncfs. - // POSIX doesn't have syncfs, and further does not actually guarantee durability of sync(). - #[cfg(target_os = "linux")] - { - use std::os::fd::AsRawFd; - nix::unistd::syncfs(dirfd.as_raw_fd()).context("syncfs")?; - } - #[cfg(target_os = "macos")] - { - // macOS is not a production platform for Neon, don't even bother. - drop(dirfd); - } - #[cfg(not(any(target_os = "linux", target_os = "macos")))] - { - compile_error!("Unsupported OS"); - } - + syncfs(dirfd)?; let elapsed = started.elapsed(); info!( elapsed_ms = elapsed.as_millis(), diff --git a/safekeeper/src/bin/safekeeper.rs b/safekeeper/src/bin/safekeeper.rs index 644d5e6eaf..5270934f5e 100644 --- a/safekeeper/src/bin/safekeeper.rs +++ b/safekeeper/src/bin/safekeeper.rs @@ -19,7 +19,7 @@ use std::fs::{self, File}; use std::io::{ErrorKind, Write}; use std::str::FromStr; use std::sync::Arc; -use std::time::Duration; +use std::time::{Duration, Instant}; use storage_broker::Uri; use tracing::*; @@ -373,6 +373,16 @@ async fn main() -> anyhow::Result<()> { type JoinTaskRes = Result, JoinError>; async fn start_safekeeper(conf: SafeKeeperConf) -> Result<()> { + // fsync the datadir to make sure we have a consistent state on disk. + let dfd = File::open(&conf.workdir).context("open datadir for syncfs")?; + let started = Instant::now(); + utils::crashsafe::syncfs(dfd)?; + let elapsed = started.elapsed(); + info!( + elapsed_ms = elapsed.as_millis(), + "syncfs data directory done" + ); + info!("starting safekeeper WAL service on {}", conf.listen_pg_addr); let pg_listener = tcp_listener::bind(conf.listen_pg_addr.clone()).map_err(|e| { error!("failed to bind to address {}: {}", conf.listen_pg_addr, e); diff --git a/safekeeper/src/wal_storage.rs b/safekeeper/src/wal_storage.rs index c477fe5c7b..46c260901d 100644 --- a/safekeeper/src/wal_storage.rs +++ b/safekeeper/src/wal_storage.rs @@ -179,8 +179,7 @@ impl PhysicalStorage { ) }; - // TODO: do we really know that write_lsn is fully flushed to disk? - // If not, maybe it's better to call fsync() here to be sure? + // note: this assumes we fsync'ed whole datadir on start. 
let flush_lsn = write_lsn; debug!( From 30583cb6264653175d659d0fcb636a42e21c5877 Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Fri, 6 Sep 2024 17:42:35 +0100 Subject: [PATCH 14/29] CI(label-for-external-users): add retry logic for unexpected errors (#8938) ## Problem One of the PRs opened by a `neondatabase` org member got labelled as `external` because the `gh api` call failed in the wrong way: ``` Get "https://api.github.com/orgs/neondatabase/members/": dial tcp 140.82.114.5:443: i/o timeout is-member=false ``` ## Summary of changes - Check that the error message is expected before labelling PRs - Retry `gh api` call for 10 times in case of unexpected error messages - Add `workflow_dispatch` trigger --- .../workflows/label-for-external-users.yml | 34 ++++++++++++++++--- 1 file changed, 29 insertions(+), 5 deletions(-) diff --git a/.github/workflows/label-for-external-users.yml b/.github/workflows/label-for-external-users.yml index 585d118dfb..b7cbc06a73 100644 --- a/.github/workflows/label-for-external-users.yml +++ b/.github/workflows/label-for-external-users.yml @@ -7,6 +7,11 @@ on: pull_request_target: types: - opened + workflow_dispatch: + inputs: + github-actor: + description: 'GitHub username. If empty, the username of the current user will be used' + required: false # No permission for GITHUB_TOKEN by default; the **minimal required** set of permissions should be granted in each job. permissions: {} @@ -26,12 +31,31 @@ jobs: id: check-user env: GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }} + ACTOR: ${{ inputs.github-actor || github.actor }} run: | - if gh api -H "Accept: application/vnd.github+json" -H "X-GitHub-Api-Version: 2022-11-28" "/orgs/${GITHUB_REPOSITORY_OWNER}/members/${GITHUB_ACTOR}"; then - is_member=true - else - is_member=false - fi + expected_error="User does not exist or is not a member of the organization" + output_file=output.txt + + for i in $(seq 1 10); do + if gh api "/orgs/${GITHUB_REPOSITORY_OWNER}/members/${ACTOR}" \ + -H "Accept: application/vnd.github+json" \ + -H "X-GitHub-Api-Version: 2022-11-28" > ${output_file}; then + + is_member=true + break + elif grep -q "${expected_error}" ${output_file}; then + is_member=false + break + elif [ $i -eq 10 ]; then + title="Failed to get memmbership status for ${ACTOR}" + message="The latest GitHub API error message: '$(cat ${output_file})'" + echo "::error file=.github/workflows/label-for-external-users.yml,title=${title}::${message}" + + exit 1 + fi + + sleep 1 + done echo "is-member=${is_member}" | tee -a ${GITHUB_OUTPUT} From ac5815b5940c412a281c6bbab34809689a738da7 Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com> Date: Sat, 7 Sep 2024 02:14:21 +0800 Subject: [PATCH 15/29] feat(storage-controller): add node shards api (#8896) For control-plane managed tenants, we have the page in the admin console that lists all tenants on a specific pageserver. But for storage-controller managed ones, we don't have that functionality for now. 
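
For illustration, once this lands a client can list the shards placed on a node via the new `GET /control/v1/node/:node_id/shards` route (admin scope). A minimal sketch using plain `requests` — this is not part of the change itself, and the base URL and token below are placeholders:

```
import requests

# Placeholders: point these at a real storage controller and an admin-scope JWT.
STORCON_API = "http://127.0.0.1:1234"
ADMIN_TOKEN = "<admin-scope jwt>"


def node_shards(node_id: int) -> dict:
    """Return intended/observed shard placements for one pageserver node."""
    res = requests.get(
        f"{STORCON_API}/control/v1/node/{node_id}/shards",
        headers={"Authorization": f"Bearer {ADMIN_TOKEN}"},
    )
    res.raise_for_status()
    # Shape follows NodeShardResponse: {"node_id": ..., "shards": [
    #   {"tenant_shard_id": ..., "is_intended_secondary": ..., "is_observed_secondary": ...}, ...]}
    return res.json()
```

The same call is what the `storcon_cli` `tenants <node_id>` subcommand and the `node_shards` test fixture added below wrap.
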
## Summary of changes Adds an API that lists all shards on a given node (intention + observed) --------- Signed-off-by: Alex Chi Z --- control_plane/storcon_cli/src/main.rs | 45 +++++++++++++++-- libs/pageserver_api/src/controller_api.rs | 15 ++++++ storage_controller/src/http.rs | 18 +++++++ storage_controller/src/service.rs | 49 +++++++++++++++++-- test_runner/fixtures/common_types.py | 8 +++ test_runner/fixtures/neon_fixtures.py | 26 +++++++++- .../regress/test_storage_controller.py | 6 +++ 7 files changed, 157 insertions(+), 10 deletions(-) diff --git a/control_plane/storcon_cli/src/main.rs b/control_plane/storcon_cli/src/main.rs index 2a81a3d825..651fcda8db 100644 --- a/control_plane/storcon_cli/src/main.rs +++ b/control_plane/storcon_cli/src/main.rs @@ -4,8 +4,8 @@ use std::{str::FromStr, time::Duration}; use clap::{Parser, Subcommand}; use pageserver_api::{ controller_api::{ - NodeAvailabilityWrapper, NodeDescribeResponse, ShardSchedulingPolicy, TenantCreateRequest, - TenantDescribeResponse, TenantPolicyRequest, + NodeAvailabilityWrapper, NodeDescribeResponse, NodeShardResponse, ShardSchedulingPolicy, + TenantCreateRequest, TenantDescribeResponse, TenantPolicyRequest, }, models::{ EvictionPolicy, EvictionPolicyLayerAccessThreshold, LocationConfigSecondary, @@ -80,7 +80,10 @@ enum Command { /// List nodes known to the storage controller Nodes {}, /// List tenants known to the storage controller - Tenants {}, + Tenants { + /// If this field is set, it will list the tenants on a specific node + node_id: Option, + }, /// Create a new tenant in the storage controller, and by extension on pageservers. TenantCreate { #[arg(long)] @@ -403,7 +406,41 @@ async fn main() -> anyhow::Result<()> { ) .await?; } - Command::Tenants {} => { + Command::Tenants { + node_id: Some(node_id), + } => { + let describe_response = storcon_client + .dispatch::<(), NodeShardResponse>( + Method::GET, + format!("control/v1/node/{node_id}/shards"), + None, + ) + .await?; + let shards = describe_response.shards; + let mut table = comfy_table::Table::new(); + table.set_header([ + "Shard", + "Intended Primary/Secondary", + "Observed Primary/Secondary", + ]); + for shard in shards { + table.add_row([ + format!("{}", shard.tenant_shard_id), + match shard.is_intended_secondary { + None => "".to_string(), + Some(true) => "Secondary".to_string(), + Some(false) => "Primary".to_string(), + }, + match shard.is_observed_secondary { + None => "".to_string(), + Some(true) => "Secondary".to_string(), + Some(false) => "Primary".to_string(), + }, + ]); + } + println!("{table}"); + } + Command::Tenants { node_id: None } => { let mut resp = storcon_client .dispatch::<(), Vec>( Method::GET, diff --git a/libs/pageserver_api/src/controller_api.rs b/libs/pageserver_api/src/controller_api.rs index 5c8dcbf571..40b7dbbbc2 100644 --- a/libs/pageserver_api/src/controller_api.rs +++ b/libs/pageserver_api/src/controller_api.rs @@ -112,6 +112,21 @@ pub struct TenantDescribeResponse { pub config: TenantConfig, } +#[derive(Serialize, Deserialize, Debug)] +pub struct NodeShardResponse { + pub node_id: NodeId, + pub shards: Vec, +} + +#[derive(Serialize, Deserialize, Debug)] +pub struct NodeShard { + pub tenant_shard_id: TenantShardId, + /// Whether the shard is observed secondary on a specific node. True = yes, False = no, None = not on this node. + pub is_observed_secondary: Option, + /// Whether the shard is intended to be a secondary on a specific node. True = yes, False = no, None = not on this node. 
+ pub is_intended_secondary: Option, +} + #[derive(Serialize, Deserialize)] pub struct NodeDescribeResponse { pub id: NodeId, diff --git a/storage_controller/src/http.rs b/storage_controller/src/http.rs index 5d4d0460be..96bdd5039d 100644 --- a/storage_controller/src/http.rs +++ b/storage_controller/src/http.rs @@ -539,6 +539,17 @@ async fn handle_node_status(req: Request) -> Result, ApiErr json_response(StatusCode::OK, node_status) } +async fn handle_node_shards(req: Request) -> Result, ApiError> { + check_permissions(&req, Scope::Admin)?; + + let state = get_state(&req); + let node_id: NodeId = parse_request_param(&req, "node_id")?; + + let node_status = state.service.get_node_shards(node_id).await?; + + json_response(StatusCode::OK, node_status) +} + async fn handle_get_leader(req: Request) -> Result, ApiError> { check_permissions(&req, Scope::Admin)?; @@ -1109,6 +1120,13 @@ pub fn make_router( .get("/control/v1/node/:node_id", |r| { named_request_span(r, handle_node_status, RequestName("control_v1_node_status")) }) + .get("/control/v1/node/:node_id/shards", |r| { + named_request_span( + r, + handle_node_shards, + RequestName("control_v1_node_describe"), + ) + }) .get("/control/v1/leader", |r| { named_request_span(r, handle_get_leader, RequestName("control_v1_get_leader")) }) diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index e7eae647df..44fdb474b4 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -41,11 +41,11 @@ use itertools::Itertools; use pageserver_api::{ controller_api::{ MetadataHealthRecord, MetadataHealthUpdateRequest, NodeAvailability, NodeRegisterRequest, - NodeSchedulingPolicy, PlacementPolicy, ShardSchedulingPolicy, ShardsPreferredAzsRequest, - ShardsPreferredAzsResponse, TenantCreateRequest, TenantCreateResponse, - TenantCreateResponseShard, TenantDescribeResponse, TenantDescribeResponseShard, - TenantLocateResponse, TenantPolicyRequest, TenantShardMigrateRequest, - TenantShardMigrateResponse, + NodeSchedulingPolicy, NodeShard, NodeShardResponse, PlacementPolicy, ShardSchedulingPolicy, + ShardsPreferredAzsRequest, ShardsPreferredAzsResponse, TenantCreateRequest, + TenantCreateResponse, TenantCreateResponseShard, TenantDescribeResponse, + TenantDescribeResponseShard, TenantLocateResponse, TenantPolicyRequest, + TenantShardMigrateRequest, TenantShardMigrateResponse, }, models::{ SecondaryProgress, TenantConfigRequest, TimelineArchivalConfigRequest, @@ -4924,6 +4924,45 @@ impl Service { )) } + pub(crate) async fn get_node_shards( + &self, + node_id: NodeId, + ) -> Result { + let locked = self.inner.read().unwrap(); + let mut shards = Vec::new(); + for (tid, tenant) in locked.tenants.iter() { + let is_intended_secondary = match ( + tenant.intent.get_attached() == &Some(node_id), + tenant.intent.get_secondary().contains(&node_id), + ) { + (true, true) => { + return Err(ApiError::InternalServerError(anyhow::anyhow!( + "{} attached as primary+secondary on the same node", + tid + ))) + } + (true, false) => Some(false), + (false, true) => Some(true), + (false, false) => None, + }; + let is_observed_secondary = if let Some(ObservedStateLocation { conf: Some(conf) }) = + tenant.observed.locations.get(&node_id) + { + Some(conf.secondary_conf.is_some()) + } else { + None + }; + if is_intended_secondary.is_some() || is_observed_secondary.is_some() { + shards.push(NodeShard { + tenant_shard_id: *tid, + is_intended_secondary, + is_observed_secondary, + }); + } + } + Ok(NodeShardResponse { node_id, shards }) + } 
+ pub(crate) async fn get_leader(&self) -> DatabaseResult> { self.persistence.get_leader().await } diff --git a/test_runner/fixtures/common_types.py b/test_runner/fixtures/common_types.py index 8eda19d1e2..064a678c96 100644 --- a/test_runner/fixtures/common_types.py +++ b/test_runner/fixtures/common_types.py @@ -140,6 +140,14 @@ class TenantId(Id): return self.id.hex() +class NodeId(Id): + def __repr__(self) -> str: + return f'`NodeId("{self.id.hex()}")' + + def __str__(self) -> str: + return self.id.hex() + + class TimelineId(Id): def __repr__(self) -> str: return f'TimelineId("{self.id.hex()}")' diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 18fbbde637..5a600dd0a1 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -62,7 +62,7 @@ from urllib3.util.retry import Retry from fixtures import overlayfs from fixtures.broker import NeonBroker -from fixtures.common_types import Lsn, TenantId, TenantShardId, TimelineId +from fixtures.common_types import Lsn, NodeId, TenantId, TenantShardId, TimelineId from fixtures.endpoint.http import EndpointHttpClient from fixtures.log_helper import log from fixtures.metrics import Metrics, MetricsGetter, parse_metrics @@ -2570,6 +2570,30 @@ class NeonStorageController(MetricsGetter, LogUtils): response.raise_for_status() return response.json() + def nodes(self): + """ + :return: list of {"id": ""} + """ + response = self.request( + "GET", + f"{self.api}/control/v1/node", + headers=self.headers(TokenScope.ADMIN), + ) + response.raise_for_status() + return response.json() + + def node_shards(self, node_id: NodeId): + """ + :return: list of {"shard_id": "", "is_secondary": bool} + """ + response = self.request( + "GET", + f"{self.api}/control/v1/node/{node_id}/shards", + headers=self.headers(TokenScope.ADMIN), + ) + response.raise_for_status() + return response.json() + def tenant_shard_split( self, tenant_id: TenantId, shard_count: int, shard_stripe_size: Optional[int] = None ) -> list[TenantShardId]: diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py index 92cd74eba5..eea05d7548 100644 --- a/test_runner/regress/test_storage_controller.py +++ b/test_runner/regress/test_storage_controller.py @@ -1552,6 +1552,12 @@ def test_tenant_import(neon_env_builder: NeonEnvBuilder, shard_count, remote_sto literal_shard_count = 1 if shard_count is None else shard_count assert len(describe["shards"]) == literal_shard_count + nodes = env.storage_controller.nodes() + assert len(nodes) == 2 + describe1 = env.storage_controller.node_shards(nodes[0]["id"]) + describe2 = env.storage_controller.node_shards(nodes[1]["id"]) + assert len(describe1["shards"]) + len(describe2["shards"]) == literal_shard_count + # Check the data is still there: this implicitly proves that we recovered generation numbers # properly, for the timeline which was written to after a generation bump. for timeline, branch, expect_rows in [ From fa3fc73c1b3366a3316456bcb8fdce1bed159200 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Fri, 6 Sep 2024 21:05:18 +0200 Subject: [PATCH 16/29] Address 1.82 clippy lints (#8944) Addresses the clippy lints of the beta 1.82 toolchain. 
The `too_long_first_doc_paragraph` lint complained a lot and was addressed separately: #8941 --- Dockerfile.build-tools | 2 +- libs/utils/src/logging.rs | 2 +- rust-toolchain.toml | 4 ++-- scripts/coverage | 4 ++-- storage_controller/src/service.rs | 10 +++++----- workspace_hack/Cargo.toml | 2 ++ 6 files changed, 13 insertions(+), 11 deletions(-) diff --git a/Dockerfile.build-tools b/Dockerfile.build-tools index a9cbed85fb..c4209c7a12 100644 --- a/Dockerfile.build-tools +++ b/Dockerfile.build-tools @@ -207,7 +207,7 @@ RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux export PATH="$HOME/.cargo/bin:$PATH" && \ . "$HOME/.cargo/env" && \ cargo --version && rustup --version && \ - rustup component add llvm-tools-preview rustfmt clippy && \ + rustup component add llvm-tools rustfmt clippy && \ cargo install rustfilt --version ${RUSTFILT_VERSION} && \ cargo install cargo-hakari --version ${CARGO_HAKARI_VERSION} && \ cargo install cargo-deny --locked --version ${CARGO_DENY_VERSION} && \ diff --git a/libs/utils/src/logging.rs b/libs/utils/src/logging.rs index 71af43a4da..2ea0781667 100644 --- a/libs/utils/src/logging.rs +++ b/libs/utils/src/logging.rs @@ -190,7 +190,7 @@ impl Drop for TracingPanicHookGuard { } /// Named symbol for our panic hook, which logs the panic. -fn tracing_panic_hook(info: &std::panic::PanicInfo) { +fn tracing_panic_hook(info: &std::panic::PanicHookInfo) { // following rust 1.66.1 std implementation: // https://github.com/rust-lang/rust/blob/90743e7298aca107ddaa0c202a4d3604e29bfeb6/library/std/src/panicking.rs#L235-L288 let location = info.location(); diff --git a/rust-toolchain.toml b/rust-toolchain.toml index e78c4d6790..3c5d0b12a6 100644 --- a/rust-toolchain.toml +++ b/rust-toolchain.toml @@ -3,5 +3,5 @@ channel = "1.81.0" profile = "default" # The default profile includes rustc, rust-std, cargo, rust-docs, rustfmt and clippy. # https://rust-lang.github.io/rustup/concepts/profiles.html -# but we also need `llvm-tools-preview` for coverage data merges on CI -components = ["llvm-tools-preview", "rustfmt", "clippy"] +# but we also need `llvm-tools` for coverage data merges on CI +components = ["llvm-tools", "rustfmt", "clippy"] diff --git a/scripts/coverage b/scripts/coverage index 52a69c93b9..482dc58ff6 100755 --- a/scripts/coverage +++ b/scripts/coverage @@ -134,7 +134,7 @@ class LLVM: # Show a user-friendly warning raise Exception(' '.join([ f"It appears that you don't have `{name}` installed.", - "Please execute `rustup component add llvm-tools-preview`,", + "Please execute `rustup component add llvm-tools`,", "or install it via your package manager of choice.", "LLVM tools should be the same version as LLVM in `rustc --version --verbose`.", ])) @@ -518,7 +518,7 @@ def main() -> None: example = f""" prerequisites: # alternatively, install a system package for `llvm-tools` - rustup component add llvm-tools-preview + rustup component add llvm-tools self-contained example: {app} run make diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index 44fdb474b4..6365423e10 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -451,7 +451,7 @@ struct ShardSplitParams { // When preparing for a shard split, we may either choose to proceed with the split, // or find that the work is already done and return NoOp. 
enum ShardSplitAction { - Split(ShardSplitParams), + Split(Box), NoOp(TenantShardSplitResponse), } @@ -4186,7 +4186,7 @@ impl Service { let policy = policy.unwrap(); let config = config.unwrap(); - Ok(ShardSplitAction::Split(ShardSplitParams { + Ok(ShardSplitAction::Split(Box::new(ShardSplitParams { old_shard_count, new_shard_count: ShardCount::new(split_req.new_shard_count), new_stripe_size: split_req.new_stripe_size, @@ -4194,13 +4194,13 @@ impl Service { policy, config, shard_ident, - })) + }))) } async fn do_tenant_shard_split( &self, tenant_id: TenantId, - params: ShardSplitParams, + params: Box, ) -> Result<(TenantShardSplitResponse, Vec), ApiError> { // FIXME: we have dropped self.inner lock, and not yet written anything to the database: another // request could occur here, deleting or mutating the tenant. begin_shard_split checks that the @@ -4216,7 +4216,7 @@ impl Service { policy, config, shard_ident, - } = params; + } = *params; // Drop any secondary locations: pageservers do not support splitting these, and in any case the // end-state for a split tenant will usually be to have secondary locations on different nodes. diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml index 3d2fa8c214..94f4c0f22f 100644 --- a/workspace_hack/Cargo.toml +++ b/workspace_hack/Cargo.toml @@ -8,6 +8,8 @@ version = "0.1.0" description = "workspace-hack package, managed by hakari" # You can choose to publish this crate: see https://docs.rs/cargo-hakari/latest/cargo_hakari/publishing. publish = false +edition.workspace = true +license.workspace = true # The parts of the file between the BEGIN HAKARI SECTION and END HAKARI SECTION comments # are managed by hakari. From 3dbd34aa78258928344d4de80ddcdcf46b35dfbc Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Sat, 7 Sep 2024 00:42:55 +0300 Subject: [PATCH 17/29] feat(storcon): forward gc blocking and unblocking (#8956) Currently using gc blocking and unblocking with storage controller managed pageservers is painful. Implement the API on storage controller. Fixes: #8893 --- pageserver/client/src/lib.rs | 18 ++++ pageserver/client/src/mgmt_api.rs | 16 ++++ storage_controller/src/http.rs | 39 +++++++- storage_controller/src/pageserver_client.rs | 23 ++++- storage_controller/src/service.rs | 54 +++++++++++- .../regress/test_timeline_gc_blocking.py | 88 +++++++++++++++---- 6 files changed, 220 insertions(+), 18 deletions(-) diff --git a/pageserver/client/src/lib.rs b/pageserver/client/src/lib.rs index 4a3f4dea47..cc8db37173 100644 --- a/pageserver/client/src/lib.rs +++ b/pageserver/client/src/lib.rs @@ -1,2 +1,20 @@ pub mod mgmt_api; pub mod page_service; + +/// For timeline_block_unblock_gc, distinguish the two different operations. This could be a bool. +// If file structure is per-kind not per-feature then where to put this? 
+#[derive(Clone, Copy)] +pub enum BlockUnblock { + Block, + Unblock, +} + +impl std::fmt::Display for BlockUnblock { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + let s = match self { + BlockUnblock::Block => "block", + BlockUnblock::Unblock => "unblock", + }; + f.write_str(s) + } +} diff --git a/pageserver/client/src/mgmt_api.rs b/pageserver/client/src/mgmt_api.rs index 737cb00835..a68f45a6d9 100644 --- a/pageserver/client/src/mgmt_api.rs +++ b/pageserver/client/src/mgmt_api.rs @@ -12,6 +12,8 @@ use utils::{ pub use reqwest::Body as ReqwestBody; +use crate::BlockUnblock; + pub mod util; #[derive(Debug, Clone)] @@ -454,6 +456,20 @@ impl Client { .map_err(Error::ReceiveBody) } + pub async fn timeline_block_unblock_gc( + &self, + tenant_shard_id: TenantShardId, + timeline_id: TimelineId, + dir: BlockUnblock, + ) -> Result<()> { + let uri = format!( + "{}/v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/{dir}_gc", + self.mgmt_api_endpoint, + ); + + self.request(Method::POST, &uri, ()).await.map(|_| ()) + } + pub async fn tenant_reset(&self, tenant_shard_id: TenantShardId) -> Result<()> { let uri = format!( "{}/v1/tenant/{}/reset", diff --git a/storage_controller/src/http.rs b/storage_controller/src/http.rs index 96bdd5039d..a6638f5191 100644 --- a/storage_controller/src/http.rs +++ b/storage_controller/src/http.rs @@ -21,7 +21,7 @@ use pageserver_api::models::{ TenantTimeTravelRequest, TimelineArchivalConfigRequest, TimelineCreateRequest, }; use pageserver_api::shard::TenantShardId; -use pageserver_client::mgmt_api; +use pageserver_client::{mgmt_api, BlockUnblock}; use std::sync::Arc; use std::time::{Duration, Instant}; use tokio_util::sync::CancellationToken; @@ -369,6 +369,23 @@ async fn handle_tenant_timeline_detach_ancestor( json_response(StatusCode::OK, res) } +async fn handle_tenant_timeline_block_unblock_gc( + service: Arc, + req: Request, + dir: BlockUnblock, +) -> Result, ApiError> { + let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?; + check_permissions(&req, Scope::PageServerApi)?; + + let timeline_id: TimelineId = parse_request_param(&req, "timeline_id")?; + + service + .tenant_timeline_block_unblock_gc(tenant_id, timeline_id, dir) + .await?; + + json_response(StatusCode::OK, ()) +} + async fn handle_tenant_timeline_passthrough( service: Arc, req: Request, @@ -1292,6 +1309,26 @@ pub fn make_router( ) }, ) + .post( + "/v1/tenant/:tenant_id/timeline/:timeline_id/block_gc", + |r| { + tenant_service_handler( + r, + |s, r| handle_tenant_timeline_block_unblock_gc(s, r, BlockUnblock::Block), + RequestName("v1_tenant_timeline_block_unblock_gc"), + ) + }, + ) + .post( + "/v1/tenant/:tenant_id/timeline/:timeline_id/unblock_gc", + |r| { + tenant_service_handler( + r, + |s, r| handle_tenant_timeline_block_unblock_gc(s, r, BlockUnblock::Unblock), + RequestName("v1_tenant_timeline_block_unblock_gc"), + ) + }, + ) // Tenant detail GET passthrough to shard zero: .get("/v1/tenant/:tenant_id", |r| { tenant_service_handler( diff --git a/storage_controller/src/pageserver_client.rs b/storage_controller/src/pageserver_client.rs index 20770ed703..961a1f78dd 100644 --- a/storage_controller/src/pageserver_client.rs +++ b/storage_controller/src/pageserver_client.rs @@ -7,7 +7,10 @@ use pageserver_api::{ }, shard::TenantShardId, }; -use pageserver_client::mgmt_api::{Client, Result}; +use pageserver_client::{ + mgmt_api::{Client, Result}, + BlockUnblock, +}; use reqwest::StatusCode; use utils::id::{NodeId, TenantId, TimelineId}; @@ -258,6 +261,24 @@ impl 
PageserverClient { ) } + pub(crate) async fn timeline_block_unblock_gc( + &self, + tenant_shard_id: TenantShardId, + timeline_id: TimelineId, + dir: BlockUnblock, + ) -> Result<()> { + // measuring these makes no sense because we synchronize with the gc loop and remote + // storage on block_gc so there should be huge outliers + measured_request!( + "timeline_block_unblock_gc", + crate::metrics::Method::Post, + &self.node_id_label, + self.inner + .timeline_block_unblock_gc(tenant_shard_id, timeline_id, dir) + .await + ) + } + pub(crate) async fn get_utilization(&self) -> Result { measured_request!( "utilization", diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index 6365423e10..be3efaf688 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -69,7 +69,7 @@ use pageserver_api::{ ValidateResponse, ValidateResponseTenant, }, }; -use pageserver_client::mgmt_api; +use pageserver_client::{mgmt_api, BlockUnblock}; use tokio::sync::mpsc::error::TrySendError; use tokio_util::sync::CancellationToken; use utils::{ @@ -142,6 +142,7 @@ enum TenantOperations { AttachHook, TimelineArchivalConfig, TimelineDetachAncestor, + TimelineGcBlockUnblock, } #[derive(Clone, strum_macros::Display)] @@ -3197,6 +3198,57 @@ impl Service { }).await? } + pub(crate) async fn tenant_timeline_block_unblock_gc( + &self, + tenant_id: TenantId, + timeline_id: TimelineId, + dir: BlockUnblock, + ) -> Result<(), ApiError> { + let _tenant_lock = trace_shared_lock( + &self.tenant_op_locks, + tenant_id, + TenantOperations::TimelineGcBlockUnblock, + ) + .await; + + self.tenant_remote_mutation(tenant_id, move |targets| async move { + if targets.is_empty() { + return Err(ApiError::NotFound( + anyhow::anyhow!("Tenant not found").into(), + )); + } + + async fn do_one( + tenant_shard_id: TenantShardId, + timeline_id: TimelineId, + node: Node, + jwt: Option, + dir: BlockUnblock, + ) -> Result<(), ApiError> { + let client = PageserverClient::new(node.get_id(), node.base_url(), jwt.as_deref()); + + client + .timeline_block_unblock_gc(tenant_shard_id, timeline_id, dir) + .await + .map_err(|e| passthrough_api_error(&node, e)) + } + + // no shard needs to go first/last; the operation should be idempotent + self.tenant_for_shards(targets, |tenant_shard_id, node| { + futures::FutureExt::boxed(do_one( + tenant_shard_id, + timeline_id, + node, + self.config.jwt_token.clone(), + dir, + )) + }) + .await + }) + .await??; + Ok(()) + } + /// Helper for concurrently calling a pageserver API on a number of shards, such as timeline creation. /// /// On success, the returned vector contains exactly the same number of elements as the input `locations`. 
diff --git a/test_runner/regress/test_timeline_gc_blocking.py b/test_runner/regress/test_timeline_gc_blocking.py index 24de894687..ddfe9b911f 100644 --- a/test_runner/regress/test_timeline_gc_blocking.py +++ b/test_runner/regress/test_timeline_gc_blocking.py @@ -1,17 +1,32 @@ import time +from concurrent.futures import ThreadPoolExecutor +from dataclasses import dataclass +from typing import List, Optional +import pytest +from fixtures.log_helper import log from fixtures.neon_fixtures import ( + LogCursor, NeonEnvBuilder, + NeonPageserver, ) from fixtures.pageserver.utils import wait_timeline_detail_404 -def test_gc_blocking_by_timeline(neon_env_builder: NeonEnvBuilder): +@pytest.mark.parametrize("sharded", [True, False]) +def test_gc_blocking_by_timeline(neon_env_builder: NeonEnvBuilder, sharded: bool): + neon_env_builder.num_pageservers = 2 if sharded else 1 env = neon_env_builder.init_start( - initial_tenant_conf={"gc_period": "1s", "lsn_lease_length": "0s"} + initial_tenant_conf={"gc_period": "1s", "lsn_lease_length": "0s"}, + initial_tenant_shard_count=2 if sharded else None, ) - ps = env.pageserver - http = ps.http_client() + + if sharded: + http = env.storage_controller.pageserver_api() + else: + http = env.pageserver.http_client() + + pss = ManyPageservers(list(map(lambda ps: ScrollableLog(ps, None), env.pageservers))) foo_branch = env.neon_cli.create_branch("foo", "main", env.initial_tenant) @@ -22,9 +37,8 @@ def test_gc_blocking_by_timeline(neon_env_builder: NeonEnvBuilder): tenant_before = http.tenant_status(env.initial_tenant) wait_for_another_gc_round() - _, offset = ps.assert_log_contains(gc_active_line) - - assert ps.log_contains(gc_skipped_line, offset) is None + pss.assert_log_contains(gc_active_line) + pss.assert_log_does_not_contain(gc_skipped_line) http.timeline_block_gc(env.initial_tenant, foo_branch) @@ -34,34 +48,78 @@ def test_gc_blocking_by_timeline(neon_env_builder: NeonEnvBuilder): assert gc_blocking == "BlockingReasons { timelines: 1, reasons: EnumSet(Manual) }" wait_for_another_gc_round() - _, offset = ps.assert_log_contains(gc_skipped_line, offset) + pss.assert_log_contains(gc_skipped_line) - ps.restart() - ps.quiesce_tenants() + pss.restart() + pss.quiesce_tenants() - _, offset = env.pageserver.assert_log_contains(init_gc_skipped, offset) + pss.assert_log_contains(init_gc_skipped) wait_for_another_gc_round() - _, offset = ps.assert_log_contains(gc_skipped_line, offset) + pss.assert_log_contains(gc_skipped_line) # deletion unblocks gc http.timeline_delete(env.initial_tenant, foo_branch) wait_timeline_detail_404(http, env.initial_tenant, foo_branch, 10, 1.0) wait_for_another_gc_round() - _, offset = ps.assert_log_contains(gc_active_line, offset) + pss.assert_log_contains(gc_active_line) http.timeline_block_gc(env.initial_tenant, env.initial_timeline) wait_for_another_gc_round() - _, offset = ps.assert_log_contains(gc_skipped_line, offset) + pss.assert_log_contains(gc_skipped_line) # removing the manual block also unblocks gc http.timeline_unblock_gc(env.initial_tenant, env.initial_timeline) wait_for_another_gc_round() - _, offset = ps.assert_log_contains(gc_active_line, offset) + pss.assert_log_contains(gc_active_line) def wait_for_another_gc_round(): time.sleep(2) + + +@dataclass +class ScrollableLog: + pageserver: NeonPageserver + offset: Optional[LogCursor] + + def assert_log_contains(self, what: str): + msg, offset = self.pageserver.assert_log_contains(what, offset=self.offset) + old = self.offset + self.offset = offset + log.info(f"{old} -> {offset}: 
{msg}") + + def assert_log_does_not_contain(self, what: str): + assert self.pageserver.log_contains(what) is None + + +@dataclass(frozen=True) +class ManyPageservers: + many: List[ScrollableLog] + + def assert_log_contains(self, what: str): + for one in self.many: + one.assert_log_contains(what) + + def assert_log_does_not_contain(self, what: str): + for one in self.many: + one.assert_log_does_not_contain(what) + + def restart(self): + def do_restart(x: ScrollableLog): + x.pageserver.restart() + + with ThreadPoolExecutor(max_workers=len(self.many)) as rt: + rt.map(do_restart, self.many) + rt.shutdown(wait=True) + + def quiesce_tenants(self): + def do_quiesce(x: ScrollableLog): + x.pageserver.quiesce_tenants() + + with ThreadPoolExecutor(max_workers=len(self.many)) as rt: + rt.map(do_quiesce, self.many) + rt.shutdown(wait=True) From 16c200d6d9f0eaade2efe4ad0f649c3bbf23bf08 Mon Sep 17 00:00:00 2001 From: Cihan Demirci <128653800+fcdm@users.noreply.github.com> Date: Sat, 7 Sep 2024 00:20:36 +0100 Subject: [PATCH 18/29] push images to prod ACR (#8940) Used `vars` for new storing non-sensitive information, changed dev secrets to vars as well but didn't cleanup any secrets. https://github.com/neondatabase/cloud/issues/16925 --------- Co-authored-by: Alexander Bayandin --- .github/actionlint.yml | 7 ++++ .github/workflows/_push-to-acr.yml | 56 ++++++++++++++++++++++++++++ .github/workflows/build_and_test.yml | 53 +++++++++++++------------- 3 files changed, 89 insertions(+), 27 deletions(-) create mode 100644 .github/workflows/_push-to-acr.yml diff --git a/.github/actionlint.yml b/.github/actionlint.yml index 4ad8a7b460..1b602883c5 100644 --- a/.github/actionlint.yml +++ b/.github/actionlint.yml @@ -7,6 +7,13 @@ self-hosted-runner: - small-arm64 - us-east-2 config-variables: + - AZURE_DEV_CLIENT_ID + - AZURE_DEV_REGISTRY_NAME + - AZURE_DEV_SUBSCRIPTION_ID + - AZURE_PROD_CLIENT_ID + - AZURE_PROD_REGISTRY_NAME + - AZURE_PROD_SUBSCRIPTION_ID + - AZURE_TENANT_ID - BENCHMARK_PROJECT_ID_PUB - BENCHMARK_PROJECT_ID_SUB - REMOTE_STORAGE_AZURE_CONTAINER diff --git a/.github/workflows/_push-to-acr.yml b/.github/workflows/_push-to-acr.yml new file mode 100644 index 0000000000..415b3d9cc6 --- /dev/null +++ b/.github/workflows/_push-to-acr.yml @@ -0,0 +1,56 @@ +name: Push images to ACR +on: + workflow_call: + inputs: + client_id: + description: Client ID of Azure managed identity or Entra app + required: true + type: string + image_tag: + description: Tag for the container image + required: true + type: string + images: + description: Images to push + required: true + type: string + registry_name: + description: Name of the container registry + required: true + type: string + subscription_id: + description: Azure subscription ID + required: true + type: string + tenant_id: + description: Azure tenant ID + required: true + type: string + +jobs: + push-to-acr: + runs-on: ubuntu-22.04 + permissions: + contents: read # This is required for actions/checkout + id-token: write # This is required for Azure Login to work. 
+ + steps: + - name: Azure login + uses: azure/login@6c251865b4e6290e7b78be643ea2d005bc51f69a # @v2.1.1 + with: + client-id: ${{ inputs.client_id }} + subscription-id: ${{ inputs.subscription_id }} + tenant-id: ${{ inputs.tenant_id }} + + - name: Login to ACR + run: | + az acr login --name=${{ inputs.registry_name }} + + - name: Copy docker images to ACR ${{ inputs.registry_name }} + run: | + images='${{ inputs.images }}' + for image in ${images}; do + docker buildx imagetools create \ + -t ${{ inputs.registry_name }}.azurecr.io/neondatabase/${image}:${{ inputs.image_tag }} \ + neondatabase/${image}:${{ inputs.image_tag }} + done diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index ee5fd1b0c6..4bb9e5cb66 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -794,9 +794,6 @@ jobs: docker compose -f ./docker-compose/docker-compose.yml down promote-images: - permissions: - contents: read # This is required for actions/checkout - id-token: write # This is required for Azure Login to work. needs: [ check-permissions, tag, test-images, vm-compute-node-image ] runs-on: ubuntu-22.04 @@ -823,28 +820,6 @@ jobs: neondatabase/vm-compute-node-${version}:${{ needs.tag.outputs.build-tag }} done - - name: Azure login - if: github.ref_name == 'main' - uses: azure/login@6c251865b4e6290e7b78be643ea2d005bc51f69a # @v2.1.1 - with: - client-id: ${{ secrets.AZURE_DEV_CLIENT_ID }} - tenant-id: ${{ secrets.AZURE_TENANT_ID }} - subscription-id: ${{ secrets.AZURE_DEV_SUBSCRIPTION_ID }} - - - name: Login to ACR - if: github.ref_name == 'main' - run: | - az acr login --name=neoneastus2 - - - name: Copy docker images to ACR-dev - if: github.ref_name == 'main' - run: | - for image in neon compute-tools {vm-,}compute-node-{v14,v15,v16}; do - docker buildx imagetools create \ - -t neoneastus2.azurecr.io/neondatabase/${image}:${{ needs.tag.outputs.build-tag }} \ - neondatabase/${image}:${{ needs.tag.outputs.build-tag }} - done - - name: Add latest tag to images if: github.ref_name == 'main' run: | @@ -882,6 +857,30 @@ jobs: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${image}:${{ needs.tag.outputs.build-tag }} done + push-to-acr-dev: + if: github.ref_name == 'main' + needs: [ tag, promote-images ] + uses: ./.github/workflows/_push-to-acr.yml + with: + client_id: ${{ vars.AZURE_DEV_CLIENT_ID }} + image_tag: ${{ needs.tag.outputs.build-tag }} + images: neon compute-tools vm-compute-node-v14 vm-compute-node-v15 vm-compute-node-v16 compute-node-v14 compute-node-v15 compute-node-v16 + registry_name: ${{ vars.AZURE_DEV_REGISTRY_NAME }} + subscription_id: ${{ vars.AZURE_DEV_SUBSCRIPTION_ID }} + tenant_id: ${{ vars.AZURE_TENANT_ID }} + + push-to-acr-prod: + if: github.ref_name == 'release'|| github.ref_name == 'release-proxy' + needs: [ tag, promote-images ] + uses: ./.github/workflows/_push-to-acr.yml + with: + client_id: ${{ vars.AZURE_PROD_CLIENT_ID }} + image_tag: ${{ needs.tag.outputs.build-tag }} + images: neon compute-tools vm-compute-node-v14 vm-compute-node-v15 vm-compute-node-v16 compute-node-v14 compute-node-v15 compute-node-v16 + registry_name: ${{ vars.AZURE_PROD_REGISTRY_NAME }} + subscription_id: ${{ vars.AZURE_PROD_SUBSCRIPTION_ID }} + tenant_id: ${{ vars.AZURE_TENANT_ID }} + trigger-custom-extensions-build-and-wait: needs: [ check-permissions, tag ] runs-on: ubuntu-22.04 @@ -957,8 +956,8 @@ jobs: exit 1 deploy: - needs: [ check-permissions, promote-images, tag, build-and-test-locally, trigger-custom-extensions-build-and-wait ] 
- if: github.ref_name == 'main' || github.ref_name == 'release'|| github.ref_name == 'release-proxy' + needs: [ check-permissions, promote-images, tag, build-and-test-locally, trigger-custom-extensions-build-and-wait, push-to-acr-dev, push-to-acr-prod ] + if: (github.ref_name == 'main' || github.ref_name == 'release' || github.ref_name == 'release-proxy') && !failure() && !cancelled() runs-on: [ self-hosted, small ] container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:latest From 7d7d1f354b127ae27ae2e76cb0b3e9a3c8f69d90 Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Sat, 7 Sep 2024 08:17:25 +0100 Subject: [PATCH 19/29] Fix rust warnings on macOS (#8955) ## Problem ``` error: unused import: `anyhow::Context` --> libs/utils/src/crashsafe.rs:8:5 | 8 | use anyhow::Context; | ^^^^^^^^^^^^^^^ | = note: `-D unused-imports` implied by `-D warnings` = help: to override `-D warnings` add `#[allow(unused_imports)]` error: unused variable: `fd` --> libs/utils/src/crashsafe.rs:209:15 | 209 | pub fn syncfs(fd: impl AsRawFd) -> anyhow::Result<()> { | ^^ help: if this is intentional, prefix it with an underscore: `_fd` | = note: `-D unused-variables` implied by `-D warnings` = help: to override `-D warnings` add `#[allow(unused_variables)]` ``` ## Summary of changes - Fix rust warnings on macOS --- libs/utils/src/crashsafe.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/libs/utils/src/crashsafe.rs b/libs/utils/src/crashsafe.rs index 946fedf6e5..b97c6c7a45 100644 --- a/libs/utils/src/crashsafe.rs +++ b/libs/utils/src/crashsafe.rs @@ -5,7 +5,6 @@ use std::{ io::{self, Write}, }; -use anyhow::Context; use camino::{Utf8Path, Utf8PathBuf}; /// Similar to [`std::fs::create_dir`], except we fsync the @@ -206,11 +205,13 @@ pub fn overwrite( } /// Syncs the filesystem for the given file descriptor. +#[cfg_attr(target_os = "macos", allow(unused_variables))] pub fn syncfs(fd: impl AsRawFd) -> anyhow::Result<()> { // Linux guarantees durability for syncfs. // POSIX doesn't have syncfs, and further does not actually guarantee durability of sync(). #[cfg(target_os = "linux")] { + use anyhow::Context; nix::unistd::syncfs(fd.as_raw_fd()).context("syncfs")?; } #[cfg(target_os = "macos")] From 93ec7503e08f126e180579f02dcdb6e7a95724ba Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Sat, 7 Sep 2024 16:11:36 +0300 Subject: [PATCH 20/29] Lock the correct revision of rust-postgres crates (#8960) We modified the crate in an incompatible way and upgraded to the new version in PR #8076. However, it was reverted in #8654. The revert reverted the Cargo.lock reference to it, but since Cargo.toml still points to the (tip of the) 'neon' branch, every time you make any other unrelated changes to Cargo.toml, it also tries to update the rust-postgres crates to the tip of the 'neon' branch again, which doesn't work. To fix, lock the crates to the exact commit SHA that works. 
--- Cargo.lock | 8 ++++---- Cargo.toml | 21 ++++++++++++++++----- 2 files changed, 20 insertions(+), 9 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 634af67198..cf3031c6d0 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4121,7 +4121,7 @@ dependencies = [ [[package]] name = "postgres" version = "0.19.4" -source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#20031d7a9ee1addeae6e0968e3899ae6bf01cee2" +source = "git+https://github.com/neondatabase/rust-postgres.git?rev=20031d7a9ee1addeae6e0968e3899ae6bf01cee2#20031d7a9ee1addeae6e0968e3899ae6bf01cee2" dependencies = [ "bytes", "fallible-iterator", @@ -4134,7 +4134,7 @@ dependencies = [ [[package]] name = "postgres-protocol" version = "0.6.4" -source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#20031d7a9ee1addeae6e0968e3899ae6bf01cee2" +source = "git+https://github.com/neondatabase/rust-postgres.git?rev=20031d7a9ee1addeae6e0968e3899ae6bf01cee2#20031d7a9ee1addeae6e0968e3899ae6bf01cee2" dependencies = [ "base64 0.20.0", "byteorder", @@ -4153,7 +4153,7 @@ dependencies = [ [[package]] name = "postgres-types" version = "0.2.4" -source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#20031d7a9ee1addeae6e0968e3899ae6bf01cee2" +source = "git+https://github.com/neondatabase/rust-postgres.git?rev=20031d7a9ee1addeae6e0968e3899ae6bf01cee2#20031d7a9ee1addeae6e0968e3899ae6bf01cee2" dependencies = [ "bytes", "fallible-iterator", @@ -6409,7 +6409,7 @@ dependencies = [ [[package]] name = "tokio-postgres" version = "0.7.7" -source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#20031d7a9ee1addeae6e0968e3899ae6bf01cee2" +source = "git+https://github.com/neondatabase/rust-postgres.git?rev=20031d7a9ee1addeae6e0968e3899ae6bf01cee2#20031d7a9ee1addeae6e0968e3899ae6bf01cee2" dependencies = [ "async-trait", "byteorder", diff --git a/Cargo.toml b/Cargo.toml index 5045ee0d4d..9203920971 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -201,10 +201,21 @@ env_logger = "0.10" log = "0.4" ## Libraries from neondatabase/ git forks, ideally with changes to be upstreamed -postgres = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" } -postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" } -postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" } -tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" } + +# We want to use the 'neon' branch for these, but there's currently one +# incompatible change on the branch. See: +# +# - PR #8076 which contained changes that depended on the new changes in +# the rust-postgres crate, and +# - PR #8654 which reverted those changes and made the code in proxy incompatible +# with the tip of the 'neon' branch again. +# +# When those proxy changes are re-applied (see PR #8747), we can switch using +# the tip of the 'neon' branch again. 
+postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev = "20031d7a9ee1addeae6e0968e3899ae6bf01cee2" } +postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev = "20031d7a9ee1addeae6e0968e3899ae6bf01cee2" } +postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", rev = "20031d7a9ee1addeae6e0968e3899ae6bf01cee2" } +tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev = "20031d7a9ee1addeae6e0968e3899ae6bf01cee2" } ## Local libraries compute_api = { version = "0.1", path = "./libs/compute_api/" } @@ -241,7 +252,7 @@ tonic-build = "0.9" [patch.crates-io] # Needed to get `tokio-postgres-rustls` to depend on our fork. -tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" } +tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev = "20031d7a9ee1addeae6e0968e3899ae6bf01cee2" } # bug fixes for UUID parquet = { git = "https://github.com/apache/arrow-rs", branch = "master" } From 89c5e80b3ff55f0f316aebca0bba497eba7fbec8 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Sun, 8 Sep 2024 21:47:23 +0300 Subject: [PATCH 21/29] Update toml and toml_edit crates (#8963) Eliminates a few duplicate versions from the dependency tree. --- Cargo.lock | 57 +++++++------------------------ Cargo.toml | 4 +-- control_plane/src/pageserver.rs | 10 +++--- libs/remote_storage/src/config.rs | 2 +- libs/utils/src/toml_edit_ext.rs | 2 +- pageserver/ctl/src/main.rs | 2 +- pageserver/src/tenant/config.rs | 3 +- workspace_hack/Cargo.toml | 2 ++ 8 files changed, 26 insertions(+), 56 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index cf3031c6d0..30c9f7e080 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1246,7 +1246,7 @@ dependencies = [ "tokio-postgres", "tokio-stream", "tokio-util", - "toml_edit 0.19.10", + "toml_edit", "tracing", "tracing-opentelemetry", "tracing-subscriber", @@ -1360,8 +1360,8 @@ dependencies = [ "tokio", "tokio-postgres", "tokio-util", - "toml 0.7.4", - "toml_edit 0.19.10", + "toml", + "toml_edit", "tracing", "url", "utils", @@ -3144,7 +3144,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "fd01039851e82f8799046eabbb354056283fb265c8ec0996af940f4e85a380ff" dependencies = [ "serde", - "toml 0.8.14", + "toml", ] [[package]] @@ -3660,7 +3660,7 @@ dependencies = [ "thiserror", "tokio", "tokio-util", - "toml_edit 0.19.10", + "toml_edit", "utils", "workspace_hack", ] @@ -3747,7 +3747,7 @@ dependencies = [ "tokio-stream", "tokio-tar", "tokio-util", - "toml_edit 0.19.10", + "toml_edit", "tracing", "twox-hash", "url", @@ -4812,7 +4812,7 @@ dependencies = [ "tokio", "tokio-stream", "tokio-util", - "toml_edit 0.19.10", + "toml_edit", "tracing", "utils", ] @@ -5322,7 +5322,7 @@ dependencies = [ "tokio-stream", "tokio-tar", "tokio-util", - "toml_edit 0.19.10", + "toml_edit", "tracing", "tracing-subscriber", "url", @@ -6520,18 +6520,6 @@ dependencies = [ "tracing", ] -[[package]] -name = "toml" -version = "0.7.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d6135d499e69981f9ff0ef2167955a5333c35e36f6937d382974566b3d5b94ec" -dependencies = [ - "serde", - "serde_spanned", - "toml_datetime", - "toml_edit 0.19.10", -] - [[package]] name = "toml" version = "0.8.14" @@ -6541,7 +6529,7 @@ dependencies = [ "serde", "serde_spanned", "toml_datetime", - "toml_edit 0.22.14", + "toml_edit", ] [[package]] @@ -6553,19 +6541,6 @@ dependencies = [ "serde", ] -[[package]] -name = "toml_edit" -version = 
"0.19.10" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2380d56e8670370eee6566b0bfd4265f65b3f432e8c6d85623f728d4fa31f739" -dependencies = [ - "indexmap 1.9.3", - "serde", - "serde_spanned", - "toml_datetime", - "winnow 0.4.6", -] - [[package]] name = "toml_edit" version = "0.22.14" @@ -6576,7 +6551,7 @@ dependencies = [ "serde", "serde_spanned", "toml_datetime", - "winnow 0.6.13", + "winnow", ] [[package]] @@ -6989,7 +6964,7 @@ dependencies = [ "tokio-stream", "tokio-tar", "tokio-util", - "toml_edit 0.19.10", + "toml_edit", "tracing", "tracing-error", "tracing-subscriber", @@ -7535,15 +7510,6 @@ version = "0.52.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "32b752e52a2da0ddfbdbcc6fceadfeede4c939ed16d13e648833a61dfb611ed8" -[[package]] -name = "winnow" -version = "0.4.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "61de7bac303dc551fe038e2b3cef0f571087a47571ea6e79a87692ac99b99699" -dependencies = [ - "memchr", -] - [[package]] name = "winnow" version = "0.6.13" @@ -7651,6 +7617,7 @@ dependencies = [ "tokio", "tokio-rustls 0.24.0", "tokio-util", + "toml_edit", "tonic", "tower", "tracing", diff --git a/Cargo.toml b/Cargo.toml index 9203920971..107cd6cd44 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -177,8 +177,8 @@ tokio-rustls = "0.25" tokio-stream = "0.1" tokio-tar = "0.3" tokio-util = { version = "0.7.10", features = ["io", "rt"] } -toml = "0.7" -toml_edit = "0.19" +toml = "0.8" +toml_edit = "0.22" tonic = {version = "0.9", features = ["tls", "tls-roots"]} tower-service = "0.3.2" tracing = "0.1" diff --git a/control_plane/src/pageserver.rs b/control_plane/src/pageserver.rs index 31777eb7a5..33ca70af96 100644 --- a/control_plane/src/pageserver.rs +++ b/control_plane/src/pageserver.rs @@ -75,14 +75,14 @@ impl PageServerNode { } } - fn pageserver_make_identity_toml(&self, node_id: NodeId) -> toml_edit::Document { - toml_edit::Document::from_str(&format!("id={node_id}")).unwrap() + fn pageserver_make_identity_toml(&self, node_id: NodeId) -> toml_edit::DocumentMut { + toml_edit::DocumentMut::from_str(&format!("id={node_id}")).unwrap() } fn pageserver_init_make_toml( &self, conf: NeonLocalInitPageserverConf, - ) -> anyhow::Result { + ) -> anyhow::Result { assert_eq!(&PageServerConf::from(&conf), &self.conf, "during neon_local init, we derive the runtime state of ps conf (self.conf) from the --config flag fully"); // TODO(christian): instead of what we do here, create a pageserver_api::config::ConfigToml (PR #7656) @@ -137,9 +137,9 @@ impl PageServerNode { // Turn `overrides` into a toml document. // TODO: above code is legacy code, it should be refactored to use toml_edit directly. 
- let mut config_toml = toml_edit::Document::new(); + let mut config_toml = toml_edit::DocumentMut::new(); for fragment_str in overrides { - let fragment = toml_edit::Document::from_str(&fragment_str) + let fragment = toml_edit::DocumentMut::from_str(&fragment_str) .expect("all fragments in `overrides` are valid toml documents, this function controls that"); for (key, item) in fragment.iter() { config_toml.insert(key, item.clone()); diff --git a/libs/remote_storage/src/config.rs b/libs/remote_storage/src/config.rs index f819a1572a..d0e92411da 100644 --- a/libs/remote_storage/src/config.rs +++ b/libs/remote_storage/src/config.rs @@ -185,7 +185,7 @@ mod tests { use super::*; fn parse(input: &str) -> anyhow::Result { - let toml = input.parse::().unwrap(); + let toml = input.parse::().unwrap(); RemoteStorageConfig::from_toml(toml.as_item()) } diff --git a/libs/utils/src/toml_edit_ext.rs b/libs/utils/src/toml_edit_ext.rs index ab5f7bdd95..1359e27b77 100644 --- a/libs/utils/src/toml_edit_ext.rs +++ b/libs/utils/src/toml_edit_ext.rs @@ -10,7 +10,7 @@ pub fn deserialize_item(item: &toml_edit::Item) -> Result where T: serde::de::DeserializeOwned, { - let document: toml_edit::Document = match item { + let document: toml_edit::DocumentMut = match item { toml_edit::Item::Table(toml) => toml.clone().into(), toml_edit::Item::Value(toml_edit::Value::InlineTable(toml)) => { toml.clone().into_table().into() diff --git a/pageserver/ctl/src/main.rs b/pageserver/ctl/src/main.rs index 3b66b0c4aa..cf001ef0d5 100644 --- a/pageserver/ctl/src/main.rs +++ b/pageserver/ctl/src/main.rs @@ -174,7 +174,7 @@ async fn main() -> anyhow::Result<()> { println!("specified prefix '{}' failed validation", cmd.prefix); return Ok(()); }; - let toml_document = toml_edit::Document::from_str(&cmd.config_toml_str)?; + let toml_document = toml_edit::DocumentMut::from_str(&cmd.config_toml_str)?; let toml_item = toml_document .get("remote_storage") .expect("need remote_storage"); diff --git a/pageserver/src/tenant/config.rs b/pageserver/src/tenant/config.rs index 7e0344666b..547b43a399 100644 --- a/pageserver/src/tenant/config.rs +++ b/pageserver/src/tenant/config.rs @@ -452,7 +452,8 @@ impl TryFrom for TenantConfOpt { .map_err(|e| anyhow::anyhow!("{}: {}", e.path(), e.inner().message())); } toml_edit::Item::Table(table) => { - let deserializer = toml_edit::de::Deserializer::new(table.into()); + let deserializer = + toml_edit::de::Deserializer::from(toml_edit::DocumentMut::from(table)); return serde_path_to_error::deserialize(deserializer) .map_err(|e| anyhow::anyhow!("{}: {}", e.path(), e.inner().message())); } diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml index 94f4c0f22f..411ca81032 100644 --- a/workspace_hack/Cargo.toml +++ b/workspace_hack/Cargo.toml @@ -83,6 +83,7 @@ time = { version = "0.3", features = ["macros", "serde-well-known"] } tokio = { version = "1", features = ["fs", "io-std", "io-util", "macros", "net", "process", "rt-multi-thread", "signal", "test-util"] } tokio-rustls = { version = "0.24" } tokio-util = { version = "0.7", features = ["codec", "compat", "io", "rt"] } +toml_edit = { version = "0.22", features = ["serde"] } tonic = { version = "0.9", features = ["tls-roots"] } tower = { version = "0.4", default-features = false, features = ["balance", "buffer", "limit", "log", "timeout", "util"] } tracing = { version = "0.1", features = ["log"] } @@ -126,6 +127,7 @@ serde = { version = "1", features = ["alloc", "derive"] } syn-dff4ba8e3ae991db = { package = "syn", version = "1", features = 
["extra-traits", "full", "visit"] } syn-f595c2ba2a3f28df = { package = "syn", version = "2", features = ["extra-traits", "fold", "full", "visit", "visit-mut"] } time-macros = { version = "0.2", default-features = false, features = ["formatting", "parsing", "serde"] } +toml_edit = { version = "0.22", features = ["serde"] } zstd = { version = "0.13" } zstd-safe = { version = "7", default-features = false, features = ["arrays", "legacy", "std", "zdict_builder"] } zstd-sys = { version = "2", default-features = false, features = ["legacy", "std", "zdict_builder"] } From 2d885ac07ae0207ab886fd4dda84701ae33893f1 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Sun, 8 Sep 2024 21:47:57 +0300 Subject: [PATCH 22/29] Update strum (#8962) I wanted to use some features from the newer version. The PR that needed the new version is not ready yet (and might never be), but seems nice to stay up in any case. --- Cargo.lock | 40 ++++++++++--------------------- Cargo.toml | 6 ++--- libs/pageserver_api/src/models.rs | 2 +- libs/utils/src/logging.rs | 6 ++--- pageserver/src/metrics.rs | 4 ++-- 5 files changed, 21 insertions(+), 37 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 30c9f7e080..4fb3ac7223 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1189,9 +1189,9 @@ dependencies = [ [[package]] name = "comfy-table" -version = "6.1.4" +version = "7.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6e7b787b0dc42e8111badfdbe4c3059158ccb2db8780352fa1b01e8ccf45cc4d" +checksum = "b34115915337defe99b2aff5c2ce6771e5fbc4079f4b506301f5cf394c8452f7" dependencies = [ "crossterm", "strum", @@ -1485,25 +1485,22 @@ checksum = "248e3bacc7dc6baa3b21e405ee045c3047101a49145e7e9eca583ab4c2ca5345" [[package]] name = "crossterm" -version = "0.25.0" +version = "0.27.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e64e6c0fbe2c17357405f7c758c1ef960fce08bdfb2c03d88d2a18d7e09c4b67" +checksum = "f476fe445d41c9e991fd07515a6f463074b782242ccf4a5b7b1d1012e70824df" dependencies = [ - "bitflags 1.3.2", + "bitflags 2.4.1", "crossterm_winapi", "libc", - "mio", "parking_lot 0.12.1", - "signal-hook", - "signal-hook-mio", "winapi", ] [[package]] name = "crossterm_winapi" -version = "0.9.0" +version = "0.9.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2ae1b35a484aa10e07fe0638d02301c5ad24de82d310ccbd2f3693da5f09bf1c" +checksum = "acdd7c62a3665c7f6830a51635d9ac9b23ed385797f70a83bb8bafe9c572ab2b" dependencies = [ "winapi", ] @@ -5731,17 +5728,6 @@ dependencies = [ "signal-hook-registry", ] -[[package]] -name = "signal-hook-mio" -version = "0.2.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "29ad2e15f37ec9a6cc544097b78a1ec90001e9f71b81338ca39f430adaca99af" -dependencies = [ - "libc", - "mio", - "signal-hook", -] - [[package]] name = "signal-hook-registry" version = "1.4.1" @@ -6054,21 +6040,21 @@ checksum = "73473c0e59e6d5812c5dfe2a064a6444949f089e20eec9a2e5506596494e4623" [[package]] name = "strum" -version = "0.24.1" +version = "0.26.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "063e6045c0e62079840579a7e47a355ae92f60eb74daaf156fb1e84ba164e63f" +checksum = "8fec0f0aef304996cf250b31b5a10dee7980c85da9d759361292b8bca5a18f06" [[package]] name = "strum_macros" -version = "0.24.3" +version = "0.26.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1e385be0d24f186b4ce2f9982191e7101bb737312ad61c1f2f984f34bcf85d59" +checksum = 
"4c6bee85a5a24955dc440386795aa378cd9cf82acd5f764469152d2270e581be" dependencies = [ - "heck 0.4.1", + "heck 0.5.0", "proc-macro2", "quote", "rustversion", - "syn 1.0.109", + "syn 2.0.52", ] [[package]] diff --git a/Cargo.toml b/Cargo.toml index 107cd6cd44..40e399619d 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -73,7 +73,7 @@ camino = "1.1.6" cfg-if = "1.0.0" chrono = { version = "0.4", default-features = false, features = ["clock"] } clap = { version = "4.0", features = ["derive"] } -comfy-table = "6.1" +comfy-table = "7.1" const_format = "0.2" crc32c = "0.6" crossbeam-deque = "0.8.5" @@ -158,8 +158,8 @@ signal-hook = "0.3" smallvec = "1.11" smol_str = { version = "0.2.0", features = ["serde"] } socket2 = "0.5" -strum = "0.24" -strum_macros = "0.24" +strum = "0.26" +strum_macros = "0.26" "subtle" = "2.5.0" svg_fmt = "0.4.3" sync_wrapper = "0.1.2" diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index ffe79c8350..45e84baa1f 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -62,7 +62,7 @@ use bytes::{Buf, BufMut, Bytes, BytesMut}; serde::Serialize, serde::Deserialize, strum_macros::Display, - strum_macros::EnumVariantNames, + strum_macros::VariantNames, strum_macros::AsRefStr, strum_macros::IntoStaticStr, )] diff --git a/libs/utils/src/logging.rs b/libs/utils/src/logging.rs index 2ea0781667..e205d60d74 100644 --- a/libs/utils/src/logging.rs +++ b/libs/utils/src/logging.rs @@ -3,11 +3,9 @@ use std::str::FromStr; use anyhow::Context; use metrics::{IntCounter, IntCounterVec}; use once_cell::sync::Lazy; -use strum_macros::{EnumString, EnumVariantNames}; +use strum_macros::{EnumString, VariantNames}; -#[derive( - EnumString, strum_macros::Display, EnumVariantNames, Eq, PartialEq, Debug, Clone, Copy, -)] +#[derive(EnumString, strum_macros::Display, VariantNames, Eq, PartialEq, Debug, Clone, Copy)] #[strum(serialize_all = "snake_case")] pub enum LogFormat { Plain, diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index c4011d593c..9197505876 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -9,7 +9,7 @@ use metrics::{ use once_cell::sync::Lazy; use pageserver_api::shard::TenantShardId; use strum::{EnumCount, VariantNames}; -use strum_macros::{EnumVariantNames, IntoStaticStr}; +use strum_macros::{IntoStaticStr, VariantNames}; use tracing::warn; use utils::id::TimelineId; @@ -27,7 +27,7 @@ const CRITICAL_OP_BUCKETS: &[f64] = &[ ]; // Metrics collected on operations on the storage repository. -#[derive(Debug, EnumVariantNames, IntoStaticStr)] +#[derive(Debug, VariantNames, IntoStaticStr)] #[strum(serialize_all = "kebab_case")] pub(crate) enum StorageTimeOperation { #[strum(serialize = "layer flush")] From c8f67eed8f0e3ed182ebe85753389ae5b1c161ea Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Mon, 9 Sep 2024 10:34:56 +0300 Subject: [PATCH 23/29] Remove TEST_SHARED_FIXTURES (#8965) I wish it worked, but it's been broken for a long time, so let's admit defeat and remove it. The idea of sharing the same pageserver and safekeeper environment between tests is still sound, and it could save a lot of time in our CI. We should perhaps put some time into doing that, but we're better off starting from scratch than trying to make TEST_SHARED_FIXTURES work in its current form. 
--- test_runner/README.md | 12 ++--- test_runner/fixtures/neon_fixtures.py | 68 +++------------------------ 2 files changed, 10 insertions(+), 70 deletions(-) diff --git a/test_runner/README.md b/test_runner/README.md index 73aa29d4bb..647b930b26 100644 --- a/test_runner/README.md +++ b/test_runner/README.md @@ -18,8 +18,7 @@ Prerequisites: Regression tests are in the 'regress' directory. They can be run in parallel to minimize total runtime. Most regression test sets up their -environment with its own pageservers and safekeepers (but see -`TEST_SHARED_FIXTURES`). +environment with its own pageservers and safekeepers. 'pg_clients' contains tests for connecting with various client libraries. Each client test uses a Dockerfile that pulls an image that @@ -74,7 +73,6 @@ This is used to construct full path to the postgres binaries. Format is 2-digit major version nubmer, i.e. `DEFAULT_PG_VERSION=16` `TEST_OUTPUT`: Set the directory where test state and test output files should go. -`TEST_SHARED_FIXTURES`: Try to re-use a single pageserver for all the tests. `RUST_LOG`: logging configuration to pass into Neon CLI Useful parameters and commands: @@ -259,11 +257,9 @@ compute Postgres nodes. The connections between them can be configured to use JW authentication tokens, and some other configuration options can be tweaked too. The easiest way to get access to a Neon Environment is by using the `neon_simple_env` -fixture. The 'simple' env may be shared across multiple tests, so don't shut down the nodes -or make other destructive changes in that environment. Also don't assume that -there are no tenants or branches or data in the cluster. For convenience, there is a -branch called `empty`, though. The convention is to create a test-specific branch of -that and load any test data there, instead of the 'main' branch. +fixture. For convenience, there is a branch called `empty` in environments created with +'neon_simple_env'. The convention is to create a test-specific branch of that and load any +test data there, instead of the 'main' branch. For more complicated cases, you can build a custom Neon Environment, with the `neon_env` fixture: diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 5a600dd0a1..3047dcc4f7 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -221,33 +221,6 @@ def neon_api(neon_api_key: str, neon_api_base_url: str) -> NeonAPI: return NeonAPI(neon_api_key, neon_api_base_url) -def shareable_scope(fixture_name: str, config: Config) -> Literal["session", "function"]: - """Return either session of function scope, depending on TEST_SHARED_FIXTURES envvar. - - This function can be used as a scope like this: - @pytest.fixture(scope=shareable_scope) - def myfixture(...) - ... 
- """ - scope: Literal["session", "function"] - - if os.environ.get("TEST_SHARED_FIXTURES") is None: - # Create the environment in the per-test output directory - scope = "function" - elif ( - os.environ.get("BUILD_TYPE") is not None - and os.environ.get("DEFAULT_PG_VERSION") is not None - ): - scope = "session" - else: - pytest.fail( - "Shared environment(TEST_SHARED_FIXTURES) requires BUILD_TYPE and DEFAULT_PG_VERSION to be set", - pytrace=False, - ) - - return scope - - @pytest.fixture(scope="session") def worker_port_num(): return (32768 - BASE_PORT) // int(os.environ.get("PYTEST_XDIST_WORKER_COUNT", "1")) @@ -1431,8 +1404,8 @@ class NeonEnv: return "ep-" + str(self.endpoint_counter) -@pytest.fixture(scope=shareable_scope) -def _shared_simple_env( +@pytest.fixture(scope="function") +def neon_simple_env( request: FixtureRequest, pytestconfig: Config, port_distributor: PortDistributor, @@ -1450,19 +1423,13 @@ def _shared_simple_env( pageserver_io_buffer_alignment: Optional[int], ) -> Iterator[NeonEnv]: """ - # Internal fixture backing the `neon_simple_env` fixture. If TEST_SHARED_FIXTURES - is set, this is shared by all tests using `neon_simple_env`. + Simple Neon environment, with no authentication and no safekeepers. This fixture will use RemoteStorageKind.LOCAL_FS with pageserver. """ - if os.environ.get("TEST_SHARED_FIXTURES") is None: - # Create the environment in the per-test output directory - repo_dir = get_test_repo_dir(request, top_output_dir) - else: - # We're running shared fixtures. Share a single directory. - repo_dir = top_output_dir / "shared_repo" - shutil.rmtree(repo_dir, ignore_errors=True) + # Create the environment in the per-test output directory + repo_dir = get_test_repo_dir(request, top_output_dir) with NeonEnvBuilder( top_output_dir=top_output_dir, @@ -1489,22 +1456,6 @@ def _shared_simple_env( yield env - -@pytest.fixture(scope="function") -def neon_simple_env(_shared_simple_env: NeonEnv) -> Iterator[NeonEnv]: - """ - Simple Neon environment, with no authentication and no safekeepers. - - If TEST_SHARED_FIXTURES environment variable is set, we reuse the same - environment for all tests that use 'neon_simple_env', keeping the - page server and safekeepers running. Any compute nodes are stopped after - each the test, however. - """ - yield _shared_simple_env - - _shared_simple_env.endpoints.stop_all() - - @pytest.fixture(scope="function") def neon_env_builder( pytestconfig: Config, @@ -4898,14 +4849,7 @@ SMALL_DB_FILE_NAME_REGEX: re.Pattern = re.compile( # type: ignore[type-arg] # This is autouse, so the test output directory always gets created, even -# if a test doesn't put anything there. It also solves a problem with the -# neon_simple_env fixture: if TEST_SHARED_FIXTURES is not set, it -# creates the repo in the test output directory. But it cannot depend on -# 'test_output_dir' fixture, because when TEST_SHARED_FIXTURES is not set, -# it has 'session' scope and cannot access fixtures with 'function' -# scope. So it uses the get_test_output_dir() function to get the path, and -# this fixture ensures that the directory exists. That works because -# 'autouse' fixtures are run before other fixtures. +# if a test doesn't put anything there. 
# # NB: we request the overlay dir fixture so the fixture does its cleanups @pytest.fixture(scope="function", autouse=True) From 723c0971e818848696984fd66c562c9d0cbff948 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Mon, 9 Sep 2024 10:35:12 +0300 Subject: [PATCH 24/29] Don't create 'empty' branch in neon_simple_env (#8965) Now that we've given up hope on sharing the neon_simple_env between tests, there's no reason to not use the 'main' branch directly. --- test_runner/README.md | 5 ++-- test_runner/fixtures/neon_fixtures.py | 5 +--- .../performance/test_logical_replication.py | 4 +-- test_runner/regress/test_basebackup_error.py | 3 +-- test_runner/regress/test_clog_truncate.py | 5 ++-- test_runner/regress/test_compute_catalog.py | 3 +-- test_runner/regress/test_config.py | 3 +-- test_runner/regress/test_createdropdb.py | 17 ++++--------- test_runner/regress/test_createuser.py | 5 ++-- test_runner/regress/test_ddl_forwarding.py | 3 +-- .../regress/test_explain_with_lfc_stats.py | 6 ++--- test_runner/regress/test_lfc_resize.py | 3 +-- .../test_lfc_working_set_approximation.py | 6 ++--- test_runner/regress/test_local_file_cache.py | 7 ++---- .../regress/test_logical_replication.py | 11 +++----- test_runner/regress/test_migrations.py | 3 +-- test_runner/regress/test_multixact.py | 7 +++--- test_runner/regress/test_neon_superuser.py | 2 +- test_runner/regress/test_parallel_copy.py | 3 +-- .../regress/test_pg_query_cancellation.py | 4 +-- test_runner/regress/test_pg_waldump.py | 4 +-- test_runner/regress/test_read_validation.py | 12 ++------- test_runner/regress/test_readonly_node.py | 25 ++++++++----------- test_runner/regress/test_subxacts.py | 3 +-- test_runner/regress/test_timeline_delete.py | 7 ++++-- test_runner/regress/test_timeline_size.py | 4 +-- test_runner/regress/test_twophase.py | 7 ++---- test_runner/regress/test_unlogged.py | 5 ++-- test_runner/regress/test_vm_bits.py | 5 ++-- test_runner/test_broken.py | 3 +-- 30 files changed, 65 insertions(+), 115 deletions(-) diff --git a/test_runner/README.md b/test_runner/README.md index 647b930b26..d754e60d17 100644 --- a/test_runner/README.md +++ b/test_runner/README.md @@ -257,9 +257,8 @@ compute Postgres nodes. The connections between them can be configured to use JW authentication tokens, and some other configuration options can be tweaked too. The easiest way to get access to a Neon Environment is by using the `neon_simple_env` -fixture. For convenience, there is a branch called `empty` in environments created with -'neon_simple_env'. The convention is to create a test-specific branch of that and load any -test data there, instead of the 'main' branch. +fixture. For convenience, there is a branch called `main` in environments created with +'neon_simple_env', ready to be used in the test. 
For more complicated cases, you can build a custom Neon Environment, with the `neon_env` fixture: diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 3047dcc4f7..60887b9aed 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -57,7 +57,6 @@ from _pytest.fixtures import FixtureRequest from psycopg2.extensions import connection as PgConnection from psycopg2.extensions import cursor as PgCursor from psycopg2.extensions import make_dsn, parse_dsn -from typing_extensions import Literal from urllib3.util.retry import Retry from fixtures import overlayfs @@ -1451,11 +1450,9 @@ def neon_simple_env( ) as builder: env = builder.init_start() - # For convenience in tests, create a branch from the freshly-initialized cluster. - env.neon_cli.create_branch("empty", ancestor_branch_name=DEFAULT_BRANCH_NAME) - yield env + @pytest.fixture(scope="function") def neon_env_builder( pytestconfig: Config, diff --git a/test_runner/performance/test_logical_replication.py b/test_runner/performance/test_logical_replication.py index 077f73ac06..29a0380524 100644 --- a/test_runner/performance/test_logical_replication.py +++ b/test_runner/performance/test_logical_replication.py @@ -22,10 +22,8 @@ if TYPE_CHECKING: def test_logical_replication(neon_simple_env: NeonEnv, pg_bin: PgBin, vanilla_pg): env = neon_simple_env - env.neon_cli.create_branch("test_logical_replication", "empty") - endpoint = env.endpoints.create_start("test_logical_replication") + endpoint = env.endpoints.create_start("main") - log.info("postgres is running on 'test_logical_replication' branch") pg_bin.run_capture(["pgbench", "-i", "-s10", endpoint.connstr()]) endpoint.safe_psql("create publication pub1 for table pgbench_accounts, pgbench_history") diff --git a/test_runner/regress/test_basebackup_error.py b/test_runner/regress/test_basebackup_error.py index 170b494884..13c080ea0e 100644 --- a/test_runner/regress/test_basebackup_error.py +++ b/test_runner/regress/test_basebackup_error.py @@ -8,11 +8,10 @@ from fixtures.neon_fixtures import NeonEnv # def test_basebackup_error(neon_simple_env: NeonEnv): env = neon_simple_env - env.neon_cli.create_branch("test_basebackup_error", "empty") pageserver_http = env.pageserver.http_client() # Introduce failpoint pageserver_http.configure_failpoints(("basebackup-before-control-file", "return")) with pytest.raises(Exception, match="basebackup-before-control-file"): - env.endpoints.create_start("test_basebackup_error") + env.endpoints.create_start("main") diff --git a/test_runner/regress/test_clog_truncate.py b/test_runner/regress/test_clog_truncate.py index 26e6e336b9..6e4880841a 100644 --- a/test_runner/regress/test_clog_truncate.py +++ b/test_runner/regress/test_clog_truncate.py @@ -11,7 +11,6 @@ from fixtures.utils import query_scalar # def test_clog_truncate(neon_simple_env: NeonEnv): env = neon_simple_env - env.neon_cli.create_branch("test_clog_truncate", "empty") # set aggressive autovacuum to make sure that truncation will happen config = [ @@ -24,7 +23,7 @@ def test_clog_truncate(neon_simple_env: NeonEnv): "autovacuum_freeze_max_age=100000", ] - endpoint = env.endpoints.create_start("test_clog_truncate", config_lines=config) + endpoint = env.endpoints.create_start("main", config_lines=config) # Install extension containing function needed for test endpoint.safe_psql("CREATE EXTENSION neon_test_utils") @@ -58,7 +57,7 @@ def test_clog_truncate(neon_simple_env: NeonEnv): # create new branch after clog truncation and 
start a compute node on it log.info(f"create branch at lsn_after_truncation {lsn_after_truncation}") env.neon_cli.create_branch( - "test_clog_truncate_new", "test_clog_truncate", ancestor_start_lsn=lsn_after_truncation + "test_clog_truncate_new", "main", ancestor_start_lsn=lsn_after_truncation ) endpoint2 = env.endpoints.create_start("test_clog_truncate_new") diff --git a/test_runner/regress/test_compute_catalog.py b/test_runner/regress/test_compute_catalog.py index dd36190fcd..8b8c970357 100644 --- a/test_runner/regress/test_compute_catalog.py +++ b/test_runner/regress/test_compute_catalog.py @@ -4,9 +4,8 @@ from fixtures.neon_fixtures import NeonEnv def test_compute_catalog(neon_simple_env: NeonEnv): env = neon_simple_env - env.neon_cli.create_branch("test_config", "empty") - endpoint = env.endpoints.create_start("test_config", config_lines=["log_min_messages=debug1"]) + endpoint = env.endpoints.create_start("main", config_lines=["log_min_messages=debug1"]) client = endpoint.http_client() objects = client.dbs_and_roles() diff --git a/test_runner/regress/test_config.py b/test_runner/regress/test_config.py index 2ef28eb94b..d8ef0b8dbd 100644 --- a/test_runner/regress/test_config.py +++ b/test_runner/regress/test_config.py @@ -9,10 +9,9 @@ from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder # def test_config(neon_simple_env: NeonEnv): env = neon_simple_env - env.neon_cli.create_branch("test_config", "empty") # change config - endpoint = env.endpoints.create_start("test_config", config_lines=["log_min_messages=debug1"]) + endpoint = env.endpoints.create_start("main", config_lines=["log_min_messages=debug1"]) with closing(endpoint.connect()) as conn: with conn.cursor() as cur: diff --git a/test_runner/regress/test_createdropdb.py b/test_runner/regress/test_createdropdb.py index f741a9fc87..af643f45d7 100644 --- a/test_runner/regress/test_createdropdb.py +++ b/test_runner/regress/test_createdropdb.py @@ -17,9 +17,7 @@ def test_createdb(neon_simple_env: NeonEnv, strategy: str): if env.pg_version == PgVersion.V14 and strategy == "wal_log": pytest.skip("wal_log strategy not supported on PostgreSQL 14") - env.neon_cli.create_branch("test_createdb", "empty") - - endpoint = env.endpoints.create_start("test_createdb") + endpoint = env.endpoints.create_start("main") with endpoint.cursor() as cur: # Cause a 'relmapper' change in the original branch @@ -33,7 +31,7 @@ def test_createdb(neon_simple_env: NeonEnv, strategy: str): lsn = query_scalar(cur, "SELECT pg_current_wal_insert_lsn()") # Create a branch - env.neon_cli.create_branch("test_createdb2", "test_createdb", ancestor_start_lsn=lsn) + env.neon_cli.create_branch("test_createdb2", "main", ancestor_start_lsn=lsn) endpoint2 = env.endpoints.create_start("test_createdb2") # Test that you can connect to the new database on both branches @@ -62,8 +60,7 @@ def test_createdb(neon_simple_env: NeonEnv, strategy: str): # def test_dropdb(neon_simple_env: NeonEnv, test_output_dir): env = neon_simple_env - env.neon_cli.create_branch("test_dropdb", "empty") - endpoint = env.endpoints.create_start("test_dropdb") + endpoint = env.endpoints.create_start("main") with endpoint.cursor() as cur: cur.execute("CREATE DATABASE foodb") @@ -80,14 +77,10 @@ def test_dropdb(neon_simple_env: NeonEnv, test_output_dir): lsn_after_drop = query_scalar(cur, "SELECT pg_current_wal_insert_lsn()") # Create two branches before and after database drop. 
- env.neon_cli.create_branch( - "test_before_dropdb", "test_dropdb", ancestor_start_lsn=lsn_before_drop - ) + env.neon_cli.create_branch("test_before_dropdb", "main", ancestor_start_lsn=lsn_before_drop) endpoint_before = env.endpoints.create_start("test_before_dropdb") - env.neon_cli.create_branch( - "test_after_dropdb", "test_dropdb", ancestor_start_lsn=lsn_after_drop - ) + env.neon_cli.create_branch("test_after_dropdb", "main", ancestor_start_lsn=lsn_after_drop) endpoint_after = env.endpoints.create_start("test_after_dropdb") # Test that database exists on the branch before drop diff --git a/test_runner/regress/test_createuser.py b/test_runner/regress/test_createuser.py index 17d9824f52..d6f138e126 100644 --- a/test_runner/regress/test_createuser.py +++ b/test_runner/regress/test_createuser.py @@ -7,8 +7,7 @@ from fixtures.utils import query_scalar # def test_createuser(neon_simple_env: NeonEnv): env = neon_simple_env - env.neon_cli.create_branch("test_createuser", "empty") - endpoint = env.endpoints.create_start("test_createuser") + endpoint = env.endpoints.create_start("main") with endpoint.cursor() as cur: # Cause a 'relmapper' change in the original branch @@ -19,7 +18,7 @@ def test_createuser(neon_simple_env: NeonEnv): lsn = query_scalar(cur, "SELECT pg_current_wal_insert_lsn()") # Create a branch - env.neon_cli.create_branch("test_createuser2", "test_createuser", ancestor_start_lsn=lsn) + env.neon_cli.create_branch("test_createuser2", "main", ancestor_start_lsn=lsn) endpoint2 = env.endpoints.create_start("test_createuser2") # Test that you can connect to new branch as a new user diff --git a/test_runner/regress/test_ddl_forwarding.py b/test_runner/regress/test_ddl_forwarding.py index 50da673d87..65f310c27a 100644 --- a/test_runner/regress/test_ddl_forwarding.py +++ b/test_runner/regress/test_ddl_forwarding.py @@ -290,9 +290,8 @@ def assert_db_connlimit(endpoint: Any, db_name: str, connlimit: int, msg: str): # Here we test the latter. 
The first one is tested in test_ddl_forwarding def test_ddl_forwarding_invalid_db(neon_simple_env: NeonEnv): env = neon_simple_env - env.neon_cli.create_branch("test_ddl_forwarding_invalid_db", "empty") endpoint = env.endpoints.create_start( - "test_ddl_forwarding_invalid_db", + "main", # Some non-existent url config_lines=["neon.console_url=http://localhost:9999/unknown/api/v0/roles_and_databases"], ) diff --git a/test_runner/regress/test_explain_with_lfc_stats.py b/test_runner/regress/test_explain_with_lfc_stats.py index 5231dedcda..0217c9ac7b 100644 --- a/test_runner/regress/test_explain_with_lfc_stats.py +++ b/test_runner/regress/test_explain_with_lfc_stats.py @@ -10,11 +10,9 @@ def test_explain_with_lfc_stats(neon_simple_env: NeonEnv): cache_dir = Path(env.repo_dir) / "file_cache" cache_dir.mkdir(exist_ok=True) - branchname = "test_explain_with_lfc_stats" - env.neon_cli.create_branch(branchname, "empty") - log.info(f"Creating endopint with 1MB shared_buffers and 64 MB LFC for branch {branchname}") + log.info("Creating endpoint with 1MB shared_buffers and 64 MB LFC") endpoint = env.endpoints.create_start( - branchname, + "main", config_lines=[ "shared_buffers='1MB'", f"neon.file_cache_path='{cache_dir}/file.cache'", diff --git a/test_runner/regress/test_lfc_resize.py b/test_runner/regress/test_lfc_resize.py index 1b2c7f808f..cb0b30d9c6 100644 --- a/test_runner/regress/test_lfc_resize.py +++ b/test_runner/regress/test_lfc_resize.py @@ -16,9 +16,8 @@ from fixtures.neon_fixtures import NeonEnv, PgBin @pytest.mark.timeout(600) def test_lfc_resize(neon_simple_env: NeonEnv, pg_bin: PgBin): env = neon_simple_env - env.neon_cli.create_branch("test_lfc_resize", "empty") endpoint = env.endpoints.create_start( - "test_lfc_resize", + "main", config_lines=[ "neon.file_cache_path='file.cache'", "neon.max_file_cache_size=512MB", diff --git a/test_runner/regress/test_lfc_working_set_approximation.py b/test_runner/regress/test_lfc_working_set_approximation.py index 4c53e4e2fd..4a3a949d1a 100644 --- a/test_runner/regress/test_lfc_working_set_approximation.py +++ b/test_runner/regress/test_lfc_working_set_approximation.py @@ -12,11 +12,9 @@ def test_lfc_working_set_approximation(neon_simple_env: NeonEnv): cache_dir = Path(env.repo_dir) / "file_cache" cache_dir.mkdir(exist_ok=True) - branchname = "test_approximate_working_set_size" - env.neon_cli.create_branch(branchname, "empty") - log.info(f"Creating endopint with 1MB shared_buffers and 64 MB LFC for branch {branchname}") + log.info("Creating endpoint with 1MB shared_buffers and 64 MB LFC") endpoint = env.endpoints.create_start( - branchname, + "main", config_lines=[ "shared_buffers='1MB'", f"neon.file_cache_path='{cache_dir}/file.cache'", diff --git a/test_runner/regress/test_local_file_cache.py b/test_runner/regress/test_local_file_cache.py index 3c404c3b23..9c38200937 100644 --- a/test_runner/regress/test_local_file_cache.py +++ b/test_runner/regress/test_local_file_cache.py @@ -5,7 +5,7 @@ import threading import time from typing import List -from fixtures.neon_fixtures import DEFAULT_BRANCH_NAME, NeonEnvBuilder +from fixtures.neon_fixtures import NeonEnvBuilder from fixtures.utils import query_scalar @@ -15,11 +15,8 @@ def test_local_file_cache_unlink(neon_env_builder: NeonEnvBuilder): cache_dir = os.path.join(env.repo_dir, "file_cache") os.mkdir(cache_dir) - env.neon_cli.create_branch("empty", ancestor_branch_name=DEFAULT_BRANCH_NAME) - env.neon_cli.create_branch("test_local_file_cache_unlink", "empty") - endpoint = env.endpoints.create_start( - 
"test_local_file_cache_unlink", + "main", config_lines=[ "shared_buffers='1MB'", f"neon.file_cache_path='{cache_dir}/file.cache'", diff --git a/test_runner/regress/test_logical_replication.py b/test_runner/regress/test_logical_replication.py index f83a833dda..15a3719e0b 100644 --- a/test_runner/regress/test_logical_replication.py +++ b/test_runner/regress/test_logical_replication.py @@ -36,10 +36,8 @@ def test_logical_replication(neon_simple_env: NeonEnv, vanilla_pg): env = neon_simple_env tenant_id = env.initial_tenant - timeline_id = env.neon_cli.create_branch("test_logical_replication", "empty") - endpoint = env.endpoints.create_start( - "test_logical_replication", config_lines=["log_statement=all"] - ) + timeline_id = env.initial_timeline + endpoint = env.endpoints.create_start("main", config_lines=["log_statement=all"]) pg_conn = endpoint.connect() cur = pg_conn.cursor() @@ -185,10 +183,9 @@ def test_obsolete_slot_drop(neon_simple_env: NeonEnv, vanilla_pg): env = neon_simple_env - env.neon_cli.create_branch("test_logical_replication", "empty") # set low neon.logical_replication_max_snap_files endpoint = env.endpoints.create_start( - "test_logical_replication", + "main", config_lines=["log_statement=all", "neon.logical_replication_max_snap_files=1"], ) @@ -472,7 +469,7 @@ def test_slots_and_branching(neon_simple_env: NeonEnv): def test_replication_shutdown(neon_simple_env: NeonEnv): # Ensure Postgres can exit without stuck when a replication job is active + neon extension installed env = neon_simple_env - env.neon_cli.create_branch("test_replication_shutdown_publisher", "empty") + env.neon_cli.create_branch("test_replication_shutdown_publisher", "main") pub = env.endpoints.create("test_replication_shutdown_publisher") env.neon_cli.create_branch("test_replication_shutdown_subscriber") diff --git a/test_runner/regress/test_migrations.py b/test_runner/regress/test_migrations.py index bdc5ca907e..e88e56d030 100644 --- a/test_runner/regress/test_migrations.py +++ b/test_runner/regress/test_migrations.py @@ -9,9 +9,8 @@ if TYPE_CHECKING: def test_migrations(neon_simple_env: NeonEnv): env = neon_simple_env - env.neon_cli.create_branch("test_migrations", "empty") - endpoint = env.endpoints.create("test_migrations") + endpoint = env.endpoints.create("main") endpoint.respec(skip_pg_catalog_updates=False) endpoint.start() diff --git a/test_runner/regress/test_multixact.py b/test_runner/regress/test_multixact.py index 88f7a5db59..8a00f8835f 100644 --- a/test_runner/regress/test_multixact.py +++ b/test_runner/regress/test_multixact.py @@ -14,8 +14,7 @@ from fixtures.utils import query_scalar # def test_multixact(neon_simple_env: NeonEnv, test_output_dir): env = neon_simple_env - env.neon_cli.create_branch("test_multixact", "empty") - endpoint = env.endpoints.create_start("test_multixact") + endpoint = env.endpoints.create_start("main") cur = endpoint.connect().cursor() cur.execute( @@ -73,7 +72,9 @@ def test_multixact(neon_simple_env: NeonEnv, test_output_dir): assert int(next_multixact_id) > int(next_multixact_id_old) # Branch at this point - env.neon_cli.create_branch("test_multixact_new", "test_multixact", ancestor_start_lsn=lsn) + env.neon_cli.create_branch( + "test_multixact_new", ancestor_branch_name="main", ancestor_start_lsn=lsn + ) endpoint_new = env.endpoints.create_start("test_multixact_new") next_multixact_id_new = endpoint_new.safe_psql( diff --git a/test_runner/regress/test_neon_superuser.py b/test_runner/regress/test_neon_superuser.py index fd31df84da..7825ec772c 100644 --- 
a/test_runner/regress/test_neon_superuser.py +++ b/test_runner/regress/test_neon_superuser.py @@ -6,7 +6,7 @@ from fixtures.utils import wait_until def test_neon_superuser(neon_simple_env: NeonEnv, pg_version: PgVersion): env = neon_simple_env - env.neon_cli.create_branch("test_neon_superuser_publisher", "empty") + env.neon_cli.create_branch("test_neon_superuser_publisher", "main") pub = env.endpoints.create("test_neon_superuser_publisher") env.neon_cli.create_branch("test_neon_superuser_subscriber") diff --git a/test_runner/regress/test_parallel_copy.py b/test_runner/regress/test_parallel_copy.py index b33e387a66..a5037e8694 100644 --- a/test_runner/regress/test_parallel_copy.py +++ b/test_runner/regress/test_parallel_copy.py @@ -41,8 +41,7 @@ async def parallel_load_same_table(endpoint: Endpoint, n_parallel: int): # Load data into one table with COPY TO from 5 parallel connections def test_parallel_copy(neon_simple_env: NeonEnv, n_parallel=5): env = neon_simple_env - env.neon_cli.create_branch("test_parallel_copy", "empty") - endpoint = env.endpoints.create_start("test_parallel_copy") + endpoint = env.endpoints.create_start("main") # Create test table conn = endpoint.connect() diff --git a/test_runner/regress/test_pg_query_cancellation.py b/test_runner/regress/test_pg_query_cancellation.py index bad2e5865e..c6b4eff516 100644 --- a/test_runner/regress/test_pg_query_cancellation.py +++ b/test_runner/regress/test_pg_query_cancellation.py @@ -42,11 +42,9 @@ def test_cancellations(neon_simple_env: NeonEnv): ps_http = ps.http_client() ps_http.is_testing_enabled_or_skip() - env.neon_cli.create_branch("test_config", "empty") - # We don't want to have any racy behaviour with autovacuum IOs ep = env.endpoints.create_start( - "test_config", + "main", config_lines=[ "autovacuum = off", "shared_buffers = 128MB", diff --git a/test_runner/regress/test_pg_waldump.py b/test_runner/regress/test_pg_waldump.py index 8e80efd9ba..1990d69b6a 100644 --- a/test_runner/regress/test_pg_waldump.py +++ b/test_runner/regress/test_pg_waldump.py @@ -22,8 +22,8 @@ def check_wal_segment(pg_waldump_path: str, segment_path: str, test_output_dir): def test_pg_waldump(neon_simple_env: NeonEnv, test_output_dir, pg_bin: PgBin): env = neon_simple_env tenant_id = env.initial_tenant - timeline_id = env.neon_cli.create_branch("test_pg_waldump", "empty") - endpoint = env.endpoints.create_start("test_pg_waldump") + timeline_id = env.initial_timeline + endpoint = env.endpoints.create_start("main") cur = endpoint.connect().cursor() cur.execute( diff --git a/test_runner/regress/test_read_validation.py b/test_runner/regress/test_read_validation.py index 1ac881553f..78798c5abf 100644 --- a/test_runner/regress/test_read_validation.py +++ b/test_runner/regress/test_read_validation.py @@ -15,12 +15,8 @@ extensions = ["pageinspect", "neon_test_utils", "pg_buffercache"] # def test_read_validation(neon_simple_env: NeonEnv): env = neon_simple_env - env.neon_cli.create_branch("test_read_validation", "empty") - - endpoint = env.endpoints.create_start( - "test_read_validation", - ) + endpoint = env.endpoints.create_start("main") with closing(endpoint.connect()) as con: with con.cursor() as c: for e in extensions: @@ -131,13 +127,9 @@ def test_read_validation(neon_simple_env: NeonEnv): def test_read_validation_neg(neon_simple_env: NeonEnv): env = neon_simple_env - env.neon_cli.create_branch("test_read_validation_neg", "empty") - env.pageserver.allowed_errors.append(".*invalid LSN\\(0\\) in request.*") - endpoint = env.endpoints.create_start( - 
"test_read_validation_neg", - ) + endpoint = env.endpoints.create_start("main") with closing(endpoint.connect()) as con: with con.cursor() as c: diff --git a/test_runner/regress/test_readonly_node.py b/test_runner/regress/test_readonly_node.py index 368f60127e..347fc3a04d 100644 --- a/test_runner/regress/test_readonly_node.py +++ b/test_runner/regress/test_readonly_node.py @@ -22,8 +22,7 @@ from fixtures.utils import query_scalar # def test_readonly_node(neon_simple_env: NeonEnv): env = neon_simple_env - env.neon_cli.create_branch("test_readonly_node", "empty") - endpoint_main = env.endpoints.create_start("test_readonly_node") + endpoint_main = env.endpoints.create_start("main") env.pageserver.allowed_errors.extend( [ @@ -74,12 +73,12 @@ def test_readonly_node(neon_simple_env: NeonEnv): # Create first read-only node at the point where only 100 rows were inserted endpoint_hundred = env.endpoints.create_start( - branch_name="test_readonly_node", endpoint_id="ep-readonly_node_hundred", lsn=lsn_a + branch_name="main", endpoint_id="ep-readonly_node_hundred", lsn=lsn_a ) # And another at the point where 200100 rows were inserted endpoint_more = env.endpoints.create_start( - branch_name="test_readonly_node", endpoint_id="ep-readonly_node_more", lsn=lsn_b + branch_name="main", endpoint_id="ep-readonly_node_more", lsn=lsn_b ) # On the 'hundred' node, we should see only 100 rows @@ -100,7 +99,7 @@ def test_readonly_node(neon_simple_env: NeonEnv): # Check creating a node at segment boundary endpoint = env.endpoints.create_start( - branch_name="test_readonly_node", + branch_name="main", endpoint_id="ep-branch_segment_boundary", lsn=Lsn("0/3000000"), ) @@ -112,7 +111,7 @@ def test_readonly_node(neon_simple_env: NeonEnv): with pytest.raises(Exception, match="invalid basebackup lsn"): # compute node startup with invalid LSN should fail env.endpoints.create_start( - branch_name="test_readonly_node", + branch_name="main", endpoint_id="ep-readonly_node_preinitdb", lsn=Lsn("0/42"), ) @@ -218,14 +217,10 @@ def test_readonly_node_gc(neon_env_builder: NeonEnvBuilder): # Similar test, but with more data, and we force checkpoints def test_timetravel(neon_simple_env: NeonEnv): env = neon_simple_env - pageserver_http_client = env.pageserver.http_client() - env.neon_cli.create_branch("test_timetravel", "empty") - endpoint = env.endpoints.create_start("test_timetravel") - + tenant_id = env.initial_tenant + timeline_id = env.initial_timeline client = env.pageserver.http_client() - - tenant_id = endpoint.safe_psql("show neon.tenant_id")[0][0] - timeline_id = endpoint.safe_psql("show neon.timeline_id")[0][0] + endpoint = env.endpoints.create_start("main") lsns = [] @@ -249,7 +244,7 @@ def test_timetravel(neon_simple_env: NeonEnv): wait_for_last_record_lsn(client, tenant_id, timeline_id, current_lsn) # run checkpoint manually to force a new layer file - pageserver_http_client.timeline_checkpoint(tenant_id, timeline_id) + client.timeline_checkpoint(tenant_id, timeline_id) ##### Restart pageserver env.endpoints.stop_all() @@ -258,7 +253,7 @@ def test_timetravel(neon_simple_env: NeonEnv): for i, lsn in lsns: endpoint_old = env.endpoints.create_start( - branch_name="test_timetravel", endpoint_id=f"ep-old_lsn_{i}", lsn=lsn + branch_name="main", endpoint_id=f"ep-old_lsn_{i}", lsn=lsn ) with endpoint_old.cursor() as cur: assert query_scalar(cur, f"select count(*) from testtab where iteration={i}") == 100000 diff --git a/test_runner/regress/test_subxacts.py b/test_runner/regress/test_subxacts.py index 10cb00c780..82075bd723 
100644 --- a/test_runner/regress/test_subxacts.py +++ b/test_runner/regress/test_subxacts.py @@ -9,8 +9,7 @@ from fixtures.neon_fixtures import NeonEnv, check_restored_datadir_content # CLOG. def test_subxacts(neon_simple_env: NeonEnv, test_output_dir): env = neon_simple_env - env.neon_cli.create_branch("test_subxacts", "empty") - endpoint = env.endpoints.create_start("test_subxacts") + endpoint = env.endpoints.create_start("main") pg_conn = endpoint.connect() cur = pg_conn.cursor() diff --git a/test_runner/regress/test_timeline_delete.py b/test_runner/regress/test_timeline_delete.py index 328131cd08..711fcd5016 100644 --- a/test_runner/regress/test_timeline_delete.py +++ b/test_runner/regress/test_timeline_delete.py @@ -68,10 +68,13 @@ def test_timeline_delete(neon_simple_env: NeonEnv): # construct pair of branches to validate that pageserver prohibits # deletion of ancestor timelines when they have child branches - parent_timeline_id = env.neon_cli.create_branch("test_ancestor_branch_delete_parent", "empty") + parent_timeline_id = env.neon_cli.create_branch( + new_branch_name="test_ancestor_branch_delete_parent", ancestor_branch_name="main" + ) leaf_timeline_id = env.neon_cli.create_branch( - "test_ancestor_branch_delete_branch1", "test_ancestor_branch_delete_parent" + new_branch_name="test_ancestor_branch_delete_branch1", + ancestor_branch_name="test_ancestor_branch_delete_parent", ) timeline_path = env.pageserver.timeline_dir(env.initial_tenant, parent_timeline_id) diff --git a/test_runner/regress/test_timeline_size.py b/test_runner/regress/test_timeline_size.py index 9bf5f8680b..f2265dd3d9 100644 --- a/test_runner/regress/test_timeline_size.py +++ b/test_runner/regress/test_timeline_size.py @@ -36,7 +36,7 @@ from fixtures.utils import get_timeline_dir_size, wait_until def test_timeline_size(neon_simple_env: NeonEnv): env = neon_simple_env - new_timeline_id = env.neon_cli.create_branch("test_timeline_size", "empty") + new_timeline_id = env.neon_cli.create_branch("test_timeline_size", "main") client = env.pageserver.http_client() client.timeline_wait_logical_size(env.initial_tenant, new_timeline_id) @@ -68,7 +68,7 @@ def test_timeline_size(neon_simple_env: NeonEnv): def test_timeline_size_createdropdb(neon_simple_env: NeonEnv): env = neon_simple_env - new_timeline_id = env.neon_cli.create_branch("test_timeline_size_createdropdb", "empty") + new_timeline_id = env.neon_cli.create_branch("test_timeline_size_createdropdb", "main") client = env.pageserver.http_client() client.timeline_wait_logical_size(env.initial_tenant, new_timeline_id) diff --git a/test_runner/regress/test_twophase.py b/test_runner/regress/test_twophase.py index dd76689008..ea900b07b8 100644 --- a/test_runner/regress/test_twophase.py +++ b/test_runner/regress/test_twophase.py @@ -9,10 +9,7 @@ from fixtures.neon_fixtures import NeonEnv, fork_at_current_lsn # def test_twophase(neon_simple_env: NeonEnv): env = neon_simple_env - env.neon_cli.create_branch("test_twophase", "empty") - endpoint = env.endpoints.create_start( - "test_twophase", config_lines=["max_prepared_transactions=5"] - ) + endpoint = env.endpoints.create_start("main", config_lines=["max_prepared_transactions=5"]) conn = endpoint.connect() cur = conn.cursor() @@ -56,7 +53,7 @@ def test_twophase(neon_simple_env: NeonEnv): assert len(twophase_files) == 2 # Create a branch with the transaction in prepared state - fork_at_current_lsn(env, endpoint, "test_twophase_prepared", "test_twophase") + fork_at_current_lsn(env, endpoint, "test_twophase_prepared", "main") # 
Start compute on the new branch endpoint2 = env.endpoints.create_start( diff --git a/test_runner/regress/test_unlogged.py b/test_runner/regress/test_unlogged.py index 137d28b9fa..deba29536c 100644 --- a/test_runner/regress/test_unlogged.py +++ b/test_runner/regress/test_unlogged.py @@ -9,8 +9,7 @@ from fixtures.pg_version import PgVersion # def test_unlogged(neon_simple_env: NeonEnv): env = neon_simple_env - env.neon_cli.create_branch("test_unlogged", "empty") - endpoint = env.endpoints.create_start("test_unlogged") + endpoint = env.endpoints.create_start("main") conn = endpoint.connect() cur = conn.cursor() @@ -22,7 +21,7 @@ def test_unlogged(neon_simple_env: NeonEnv): cur.execute("INSERT INTO iut (id) values (42);") # create another compute to fetch inital empty contents from pageserver - fork_at_current_lsn(env, endpoint, "test_unlogged_basebackup", "test_unlogged") + fork_at_current_lsn(env, endpoint, "test_unlogged_basebackup", "main") endpoint2 = env.endpoints.create_start("test_unlogged_basebackup") conn2 = endpoint2.connect() diff --git a/test_runner/regress/test_vm_bits.py b/test_runner/regress/test_vm_bits.py index 7272979c4a..3075211ada 100644 --- a/test_runner/regress/test_vm_bits.py +++ b/test_runner/regress/test_vm_bits.py @@ -13,8 +13,7 @@ from fixtures.utils import query_scalar def test_vm_bit_clear(neon_simple_env: NeonEnv): env = neon_simple_env - env.neon_cli.create_branch("test_vm_bit_clear", "empty") - endpoint = env.endpoints.create_start("test_vm_bit_clear") + endpoint = env.endpoints.create_start("main") pg_conn = endpoint.connect() cur = pg_conn.cursor() @@ -58,7 +57,7 @@ def test_vm_bit_clear(neon_simple_env: NeonEnv): cur.execute("UPDATE vmtest_cold_update2 SET id = 5000, filler=repeat('x', 200) WHERE id = 1") # Branch at this point, to test that later - fork_at_current_lsn(env, endpoint, "test_vm_bit_clear_new", "test_vm_bit_clear") + fork_at_current_lsn(env, endpoint, "test_vm_bit_clear_new", "main") # Clear the buffer cache, to force the VM page to be re-fetched from # the page server diff --git a/test_runner/test_broken.py b/test_runner/test_broken.py index 7e8aef5a5f..d710b53528 100644 --- a/test_runner/test_broken.py +++ b/test_runner/test_broken.py @@ -23,8 +23,7 @@ run_broken = pytest.mark.skipif( def test_broken(neon_simple_env: NeonEnv, pg_bin): env = neon_simple_env - env.neon_cli.create_branch("test_broken", "empty") - env.endpoints.create_start("test_broken") + env.endpoints.create_start("main") log.info("postgres is running") log.info("THIS NEXT COMMAND WILL FAIL:") From e158df4e86318fa3fd5ee9516f3e7ac91dd14283 Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com> Date: Tue, 10 Sep 2024 05:03:27 +0800 Subject: [PATCH 25/29] feat(pageserver): split delta writer automatically determines key range (#8850) close https://github.com/neondatabase/neon/issues/8838 ## Summary of changes This patch modifies the split delta layer writer to avoid taking start_key and end_key when creating/finishing the layer writer. The start_key for the delta layers will be the first key provided to the layer writer, and the end_key would be the `last_key.next()`. This simplifies the delta layer writer API. On that, the layer key hack is removed. Image layers now use the full key range, and delta layers use the first/last key provided by the user. 
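To make the new behavior concrete, here is a minimal, self-contained sketch of a writer that derives its key range from the keys it receives rather than from caller-supplied bounds. The `SketchDeltaWriter` type and its plain `u64` keys are invented for this illustration and are not the actual pageserver types or signatures.

```rust
// Hypothetical stand-in for the split delta layer writer: the key range of the
// produced layer is derived from the data, not passed in by the caller.
struct SketchDeltaWriter {
    first_key: Option<u64>,
    last_key: u64,
}

impl SketchDeltaWriter {
    fn new() -> Self {
        Self { first_key: None, last_key: 0 }
    }

    // The first key written becomes the start of the produced layer's range.
    fn put(&mut self, key: u64) {
        self.first_key.get_or_insert(key);
        self.last_key = key;
    }

    // On finish, the end of the range is one past the last key written,
    // mirroring the `last_key.next()` rule described above. Returns None if
    // nothing was written, so no empty layer is produced.
    fn finish(self) -> Option<std::ops::Range<u64>> {
        self.first_key.map(|start| start..self.last_key + 1)
    }
}

fn main() {
    let mut writer = SketchDeltaWriter::new();
    for key in [3u64, 5, 9] {
        writer.put(key);
    }
    // The range 3..10 comes entirely from the keys that were written.
    assert_eq!(writer.finish(), Some(3..10));
}
```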
--------- Signed-off-by: Alex Chi Z --- libs/pageserver_api/src/key.rs | 9 - pageserver/src/tenant.rs | 6 +- .../src/tenant/storage_layer/split_writer.rs | 158 ++++++++++++------ pageserver/src/tenant/timeline/compaction.rs | 7 +- 4 files changed, 109 insertions(+), 71 deletions(-) diff --git a/libs/pageserver_api/src/key.rs b/libs/pageserver_api/src/key.rs index 77d744e4da..8929ccb41d 100644 --- a/libs/pageserver_api/src/key.rs +++ b/libs/pageserver_api/src/key.rs @@ -263,15 +263,6 @@ impl Key { field5: u8::MAX, field6: u32::MAX, }; - /// A key slightly smaller than [`Key::MAX`] for use in layer key ranges to avoid them to be confused with L0 layers - pub const NON_L0_MAX: Key = Key { - field1: u8::MAX, - field2: u32::MAX, - field3: u32::MAX, - field4: u32::MAX, - field5: u8::MAX, - field6: u32::MAX - 1, - }; pub fn from_hex(s: &str) -> Result { if s.len() != 36 { diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index fd2520a42e..c6f0e48101 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -7091,13 +7091,13 @@ mod tests { vec![ // Image layer at GC horizon PersistentLayerKey { - key_range: Key::MIN..Key::NON_L0_MAX, + key_range: Key::MIN..Key::MAX, lsn_range: Lsn(0x30)..Lsn(0x31), is_delta: false }, - // The delta layer covers the full range (with the layer key hack to avoid being recognized as L0) + // The delta layer below the horizon PersistentLayerKey { - key_range: Key::MIN..Key::NON_L0_MAX, + key_range: get_key(3)..get_key(4), lsn_range: Lsn(0x30)..Lsn(0x48), is_delta: true }, diff --git a/pageserver/src/tenant/storage_layer/split_writer.rs b/pageserver/src/tenant/storage_layer/split_writer.rs index 7c1ac863bf..40a6a77a50 100644 --- a/pageserver/src/tenant/storage_layer/split_writer.rs +++ b/pageserver/src/tenant/storage_layer/split_writer.rs @@ -188,7 +188,7 @@ impl SplitImageLayerWriter { .await } - /// When split writer fails, the caller should call this function and handle partially generated layers. + /// This function will be deprecated with #8841. pub(crate) fn take(self) -> anyhow::Result<(Vec, ImageLayerWriter)> { Ok((self.generated_layers, self.inner)) } @@ -204,7 +204,7 @@ impl SplitImageLayerWriter { /// will split them into multiple files based on size. #[must_use] pub struct SplitDeltaLayerWriter { - inner: DeltaLayerWriter, + inner: Option<(Key, DeltaLayerWriter)>, target_layer_size: u64, generated_layers: Vec, conf: &'static PageServerConf, @@ -212,7 +212,6 @@ pub struct SplitDeltaLayerWriter { tenant_shard_id: TenantShardId, lsn_range: Range, last_key_written: Key, - start_key: Key, } impl SplitDeltaLayerWriter { @@ -220,29 +219,18 @@ impl SplitDeltaLayerWriter { conf: &'static PageServerConf, timeline_id: TimelineId, tenant_shard_id: TenantShardId, - start_key: Key, lsn_range: Range, target_layer_size: u64, - ctx: &RequestContext, ) -> anyhow::Result { Ok(Self { target_layer_size, - inner: DeltaLayerWriter::new( - conf, - timeline_id, - tenant_shard_id, - start_key, - lsn_range.clone(), - ctx, - ) - .await?, + inner: None, generated_layers: Vec::new(), conf, timeline_id, tenant_shard_id, lsn_range, last_key_written: Key::MIN, - start_key, }) } @@ -265,9 +253,26 @@ impl SplitDeltaLayerWriter { // // Also, keep all updates of a single key in a single file. TODO: split them using the legacy compaction // strategy. 
https://github.com/neondatabase/neon/issues/8837 + + if self.inner.is_none() { + self.inner = Some(( + key, + DeltaLayerWriter::new( + self.conf, + self.timeline_id, + self.tenant_shard_id, + key, + self.lsn_range.clone(), + ctx, + ) + .await?, + )); + } + let (_, inner) = self.inner.as_mut().unwrap(); + let addition_size_estimation = KEY_SIZE as u64 + 8 /* LSN u64 size */ + 80 /* value size estimation */; - if self.inner.num_keys() >= 1 - && self.inner.estimated_size() + addition_size_estimation >= self.target_layer_size + if inner.num_keys() >= 1 + && inner.estimated_size() + addition_size_estimation >= self.target_layer_size { if key != self.last_key_written { let next_delta_writer = DeltaLayerWriter::new( @@ -279,13 +284,13 @@ impl SplitDeltaLayerWriter { ctx, ) .await?; - let prev_delta_writer = std::mem::replace(&mut self.inner, next_delta_writer); + let (start_key, prev_delta_writer) = + std::mem::replace(&mut self.inner, Some((key, next_delta_writer))).unwrap(); let layer_key = PersistentLayerKey { - key_range: self.start_key..key, + key_range: start_key..key, lsn_range: self.lsn_range.clone(), is_delta: true, }; - self.start_key = key; if discard(&layer_key).await { drop(prev_delta_writer); self.generated_layers @@ -296,17 +301,18 @@ impl SplitDeltaLayerWriter { self.generated_layers .push(SplitWriterResult::Produced(delta_layer)); } - } else if self.inner.estimated_size() >= S3_UPLOAD_LIMIT { + } else if inner.estimated_size() >= S3_UPLOAD_LIMIT { // We have to produce a very large file b/c a key is updated too often. anyhow::bail!( "a single key is updated too often: key={}, estimated_size={}, and the layer file cannot be produced", key, - self.inner.estimated_size() + inner.estimated_size() ); } } self.last_key_written = key; - self.inner.put_value(key, lsn, val, ctx).await + let (_, inner) = self.inner.as_mut().unwrap(); + inner.put_value(key, lsn, val, ctx).await } pub async fn put_value( @@ -325,7 +331,6 @@ impl SplitDeltaLayerWriter { self, tline: &Arc, ctx: &RequestContext, - end_key: Key, discard: D, ) -> anyhow::Result> where @@ -337,11 +342,15 @@ impl SplitDeltaLayerWriter { inner, .. } = self; + let Some((start_key, inner)) = inner else { + return Ok(generated_layers); + }; if inner.num_keys() == 0 { return Ok(generated_layers); } + let end_key = self.last_key_written.next(); let layer_key = PersistentLayerKey { - key_range: self.start_key..end_key, + key_range: start_key..end_key, lsn_range: self.lsn_range.clone(), is_delta: true, }; @@ -360,15 +369,14 @@ impl SplitDeltaLayerWriter { self, tline: &Arc, ctx: &RequestContext, - end_key: Key, ) -> anyhow::Result> { - self.finish_with_discard_fn(tline, ctx, end_key, |_| async { false }) + self.finish_with_discard_fn(tline, ctx, |_| async { false }) .await } - /// When split writer fails, the caller should call this function and handle partially generated layers. - pub(crate) fn take(self) -> anyhow::Result<(Vec, DeltaLayerWriter)> { - Ok((self.generated_layers, self.inner)) + /// This function will be deprecated with #8841. 
+ pub(crate) fn take(self) -> anyhow::Result<(Vec, Option)> { + Ok((self.generated_layers, self.inner.map(|x| x.1))) } } @@ -432,10 +440,8 @@ mod tests { tenant.conf, tline.timeline_id, tenant.tenant_shard_id, - get_key(0), Lsn(0x18)..Lsn(0x20), 4 * 1024 * 1024, - &ctx, ) .await .unwrap(); @@ -460,11 +466,22 @@ mod tests { ) .await .unwrap(); - let layers = delta_writer - .finish(&tline, &ctx, get_key(10)) - .await - .unwrap(); + let layers = delta_writer.finish(&tline, &ctx).await.unwrap(); assert_eq!(layers.len(), 1); + assert_eq!( + layers + .into_iter() + .next() + .unwrap() + .into_resident_layer() + .layer_desc() + .key(), + PersistentLayerKey { + key_range: get_key(0)..get_key(1), + lsn_range: Lsn(0x18)..Lsn(0x20), + is_delta: true + } + ); } #[tokio::test] @@ -501,10 +518,8 @@ mod tests { tenant.conf, tline.timeline_id, tenant.tenant_shard_id, - get_key(0), Lsn(0x18)..Lsn(0x20), 4 * 1024 * 1024, - &ctx, ) .await .unwrap(); @@ -533,10 +548,7 @@ mod tests { .finish(&tline, &ctx, get_key(N as u32)) .await .unwrap(); - let delta_layers = delta_writer - .finish(&tline, &ctx, get_key(N as u32)) - .await - .unwrap(); + let delta_layers = delta_writer.finish(&tline, &ctx).await.unwrap(); if discard { for layer in image_layers { layer.into_discarded_layer(); @@ -555,6 +567,14 @@ mod tests { .collect_vec(); assert_eq!(image_layers.len(), N / 512 + 1); assert_eq!(delta_layers.len(), N / 512 + 1); + assert_eq!( + delta_layers.first().unwrap().layer_desc().key_range.start, + get_key(0) + ); + assert_eq!( + delta_layers.last().unwrap().layer_desc().key_range.end, + get_key(N as u32) + ); for idx in 0..image_layers.len() { assert_ne!(image_layers[idx].layer_desc().key_range.start, Key::MIN); assert_ne!(image_layers[idx].layer_desc().key_range.end, Key::MAX); @@ -602,10 +622,8 @@ mod tests { tenant.conf, tline.timeline_id, tenant.tenant_shard_id, - get_key(0), Lsn(0x18)..Lsn(0x20), 4 * 1024, - &ctx, ) .await .unwrap(); @@ -644,11 +662,35 @@ mod tests { ) .await .unwrap(); - let layers = delta_writer - .finish(&tline, &ctx, get_key(10)) - .await - .unwrap(); + let layers = delta_writer.finish(&tline, &ctx).await.unwrap(); assert_eq!(layers.len(), 2); + let mut layers_iter = layers.into_iter(); + assert_eq!( + layers_iter + .next() + .unwrap() + .into_resident_layer() + .layer_desc() + .key(), + PersistentLayerKey { + key_range: get_key(0)..get_key(1), + lsn_range: Lsn(0x18)..Lsn(0x20), + is_delta: true + } + ); + assert_eq!( + layers_iter + .next() + .unwrap() + .into_resident_layer() + .layer_desc() + .key(), + PersistentLayerKey { + key_range: get_key(1)..get_key(2), + lsn_range: Lsn(0x18)..Lsn(0x20), + is_delta: true + } + ); } #[tokio::test] @@ -668,10 +710,8 @@ mod tests { tenant.conf, tline.timeline_id, tenant.tenant_shard_id, - get_key(0), Lsn(0x10)..Lsn(N as u64 * 16 + 0x10), 4 * 1024 * 1024, - &ctx, ) .await .unwrap(); @@ -689,10 +729,20 @@ mod tests { .await .unwrap(); } - let delta_layers = delta_writer - .finish(&tline, &ctx, get_key(N as u32)) - .await - .unwrap(); + let delta_layers = delta_writer.finish(&tline, &ctx).await.unwrap(); assert_eq!(delta_layers.len(), 1); + let delta_layer = delta_layers + .into_iter() + .next() + .unwrap() + .into_resident_layer(); + assert_eq!( + delta_layer.layer_desc().key(), + PersistentLayerKey { + key_range: get_key(0)..get_key(1), + lsn_range: Lsn(0x10)..Lsn(N as u64 * 16 + 0x10), + is_delta: true + } + ); } } diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index a87b502cd6..0b5c520ba7 
100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -1809,7 +1809,6 @@ impl Timeline { .unwrap(); // We don't want any of the produced layers to cover the full key range (i.e., MIN..MAX) b/c it will then be recognized // as an L0 layer. - let hack_end_key = Key::NON_L0_MAX; let mut delta_layers = Vec::new(); let mut image_layers = Vec::new(); let mut downloaded_layers = Vec::new(); @@ -1855,10 +1854,8 @@ impl Timeline { self.conf, self.timeline_id, self.tenant_shard_id, - Key::MIN, lowest_retain_lsn..end_lsn, self.get_compaction_target_size(), - ctx, ) .await?; @@ -1965,7 +1962,7 @@ impl Timeline { let produced_image_layers = if let Some(writer) = image_layer_writer { if !dry_run { writer - .finish_with_discard_fn(self, ctx, hack_end_key, discard) + .finish_with_discard_fn(self, ctx, Key::MAX, discard) .await? } else { let (layers, _) = writer.take()?; @@ -1978,7 +1975,7 @@ impl Timeline { let produced_delta_layers = if !dry_run { delta_layer_writer - .finish_with_discard_fn(self, ctx, hack_end_key, discard) + .finish_with_discard_fn(self, ctx, discard) .await? } else { let (layers, _) = delta_layer_writer.take()?; From 982b376ea2e42d45f70c625ec91ac513f9f3a661 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Tue, 10 Sep 2024 00:04:00 +0300 Subject: [PATCH 26/29] Update parquet crate to a released version (#8961) PR #7782 set the dependency in Cargo.toml to 'master', and locked the version to commit that contained a specific fix, because we needed the fix before it was included in a versioned release. The fix was later included in parquet crate version 52.0.0, so we can now switch back to using a released version. The latest release is 53.0.0, switch straight to that. --------- Co-authored-by: Conrad Ludgate --- Cargo.lock | 10 +++++--- Cargo.toml | 8 ++---- proxy/src/context/parquet.rs | 48 ++++++++++++++++++------------------ workspace_hack/Cargo.toml | 4 +-- 4 files changed, 34 insertions(+), 36 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 4fb3ac7223..3ca6acbc3e 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3907,8 +3907,9 @@ dependencies = [ [[package]] name = "parquet" -version = "51.0.0" -source = "git+https://github.com/apache/arrow-rs?branch=master#2534976a564be3d2d56312dc88fb1b6ed4cef829" +version = "53.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f0fbf928021131daaa57d334ca8e3904fe9ae22f73c56244fc7db9b04eedc3d8" dependencies = [ "ahash", "bytes", @@ -3927,8 +3928,9 @@ dependencies = [ [[package]] name = "parquet_derive" -version = "51.0.0" -source = "git+https://github.com/apache/arrow-rs?branch=master#2534976a564be3d2d56312dc88fb1b6ed4cef829" +version = "53.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "86e9fcfae007533a06b580429a3f7e07cb833ec8aa37c041c16563e7918f057e" dependencies = [ "parquet", "proc-macro2", diff --git a/Cargo.toml b/Cargo.toml index 40e399619d..fd1d4e016c 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -123,8 +123,8 @@ opentelemetry = "0.20.0" opentelemetry-otlp = { version = "0.13.0", default-features=false, features = ["http-proto", "trace", "http", "reqwest-client"] } opentelemetry-semantic-conventions = "0.12.0" parking_lot = "0.12" -parquet = { version = "51.0.0", default-features = false, features = ["zstd"] } -parquet_derive = "51.0.0" +parquet = { version = "53", default-features = false, features = ["zstd"] } +parquet_derive = "53" pbkdf2 = { version = "0.12.1", features = ["simple", "std"] } 
pin-project-lite = "0.2" procfs = "0.16" @@ -254,10 +254,6 @@ tonic-build = "0.9" # Needed to get `tokio-postgres-rustls` to depend on our fork. tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev = "20031d7a9ee1addeae6e0968e3899ae6bf01cee2" } -# bug fixes for UUID -parquet = { git = "https://github.com/apache/arrow-rs", branch = "master" } -parquet_derive = { git = "https://github.com/apache/arrow-rs", branch = "master" } - ################# Binary contents sections [profile.release] diff --git a/proxy/src/context/parquet.rs b/proxy/src/context/parquet.rs index c6f83fd069..fafea2a08f 100644 --- a/proxy/src/context/parquet.rs +++ b/proxy/src/context/parquet.rs @@ -598,15 +598,15 @@ mod tests { assert_eq!( file_stats, [ - (1315874, 3, 6000), - (1315867, 3, 6000), - (1315927, 3, 6000), - (1315884, 3, 6000), - (1316014, 3, 6000), - (1315856, 3, 6000), - (1315648, 3, 6000), - (1315884, 3, 6000), - (438913, 1, 2000) + (1312632, 3, 6000), + (1312621, 3, 6000), + (1312680, 3, 6000), + (1312637, 3, 6000), + (1312773, 3, 6000), + (1312610, 3, 6000), + (1312404, 3, 6000), + (1312639, 3, 6000), + (437848, 1, 2000) ] ); @@ -638,11 +638,11 @@ mod tests { assert_eq!( file_stats, [ - (1208861, 5, 10000), - (1208592, 5, 10000), - (1208885, 5, 10000), - (1208873, 5, 10000), - (1209128, 5, 10000) + (1203465, 5, 10000), + (1203189, 5, 10000), + (1203490, 5, 10000), + (1203475, 5, 10000), + (1203729, 5, 10000) ] ); @@ -667,15 +667,15 @@ mod tests { assert_eq!( file_stats, [ - (1315874, 3, 6000), - (1315867, 3, 6000), - (1315927, 3, 6000), - (1315884, 3, 6000), - (1316014, 3, 6000), - (1315856, 3, 6000), - (1315648, 3, 6000), - (1315884, 3, 6000), - (438913, 1, 2000) + (1312632, 3, 6000), + (1312621, 3, 6000), + (1312680, 3, 6000), + (1312637, 3, 6000), + (1312773, 3, 6000), + (1312610, 3, 6000), + (1312404, 3, 6000), + (1312639, 3, 6000), + (437848, 1, 2000) ] ); @@ -712,7 +712,7 @@ mod tests { // files are smaller than the size threshold, but they took too long to fill so were flushed early assert_eq!( file_stats, - [(659836, 2, 3001), (659550, 2, 3000), (659346, 2, 2999)] + [(657696, 2, 3001), (657410, 2, 3000), (657206, 2, 2999)] ); tmpdir.close().unwrap(); diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml index 411ca81032..140c43639e 100644 --- a/workspace_hack/Cargo.toml +++ b/workspace_hack/Cargo.toml @@ -60,7 +60,7 @@ num-bigint = { version = "0.4" } num-integer = { version = "0.1", features = ["i128"] } num-traits = { version = "0.2", features = ["i128", "libm"] } once_cell = { version = "1" } -parquet = { git = "https://github.com/apache/arrow-rs", branch = "master", default-features = false, features = ["zstd"] } +parquet = { version = "53", default-features = false, features = ["zstd"] } prost = { version = "0.11" } rand = { version = "0.8", features = ["small_rng"] } regex = { version = "1" } @@ -116,7 +116,7 @@ num-bigint = { version = "0.4" } num-integer = { version = "0.1", features = ["i128"] } num-traits = { version = "0.2", features = ["i128", "libm"] } once_cell = { version = "1" } -parquet = { git = "https://github.com/apache/arrow-rs", branch = "master", default-features = false, features = ["zstd"] } +parquet = { version = "53", default-features = false, features = ["zstd"] } proc-macro2 = { version = "1" } prost = { version = "0.11" } quote = { version = "1" } From 842be0ba74c4c6e4245c29c3fffae4401d282c4a Mon Sep 17 00:00:00 2001 From: Matthias van de Meent Date: Tue, 10 Sep 2024 00:01:52 +0200 Subject: [PATCH 27/29] Specialize WalIngest on 
PostgreSQL version (#8904) The current code assumes that most of this functionality is version-independent, which is only true up to v16 - PostgreSQL 17 has a new field in CheckPoint that we need to keep track of. This basically removes the file-level dependency on v14, and replaces it with switches that load the correct version dependencies where required. --- libs/postgres_ffi/build.rs | 1 + libs/postgres_ffi/src/lib.rs | 104 ++++++++++++++++ pageserver/src/walingest.rs | 235 +++++++++++++++++++++-------------- 3 files changed, 249 insertions(+), 91 deletions(-) diff --git a/libs/postgres_ffi/build.rs b/libs/postgres_ffi/build.rs index d3e3ce648f..a346390f3d 100644 --- a/libs/postgres_ffi/build.rs +++ b/libs/postgres_ffi/build.rs @@ -121,6 +121,7 @@ fn main() -> anyhow::Result<()> { .allowlist_type("XLogPageHeaderData") .allowlist_type("XLogLongPageHeaderData") .allowlist_var("XLOG_PAGE_MAGIC") + .allowlist_var("PG_MAJORVERSION_NUM") .allowlist_var("PG_CONTROL_FILE_SIZE") .allowlist_var("PG_CONTROLFILEDATA_OFFSETOF_CRC") .allowlist_type("PageHeaderData") diff --git a/libs/postgres_ffi/src/lib.rs b/libs/postgres_ffi/src/lib.rs index 9acb105e9b..f18e0c603b 100644 --- a/libs/postgres_ffi/src/lib.rs +++ b/libs/postgres_ffi/src/lib.rs @@ -44,6 +44,9 @@ macro_rules! postgres_ffi { // Re-export some symbols from bindings pub use bindings::DBState_DB_SHUTDOWNED; pub use bindings::{CheckPoint, ControlFileData, XLogRecord}; + + pub const ZERO_CHECKPOINT: bytes::Bytes = + bytes::Bytes::from_static(&[0u8; xlog_utils::SIZEOF_CHECKPOINT]); } }; } @@ -106,6 +109,107 @@ macro_rules! dispatch_pgversion { }; } +#[macro_export] +macro_rules! enum_pgversion_dispatch { + ($name:expr, $typ:ident, $bind:ident, $code:block) => { + enum_pgversion_dispatch!( + name = $name, + bind = $bind, + typ = $typ, + code = $code, + pgversions = [ + V14 : v14, + V15 : v15, + V16 : v16, + ] + ) + }; + (name = $name:expr, + bind = $bind:ident, + typ = $typ:ident, + code = $code:block, + pgversions = [$($variant:ident : $md:ident),+ $(,)?]) => { + match $name { + $( + self::$typ::$variant($bind) => { + use $crate::$md as pgv; + $code + } + ),+, + } + }; +} + +#[macro_export] +macro_rules! 
enum_pgversion { + {$name:ident, pgv :: $t:ident} => { + enum_pgversion!{ + name = $name, + typ = $t, + pgversions = [ + V14 : v14, + V15 : v15, + V16 : v16, + ] + } + }; + {$name:ident, pgv :: $p:ident :: $t:ident} => { + enum_pgversion!{ + name = $name, + path = $p, + typ = $t, + pgversions = [ + V14 : v14, + V15 : v15, + V16 : v16, + ] + } + }; + {name = $name:ident, + typ = $t:ident, + pgversions = [$($variant:ident : $md:ident),+ $(,)?]} => { + pub enum $name { + $($variant ( $crate::$md::$t )),+ + } + impl self::$name { + pub fn pg_version(&self) -> u32 { + enum_pgversion_dispatch!(self, $name, _ign, { + pgv::bindings::PG_MAJORVERSION_NUM + }) + } + } + $( + impl Into for $crate::$md::$t { + fn into(self) -> self::$name { + self::$name::$variant (self) + } + } + )+ + }; + {name = $name:ident, + path = $p:ident, + typ = $t:ident, + pgversions = [$($variant:ident : $md:ident),+ $(,)?]} => { + pub enum $name { + $($variant ($crate::$md::$p::$t)),+ + } + impl $name { + pub fn pg_version(&self) -> u32 { + enum_pgversion_dispatch!(self, $name, _ign, { + pgv::bindings::PG_MAJORVERSION_NUM + }) + } + } + $( + impl Into<$name> for $crate::$md::$p::$t { + fn into(self) -> $name { + $name::$variant (self) + } + } + )+ + }; +} + pub mod pg_constants; pub mod relfile_utils; diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs index 2d3841881b..39bc9e385f 100644 --- a/pageserver/src/walingest.rs +++ b/pageserver/src/walingest.rs @@ -25,9 +25,7 @@ use std::time::Duration; use std::time::SystemTime; use pageserver_api::shard::ShardIdentity; -use postgres_ffi::v14::nonrelfile_utils::clogpage_precedes; -use postgres_ffi::v14::nonrelfile_utils::slru_may_delete_clogsegment; -use postgres_ffi::TimestampTz; +use postgres_ffi::{dispatch_pgversion, enum_pgversion, enum_pgversion_dispatch, TimestampTz}; use postgres_ffi::{fsm_logical_to_physical, page_is_new, page_set_lsn}; use anyhow::{bail, Context, Result}; @@ -48,16 +46,31 @@ use pageserver_api::key::rel_block_to_key; use pageserver_api::reltag::{BlockNumber, RelTag, SlruKind}; use postgres_ffi::pg_constants; use postgres_ffi::relfile_utils::{FSM_FORKNUM, INIT_FORKNUM, MAIN_FORKNUM, VISIBILITYMAP_FORKNUM}; -use postgres_ffi::v14::nonrelfile_utils::mx_offset_to_member_segment; -use postgres_ffi::v14::xlog_utils::*; -use postgres_ffi::v14::CheckPoint; use postgres_ffi::TransactionId; use postgres_ffi::BLCKSZ; +use utils::bin_ser::SerializeError; use utils::lsn::Lsn; +enum_pgversion! {CheckPoint, pgv::CheckPoint} + +impl CheckPoint { + fn encode(&self) -> Result { + enum_pgversion_dispatch!(self, CheckPoint, cp, { cp.encode() }) + } + + fn update_next_xid(&mut self, xid: u32) -> bool { + enum_pgversion_dispatch!(self, CheckPoint, cp, { cp.update_next_xid(xid) }) + } + + pub fn update_next_multixid(&mut self, multi_xid: u32, multi_offset: u32) -> bool { + enum_pgversion_dispatch!(self, CheckPoint, cp, { + cp.update_next_multixid(multi_xid, multi_offset) + }) + } +} + pub struct WalIngest { shard: ShardIdentity, - pg_version: u32, checkpoint: CheckPoint, checkpoint_modified: bool, warn_ingest_lag: WarnIngestLag, @@ -78,12 +91,16 @@ impl WalIngest { // Fetch the latest checkpoint into memory, so that we can compare with it // quickly in `ingest_record` and update it when it changes. 
let checkpoint_bytes = timeline.get_checkpoint(startpoint, ctx).await?; - let checkpoint = CheckPoint::decode(&checkpoint_bytes)?; - trace!("CheckPoint.nextXid = {}", checkpoint.nextXid.value); + let pgversion = timeline.pg_version; + + let checkpoint = dispatch_pgversion!(pgversion, { + let checkpoint = pgv::CheckPoint::decode(&checkpoint_bytes)?; + trace!("CheckPoint.nextXid = {}", checkpoint.nextXid.value); + >::into(checkpoint) + }); Ok(WalIngest { shard: *timeline.get_shard_identity(), - pg_version: timeline.pg_version, checkpoint, checkpoint_modified: false, warn_ingest_lag: WarnIngestLag { @@ -117,7 +134,7 @@ impl WalIngest { modification.set_lsn(lsn)?; - if decoded.is_dbase_create_copy(self.pg_version) { + if decoded.is_dbase_create_copy(pg_version) { // Records of this type should always be preceded by a commit(), as they // rely on reading data pages back from the Timeline. assert!(!modification.has_dirty_data_pages()); @@ -337,70 +354,67 @@ impl WalIngest { pg_constants::RM_XLOG_ID => { let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK; - if info == pg_constants::XLOG_NEXTOID { - let next_oid = buf.get_u32_le(); - if self.checkpoint.nextOid != next_oid { - self.checkpoint.nextOid = next_oid; + enum_pgversion_dispatch!(&mut self.checkpoint, CheckPoint, cp, { + if info == pg_constants::XLOG_NEXTOID { + let next_oid = buf.get_u32_le(); + if cp.nextOid != next_oid { + cp.nextOid = next_oid; + self.checkpoint_modified = true; + } + } else if info == pg_constants::XLOG_CHECKPOINT_ONLINE + || info == pg_constants::XLOG_CHECKPOINT_SHUTDOWN + { + let mut checkpoint_bytes = [0u8; pgv::xlog_utils::SIZEOF_CHECKPOINT]; + buf.copy_to_slice(&mut checkpoint_bytes); + let xlog_checkpoint = pgv::CheckPoint::decode(&checkpoint_bytes)?; + trace!( + "xlog_checkpoint.oldestXid={}, checkpoint.oldestXid={}", + xlog_checkpoint.oldestXid, + cp.oldestXid + ); + if (cp.oldestXid.wrapping_sub(xlog_checkpoint.oldestXid) as i32) < 0 { + cp.oldestXid = xlog_checkpoint.oldestXid; + } + trace!( + "xlog_checkpoint.oldestActiveXid={}, checkpoint.oldestActiveXid={}", + xlog_checkpoint.oldestActiveXid, + cp.oldestActiveXid + ); + + // A shutdown checkpoint has `oldestActiveXid == InvalidTransactionid`, + // because at shutdown, all in-progress transactions will implicitly + // end. Postgres startup code knows that, and allows hot standby to start + // immediately from a shutdown checkpoint. + // + // In Neon, Postgres hot standby startup always behaves as if starting from + // an online checkpoint. It needs a valid `oldestActiveXid` value, so + // instead of overwriting self.checkpoint.oldestActiveXid with + // InvalidTransactionid from the checkpoint WAL record, update it to a + // proper value, knowing that there are no in-progress transactions at this + // point, except for prepared transactions. + // + // See also the neon code changes in the InitWalRecovery() function. + if xlog_checkpoint.oldestActiveXid == pg_constants::INVALID_TRANSACTION_ID + && info == pg_constants::XLOG_CHECKPOINT_SHUTDOWN + { + let mut oldest_active_xid = cp.nextXid.value as u32; + for xid in modification.tline.list_twophase_files(lsn, ctx).await? { + if (xid.wrapping_sub(oldest_active_xid) as i32) < 0 { + oldest_active_xid = xid; + } + } + cp.oldestActiveXid = oldest_active_xid; + } else { + cp.oldestActiveXid = xlog_checkpoint.oldestActiveXid; + } + + // Write a new checkpoint key-value pair on every checkpoint record, even + // if nothing really changed. 
Not strictly required, but it seems nice to + // have some trace of the checkpoint records in the layer files at the same + // LSNs. self.checkpoint_modified = true; } - } else if info == pg_constants::XLOG_CHECKPOINT_ONLINE - || info == pg_constants::XLOG_CHECKPOINT_SHUTDOWN - { - let mut checkpoint_bytes = [0u8; SIZEOF_CHECKPOINT]; - buf.copy_to_slice(&mut checkpoint_bytes); - let xlog_checkpoint = CheckPoint::decode(&checkpoint_bytes)?; - trace!( - "xlog_checkpoint.oldestXid={}, checkpoint.oldestXid={}", - xlog_checkpoint.oldestXid, - self.checkpoint.oldestXid - ); - if (self - .checkpoint - .oldestXid - .wrapping_sub(xlog_checkpoint.oldestXid) as i32) - < 0 - { - self.checkpoint.oldestXid = xlog_checkpoint.oldestXid; - } - trace!( - "xlog_checkpoint.oldestActiveXid={}, checkpoint.oldestActiveXid={}", - xlog_checkpoint.oldestActiveXid, - self.checkpoint.oldestActiveXid - ); - - // A shutdown checkpoint has `oldestActiveXid == InvalidTransactionid`, - // because at shutdown, all in-progress transactions will implicitly - // end. Postgres startup code knows that, and allows hot standby to start - // immediately from a shutdown checkpoint. - // - // In Neon, Postgres hot standby startup always behaves as if starting from - // an online checkpoint. It needs a valid `oldestActiveXid` value, so - // instead of overwriting self.checkpoint.oldestActiveXid with - // InvalidTransactionid from the checkpoint WAL record, update it to a - // proper value, knowing that there are no in-progress transactions at this - // point, except for prepared transactions. - // - // See also the neon code changes in the InitWalRecovery() function. - if xlog_checkpoint.oldestActiveXid == pg_constants::INVALID_TRANSACTION_ID - && info == pg_constants::XLOG_CHECKPOINT_SHUTDOWN - { - let mut oldest_active_xid = self.checkpoint.nextXid.value as u32; - for xid in modification.tline.list_twophase_files(lsn, ctx).await? { - if (xid.wrapping_sub(oldest_active_xid) as i32) < 0 { - oldest_active_xid = xid; - } - } - self.checkpoint.oldestActiveXid = oldest_active_xid; - } else { - self.checkpoint.oldestActiveXid = xlog_checkpoint.oldestActiveXid; - } - - // Write a new checkpoint key-value pair on every checkpoint record, even - // if nothing really changed. Not strictly required, but it seems nice to - // have some trace of the checkpoint records in the layer files at the same - // LSNs. 
- self.checkpoint_modified = true; - } + }); } pg_constants::RM_LOGICALMSG_ID => { let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK; @@ -424,7 +438,11 @@ impl WalIngest { let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK; if info == pg_constants::XLOG_RUNNING_XACTS { let xlrec = crate::walrecord::XlRunningXacts::decode(&mut buf); - self.checkpoint.oldestActiveXid = xlrec.oldest_running_xid; + + enum_pgversion_dispatch!(&mut self.checkpoint, CheckPoint, cp, { + cp.oldestActiveXid = xlrec.oldest_running_xid; + }); + self.checkpoint_modified = true; } } @@ -539,7 +557,7 @@ impl WalIngest { && blk.has_image && decoded.xl_rmid == pg_constants::RM_XLOG_ID && (decoded.xl_info == pg_constants::XLOG_FPI - || decoded.xl_info == pg_constants::XLOG_FPI_FOR_HINT) + || decoded.xl_info == pg_constants::XLOG_FPI_FOR_HINT) // compression of WAL is not yet supported: fall back to storing the original WAL record && !postgres_ffi::bkpimage_is_compressed(blk.bimg_info, modification.tline.pg_version) // do not materialize null pages because them most likely be soon replaced with real data @@ -1242,12 +1260,17 @@ impl WalIngest { fn warn_on_ingest_lag( &mut self, conf: &crate::config::PageServerConf, - wal_timestmap: TimestampTz, + wal_timestamp: TimestampTz, ) { debug_assert_current_span_has_tenant_and_timeline_id(); let now = SystemTime::now(); let rate_limits = &mut self.warn_ingest_lag; - match try_from_pg_timestamp(wal_timestmap) { + + let ts = enum_pgversion_dispatch!(&self.checkpoint, CheckPoint, _cp, { + pgv::xlog_utils::try_from_pg_timestamp(wal_timestamp) + }); + + match ts { Ok(ts) => { match now.duration_since(ts) { Ok(lag) => { @@ -1257,7 +1280,7 @@ impl WalIngest { warn!(%rate_limit_stats, %lag, "ingesting record with timestamp lagging more than wait_lsn_timeout"); }) } - }, + } Err(e) => { let delta_t = e.duration(); // determined by prod victoriametrics query: 1000 * (timestamp(node_time_seconds{neon_service="pageserver"}) - node_time_seconds) @@ -1271,7 +1294,6 @@ impl WalIngest { } } }; - } Err(error) => { rate_limits.timestamp_invalid_msg_ratelimit.call2(|rate_limit_stats| { @@ -1379,14 +1401,17 @@ impl WalIngest { // truncated, but a checkpoint record with the updated values isn't written until // later. In Neon, a server can start at any LSN, not just on a checkpoint record, // so we keep the oldestXid and oldestXidDB up-to-date. - self.checkpoint.oldestXid = xlrec.oldest_xid; - self.checkpoint.oldestXidDB = xlrec.oldest_xid_db; + enum_pgversion_dispatch!(&mut self.checkpoint, CheckPoint, cp, { + cp.oldestXid = xlrec.oldest_xid; + cp.oldestXidDB = xlrec.oldest_xid_db; + }); self.checkpoint_modified = true; // TODO Treat AdvanceOldestClogXid() or write a comment why we don't need it let latest_page_number = - self.checkpoint.nextXid.value as u32 / pg_constants::CLOG_XACTS_PER_PAGE; + enum_pgversion_dispatch!(self.checkpoint, CheckPoint, cp, { cp.nextXid.value }) as u32 + / pg_constants::CLOG_XACTS_PER_PAGE; // Now delete all segments containing pages between xlrec.pageno // and latest_page_number. @@ -1394,7 +1419,9 @@ impl WalIngest { // First, make an important safety check: // the current endpoint page must not be eligible for removal. 
// See SimpleLruTruncate() in slru.c - if clogpage_precedes(latest_page_number, xlrec.pageno) { + if dispatch_pgversion!(modification.tline.pg_version, { + pgv::nonrelfile_utils::clogpage_precedes(latest_page_number, xlrec.pageno) + }) { info!("could not truncate directory pg_xact apparent wraparound"); return Ok(()); } @@ -1411,7 +1438,12 @@ impl WalIngest { .await? { let segpage = segno * pg_constants::SLRU_PAGES_PER_SEGMENT; - if slru_may_delete_clogsegment(segpage, xlrec.pageno) { + + let may_delete = dispatch_pgversion!(modification.tline.pg_version, { + pgv::nonrelfile_utils::slru_may_delete_clogsegment(segpage, xlrec.pageno) + }); + + if may_delete { modification .drop_slru_segment(SlruKind::Clog, segno, ctx) .await?; @@ -1530,14 +1562,23 @@ impl WalIngest { xlrec: &XlMultiXactTruncate, ctx: &RequestContext, ) -> Result<()> { - self.checkpoint.oldestMulti = xlrec.end_trunc_off; - self.checkpoint.oldestMultiDB = xlrec.oldest_multi_db; + let (maxsegment, startsegment, endsegment) = + enum_pgversion_dispatch!(&mut self.checkpoint, CheckPoint, cp, { + cp.oldestMulti = xlrec.end_trunc_off; + cp.oldestMultiDB = xlrec.oldest_multi_db; + let maxsegment: i32 = pgv::nonrelfile_utils::mx_offset_to_member_segment( + pg_constants::MAX_MULTIXACT_OFFSET, + ); + let startsegment: i32 = + pgv::nonrelfile_utils::mx_offset_to_member_segment(xlrec.start_trunc_memb); + let endsegment: i32 = + pgv::nonrelfile_utils::mx_offset_to_member_segment(xlrec.end_trunc_memb); + (maxsegment, startsegment, endsegment) + }); + self.checkpoint_modified = true; // PerformMembersTruncation - let maxsegment: i32 = mx_offset_to_member_segment(pg_constants::MAX_MULTIXACT_OFFSET); - let startsegment: i32 = mx_offset_to_member_segment(xlrec.start_trunc_memb); - let endsegment: i32 = mx_offset_to_member_segment(xlrec.end_trunc_memb); let mut segment: i32 = startsegment; // Delete all the segments except the last one. The last segment can still @@ -1811,11 +1852,23 @@ mod tests { // TODO } - static ZERO_CHECKPOINT: Bytes = Bytes::from_static(&[0u8; SIZEOF_CHECKPOINT]); + #[tokio::test] + async fn test_zeroed_checkpoint_decodes_correctly() -> Result<()> { + for i in 14..=16 { + dispatch_pgversion!(i, { + pgv::CheckPoint::decode(&pgv::ZERO_CHECKPOINT)?; + }); + } + + Ok(()) + } async fn init_walingest_test(tline: &Timeline, ctx: &RequestContext) -> Result { let mut m = tline.begin_modification(Lsn(0x10)); - m.put_checkpoint(ZERO_CHECKPOINT.clone())?; + m.put_checkpoint(dispatch_pgversion!( + tline.pg_version, + pgv::ZERO_CHECKPOINT.clone() + ))?; m.put_relmap_file(0, 111, Bytes::from(""), ctx).await?; // dummy relmapper file m.commit(ctx).await?; let walingest = WalIngest::new(tline, Lsn(0x10), ctx).await?; From 97582178cb576f8b68acc53535adf8918d7dbd94 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Tue, 10 Sep 2024 02:40:00 +0200 Subject: [PATCH 28/29] Remove async_trait from the Handler trait (#8958) Newest attempt to remove `async_trait` from the Handler trait. Earlier attempts were in #7301 and #8296 . 
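For readers unfamiliar with the pattern, the sketch below shows what replacing an `#[async_trait]` method with a return-position `impl Future` looks like in general. The `Handler`/`EchoHandler` names are hypothetical stand-ins rather than the types touched by this patch, and the example assumes Rust 1.75+ plus a tokio runtime for the demo `main`.

```rust
use std::future::Future;

// The trait spells out its async method as `-> impl Future` instead of
// relying on #[async_trait], so no boxed futures are forced on callers.
trait Handler {
    fn process_query(&mut self, query: &str) -> impl Future<Output = Result<(), String>>;
}

struct EchoHandler;

impl Handler for EchoHandler {
    // An `async fn` in the impl satisfies the `-> impl Future` signature
    // declared by the trait (stable since Rust 1.75).
    async fn process_query(&mut self, query: &str) -> Result<(), String> {
        println!("handling: {query}");
        Ok(())
    }
}

#[tokio::main]
async fn main() -> Result<(), String> {
    let mut handler = EchoHandler;
    handler.process_query("SELECT 1").await
}
```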
--- libs/postgres_backend/src/lib.rs | 5 +- libs/postgres_backend/tests/simple_select.rs | 1 - pageserver/src/page_service.rs | 1 - proxy/src/console/mgmt.rs | 2 +- safekeeper/src/handler.rs | 80 ++++++++++---------- 5 files changed, 44 insertions(+), 45 deletions(-) diff --git a/libs/postgres_backend/src/lib.rs b/libs/postgres_backend/src/lib.rs index 600f1d728c..8ea4b93fb1 100644 --- a/libs/postgres_backend/src/lib.rs +++ b/libs/postgres_backend/src/lib.rs @@ -81,17 +81,16 @@ pub fn is_expected_io_error(e: &io::Error) -> bool { ) } -#[async_trait::async_trait] pub trait Handler { /// Handle single query. /// postgres_backend will issue ReadyForQuery after calling this (this /// might be not what we want after CopyData streaming, but currently we don't /// care). It will also flush out the output buffer. - async fn process_query( + fn process_query( &mut self, pgb: &mut PostgresBackend, query_string: &str, - ) -> Result<(), QueryError>; + ) -> impl Future>; /// Called on startup packet receival, allows to process params. /// diff --git a/libs/postgres_backend/tests/simple_select.rs b/libs/postgres_backend/tests/simple_select.rs index 7ec85f0dbe..900083ea7f 100644 --- a/libs/postgres_backend/tests/simple_select.rs +++ b/libs/postgres_backend/tests/simple_select.rs @@ -23,7 +23,6 @@ async fn make_tcp_pair() -> (TcpStream, TcpStream) { struct TestHandler {} -#[async_trait::async_trait] impl Handler for TestHandler { // return single col 'hey' for any query async fn process_query( diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index 39c6a6fb74..9261b7481d 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -1199,7 +1199,6 @@ impl PageServerHandler { } } -#[async_trait::async_trait] impl postgres_backend::Handler for PageServerHandler where IO: AsyncRead + AsyncWrite + Send + Sync + Unpin, diff --git a/proxy/src/console/mgmt.rs b/proxy/src/console/mgmt.rs index 2ed4f5f206..ee5f83ee76 100644 --- a/proxy/src/console/mgmt.rs +++ b/proxy/src/console/mgmt.rs @@ -78,7 +78,7 @@ pub(crate) type ComputeReady = DatabaseInfo; // TODO: replace with an http-based protocol. struct MgmtHandler; -#[async_trait::async_trait] + impl postgres_backend::Handler for MgmtHandler { async fn process_query( &mut self, diff --git a/safekeeper/src/handler.rs b/safekeeper/src/handler.rs index 2c519433ef..3f00b69cde 100644 --- a/safekeeper/src/handler.rs +++ b/safekeeper/src/handler.rs @@ -2,6 +2,7 @@ //! protocol commands. 
use anyhow::Context; +use std::future::Future; use std::str::{self, FromStr}; use std::sync::Arc; use tokio::io::{AsyncRead, AsyncWrite}; @@ -95,7 +96,6 @@ fn cmd_to_string(cmd: &SafekeeperPostgresCommand) -> &str { } } -#[async_trait::async_trait] impl postgres_backend::Handler for SafekeeperPostgresHandler { @@ -197,49 +197,51 @@ impl postgres_backend::Handler Ok(()) } - async fn process_query( + fn process_query( &mut self, pgb: &mut PostgresBackend, query_string: &str, - ) -> Result<(), QueryError> { - if query_string - .to_ascii_lowercase() - .starts_with("set datestyle to ") - { - // important for debug because psycopg2 executes "SET datestyle TO 'ISO'" on connect - pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?; - return Ok(()); - } - - let cmd = parse_cmd(query_string)?; - let cmd_str = cmd_to_string(&cmd); - - let _guard = PG_QUERIES_GAUGE.with_label_values(&[cmd_str]).guard(); - - info!("got query {:?}", query_string); - - let tenant_id = self.tenant_id.context("tenantid is required")?; - let timeline_id = self.timeline_id.context("timelineid is required")?; - self.check_permission(Some(tenant_id))?; - self.ttid = TenantTimelineId::new(tenant_id, timeline_id); - - match cmd { - SafekeeperPostgresCommand::StartWalPush => { - self.handle_start_wal_push(pgb) - .instrument(info_span!("WAL receiver")) - .await + ) -> impl Future> { + Box::pin(async move { + if query_string + .to_ascii_lowercase() + .starts_with("set datestyle to ") + { + // important for debug because psycopg2 executes "SET datestyle TO 'ISO'" on connect + pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?; + return Ok(()); } - SafekeeperPostgresCommand::StartReplication { start_lsn, term } => { - self.handle_start_replication(pgb, start_lsn, term) - .instrument(info_span!("WAL sender")) - .await + + let cmd = parse_cmd(query_string)?; + let cmd_str = cmd_to_string(&cmd); + + let _guard = PG_QUERIES_GAUGE.with_label_values(&[cmd_str]).guard(); + + info!("got query {:?}", query_string); + + let tenant_id = self.tenant_id.context("tenantid is required")?; + let timeline_id = self.timeline_id.context("timelineid is required")?; + self.check_permission(Some(tenant_id))?; + self.ttid = TenantTimelineId::new(tenant_id, timeline_id); + + match cmd { + SafekeeperPostgresCommand::StartWalPush => { + self.handle_start_wal_push(pgb) + .instrument(info_span!("WAL receiver")) + .await + } + SafekeeperPostgresCommand::StartReplication { start_lsn, term } => { + self.handle_start_replication(pgb, start_lsn, term) + .instrument(info_span!("WAL sender")) + .await + } + SafekeeperPostgresCommand::IdentifySystem => self.handle_identify_system(pgb).await, + SafekeeperPostgresCommand::TimelineStatus => self.handle_timeline_status(pgb).await, + SafekeeperPostgresCommand::JSONCtrl { ref cmd } => { + handle_json_ctrl(self, pgb, cmd).await + } } - SafekeeperPostgresCommand::IdentifySystem => self.handle_identify_system(pgb).await, - SafekeeperPostgresCommand::TimelineStatus => self.handle_timeline_status(pgb).await, - SafekeeperPostgresCommand::JSONCtrl { ref cmd } => { - handle_json_ctrl(self, pgb, cmd).await - } - } + }) } } From 26b5fcdc5077e5f4051f27c2e2d8f82ac5038acb Mon Sep 17 00:00:00 2001 From: John Spray Date: Tue, 10 Sep 2024 12:54:25 +0100 Subject: [PATCH 29/29] reinstate write-path key check (#8973) ## Problem In https://github.com/neondatabase/neon/pull/8621, validation of keys during ingest was removed because the places where we actually store keys are now past the point where we have 
already converted them to CompactKey (i128) representation. ## Summary of changes Reinstate validation at an earlier stage in ingest. This doesn't cover literally every place we write a key, but it covers most cases where we're trusting postgres to give us a valid key (i.e. one that doesn't try and use a custom spacenode). --- pageserver/src/pgdatadir_mapping.rs | 49 ++++++++++++++++++++++++----- pageserver/src/walingest.rs | 8 ++--- 2 files changed, 46 insertions(+), 11 deletions(-) diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index 808d4b666e..6dd8851b13 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -1205,6 +1205,13 @@ impl<'a> DatadirModification<'a> { img: Bytes, ) -> anyhow::Result<()> { anyhow::ensure!(rel.relnode != 0, RelationError::InvalidRelnode); + let key = rel_block_to_key(rel, blknum); + if !key.is_valid_key_on_write_path() { + anyhow::bail!( + "the request contains data not supported by pageserver at {}", + key + ); + } self.put(rel_block_to_key(rel, blknum), Value::Image(img)); Ok(()) } @@ -1216,14 +1223,34 @@ impl<'a> DatadirModification<'a> { blknum: BlockNumber, img: Bytes, ) -> anyhow::Result<()> { - self.put(slru_block_to_key(kind, segno, blknum), Value::Image(img)); + let key = slru_block_to_key(kind, segno, blknum); + if !key.is_valid_key_on_write_path() { + anyhow::bail!( + "the request contains data not supported by pageserver at {}", + key + ); + } + self.put(key, Value::Image(img)); Ok(()) } - pub(crate) fn put_rel_page_image_zero(&mut self, rel: RelTag, blknum: BlockNumber) { - self.pending_zero_data_pages - .insert(rel_block_to_key(rel, blknum).to_compact()); + pub(crate) fn put_rel_page_image_zero( + &mut self, + rel: RelTag, + blknum: BlockNumber, + ) -> anyhow::Result<()> { + anyhow::ensure!(rel.relnode != 0, RelationError::InvalidRelnode); + let key = rel_block_to_key(rel, blknum); + if !key.is_valid_key_on_write_path() { + anyhow::bail!( + "the request contains data not supported by pageserver: {} @ {}", + key, + self.lsn + ); + } + self.pending_zero_data_pages.insert(key.to_compact()); self.pending_bytes += ZERO_PAGE.len(); + Ok(()) } pub(crate) fn put_slru_page_image_zero( @@ -1231,10 +1258,18 @@ impl<'a> DatadirModification<'a> { kind: SlruKind, segno: u32, blknum: BlockNumber, - ) { - self.pending_zero_data_pages - .insert(slru_block_to_key(kind, segno, blknum).to_compact()); + ) -> anyhow::Result<()> { + let key = slru_block_to_key(kind, segno, blknum); + if !key.is_valid_key_on_write_path() { + anyhow::bail!( + "the request contains data not supported by pageserver: {} @ {}", + key, + self.lsn + ); + } + self.pending_zero_data_pages.insert(key.to_compact()); self.pending_bytes += ZERO_PAGE.len(); + Ok(()) } /// Call this at the end of each WAL record. diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs index 39bc9e385f..6e15ad81c3 100644 --- a/pageserver/src/walingest.rs +++ b/pageserver/src/walingest.rs @@ -1222,7 +1222,7 @@ impl WalIngest { if rec.blkno % pg_constants::SLOTS_PER_FSM_PAGE != 0 { // Tail of last remaining FSM page has to be zeroed. // We are not precise here and instead of digging in FSM bitmap format just clear the whole page. 
- modification.put_rel_page_image_zero(rel, fsm_physical_page_no); + modification.put_rel_page_image_zero(rel, fsm_physical_page_no)?; fsm_physical_page_no += 1; } let nblocks = get_relsize(modification, rel, ctx).await?; @@ -1244,7 +1244,7 @@ impl WalIngest { if rec.blkno % pg_constants::VM_HEAPBLOCKS_PER_PAGE != 0 { // Tail of last remaining vm page has to be zeroed. // We are not precise here and instead of digging in VM bitmap format just clear the whole page. - modification.put_rel_page_image_zero(rel, vm_page_no); + modification.put_rel_page_image_zero(rel, vm_page_no)?; vm_page_no += 1; } let nblocks = get_relsize(modification, rel, ctx).await?; @@ -1737,7 +1737,7 @@ impl WalIngest { continue; } - modification.put_rel_page_image_zero(rel, gap_blknum); + modification.put_rel_page_image_zero(rel, gap_blknum)?; } } Ok(()) @@ -1803,7 +1803,7 @@ impl WalIngest { // fill the gap with zeros for gap_blknum in old_nblocks..blknum { - modification.put_slru_page_image_zero(kind, segno, gap_blknum); + modification.put_slru_page_image_zero(kind, segno, gap_blknum)?; } } Ok(())
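As a closing illustration of the write-path validation pattern this last patch reinstates, here is a minimal, self-contained sketch of rejecting unsupported keys before they are buffered for storage. The `Key`, `Modification`, and `is_valid_on_write_path` names are simplified stand-ins for this example (not the actual pageserver definitions), and the sketch assumes the `anyhow` crate.

```rust
use anyhow::{ensure, Result};

// Simplified stand-in for the pageserver key type; in this sketch only,
// field1 == 0xFF marks a keyspace that the write path must never produce.
#[derive(Debug, Clone, Copy)]
struct Key {
    field1: u8,
}

impl Key {
    fn is_valid_on_write_path(&self) -> bool {
        self.field1 != 0xFF
    }
}

struct Modification {
    pending: Vec<Key>,
}

impl Modification {
    // Validate while we still hold the structured key, before it is converted
    // to a compact representation and buffered for storage.
    fn put_page(&mut self, key: Key) -> Result<()> {
        ensure!(
            key.is_valid_on_write_path(),
            "the request contains a key not supported on the write path: {key:?}"
        );
        self.pending.push(key);
        Ok(())
    }
}

fn main() -> Result<()> {
    let mut modification = Modification { pending: Vec::new() };
    modification.put_page(Key { field1: 0x01 })?;
    // An unsupported key is rejected up front instead of being ingested.
    assert!(modification.put_page(Key { field1: 0xFF }).is_err());
    assert_eq!(modification.pending.len(), 1);
    Ok(())
}
```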