From 5876c441abc973acca60882192ad46333c075abd Mon Sep 17 00:00:00 2001
From: Tristan Partin <tristan@neon.tech>
Date: Tue, 10 Sep 2024 16:46:32 +0100
Subject: [PATCH 01/49] Grant access to pg_show_replication_origin_status for
 neon_superuser

Signed-off-by: Tristan Partin <tristan@neon.tech>
---
 ...ant_pg_show_replication_origin_status_to_neon_superuser.sql | 1 +
 compute_tools/src/spec.rs                                      | 3 +++
 test_runner/regress/test_migrations.py                         | 2 +-
 3 files changed, 5 insertions(+), 1 deletion(-)
 create mode 100644 compute_tools/src/migrations/0011-grant_pg_show_replication_origin_status_to_neon_superuser.sql

diff --git a/compute_tools/src/migrations/0011-grant_pg_show_replication_origin_status_to_neon_superuser.sql b/compute_tools/src/migrations/0011-grant_pg_show_replication_origin_status_to_neon_superuser.sql
new file mode 100644
index 0000000000..425ed8cd3d
--- /dev/null
+++ b/compute_tools/src/migrations/0011-grant_pg_show_replication_origin_status_to_neon_superuser.sql
@@ -0,0 +1 @@
+GRANT EXECUTE ON FUNCTION pg_show_replication_origin_status TO neon_superuser;
diff --git a/compute_tools/src/spec.rs b/compute_tools/src/spec.rs
index 6a87263821..aa9405d28d 100644
--- a/compute_tools/src/spec.rs
+++ b/compute_tools/src/spec.rs
@@ -793,6 +793,9 @@ pub fn handle_migrations(client: &mut Client) -> Result<()> {
         include_str!(
             "./migrations/0010-grant_snapshot_synchronization_funcs_to_neon_superuser.sql"
         ),
+        include_str!(
+            "./migrations/0011-grant_pg_show_replication_origin_status_to_neon_superuser.sql"
+        ),
     ];
 
     MigrationRunner::new(client, &migrations).run_migrations()?;
diff --git a/test_runner/regress/test_migrations.py b/test_runner/regress/test_migrations.py
index e88e56d030..7211619a99 100644
--- a/test_runner/regress/test_migrations.py
+++ b/test_runner/regress/test_migrations.py
@@ -14,7 +14,7 @@ def test_migrations(neon_simple_env: NeonEnv):
     endpoint.respec(skip_pg_catalog_updates=False)
     endpoint.start()
 
-    num_migrations = 10
+    num_migrations = 11
     endpoint.wait_for_migrations(num_migrations=num_migrations)
 
     with endpoint.cursor() as cur:

From c8bedca5821694633d140a1da2baa4fa9c7dea0c Mon Sep 17 00:00:00 2001
From: Matthias van de Meent <matthias@neon.tech>
Date: Mon, 16 Sep 2024 18:06:31 +0200
Subject: [PATCH 02/49] Fix PG17's extension modifications (#9010)

This also reduces the GRANT statements to one per created _reset
function
---
 Dockerfile.compute-node | 29 +++++++++++++++++------------
 1 file changed, 17 insertions(+), 12 deletions(-)

diff --git a/Dockerfile.compute-node b/Dockerfile.compute-node
index 6e2510fe60..6bf6fb650f 100644
--- a/Dockerfile.compute-node
+++ b/Dockerfile.compute-node
@@ -55,22 +55,27 @@ RUN cd postgres && \
     # We could add the additional grant statements to the postgres repository but it would be hard to maintain,
     # whenever we need to pick up a new postgres version and we want to limit the changes in our postgres fork,
     # so we do it here.
-    old_list="pg_stat_statements--1.0--1.1.sql pg_stat_statements--1.1--1.2.sql pg_stat_statements--1.2--1.3.sql pg_stat_statements--1.3--1.4.sql pg_stat_statements--1.4--1.5.sql pg_stat_statements--1.4.sql pg_stat_statements--1.5--1.6.sql"; \
-    # the first loop is for pg_stat_statement extension version <= 1.6
     for file in /usr/local/pgsql/share/extension/pg_stat_statements--*.sql; do \
         filename=$(basename "$file"); \
-        if echo "$old_list" | grep -q -F "$filename"; then \
+        # Note that there are no downgrade scripts for pg_stat_statements, so we \
+        # don't have to modify any downgrade paths or (much) older versions: we only \
+        # have to make sure every creation of the pg_stat_statements_reset function \
+        # also adds execute permissions to the neon_superuser.
+        case $filename in \
+          pg_stat_statements--1.4.sql) \
+            # pg_stat_statements_reset is first created with 1.4
             echo 'GRANT EXECUTE ON FUNCTION pg_stat_statements_reset() TO neon_superuser;' >> $file; \
-        fi; \
-    done; \
-    # the second loop is for pg_stat_statement extension versions >= 1.7,
-    # where pg_stat_statement_reset() got 3 additional arguments
-    for file in /usr/local/pgsql/share/extension/pg_stat_statements--*.sql; do \
-        filename=$(basename "$file"); \
-        if ! echo "$old_list" | grep -q -F "$filename"; then \
+            ;; \
+          pg_stat_statements--1.6--1.7.sql) \
+            # Then with the 1.6-1.7 migration it is re-created with a new signature, thus add the permissions back
             echo 'GRANT EXECUTE ON FUNCTION pg_stat_statements_reset(Oid, Oid, bigint) TO neon_superuser;' >> $file; \
-        fi; \
-    done
+            ;; \
+          pg_stat_statements--1.10--1.11.sql) \
+            # Then with the 1.10-1.11 migration it is re-created with a new signature again, thus add the permissions back
+            echo 'GRANT EXECUTE ON FUNCTION pg_stat_statements_reset(Oid, Oid, bigint, boolean) TO neon_superuser;' >> $file; \
+            ;; \
+        esac; \
+    done;
 
 #########################################################################################
 #

From 2bbb4d3e1c4d70d5bcdc972eb2b9863d1073338a Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki@neon.tech>
Date: Mon, 16 Sep 2024 21:45:19 +0300
Subject: [PATCH 03/49] Remove misc unused code (#9014)

---
 libs/postgres_ffi/src/xlog_utils.rs |  9 +-------
 libs/utils/src/accum.rs             | 33 -----------------------------
 libs/utils/src/id.rs                | 10 ---------
 libs/utils/src/lib.rs               |  7 ------
 libs/utils/src/lsn.rs               |  9 --------
 libs/utils/src/nonblock.rs          | 17 ---------------
 libs/utils/src/shutdown.rs          |  7 ------
 7 files changed, 1 insertion(+), 91 deletions(-)
 delete mode 100644 libs/utils/src/accum.rs
 delete mode 100644 libs/utils/src/nonblock.rs
 delete mode 100644 libs/utils/src/shutdown.rs

diff --git a/libs/postgres_ffi/src/xlog_utils.rs b/libs/postgres_ffi/src/xlog_utils.rs
index 0cfd56962e..1873734753 100644
--- a/libs/postgres_ffi/src/xlog_utils.rs
+++ b/libs/postgres_ffi/src/xlog_utils.rs
@@ -30,7 +30,7 @@ use std::fs::File;
 use std::io::prelude::*;
 use std::io::ErrorKind;
 use std::io::SeekFrom;
-use std::path::{Path, PathBuf};
+use std::path::Path;
 use std::time::SystemTime;
 use utils::bin_ser::DeserializeError;
 use utils::bin_ser::SerializeError;
@@ -260,13 +260,6 @@ fn open_wal_segment(seg_file_path: &Path) -> anyhow::Result<Option<File>> {
     }
 }
 
-pub fn main() {
-    let mut data_dir = PathBuf::new();
-    data_dir.push(".");
-    let wal_end = find_end_of_wal(&data_dir, WAL_SEGMENT_SIZE, Lsn(0)).unwrap();
-    println!("wal_end={:?}", wal_end);
-}
-
 impl XLogRecord {
     pub fn from_slice(buf: &[u8]) -> Result<XLogRecord, DeserializeError> {
         use utils::bin_ser::LeSer;
diff --git a/libs/utils/src/accum.rs b/libs/utils/src/accum.rs
deleted file mode 100644
index 0fb0190a92..0000000000
--- a/libs/utils/src/accum.rs
+++ /dev/null
@@ -1,33 +0,0 @@
-/// A helper to "accumulate" a value similar to `Iterator::reduce`, but lets you
-/// feed the accumulated values by calling the 'accum' function, instead of having an
-/// iterator.
-///
-/// For example, to calculate the smallest value among some integers:
-///
-/// ```
-/// use utils::accum::Accum;
-///
-/// let values = [1, 2, 3];
-///
-/// let mut min_value: Accum<u32> = Accum(None);
-/// for new_value in &values {
-///     min_value.accum(std::cmp::min, *new_value);
-/// }
-///
-/// assert_eq!(min_value.0.unwrap(), 1);
-/// ```
-pub struct Accum<T>(pub Option<T>);
-impl<T: Copy> Accum<T> {
-    pub fn accum<F>(&mut self, func: F, new_value: T)
-    where
-        F: FnOnce(T, T) -> T,
-    {
-        // If there is no previous value, just store the new value.
-        // Otherwise call the function to decide which one to keep.
-        self.0 = Some(if let Some(accum) = self.0 {
-            func(accum, new_value)
-        } else {
-            new_value
-        });
-    }
-}
diff --git a/libs/utils/src/id.rs b/libs/utils/src/id.rs
index 2cda899b15..eb91839504 100644
--- a/libs/utils/src/id.rs
+++ b/libs/utils/src/id.rs
@@ -88,12 +88,6 @@ impl<'de> Deserialize<'de> for Id {
 }
 
 impl Id {
-    pub fn get_from_buf(buf: &mut impl bytes::Buf) -> Id {
-        let mut arr = [0u8; 16];
-        buf.copy_to_slice(&mut arr);
-        Id::from(arr)
-    }
-
     pub fn from_slice(src: &[u8]) -> Result<Id, IdError> {
         if src.len() != 16 {
             return Err(IdError::SliceParseError(src.len()));
@@ -179,10 +173,6 @@ impl fmt::Debug for Id {
 macro_rules! id_newtype {
     ($t:ident) => {
         impl $t {
-            pub fn get_from_buf(buf: &mut impl bytes::Buf) -> $t {
-                $t(Id::get_from_buf(buf))
-            }
-
             pub fn from_slice(src: &[u8]) -> Result<$t, IdError> {
                 Ok($t(Id::from_slice(src)?))
             }
diff --git a/libs/utils/src/lib.rs b/libs/utils/src/lib.rs
index 218dd468b1..03fb36caf8 100644
--- a/libs/utils/src/lib.rs
+++ b/libs/utils/src/lib.rs
@@ -43,16 +43,9 @@ pub mod logging;
 pub mod lock_file;
 pub mod pid_file;
 
-// Misc
-pub mod accum;
-pub mod shutdown;
-
 // Utility for binding TcpListeners with proper socket options.
 pub mod tcp_listener;
 
-// Utility for putting a raw file descriptor into non-blocking mode
-pub mod nonblock;
-
 // Default signal handling
 pub mod sentry_init;
 pub mod signals;
diff --git a/libs/utils/src/lsn.rs b/libs/utils/src/lsn.rs
index 1aebe91428..06d5c27ebf 100644
--- a/libs/utils/src/lsn.rs
+++ b/libs/utils/src/lsn.rs
@@ -1,6 +1,5 @@
 #![warn(missing_docs)]
 
-use camino::Utf8Path;
 use serde::{de::Visitor, Deserialize, Serialize};
 use std::fmt;
 use std::ops::{Add, AddAssign};
@@ -145,14 +144,6 @@ impl Lsn {
         i128::from(self.0) - i128::from(other)
     }
 
-    /// Parse an LSN from a filename in the form `0000000000000000`
-    pub fn from_filename<F>(filename: F) -> Result<Self, LsnParseError>
-    where
-        F: AsRef<Utf8Path>,
-    {
-        Lsn::from_hex(filename.as_ref().as_str())
-    }
-
     /// Parse an LSN from a string in the form `0000000000000000`
     pub fn from_hex<S>(s: S) -> Result<Self, LsnParseError>
     where
diff --git a/libs/utils/src/nonblock.rs b/libs/utils/src/nonblock.rs
deleted file mode 100644
index 05e2e3af4c..0000000000
--- a/libs/utils/src/nonblock.rs
+++ /dev/null
@@ -1,17 +0,0 @@
-use nix::fcntl::{fcntl, OFlag, F_GETFL, F_SETFL};
-use std::os::unix::io::RawFd;
-
-/// Put a file descriptor into non-blocking mode
-pub fn set_nonblock(fd: RawFd) -> Result<(), std::io::Error> {
-    let bits = fcntl(fd, F_GETFL)?;
-
-    // If F_GETFL returns some unknown bits, they should be valid
-    // for passing back to F_SETFL, too. If we left them out, the F_SETFL
-    // would effectively clear them, which is not what we want.
-    let mut flags = OFlag::from_bits_retain(bits);
-    flags |= OFlag::O_NONBLOCK;
-
-    fcntl(fd, F_SETFL(flags))?;
-
-    Ok(())
-}
diff --git a/libs/utils/src/shutdown.rs b/libs/utils/src/shutdown.rs
deleted file mode 100644
index cb5a44d664..0000000000
--- a/libs/utils/src/shutdown.rs
+++ /dev/null
@@ -1,7 +0,0 @@
-/// Immediately terminate the calling process without calling
-/// atexit callbacks, C runtime destructors etc. We mainly use
-/// this to protect coverage data from concurrent writes.
-pub fn exit_now(code: u8) -> ! {
-    // SAFETY: exiting is safe, the ffi is not safe
-    unsafe { nix::libc::_exit(code as _) };
-}

From 5e16c7bb0b484cc760f4ddb3c0a866157760825e Mon Sep 17 00:00:00 2001
From: Tristan Partin <tristan@neon.tech>
Date: Tue, 10 Sep 2024 15:37:08 +0100
Subject: [PATCH 04/49] Generate pgbench data on the server for most tests

This should generally be faster when running tests, especially those
that run with higher scales.

Ignoring test_lfc_resize since it seems like we are hitting a query
timeout for some reason that I have yet to investigate. A little bit of
improvemnt is better than none.

Signed-off-by: Tristan Partin <tristan@neon.tech>
---
 test_runner/performance/test_branch_creation.py    |  2 +-
 test_runner/performance/test_branching.py          |  2 +-
 .../performance/test_logical_replication.py        | 14 +++++++-------
 .../performance/test_physical_replication.py       |  4 ++--
 test_runner/regress/test_branching.py              |  2 +-
 test_runner/regress/test_disk_usage_eviction.py    |  2 +-
 test_runner/regress/test_hot_standby.py            |  2 +-
 test_runner/regress/test_pageserver_reconnect.py   |  2 +-
 .../test_pageserver_restarts_under_workload.py     |  2 +-
 .../regress/test_threshold_based_eviction.py       |  2 +-
 10 files changed, 17 insertions(+), 17 deletions(-)

diff --git a/test_runner/performance/test_branch_creation.py b/test_runner/performance/test_branch_creation.py
index b3866f1813..f1ab7876f9 100644
--- a/test_runner/performance/test_branch_creation.py
+++ b/test_runner/performance/test_branch_creation.py
@@ -107,7 +107,7 @@ def test_branch_creation_many(neon_compare: NeonCompare, n_branches: int, shape:
     env.neon_cli.create_branch("b0")
 
     endpoint = env.endpoints.create_start("b0")
-    neon_compare.pg_bin.run_capture(["pgbench", "-i", "-s10", endpoint.connstr()])
+    neon_compare.pg_bin.run_capture(["pgbench", "-i", "-I", "dtGvp", "-s10", endpoint.connstr()])
 
     branch_creation_durations = []
 
diff --git a/test_runner/performance/test_branching.py b/test_runner/performance/test_branching.py
index 667d1a4c4a..f8d39487f2 100644
--- a/test_runner/performance/test_branching.py
+++ b/test_runner/performance/test_branching.py
@@ -43,7 +43,7 @@ def test_compare_child_and_root_pgbench_perf(neon_compare: NeonCompare):
 
     env.neon_cli.create_branch("root")
     endpoint_root = env.endpoints.create_start("root")
-    pg_bin.run_capture(["pgbench", "-i", endpoint_root.connstr(), "-s10"])
+    pg_bin.run_capture(["pgbench", "-i", "-I", "dtGvp", endpoint_root.connstr(), "-s10"])
 
     fork_at_current_lsn(env, endpoint_root, "child", "root")
 
diff --git a/test_runner/performance/test_logical_replication.py b/test_runner/performance/test_logical_replication.py
index 29a0380524..dbf94a2cf5 100644
--- a/test_runner/performance/test_logical_replication.py
+++ b/test_runner/performance/test_logical_replication.py
@@ -24,13 +24,13 @@ def test_logical_replication(neon_simple_env: NeonEnv, pg_bin: PgBin, vanilla_pg
 
     endpoint = env.endpoints.create_start("main")
 
-    pg_bin.run_capture(["pgbench", "-i", "-s10", endpoint.connstr()])
+    pg_bin.run_capture(["pgbench", "-i", "-I", "dtGvp", "-s10", endpoint.connstr()])
 
     endpoint.safe_psql("create publication pub1 for table pgbench_accounts, pgbench_history")
 
     # now start subscriber
     vanilla_pg.start()
-    pg_bin.run_capture(["pgbench", "-i", "-s10", vanilla_pg.connstr()])
+    pg_bin.run_capture(["pgbench", "-i", "-I", "dtGvp", "-s10", vanilla_pg.connstr()])
 
     vanilla_pg.safe_psql("truncate table pgbench_accounts")
     vanilla_pg.safe_psql("truncate table pgbench_history")
@@ -99,9 +99,9 @@ def test_subscriber_lag(
     sub_connstr = benchmark_project_sub.connstr
 
     if benchmark_project_pub.is_new:
-        pg_bin.run_capture(["pgbench", "-i", "-s100"], env=pub_env)
+        pg_bin.run_capture(["pgbench", "-i", "-I", "dtGvp", "-s100"], env=pub_env)
     if benchmark_project_sub.is_new:
-        pg_bin.run_capture(["pgbench", "-i", "-s100"], env=sub_env)
+        pg_bin.run_capture(["pgbench", "-i", "-I", "dtGvp", "-s100"], env=sub_env)
 
     pub_conn = psycopg2.connect(pub_connstr)
     sub_conn = psycopg2.connect(sub_connstr)
@@ -193,8 +193,8 @@ def test_publisher_restart(
     pub_connstr = benchmark_project_pub.connstr
     sub_connstr = benchmark_project_sub.connstr
 
-    pg_bin.run_capture(["pgbench", "-i", "-s100"], env=pub_env)
-    pg_bin.run_capture(["pgbench", "-i", "-s100"], env=sub_env)
+    pg_bin.run_capture(["pgbench", "-i", "-I", "dtGvp", "-s100"], env=pub_env)
+    pg_bin.run_capture(["pgbench", "-i", "-I", "dtGvp", "-s100"], env=sub_env)
 
     pub_conn = psycopg2.connect(pub_connstr)
     sub_conn = psycopg2.connect(sub_connstr)
@@ -288,7 +288,7 @@ def test_snap_files(
             is_super = cur.fetchall()[0][0]
             assert is_super, "This benchmark won't work if we don't have superuser"
 
-    pg_bin.run_capture(["pgbench", "-i", "-s100"], env=env)
+    pg_bin.run_capture(["pgbench", "-i", "-I", "dtGvp", "-s100"], env=env)
 
     conn = psycopg2.connect(connstr)
     conn.autocommit = True
diff --git a/test_runner/performance/test_physical_replication.py b/test_runner/performance/test_physical_replication.py
index 7e16197211..49b1176d34 100644
--- a/test_runner/performance/test_physical_replication.py
+++ b/test_runner/performance/test_physical_replication.py
@@ -85,7 +85,7 @@ def test_ro_replica_lag(
             endpoint_id=replica["endpoint"]["id"],
         )["uri"]
 
-        pg_bin.run_capture(["pgbench", "-i", "-s100"], env=master_env)
+        pg_bin.run_capture(["pgbench", "-i", "-I", "dtGvp", "-s100"], env=master_env)
 
         master_workload = pg_bin.run_nonblocking(
             ["pgbench", "-c10", pgbench_duration, "-Mprepared"],
@@ -212,7 +212,7 @@ def test_replication_start_stop(
         for i in range(num_replicas):
             replica_env[i]["PGHOST"] = replicas[i]["endpoint"]["host"]
 
-        pg_bin.run_capture(["pgbench", "-i", "-s10"], env=master_env)
+        pg_bin.run_capture(["pgbench", "-i", "-I", "dtGvp", "-s10"], env=master_env)
 
         # Sync replicas
         with psycopg2.connect(master_connstr) as conn_master:
diff --git a/test_runner/regress/test_branching.py b/test_runner/regress/test_branching.py
index fc74707639..1729e2fc98 100644
--- a/test_runner/regress/test_branching.py
+++ b/test_runner/regress/test_branching.py
@@ -52,7 +52,7 @@ def test_branching_with_pgbench(
     def run_pgbench(connstr: str):
         log.info(f"Start a pgbench workload on pg {connstr}")
 
-        pg_bin.run_capture(["pgbench", "-i", f"-s{scale}", connstr])
+        pg_bin.run_capture(["pgbench", "-i", "-I", "dtGvp", f"-s{scale}", connstr])
         pg_bin.run_capture(["pgbench", "-T15", connstr])
 
     env.neon_cli.create_branch("b0", tenant_id=tenant)
diff --git a/test_runner/regress/test_disk_usage_eviction.py b/test_runner/regress/test_disk_usage_eviction.py
index 85616c3fe2..1fec8b3f18 100644
--- a/test_runner/regress/test_disk_usage_eviction.py
+++ b/test_runner/regress/test_disk_usage_eviction.py
@@ -291,7 +291,7 @@ def pgbench_init_tenant(
     )
 
     with env.endpoints.create_start("main", tenant_id=tenant_id) as endpoint:
-        pg_bin.run(["pgbench", "-i", f"-s{scale}", endpoint.connstr()])
+        pg_bin.run(["pgbench", "-i", "-I", "dtGvp", f"-s{scale}", endpoint.connstr()])
         wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id)
 
     return (tenant_id, timeline_id)
diff --git a/test_runner/regress/test_hot_standby.py b/test_runner/regress/test_hot_standby.py
index ae63136abb..d94704012f 100644
--- a/test_runner/regress/test_hot_standby.py
+++ b/test_runner/regress/test_hot_standby.py
@@ -199,7 +199,7 @@ def test_hot_standby_gc(neon_env_builder: NeonEnvBuilder, pause_apply: bool):
 def run_pgbench(connstr: str, pg_bin: PgBin):
     log.info(f"Start a pgbench workload on pg {connstr}")
     # s10 is about 150MB of data. In debug mode init takes about 15s on SSD.
-    pg_bin.run_capture(["pgbench", "-i", "-s10", connstr])
+    pg_bin.run_capture(["pgbench", "-i", "-I", "dtGvp", "-s10", connstr])
     log.info("pgbench init done")
     pg_bin.run_capture(["pgbench", "-T60", connstr])
 
diff --git a/test_runner/regress/test_pageserver_reconnect.py b/test_runner/regress/test_pageserver_reconnect.py
index 37ff923632..ada6da98ff 100644
--- a/test_runner/regress/test_pageserver_reconnect.py
+++ b/test_runner/regress/test_pageserver_reconnect.py
@@ -22,7 +22,7 @@ def test_pageserver_reconnect(neon_simple_env: NeonEnv, pg_bin: PgBin):
 
     def run_pgbench(connstr: str):
         log.info(f"Start a pgbench workload on pg {connstr}")
-        pg_bin.run_capture(["pgbench", "-i", f"-s{scale}", connstr])
+        pg_bin.run_capture(["pgbench", "-i", "-I", "dtGvp", f"-s{scale}", connstr])
         pg_bin.run_capture(["pgbench", f"-T{int(n_reconnects*timeout)}", connstr])
 
     thread = threading.Thread(target=run_pgbench, args=(endpoint.connstr(),), daemon=True)
diff --git a/test_runner/regress/test_pageserver_restarts_under_workload.py b/test_runner/regress/test_pageserver_restarts_under_workload.py
index 65569f3bac..9bb9b373ad 100644
--- a/test_runner/regress/test_pageserver_restarts_under_workload.py
+++ b/test_runner/regress/test_pageserver_restarts_under_workload.py
@@ -19,7 +19,7 @@ def test_pageserver_restarts_under_worload(neon_simple_env: NeonEnv, pg_bin: PgB
 
     def run_pgbench(connstr: str):
         log.info(f"Start a pgbench workload on pg {connstr}")
-        pg_bin.run_capture(["pgbench", "-i", f"-s{scale}", connstr])
+        pg_bin.run_capture(["pgbench", "-i", "-I", "dtGvp", f"-s{scale}", connstr])
         pg_bin.run_capture(["pgbench", f"-T{n_restarts}", connstr])
 
     thread = threading.Thread(target=run_pgbench, args=(endpoint.connstr(),), daemon=True)
diff --git a/test_runner/regress/test_threshold_based_eviction.py b/test_runner/regress/test_threshold_based_eviction.py
index 840c7159ad..094dd20529 100644
--- a/test_runner/regress/test_threshold_based_eviction.py
+++ b/test_runner/regress/test_threshold_based_eviction.py
@@ -106,7 +106,7 @@ def test_threshold_based_eviction(
 
     # create a bunch of layers
     with env.endpoints.create_start("main", tenant_id=tenant_id) as pg:
-        pg_bin.run(["pgbench", "-i", "-s", "3", pg.connstr()])
+        pg_bin.run(["pgbench", "-i", "-I", "dtGvp", "-s", "3", pg.connstr()])
         last_flush_lsn_upload(env, pg, tenant_id, timeline_id)
     # wrap up and shutdown safekeepers so that no more layers will be created after the final checkpoint
     for sk in env.safekeepers:

From 3a52e356c18440cb3b685d2cf1848fd3a3c86c67 Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki@neon.tech>
Date: Tue, 17 Sep 2024 01:46:58 +0300
Subject: [PATCH 05/49] Remove unused function (#9018)

---
 test_runner/fixtures/neon_fixtures.py | 19 -------------------
 1 file changed, 19 deletions(-)

diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index 50284a3f5a..5f7a32782c 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -182,25 +182,6 @@ def top_output_dir(base_dir: Path) -> Iterator[Path]:
     yield output_dir
 
 
-@pytest.fixture(scope="function")
-def versioned_pg_distrib_dir(pg_distrib_dir: Path, pg_version: PgVersion) -> Iterator[Path]:
-    versioned_dir = pg_distrib_dir / pg_version.v_prefixed
-
-    psql_bin_path = versioned_dir / "bin/psql"
-    postgres_bin_path = versioned_dir / "bin/postgres"
-
-    if os.getenv("REMOTE_ENV"):
-        # When testing against a remote server, we only need the client binary.
-        if not psql_bin_path.exists():
-            raise Exception(f"psql not found at '{psql_bin_path}'")
-    else:
-        if not postgres_bin_path.exists():
-            raise Exception(f"postgres not found at '{postgres_bin_path}'")
-
-    log.info(f"versioned_pg_distrib_dir is {versioned_dir}")
-    yield versioned_dir
-
-
 @pytest.fixture(scope="session")
 def neon_api_key() -> str:
     api_key = os.getenv("NEON_API_KEY")

From fec9321fc04434f84f86d166a6d89c4421110c77 Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki@neon.tech>
Date: Tue, 17 Sep 2024 02:23:49 +0300
Subject: [PATCH 06/49] Use Path type in a few more places in neon_fixtures.py
 (#9018)

This is in preparation of replacing neon_fixtures.get_dir_size with
neon_fixtures.utils.get_dir_size() in next commit.
---
 test_runner/fixtures/compare_fixtures.py |  5 ++--
 test_runner/fixtures/neon_fixtures.py    | 32 ++++++++++++------------
 2 files changed, 19 insertions(+), 18 deletions(-)

diff --git a/test_runner/fixtures/compare_fixtures.py b/test_runner/fixtures/compare_fixtures.py
index 7c4a8db36f..770b32b11e 100644
--- a/test_runner/fixtures/compare_fixtures.py
+++ b/test_runner/fixtures/compare_fixtures.py
@@ -4,6 +4,7 @@ from abc import ABC, abstractmethod
 from contextlib import _GeneratorContextManager, contextmanager
 
 # Type-related stuff
+from pathlib import Path
 from typing import Dict, Iterator, List
 
 import pytest
@@ -229,11 +230,11 @@ class VanillaCompare(PgCompare):
         pass  # TODO find something
 
     def report_size(self):
-        data_size = self.pg.get_subdir_size("base")
+        data_size = self.pg.get_subdir_size(Path("base"))
         self.zenbenchmark.record(
             "data_size", data_size / (1024 * 1024), "MB", report=MetricReport.LOWER_IS_BETTER
         )
-        wal_size = self.pg.get_subdir_size("pg_wal")
+        wal_size = self.pg.get_subdir_size(Path("pg_wal"))
         self.zenbenchmark.record(
             "wal_size", wal_size / (1024 * 1024), "MB", report=MetricReport.LOWER_IS_BETTER
         )
diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index 5f7a32782c..90a351cdb3 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -224,7 +224,7 @@ def worker_base_port(worker_seq_no: int, worker_port_num: int) -> int:
     return BASE_PORT + worker_seq_no * worker_port_num
 
 
-def get_dir_size(path: str) -> int:
+def get_dir_size(path: Path) -> int:
     """Return size in bytes."""
     totalbytes = 0
     for root, _dirs, files in os.walk(path):
@@ -3319,12 +3319,12 @@ class PgBin:
         )
         return base_path
 
-    def get_pg_controldata_checkpoint_lsn(self, pgdata: str) -> Lsn:
+    def get_pg_controldata_checkpoint_lsn(self, pgdata: Path) -> Lsn:
         """
         Run pg_controldata on given datadir and extract checkpoint lsn.
         """
 
-        pg_controldata_path = os.path.join(self.pg_bin_path, "pg_controldata")
+        pg_controldata_path = self.pg_bin_path / "pg_controldata"
         cmd = f"{pg_controldata_path} -D {pgdata}"
         result = subprocess.run(cmd, capture_output=True, text=True, shell=True)
         checkpoint_lsn = re.findall(
@@ -3433,9 +3433,9 @@ class VanillaPostgres(PgProtocol):
         self.running = False
         self.pg_bin.run_capture(["pg_ctl", "-w", "-D", str(self.pgdatadir), "stop"])
 
-    def get_subdir_size(self, subdir) -> int:
+    def get_subdir_size(self, subdir: Path) -> int:
         """Return size of pgdatadir subdirectory in bytes."""
-        return get_dir_size(os.path.join(self.pgdatadir, subdir))
+        return get_dir_size(self.pgdatadir / subdir)
 
     def __enter__(self) -> "VanillaPostgres":
         return self
@@ -3962,7 +3962,7 @@ class Endpoint(PgProtocol, LogUtils):
         self.env = env
         self.branch_name: Optional[str] = None  # dubious
         self.endpoint_id: Optional[str] = None  # dubious, see asserts below
-        self.pgdata_dir: Optional[str] = None  # Path to computenode PGDATA
+        self.pgdata_dir: Optional[Path] = None  # Path to computenode PGDATA
         self.tenant_id = tenant_id
         self.pg_port = pg_port
         self.http_port = http_port
@@ -4019,7 +4019,7 @@ class Endpoint(PgProtocol, LogUtils):
             allow_multiple=allow_multiple,
         )
         path = Path("endpoints") / self.endpoint_id / "pgdata"
-        self.pgdata_dir = os.path.join(self.env.repo_dir, path)
+        self.pgdata_dir = self.env.repo_dir / path
         self.logfile = self.endpoint_path() / "compute.log"
 
         config_lines = config_lines or []
@@ -4072,21 +4072,21 @@ class Endpoint(PgProtocol, LogUtils):
         path = Path("endpoints") / self.endpoint_id
         return self.env.repo_dir / path
 
-    def pg_data_dir_path(self) -> str:
+    def pg_data_dir_path(self) -> Path:
         """Path to Postgres data directory"""
-        return os.path.join(self.endpoint_path(), "pgdata")
+        return self.endpoint_path() / "pgdata"
 
-    def pg_xact_dir_path(self) -> str:
+    def pg_xact_dir_path(self) -> Path:
         """Path to pg_xact dir"""
-        return os.path.join(self.pg_data_dir_path(), "pg_xact")
+        return self.pg_data_dir_path() / "pg_xact"
 
-    def pg_twophase_dir_path(self) -> str:
+    def pg_twophase_dir_path(self) -> Path:
         """Path to pg_twophase dir"""
-        return os.path.join(self.pg_data_dir_path(), "pg_twophase")
+        return self.pg_data_dir_path() / "pg_twophase"
 
-    def config_file_path(self) -> str:
+    def config_file_path(self) -> Path:
         """Path to the postgresql.conf in the endpoint directory (not the one in pgdata)"""
-        return os.path.join(self.endpoint_path(), "postgresql.conf")
+        return self.endpoint_path() / "postgresql.conf"
 
     def config(self, lines: List[str]) -> "Endpoint":
         """
@@ -4251,7 +4251,7 @@ class Endpoint(PgProtocol, LogUtils):
         log.info(f'checkpointing at LSN {self.safe_psql("select pg_current_wal_lsn()")[0][0]}')
         self.safe_psql("checkpoint")
         assert self.pgdata_dir is not None  # please mypy
-        return get_dir_size(os.path.join(self.pgdata_dir, "pg_wal")) / 1024 / 1024
+        return get_dir_size(self.pgdata_dir / "pg_wal") / 1024 / 1024
 
     def clear_shared_buffers(self, cursor: Optional[Any] = None):
         """

From c6f56b8462e16284b77a3176801c9ed7364df04b Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki@neon.tech>
Date: Tue, 17 Sep 2024 01:47:15 +0300
Subject: [PATCH 07/49] Remove redundant get_dir_size() function (#9018)

There was another copy of it in utils.py. The only difference is that
the version in utils.py tolerates files that are concurrently
removed. That seems fine for the few callers in neon_fixtures.py too.
---
 test_runner/fixtures/neon_fixtures.py | 11 +----------
 1 file changed, 1 insertion(+), 10 deletions(-)

diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index 90a351cdb3..92dcd1e3cd 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -93,6 +93,7 @@ from fixtures.utils import (
     allure_add_grafana_links,
     allure_attach_from_dir,
     assert_no_errors,
+    get_dir_size,
     get_self_dir,
     print_gc_result,
     subprocess_capture,
@@ -224,16 +225,6 @@ def worker_base_port(worker_seq_no: int, worker_port_num: int) -> int:
     return BASE_PORT + worker_seq_no * worker_port_num
 
 
-def get_dir_size(path: Path) -> int:
-    """Return size in bytes."""
-    totalbytes = 0
-    for root, _dirs, files in os.walk(path):
-        for name in files:
-            totalbytes += os.path.getsize(os.path.join(root, name))
-
-    return totalbytes
-
-
 @pytest.fixture(scope="session")
 def port_distributor(worker_base_port: int, worker_port_num: int) -> PortDistributor:
     return PortDistributor(base_port=worker_base_port, port_number=worker_port_num)

From 4295ff0f071789c22d0a838f5512e5caba57d54a Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki@neon.tech>
Date: Tue, 17 Sep 2024 01:47:21 +0300
Subject: [PATCH 08/49] Mark a couple of test fixtures as session-scoped
 (#9018)

pg_distrib_dir doesn't include the Postgres version and only depends
on env variables which cannot change during a test run, so it can be
marked as session-scoped. Similarly, the platform cannot change during
a test run.
---
 test_runner/fixtures/neon_fixtures.py | 2 +-
 test_runner/fixtures/parametrize.py   | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index 92dcd1e3cd..d2caee5992 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -159,7 +159,7 @@ def neon_binpath(base_dir: Path, build_type: str) -> Iterator[Path]:
     yield binpath
 
 
-@pytest.fixture(scope="function")
+@pytest.fixture(scope="session")
 def pg_distrib_dir(base_dir: Path) -> Iterator[Path]:
     if env_postgres_bin := os.environ.get("POSTGRES_DISTRIB_DIR"):
         distrib_dir = Path(env_postgres_bin).resolve()
diff --git a/test_runner/fixtures/parametrize.py b/test_runner/fixtures/parametrize.py
index e2dd51802c..2c8e71526c 100644
--- a/test_runner/fixtures/parametrize.py
+++ b/test_runner/fixtures/parametrize.py
@@ -24,7 +24,7 @@ def build_type() -> Optional[str]:
     return None
 
 
-@pytest.fixture(scope="function", autouse=True)
+@pytest.fixture(scope="session", autouse=True)
 def platform() -> Optional[str]:
     return None
 

From 2db840d8b8736530a9653719ed2560e3647539a0 Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki@neon.tech>
Date: Tue, 17 Sep 2024 01:47:26 +0300
Subject: [PATCH 09/49] Move a few test functions related to auth tokens to
 separate file (#9018)

For readability. neon_fixtures.py is huge.
---
 test_runner/fixtures/auth_tokens.py           | 47 +++++++++++++++++++
 test_runner/fixtures/neon_fixtures.py         | 40 +---------------
 .../regress/test_storage_controller.py        |  2 +-
 3 files changed, 49 insertions(+), 40 deletions(-)
 create mode 100644 test_runner/fixtures/auth_tokens.py

diff --git a/test_runner/fixtures/auth_tokens.py b/test_runner/fixtures/auth_tokens.py
new file mode 100644
index 0000000000..8ebaf61e5e
--- /dev/null
+++ b/test_runner/fixtures/auth_tokens.py
@@ -0,0 +1,47 @@
+from __future__ import annotations
+
+from dataclasses import dataclass
+from enum import Enum
+from typing import Any
+
+import jwt
+
+from fixtures.common_types import TenantId
+
+
+@dataclass
+class AuthKeys:
+    priv: str
+
+    def generate_token(self, *, scope: TokenScope, **token_data: Any) -> str:
+        token_data = {key: str(val) for key, val in token_data.items()}
+        token = jwt.encode({"scope": scope, **token_data}, self.priv, algorithm="EdDSA")
+        # cast(Any, self.priv)
+
+        # jwt.encode can return 'bytes' or 'str', depending on Python version or type
+        # hinting or something (not sure what). If it returned 'bytes', convert it to 'str'
+        # explicitly.
+        if isinstance(token, bytes):
+            token = token.decode()
+
+        return token
+
+    def generate_pageserver_token(self) -> str:
+        return self.generate_token(scope=TokenScope.PAGE_SERVER_API)
+
+    def generate_safekeeper_token(self) -> str:
+        return self.generate_token(scope=TokenScope.SAFEKEEPER_DATA)
+
+    # generate token giving access to only one tenant
+    def generate_tenant_token(self, tenant_id: TenantId) -> str:
+        return self.generate_token(scope=TokenScope.TENANT, tenant_id=str(tenant_id))
+
+
+# TODO: Replace with `StrEnum` when we upgrade to python 3.11
+class TokenScope(str, Enum):
+    ADMIN = "admin"
+    PAGE_SERVER_API = "pageserverapi"
+    GENERATIONS_API = "generations_api"
+    SAFEKEEPER_DATA = "safekeeperdata"
+    TENANT = "tenant"
+    SCRUBBER = "scrubber"
diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index d2caee5992..93b93ff019 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -43,7 +43,6 @@ from urllib.parse import quote, urlparse
 import asyncpg
 import backoff
 import httpx
-import jwt
 import psycopg2
 import psycopg2.sql
 import pytest
@@ -60,6 +59,7 @@ from psycopg2.extensions import make_dsn, parse_dsn
 from urllib3.util.retry import Retry
 
 from fixtures import overlayfs
+from fixtures.auth_tokens import AuthKeys, TokenScope
 from fixtures.broker import NeonBroker
 from fixtures.common_types import Lsn, NodeId, TenantId, TenantShardId, TimelineId
 from fixtures.endpoint.http import EndpointHttpClient
@@ -373,44 +373,6 @@ class PgProtocol:
         return self.safe_psql(query, log_query=log_query)[0][0]
 
 
-@dataclass
-class AuthKeys:
-    priv: str
-
-    def generate_token(self, *, scope: TokenScope, **token_data: Any) -> str:
-        token_data = {key: str(val) for key, val in token_data.items()}
-        token = jwt.encode({"scope": scope, **token_data}, self.priv, algorithm="EdDSA")
-        # cast(Any, self.priv)
-
-        # jwt.encode can return 'bytes' or 'str', depending on Python version or type
-        # hinting or something (not sure what). If it returned 'bytes', convert it to 'str'
-        # explicitly.
-        if isinstance(token, bytes):
-            token = token.decode()
-
-        return token
-
-    def generate_pageserver_token(self) -> str:
-        return self.generate_token(scope=TokenScope.PAGE_SERVER_API)
-
-    def generate_safekeeper_token(self) -> str:
-        return self.generate_token(scope=TokenScope.SAFEKEEPER_DATA)
-
-    # generate token giving access to only one tenant
-    def generate_tenant_token(self, tenant_id: TenantId) -> str:
-        return self.generate_token(scope=TokenScope.TENANT, tenant_id=str(tenant_id))
-
-
-# TODO: Replace with `StrEnum` when we upgrade to python 3.11
-class TokenScope(str, Enum):
-    ADMIN = "admin"
-    PAGE_SERVER_API = "pageserverapi"
-    GENERATIONS_API = "generations_api"
-    SAFEKEEPER_DATA = "safekeeperdata"
-    TENANT = "tenant"
-    SCRUBBER = "scrubber"
-
-
 class NeonEnvBuilder:
     """
     Builder object to create a Neon runtime environment
diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py
index eea05d7548..2e21f8fb46 100644
--- a/test_runner/regress/test_storage_controller.py
+++ b/test_runner/regress/test_storage_controller.py
@@ -7,6 +7,7 @@ from datetime import datetime, timezone
 from typing import Any, Dict, List, Optional, Set, Tuple, Union
 
 import pytest
+from fixtures.auth_tokens import TokenScope
 from fixtures.common_types import TenantId, TenantShardId, TimelineId
 from fixtures.compute_reconfigure import ComputeReconfigure
 from fixtures.log_helper import log
@@ -18,7 +19,6 @@ from fixtures.neon_fixtures import (
     PgBin,
     StorageControllerApiException,
     StorageControllerLeadershipStatus,
-    TokenScope,
     last_flush_lsn_upload,
 )
 from fixtures.pageserver.http import PageserverApiException, PageserverHttpClient

From b719d58863f90beb46218d1167e4236b05910924 Mon Sep 17 00:00:00 2001
From: Vlad Lazar <vlad@neon.tech>
Date: Tue, 17 Sep 2024 09:25:42 +0100
Subject: [PATCH 10/49] storcon: forward requests from stepped down instance to
 the current leader (#8954)

## Problem
It turns out that we can't rely on external orchestration to promptly
route trafic to the new leader. This is downtime inducing.
Forwarding provides a safe way out.

## Safety
We forward when:
1. Request is not one of ["/control/v1/step_down", "/status", "/ready",
"/metrics"]
2. Current instance is in [`LeadershipStatus::SteppedDown`] state
3. There is a leader in the database to forward to
4. Leader from step (3) is not the current instance

If a storcon instance is persisted in the database, then we know that it
is the current leader.
There's one exception: time between handling step-down request and the
new leader updating the
database.

Let's treat the happy case first. The stepped down node does not produce
any side effects,
since all request handling happens on the leader.

As for the edge case, we are guaranteed to always have a maximum of two
running instances.
Hence, if we are in the edge case scenario the leader persisted in the
database is the
stepped down instance that received the request. Condition (4) above
covers this scenario.

## Summary of changes
* Conversion utilities for reqwest <-> hyper. I'm not happy with these,
but I don't see a better way. Open to suggestions.
* Add request forwarding logic
* Update each request handler. Again, not happy with this. If anyone
knows a nice to wrap the handlers, lmk. Me and Joonas tried :/
* Update each handler to maybe forward
* Tweak tests to showcase new behaviour
---
 storage_controller/src/http.rs                | 620 +++++++++++++++++-
 .../regress/test_storage_controller.py        |  22 +-
 2 files changed, 607 insertions(+), 35 deletions(-)

diff --git a/storage_controller/src/http.rs b/storage_controller/src/http.rs
index a6638f5191..1745bf5575 100644
--- a/storage_controller/src/http.rs
+++ b/storage_controller/src/http.rs
@@ -1,10 +1,11 @@
+use crate::http;
 use crate::metrics::{
     HttpRequestLatencyLabelGroup, HttpRequestStatusLabelGroup, PageserverRequestLabelGroup,
     METRICS_REGISTRY,
 };
 use crate::persistence::SafekeeperPersistence;
 use crate::reconciler::ReconcileError;
-use crate::service::{LeadershipStatus, Service, STARTUP_RECONCILE_TIMEOUT};
+use crate::service::{LeadershipStatus, Service, RECONCILE_TIMEOUT, STARTUP_RECONCILE_TIMEOUT};
 use anyhow::Context;
 use futures::Future;
 use hyper::header::CONTENT_TYPE;
@@ -22,6 +23,7 @@ use pageserver_api::models::{
 };
 use pageserver_api::shard::TenantShardId;
 use pageserver_client::{mgmt_api, BlockUnblock};
+use std::str::FromStr;
 use std::sync::Arc;
 use std::time::{Duration, Instant};
 use tokio_util::sync::CancellationToken;
@@ -87,9 +89,16 @@ fn get_state(request: &Request<Body>) -> &HttpState {
 }
 
 /// Pageserver calls into this on startup, to learn which tenants it should attach
-async fn handle_re_attach(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
+async fn handle_re_attach(req: Request<Body>) -> Result<Response<Body>, ApiError> {
     check_permissions(&req, Scope::GenerationsApi)?;
 
+    let mut req = match maybe_forward(req).await {
+        ForwardOutcome::Forwarded(res) => {
+            return res;
+        }
+        ForwardOutcome::NotForwarded(req) => req,
+    };
+
     let reattach_req = json_request::<ReAttachRequest>(&mut req).await?;
     let state = get_state(&req);
     json_response(StatusCode::OK, state.service.re_attach(reattach_req).await?)
@@ -97,9 +106,16 @@ async fn handle_re_attach(mut req: Request<Body>) -> Result<Response<Body>, ApiE
 
 /// Pageserver calls into this before doing deletions, to confirm that it still
 /// holds the latest generation for the tenants with deletions enqueued
-async fn handle_validate(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
+async fn handle_validate(req: Request<Body>) -> Result<Response<Body>, ApiError> {
     check_permissions(&req, Scope::GenerationsApi)?;
 
+    let mut req = match maybe_forward(req).await {
+        ForwardOutcome::Forwarded(res) => {
+            return res;
+        }
+        ForwardOutcome::NotForwarded(req) => req,
+    };
+
     let validate_req = json_request::<ValidateRequest>(&mut req).await?;
     let state = get_state(&req);
     json_response(StatusCode::OK, state.service.validate(validate_req).await?)
@@ -108,9 +124,16 @@ async fn handle_validate(mut req: Request<Body>) -> Result<Response<Body>, ApiEr
 /// Call into this before attaching a tenant to a pageserver, to acquire a generation number
 /// (in the real control plane this is unnecessary, because the same program is managing
 ///  generation numbers and doing attachments).
-async fn handle_attach_hook(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
+async fn handle_attach_hook(req: Request<Body>) -> Result<Response<Body>, ApiError> {
     check_permissions(&req, Scope::Admin)?;
 
+    let mut req = match maybe_forward(req).await {
+        ForwardOutcome::Forwarded(res) => {
+            return res;
+        }
+        ForwardOutcome::NotForwarded(req) => req,
+    };
+
     let attach_req = json_request::<AttachHookRequest>(&mut req).await?;
     let state = get_state(&req);
 
@@ -124,9 +147,16 @@ async fn handle_attach_hook(mut req: Request<Body>) -> Result<Response<Body>, Ap
     )
 }
 
-async fn handle_inspect(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
+async fn handle_inspect(req: Request<Body>) -> Result<Response<Body>, ApiError> {
     check_permissions(&req, Scope::Admin)?;
 
+    let mut req = match maybe_forward(req).await {
+        ForwardOutcome::Forwarded(res) => {
+            return res;
+        }
+        ForwardOutcome::NotForwarded(req) => req,
+    };
+
     let inspect_req = json_request::<InspectRequest>(&mut req).await?;
 
     let state = get_state(&req);
@@ -136,10 +166,17 @@ async fn handle_inspect(mut req: Request<Body>) -> Result<Response<Body>, ApiErr
 
 async fn handle_tenant_create(
     service: Arc<Service>,
-    mut req: Request<Body>,
+    req: Request<Body>,
 ) -> Result<Response<Body>, ApiError> {
     check_permissions(&req, Scope::PageServerApi)?;
 
+    let mut req = match maybe_forward(req).await {
+        ForwardOutcome::Forwarded(res) => {
+            return res;
+        }
+        ForwardOutcome::NotForwarded(req) => req,
+    };
+
     let create_req = json_request::<TenantCreateRequest>(&mut req).await?;
 
     json_response(
@@ -150,11 +187,18 @@ async fn handle_tenant_create(
 
 async fn handle_tenant_location_config(
     service: Arc<Service>,
-    mut req: Request<Body>,
+    req: Request<Body>,
 ) -> Result<Response<Body>, ApiError> {
     let tenant_shard_id: TenantShardId = parse_request_param(&req, "tenant_shard_id")?;
     check_permissions(&req, Scope::PageServerApi)?;
 
+    let mut req = match maybe_forward(req).await {
+        ForwardOutcome::Forwarded(res) => {
+            return res;
+        }
+        ForwardOutcome::NotForwarded(req) => req,
+    };
+
     let config_req = json_request::<TenantLocationConfigRequest>(&mut req).await?;
     json_response(
         StatusCode::OK,
@@ -166,10 +210,17 @@ async fn handle_tenant_location_config(
 
 async fn handle_tenant_config_set(
     service: Arc<Service>,
-    mut req: Request<Body>,
+    req: Request<Body>,
 ) -> Result<Response<Body>, ApiError> {
     check_permissions(&req, Scope::PageServerApi)?;
 
+    let mut req = match maybe_forward(req).await {
+        ForwardOutcome::Forwarded(res) => {
+            return res;
+        }
+        ForwardOutcome::NotForwarded(req) => req,
+    };
+
     let config_req = json_request::<TenantConfigRequest>(&mut req).await?;
 
     json_response(StatusCode::OK, service.tenant_config_set(config_req).await?)
@@ -182,16 +233,30 @@ async fn handle_tenant_config_get(
     let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
     check_permissions(&req, Scope::PageServerApi)?;
 
+    match maybe_forward(req).await {
+        ForwardOutcome::Forwarded(res) => {
+            return res;
+        }
+        ForwardOutcome::NotForwarded(_req) => {}
+    };
+
     json_response(StatusCode::OK, service.tenant_config_get(tenant_id)?)
 }
 
 async fn handle_tenant_time_travel_remote_storage(
     service: Arc<Service>,
-    mut req: Request<Body>,
+    req: Request<Body>,
 ) -> Result<Response<Body>, ApiError> {
     let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
     check_permissions(&req, Scope::PageServerApi)?;
 
+    let mut req = match maybe_forward(req).await {
+        ForwardOutcome::Forwarded(res) => {
+            return res;
+        }
+        ForwardOutcome::NotForwarded(req) => req,
+    };
+
     let time_travel_req = json_request::<TenantTimeTravelRequest>(&mut req).await?;
 
     let timestamp_raw = must_get_query_param(&req, "travel_to")?;
@@ -232,6 +297,13 @@ async fn handle_tenant_secondary_download(
     let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
     let wait = parse_query_param(&req, "wait_ms")?.map(Duration::from_millis);
 
+    match maybe_forward(req).await {
+        ForwardOutcome::Forwarded(res) => {
+            return res;
+        }
+        ForwardOutcome::NotForwarded(_req) => {}
+    };
+
     let (status, progress) = service.tenant_secondary_download(tenant_id, wait).await?;
     json_response(map_reqwest_hyper_status(status)?, progress)
 }
@@ -243,6 +315,13 @@ async fn handle_tenant_delete(
     let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
     check_permissions(&req, Scope::PageServerApi)?;
 
+    match maybe_forward(req).await {
+        ForwardOutcome::Forwarded(res) => {
+            return res;
+        }
+        ForwardOutcome::NotForwarded(_req) => {}
+    };
+
     let status_code = service
         .tenant_delete(tenant_id)
         .await
@@ -258,11 +337,18 @@ async fn handle_tenant_delete(
 
 async fn handle_tenant_timeline_create(
     service: Arc<Service>,
-    mut req: Request<Body>,
+    req: Request<Body>,
 ) -> Result<Response<Body>, ApiError> {
     let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
     check_permissions(&req, Scope::PageServerApi)?;
 
+    let mut req = match maybe_forward(req).await {
+        ForwardOutcome::Forwarded(res) => {
+            return res;
+        }
+        ForwardOutcome::NotForwarded(req) => req,
+    };
+
     let create_req = json_request::<TimelineCreateRequest>(&mut req).await?;
     json_response(
         StatusCode::CREATED,
@@ -277,9 +363,16 @@ async fn handle_tenant_timeline_delete(
     req: Request<Body>,
 ) -> Result<Response<Body>, ApiError> {
     let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
+    let timeline_id: TimelineId = parse_request_param(&req, "timeline_id")?;
+
     check_permissions(&req, Scope::PageServerApi)?;
 
-    let timeline_id: TimelineId = parse_request_param(&req, "timeline_id")?;
+    match maybe_forward(req).await {
+        ForwardOutcome::Forwarded(res) => {
+            return res;
+        }
+        ForwardOutcome::NotForwarded(_req) => {}
+    };
 
     // For timeline deletions, which both implement an "initially return 202, then 404 once
     // we're done" semantic, we wrap with a retry loop to expose a simpler API upstream.
@@ -337,12 +430,19 @@ async fn handle_tenant_timeline_delete(
 
 async fn handle_tenant_timeline_archival_config(
     service: Arc<Service>,
-    mut req: Request<Body>,
+    req: Request<Body>,
 ) -> Result<Response<Body>, ApiError> {
     let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
+    let timeline_id: TimelineId = parse_request_param(&req, "timeline_id")?;
+
     check_permissions(&req, Scope::PageServerApi)?;
 
-    let timeline_id: TimelineId = parse_request_param(&req, "timeline_id")?;
+    let mut req = match maybe_forward(req).await {
+        ForwardOutcome::Forwarded(res) => {
+            return res;
+        }
+        ForwardOutcome::NotForwarded(req) => req,
+    };
 
     let create_req = json_request::<TimelineArchivalConfigRequest>(&mut req).await?;
 
@@ -358,9 +458,16 @@ async fn handle_tenant_timeline_detach_ancestor(
     req: Request<Body>,
 ) -> Result<Response<Body>, ApiError> {
     let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
+    let timeline_id: TimelineId = parse_request_param(&req, "timeline_id")?;
+
     check_permissions(&req, Scope::PageServerApi)?;
 
-    let timeline_id: TimelineId = parse_request_param(&req, "timeline_id")?;
+    match maybe_forward(req).await {
+        ForwardOutcome::Forwarded(res) => {
+            return res;
+        }
+        ForwardOutcome::NotForwarded(_req) => {}
+    };
 
     let res = service
         .tenant_timeline_detach_ancestor(tenant_id, timeline_id)
@@ -393,6 +500,13 @@ async fn handle_tenant_timeline_passthrough(
     let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
     check_permissions(&req, Scope::PageServerApi)?;
 
+    let req = match maybe_forward(req).await {
+        ForwardOutcome::Forwarded(res) => {
+            return res;
+        }
+        ForwardOutcome::NotForwarded(req) => req,
+    };
+
     let Some(path) = req.uri().path_and_query() else {
         // This should never happen, our request router only calls us if there is a path
         return Err(ApiError::BadRequest(anyhow::anyhow!("Missing path")));
@@ -460,9 +574,17 @@ async fn handle_tenant_locate(
     service: Arc<Service>,
     req: Request<Body>,
 ) -> Result<Response<Body>, ApiError> {
+    let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
+
     check_permissions(&req, Scope::Admin)?;
 
-    let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
+    match maybe_forward(req).await {
+        ForwardOutcome::Forwarded(res) => {
+            return res;
+        }
+        ForwardOutcome::NotForwarded(_req) => {}
+    };
+
     json_response(StatusCode::OK, service.tenant_locate(tenant_id)?)
 }
 
@@ -473,6 +595,14 @@ async fn handle_tenant_describe(
     check_permissions(&req, Scope::Scrubber)?;
 
     let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
+
+    match maybe_forward(req).await {
+        ForwardOutcome::Forwarded(res) => {
+            return res;
+        }
+        ForwardOutcome::NotForwarded(_req) => {}
+    };
+
     json_response(StatusCode::OK, service.tenant_describe(tenant_id)?)
 }
 
@@ -482,12 +612,26 @@ async fn handle_tenant_list(
 ) -> Result<Response<Body>, ApiError> {
     check_permissions(&req, Scope::Admin)?;
 
+    match maybe_forward(req).await {
+        ForwardOutcome::Forwarded(res) => {
+            return res;
+        }
+        ForwardOutcome::NotForwarded(_req) => {}
+    };
+
     json_response(StatusCode::OK, service.tenant_list())
 }
 
-async fn handle_node_register(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
+async fn handle_node_register(req: Request<Body>) -> Result<Response<Body>, ApiError> {
     check_permissions(&req, Scope::Admin)?;
 
+    let mut req = match maybe_forward(req).await {
+        ForwardOutcome::Forwarded(res) => {
+            return res;
+        }
+        ForwardOutcome::NotForwarded(req) => req,
+    };
+
     let register_req = json_request::<NodeRegisterRequest>(&mut req).await?;
     let state = get_state(&req);
     state.service.node_register(register_req).await?;
@@ -497,6 +641,13 @@ async fn handle_node_register(mut req: Request<Body>) -> Result<Response<Body>,
 async fn handle_node_list(req: Request<Body>) -> Result<Response<Body>, ApiError> {
     check_permissions(&req, Scope::Admin)?;
 
+    let req = match maybe_forward(req).await {
+        ForwardOutcome::Forwarded(res) => {
+            return res;
+        }
+        ForwardOutcome::NotForwarded(req) => req,
+    };
+
     let state = get_state(&req);
     let nodes = state.service.node_list().await?;
     let api_nodes = nodes.into_iter().map(|n| n.describe()).collect::<Vec<_>>();
@@ -507,6 +658,13 @@ async fn handle_node_list(req: Request<Body>) -> Result<Response<Body>, ApiError
 async fn handle_node_drop(req: Request<Body>) -> Result<Response<Body>, ApiError> {
     check_permissions(&req, Scope::Admin)?;
 
+    let req = match maybe_forward(req).await {
+        ForwardOutcome::Forwarded(res) => {
+            return res;
+        }
+        ForwardOutcome::NotForwarded(req) => req,
+    };
+
     let state = get_state(&req);
     let node_id: NodeId = parse_request_param(&req, "node_id")?;
     json_response(StatusCode::OK, state.service.node_drop(node_id).await?)
@@ -515,14 +673,28 @@ async fn handle_node_drop(req: Request<Body>) -> Result<Response<Body>, ApiError
 async fn handle_node_delete(req: Request<Body>) -> Result<Response<Body>, ApiError> {
     check_permissions(&req, Scope::Admin)?;
 
+    let req = match maybe_forward(req).await {
+        ForwardOutcome::Forwarded(res) => {
+            return res;
+        }
+        ForwardOutcome::NotForwarded(req) => req,
+    };
+
     let state = get_state(&req);
     let node_id: NodeId = parse_request_param(&req, "node_id")?;
     json_response(StatusCode::OK, state.service.node_delete(node_id).await?)
 }
 
-async fn handle_node_configure(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
+async fn handle_node_configure(req: Request<Body>) -> Result<Response<Body>, ApiError> {
     check_permissions(&req, Scope::Admin)?;
 
+    let mut req = match maybe_forward(req).await {
+        ForwardOutcome::Forwarded(res) => {
+            return res;
+        }
+        ForwardOutcome::NotForwarded(req) => req,
+    };
+
     let node_id: NodeId = parse_request_param(&req, "node_id")?;
     let config_req = json_request::<NodeConfigureRequest>(&mut req).await?;
     if node_id != config_req.node_id {
@@ -548,6 +720,13 @@ async fn handle_node_configure(mut req: Request<Body>) -> Result<Response<Body>,
 async fn handle_node_status(req: Request<Body>) -> Result<Response<Body>, ApiError> {
     check_permissions(&req, Scope::Admin)?;
 
+    let req = match maybe_forward(req).await {
+        ForwardOutcome::Forwarded(res) => {
+            return res;
+        }
+        ForwardOutcome::NotForwarded(req) => req,
+    };
+
     let state = get_state(&req);
     let node_id: NodeId = parse_request_param(&req, "node_id")?;
 
@@ -570,6 +749,13 @@ async fn handle_node_shards(req: Request<Body>) -> Result<Response<Body>, ApiErr
 async fn handle_get_leader(req: Request<Body>) -> Result<Response<Body>, ApiError> {
     check_permissions(&req, Scope::Admin)?;
 
+    let req = match maybe_forward(req).await {
+        ForwardOutcome::Forwarded(res) => {
+            return res;
+        }
+        ForwardOutcome::NotForwarded(req) => req,
+    };
+
     let state = get_state(&req);
     let leader = state.service.get_leader().await.map_err(|err| {
         ApiError::InternalServerError(anyhow::anyhow!(
@@ -583,6 +769,13 @@ async fn handle_get_leader(req: Request<Body>) -> Result<Response<Body>, ApiErro
 async fn handle_node_drain(req: Request<Body>) -> Result<Response<Body>, ApiError> {
     check_permissions(&req, Scope::Admin)?;
 
+    let req = match maybe_forward(req).await {
+        ForwardOutcome::Forwarded(res) => {
+            return res;
+        }
+        ForwardOutcome::NotForwarded(req) => req,
+    };
+
     let state = get_state(&req);
     let node_id: NodeId = parse_request_param(&req, "node_id")?;
 
@@ -594,6 +787,13 @@ async fn handle_node_drain(req: Request<Body>) -> Result<Response<Body>, ApiErro
 async fn handle_cancel_node_drain(req: Request<Body>) -> Result<Response<Body>, ApiError> {
     check_permissions(&req, Scope::Admin)?;
 
+    let req = match maybe_forward(req).await {
+        ForwardOutcome::Forwarded(res) => {
+            return res;
+        }
+        ForwardOutcome::NotForwarded(req) => req,
+    };
+
     let state = get_state(&req);
     let node_id: NodeId = parse_request_param(&req, "node_id")?;
 
@@ -605,6 +805,13 @@ async fn handle_cancel_node_drain(req: Request<Body>) -> Result<Response<Body>,
 async fn handle_node_fill(req: Request<Body>) -> Result<Response<Body>, ApiError> {
     check_permissions(&req, Scope::Admin)?;
 
+    let req = match maybe_forward(req).await {
+        ForwardOutcome::Forwarded(res) => {
+            return res;
+        }
+        ForwardOutcome::NotForwarded(req) => req,
+    };
+
     let state = get_state(&req);
     let node_id: NodeId = parse_request_param(&req, "node_id")?;
 
@@ -616,6 +823,13 @@ async fn handle_node_fill(req: Request<Body>) -> Result<Response<Body>, ApiError
 async fn handle_cancel_node_fill(req: Request<Body>) -> Result<Response<Body>, ApiError> {
     check_permissions(&req, Scope::Admin)?;
 
+    let req = match maybe_forward(req).await {
+        ForwardOutcome::Forwarded(res) => {
+            return res;
+        }
+        ForwardOutcome::NotForwarded(req) => req,
+    };
+
     let state = get_state(&req);
     let node_id: NodeId = parse_request_param(&req, "node_id")?;
 
@@ -624,9 +838,16 @@ async fn handle_cancel_node_fill(req: Request<Body>) -> Result<Response<Body>, A
     json_response(StatusCode::ACCEPTED, ())
 }
 
-async fn handle_metadata_health_update(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
+async fn handle_metadata_health_update(req: Request<Body>) -> Result<Response<Body>, ApiError> {
     check_permissions(&req, Scope::Scrubber)?;
 
+    let mut req = match maybe_forward(req).await {
+        ForwardOutcome::Forwarded(res) => {
+            return res;
+        }
+        ForwardOutcome::NotForwarded(req) => req,
+    };
+
     let update_req = json_request::<MetadataHealthUpdateRequest>(&mut req).await?;
     let state = get_state(&req);
 
@@ -640,6 +861,13 @@ async fn handle_metadata_health_list_unhealthy(
 ) -> Result<Response<Body>, ApiError> {
     check_permissions(&req, Scope::Admin)?;
 
+    let req = match maybe_forward(req).await {
+        ForwardOutcome::Forwarded(res) => {
+            return res;
+        }
+        ForwardOutcome::NotForwarded(req) => req,
+    };
+
     let state = get_state(&req);
     let unhealthy_tenant_shards = state.service.metadata_health_list_unhealthy().await?;
 
@@ -652,10 +880,17 @@ async fn handle_metadata_health_list_unhealthy(
 }
 
 async fn handle_metadata_health_list_outdated(
-    mut req: Request<Body>,
+    req: Request<Body>,
 ) -> Result<Response<Body>, ApiError> {
     check_permissions(&req, Scope::Admin)?;
 
+    let mut req = match maybe_forward(req).await {
+        ForwardOutcome::Forwarded(res) => {
+            return res;
+        }
+        ForwardOutcome::NotForwarded(req) => req,
+    };
+
     let list_outdated_req = json_request::<MetadataHealthListOutdatedRequest>(&mut req).await?;
     let state = get_state(&req);
     let health_records = state
@@ -671,10 +906,17 @@ async fn handle_metadata_health_list_outdated(
 
 async fn handle_tenant_shard_split(
     service: Arc<Service>,
-    mut req: Request<Body>,
+    req: Request<Body>,
 ) -> Result<Response<Body>, ApiError> {
     check_permissions(&req, Scope::Admin)?;
 
+    let mut req = match maybe_forward(req).await {
+        ForwardOutcome::Forwarded(res) => {
+            return res;
+        }
+        ForwardOutcome::NotForwarded(req) => req,
+    };
+
     let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
     let split_req = json_request::<TenantShardSplitRequest>(&mut req).await?;
 
@@ -686,10 +928,17 @@ async fn handle_tenant_shard_split(
 
 async fn handle_tenant_shard_migrate(
     service: Arc<Service>,
-    mut req: Request<Body>,
+    req: Request<Body>,
 ) -> Result<Response<Body>, ApiError> {
     check_permissions(&req, Scope::Admin)?;
 
+    let mut req = match maybe_forward(req).await {
+        ForwardOutcome::Forwarded(res) => {
+            return res;
+        }
+        ForwardOutcome::NotForwarded(req) => req,
+    };
+
     let tenant_shard_id: TenantShardId = parse_request_param(&req, "tenant_shard_id")?;
     let migrate_req = json_request::<TenantShardMigrateRequest>(&mut req).await?;
     json_response(
@@ -700,9 +949,16 @@ async fn handle_tenant_shard_migrate(
     )
 }
 
-async fn handle_tenant_update_policy(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
+async fn handle_tenant_update_policy(req: Request<Body>) -> Result<Response<Body>, ApiError> {
     check_permissions(&req, Scope::Admin)?;
 
+    let mut req = match maybe_forward(req).await {
+        ForwardOutcome::Forwarded(res) => {
+            return res;
+        }
+        ForwardOutcome::NotForwarded(req) => req,
+    };
+
     let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
     let update_req = json_request::<TenantPolicyRequest>(&mut req).await?;
     let state = get_state(&req);
@@ -716,9 +972,16 @@ async fn handle_tenant_update_policy(mut req: Request<Body>) -> Result<Response<
     )
 }
 
-async fn handle_update_preferred_azs(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
+async fn handle_update_preferred_azs(req: Request<Body>) -> Result<Response<Body>, ApiError> {
     check_permissions(&req, Scope::Admin)?;
 
+    let mut req = match maybe_forward(req).await {
+        ForwardOutcome::Forwarded(res) => {
+            return res;
+        }
+        ForwardOutcome::NotForwarded(req) => req,
+    };
+
     let azs_req = json_request::<ShardsPreferredAzsRequest>(&mut req).await?;
     let state = get_state(&req);
 
@@ -731,23 +994,46 @@ async fn handle_update_preferred_azs(mut req: Request<Body>) -> Result<Response<
 async fn handle_step_down(req: Request<Body>) -> Result<Response<Body>, ApiError> {
     check_permissions(&req, Scope::Admin)?;
 
+    let req = match maybe_forward(req).await {
+        ForwardOutcome::Forwarded(res) => {
+            return res;
+        }
+        ForwardOutcome::NotForwarded(req) => req,
+    };
+
     let state = get_state(&req);
     json_response(StatusCode::OK, state.service.step_down().await)
 }
 
 async fn handle_tenant_drop(req: Request<Body>) -> Result<Response<Body>, ApiError> {
-    let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
     check_permissions(&req, Scope::PageServerApi)?;
 
+    let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
+
+    let req = match maybe_forward(req).await {
+        ForwardOutcome::Forwarded(res) => {
+            return res;
+        }
+        ForwardOutcome::NotForwarded(req) => req,
+    };
+
     let state = get_state(&req);
 
     json_response(StatusCode::OK, state.service.tenant_drop(tenant_id).await?)
 }
 
 async fn handle_tenant_import(req: Request<Body>) -> Result<Response<Body>, ApiError> {
-    let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
     check_permissions(&req, Scope::PageServerApi)?;
 
+    let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
+
+    let req = match maybe_forward(req).await {
+        ForwardOutcome::Forwarded(res) => {
+            return res;
+        }
+        ForwardOutcome::NotForwarded(req) => req,
+    };
+
     let state = get_state(&req);
 
     json_response(
@@ -759,6 +1045,13 @@ async fn handle_tenant_import(req: Request<Body>) -> Result<Response<Body>, ApiE
 async fn handle_tenants_dump(req: Request<Body>) -> Result<Response<Body>, ApiError> {
     check_permissions(&req, Scope::Admin)?;
 
+    let req = match maybe_forward(req).await {
+        ForwardOutcome::Forwarded(res) => {
+            return res;
+        }
+        ForwardOutcome::NotForwarded(req) => req,
+    };
+
     let state = get_state(&req);
     state.service.tenants_dump()
 }
@@ -766,6 +1059,13 @@ async fn handle_tenants_dump(req: Request<Body>) -> Result<Response<Body>, ApiEr
 async fn handle_scheduler_dump(req: Request<Body>) -> Result<Response<Body>, ApiError> {
     check_permissions(&req, Scope::Admin)?;
 
+    let req = match maybe_forward(req).await {
+        ForwardOutcome::Forwarded(res) => {
+            return res;
+        }
+        ForwardOutcome::NotForwarded(req) => req,
+    };
+
     let state = get_state(&req);
     state.service.scheduler_dump()
 }
@@ -773,6 +1073,13 @@ async fn handle_scheduler_dump(req: Request<Body>) -> Result<Response<Body>, Api
 async fn handle_consistency_check(req: Request<Body>) -> Result<Response<Body>, ApiError> {
     check_permissions(&req, Scope::Admin)?;
 
+    let req = match maybe_forward(req).await {
+        ForwardOutcome::Forwarded(res) => {
+            return res;
+        }
+        ForwardOutcome::NotForwarded(req) => req,
+    };
+
     let state = get_state(&req);
 
     json_response(StatusCode::OK, state.service.consistency_check().await?)
@@ -781,19 +1088,40 @@ async fn handle_consistency_check(req: Request<Body>) -> Result<Response<Body>,
 async fn handle_reconcile_all(req: Request<Body>) -> Result<Response<Body>, ApiError> {
     check_permissions(&req, Scope::Admin)?;
 
+    let req = match maybe_forward(req).await {
+        ForwardOutcome::Forwarded(res) => {
+            return res;
+        }
+        ForwardOutcome::NotForwarded(req) => req,
+    };
+
     let state = get_state(&req);
 
     json_response(StatusCode::OK, state.service.reconcile_all_now().await?)
 }
 
 /// Status endpoint is just used for checking that our HTTP listener is up
-async fn handle_status(_req: Request<Body>) -> Result<Response<Body>, ApiError> {
+async fn handle_status(req: Request<Body>) -> Result<Response<Body>, ApiError> {
+    match maybe_forward(req).await {
+        ForwardOutcome::Forwarded(res) => {
+            return res;
+        }
+        ForwardOutcome::NotForwarded(_req) => {}
+    };
+
     json_response(StatusCode::OK, ())
 }
 
 /// Readiness endpoint indicates when we're done doing startup I/O (e.g. reconciling
 /// with remote pageserver nodes).  This is intended for use as a kubernetes readiness probe.
 async fn handle_ready(req: Request<Body>) -> Result<Response<Body>, ApiError> {
+    let req = match maybe_forward(req).await {
+        ForwardOutcome::Forwarded(res) => {
+            return res;
+        }
+        ForwardOutcome::NotForwarded(req) => req,
+    };
+
     let state = get_state(&req);
     if state.service.startup_complete.is_ready() {
         json_response(StatusCode::OK, ())
@@ -816,6 +1144,13 @@ async fn handle_get_safekeeper(req: Request<Body>) -> Result<Response<Body>, Api
 
     let id = parse_request_param::<i64>(&req, "id")?;
 
+    let req = match maybe_forward(req).await {
+        ForwardOutcome::Forwarded(res) => {
+            return res;
+        }
+        ForwardOutcome::NotForwarded(req) => req,
+    };
+
     let state = get_state(&req);
 
     let res = state.service.get_safekeeper(id).await;
@@ -847,6 +1182,13 @@ async fn handle_upsert_safekeeper(mut req: Request<Body>) -> Result<Response<Bod
         )));
     }
 
+    let req = match maybe_forward(req).await {
+        ForwardOutcome::Forwarded(res) => {
+            return res;
+        }
+        ForwardOutcome::NotForwarded(req) => req,
+    };
+
     let state = get_state(&req);
 
     state.service.upsert_safekeeper(body).await?;
@@ -925,10 +1267,7 @@ pub fn prologue_leadership_status_check_middleware<
 
         let allowed_routes = match leadership_status {
             LeadershipStatus::Leader => AllowedRoutes::All,
-            LeadershipStatus::SteppedDown => {
-                // TODO: does it make sense to allow /status here?
-                AllowedRoutes::Some(["/control/v1/step_down", "/status", "/metrics"].to_vec())
-            }
+            LeadershipStatus::SteppedDown => AllowedRoutes::All,
             LeadershipStatus::Candidate => {
                 AllowedRoutes::Some(["/ready", "/status", "/metrics"].to_vec())
             }
@@ -1005,6 +1344,13 @@ fn epilogue_metrics_middleware<B: hyper::body::HttpBody + Send + Sync + 'static>
 pub async fn measured_metrics_handler(req: Request<Body>) -> Result<Response<Body>, ApiError> {
     pub const TEXT_FORMAT: &str = "text/plain; version=0.0.4";
 
+    let req = match maybe_forward(req).await {
+        ForwardOutcome::Forwarded(res) => {
+            return res;
+        }
+        ForwardOutcome::NotForwarded(req) => req,
+    };
+
     let state = get_state(&req);
     let payload = crate::metrics::METRICS_REGISTRY.encode(&state.neon_metrics);
     let response = Response::builder()
@@ -1032,6 +1378,220 @@ where
     request_span(request, handler).await
 }
 
+enum ForwardOutcome {
+    Forwarded(Result<Response<Body>, ApiError>),
+    NotForwarded(Request<Body>),
+}
+
+/// Potentially forward the request to the current storage controler leader.
+/// More specifically we forward when:
+/// 1. Request is not one of ["/control/v1/step_down", "/status", "/ready", "/metrics"]
+/// 2. Current instance is in [`LeadershipStatus::SteppedDown`] state
+/// 3. There is a leader in the database to forward to
+/// 4. Leader from step (3) is not the current instance
+///
+/// Why forward?
+/// It turns out that we can't rely on external orchestration to promptly route trafic to the
+/// new leader. This is downtime inducing. Forwarding provides a safe way out.
+///
+/// Why is it safe?
+/// If a storcon instance is persisted in the database, then we know that it is the current leader.
+/// There's one exception: time between handling step-down request and the new leader updating the
+/// database.
+///
+/// Let's treat the happy case first. The stepped down node does not produce any side effects,
+/// since all request handling happens on the leader.
+///
+/// As for the edge case, we are guaranteed to always have a maximum of two running instances.
+/// Hence, if we are in the edge case scenario the leader persisted in the database is the
+/// stepped down instance that received the request. Condition (4) above covers this scenario.
+async fn maybe_forward(req: Request<Body>) -> ForwardOutcome {
+    const NOT_FOR_FORWARD: [&str; 4] = ["/control/v1/step_down", "/status", "/ready", "/metrics"];
+
+    let uri = req.uri().to_string();
+    let uri_for_forward = !NOT_FOR_FORWARD.contains(&uri.as_str());
+
+    let state = get_state(&req);
+    let leadership_status = state.service.get_leadership_status();
+
+    if leadership_status != LeadershipStatus::SteppedDown || !uri_for_forward {
+        return ForwardOutcome::NotForwarded(req);
+    }
+
+    let leader = state.service.get_leader().await;
+    let leader = {
+        match leader {
+            Ok(Some(leader)) => leader,
+            Ok(None) => {
+                return ForwardOutcome::Forwarded(Err(ApiError::ResourceUnavailable(
+                    "No leader to forward to while in stepped down state".into(),
+                )));
+            }
+            Err(err) => {
+                return ForwardOutcome::Forwarded(Err(ApiError::InternalServerError(
+                    anyhow::anyhow!(
+                        "Failed to get leader for forwarding while in stepped down state: {err}"
+                    ),
+                )));
+            }
+        }
+    };
+
+    let cfg = state.service.get_config();
+    if let Some(ref self_addr) = cfg.address_for_peers {
+        let leader_addr = match Uri::from_str(leader.address.as_str()) {
+            Ok(uri) => uri,
+            Err(err) => {
+                return ForwardOutcome::Forwarded(Err(ApiError::InternalServerError(
+                    anyhow::anyhow!(
+                    "Failed to parse leader uri for forwarding while in stepped down state: {err}"
+                ),
+                )));
+            }
+        };
+
+        if *self_addr == leader_addr {
+            return ForwardOutcome::Forwarded(Err(ApiError::InternalServerError(anyhow::anyhow!(
+                "Leader is stepped down instance"
+            ))));
+        }
+    }
+
+    tracing::info!("Forwarding {} to leader at {}", uri, leader.address);
+
+    // Use [`RECONCILE_TIMEOUT`] as the max amount of time a request should block for and
+    // include some leeway to get the timeout for proxied requests.
+    const PROXIED_REQUEST_TIMEOUT: Duration = Duration::from_secs(RECONCILE_TIMEOUT.as_secs() + 10);
+    let client = reqwest::ClientBuilder::new()
+        .timeout(PROXIED_REQUEST_TIMEOUT)
+        .build();
+    let client = match client {
+        Ok(client) => client,
+        Err(err) => {
+            return ForwardOutcome::Forwarded(Err(ApiError::InternalServerError(anyhow::anyhow!(
+                "Failed to build leader client for forwarding while in stepped down state: {err}"
+            ))));
+        }
+    };
+
+    let request: reqwest::Request = match convert_request(req, &client, leader.address).await {
+        Ok(r) => r,
+        Err(err) => {
+            return ForwardOutcome::Forwarded(Err(ApiError::InternalServerError(anyhow::anyhow!(
+                "Failed to convert request for forwarding while in stepped down state: {err}"
+            ))));
+        }
+    };
+
+    let response = match client.execute(request).await {
+        Ok(r) => r,
+        Err(err) => {
+            return ForwardOutcome::Forwarded(Err(ApiError::InternalServerError(anyhow::anyhow!(
+                "Failed to forward while in stepped down state: {err}"
+            ))));
+        }
+    };
+
+    ForwardOutcome::Forwarded(convert_response(response).await)
+}
+
+/// Convert a [`reqwest::Response`] to a [hyper::Response`] by passing through
+/// a stable representation (string, bytes or integer)
+///
+/// Ideally, we would not have to do this since both types use the http crate
+/// under the hood. However, they use different versions of the crate and keeping
+/// second order dependencies in sync is difficult.
+async fn convert_response(resp: reqwest::Response) -> Result<hyper::Response<Body>, ApiError> {
+    use std::str::FromStr;
+
+    let mut builder = hyper::Response::builder().status(resp.status().as_u16());
+    for (key, value) in resp.headers().into_iter() {
+        let key = hyper::header::HeaderName::from_str(key.as_str()).map_err(|err| {
+            ApiError::InternalServerError(anyhow::anyhow!("Response conversion failed: {err}"))
+        })?;
+
+        let value = hyper::header::HeaderValue::from_bytes(value.as_bytes()).map_err(|err| {
+            ApiError::InternalServerError(anyhow::anyhow!("Response conversion failed: {err}"))
+        })?;
+
+        builder = builder.header(key, value);
+    }
+
+    let body = http::Body::wrap_stream(resp.bytes_stream());
+
+    builder.body(body).map_err(|err| {
+        ApiError::InternalServerError(anyhow::anyhow!("Response conversion failed: {err}"))
+    })
+}
+
+/// Convert a [`reqwest::Request`] to a [hyper::Request`] by passing through
+/// a stable representation (string, bytes or integer)
+///
+/// See [`convert_response`] for why we are doing it this way.
+async fn convert_request(
+    req: hyper::Request<Body>,
+    client: &reqwest::Client,
+    to_address: String,
+) -> Result<reqwest::Request, ApiError> {
+    use std::str::FromStr;
+
+    let (parts, body) = req.into_parts();
+    let method = reqwest::Method::from_str(parts.method.as_str()).map_err(|err| {
+        ApiError::InternalServerError(anyhow::anyhow!("Request conversion failed: {err}"))
+    })?;
+
+    let path_and_query = parts.uri.path_and_query().ok_or_else(|| {
+        ApiError::InternalServerError(anyhow::anyhow!(
+            "Request conversion failed: no path and query"
+        ))
+    })?;
+
+    let uri = reqwest::Url::from_str(
+        format!(
+            "{}{}",
+            to_address.trim_end_matches("/"),
+            path_and_query.as_str()
+        )
+        .as_str(),
+    )
+    .map_err(|err| {
+        ApiError::InternalServerError(anyhow::anyhow!("Request conversion failed: {err}"))
+    })?;
+
+    let mut headers = reqwest::header::HeaderMap::new();
+    for (key, value) in parts.headers.into_iter() {
+        let key = match key {
+            Some(k) => k,
+            None => {
+                continue;
+            }
+        };
+
+        let key = reqwest::header::HeaderName::from_str(key.as_str()).map_err(|err| {
+            ApiError::InternalServerError(anyhow::anyhow!("Request conversion failed: {err}"))
+        })?;
+
+        let value = reqwest::header::HeaderValue::from_bytes(value.as_bytes()).map_err(|err| {
+            ApiError::InternalServerError(anyhow::anyhow!("Request conversion failed: {err}"))
+        })?;
+
+        headers.insert(key, value);
+    }
+
+    let body = hyper::body::to_bytes(body).await.map_err(|err| {
+        ApiError::InternalServerError(anyhow::anyhow!("Request conversion failed: {err}"))
+    })?;
+
+    client
+        .request(method, uri)
+        .headers(headers)
+        .body(body)
+        .build()
+        .map_err(|err| {
+            ApiError::InternalServerError(anyhow::anyhow!("Request conversion failed: {err}"))
+        })
+}
+
 pub fn make_router(
     service: Arc<Service>,
     auth: Option<Arc<SwappableJwtAuth>>,
diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py
index 2e21f8fb46..2d72dbb2df 100644
--- a/test_runner/regress/test_storage_controller.py
+++ b/test_runner/regress/test_storage_controller.py
@@ -2048,8 +2048,11 @@ def test_storage_controller_step_down(neon_env_builder: NeonEnvBuilder):
     # Make a change to the tenant config to trigger a slow reconcile
     virtual_ps_http = PageserverHttpClient(env.storage_controller_port, lambda: True)
     virtual_ps_http.patch_tenant_config_client_side(tid, {"compaction_threshold": 5}, None)
-    env.storage_controller.allowed_errors.append(
-        ".*Accepted configuration update but reconciliation failed.*"
+    env.storage_controller.allowed_errors.extend(
+        [
+            ".*Accepted configuration update but reconciliation failed.*",
+            ".*Leader is stepped down instance",
+        ]
     )
 
     observed_state = env.storage_controller.step_down()
@@ -2072,9 +2075,9 @@ def test_storage_controller_step_down(neon_env_builder: NeonEnvBuilder):
     assert "compaction_threshold" in ps_tenant_conf.effective_config
     assert ps_tenant_conf.effective_config["compaction_threshold"] == 5
 
-    # Validate that the storcon is not replying to the usual requests
-    # once it has stepped down.
-    with pytest.raises(StorageControllerApiException, match="stepped_down"):
+    # Validate that the storcon attempts to forward the request, but stops.
+    # when it realises it is still the current leader.
+    with pytest.raises(StorageControllerApiException, match="Leader is stepped down instance"):
         env.storage_controller.tenant_list()
 
     # Validate that we can step down multiple times and the observed state
@@ -2221,6 +2224,15 @@ def test_storage_controller_leadership_transfer(
     env.storage_controller.wait_until_ready()
     env.storage_controller.consistency_check()
 
+    if not step_down_times_out:
+        # Check that the stepped down instance forwards requests
+        # to the new leader while it's still running.
+        storage_controller_proxy.route_to(f"http://127.0.0.1:{storage_controller_1_port}")
+        env.storage_controller.tenant_list()
+        env.storage_controller.node_configure(env.pageservers[0].id, {"scheduling": "Pause"})
+        status = env.storage_controller.node_status(env.pageservers[0].id)
+        assert status["scheduling"] == "Pause"
+
     if step_down_times_out:
         env.storage_controller.allowed_errors.extend(
             [

From cd4276fd656b9e917ee315240d957d344fc2cfe6 Mon Sep 17 00:00:00 2001
From: Alexander Bayandin <alexander@neon.tech>
Date: Tue, 17 Sep 2024 10:17:48 +0100
Subject: [PATCH 11/49] CI: fix release pipeline (#9017)

## Problem

We've got 2 non-blocking failures on the release pipeline:
- `promote-compatibility-data` job got skipped _presumably_ because one
of the dependencies of `deploy` job (`push-to-acr-dev`) got skipped
(https://github.com/neondatabase/neon/pull/8940)
- `coverage-report` job fails because we don't build debug artifacts in
the release branch (https://github.com/neondatabase/neon/pull/8561)

## Summary of changes
- Always run `push-to-acr-dev` / `push-to-acr-prod` jobs, but add
`skip_if` parameter to the reusable workflow, which can skip the job
internally, without skipping externally
- Do not run `coverage-report` on release branches
---
 .github/workflows/_push-to-acr.yml   | 8 +++++++-
 .github/workflows/build_and_test.yml | 7 ++++---
 2 files changed, 11 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/_push-to-acr.yml b/.github/workflows/_push-to-acr.yml
index 415b3d9cc6..7b6eba2c06 100644
--- a/.github/workflows/_push-to-acr.yml
+++ b/.github/workflows/_push-to-acr.yml
@@ -26,9 +26,15 @@ on:
         description: Azure tenant ID
         required: true
         type: string
+      skip_if:
+        description: Skip the job if this expression is true
+        required: true
+        type: boolean
 
 jobs:
   push-to-acr:
+    if: ${{ !inputs.skip_if }}
+
     runs-on: ubuntu-22.04
     permissions:
       contents: read  # This is required for actions/checkout
@@ -52,5 +58,5 @@ jobs:
           for image in ${images}; do
             docker buildx imagetools create \
               -t ${{ inputs.registry_name }}.azurecr.io/neondatabase/${image}:${{ inputs.image_tag }} \
-                                        neondatabase/${image}:${{ inputs.image_tag }}
+                                                        neondatabase/${image}:${{ inputs.image_tag }}
           done
diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml
index 7c06fd9ab8..7ddd624dd5 100644
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -357,6 +357,7 @@ jobs:
             })
 
   coverage-report:
+    if: ${{ !startsWith(github.ref_name, 'release') }}
     needs: [ check-permissions, build-build-tools-image, build-and-test-locally ]
     runs-on: [ self-hosted, small ]
     container:
@@ -858,7 +859,6 @@ jobs:
           done
 
   push-to-acr-dev:
-    if: github.ref_name == 'main'
     needs: [ tag, promote-images ]
     uses: ./.github/workflows/_push-to-acr.yml
     with:
@@ -868,9 +868,9 @@ jobs:
       registry_name: ${{ vars.AZURE_DEV_REGISTRY_NAME }}
       subscription_id: ${{ vars.AZURE_DEV_SUBSCRIPTION_ID }}
       tenant_id: ${{ vars.AZURE_TENANT_ID }}
+      skip_if: ${{ github.ref_name != 'main' }}
 
   push-to-acr-prod:
-    if: github.ref_name == 'release'|| github.ref_name == 'release-proxy'
     needs: [ tag, promote-images ]
     uses: ./.github/workflows/_push-to-acr.yml
     with:
@@ -880,6 +880,7 @@ jobs:
       registry_name: ${{ vars.AZURE_PROD_REGISTRY_NAME }}
       subscription_id: ${{ vars.AZURE_PROD_SUBSCRIPTION_ID }}
       tenant_id: ${{ vars.AZURE_TENANT_ID }}
+      skip_if: ${{ !startsWith(github.ref_name, 'release') }}
 
   trigger-custom-extensions-build-and-wait:
     needs: [ check-permissions, tag ]
@@ -957,7 +958,7 @@ jobs:
 
   deploy:
     needs: [ check-permissions, promote-images, tag, build-and-test-locally, trigger-custom-extensions-build-and-wait, push-to-acr-dev, push-to-acr-prod ]
-    if: (github.ref_name == 'main' || github.ref_name == 'release' || github.ref_name == 'release-proxy') && !failure() && !cancelled()
+    if: github.ref_name == 'main' || github.ref_name == 'release' || github.ref_name == 'release-proxy'
 
     runs-on: [ self-hosted, small ]
     container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:latest

From d211f00f054b80515684ff0887b76ecec60cc796 Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki@neon.tech>
Date: Tue, 17 Sep 2024 17:55:45 +0300
Subject: [PATCH 12/49] Remove unnecessary dependencies (#9000)

Found by "cargo machete"
---
 Cargo.lock                             | 144 ++-----------------------
 compute_tools/Cargo.toml               |   3 -
 control_plane/Cargo.toml               |   5 -
 control_plane/storcon_cli/Cargo.toml   |   3 -
 libs/compute_api/Cargo.toml            |   1 -
 libs/consumption_metrics/Cargo.toml    |   3 -
 libs/desim/Cargo.toml                  |   1 -
 libs/postgres_backend/Cargo.toml       |   2 -
 libs/postgres_ffi/Cargo.toml           |   3 -
 libs/postgres_ffi/wal_craft/Cargo.toml |   1 -
 libs/pq_proto/Cargo.toml               |   2 -
 libs/remote_storage/Cargo.toml         |   3 -
 libs/safekeeper_api/Cargo.toml         |   1 -
 libs/tracing-utils/Cargo.toml          |   5 +-
 libs/utils/Cargo.toml                  |   1 -
 libs/vm_monitor/Cargo.toml             |   2 -
 pageserver/Cargo.toml                  |   9 --
 pageserver/compaction/Cargo.toml       |  22 ----
 pageserver/ctl/Cargo.toml              |   2 -
 proxy/Cargo.toml                       |   7 --
 safekeeper/Cargo.toml                  |   5 -
 storage_broker/Cargo.toml              |   2 -
 storage_controller/Cargo.toml          |   2 -
 storage_controller/client/Cargo.toml   |  11 --
 storage_scrubber/Cargo.toml            |   8 --
 workspace_hack/Cargo.toml              |   3 +
 26 files changed, 15 insertions(+), 236 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index 3ca6acbc3e..136f07956f 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1209,7 +1209,6 @@ dependencies = [
  "remote_storage",
  "serde",
  "serde_json",
- "serde_with",
  "utils",
 ]
 
@@ -1218,7 +1217,6 @@ name = "compute_tools"
 version = "0.1.0"
 dependencies = [
  "anyhow",
- "async-compression",
  "bytes",
  "cfg-if",
  "chrono",
@@ -1237,7 +1235,6 @@ dependencies = [
  "reqwest 0.12.4",
  "rlimit",
  "rust-ini",
- "serde",
  "serde_json",
  "signal-hook",
  "tar",
@@ -1246,7 +1243,6 @@ dependencies = [
  "tokio-postgres",
  "tokio-stream",
  "tokio-util",
- "toml_edit",
  "tracing",
  "tracing-opentelemetry",
  "tracing-subscriber",
@@ -1317,12 +1313,9 @@ dependencies = [
 name = "consumption_metrics"
 version = "0.1.0"
 dependencies = [
- "anyhow",
  "chrono",
  "rand 0.8.5",
  "serde",
- "serde_with",
- "utils",
 ]
 
 [[package]]
@@ -1334,9 +1327,7 @@ dependencies = [
  "clap",
  "comfy-table",
  "compute_api",
- "futures",
  "git-version",
- "hex",
  "humantime",
  "humantime-serde",
  "hyper 0.14.26",
@@ -1344,7 +1335,6 @@ dependencies = [
  "once_cell",
  "pageserver_api",
  "pageserver_client",
- "postgres",
  "postgres_backend",
  "postgres_connection",
  "regex",
@@ -1353,9 +1343,7 @@ dependencies = [
  "scopeguard",
  "serde",
  "serde_json",
- "serde_with",
  "storage_broker",
- "tar",
  "thiserror",
  "tokio",
  "tokio-postgres",
@@ -1663,7 +1651,6 @@ dependencies = [
  "hex",
  "parking_lot 0.12.1",
  "rand 0.8.5",
- "scopeguard",
  "smallvec",
  "tracing",
  "utils",
@@ -2233,24 +2220,22 @@ checksum = "4271d37baee1b8c7e4b708028c57d816cf9d2434acb33a549475f78c181f6253"
 
 [[package]]
 name = "git-version"
-version = "0.3.5"
+version = "0.3.9"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f6b0decc02f4636b9ccad390dcbe77b722a77efedfa393caf8379a51d5c61899"
+checksum = "1ad568aa3db0fcbc81f2f116137f263d7304f512a1209b35b85150d3ef88ad19"
 dependencies = [
  "git-version-macro",
- "proc-macro-hack",
 ]
 
 [[package]]
 name = "git-version-macro"
-version = "0.3.5"
+version = "0.3.9"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "fe69f1cbdb6e28af2bac214e943b99ce8a0a06b447d15d3e61161b0423139f3f"
+checksum = "53010ccb100b96a67bc32c0175f0ed1426b31b655d562898e57325f81c023ac0"
 dependencies = [
- "proc-macro-hack",
  "proc-macro2",
  "quote",
- "syn 1.0.109",
+ "syn 2.0.52",
 ]
 
 [[package]]
@@ -2744,19 +2729,6 @@ dependencies = [
  "libc",
 ]
 
-[[package]]
-name = "inotify"
-version = "0.10.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "fdd168d97690d0b8c412d6b6c10360277f4d7ee495c5d0d5d5fe0854923255cc"
-dependencies = [
- "bitflags 1.3.2",
- "futures-core",
- "inotify-sys",
- "libc",
- "tokio",
-]
-
 [[package]]
 name = "inotify-sys"
 version = "0.1.5"
@@ -3251,7 +3223,7 @@ dependencies = [
  "crossbeam-channel",
  "filetime",
  "fsevent-sys",
- "inotify 0.9.6",
+ "inotify",
  "kqueue",
  "libc",
  "log",
@@ -3642,7 +3614,6 @@ name = "pagectl"
 version = "0.1.0"
 dependencies = [
  "anyhow",
- "bytes",
  "camino",
  "clap",
  "git-version",
@@ -3651,7 +3622,6 @@ dependencies = [
  "pageserver_api",
  "postgres_ffi",
  "remote_storage",
- "serde",
  "serde_json",
  "svg_fmt",
  "thiserror",
@@ -3670,7 +3640,6 @@ dependencies = [
  "arc-swap",
  "async-compression",
  "async-stream",
- "async-trait",
  "bit_field",
  "byteorder",
  "bytes",
@@ -3678,16 +3647,13 @@ dependencies = [
  "camino-tempfile",
  "chrono",
  "clap",
- "const_format",
  "consumption_metrics",
  "crc32c",
  "criterion",
- "crossbeam-utils",
  "either",
  "enum-map",
  "enumset",
  "fail",
- "flate2",
  "futures",
  "git-version",
  "hex",
@@ -3726,13 +3692,9 @@ dependencies = [
  "serde_json",
  "serde_path_to_error",
  "serde_with",
- "signal-hook",
- "smallvec",
  "storage_broker",
  "strum",
  "strum_macros",
- "svg_fmt",
- "sync_wrapper",
  "sysinfo",
  "tenant_size_model",
  "thiserror",
@@ -3746,7 +3708,6 @@ dependencies = [
  "tokio-util",
  "toml_edit",
  "tracing",
- "twox-hash",
  "url",
  "utils",
  "walkdir",
@@ -3810,44 +3771,22 @@ name = "pageserver_compaction"
 version = "0.1.0"
 dependencies = [
  "anyhow",
- "async-compression",
  "async-stream",
- "byteorder",
- "bytes",
- "chrono",
  "clap",
- "const_format",
- "consumption_metrics",
  "criterion",
- "crossbeam-utils",
- "either",
- "fail",
- "flate2",
  "futures",
  "git-version",
- "hex",
  "hex-literal",
- "humantime",
- "humantime-serde",
  "itertools 0.10.5",
- "metrics",
  "once_cell",
  "pageserver_api",
  "pin-project-lite",
  "rand 0.8.5",
- "smallvec",
  "svg_fmt",
- "sync_wrapper",
- "thiserror",
  "tokio",
- "tokio-io-timeout",
- "tokio-util",
  "tracing",
- "tracing-error",
  "tracing-subscriber",
- "url",
  "utils",
- "walkdir",
  "workspace_hack",
 ]
 
@@ -4164,9 +4103,7 @@ name = "postgres_backend"
 version = "0.1.0"
 dependencies = [
  "anyhow",
- "async-trait",
  "bytes",
- "futures",
  "once_cell",
  "pq_proto",
  "rustls 0.22.4",
@@ -4199,16 +4136,13 @@ version = "0.1.0"
 dependencies = [
  "anyhow",
  "bindgen",
- "byteorder",
  "bytes",
  "crc32c",
  "env_logger",
- "hex",
  "log",
  "memoffset 0.8.0",
  "once_cell",
  "postgres",
- "rand 0.8.5",
  "regex",
  "serde",
  "thiserror",
@@ -4243,13 +4177,11 @@ dependencies = [
  "byteorder",
  "bytes",
  "itertools 0.10.5",
- "pin-project-lite",
  "postgres-protocol",
  "rand 0.8.5",
  "serde",
  "thiserror",
  "tokio",
- "tracing",
 ]
 
 [[package]]
@@ -4281,12 +4213,6 @@ dependencies = [
  "elliptic-curve 0.13.8",
 ]
 
-[[package]]
-name = "proc-macro-hack"
-version = "0.5.20+deprecated"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "dc375e1527247fe1a97d8b7156678dfe7c1af2fc075c9a4db3690ecd2a148068"
-
 [[package]]
 name = "proc-macro2"
 version = "1.0.78"
@@ -4405,7 +4331,6 @@ dependencies = [
  "aws-config",
  "aws-sdk-iam",
  "aws-sigv4",
- "aws-types",
  "base64 0.13.1",
  "bstr",
  "bytes",
@@ -4414,7 +4339,6 @@ dependencies = [
  "chrono",
  "clap",
  "consumption_metrics",
- "crossbeam-deque",
  "dashmap",
  "ecdsa 0.16.9",
  "env_logger",
@@ -4440,11 +4364,9 @@ dependencies = [
  "jose-jwa",
  "jose-jwk",
  "lasso",
- "md5",
  "measured",
  "metrics",
  "once_cell",
- "opentelemetry",
  "p256 0.13.2",
  "parking_lot 0.12.1",
  "parquet",
@@ -4465,7 +4387,6 @@ dependencies = [
  "reqwest-middleware",
  "reqwest-retry",
  "reqwest-tracing",
- "routerify",
  "rsa",
  "rstest",
  "rustc-hash",
@@ -4481,7 +4402,6 @@ dependencies = [
  "smol_str",
  "socket2 0.5.5",
  "subtle",
- "task-local-extensions",
  "thiserror",
  "tikv-jemalloc-ctl",
  "tikv-jemallocator",
@@ -4491,7 +4411,6 @@ dependencies = [
  "tokio-rustls 0.25.0",
  "tokio-tungstenite",
  "tokio-util",
- "tower-service",
  "tracing",
  "tracing-opentelemetry",
  "tracing-subscriber",
@@ -4781,7 +4700,6 @@ dependencies = [
  "async-stream",
  "async-trait",
  "aws-config",
- "aws-credential-types",
  "aws-sdk-s3",
  "aws-smithy-async",
  "aws-smithy-types",
@@ -4795,7 +4713,6 @@ dependencies = [
  "futures",
  "futures-util",
  "http-types",
- "humantime",
  "humantime-serde",
  "hyper 0.14.26",
  "itertools 0.10.5",
@@ -5275,14 +5192,12 @@ version = "0.1.0"
 dependencies = [
  "anyhow",
  "async-stream",
- "async-trait",
  "byteorder",
  "bytes",
  "camino",
  "camino-tempfile",
  "chrono",
  "clap",
- "const_format",
  "crc32c",
  "desim",
  "fail",
@@ -5308,9 +5223,7 @@ dependencies = [
  "sd-notify",
  "serde",
  "serde_json",
- "serde_with",
  "sha2",
- "signal-hook",
  "storage_broker",
  "strum",
  "strum_macros",
@@ -5321,7 +5234,6 @@ dependencies = [
  "tokio-stream",
  "tokio-tar",
  "tokio-util",
- "toml_edit",
  "tracing",
  "tracing-subscriber",
  "url",
@@ -5336,7 +5248,6 @@ version = "0.1.0"
 dependencies = [
  "const_format",
  "serde",
- "serde_with",
  "utils",
 ]
 
@@ -5865,7 +5776,6 @@ version = "0.1.0"
 dependencies = [
  "anyhow",
  "async-stream",
- "bytes",
  "clap",
  "const_format",
  "futures",
@@ -5879,7 +5789,6 @@ dependencies = [
  "parking_lot 0.12.1",
  "prost",
  "tokio",
- "tokio-stream",
  "tonic",
  "tonic-build",
  "tracing",
@@ -5892,9 +5801,7 @@ name = "storage_controller"
 version = "0.1.0"
 dependencies = [
  "anyhow",
- "aws-config",
  "bytes",
- "camino",
  "chrono",
  "clap",
  "control_plane",
@@ -5935,20 +5842,9 @@ dependencies = [
 name = "storage_controller_client"
 version = "0.1.0"
 dependencies = [
- "anyhow",
- "bytes",
- "futures",
- "pageserver_api",
  "pageserver_client",
- "postgres",
  "reqwest 0.12.4",
  "serde",
- "thiserror",
- "tokio",
- "tokio-postgres",
- "tokio-stream",
- "tokio-util",
- "utils",
  "workspace_hack",
 ]
 
@@ -5960,13 +5856,9 @@ dependencies = [
  "async-stream",
  "aws-config",
  "aws-sdk-s3",
- "aws-smithy-async",
- "bincode",
- "bytes",
  "camino",
  "chrono",
  "clap",
- "crc32c",
  "either",
  "futures",
  "futures-util",
@@ -5978,20 +5870,16 @@ dependencies = [
  "pageserver",
  "pageserver_api",
  "postgres_ffi",
- "rand 0.8.5",
  "remote_storage",
  "reqwest 0.12.4",
  "rustls 0.22.4",
  "rustls-native-certs 0.7.0",
  "serde",
  "serde_json",
- "serde_with",
  "storage_controller_client",
- "thiserror",
  "tokio",
  "tokio-postgres",
  "tokio-postgres-rustls",
- "tokio-rustls 0.25.0",
  "tokio-stream",
  "tokio-util",
  "tracing",
@@ -6010,14 +5898,11 @@ dependencies = [
  "comfy-table",
  "futures",
  "humantime",
- "hyper 0.14.26",
  "pageserver_api",
  "pageserver_client",
  "reqwest 0.12.4",
- "serde",
  "serde_json",
  "storage_controller_client",
- "thiserror",
  "tokio",
  "tracing",
  "utils",
@@ -6140,15 +6025,6 @@ dependencies = [
  "xattr",
 ]
 
-[[package]]
-name = "task-local-extensions"
-version = "0.1.4"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ba323866e5d033818e3240feeb9f7db2c4296674e4d9e16b97b7bf8f490434e8"
-dependencies = [
- "pin-utils",
-]
-
 [[package]]
 name = "tempfile"
 version = "3.9.0"
@@ -6739,7 +6615,6 @@ dependencies = [
  "opentelemetry",
  "opentelemetry-otlp",
  "opentelemetry-semantic-conventions",
- "reqwest 0.12.4",
  "tokio",
  "tracing",
  "tracing-opentelemetry",
@@ -6943,7 +6818,6 @@ dependencies = [
  "serde_assert",
  "serde_json",
  "serde_path_to_error",
- "serde_with",
  "signal-hook",
  "strum",
  "strum_macros",
@@ -6999,13 +6873,11 @@ dependencies = [
  "cgroups-rs",
  "clap",
  "futures",
- "inotify 0.10.2",
  "serde",
  "serde_json",
  "sysinfo",
  "tokio",
  "tokio-postgres",
- "tokio-stream",
  "tokio-util",
  "tracing",
  "tracing-subscriber",
@@ -7032,7 +6904,6 @@ dependencies = [
  "clap",
  "env_logger",
  "log",
- "once_cell",
  "postgres",
  "postgres_ffi",
  "regex",
@@ -7555,6 +7426,7 @@ dependencies = [
  "digest",
  "either",
  "fail",
+ "futures",
  "futures-channel",
  "futures-executor",
  "futures-io",
@@ -7610,6 +7482,8 @@ dependencies = [
  "tower",
  "tracing",
  "tracing-core",
+ "tracing-log",
+ "tracing-subscriber",
  "url",
  "uuid",
  "zeroize",
diff --git a/compute_tools/Cargo.toml b/compute_tools/Cargo.toml
index 8af0ed43ce..00a82e4be6 100644
--- a/compute_tools/Cargo.toml
+++ b/compute_tools/Cargo.toml
@@ -11,7 +11,6 @@ testing = []
 
 [dependencies]
 anyhow.workspace = true
-async-compression.workspace = true
 chrono.workspace = true
 cfg-if.workspace = true
 clap.workspace = true
@@ -24,7 +23,6 @@ num_cpus.workspace = true
 opentelemetry.workspace = true
 postgres.workspace = true
 regex.workspace = true
-serde.workspace = true
 serde_json.workspace = true
 signal-hook.workspace = true
 tar.workspace = true
@@ -43,7 +41,6 @@ url.workspace = true
 compute_api.workspace = true
 utils.workspace = true
 workspace_hack.workspace = true
-toml_edit.workspace = true
 remote_storage = { version = "0.1", path = "../libs/remote_storage/" }
 vm_monitor = { version = "0.1", path = "../libs/vm_monitor/" }
 zstd = "0.13"
diff --git a/control_plane/Cargo.toml b/control_plane/Cargo.toml
index 6fca59b368..c185d20484 100644
--- a/control_plane/Cargo.toml
+++ b/control_plane/Cargo.toml
@@ -9,13 +9,10 @@ anyhow.workspace = true
 camino.workspace = true
 clap.workspace = true
 comfy-table.workspace = true
-futures.workspace = true
 git-version.workspace = true
 humantime.workspace = true
 nix.workspace = true
 once_cell.workspace = true
-postgres.workspace = true
-hex.workspace = true
 humantime-serde.workspace = true
 hyper.workspace = true
 regex.workspace = true
@@ -23,8 +20,6 @@ reqwest = { workspace = true, features = ["blocking", "json"] }
 scopeguard.workspace = true
 serde.workspace = true
 serde_json.workspace = true
-serde_with.workspace = true
-tar.workspace = true
 thiserror.workspace = true
 toml.workspace = true
 toml_edit.workspace = true
diff --git a/control_plane/storcon_cli/Cargo.toml b/control_plane/storcon_cli/Cargo.toml
index be69208d0d..ce89116691 100644
--- a/control_plane/storcon_cli/Cargo.toml
+++ b/control_plane/storcon_cli/Cargo.toml
@@ -11,14 +11,11 @@ clap.workspace = true
 comfy-table.workspace = true
 futures.workspace = true
 humantime.workspace = true
-hyper.workspace = true
 pageserver_api.workspace = true
 pageserver_client.workspace = true
 reqwest.workspace = true
-serde.workspace = true
 serde_json = { workspace = true, features = ["raw_value"] }
 storage_controller_client.workspace = true
-thiserror.workspace = true
 tokio.workspace = true
 tracing.workspace = true
 utils.workspace = true
diff --git a/libs/compute_api/Cargo.toml b/libs/compute_api/Cargo.toml
index 8aaa481f8c..c0ec40a6c2 100644
--- a/libs/compute_api/Cargo.toml
+++ b/libs/compute_api/Cargo.toml
@@ -8,7 +8,6 @@ license.workspace = true
 anyhow.workspace = true
 chrono.workspace = true
 serde.workspace = true
-serde_with.workspace = true
 serde_json.workspace = true
 regex.workspace = true
 
diff --git a/libs/consumption_metrics/Cargo.toml b/libs/consumption_metrics/Cargo.toml
index a40b74b952..0e517e3856 100644
--- a/libs/consumption_metrics/Cargo.toml
+++ b/libs/consumption_metrics/Cargo.toml
@@ -5,9 +5,6 @@ edition = "2021"
 license = "Apache-2.0"
 
 [dependencies]
-anyhow.workspace = true
 chrono = { workspace = true, features = ["serde"] }
 rand.workspace = true
 serde.workspace = true
-serde_with.workspace = true
-utils.workspace = true
diff --git a/libs/desim/Cargo.toml b/libs/desim/Cargo.toml
index 0c4be90267..473f3a2a13 100644
--- a/libs/desim/Cargo.toml
+++ b/libs/desim/Cargo.toml
@@ -12,5 +12,4 @@ bytes.workspace = true
 utils.workspace = true
 parking_lot.workspace = true
 hex.workspace = true
-scopeguard.workspace = true
 smallvec = { workspace = true, features = ["write"] }
diff --git a/libs/postgres_backend/Cargo.toml b/libs/postgres_backend/Cargo.toml
index f6854328fc..a0c87263ed 100644
--- a/libs/postgres_backend/Cargo.toml
+++ b/libs/postgres_backend/Cargo.toml
@@ -5,10 +5,8 @@ edition.workspace = true
 license.workspace = true
 
 [dependencies]
-async-trait.workspace = true
 anyhow.workspace = true
 bytes.workspace = true
-futures.workspace = true
 rustls.workspace = true
 serde.workspace = true
 thiserror.workspace = true
diff --git a/libs/postgres_ffi/Cargo.toml b/libs/postgres_ffi/Cargo.toml
index ee69878f69..ef17833a48 100644
--- a/libs/postgres_ffi/Cargo.toml
+++ b/libs/postgres_ffi/Cargo.toml
@@ -5,13 +5,10 @@ edition.workspace = true
 license.workspace = true
 
 [dependencies]
-rand.workspace = true
 regex.workspace = true
 bytes.workspace = true
-byteorder.workspace = true
 anyhow.workspace = true
 crc32c.workspace = true
-hex.workspace = true
 once_cell.workspace = true
 log.workspace = true
 memoffset.workspace = true
diff --git a/libs/postgres_ffi/wal_craft/Cargo.toml b/libs/postgres_ffi/wal_craft/Cargo.toml
index 29dd01a936..14c7d2e340 100644
--- a/libs/postgres_ffi/wal_craft/Cargo.toml
+++ b/libs/postgres_ffi/wal_craft/Cargo.toml
@@ -9,7 +9,6 @@ anyhow.workspace = true
 clap.workspace = true
 env_logger.workspace = true
 log.workspace = true
-once_cell.workspace = true
 postgres.workspace = true
 postgres_ffi.workspace = true
 camino-tempfile.workspace = true
diff --git a/libs/pq_proto/Cargo.toml b/libs/pq_proto/Cargo.toml
index 66bbe03ebc..9524a1490d 100644
--- a/libs/pq_proto/Cargo.toml
+++ b/libs/pq_proto/Cargo.toml
@@ -8,10 +8,8 @@ license.workspace = true
 bytes.workspace = true
 byteorder.workspace = true
 itertools.workspace = true
-pin-project-lite.workspace = true
 postgres-protocol.workspace = true
 rand.workspace = true
 tokio = { workspace = true, features = ["io-util"] }
-tracing.workspace = true
 thiserror.workspace = true
 serde.workspace = true
diff --git a/libs/remote_storage/Cargo.toml b/libs/remote_storage/Cargo.toml
index 02adee058f..f48f1801a4 100644
--- a/libs/remote_storage/Cargo.toml
+++ b/libs/remote_storage/Cargo.toml
@@ -13,14 +13,11 @@ aws-smithy-async.workspace = true
 aws-smithy-types.workspace = true
 aws-config.workspace = true
 aws-sdk-s3.workspace = true
-aws-credential-types.workspace = true
 bytes.workspace = true
 camino = { workspace = true, features = ["serde1"] }
-humantime.workspace = true
 humantime-serde.workspace = true
 hyper = { workspace = true, features = ["stream"] }
 futures.workspace = true
-rand.workspace = true
 serde.workspace = true
 serde_json.workspace = true
 tokio = { workspace = true, features = ["sync", "fs", "io-util"] }
diff --git a/libs/safekeeper_api/Cargo.toml b/libs/safekeeper_api/Cargo.toml
index e1f4bcca46..14811232d3 100644
--- a/libs/safekeeper_api/Cargo.toml
+++ b/libs/safekeeper_api/Cargo.toml
@@ -6,6 +6,5 @@ license.workspace = true
 
 [dependencies]
 serde.workspace = true
-serde_with.workspace = true
 const_format.workspace = true
 utils.workspace = true
diff --git a/libs/tracing-utils/Cargo.toml b/libs/tracing-utils/Cargo.toml
index 5ea8db6b42..05eb538d42 100644
--- a/libs/tracing-utils/Cargo.toml
+++ b/libs/tracing-utils/Cargo.toml
@@ -9,8 +9,9 @@ hyper.workspace = true
 opentelemetry = { workspace = true, features=["rt-tokio"] }
 opentelemetry-otlp = { workspace = true, default-features=false, features = ["http-proto", "trace", "http", "reqwest-client"] }
 opentelemetry-semantic-conventions.workspace = true
-reqwest = { workspace = true, default-features = false, features = ["rustls-tls"] }
 tokio = { workspace = true, features = ["rt", "rt-multi-thread"] }
 tracing.workspace = true
 tracing-opentelemetry.workspace = true
-tracing-subscriber.workspace = true
+
+[dev-dependencies]
+tracing-subscriber.workspace = true    # For examples in docs
diff --git a/libs/utils/Cargo.toml b/libs/utils/Cargo.toml
index 19deaab63f..f199b15554 100644
--- a/libs/utils/Cargo.toml
+++ b/libs/utils/Cargo.toml
@@ -42,7 +42,6 @@ tracing.workspace = true
 tracing-error.workspace = true
 tracing-subscriber = { workspace = true, features = ["json", "registry"] }
 rand.workspace = true
-serde_with.workspace = true
 strum.workspace = true
 strum_macros.workspace = true
 url.workspace = true
diff --git a/libs/vm_monitor/Cargo.toml b/libs/vm_monitor/Cargo.toml
index 46e9f880a1..ba73902d38 100644
--- a/libs/vm_monitor/Cargo.toml
+++ b/libs/vm_monitor/Cargo.toml
@@ -15,13 +15,11 @@ anyhow.workspace = true
 axum.workspace = true
 clap.workspace = true
 futures.workspace = true
-inotify.workspace = true
 serde.workspace = true
 serde_json.workspace = true
 sysinfo.workspace = true
 tokio = { workspace = true, features = ["rt-multi-thread"] }
 tokio-postgres.workspace = true
-tokio-stream.workspace = true
 tokio-util.workspace = true
 tracing.workspace = true
 tracing-subscriber.workspace = true
diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml
index 24373afca3..0eb48d6823 100644
--- a/pageserver/Cargo.toml
+++ b/pageserver/Cargo.toml
@@ -15,7 +15,6 @@ anyhow.workspace = true
 arc-swap.workspace = true
 async-compression.workspace = true
 async-stream.workspace = true
-async-trait.workspace = true
 bit_field.workspace = true
 byteorder.workspace = true
 bytes.workspace = true
@@ -23,12 +22,9 @@ camino.workspace = true
 camino-tempfile.workspace = true
 chrono = { workspace = true, features = ["serde"] }
 clap = { workspace = true, features = ["string"] }
-const_format.workspace = true
 consumption_metrics.workspace = true
 crc32c.workspace = true
-crossbeam-utils.workspace = true
 either.workspace = true
-flate2.workspace = true
 fail.workspace = true
 futures.workspace = true
 git-version.workspace = true
@@ -57,10 +53,6 @@ serde.workspace = true
 serde_json = { workspace = true, features = ["raw_value"] }
 serde_path_to_error.workspace = true
 serde_with.workspace = true
-signal-hook.workspace = true
-smallvec = { workspace = true, features = ["write"] }
-svg_fmt.workspace = true
-sync_wrapper.workspace = true
 sysinfo.workspace = true
 tokio-tar.workspace = true
 thiserror.workspace = true
@@ -73,7 +65,6 @@ tokio-stream.workspace = true
 tokio-util.workspace = true
 toml_edit = { workspace = true, features = [ "serde" ] }
 tracing.workspace = true
-twox-hash.workspace = true
 url.workspace = true
 walkdir.workspace = true
 metrics.workspace = true
diff --git a/pageserver/compaction/Cargo.toml b/pageserver/compaction/Cargo.toml
index 0fd1d81845..52b58fc298 100644
--- a/pageserver/compaction/Cargo.toml
+++ b/pageserver/compaction/Cargo.toml
@@ -9,41 +9,19 @@ default = []
 
 [dependencies]
 anyhow.workspace = true
-async-compression.workspace = true
 async-stream.workspace = true
-byteorder.workspace = true
-bytes.workspace = true
-chrono = { workspace = true, features = ["serde"] }
 clap = { workspace = true, features = ["string"] }
-const_format.workspace = true
-consumption_metrics.workspace = true
-crossbeam-utils.workspace = true
-either.workspace = true
-flate2.workspace = true
-fail.workspace = true
 futures.workspace = true
 git-version.workspace = true
-hex.workspace = true
-humantime.workspace = true
-humantime-serde.workspace = true
 itertools.workspace = true
 once_cell.workspace = true
 pageserver_api.workspace = true
 pin-project-lite.workspace = true
 rand.workspace = true
-smallvec = { workspace = true, features = ["write"] }
 svg_fmt.workspace = true
-sync_wrapper.workspace = true
-thiserror.workspace = true
 tokio = { workspace = true, features = ["process", "sync", "fs", "rt", "io-util", "time"] }
-tokio-io-timeout.workspace = true
-tokio-util.workspace = true
 tracing.workspace = true
-tracing-error.workspace = true
 tracing-subscriber.workspace = true
-url.workspace = true
-walkdir.workspace = true
-metrics.workspace = true
 utils.workspace = true
 workspace_hack.workspace = true
 
diff --git a/pageserver/ctl/Cargo.toml b/pageserver/ctl/Cargo.toml
index be5626040b..9592002de1 100644
--- a/pageserver/ctl/Cargo.toml
+++ b/pageserver/ctl/Cargo.toml
@@ -8,7 +8,6 @@ license.workspace = true
 
 [dependencies]
 anyhow.workspace = true
-bytes.workspace = true
 camino.workspace = true
 clap = { workspace = true, features = ["string"] }
 git-version.workspace = true
@@ -24,5 +23,4 @@ toml_edit.workspace = true
 utils.workspace = true
 svg_fmt.workspace = true
 workspace_hack.workspace = true
-serde.workspace = true
 serde_json.workspace = true
diff --git a/proxy/Cargo.toml b/proxy/Cargo.toml
index 21d92abb20..6703eb06eb 100644
--- a/proxy/Cargo.toml
+++ b/proxy/Cargo.toml
@@ -18,7 +18,6 @@ atomic-take.workspace = true
 aws-config.workspace = true
 aws-sdk-iam.workspace = true
 aws-sigv4.workspace = true
-aws-types.workspace = true
 base64.workspace = true
 bstr.workspace = true
 bytes = { workspace = true, features = ["serde"] }
@@ -26,7 +25,6 @@ camino.workspace = true
 chrono.workspace = true
 clap.workspace = true
 consumption_metrics.workspace = true
-crossbeam-deque.workspace = true
 dashmap.workspace = true
 env_logger.workspace = true
 framed-websockets.workspace = true
@@ -48,11 +46,9 @@ indexmap.workspace = true
 ipnet.workspace = true
 itertools.workspace = true
 lasso = { workspace = true, features = ["multi-threaded"] }
-md5.workspace = true
 measured = { workspace = true, features = ["lasso"] }
 metrics.workspace = true
 once_cell.workspace = true
-opentelemetry.workspace = true
 parking_lot.workspace = true
 parquet.workspace = true
 parquet_derive.workspace = true
@@ -67,7 +63,6 @@ reqwest.workspace = true
 reqwest-middleware = { workspace = true, features = ["json"] }
 reqwest-retry.workspace = true
 reqwest-tracing.workspace = true
-routerify.workspace = true
 rustc-hash.workspace = true
 rustls-pemfile.workspace = true
 rustls.workspace = true
@@ -79,7 +74,6 @@ smol_str.workspace = true
 smallvec.workspace = true
 socket2.workspace = true
 subtle.workspace = true
-task-local-extensions.workspace = true
 thiserror.workspace = true
 tikv-jemallocator.workspace = true
 tikv-jemalloc-ctl = { workspace = true, features = ["use_std"] }
@@ -88,7 +82,6 @@ tokio-postgres-rustls.workspace = true
 tokio-rustls.workspace = true
 tokio-util.workspace = true
 tokio = { workspace = true, features = ["signal"] }
-tower-service.workspace = true
 tracing-opentelemetry.workspace = true
 tracing-subscriber.workspace = true
 tracing-utils.workspace = true
diff --git a/safekeeper/Cargo.toml b/safekeeper/Cargo.toml
index 0fdb3147bf..daf21c70b0 100644
--- a/safekeeper/Cargo.toml
+++ b/safekeeper/Cargo.toml
@@ -13,14 +13,12 @@ testing = ["fail/failpoints"]
 [dependencies]
 async-stream.workspace = true
 anyhow.workspace = true
-async-trait.workspace = true
 byteorder.workspace = true
 bytes.workspace = true
 camino.workspace = true
 camino-tempfile.workspace = true
 chrono.workspace = true
 clap = { workspace = true, features = ["derive"] }
-const_format.workspace = true
 crc32c.workspace = true
 fail.workspace = true
 git-version.workspace = true
@@ -38,8 +36,6 @@ scopeguard.workspace = true
 reqwest = { workspace = true, features = ["json"] }
 serde.workspace = true
 serde_json.workspace = true
-serde_with.workspace = true
-signal-hook.workspace = true
 strum.workspace = true
 strum_macros.workspace = true
 thiserror.workspace = true
@@ -48,7 +44,6 @@ tokio-util = { workspace = true }
 tokio-io-timeout.workspace = true
 tokio-postgres.workspace = true
 tokio-tar.workspace = true
-toml_edit.workspace = true
 tracing.workspace = true
 url.workspace = true
 metrics.workspace = true
diff --git a/storage_broker/Cargo.toml b/storage_broker/Cargo.toml
index ac4b00669e..82ec0aa272 100644
--- a/storage_broker/Cargo.toml
+++ b/storage_broker/Cargo.toml
@@ -10,7 +10,6 @@ bench = []
 [dependencies]
 anyhow.workspace = true
 async-stream.workspace = true
-bytes.workspace = true
 clap = { workspace = true, features = ["derive"] }
 const_format.workspace = true
 futures.workspace = true
@@ -24,7 +23,6 @@ parking_lot.workspace = true
 prost.workspace = true
 tonic.workspace = true
 tokio = { workspace = true, features = ["rt-multi-thread"] }
-tokio-stream.workspace = true
 tracing.workspace = true
 metrics.workspace = true
 utils.workspace = true
diff --git a/storage_controller/Cargo.toml b/storage_controller/Cargo.toml
index ecaac04915..a96d64e096 100644
--- a/storage_controller/Cargo.toml
+++ b/storage_controller/Cargo.toml
@@ -15,9 +15,7 @@ testing = []
 
 [dependencies]
 anyhow.workspace = true
-aws-config.workspace = true
 bytes.workspace = true
-camino.workspace = true
 chrono.workspace = true
 clap.workspace = true
 fail.workspace = true
diff --git a/storage_controller/client/Cargo.toml b/storage_controller/client/Cargo.toml
index e7a4264fd0..9fa89176af 100644
--- a/storage_controller/client/Cargo.toml
+++ b/storage_controller/client/Cargo.toml
@@ -5,18 +5,7 @@ edition.workspace = true
 license.workspace = true
 
 [dependencies]
-pageserver_api.workspace = true
 pageserver_client.workspace = true
-thiserror.workspace = true
 reqwest.workspace = true
-utils.workspace = true
 serde.workspace = true
 workspace_hack = { version = "0.1", path = "../../workspace_hack" }
-tokio-postgres.workspace = true
-tokio-stream.workspace = true
-tokio.workspace = true
-futures.workspace = true
-tokio-util.workspace = true
-anyhow.workspace = true
-postgres.workspace = true
-bytes.workspace = true
diff --git a/storage_scrubber/Cargo.toml b/storage_scrubber/Cargo.toml
index d19119990b..f9987662b9 100644
--- a/storage_scrubber/Cargo.toml
+++ b/storage_scrubber/Cargo.toml
@@ -6,21 +6,13 @@ license.workspace = true
 
 [dependencies]
 aws-sdk-s3.workspace = true
-aws-smithy-async.workspace = true
 either.workspace = true
-tokio-rustls.workspace = true
 anyhow.workspace = true
 git-version.workspace = true
 hex.workspace = true
 humantime.workspace = true
-thiserror.workspace = true
-rand.workspace = true
-bytes.workspace = true
-bincode.workspace = true
-crc32c.workspace = true
 serde.workspace = true
 serde_json.workspace = true
-serde_with.workspace = true
 workspace_hack.workspace = true
 utils.workspace = true
 async-stream.workspace = true
diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml
index 140c43639e..662916d42c 100644
--- a/workspace_hack/Cargo.toml
+++ b/workspace_hack/Cargo.toml
@@ -38,6 +38,7 @@ deranged = { version = "0.3", default-features = false, features = ["powerfmt",
 digest = { version = "0.10", features = ["mac", "oid", "std"] }
 either = { version = "1" }
 fail = { version = "0.5", default-features = false, features = ["failpoints"] }
+futures = { version = "0.3" }
 futures-channel = { version = "0.3", features = ["sink"] }
 futures-executor = { version = "0.3" }
 futures-io = { version = "0.3" }
@@ -88,6 +89,8 @@ tonic = { version = "0.9", features = ["tls-roots"] }
 tower = { version = "0.4", default-features = false, features = ["balance", "buffer", "limit", "log", "timeout", "util"] }
 tracing = { version = "0.1", features = ["log"] }
 tracing-core = { version = "0.1" }
+tracing-log = { version = "0.1", default-features = false, features = ["log-tracer", "std"] }
+tracing-subscriber = { version = "0.3", default-features = false, features = ["env-filter", "fmt", "json", "smallvec", "tracing-log"] }
 url = { version = "2", features = ["serde"] }
 uuid = { version = "1", features = ["serde", "v4", "v7"] }
 zeroize = { version = "1", features = ["derive", "serde"] }

From 6138eb50e9fc5fb6bc7af57cb95cbad8aaedc186 Mon Sep 17 00:00:00 2001
From: Tristan Partin <tristan@neon.tech>
Date: Mon, 16 Sep 2024 17:53:57 -0500
Subject: [PATCH 13/49] Fix test code related to migrations

We added another migration in 5876c441abc973acca60882192ad46333c075abd,
but didn't bump this value. This had no effect, but best to fix it
anyway.

Signed-off-by: Tristan Partin <tristan@neon.tech>
---
 test_runner/fixtures/neon_fixtures.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index 93b93ff019..69284bfdfc 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -4094,7 +4094,7 @@ class Endpoint(PgProtocol, LogUtils):
             json.dump(dict(data_dict, **kwargs), file, indent=4)
 
     # Please note: Migrations only run if pg_skip_catalog_updates is false
-    def wait_for_migrations(self, num_migrations: int = 10):
+    def wait_for_migrations(self, num_migrations: int = 11):
         with self.cursor() as cur:
 
             def check_migrations_done():

From a1b71b73fe8f1193ef841604af962053c0bf1061 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Arpad=20M=C3=BCller?= <arpad-m@users.noreply.github.com>
Date: Tue, 17 Sep 2024 19:15:01 +0200
Subject: [PATCH 14/49] Rename some S3 usages to "remote storage" in exposed
 messages (#8999)

In exposed messages like log messages we mentioned "S3", which is not
entirely accurate as we support Azure blob storage now as well.
---
 pageserver/src/consumption_metrics.rs                    | 2 +-
 pageserver/src/metrics.rs                                | 2 +-
 test_runner/fixtures/pageserver/many_tenants.py          | 2 +-
 test_runner/regress/test_pageserver_metric_collection.py | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/pageserver/src/consumption_metrics.rs b/pageserver/src/consumption_metrics.rs
index 64a267e0e4..0c7630edca 100644
--- a/pageserver/src/consumption_metrics.rs
+++ b/pageserver/src/consumption_metrics.rs
@@ -178,7 +178,7 @@ async fn collect_metrics(
                 )
                 .await;
                 if let Err(e) = res {
-                    tracing::error!("failed to upload to S3: {e:#}");
+                    tracing::error!("failed to upload to remote storage: {e:#}");
                 }
             }
         };
diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs
index 9197505876..72229d80be 100644
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -1777,7 +1777,7 @@ pub(crate) static SECONDARY_MODE: Lazy<SecondaryModeMetrics> = Lazy::new(|| {
     .expect("failed to define a metric"),
     upload_heatmap_duration: register_histogram!(
         "pageserver_secondary_upload_heatmap_duration",
-        "Time to build and upload a heatmap, including any waiting inside the S3 client"
+        "Time to build and upload a heatmap, including any waiting inside the remote storage client"
     )
     .expect("failed to define a metric"),
     download_heatmap: register_int_counter!(
diff --git a/test_runner/fixtures/pageserver/many_tenants.py b/test_runner/fixtures/pageserver/many_tenants.py
index 3e0ffabf74..97e63ed4ba 100644
--- a/test_runner/fixtures/pageserver/many_tenants.py
+++ b/test_runner/fixtures/pageserver/many_tenants.py
@@ -39,7 +39,7 @@ def single_timeline(
     log.info("detach template tenant form pageserver")
     env.pageserver.tenant_detach(template_tenant)
 
-    log.info(f"duplicating template tenant {ncopies} times in S3")
+    log.info(f"duplicating template tenant {ncopies} times in remote storage")
     tenants = fixtures.pageserver.remote_storage.duplicate_tenant(env, template_tenant, ncopies)
 
     # In theory we could just attach all the tenants, force on-demand downloads via mgmt API, and be done.
diff --git a/test_runner/regress/test_pageserver_metric_collection.py b/test_runner/regress/test_pageserver_metric_collection.py
index 24a37b04ec..37ab51f9fb 100644
--- a/test_runner/regress/test_pageserver_metric_collection.py
+++ b/test_runner/regress/test_pageserver_metric_collection.py
@@ -74,7 +74,7 @@ def test_metric_collection(
     env.pageserver.allowed_errors.extend(
         [
             ".*metrics endpoint refused the sent metrics*",
-            ".*metrics_collection: failed to upload to S3: Failed to upload data of length .* to storage path.*",
+            ".*metrics_collection: failed to upload to remote storage: Failed to upload data of length .* to storage path.*",
         ]
     )
 

From d78f5ce6da2866cb982203d1bab33c49a990cd5f Mon Sep 17 00:00:00 2001
From: Alexander Bayandin <alexander@neon.tech>
Date: Tue, 17 Sep 2024 18:40:05 +0100
Subject: [PATCH 15/49] CI: don't fetch the whole git history if it's not
 required (#9021)

## Problem
We do use `actions/checkout` with `fetch-depth: 0` when it's not
required

## Summary of changes
- Remove unneeded `fetch-depth: 0`
- Add a comment if `fetch-depth: 0` is required
---
 .github/workflows/build_and_test.yml    | 31 +++++++------------------
 .github/workflows/trigger-e2e-tests.yml |  4 ++--
 2 files changed, 11 insertions(+), 24 deletions(-)

diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml
index 7ddd624dd5..d46b8dc1f5 100644
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -54,8 +54,8 @@ jobs:
       build-tag: ${{steps.build-tag.outputs.tag}}
 
     steps:
-      - name: Checkout
-        uses: actions/checkout@v4
+      # Need `fetch-depth: 0` to count the number of commits in the branch
+      - uses: actions/checkout@v4
         with:
           fetch-depth: 0
 
@@ -374,8 +374,8 @@ jobs:
         coverage-html: ${{ steps.upload-coverage-report-new.outputs.report-url }}
         coverage-json: ${{ steps.upload-coverage-report-new.outputs.summary-json }}
     steps:
-      - name: Checkout
-        uses: actions/checkout@v4
+      # Need `fetch-depth: 0` for differential coverage (to get diff between two commits)
+      - uses: actions/checkout@v4
         with:
           submodules: true
           fetch-depth: 0
@@ -476,11 +476,9 @@ jobs:
     runs-on: ${{ fromJson(format('["self-hosted", "{0}"]', matrix.arch == 'arm64' && 'large-arm64' || 'large')) }}
 
     steps:
-      - name: Checkout
-        uses: actions/checkout@v4
+      - uses: actions/checkout@v4
         with:
           submodules: true
-          fetch-depth: 0
 
       - uses: ./.github/actions/set-docker-config-dir
       - uses: docker/setup-buildx-action@v3
@@ -555,11 +553,9 @@ jobs:
     runs-on: ${{ fromJson(format('["self-hosted", "{0}"]', matrix.arch == 'arm64' && 'large-arm64' || 'large')) }}
 
     steps:
-      - name: Checkout
-        uses: actions/checkout@v4
+      - uses: actions/checkout@v4
         with:
           submodules: true
-          fetch-depth: 0
 
       - uses: ./.github/actions/set-docker-config-dir
       - uses: docker/setup-buildx-action@v3
@@ -706,10 +702,7 @@ jobs:
       VM_BUILDER_VERSION: v0.29.3
 
     steps:
-      - name: Checkout
-        uses: actions/checkout@v4
-        with:
-          fetch-depth: 0
+      - uses: actions/checkout@v4
 
       - name: Downloading vm-builder
         run: |
@@ -749,10 +742,7 @@ jobs:
     runs-on: ${{ fromJson(format('["self-hosted", "{0}"]', matrix.arch == 'arm64' && 'small-arm64' || 'small')) }}
 
     steps:
-      - name: Checkout
-        uses: actions/checkout@v4
-        with:
-          fetch-depth: 0
+      - uses: actions/checkout@v4
 
       - uses: ./.github/actions/set-docker-config-dir
       - uses: docker/login-action@v3
@@ -977,10 +967,7 @@ jobs:
             git config --global --add safe.directory "${GITHUB_WORKSPACE}/vendor/postgres-v$r"
           done
 
-      - name: Checkout
-        uses: actions/checkout@v4
-        with:
-          fetch-depth: 0
+      - uses: actions/checkout@v4
 
       - name: Trigger deploy workflow
         env:
diff --git a/.github/workflows/trigger-e2e-tests.yml b/.github/workflows/trigger-e2e-tests.yml
index 6fbe785c56..b299cf9b99 100644
--- a/.github/workflows/trigger-e2e-tests.yml
+++ b/.github/workflows/trigger-e2e-tests.yml
@@ -34,8 +34,8 @@ jobs:
       build-tag: ${{ steps.build-tag.outputs.tag }}
 
     steps:
-      - name: Checkout
-        uses: actions/checkout@v4
+      # Need `fetch-depth: 0` to count the number of commits in the branch
+      - uses: actions/checkout@v4
         with:
           fetch-depth: 0
 

From 3cd2a3f9317f54e75c00a813680060dcaa612a36 Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Tue, 17 Sep 2024 20:16:33 +0100
Subject: [PATCH 16/49] refactor(walredo): process launch & kill-on-error
 machinery (#8951)

Immediate benefit: easier to spot what's going on.

Later benefit: use the extracted method in PR

- https://github.com/neondatabase/neon/pull/8952

which adds a `ping` command to walredo.

Found this useful during investigation
https://github.com/neondatabase/cloud/issues/16886.
---
 pageserver/src/walredo.rs | 255 +++++++++++++++++++++-----------------
 1 file changed, 140 insertions(+), 115 deletions(-)

diff --git a/pageserver/src/walredo.rs b/pageserver/src/walredo.rs
index a36955fa21..0fe7def8b0 100644
--- a/pageserver/src/walredo.rs
+++ b/pageserver/src/walredo.rs
@@ -35,6 +35,7 @@ use anyhow::Context;
 use bytes::{Bytes, BytesMut};
 use pageserver_api::models::{WalRedoManagerProcessStatus, WalRedoManagerStatus};
 use pageserver_api::shard::TenantShardId;
+use std::future::Future;
 use std::sync::Arc;
 use std::time::Duration;
 use std::time::Instant;
@@ -296,6 +297,97 @@ impl PostgresRedoManager {
         }
     }
 
+    async fn do_with_walredo_process<
+        F: FnOnce(Arc<Process>) -> Fut,
+        Fut: Future<Output = Result<O, Error>>,
+        O,
+    >(
+        &self,
+        pg_version: u32,
+        closure: F,
+    ) -> Result<O, Error> {
+        let proc: Arc<Process> = match self.redo_process.get_or_init_detached().await {
+            Ok(guard) => match &*guard {
+                ProcessOnceCell::Spawned(proc) => Arc::clone(proc),
+                ProcessOnceCell::ManagerShutDown => {
+                    return Err(Error::Cancelled);
+                }
+            },
+            Err(permit) => {
+                let start = Instant::now();
+                // acquire guard before spawning process, so that we don't spawn new processes
+                // if the gate is already closed.
+                let _launched_processes_guard = match self.launched_processes.enter() {
+                    Ok(guard) => guard,
+                    Err(GateError::GateClosed) => unreachable!(
+                        "shutdown sets the once cell to `ManagerShutDown` state before closing the gate"
+                    ),
+                };
+                let proc = Arc::new(Process {
+                    process: process::WalRedoProcess::launch(
+                        self.conf,
+                        self.tenant_shard_id,
+                        pg_version,
+                    )
+                    .context("launch walredo process")?,
+                    _launched_processes_guard,
+                });
+                let duration = start.elapsed();
+                WAL_REDO_PROCESS_LAUNCH_DURATION_HISTOGRAM.observe(duration.as_secs_f64());
+                info!(
+                    elapsed_ms = duration.as_millis(),
+                    pid = proc.id(),
+                    "launched walredo process"
+                );
+                self.redo_process
+                    .set(ProcessOnceCell::Spawned(Arc::clone(&proc)), permit);
+                proc
+            }
+        };
+
+        // async closures are unstable, would support &Process
+        let result = closure(proc.clone()).await;
+
+        if result.is_err() {
+            // Avoid concurrent callers hitting the same issue by taking `proc` out of the rotation.
+            // Note that there may be other tasks concurrent with us that also hold `proc`.
+            // We have to deal with that here.
+            // Also read the doc comment on field `self.redo_process`.
+            //
+            // NB: there may still be other concurrent threads using `proc`.
+            // The last one will send SIGKILL when the underlying Arc reaches refcount 0.
+            //
+            // NB: the drop impl blocks the dropping thread with a wait() system call for
+            // the child process. In some ways the blocking is actually good: if we
+            // deferred the waiting into the background / to tokio if we used `tokio::process`,
+            // it could happen that if walredo always fails immediately, we spawn processes faster
+            // than we can SIGKILL & `wait` for them to exit. By doing it the way we do here,
+            // we limit this risk of run-away to at most $num_runtimes * $num_executor_threads.
+            // This probably needs revisiting at some later point.
+            match self.redo_process.get() {
+                None => (),
+                Some(guard) => {
+                    match &*guard {
+                        ProcessOnceCell::ManagerShutDown => {}
+                        ProcessOnceCell::Spawned(guard_proc) => {
+                            if Arc::ptr_eq(&proc, guard_proc) {
+                                // We're the first to observe an error from `proc`, it's our job to take it out of rotation.
+                                guard.take_and_deinit();
+                            } else {
+                                // Another task already spawned another redo process (further up in this method)
+                                // and put it into `redo_process`. Do nothing, our view of the world is behind.
+                            }
+                        }
+                    }
+                }
+            }
+            // The last task that does this `drop()` of `proc` will do a blocking `wait()` syscall.
+            drop(proc);
+        }
+
+        result
+    }
+
     ///
     /// Process one request for WAL redo using wal-redo postgres
     ///
@@ -319,130 +411,63 @@ impl PostgresRedoManager {
         const MAX_RETRY_ATTEMPTS: u32 = 1;
         let mut n_attempts = 0u32;
         loop {
-            let proc: Arc<Process> = match self.redo_process.get_or_init_detached().await {
-                Ok(guard) => match &*guard {
-                    ProcessOnceCell::Spawned(proc) => Arc::clone(proc),
-                    ProcessOnceCell::ManagerShutDown => {
-                        return Err(Error::Cancelled);
-                    }
-                },
-                Err(permit) => {
-                    let start = Instant::now();
-                    // acquire guard before spawning process, so that we don't spawn new processes
-                    // if the gate is already closed.
-                    let _launched_processes_guard = match self.launched_processes.enter() {
-                                Ok(guard) => guard,
-                                Err(GateError::GateClosed) => unreachable!(
-                                    "shutdown sets the once cell to `ManagerShutDown` state before closing the gate"
-                                ),
-                            };
-                    let proc = Arc::new(Process {
-                        process: process::WalRedoProcess::launch(
-                            self.conf,
-                            self.tenant_shard_id,
-                            pg_version,
-                        )
-                        .context("launch walredo process")?,
-                        _launched_processes_guard,
-                    });
-                    let duration = start.elapsed();
-                    WAL_REDO_PROCESS_LAUNCH_DURATION_HISTOGRAM.observe(duration.as_secs_f64());
-                    info!(
-                        duration_ms = duration.as_millis(),
-                        pid = proc.id(),
-                        "launched walredo process"
-                    );
-                    self.redo_process
-                        .set(ProcessOnceCell::Spawned(Arc::clone(&proc)), permit);
-                    proc
-                }
-            };
+            let base_img = &base_img;
+            let closure = |proc: Arc<Process>| async move {
+                let started_at = std::time::Instant::now();
 
-            let started_at = std::time::Instant::now();
+                // Relational WAL records are applied using wal-redo-postgres
+                let result = proc
+                    .apply_wal_records(rel, blknum, base_img, records, wal_redo_timeout)
+                    .await
+                    .context("apply_wal_records");
 
-            // Relational WAL records are applied using wal-redo-postgres
-            let result = proc
-                .apply_wal_records(rel, blknum, &base_img, records, wal_redo_timeout)
-                .await
-                .context("apply_wal_records");
+                let duration = started_at.elapsed();
 
-            let duration = started_at.elapsed();
-
-            let len = records.len();
-            let nbytes = records.iter().fold(0, |acumulator, record| {
-                acumulator
-                    + match &record.1 {
-                        NeonWalRecord::Postgres { rec, .. } => rec.len(),
-                        _ => unreachable!("Only PostgreSQL records are accepted in this batch"),
-                    }
-            });
-
-            WAL_REDO_TIME.observe(duration.as_secs_f64());
-            WAL_REDO_RECORDS_HISTOGRAM.observe(len as f64);
-            WAL_REDO_BYTES_HISTOGRAM.observe(nbytes as f64);
-
-            debug!(
-                "postgres applied {} WAL records ({} bytes) in {} us to reconstruct page image at LSN {}",
-                len,
-                nbytes,
-                duration.as_micros(),
-                lsn
-            );
-
-            // If something went wrong, don't try to reuse the process. Kill it, and
-            // next request will launch a new one.
-            if let Err(e) = result.as_ref() {
-                error!(
-                    "error applying {} WAL records {}..{} ({} bytes) to key {key}, from base image with LSN {} to reconstruct page image at LSN {} n_attempts={}: {:?}",
-                    records.len(),
-                    records.first().map(|p| p.0).unwrap_or(Lsn(0)),
-                    records.last().map(|p| p.0).unwrap_or(Lsn(0)),
-                    nbytes,
-                    base_img_lsn,
-                    lsn,
-                    n_attempts,
-                    e,
-                );
-                // Avoid concurrent callers hitting the same issue by taking `proc` out of the rotation.
-                // Note that there may be other tasks concurrent with us that also hold `proc`.
-                // We have to deal with that here.
-                // Also read the doc comment on field `self.redo_process`.
-                //
-                // NB: there may still be other concurrent threads using `proc`.
-                // The last one will send SIGKILL when the underlying Arc reaches refcount 0.
-                //
-                // NB: the drop impl blocks the dropping thread with a wait() system call for
-                // the child process. In some ways the blocking is actually good: if we
-                // deferred the waiting into the background / to tokio if we used `tokio::process`,
-                // it could happen that if walredo always fails immediately, we spawn processes faster
-                // than we can SIGKILL & `wait` for them to exit. By doing it the way we do here,
-                // we limit this risk of run-away to at most $num_runtimes * $num_executor_threads.
-                // This probably needs revisiting at some later point.
-                match self.redo_process.get() {
-                    None => (),
-                    Some(guard) => {
-                        match &*guard {
-                            ProcessOnceCell::ManagerShutDown => {}
-                            ProcessOnceCell::Spawned(guard_proc) => {
-                                if Arc::ptr_eq(&proc, guard_proc) {
-                                    // We're the first to observe an error from `proc`, it's our job to take it out of rotation.
-                                    guard.take_and_deinit();
-                                } else {
-                                    // Another task already spawned another redo process (further up in this method)
-                                    // and put it into `redo_process`. Do nothing, our view of the world is behind.
-                                }
-                            }
+                let len = records.len();
+                let nbytes = records.iter().fold(0, |acumulator, record| {
+                    acumulator
+                        + match &record.1 {
+                            NeonWalRecord::Postgres { rec, .. } => rec.len(),
+                            _ => unreachable!("Only PostgreSQL records are accepted in this batch"),
                         }
-                    }
+                });
+
+                WAL_REDO_TIME.observe(duration.as_secs_f64());
+                WAL_REDO_RECORDS_HISTOGRAM.observe(len as f64);
+                WAL_REDO_BYTES_HISTOGRAM.observe(nbytes as f64);
+
+                debug!(
+                    "postgres applied {} WAL records ({} bytes) in {} us to reconstruct page image at LSN {}",
+                    len,
+                    nbytes,
+                    duration.as_micros(),
+                    lsn
+                );
+
+                if let Err(e) = result.as_ref() {
+                    error!(
+                        "error applying {} WAL records {}..{} ({} bytes) to key {key}, from base image with LSN {} to reconstruct page image at LSN {} n_attempts={}: {:?}",
+                        records.len(),
+                        records.first().map(|p| p.0).unwrap_or(Lsn(0)),
+                        records.last().map(|p| p.0).unwrap_or(Lsn(0)),
+                        nbytes,
+                        base_img_lsn,
+                        lsn,
+                        n_attempts,
+                        e,
+                    );
                 }
-                // The last task that does this `drop()` of `proc` will do a blocking `wait()` syscall.
-                drop(proc);
-            } else if n_attempts != 0 {
+
+                result.map_err(Error::Other)
+            };
+            let result = self.do_with_walredo_process(pg_version, closure).await;
+
+            if result.is_ok() && n_attempts != 0 {
                 info!(n_attempts, "retried walredo succeeded");
             }
             n_attempts += 1;
             if n_attempts > MAX_RETRY_ATTEMPTS || result.is_ok() {
-                return result.map_err(Error::Other);
+                return result;
             }
         }
     }

From 135e7e4306978d79a133086abb81707d8d9fb8a9 Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Wed, 18 Sep 2024 09:10:27 +0200
Subject: [PATCH 17/49] add `neon_local` subcommand for the broker & use that
 from regression tests (#8948)

There's currently no way to just start/stop broker from `neon_local`.

This PR
* adds a sub-command
* uses that sub-command from the test suite instead of the pre-existing
Python `subprocess` based approach.

Found this useful during investigation
https://github.com/neondatabase/cloud/issues/16886.
---
 control_plane/src/bin/neon_local.rs           | 40 ++++++++++
 test_runner/fixtures/broker.py                | 63 ---------------
 test_runner/fixtures/neon_fixtures.py         | 76 +++++++++++++------
 test_runner/regress/test_neon_cli.py          |  2 +
 .../regress/test_pageserver_generations.py    |  2 +-
 .../regress/test_storage_controller.py        |  6 +-
 test_runner/regress/test_wal_acceptor.py      |  9 +--
 7 files changed, 99 insertions(+), 99 deletions(-)
 delete mode 100644 test_runner/fixtures/broker.py

diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs
index 144cd647c9..1c94c42865 100644
--- a/control_plane/src/bin/neon_local.rs
+++ b/control_plane/src/bin/neon_local.rs
@@ -106,6 +106,7 @@ fn main() -> Result<()> {
             "stop" => rt.block_on(handle_stop_all(sub_args, &env)),
             "pageserver" => rt.block_on(handle_pageserver(sub_args, &env)),
             "storage_controller" => rt.block_on(handle_storage_controller(sub_args, &env)),
+            "storage_broker" => rt.block_on(handle_storage_broker(sub_args, &env)),
             "safekeeper" => rt.block_on(handle_safekeeper(sub_args, &env)),
             "endpoint" => rt.block_on(handle_endpoint(sub_args, &env)),
             "mappings" => handle_mappings(sub_args, &mut env),
@@ -1245,6 +1246,32 @@ async fn handle_safekeeper(sub_match: &ArgMatches, env: &local_env::LocalEnv) ->
     Ok(())
 }
 
+async fn handle_storage_broker(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
+    let (sub_name, sub_args) = match sub_match.subcommand() {
+        Some(broker_command_data) => broker_command_data,
+        None => bail!("no broker subcommand provided"),
+    };
+
+    match sub_name {
+        "start" => {
+            if let Err(e) = broker::start_broker_process(env, get_start_timeout(sub_args)).await {
+                eprintln!("broker start failed: {e}");
+                exit(1);
+            }
+        }
+
+        "stop" => {
+            if let Err(e) = broker::stop_broker_process(env) {
+                eprintln!("broker stop failed: {e}");
+                exit(1);
+            }
+        }
+
+        _ => bail!("Unexpected broker subcommand '{}'", sub_name),
+    }
+    Ok(())
+}
+
 async fn handle_start_all(
     env: &local_env::LocalEnv,
     retry_timeout: &Duration,
@@ -1672,6 +1699,19 @@ fn cli() -> Command {
                             .arg(stop_mode_arg.clone())
                             .arg(instance_id))
         )
+        .subcommand(
+            Command::new("storage_broker")
+                .arg_required_else_help(true)
+                .about("Manage broker")
+                .subcommand(Command::new("start")
+                            .about("Start broker")
+                            .arg(timeout_arg.clone())
+                )
+                .subcommand(Command::new("stop")
+                            .about("Stop broker")
+                            .arg(stop_mode_arg.clone())
+                )
+        )
         .subcommand(
             Command::new("safekeeper")
                 .arg_required_else_help(true)
diff --git a/test_runner/fixtures/broker.py b/test_runner/fixtures/broker.py
deleted file mode 100644
index 8aca90a097..0000000000
--- a/test_runner/fixtures/broker.py
+++ /dev/null
@@ -1,63 +0,0 @@
-import subprocess
-import time
-from dataclasses import dataclass
-from pathlib import Path
-from typing import Any, Optional
-
-from fixtures.log_helper import log
-
-
-@dataclass
-class NeonBroker:
-    """An object managing storage_broker instance"""
-
-    logfile: Path
-    port: int
-    neon_binpath: Path
-    handle: Optional[subprocess.Popen[Any]] = None  # handle of running daemon
-
-    def listen_addr(self):
-        return f"127.0.0.1:{self.port}"
-
-    def client_url(self):
-        return f"http://{self.listen_addr()}"
-
-    def check_status(self):
-        return True  # TODO
-
-    def try_start(self):
-        if self.handle is not None:
-            log.debug(f"storage_broker is already running on port {self.port}")
-            return
-
-        listen_addr = self.listen_addr()
-        log.info(f'starting storage_broker to listen incoming connections at "{listen_addr}"')
-        with open(self.logfile, "wb") as logfile:
-            args = [
-                str(self.neon_binpath / "storage_broker"),
-                f"--listen-addr={listen_addr}",
-            ]
-            self.handle = subprocess.Popen(args, stdout=logfile, stderr=logfile)
-
-        # wait for start
-        started_at = time.time()
-        while True:
-            try:
-                self.check_status()
-            except Exception as e:
-                elapsed = time.time() - started_at
-                if elapsed > 5:
-                    raise RuntimeError(
-                        f"timed out waiting {elapsed:.0f}s for storage_broker start: {e}"
-                    ) from e
-                time.sleep(0.5)
-            else:
-                break  # success
-
-    def stop(self, immediate: bool = False):
-        if self.handle is not None:
-            if immediate:
-                self.handle.kill()
-            else:
-                self.handle.terminate()
-            self.handle.wait()
diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index 69284bfdfc..cbbb162cc6 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -60,7 +60,6 @@ from urllib3.util.retry import Retry
 
 from fixtures import overlayfs
 from fixtures.auth_tokens import AuthKeys, TokenScope
-from fixtures.broker import NeonBroker
 from fixtures.common_types import Lsn, NodeId, TenantId, TenantShardId, TimelineId
 from fixtures.endpoint.http import EndpointHttpClient
 from fixtures.log_helper import log
@@ -230,21 +229,6 @@ def port_distributor(worker_base_port: int, worker_port_num: int) -> PortDistrib
     return PortDistributor(base_port=worker_base_port, port_number=worker_port_num)
 
 
-@pytest.fixture(scope="function")
-def default_broker(
-    port_distributor: PortDistributor,
-    test_output_dir: Path,
-    neon_binpath: Path,
-) -> Iterator[NeonBroker]:
-    # multiple pytest sessions could get launched in parallel, get them different ports/datadirs
-    client_port = port_distributor.get_port()
-    broker_logfile = test_output_dir / "repo" / "storage_broker.log"
-
-    broker = NeonBroker(logfile=broker_logfile, port=client_port, neon_binpath=neon_binpath)
-    yield broker
-    broker.stop()
-
-
 @pytest.fixture(scope="session")
 def run_id() -> Iterator[uuid.UUID]:
     yield uuid.uuid4()
@@ -387,7 +371,6 @@ class NeonEnvBuilder:
         self,
         repo_dir: Path,
         port_distributor: PortDistributor,
-        broker: NeonBroker,
         run_id: uuid.UUID,
         mock_s3_server: MockS3Server,
         neon_binpath: Path,
@@ -428,7 +411,6 @@ class NeonEnvBuilder:
         # Safekeepers remote storage
         self.safekeepers_remote_storage: Optional[RemoteStorage] = None
 
-        self.broker = broker
         self.run_id = run_id
         self.mock_s3_server: MockS3Server = mock_s3_server
         self.pageserver_config_override = pageserver_config_override
@@ -940,6 +922,8 @@ class NeonEnvBuilder:
 
             self.env.storage_controller.assert_no_errors()
 
+            self.env.broker.assert_no_errors()
+
         try:
             self.overlay_cleanup_teardown()
         except Exception as e:
@@ -993,7 +977,7 @@ class NeonEnv:
         self.endpoints = EndpointFactory(self)
         self.safekeepers: List[Safekeeper] = []
         self.pageservers: List[NeonPageserver] = []
-        self.broker = config.broker
+        self.broker = NeonBroker(self)
         self.pageserver_remote_storage = config.pageserver_remote_storage
         self.safekeepers_remote_storage = config.safekeepers_remote_storage
         self.pg_version = config.pg_version
@@ -1168,7 +1152,7 @@ class NeonEnv:
             max_workers=2 + len(self.pageservers) + len(self.safekeepers)
         ) as executor:
             futs.append(
-                executor.submit(lambda: self.broker.try_start() or None)
+                executor.submit(lambda: self.broker.start() or None)
             )  # The `or None` is for the linter
 
             for pageserver in self.pageservers:
@@ -1225,7 +1209,7 @@ class NeonEnv:
                 pageserver.stop(immediate=immediate)
             except RuntimeError:
                 stop_later.append(pageserver)
-        self.broker.stop(immediate=immediate)
+        self.broker.stop()
 
         # TODO: for nice logging we need python 3.11 ExceptionGroup
         for ps in stop_later:
@@ -1339,7 +1323,6 @@ def neon_simple_env(
     pytestconfig: Config,
     port_distributor: PortDistributor,
     mock_s3_server: MockS3Server,
-    default_broker: NeonBroker,
     run_id: uuid.UUID,
     top_output_dir: Path,
     test_output_dir: Path,
@@ -1364,7 +1347,6 @@ def neon_simple_env(
         top_output_dir=top_output_dir,
         repo_dir=repo_dir,
         port_distributor=port_distributor,
-        broker=default_broker,
         mock_s3_server=mock_s3_server,
         neon_binpath=neon_binpath,
         pg_distrib_dir=pg_distrib_dir,
@@ -1392,7 +1374,6 @@ def neon_env_builder(
     neon_binpath: Path,
     pg_distrib_dir: Path,
     pg_version: PgVersion,
-    default_broker: NeonBroker,
     run_id: uuid.UUID,
     request: FixtureRequest,
     test_overlay_dir: Path,
@@ -1428,7 +1409,6 @@ def neon_env_builder(
         neon_binpath=neon_binpath,
         pg_distrib_dir=pg_distrib_dir,
         pg_version=pg_version,
-        broker=default_broker,
         run_id=run_id,
         preserve_database_files=cast(bool, pytestconfig.getoption("--preserve-database-files")),
         pageserver_virtual_file_io_engine=pageserver_virtual_file_io_engine,
@@ -1850,6 +1830,18 @@ class NeonCli(AbstractNeonCli):
             args.extend(["-m", "immediate"])
         return self.raw_cli(args)
 
+    def broker_start(
+        self, timeout_in_seconds: Optional[int] = None
+    ) -> "subprocess.CompletedProcess[str]":
+        cmd = ["storage_broker", "start"]
+        if timeout_in_seconds is not None:
+            cmd.append(f"--start-timeout={timeout_in_seconds}s")
+        return self.raw_cli(cmd)
+
+    def broker_stop(self) -> "subprocess.CompletedProcess[str]":
+        cmd = ["storage_broker", "stop"]
+        return self.raw_cli(cmd)
+
     def endpoint_create(
         self,
         branch_name: str,
@@ -4573,6 +4565,40 @@ class Safekeeper(LogUtils):
         wait_until(20, 0.5, paused)
 
 
+class NeonBroker(LogUtils):
+    """An object managing storage_broker instance"""
+
+    def __init__(self, env: NeonEnv):
+        super().__init__(logfile=env.repo_dir / "storage_broker.log")
+        self.env = env
+        self.port: int = self.env.port_distributor.get_port()
+        self.running = False
+
+    def start(
+        self,
+        timeout_in_seconds: Optional[int] = None,
+    ):
+        assert not self.running
+        self.env.neon_cli.broker_start(timeout_in_seconds)
+        self.running = True
+        return self
+
+    def stop(self):
+        if self.running:
+            self.env.neon_cli.broker_stop()
+            self.running = False
+        return self
+
+    def listen_addr(self):
+        return f"127.0.0.1:{self.port}"
+
+    def client_url(self):
+        return f"http://{self.listen_addr()}"
+
+    def assert_no_errors(self):
+        assert_no_errors(self.logfile, "storage_controller", [])
+
+
 # TODO: Replace with `StrEnum` when we upgrade to python 3.11
 class NodeKind(str, Enum):
     PAGESERVER = "pageserver"
diff --git a/test_runner/regress/test_neon_cli.py b/test_runner/regress/test_neon_cli.py
index ba170cfb4c..b65430ff49 100644
--- a/test_runner/regress/test_neon_cli.py
+++ b/test_runner/regress/test_neon_cli.py
@@ -134,6 +134,7 @@ def test_cli_start_stop(neon_env_builder: NeonEnvBuilder):
     env.neon_cli.pageserver_stop(env.pageserver.id)
     env.neon_cli.safekeeper_stop()
     env.neon_cli.storage_controller_stop(False)
+    env.neon_cli.broker_stop()
 
     # Keep NeonEnv state up to date, it usually owns starting/stopping services
     env.pageserver.running = False
@@ -176,6 +177,7 @@ def test_cli_start_stop_multi(neon_env_builder: NeonEnvBuilder):
 
     # Stop this to get out of the way of the following `start`
     env.neon_cli.storage_controller_stop(False)
+    env.neon_cli.broker_stop()
 
     # Default start
     res = env.neon_cli.raw_cli(["start"])
diff --git a/test_runner/regress/test_pageserver_generations.py b/test_runner/regress/test_pageserver_generations.py
index ebf58d2bd1..c923713432 100644
--- a/test_runner/regress/test_pageserver_generations.py
+++ b/test_runner/regress/test_pageserver_generations.py
@@ -134,7 +134,7 @@ def test_generations_upgrade(neon_env_builder: NeonEnvBuilder):
     )
 
     env = neon_env_builder.init_configs()
-    env.broker.try_start()
+    env.broker.start()
     for sk in env.safekeepers:
         sk.start()
     env.storage_controller.start()
diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py
index 2d72dbb2df..dc90a6e9a0 100644
--- a/test_runner/regress/test_storage_controller.py
+++ b/test_runner/regress/test_storage_controller.py
@@ -69,7 +69,7 @@ def test_storage_controller_smoke(
     env = neon_env_builder.init_configs()
 
     # Start services by hand so that we can skip a pageserver (this will start + register later)
-    env.broker.try_start()
+    env.broker.start()
     env.storage_controller.start()
     env.pageservers[0].start()
     env.pageservers[1].start()
@@ -292,7 +292,7 @@ def test_storage_controller_onboarding(neon_env_builder: NeonEnvBuilder, warm_up
 
     # Start services by hand so that we can skip registration on one of the pageservers
     env = neon_env_builder.init_configs()
-    env.broker.try_start()
+    env.broker.start()
     env.storage_controller.start()
 
     # This is the pageserver where we'll initially create the tenant.  Run it in emergency
@@ -2126,7 +2126,7 @@ def start_env(env: NeonEnv, storage_controller_port: int):
         max_workers=2 + len(env.pageservers) + len(env.safekeepers)
     ) as executor:
         futs.append(
-            executor.submit(lambda: env.broker.try_start() or None)
+            executor.submit(lambda: env.broker.start() or None)
         )  # The `or None` is for the linter
 
         for pageserver in env.pageservers:
diff --git a/test_runner/regress/test_wal_acceptor.py b/test_runner/regress/test_wal_acceptor.py
index 50fac441c0..4bf8cfe88f 100644
--- a/test_runner/regress/test_wal_acceptor.py
+++ b/test_runner/regress/test_wal_acceptor.py
@@ -19,7 +19,6 @@ import psycopg2.errors
 import psycopg2.extras
 import pytest
 import requests
-from fixtures.broker import NeonBroker
 from fixtures.common_types import Lsn, TenantId, TimelineId
 from fixtures.log_helper import log
 from fixtures.metrics import parse_metrics
@@ -1439,11 +1438,7 @@ class SafekeeperEnv:
     ):
         self.repo_dir = repo_dir
         self.port_distributor = port_distributor
-        self.broker = NeonBroker(
-            logfile=Path(self.repo_dir) / "storage_broker.log",
-            port=self.port_distributor.get_port(),
-            neon_binpath=neon_binpath,
-        )
+        self.fake_broker_endpoint = f"http://127.0.0.1:{port_distributor.get_port()}"
         self.pg_bin = pg_bin
         self.num_safekeepers = num_safekeepers
         self.bin_safekeeper = str(neon_binpath / "safekeeper")
@@ -1492,7 +1487,7 @@ class SafekeeperEnv:
             "--id",
             str(i),
             "--broker-endpoint",
-            self.broker.client_url(),
+            self.fake_broker_endpoint,
         ]
         log.info(f'Running command "{" ".join(cmd)}"')
 

From 3454ef7507000f05707418324dad392b366f8de3 Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki@neon.tech>
Date: Wed, 18 Sep 2024 13:16:51 +0300
Subject: [PATCH 18/49] Refactor ImageLayerWriter to avoid passing a Timeline
 to finish() (#9028)

Commit ca5390a89d made a similar change to DeltaLayerWriter.

We bumped into this with Stas with our hackathon project, to create a
standalong program to create image layers directly from a Postgres data
directory. It needs to create image layers without having a Timeline and
other pageserver machinery.

This downgrades the "created image layer {}" message from INFO to TRACE
level. TRACE is used for the corresponding message on delta layer
creation too. The path logged in the message is now the temporary path,
before the file is renamed to its final name. Again commit ca5390a89d
made the same change for the message on delta layer creation.
---
 .../src/tenant/storage_layer/image_layer.rs   | 43 ++++++++-----------
 .../src/tenant/storage_layer/split_writer.rs  | 16 +++----
 pageserver/src/tenant/timeline.rs             |  9 ++--
 pageserver/src/tenant/timeline/compaction.rs  |  6 ++-
 4 files changed, 35 insertions(+), 39 deletions(-)

diff --git a/pageserver/src/tenant/storage_layer/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs
index 875e223c9c..5de2582ab7 100644
--- a/pageserver/src/tenant/storage_layer/image_layer.rs
+++ b/pageserver/src/tenant/storage_layer/image_layer.rs
@@ -38,7 +38,7 @@ use crate::tenant::timeline::GetVectoredError;
 use crate::tenant::vectored_blob_io::{
     BlobFlag, StreamingVectoredReadPlanner, VectoredBlobReader, VectoredRead, VectoredReadPlanner,
 };
-use crate::tenant::{PageReconstructError, Timeline};
+use crate::tenant::PageReconstructError;
 use crate::virtual_file::owned_buffers_io::io_buf_ext::IoBufExt;
 use crate::virtual_file::{self, VirtualFile};
 use crate::{IMAGE_FILE_MAGIC, STORAGE_FORMAT_VERSION, TEMP_FILE_SUFFIX};
@@ -58,7 +58,6 @@ use std::io::SeekFrom;
 use std::ops::Range;
 use std::os::unix::prelude::FileExt;
 use std::str::FromStr;
-use std::sync::Arc;
 use tokio::sync::OnceCell;
 use tokio_stream::StreamExt;
 use tracing::*;
@@ -70,9 +69,7 @@ use utils::{
 };
 
 use super::layer_name::ImageLayerName;
-use super::{
-    AsLayerDesc, Layer, LayerName, PersistentLayerDesc, ResidentLayer, ValuesReconstructState,
-};
+use super::{AsLayerDesc, LayerName, PersistentLayerDesc, ValuesReconstructState};
 
 ///
 /// Header stored in the beginning of the file
@@ -800,10 +797,9 @@ impl ImageLayerWriterInner {
     ///
     async fn finish(
         self,
-        timeline: &Arc<Timeline>,
         ctx: &RequestContext,
         end_key: Option<Key>,
-    ) -> anyhow::Result<ResidentLayer> {
+    ) -> anyhow::Result<(PersistentLayerDesc, Utf8PathBuf)> {
         let index_start_blk =
             ((self.blob_writer.size() + PAGE_SZ as u64 - 1) / PAGE_SZ as u64) as u32;
 
@@ -879,12 +875,9 @@ impl ImageLayerWriterInner {
         // fsync the file
         file.sync_all().await?;
 
-        // FIXME: why not carry the virtualfile here, it supports renaming?
-        let layer = Layer::finish_creating(self.conf, timeline, desc, &self.path)?;
+        trace!("created image layer {}", self.path);
 
-        info!("created image layer {}", layer.local_path());
-
-        Ok(layer)
+        Ok((desc, self.path))
     }
 }
 
@@ -963,24 +956,18 @@ impl ImageLayerWriter {
     ///
     pub(crate) async fn finish(
         mut self,
-        timeline: &Arc<Timeline>,
         ctx: &RequestContext,
-    ) -> anyhow::Result<super::ResidentLayer> {
-        self.inner.take().unwrap().finish(timeline, ctx, None).await
+    ) -> anyhow::Result<(PersistentLayerDesc, Utf8PathBuf)> {
+        self.inner.take().unwrap().finish(ctx, None).await
     }
 
     /// Finish writing the image layer with an end key, used in [`super::split_writer::SplitImageLayerWriter`]. The end key determines the end of the image layer's covered range and is exclusive.
     pub(super) async fn finish_with_end_key(
         mut self,
-        timeline: &Arc<Timeline>,
         end_key: Key,
         ctx: &RequestContext,
-    ) -> anyhow::Result<super::ResidentLayer> {
-        self.inner
-            .take()
-            .unwrap()
-            .finish(timeline, ctx, Some(end_key))
-            .await
+    ) -> anyhow::Result<(PersistentLayerDesc, Utf8PathBuf)> {
+        self.inner.take().unwrap().finish(ctx, Some(end_key)).await
     }
 }
 
@@ -1084,7 +1071,7 @@ mod test {
         tenant::{
             config::TenantConf,
             harness::{TenantHarness, TIMELINE_ID},
-            storage_layer::ResidentLayer,
+            storage_layer::{Layer, ResidentLayer},
             vectored_blob_io::StreamingVectoredReadPlanner,
             Tenant, Timeline,
         },
@@ -1155,7 +1142,8 @@ mod test {
 
                 key = key.next();
             }
-            writer.finish(&timeline, &ctx).await.unwrap()
+            let (desc, path) = writer.finish(&ctx).await.unwrap();
+            Layer::finish_creating(tenant.conf, &timeline, desc, &path).unwrap()
         };
         let original_size = resident.metadata().file_size;
 
@@ -1217,7 +1205,9 @@ mod test {
                 .await
                 .unwrap();
             let replacement = if wrote_keys > 0 {
-                Some(filtered_writer.finish(&timeline, &ctx).await.unwrap())
+                let (desc, path) = filtered_writer.finish(&ctx).await.unwrap();
+                let resident = Layer::finish_creating(tenant.conf, &timeline, desc, &path).unwrap();
+                Some(resident)
             } else {
                 None
             };
@@ -1290,7 +1280,8 @@ mod test {
         for (key, img) in images {
             writer.put_image(key, img, ctx).await?;
         }
-        let img_layer = writer.finish(tline, ctx).await?;
+        let (desc, path) = writer.finish(ctx).await?;
+        let img_layer = Layer::finish_creating(tenant.conf, tline, desc, &path)?;
 
         Ok::<_, anyhow::Error>(img_layer)
     }
diff --git a/pageserver/src/tenant/storage_layer/split_writer.rs b/pageserver/src/tenant/storage_layer/split_writer.rs
index 40a6a77a50..b499a0eef4 100644
--- a/pageserver/src/tenant/storage_layer/split_writer.rs
+++ b/pageserver/src/tenant/storage_layer/split_writer.rs
@@ -121,11 +121,11 @@ impl SplitImageLayerWriter {
                 self.generated_layers
                     .push(SplitWriterResult::Discarded(layer_key));
             } else {
-                self.generated_layers.push(SplitWriterResult::Produced(
-                    prev_image_writer
-                        .finish_with_end_key(tline, key, ctx)
-                        .await?,
-                ));
+                let (desc, path) = prev_image_writer.finish_with_end_key(key, ctx).await?;
+
+                let layer = Layer::finish_creating(self.conf, tline, desc, &path)?;
+                self.generated_layers
+                    .push(SplitWriterResult::Produced(layer));
             }
         }
         self.inner.put_image(key, img, ctx).await
@@ -170,9 +170,9 @@ impl SplitImageLayerWriter {
         if discard(&layer_key).await {
             generated_layers.push(SplitWriterResult::Discarded(layer_key));
         } else {
-            generated_layers.push(SplitWriterResult::Produced(
-                inner.finish_with_end_key(tline, end_key, ctx).await?,
-            ));
+            let (desc, path) = inner.finish_with_end_key(end_key, ctx).await?;
+            let layer = Layer::finish_creating(self.conf, tline, desc, &path)?;
+            generated_layers.push(SplitWriterResult::Produced(layer));
         }
         Ok(generated_layers)
     }
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index 262dccac7d..f66491d962 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -4013,7 +4013,8 @@ impl Timeline {
         if wrote_keys {
             // Normal path: we have written some data into the new image layer for this
             // partition, so flush it to disk.
-            let image_layer = image_layer_writer.finish(self, ctx).await?;
+            let (desc, path) = image_layer_writer.finish(ctx).await?;
+            let image_layer = Layer::finish_creating(self.conf, self, desc, &path)?;
             Ok(ImageLayerCreationOutcome {
                 image: Some(image_layer),
                 next_start_key: img_range.end,
@@ -4101,7 +4102,8 @@ impl Timeline {
         if wrote_any_image {
             // Normal path: we have written some data into the new image layer for this
             // partition, so flush it to disk.
-            let image_layer = image_layer_writer.finish(self, ctx).await?;
+            let (desc, path) = image_layer_writer.finish(ctx).await?;
+            let image_layer = Layer::finish_creating(self.conf, self, desc, &path)?;
             Ok(ImageLayerCreationOutcome {
                 image: Some(image_layer),
                 next_start_key: img_range.end,
@@ -5403,7 +5405,8 @@ impl Timeline {
         for (key, img) in images {
             image_layer_writer.put_image(key, img, ctx).await?;
         }
-        let image_layer = image_layer_writer.finish(self, ctx).await?;
+        let (desc, path) = image_layer_writer.finish(ctx).await?;
+        let image_layer = Layer::finish_creating(self.conf, self, desc, &path)?;
 
         {
             let mut guard = self.layers.write().await;
diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs
index 0b5c520ba7..d1f06e3480 100644
--- a/pageserver/src/tenant/timeline/compaction.rs
+++ b/pageserver/src/tenant/timeline/compaction.rs
@@ -563,10 +563,12 @@ impl Timeline {
                 .await?;
 
             if keys_written > 0 {
-                let new_layer = image_layer_writer
-                    .finish(self, ctx)
+                let (desc, path) = image_layer_writer
+                    .finish(ctx)
                     .await
                     .map_err(CompactionError::Other)?;
+                let new_layer = Layer::finish_creating(self.conf, self, desc, &path)
+                    .map_err(CompactionError::Other)?;
                 tracing::info!(layer=%new_layer, "Rewrote layer, {} -> {} bytes",
                     layer.metadata().file_size,
                     new_layer.metadata().file_size);

From c5cd8577ff6d96e8153dd22af17373c4351e52e4 Mon Sep 17 00:00:00 2001
From: Folke Behrens <folke@neon.tech>
Date: Wed, 18 Sep 2024 13:58:51 +0200
Subject: [PATCH 19/49] proxy: make sql-over-http max request/response sizes
 configurable (#9029)

---
 proxy/src/bin/local_proxy.rs          |   8 ++
 proxy/src/bin/proxy.rs                |   8 ++
 proxy/src/config.rs                   |   2 +
 proxy/src/serverless/conn_pool.rs     |   2 +
 proxy/src/serverless/sql_over_http.rs | 114 +++++++++++++++-----------
 5 files changed, 85 insertions(+), 49 deletions(-)

diff --git a/proxy/src/bin/local_proxy.rs b/proxy/src/bin/local_proxy.rs
index 6eba71df1b..94365ddf05 100644
--- a/proxy/src/bin/local_proxy.rs
+++ b/proxy/src/bin/local_proxy.rs
@@ -92,6 +92,12 @@ struct SqlOverHttpArgs {
 
     #[clap(long, default_value_t = 16)]
     sql_over_http_cancel_set_shards: usize,
+
+    #[clap(long, default_value_t = 10 * 1024 * 1024)] // 10 MiB
+    sql_over_http_max_request_size_bytes: u64,
+
+    #[clap(long, default_value_t = 10 * 1024 * 1024)] // 10 MiB
+    sql_over_http_max_response_size_bytes: usize,
 }
 
 #[tokio::main]
@@ -208,6 +214,8 @@ fn build_config(args: &LocalProxyCliArgs) -> anyhow::Result<&'static ProxyConfig
         },
         cancel_set: CancelSet::new(args.sql_over_http.sql_over_http_cancel_set_shards),
         client_conn_threshold: args.sql_over_http.sql_over_http_client_conn_threshold,
+        max_request_size_bytes: args.sql_over_http.sql_over_http_max_request_size_bytes,
+        max_response_size_bytes: args.sql_over_http.sql_over_http_max_response_size_bytes,
     };
 
     Ok(Box::leak(Box::new(ProxyConfig {
diff --git a/proxy/src/bin/proxy.rs b/proxy/src/bin/proxy.rs
index ca9aeb04d8..e5c5b47795 100644
--- a/proxy/src/bin/proxy.rs
+++ b/proxy/src/bin/proxy.rs
@@ -268,6 +268,12 @@ struct SqlOverHttpArgs {
 
     #[clap(long, default_value_t = 64)]
     sql_over_http_cancel_set_shards: usize,
+
+    #[clap(long, default_value_t = 10 * 1024 * 1024)] // 10 MiB
+    sql_over_http_max_request_size_bytes: u64,
+
+    #[clap(long, default_value_t = 10 * 1024 * 1024)] // 10 MiB
+    sql_over_http_max_response_size_bytes: usize,
 }
 
 #[tokio::main]
@@ -679,6 +685,8 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> {
         },
         cancel_set: CancelSet::new(args.sql_over_http.sql_over_http_cancel_set_shards),
         client_conn_threshold: args.sql_over_http.sql_over_http_client_conn_threshold,
+        max_request_size_bytes: args.sql_over_http.sql_over_http_max_request_size_bytes,
+        max_response_size_bytes: args.sql_over_http.sql_over_http_max_response_size_bytes,
     };
     let authentication_config = AuthenticationConfig {
         thread_pool,
diff --git a/proxy/src/config.rs b/proxy/src/config.rs
index 1cda6d200c..373e4cf650 100644
--- a/proxy/src/config.rs
+++ b/proxy/src/config.rs
@@ -56,6 +56,8 @@ pub struct HttpConfig {
     pub pool_options: GlobalConnPoolOptions,
     pub cancel_set: CancelSet,
     pub client_conn_threshold: u64,
+    pub max_request_size_bytes: u64,
+    pub max_response_size_bytes: usize,
 }
 
 pub struct AuthenticationConfig {
diff --git a/proxy/src/serverless/conn_pool.rs b/proxy/src/serverless/conn_pool.rs
index bea599e9b9..6c32d5df0e 100644
--- a/proxy/src/serverless/conn_pool.rs
+++ b/proxy/src/serverless/conn_pool.rs
@@ -776,6 +776,8 @@ mod tests {
             },
             cancel_set: CancelSet::new(0),
             client_conn_threshold: u64::MAX,
+            max_request_size_bytes: u64::MAX,
+            max_response_size_bytes: usize::MAX,
         }));
         let pool = GlobalConnPool::new(config);
         let conn_info = ConnInfo {
diff --git a/proxy/src/serverless/sql_over_http.rs b/proxy/src/serverless/sql_over_http.rs
index 2188edc8c5..06e540d149 100644
--- a/proxy/src/serverless/sql_over_http.rs
+++ b/proxy/src/serverless/sql_over_http.rs
@@ -87,9 +87,6 @@ enum Payload {
     Batch(BatchQueryData),
 }
 
-const MAX_RESPONSE_SIZE: usize = 10 * 1024 * 1024; // 10 MiB
-const MAX_REQUEST_SIZE: u64 = 10 * 1024 * 1024; // 10 MiB
-
 static CONN_STRING: HeaderName = HeaderName::from_static("neon-connection-string");
 static RAW_TEXT_OUTPUT: HeaderName = HeaderName::from_static("neon-raw-text-output");
 static ARRAY_MODE: HeaderName = HeaderName::from_static("neon-array-mode");
@@ -366,10 +363,10 @@ pub(crate) enum SqlOverHttpError {
     ConnectCompute(#[from] HttpConnError),
     #[error("{0}")]
     ConnInfo(#[from] ConnInfoError),
-    #[error("request is too large (max is {MAX_REQUEST_SIZE} bytes)")]
-    RequestTooLarge,
-    #[error("response is too large (max is {MAX_RESPONSE_SIZE} bytes)")]
-    ResponseTooLarge,
+    #[error("request is too large (max is {0} bytes)")]
+    RequestTooLarge(u64),
+    #[error("response is too large (max is {0} bytes)")]
+    ResponseTooLarge(usize),
     #[error("invalid isolation level")]
     InvalidIsolationLevel,
     #[error("{0}")]
@@ -386,8 +383,8 @@ impl ReportableError for SqlOverHttpError {
             SqlOverHttpError::ReadPayload(e) => e.get_error_kind(),
             SqlOverHttpError::ConnectCompute(e) => e.get_error_kind(),
             SqlOverHttpError::ConnInfo(e) => e.get_error_kind(),
-            SqlOverHttpError::RequestTooLarge => ErrorKind::User,
-            SqlOverHttpError::ResponseTooLarge => ErrorKind::User,
+            SqlOverHttpError::RequestTooLarge(_) => ErrorKind::User,
+            SqlOverHttpError::ResponseTooLarge(_) => ErrorKind::User,
             SqlOverHttpError::InvalidIsolationLevel => ErrorKind::User,
             SqlOverHttpError::Postgres(p) => p.get_error_kind(),
             SqlOverHttpError::JsonConversion(_) => ErrorKind::Postgres,
@@ -402,8 +399,8 @@ impl UserFacingError for SqlOverHttpError {
             SqlOverHttpError::ReadPayload(p) => p.to_string(),
             SqlOverHttpError::ConnectCompute(c) => c.to_string_client(),
             SqlOverHttpError::ConnInfo(c) => c.to_string_client(),
-            SqlOverHttpError::RequestTooLarge => self.to_string(),
-            SqlOverHttpError::ResponseTooLarge => self.to_string(),
+            SqlOverHttpError::RequestTooLarge(_) => self.to_string(),
+            SqlOverHttpError::ResponseTooLarge(_) => self.to_string(),
             SqlOverHttpError::InvalidIsolationLevel => self.to_string(),
             SqlOverHttpError::Postgres(p) => p.to_string(),
             SqlOverHttpError::JsonConversion(_) => "could not parse postgres response".to_string(),
@@ -537,7 +534,7 @@ async fn handle_inner(
 
     let request_content_length = match request.body().size_hint().upper() {
         Some(v) => v,
-        None => MAX_REQUEST_SIZE + 1,
+        None => config.http_config.max_request_size_bytes + 1,
     };
     info!(request_content_length, "request size in bytes");
     Metrics::get()
@@ -547,8 +544,10 @@ async fn handle_inner(
 
     // we don't have a streaming request support yet so this is to prevent OOM
     // from a malicious user sending an extremely large request body
-    if request_content_length > MAX_REQUEST_SIZE {
-        return Err(SqlOverHttpError::RequestTooLarge);
+    if request_content_length > config.http_config.max_request_size_bytes {
+        return Err(SqlOverHttpError::RequestTooLarge(
+            config.http_config.max_request_size_bytes,
+        ));
     }
 
     let fetch_and_process_request = Box::pin(
@@ -612,7 +611,10 @@ async fn handle_inner(
 
     // Now execute the query and return the result.
     let json_output = match payload {
-        Payload::Single(stmt) => stmt.process(cancel, &mut client, parsed_headers).await?,
+        Payload::Single(stmt) => {
+            stmt.process(config, cancel, &mut client, parsed_headers)
+                .await?
+        }
         Payload::Batch(statements) => {
             if parsed_headers.txn_read_only {
                 response = response.header(TXN_READ_ONLY.clone(), &HEADER_VALUE_TRUE);
@@ -628,7 +630,7 @@ async fn handle_inner(
             }
 
             statements
-                .process(cancel, &mut client, parsed_headers)
+                .process(config, cancel, &mut client, parsed_headers)
                 .await?
         }
     };
@@ -656,6 +658,7 @@ async fn handle_inner(
 impl QueryData {
     async fn process(
         self,
+        config: &'static ProxyConfig,
         cancel: CancellationToken,
         client: &mut Client<tokio_postgres::Client>,
         parsed_headers: HttpHeaders,
@@ -664,7 +667,7 @@ impl QueryData {
         let cancel_token = inner.cancel_token();
 
         let res = match select(
-            pin!(query_to_json(&*inner, self, &mut 0, parsed_headers)),
+            pin!(query_to_json(config, &*inner, self, &mut 0, parsed_headers)),
             pin!(cancel.cancelled()),
         )
         .await
@@ -727,6 +730,7 @@ impl QueryData {
 impl BatchQueryData {
     async fn process(
         self,
+        config: &'static ProxyConfig,
         cancel: CancellationToken,
         client: &mut Client<tokio_postgres::Client>,
         parsed_headers: HttpHeaders,
@@ -751,44 +755,52 @@ impl BatchQueryData {
             discard.discard();
         })?;
 
-        let json_output =
-            match query_batch(cancel.child_token(), &transaction, self, parsed_headers).await {
-                Ok(json_output) => {
-                    info!("commit");
-                    let status = transaction.commit().await.inspect_err(|_| {
-                        // if we cannot commit - for now don't return connection to pool
-                        // TODO: get a query status from the error
-                        discard.discard();
-                    })?;
-                    discard.check_idle(status);
-                    json_output
-                }
-                Err(SqlOverHttpError::Cancelled(_)) => {
-                    if let Err(err) = cancel_token.cancel_query(NoTls).await {
-                        tracing::error!(?err, "could not cancel query");
-                    }
-                    // TODO: after cancelling, wait to see if we can get a status. maybe the connection is still safe.
+        let json_output = match query_batch(
+            config,
+            cancel.child_token(),
+            &transaction,
+            self,
+            parsed_headers,
+        )
+        .await
+        {
+            Ok(json_output) => {
+                info!("commit");
+                let status = transaction.commit().await.inspect_err(|_| {
+                    // if we cannot commit - for now don't return connection to pool
+                    // TODO: get a query status from the error
                     discard.discard();
+                })?;
+                discard.check_idle(status);
+                json_output
+            }
+            Err(SqlOverHttpError::Cancelled(_)) => {
+                if let Err(err) = cancel_token.cancel_query(NoTls).await {
+                    tracing::error!(?err, "could not cancel query");
+                }
+                // TODO: after cancelling, wait to see if we can get a status. maybe the connection is still safe.
+                discard.discard();
 
-                    return Err(SqlOverHttpError::Cancelled(SqlOverHttpCancel::Postgres));
-                }
-                Err(err) => {
-                    info!("rollback");
-                    let status = transaction.rollback().await.inspect_err(|_| {
-                        // if we cannot rollback - for now don't return connection to pool
-                        // TODO: get a query status from the error
-                        discard.discard();
-                    })?;
-                    discard.check_idle(status);
-                    return Err(err);
-                }
-            };
+                return Err(SqlOverHttpError::Cancelled(SqlOverHttpCancel::Postgres));
+            }
+            Err(err) => {
+                info!("rollback");
+                let status = transaction.rollback().await.inspect_err(|_| {
+                    // if we cannot rollback - for now don't return connection to pool
+                    // TODO: get a query status from the error
+                    discard.discard();
+                })?;
+                discard.check_idle(status);
+                return Err(err);
+            }
+        };
 
         Ok(json_output)
     }
 }
 
 async fn query_batch(
+    config: &'static ProxyConfig,
     cancel: CancellationToken,
     transaction: &Transaction<'_>,
     queries: BatchQueryData,
@@ -798,6 +810,7 @@ async fn query_batch(
     let mut current_size = 0;
     for stmt in queries.queries {
         let query = pin!(query_to_json(
+            config,
             transaction,
             stmt,
             &mut current_size,
@@ -826,6 +839,7 @@ async fn query_batch(
 }
 
 async fn query_to_json<T: GenericClient>(
+    config: &'static ProxyConfig,
     client: &T,
     data: QueryData,
     current_size: &mut usize,
@@ -846,8 +860,10 @@ async fn query_to_json<T: GenericClient>(
         rows.push(row);
         // we don't have a streaming response support yet so this is to prevent OOM
         // from a malicious query (eg a cross join)
-        if *current_size > MAX_RESPONSE_SIZE {
-            return Err(SqlOverHttpError::ResponseTooLarge);
+        if *current_size > config.http_config.max_response_size_bytes {
+            return Err(SqlOverHttpError::ResponseTooLarge(
+                config.http_config.max_response_size_bytes,
+            ));
         }
     }
 

From e161a2fa424e392c74a5f76e8964227e11f72e9c Mon Sep 17 00:00:00 2001
From: Alexander Bayandin <alexander@neon.tech>
Date: Wed, 18 Sep 2024 14:26:47 +0100
Subject: [PATCH 20/49] CI(deploy): fix deploy to staging and prod (#9030)

## Problem

It turns out the previous approach (with `skip_if` input) doesn't work
(from https://github.com/neondatabase/neon/pull/9017).
Revert it and use more straightforward if-conditions

## Summary of changes
- Revert efbe8db7f1bd7775f7dff276ee95fd187cd33c2c
- Add if-condition to`promote-compatibility-data` job and relevant
comments
---
 .github/workflows/_push-to-acr.yml   |  6 ------
 .github/workflows/build_and_test.yml | 10 ++++++----
 2 files changed, 6 insertions(+), 10 deletions(-)

diff --git a/.github/workflows/_push-to-acr.yml b/.github/workflows/_push-to-acr.yml
index 7b6eba2c06..c304172ff7 100644
--- a/.github/workflows/_push-to-acr.yml
+++ b/.github/workflows/_push-to-acr.yml
@@ -26,15 +26,9 @@ on:
         description: Azure tenant ID
         required: true
         type: string
-      skip_if:
-        description: Skip the job if this expression is true
-        required: true
-        type: boolean
 
 jobs:
   push-to-acr:
-    if: ${{ !inputs.skip_if }}
-
     runs-on: ubuntu-22.04
     permissions:
       contents: read  # This is required for actions/checkout
diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml
index d46b8dc1f5..a210c962cb 100644
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -849,6 +849,7 @@ jobs:
           done
 
   push-to-acr-dev:
+    if: github.ref_name == 'main'
     needs: [ tag, promote-images ]
     uses: ./.github/workflows/_push-to-acr.yml
     with:
@@ -858,9 +859,9 @@ jobs:
       registry_name: ${{ vars.AZURE_DEV_REGISTRY_NAME }}
       subscription_id: ${{ vars.AZURE_DEV_SUBSCRIPTION_ID }}
       tenant_id: ${{ vars.AZURE_TENANT_ID }}
-      skip_if: ${{ github.ref_name != 'main' }}
 
   push-to-acr-prod:
+    if: github.ref_name == 'release'|| github.ref_name == 'release-proxy'
     needs: [ tag, promote-images ]
     uses: ./.github/workflows/_push-to-acr.yml
     with:
@@ -870,7 +871,6 @@ jobs:
       registry_name: ${{ vars.AZURE_PROD_REGISTRY_NAME }}
       subscription_id: ${{ vars.AZURE_PROD_SUBSCRIPTION_ID }}
       tenant_id: ${{ vars.AZURE_TENANT_ID }}
-      skip_if: ${{ !startsWith(github.ref_name, 'release') }}
 
   trigger-custom-extensions-build-and-wait:
     needs: [ check-permissions, tag ]
@@ -948,7 +948,8 @@ jobs:
 
   deploy:
     needs: [ check-permissions, promote-images, tag, build-and-test-locally, trigger-custom-extensions-build-and-wait, push-to-acr-dev, push-to-acr-prod ]
-    if: github.ref_name == 'main' || github.ref_name == 'release' || github.ref_name == 'release-proxy'
+    # `!failure() && !cancelled()` is required because the workflow depends on the job that can be skipped: `push-to-acr-dev` and `push-to-acr-prod`
+    if: (github.ref_name == 'main' || github.ref_name == 'release' || github.ref_name == 'release-proxy') && !failure() && !cancelled()
 
     runs-on: [ self-hosted, small ]
     container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:latest
@@ -1046,7 +1047,8 @@ jobs:
   # The job runs on `release` branch and copies compatibility data and Neon artifact from the last *release PR* to the latest directory
   promote-compatibility-data:
     needs: [ deploy ]
-    if: github.ref_name == 'release'
+    # `!failure() && !cancelled()` is required because the workflow transitively depends on the job that can be skipped: `push-to-acr-dev` and `push-to-acr-prod`
+    if: github.ref_name == 'release' && !failure() && !cancelled()
 
     runs-on: ubuntu-22.04
     steps:

From 2f37f0384c1d122fe23512be3b9518288f69b5fe Mon Sep 17 00:00:00 2001
From: Tristan Partin <tristan@neon.tech>
Date: Tue, 17 Sep 2024 12:28:29 -0500
Subject: [PATCH 21/49] Add v17 to revisions.json

Signed-off-by: Tristan Partin <tristan@neon.tech>
---
 vendor/revisions.json | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/vendor/revisions.json b/vendor/revisions.json
index 3a65a507f3..c2c34962bb 100644
--- a/vendor/revisions.json
+++ b/vendor/revisions.json
@@ -1,4 +1,8 @@
 {
+  "v17": [
+    "17rc1",
+    "9156d63ce253bed9d1f76355ceec610e444eaffa"
+  ],
   "v16": [
     "16.4",
     "0baa7346dfd42d61912eeca554c9bb0a190f0a1e"

From ac6a1151aeaf8fcc007b833a97dca57328c44d05 Mon Sep 17 00:00:00 2001
From: Alexander Bayandin <alexander@neon.tech>
Date: Tue, 17 Sep 2024 19:08:47 +0100
Subject: [PATCH 22/49] test_postgres_version: reenable version check for
 prereleased versions

---
 test_runner/regress/test_postgres_version.py | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/test_runner/regress/test_postgres_version.py b/test_runner/regress/test_postgres_version.py
index 4145a303c6..d8626c15a5 100644
--- a/test_runner/regress/test_postgres_version.py
+++ b/test_runner/regress/test_postgres_version.py
@@ -30,9 +30,8 @@ def test_postgres_version(base_dir: Path, pg_bin: PgBin, pg_version: PgVersion):
     version = match.group("version")
     commit = match.group("commit")
 
-    if "." in version:
-        assert (
-            pg_version.v_prefixed in expected_revisions
-        ), f"Released PostgreSQL version `{pg_version.v_prefixed}` doesn't exist in `vendor/revisions.json`, please update it if these changes are intentional"
-        msg = f"Unexpected Postgres {pg_version} version: `{output}`, please update `vendor/revisions.json` if these changes are intentional"
-        assert [version, commit] == expected_revisions[pg_version.v_prefixed], msg
+    assert (
+        pg_version.v_prefixed in expected_revisions
+    ), f"Released PostgreSQL version `{pg_version.v_prefixed}` doesn't exist in `vendor/revisions.json`, please update it if these changes are intentional"
+    msg = f"Unexpected Postgres {pg_version} version: `{output}`, please update `vendor/revisions.json` if these changes are intentional"
+    assert [version, commit] == expected_revisions[pg_version.v_prefixed], msg

From 794bd4b866a385a104e257deae438581b0fd250d Mon Sep 17 00:00:00 2001
From: Folke Behrens <folke@neon.tech>
Date: Wed, 18 Sep 2024 17:14:53 +0200
Subject: [PATCH 23/49] proxy: mock cplane usable without allowed-ips table
 (#9046)

---
 proxy/src/bin/proxy.rs             | 21 ++++++++-------
 proxy/src/console/provider/mock.rs | 42 ++++++++++++++++++------------
 2 files changed, 38 insertions(+), 25 deletions(-)

diff --git a/proxy/src/bin/proxy.rs b/proxy/src/bin/proxy.rs
index e5c5b47795..2ac66ffe8c 100644
--- a/proxy/src/bin/proxy.rs
+++ b/proxy/src/bin/proxy.rs
@@ -62,12 +62,13 @@ static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc;
 #[derive(Clone, Debug, ValueEnum)]
 enum AuthBackendType {
     Console,
-    #[cfg(feature = "testing")]
-    Postgres,
     // clap only shows the name, not the alias, in usage text.
     // TODO: swap name/alias and deprecate "link"
     #[value(name("link"), alias("web"))]
     Web,
+
+    #[cfg(feature = "testing")]
+    Postgres,
 }
 
 /// Neon proxy/router
@@ -639,17 +640,19 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> {
             let api = console::provider::ConsoleBackend::Console(api);
             auth::Backend::Console(MaybeOwned::Owned(api), ())
         }
-        #[cfg(feature = "testing")]
-        AuthBackendType::Postgres => {
-            let url = args.auth_endpoint.parse()?;
-            let api = console::provider::mock::Api::new(url);
-            let api = console::provider::ConsoleBackend::Postgres(api);
-            auth::Backend::Console(MaybeOwned::Owned(api), ())
-        }
+
         AuthBackendType::Web => {
             let url = args.uri.parse()?;
             auth::Backend::Web(MaybeOwned::Owned(url), ())
         }
+
+        #[cfg(feature = "testing")]
+        AuthBackendType::Postgres => {
+            let url = args.auth_endpoint.parse()?;
+            let api = console::provider::mock::Api::new(url, !args.is_private_access_proxy);
+            let api = console::provider::ConsoleBackend::Postgres(api);
+            auth::Backend::Console(MaybeOwned::Owned(api), ())
+        }
     };
 
     let config::ConcurrencyLockOptions {
diff --git a/proxy/src/console/provider/mock.rs b/proxy/src/console/provider/mock.rs
index 08b87cd87a..1b77418de6 100644
--- a/proxy/src/console/provider/mock.rs
+++ b/proxy/src/console/provider/mock.rs
@@ -41,11 +41,15 @@ impl From<tokio_postgres::Error> for ApiError {
 #[derive(Clone)]
 pub struct Api {
     endpoint: ApiUrl,
+    ip_allowlist_check_enabled: bool,
 }
 
 impl Api {
-    pub fn new(endpoint: ApiUrl) -> Self {
-        Self { endpoint }
+    pub fn new(endpoint: ApiUrl, ip_allowlist_check_enabled: bool) -> Self {
+        Self {
+            endpoint,
+            ip_allowlist_check_enabled,
+        }
     }
 
     pub(crate) fn url(&self) -> &str {
@@ -64,6 +68,7 @@ impl Api {
                 tokio_postgres::connect(self.endpoint.as_str(), tokio_postgres::NoTls).await?;
 
             tokio::spawn(connection);
+
             let secret = if let Some(entry) = get_execute_postgres_query(
                 &client,
                 "select rolpassword from pg_catalog.pg_authid where rolname = $1",
@@ -79,21 +84,26 @@ impl Api {
                 warn!("user '{}' does not exist", user_info.user);
                 None
             };
-            let allowed_ips = match get_execute_postgres_query(
-                &client,
-                "select allowed_ips from neon_control_plane.endpoints where endpoint_id = $1",
-                &[&user_info.endpoint.as_str()],
-                "allowed_ips",
-            )
-            .await?
-            {
-                Some(s) => {
-                    info!("got allowed_ips: {s}");
-                    s.split(',')
-                        .map(|s| IpPattern::from_str(s).unwrap())
-                        .collect()
+
+            let allowed_ips = if self.ip_allowlist_check_enabled {
+                match get_execute_postgres_query(
+                    &client,
+                    "select allowed_ips from neon_control_plane.endpoints where endpoint_id = $1",
+                    &[&user_info.endpoint.as_str()],
+                    "allowed_ips",
+                )
+                .await?
+                {
+                    Some(s) => {
+                        info!("got allowed_ips: {s}");
+                        s.split(',')
+                            .map(|s| IpPattern::from_str(s).unwrap())
+                            .collect()
+                    }
+                    None => vec![],
                 }
-                None => vec![],
+            } else {
+                vec![]
             };
 
             Ok((secret, allowed_ips))

From 035a49a6b2e9ffa35dbf94d78042c147e83e896f Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Wed, 18 Sep 2024 18:17:55 +0200
Subject: [PATCH 24/49] `neon_local start`: parallel startup to break cyclic
 dependency (#8950)

(Found this useful during investigation
https://github.com/neondatabase/cloud/issues/16886.)

Problem
-------

Before this PR, `neon_local` sequentially does the following:
1. launch storcon process
2. wait for storcon to signal readiness
[here](https://github.com/neondatabase/neon/blob/75310fe441b87d399213e365f1364aa9f08aa40d/control_plane/src/storage_controller.rs#L804-L808)
3. start pageserver
4. wait for pageserver to become ready
[here](https://github.com/neondatabase/neon/blob/c43e664ff577d4568722e4e7a2b2c6267b609607/control_plane/src/pageserver.rs#L343-L346)
5. etc

The problem is that storcon's readiness waits for the
[`startup_reconcile`](https://github.com/neondatabase/neon/blob/cbcd4058edb7a2c2bb3bfe1a6fc1ffb0d820b870/storage_controller/src/service.rs#L520-L523)
to complete.

But pageservers aren't started at this point.

So, worst case we wait for `STARTUP_RECONCILE_TIMEOUT/2`, i.e., 15s.

This is more than the 10s default timeout allowed by neon_local.

So, the result is that `neon_local start` fails to start storcon and
stops everything.

Solution
--------

In this PR I choose the the radical solution to start everything in
parallel.

It junks up the output because we do stuff like `print!(".")` to
indicate progress.
We should just abandon that.
And switch to `utils::logging` + `tracing` with separate spans for each
component.
I can do that in this PR or we leave it as a follow-up.

Alternatives Considered
-----------------------

The Pageserver's `/v1/status` or in fact any endpoint of the mgmt API
will not `accept()` on the mgmt API socket until after the `re-attach`
call to storcon returned success.

So, it's insufficient to change the startup order to start Pageservers
first.

We cannot easily change Pageserver startup order because
`init_tenant_mgr` must complete before we start serving the mgmt API.
Otherwise tenant detach calls et al can race with `init_tenant_mgr`.

We'd have to add a "loading" state to tenant mgr and make all API
endpoints except `/v1/status` wait for _that_ to complete.


Related
-------

- https://github.com/neondatabase/neon/pull/6475
---
 control_plane/src/background_process.rs |   2 +-
 control_plane/src/bin/neon_local.rs     | 145 ++++++++++++++++--------
 control_plane/src/endpoint.rs           |   2 +-
 3 files changed, 99 insertions(+), 50 deletions(-)

diff --git a/control_plane/src/background_process.rs b/control_plane/src/background_process.rs
index 619c5bce3e..94a072e394 100644
--- a/control_plane/src/background_process.rs
+++ b/control_plane/src/background_process.rs
@@ -151,7 +151,7 @@ where
                     print!(".");
                     io::stdout().flush().unwrap();
                 }
-                thread::sleep(RETRY_INTERVAL);
+                tokio::time::sleep(RETRY_INTERVAL).await;
             }
             Err(e) => {
                 println!("error starting process {process_name:?}: {e:#}");
diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs
index 1c94c42865..92f609761a 100644
--- a/control_plane/src/bin/neon_local.rs
+++ b/control_plane/src/bin/neon_local.rs
@@ -34,12 +34,14 @@ use safekeeper_api::{
     DEFAULT_HTTP_LISTEN_PORT as DEFAULT_SAFEKEEPER_HTTP_PORT,
     DEFAULT_PG_LISTEN_PORT as DEFAULT_SAFEKEEPER_PG_PORT,
 };
+use std::borrow::Cow;
 use std::collections::{BTreeSet, HashMap};
 use std::path::PathBuf;
 use std::process::exit;
 use std::str::FromStr;
 use std::time::Duration;
 use storage_broker::DEFAULT_LISTEN_ADDR as DEFAULT_BROKER_ADDR;
+use tokio::task::JoinSet;
 use url::Host;
 use utils::{
     auth::{Claims, Scope},
@@ -87,35 +89,35 @@ fn main() -> Result<()> {
 
     // Check for 'neon init' command first.
     let subcommand_result = if sub_name == "init" {
-        handle_init(sub_args).map(Some)
+        handle_init(sub_args).map(|env| Some(Cow::Owned(env)))
     } else {
         // all other commands need an existing config
-        let mut env =
-            LocalEnv::load_config(&local_env::base_path()).context("Error loading config")?;
-        let original_env = env.clone();
 
+        let env = LocalEnv::load_config(&local_env::base_path()).context("Error loading config")?;
+        let original_env = env.clone();
+        let env = Box::leak(Box::new(env));
         let rt = tokio::runtime::Builder::new_current_thread()
             .enable_all()
             .build()
             .unwrap();
 
         let subcommand_result = match sub_name {
-            "tenant" => rt.block_on(handle_tenant(sub_args, &mut env)),
-            "timeline" => rt.block_on(handle_timeline(sub_args, &mut env)),
-            "start" => rt.block_on(handle_start_all(&env, get_start_timeout(sub_args))),
-            "stop" => rt.block_on(handle_stop_all(sub_args, &env)),
-            "pageserver" => rt.block_on(handle_pageserver(sub_args, &env)),
-            "storage_controller" => rt.block_on(handle_storage_controller(sub_args, &env)),
-            "storage_broker" => rt.block_on(handle_storage_broker(sub_args, &env)),
-            "safekeeper" => rt.block_on(handle_safekeeper(sub_args, &env)),
-            "endpoint" => rt.block_on(handle_endpoint(sub_args, &env)),
-            "mappings" => handle_mappings(sub_args, &mut env),
+            "tenant" => rt.block_on(handle_tenant(sub_args, env)),
+            "timeline" => rt.block_on(handle_timeline(sub_args, env)),
+            "start" => rt.block_on(handle_start_all(env, get_start_timeout(sub_args))),
+            "stop" => rt.block_on(handle_stop_all(sub_args, env)),
+            "pageserver" => rt.block_on(handle_pageserver(sub_args, env)),
+            "storage_controller" => rt.block_on(handle_storage_controller(sub_args, env)),
+            "storage_broker" => rt.block_on(handle_storage_broker(sub_args, env)),
+            "safekeeper" => rt.block_on(handle_safekeeper(sub_args, env)),
+            "endpoint" => rt.block_on(handle_endpoint(sub_args, env)),
+            "mappings" => handle_mappings(sub_args, env),
             "pg" => bail!("'pg' subcommand has been renamed to 'endpoint'"),
             _ => bail!("unexpected subcommand {sub_name}"),
         };
 
-        if original_env != env {
-            subcommand_result.map(|()| Some(env))
+        if &original_env != env {
+            subcommand_result.map(|()| Some(Cow::Borrowed(env)))
         } else {
             subcommand_result.map(|()| None)
         }
@@ -1273,48 +1275,95 @@ async fn handle_storage_broker(sub_match: &ArgMatches, env: &local_env::LocalEnv
 }
 
 async fn handle_start_all(
-    env: &local_env::LocalEnv,
+    env: &'static local_env::LocalEnv,
     retry_timeout: &Duration,
 ) -> anyhow::Result<()> {
+    let Err(errors) = handle_start_all_impl(env, *retry_timeout).await else {
+        neon_start_status_check(env, retry_timeout)
+            .await
+            .context("status check after successful startup of all services")?;
+        return Ok(());
+    };
+
+    eprintln!("startup failed because one or more services could not be started");
+
+    for e in errors {
+        eprintln!("{e}");
+        let debug_repr = format!("{e:?}");
+        for line in debug_repr.lines() {
+            eprintln!("  {line}");
+        }
+    }
+
+    try_stop_all(env, true).await;
+
+    exit(2);
+}
+
+/// Returns Ok() if and only if all services could be started successfully.
+/// Otherwise, returns the list of errors that occurred during startup.
+async fn handle_start_all_impl(
+    env: &'static local_env::LocalEnv,
+    retry_timeout: Duration,
+) -> Result<(), Vec<anyhow::Error>> {
     // Endpoints are not started automatically
 
-    broker::start_broker_process(env, retry_timeout).await?;
+    let mut js = JoinSet::new();
 
-    // Only start the storage controller if the pageserver is configured to need it
-    if env.control_plane_api.is_some() {
-        let storage_controller = StorageController::from_env(env);
-        if let Err(e) = storage_controller
-            .start(NeonStorageControllerStartArgs::with_default_instance_id(
-                (*retry_timeout).into(),
-            ))
-            .await
-        {
-            eprintln!("storage_controller start failed: {:#}", e);
-            try_stop_all(env, true).await;
-            exit(1);
+    // force infalliblity through closure
+    #[allow(clippy::redundant_closure_call)]
+    (|| {
+        js.spawn(async move {
+            let retry_timeout = retry_timeout;
+            broker::start_broker_process(env, &retry_timeout).await
+        });
+
+        // Only start the storage controller if the pageserver is configured to need it
+        if env.control_plane_api.is_some() {
+            js.spawn(async move {
+                let storage_controller = StorageController::from_env(env);
+                storage_controller
+                    .start(NeonStorageControllerStartArgs::with_default_instance_id(
+                        retry_timeout.into(),
+                    ))
+                    .await
+                    .map_err(|e| e.context("start storage_controller"))
+            });
+        }
+
+        for ps_conf in &env.pageservers {
+            js.spawn(async move {
+                let pageserver = PageServerNode::from_env(env, ps_conf);
+                pageserver
+                    .start(&retry_timeout)
+                    .await
+                    .map_err(|e| e.context(format!("start pageserver {}", ps_conf.id)))
+            });
+        }
+
+        for node in env.safekeepers.iter() {
+            js.spawn(async move {
+                let safekeeper = SafekeeperNode::from_env(env, node);
+                safekeeper
+                    .start(vec![], &retry_timeout)
+                    .await
+                    .map_err(|e| e.context(format!("start safekeeper {}", safekeeper.id)))
+            });
+        }
+    })();
+
+    let mut errors = Vec::new();
+    while let Some(result) = js.join_next().await {
+        let result = result.expect("we don't panic or cancel the tasks");
+        if let Err(e) = result {
+            errors.push(e);
         }
     }
 
-    for ps_conf in &env.pageservers {
-        let pageserver = PageServerNode::from_env(env, ps_conf);
-        if let Err(e) = pageserver.start(retry_timeout).await {
-            eprintln!("pageserver {} start failed: {:#}", ps_conf.id, e);
-            try_stop_all(env, true).await;
-            exit(1);
-        }
+    if !errors.is_empty() {
+        return Err(errors);
     }
 
-    for node in env.safekeepers.iter() {
-        let safekeeper = SafekeeperNode::from_env(env, node);
-        if let Err(e) = safekeeper.start(vec![], retry_timeout).await {
-            eprintln!("safekeeper {} start failed: {:#}", safekeeper.id, e);
-            try_stop_all(env, false).await;
-            exit(1);
-        }
-    }
-
-    neon_start_status_check(env, retry_timeout).await?;
-
     Ok(())
 }
 
diff --git a/control_plane/src/endpoint.rs b/control_plane/src/endpoint.rs
index 9f879c4b08..7554a03a68 100644
--- a/control_plane/src/endpoint.rs
+++ b/control_plane/src/endpoint.rs
@@ -702,7 +702,7 @@ impl Endpoint {
                     }
                 }
             }
-            std::thread::sleep(ATTEMPT_INTERVAL);
+            tokio::time::sleep(ATTEMPT_INTERVAL).await;
         }
 
         // disarm the scopeguard, let the child outlive this function (and neon_local invoction)

From 9d1c6f23d312f43ecd0d7134d7566020626548b2 Mon Sep 17 00:00:00 2001
From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com>
Date: Wed, 18 Sep 2024 14:13:57 -0400
Subject: [PATCH 25/49] fix(storage-scrubber): log version after initialize the
 logger (#9049)

When I checked the log in Grafana I couldn't find the scrubber version.
Then I realized that it should be logged after the logger gets
initialized.

## Summary of changes

Log after initializing the logger for the scrubber.

Signed-off-by: Alex Chi Z <chi@neon.tech>
---
 storage_scrubber/src/main.rs | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/storage_scrubber/src/main.rs b/storage_scrubber/src/main.rs
index c5961753c5..ee133e2e58 100644
--- a/storage_scrubber/src/main.rs
+++ b/storage_scrubber/src/main.rs
@@ -121,8 +121,6 @@ enum Command {
 async fn main() -> anyhow::Result<()> {
     let cli = Cli::parse();
 
-    tracing::info!("version: {}, build_tag {}", GIT_VERSION, BUILD_TAG);
-
     let bucket_config = BucketConfig::from_env()?;
 
     let command_log_name = match &cli.command {
@@ -142,6 +140,8 @@ async fn main() -> anyhow::Result<()> {
         chrono::Utc::now().format("%Y_%m_%d__%H_%M_%S")
     ));
 
+    tracing::info!("version: {}, build_tag {}", GIT_VERSION, BUILD_TAG);
+
     let controller_client = cli.controller_api.map(|controller_api| {
         ControllerClientConfig {
             controller_api,

From 728b79b9ddc9987315e56085b12c5a4c46a7e0f4 Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki@neon.tech>
Date: Wed, 18 Sep 2024 00:12:13 +0300
Subject: [PATCH 26/49] Remove some unnecessary derives

---
 libs/consumption_metrics/src/lib.rs    | 4 ++--
 libs/pageserver_api/src/models.rs      | 2 +-
 libs/postgres_ffi/wal_craft/src/lib.rs | 1 -
 safekeeper/src/pull_timeline.rs        | 4 ++--
 4 files changed, 5 insertions(+), 6 deletions(-)

diff --git a/libs/consumption_metrics/src/lib.rs b/libs/consumption_metrics/src/lib.rs
index 810196aff6..fbe2e6830f 100644
--- a/libs/consumption_metrics/src/lib.rs
+++ b/libs/consumption_metrics/src/lib.rs
@@ -5,7 +5,7 @@ use chrono::{DateTime, Utc};
 use rand::Rng;
 use serde::{Deserialize, Serialize};
 
-#[derive(Serialize, serde::Deserialize, Debug, Clone, Copy, Eq, PartialEq, Ord, PartialOrd)]
+#[derive(Serialize, Deserialize, Debug, Clone, Copy, Eq, PartialEq, Ord, PartialOrd)]
 #[serde(tag = "type")]
 pub enum EventType {
     #[serde(rename = "absolute")]
@@ -107,7 +107,7 @@ pub const CHUNK_SIZE: usize = 1000;
 
 // Just a wrapper around a slice of events
 // to serialize it as `{"events" : [ ] }
-#[derive(serde::Serialize, serde::Deserialize)]
+#[derive(serde::Serialize, Deserialize)]
 pub struct EventChunk<'a, T: Clone> {
     pub events: std::borrow::Cow<'a, [T]>,
 }
diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs
index 45e84baa1f..c9be53f0b0 100644
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -495,7 +495,7 @@ pub struct CompactionAlgorithmSettings {
     pub kind: CompactionAlgorithm,
 }
 
-#[derive(Debug, PartialEq, Eq, Clone, serde::Deserialize, serde::Serialize)]
+#[derive(Debug, PartialEq, Eq, Clone, Deserialize, Serialize)]
 #[serde(tag = "mode", rename_all = "kebab-case", deny_unknown_fields)]
 pub enum L0FlushConfig {
     #[serde(rename_all = "snake_case")]
diff --git a/libs/postgres_ffi/wal_craft/src/lib.rs b/libs/postgres_ffi/wal_craft/src/lib.rs
index 949e3f4251..ddaafe65f1 100644
--- a/libs/postgres_ffi/wal_craft/src/lib.rs
+++ b/libs/postgres_ffi/wal_craft/src/lib.rs
@@ -26,7 +26,6 @@ macro_rules! xlog_utils_test {
 
 postgres_ffi::for_all_postgres_versions! { xlog_utils_test }
 
-#[derive(Debug, Clone, PartialEq, Eq)]
 pub struct Conf {
     pub pg_version: u32,
     pub pg_distrib_dir: PathBuf,
diff --git a/safekeeper/src/pull_timeline.rs b/safekeeper/src/pull_timeline.rs
index 64585f5edc..c772ae6de7 100644
--- a/safekeeper/src/pull_timeline.rs
+++ b/safekeeper/src/pull_timeline.rs
@@ -278,7 +278,7 @@ impl WalResidentTimeline {
 }
 
 /// pull_timeline request body.
-#[derive(Debug, Serialize, Deserialize)]
+#[derive(Debug, Deserialize)]
 pub struct Request {
     pub tenant_id: TenantId,
     pub timeline_id: TimelineId,
@@ -293,7 +293,7 @@ pub struct Response {
 }
 
 /// Response for debug dump request.
-#[derive(Debug, Serialize, Deserialize)]
+#[derive(Debug, Deserialize)]
 pub struct DebugDumpResponse {
     pub start_time: DateTime<Utc>,
     pub finish_time: DateTime<Utc>,

From 15ae1fc3df41ba9f1e5093eaf6b8e7b91e32700f Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki@neon.tech>
Date: Thu, 19 Sep 2024 00:38:26 +0300
Subject: [PATCH 27/49] Remove a few postgres constants that were not used

Dead code is generally useless, but with Postgres constants in
particular, I'm also worried that if they're not used anywhere, we
might fail to update them at a Postgres version update, and get very
confused later when they have wrong values.
---
 libs/postgres_ffi/src/pg_constants.rs | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/libs/postgres_ffi/src/pg_constants.rs b/libs/postgres_ffi/src/pg_constants.rs
index 61b49a634d..497d011d7a 100644
--- a/libs/postgres_ffi/src/pg_constants.rs
+++ b/libs/postgres_ffi/src/pg_constants.rs
@@ -9,8 +9,8 @@
 //! comments on them.
 //!
 
+use crate::PageHeaderData;
 use crate::BLCKSZ;
-use crate::{PageHeaderData, XLogRecord};
 
 //
 // From pg_tablespace_d.h
@@ -194,8 +194,6 @@ pub const XLR_RMGR_INFO_MASK: u8 = 0xF0;
 pub const XLOG_TBLSPC_CREATE: u8 = 0x00;
 pub const XLOG_TBLSPC_DROP: u8 = 0x10;
 
-pub const SIZEOF_XLOGRECORD: u32 = size_of::<XLogRecord>() as u32;
-
 //
 // from xlogrecord.h
 //
@@ -219,8 +217,6 @@ pub const BKPIMAGE_HAS_HOLE: u8 = 0x01; /* page image has "hole" */
 /* From transam.h */
 pub const FIRST_NORMAL_TRANSACTION_ID: u32 = 3;
 pub const INVALID_TRANSACTION_ID: u32 = 0;
-pub const FIRST_BOOTSTRAP_OBJECT_ID: u32 = 12000;
-pub const FIRST_NORMAL_OBJECT_ID: u32 = 16384;
 
 /* pg_control.h */
 pub const XLOG_CHECKPOINT_SHUTDOWN: u8 = 0x00;

From 7b34c2d7af5c67f413cc93b16ca6dbe6932072f2 Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki@neon.tech>
Date: Wed, 18 Sep 2024 23:39:28 +0300
Subject: [PATCH 28/49] Remove misc dead code in libs/

---
 libs/postgres_backend/src/lib.rs | 10 ----------
 libs/remote_storage/src/lib.rs   |  4 ----
 libs/utils/src/vec_map.rs        | 26 --------------------------
 3 files changed, 40 deletions(-)

diff --git a/libs/postgres_backend/src/lib.rs b/libs/postgres_backend/src/lib.rs
index 8ea4b93fb1..e274d24585 100644
--- a/libs/postgres_backend/src/lib.rs
+++ b/libs/postgres_backend/src/lib.rs
@@ -280,16 +280,6 @@ pub struct PostgresBackend<IO> {
 
 pub type PostgresBackendTCP = PostgresBackend<tokio::net::TcpStream>;
 
-pub fn query_from_cstring(query_string: Bytes) -> Vec<u8> {
-    let mut query_string = query_string.to_vec();
-    if let Some(ch) = query_string.last() {
-        if *ch == 0 {
-            query_string.pop();
-        }
-    }
-    query_string
-}
-
 /// Cast a byte slice to a string slice, dropping null terminator if there's one.
 fn cstr_to_str(bytes: &[u8]) -> anyhow::Result<&str> {
     let without_null = bytes.strip_suffix(&[0]).unwrap_or(bytes);
diff --git a/libs/remote_storage/src/lib.rs b/libs/remote_storage/src/lib.rs
index b5b69c9faf..45267ccda9 100644
--- a/libs/remote_storage/src/lib.rs
+++ b/libs/remote_storage/src/lib.rs
@@ -127,10 +127,6 @@ impl RemotePath {
         &self.0
     }
 
-    pub fn extension(&self) -> Option<&str> {
-        self.0.extension()
-    }
-
     pub fn strip_prefix(&self, p: &RemotePath) -> Result<&Utf8Path, std::path::StripPrefixError> {
         self.0.strip_prefix(&p.0)
     }
diff --git a/libs/utils/src/vec_map.rs b/libs/utils/src/vec_map.rs
index 5f0028bacd..1fe048c6f0 100644
--- a/libs/utils/src/vec_map.rs
+++ b/libs/utils/src/vec_map.rs
@@ -120,32 +120,6 @@ impl<K: Ord, V> VecMap<K, V> {
         Ok((None, delta_size))
     }
 
-    /// Split the map into two.
-    ///
-    /// The left map contains everything before `cutoff` (exclusive).
-    /// Right map contains `cutoff` and everything after (inclusive).
-    pub fn split_at(&self, cutoff: &K) -> (Self, Self)
-    where
-        K: Clone,
-        V: Clone,
-    {
-        let split_idx = self
-            .data
-            .binary_search_by_key(&cutoff, extract_key)
-            .unwrap_or_else(std::convert::identity);
-
-        (
-            VecMap {
-                data: self.data[..split_idx].to_vec(),
-                ordering: self.ordering,
-            },
-            VecMap {
-                data: self.data[split_idx..].to_vec(),
-                ordering: self.ordering,
-            },
-        )
-    }
-
     /// Move items from `other` to the end of `self`, leaving `other` empty.
     /// If the `other` ordering is different from `self` ordering
     /// `ExtendOrderingError` error will be returned.

From 5da2340e740c96ac3f0da110c52c62f184a5c92c Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki@neon.tech>
Date: Thu, 19 Sep 2024 01:05:19 +0300
Subject: [PATCH 29/49] Remove misc dead code in control_plane/

---
 control_plane/src/pageserver.rs | 43 +--------------------------------
 1 file changed, 1 insertion(+), 42 deletions(-)

diff --git a/control_plane/src/pageserver.rs b/control_plane/src/pageserver.rs
index 33ca70af96..cae9416af6 100644
--- a/control_plane/src/pageserver.rs
+++ b/control_plane/src/pageserver.rs
@@ -17,9 +17,7 @@ use std::time::Duration;
 
 use anyhow::{bail, Context};
 use camino::Utf8PathBuf;
-use pageserver_api::models::{
-    self, AuxFilePolicy, LocationConfig, TenantHistorySize, TenantInfo, TimelineInfo,
-};
+use pageserver_api::models::{self, AuxFilePolicy, TenantInfo, TimelineInfo};
 use pageserver_api::shard::TenantShardId;
 use pageserver_client::mgmt_api;
 use postgres_backend::AuthType;
@@ -324,22 +322,6 @@ impl PageServerNode {
         background_process::stop_process(immediate, "pageserver", &self.pid_file())
     }
 
-    pub async fn page_server_psql_client(
-        &self,
-    ) -> anyhow::Result<(
-        tokio_postgres::Client,
-        tokio_postgres::Connection<tokio_postgres::Socket, tokio_postgres::tls::NoTlsStream>,
-    )> {
-        let mut config = self.pg_connection_config.clone();
-        if self.conf.pg_auth_type == AuthType::NeonJWT {
-            let token = self
-                .env
-                .generate_auth_token(&Claims::new(None, Scope::PageServerApi))?;
-            config = config.set_password(Some(token));
-        }
-        Ok(config.connect_no_tls().await?)
-    }
-
     pub async fn check_status(&self) -> mgmt_api::Result<()> {
         self.http_client.status().await
     }
@@ -540,19 +522,6 @@ impl PageServerNode {
         Ok(())
     }
 
-    pub async fn location_config(
-        &self,
-        tenant_shard_id: TenantShardId,
-        config: LocationConfig,
-        flush_ms: Option<Duration>,
-        lazy: bool,
-    ) -> anyhow::Result<()> {
-        Ok(self
-            .http_client
-            .location_config(tenant_shard_id, config, flush_ms, lazy)
-            .await?)
-    }
-
     pub async fn timeline_list(
         &self,
         tenant_shard_id: &TenantShardId,
@@ -636,14 +605,4 @@ impl PageServerNode {
 
         Ok(())
     }
-
-    pub async fn tenant_synthetic_size(
-        &self,
-        tenant_shard_id: TenantShardId,
-    ) -> anyhow::Result<TenantHistorySize> {
-        Ok(self
-            .http_client
-            .tenant_synthetic_size(tenant_shard_id)
-            .await?)
-    }
 }

From 2d4e5af18be812523e24133656523bb3e8ee9ecb Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki@neon.tech>
Date: Thu, 19 Sep 2024 01:04:57 +0300
Subject: [PATCH 30/49] Remove unused code for parsing a postgresql.conf file

---
 control_plane/src/postgresql_conf.rs | 121 +--------------------------
 1 file changed, 1 insertion(+), 120 deletions(-)

diff --git a/control_plane/src/postgresql_conf.rs b/control_plane/src/postgresql_conf.rs
index 638575eb82..5aee12dc97 100644
--- a/control_plane/src/postgresql_conf.rs
+++ b/control_plane/src/postgresql_conf.rs
@@ -4,13 +4,10 @@
 /// NOTE: This doesn't implement the full, correct postgresql.conf syntax. Just
 /// enough to extract a few settings we need in Neon, assuming you don't do
 /// funny stuff like include-directives or funny escaping.
-use anyhow::{bail, Context, Result};
 use once_cell::sync::Lazy;
 use regex::Regex;
 use std::collections::HashMap;
 use std::fmt;
-use std::io::BufRead;
-use std::str::FromStr;
 
 /// In-memory representation of a postgresql.conf file
 #[derive(Default, Debug)]
@@ -19,84 +16,16 @@ pub struct PostgresConf {
     hash: HashMap<String, String>,
 }
 
-static CONF_LINE_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"^((?:\w|\.)+)\s*=\s*(\S+)$").unwrap());
-
 impl PostgresConf {
     pub fn new() -> PostgresConf {
         PostgresConf::default()
     }
 
-    /// Read file into memory
-    pub fn read(read: impl std::io::Read) -> Result<PostgresConf> {
-        let mut result = Self::new();
-
-        for line in std::io::BufReader::new(read).lines() {
-            let line = line?;
-
-            // Store each line in a vector, in original format
-            result.lines.push(line.clone());
-
-            // Also parse each line and insert key=value lines into a hash map.
-            //
-            // FIXME: This doesn't match exactly the flex/bison grammar in PostgreSQL.
-            // But it's close enough for our usage.
-            let line = line.trim();
-            if line.starts_with('#') {
-                // comment, ignore
-                continue;
-            } else if let Some(caps) = CONF_LINE_RE.captures(line) {
-                let name = caps.get(1).unwrap().as_str();
-                let raw_val = caps.get(2).unwrap().as_str();
-
-                if let Ok(val) = deescape_str(raw_val) {
-                    // Note: if there's already an entry in the hash map for
-                    // this key, this will replace it. That's the behavior what
-                    // we want; when PostgreSQL reads the file, each line
-                    // overrides any previous value for the same setting.
-                    result.hash.insert(name.to_string(), val.to_string());
-                }
-            }
-        }
-        Ok(result)
-    }
-
     /// Return the current value of 'option'
     pub fn get(&self, option: &str) -> Option<&str> {
         self.hash.get(option).map(|x| x.as_ref())
     }
 
-    /// Return the current value of a field, parsed to the right datatype.
-    ///
-    /// This calls the FromStr::parse() function on the value of the field. If
-    /// the field does not exist, or parsing fails, returns an error.
-    ///
-    pub fn parse_field<T>(&self, field_name: &str, context: &str) -> Result<T>
-    where
-        T: FromStr,
-        <T as FromStr>::Err: std::error::Error + Send + Sync + 'static,
-    {
-        self.get(field_name)
-            .with_context(|| format!("could not find '{}' option {}", field_name, context))?
-            .parse::<T>()
-            .with_context(|| format!("could not parse '{}' option {}", field_name, context))
-    }
-
-    pub fn parse_field_optional<T>(&self, field_name: &str, context: &str) -> Result<Option<T>>
-    where
-        T: FromStr,
-        <T as FromStr>::Err: std::error::Error + Send + Sync + 'static,
-    {
-        if let Some(val) = self.get(field_name) {
-            let result = val
-                .parse::<T>()
-                .with_context(|| format!("could not parse '{}' option {}", field_name, context))?;
-
-            Ok(Some(result))
-        } else {
-            Ok(None)
-        }
-    }
-
     ///
     /// Note: if you call this multiple times for the same option, the config
     /// file will a line for each call. It would be nice to have a function
@@ -154,48 +83,8 @@ fn escape_str(s: &str) -> String {
     }
 }
 
-/// De-escape a possibly-quoted value.
-///
-/// See `DeescapeQuotedString` function in PostgreSQL sources for how PostgreSQL
-/// does this.
-fn deescape_str(s: &str) -> Result<String> {
-    // If the string has a quote at the beginning and end, strip them out.
-    if s.len() >= 2 && s.starts_with('\'') && s.ends_with('\'') {
-        let mut result = String::new();
-
-        let mut iter = s[1..(s.len() - 1)].chars().peekable();
-        while let Some(c) = iter.next() {
-            let newc = if c == '\\' {
-                match iter.next() {
-                    Some('b') => '\x08',
-                    Some('f') => '\x0c',
-                    Some('n') => '\n',
-                    Some('r') => '\r',
-                    Some('t') => '\t',
-                    Some('0'..='7') => {
-                        // TODO
-                        bail!("octal escapes not supported");
-                    }
-                    Some(n) => n,
-                    None => break,
-                }
-            } else if c == '\'' && iter.peek() == Some(&'\'') {
-                // doubled quote becomes just one quote
-                iter.next().unwrap()
-            } else {
-                c
-            };
-
-            result.push(newc);
-        }
-        Ok(result)
-    } else {
-        Ok(s.to_string())
-    }
-}
-
 #[test]
-fn test_postgresql_conf_escapes() -> Result<()> {
+fn test_postgresql_conf_escapes() -> anyhow::Result<()> {
     assert_eq!(escape_str("foo bar"), "'foo bar'");
     // these don't need to be quoted
     assert_eq!(escape_str("foo"), "foo");
@@ -214,13 +103,5 @@ fn test_postgresql_conf_escapes() -> Result<()> {
     assert_eq!(escape_str("fo\\o"), "'fo\\\\o'");
     assert_eq!(escape_str("10 cats"), "'10 cats'");
 
-    // Test de-escaping
-    assert_eq!(deescape_str(&escape_str("foo"))?, "foo");
-    assert_eq!(deescape_str(&escape_str("fo'o\nba\\r"))?, "fo'o\nba\\r");
-    assert_eq!(deescape_str("'\\b\\f\\n\\r\\t'")?, "\x08\x0c\n\r\t");
-
-    // octal-escapes are currently not supported
-    assert!(deescape_str("'foo\\7\\07\\007'").is_err());
-
     Ok(())
 }

From a523548ed1791c683efe76cf5a2f42443846e358 Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki@neon.tech>
Date: Thu, 19 Sep 2024 00:16:15 +0300
Subject: [PATCH 31/49] Remove unused cleanup_remaining_timeline_fs_traces
 function

There's some more code that still checks for uninit and delete
markers, see callers of is_delete_mark and is_uninit_mark, and github
issue #5718. But these functions were outright dead.
---
 pageserver/src/config.rs                 | 14 +---------
 pageserver/src/tenant/timeline/delete.rs | 35 +-----------------------
 2 files changed, 2 insertions(+), 47 deletions(-)

diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs
index e9f197ec2d..525d9afebc 100644
--- a/pageserver/src/config.rs
+++ b/pageserver/src/config.rs
@@ -13,7 +13,6 @@ use pageserver_api::{
 use remote_storage::{RemotePath, RemoteStorageConfig};
 use std::env;
 use storage_broker::Uri;
-use utils::crashsafe::path_with_suffix_extension;
 use utils::logging::SecretString;
 
 use once_cell::sync::OnceCell;
@@ -33,7 +32,7 @@ use crate::tenant::storage_layer::inmemory_layer::IndexEntry;
 use crate::tenant::{TENANTS_SEGMENT_NAME, TIMELINES_SEGMENT_NAME};
 use crate::virtual_file;
 use crate::virtual_file::io_engine;
-use crate::{TENANT_HEATMAP_BASENAME, TENANT_LOCATION_CONFIG_NAME, TIMELINE_DELETE_MARK_SUFFIX};
+use crate::{TENANT_HEATMAP_BASENAME, TENANT_LOCATION_CONFIG_NAME};
 
 /// Global state of pageserver.
 ///
@@ -257,17 +256,6 @@ impl PageServerConf {
             .join(timeline_id.to_string())
     }
 
-    pub(crate) fn timeline_delete_mark_file_path(
-        &self,
-        tenant_shard_id: TenantShardId,
-        timeline_id: TimelineId,
-    ) -> Utf8PathBuf {
-        path_with_suffix_extension(
-            self.timeline_path(&tenant_shard_id, &timeline_id),
-            TIMELINE_DELETE_MARK_SUFFIX,
-        )
-    }
-
     /// Turns storage remote path of a file into its local path.
     pub fn local_path(&self, remote_path: &RemotePath) -> Utf8PathBuf {
         remote_path.with_base(&self.workdir)
diff --git a/pageserver/src/tenant/timeline/delete.rs b/pageserver/src/tenant/timeline/delete.rs
index dc4118bb4a..90db08ea81 100644
--- a/pageserver/src/tenant/timeline/delete.rs
+++ b/pageserver/src/tenant/timeline/delete.rs
@@ -135,25 +135,6 @@ async fn delete_remote_layers_and_index(timeline: &Timeline) -> anyhow::Result<(
         .context("delete_all")
 }
 
-// This function removs remaining traces of a timeline on disk.
-// Namely: metadata file, timeline directory, delete mark.
-// Note: io::ErrorKind::NotFound are ignored for metadata and timeline dir.
-// delete mark should be present because it is the last step during deletion.
-// (nothing can fail after its deletion)
-async fn cleanup_remaining_timeline_fs_traces(
-    conf: &PageServerConf,
-    tenant_shard_id: TenantShardId,
-    timeline_id: TimelineId,
-) -> anyhow::Result<()> {
-    // Remove delete mark
-    // TODO: once we are confident that no more exist in the field, remove this
-    // line.  It cleans up a legacy marker file that might in rare cases be present.
-    tokio::fs::remove_file(conf.timeline_delete_mark_file_path(tenant_shard_id, timeline_id))
-        .await
-        .or_else(fs_ext::ignore_not_found)
-        .context("remove delete mark")
-}
-
 /// It is important that this gets called when DeletionGuard is being held.
 /// For more context see comments in [`DeleteTimelineFlow::prepare`]
 async fn remove_timeline_from_tenant(
@@ -194,12 +175,10 @@ async fn remove_timeline_from_tenant(
 /// 7. Delete mark file
 ///
 /// It is resumable from any step in case a crash/restart occurs.
-/// There are three entrypoints to the process:
+/// There are two entrypoints to the process:
 /// 1. [`DeleteTimelineFlow::run`] this is the main one called by a management api handler.
 /// 2. [`DeleteTimelineFlow::resume_deletion`] is called during restarts when local metadata is still present
 ///    and we possibly neeed to continue deletion of remote files.
-/// 3. [`DeleteTimelineFlow::cleanup_remaining_timeline_fs_traces`] is used when we deleted remote
-///    index but still have local metadata, timeline directory and delete mark.
 ///
 /// Note the only other place that messes around timeline delete mark is the logic that scans directory with timelines during tenant load.
 #[derive(Default)]
@@ -311,18 +290,6 @@ impl DeleteTimelineFlow {
         Ok(())
     }
 
-    #[instrument(skip_all, fields(%timeline_id))]
-    pub async fn cleanup_remaining_timeline_fs_traces(
-        tenant: &Tenant,
-        timeline_id: TimelineId,
-    ) -> anyhow::Result<()> {
-        let r =
-            cleanup_remaining_timeline_fs_traces(tenant.conf, tenant.tenant_shard_id, timeline_id)
-                .await;
-        info!("Done");
-        r
-    }
-
     fn prepare(
         tenant: &Tenant,
         timeline_id: TimelineId,

From 2753abc0d88cad4ac0e9d96f95fddb7515ff7204 Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki@neon.tech>
Date: Thu, 19 Sep 2024 01:10:09 +0300
Subject: [PATCH 32/49] Remove leftover enums for configuring vectored get
 implementation

The settings were removed in commit corb9d2c7b.
---
 libs/pageserver_api/src/config.rs | 34 -------------------------------
 1 file changed, 34 deletions(-)

diff --git a/libs/pageserver_api/src/config.rs b/libs/pageserver_api/src/config.rs
index 1194ee93ef..fa6f594ea5 100644
--- a/libs/pageserver_api/src/config.rs
+++ b/libs/pageserver_api/src/config.rs
@@ -173,40 +173,6 @@ impl Default for EvictionOrder {
     }
 }
 
-#[derive(
-    Eq,
-    PartialEq,
-    Debug,
-    Copy,
-    Clone,
-    strum_macros::EnumString,
-    strum_macros::Display,
-    serde_with::DeserializeFromStr,
-    serde_with::SerializeDisplay,
-)]
-#[strum(serialize_all = "kebab-case")]
-pub enum GetVectoredImpl {
-    Sequential,
-    Vectored,
-}
-
-#[derive(
-    Eq,
-    PartialEq,
-    Debug,
-    Copy,
-    Clone,
-    strum_macros::EnumString,
-    strum_macros::Display,
-    serde_with::DeserializeFromStr,
-    serde_with::SerializeDisplay,
-)]
-#[strum(serialize_all = "kebab-case")]
-pub enum GetImpl {
-    Legacy,
-    Vectored,
-}
-
 #[derive(Copy, Clone, Debug, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
 #[serde(transparent)]
 pub struct MaxVectoredReadBytes(pub NonZeroUsize);

From 5c68e6a1724361824e64dc84b9523809a1581e7b Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki@neon.tech>
Date: Wed, 18 Sep 2024 23:43:06 +0300
Subject: [PATCH 33/49] Remove unused constant

The code that used it was removed in commit b9d2c7bdd5
---
 libs/pageserver_api/src/config.rs | 2 --
 1 file changed, 2 deletions(-)

diff --git a/libs/pageserver_api/src/config.rs b/libs/pageserver_api/src/config.rs
index fa6f594ea5..425e710372 100644
--- a/libs/pageserver_api/src/config.rs
+++ b/libs/pageserver_api/src/config.rs
@@ -304,8 +304,6 @@ pub mod defaults {
     pub const DEFAULT_IMAGE_COMPRESSION: ImageCompressionAlgorithm =
         ImageCompressionAlgorithm::Zstd { level: Some(1) };
 
-    pub const DEFAULT_VALIDATE_VECTORED_GET: bool = false;
-
     pub const DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB: usize = 0;
 
     pub const DEFAULT_IO_BUFFER_ALIGNMENT: usize = 512;

From 06d55a3b12b551902d9a1484459a2da71082a0ca Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki@neon.tech>
Date: Thu, 19 Sep 2024 00:38:42 +0300
Subject: [PATCH 34/49] Clean up concurrent logical size calc semaphore
 initialization

The DEFAULT_CONCURRENT_TENANT_SIZE_LOGICAL_SIZE_QUERIES constant was
unused, because we had just hardcoded it to 1 where the constant
should've been used.

Remove the ConfigurableSemaphore::Default implementation, since it was
unused.
---
 libs/pageserver_api/src/config.rs |  5 ++++-
 pageserver/src/config.rs          | 11 -----------
 2 files changed, 4 insertions(+), 12 deletions(-)

diff --git a/libs/pageserver_api/src/config.rs b/libs/pageserver_api/src/config.rs
index 425e710372..1eb0757592 100644
--- a/libs/pageserver_api/src/config.rs
+++ b/libs/pageserver_api/src/config.rs
@@ -340,7 +340,10 @@ impl Default for ConfigToml {
 
             concurrent_tenant_warmup: (NonZeroUsize::new(DEFAULT_CONCURRENT_TENANT_WARMUP)
                 .expect("Invalid default constant")),
-            concurrent_tenant_size_logical_size_queries: NonZeroUsize::new(1).unwrap(),
+            concurrent_tenant_size_logical_size_queries: NonZeroUsize::new(
+                DEFAULT_CONCURRENT_TENANT_SIZE_LOGICAL_SIZE_QUERIES,
+            )
+            .unwrap(),
             metric_collection_interval: (humantime::parse_duration(
                 DEFAULT_METRIC_COLLECTION_INTERVAL,
             )
diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs
index 525d9afebc..8567c6aa52 100644
--- a/pageserver/src/config.rs
+++ b/pageserver/src/config.rs
@@ -479,11 +479,6 @@ pub struct ConfigurableSemaphore {
 }
 
 impl ConfigurableSemaphore {
-    pub const DEFAULT_INITIAL: NonZeroUsize = match NonZeroUsize::new(1) {
-        Some(x) => x,
-        None => panic!("const unwrap is not yet stable"),
-    };
-
     /// Initializse using a non-zero amount of permits.
     ///
     /// Require a non-zero initial permits, because using permits == 0 is a crude way to disable a
@@ -504,12 +499,6 @@ impl ConfigurableSemaphore {
     }
 }
 
-impl Default for ConfigurableSemaphore {
-    fn default() -> Self {
-        Self::new(Self::DEFAULT_INITIAL)
-    }
-}
-
 impl PartialEq for ConfigurableSemaphore {
     fn eq(&self, other: &Self) -> bool {
         // the number of permits can be increased at runtime, so we cannot really fulfill the

From 7c489092b7c63c0b3e42597b976522face0baaf8 Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki@neon.tech>
Date: Wed, 18 Sep 2024 23:46:09 +0300
Subject: [PATCH 35/49] Remove unused duplicate DEFAULT_INGEST_BATCH_SIZE
 constant

This constant in 'tenant_conf_defaults' was unused, but there's
another constant with the same name in the global 'defaults'. I wish
the setting was configurable per-tenant, but it isn't, so let's remove
the confusing duplicate.
---
 libs/pageserver_api/src/config.rs | 2 --
 1 file changed, 2 deletions(-)

diff --git a/libs/pageserver_api/src/config.rs b/libs/pageserver_api/src/config.rs
index 1eb0757592..61e32bc9ab 100644
--- a/libs/pageserver_api/src/config.rs
+++ b/libs/pageserver_api/src/config.rs
@@ -434,8 +434,6 @@ pub mod tenant_conf_defaults {
     // By default ingest enough WAL for two new L0 layers before checking if new image
     // image layers should be created.
     pub const DEFAULT_IMAGE_LAYER_CREATION_CHECK_THRESHOLD: u8 = 2;
-
-    pub const DEFAULT_INGEST_BATCH_SIZE: u64 = 100;
 }
 
 impl Default for TenantConfigToml {

From 32a0e759bd57f40af6c168f3122a761fb154b5b1 Mon Sep 17 00:00:00 2001
From: Arseny Sher <sher-ars@yandex.ru>
Date: Fri, 13 Sep 2024 01:28:12 +0300
Subject: [PATCH 36/49] safekeeper: add wal_last_modified to debug_dump.

Adds to debug_dump option to include highest modified time among all WAL
segments. In passing replace some str with OsStr to have less unwraps.
---
 libs/postgres_ffi/src/xlog_utils.rs           | 34 ++++++++++++++-----
 libs/postgres_ffi/wal_craft/src/lib.rs        |  5 +--
 .../wal_craft/src/xlog_utils_test.rs          | 19 ++++++-----
 safekeeper/src/debug_dump.rs                  | 33 ++++++++++++++++++
 safekeeper/src/http/routes.rs                 |  4 +++
 safekeeper/src/wal_storage.rs                 | 25 ++++++--------
 test_runner/regress/test_wal_acceptor.py      |  1 +
 7 files changed, 88 insertions(+), 33 deletions(-)

diff --git a/libs/postgres_ffi/src/xlog_utils.rs b/libs/postgres_ffi/src/xlog_utils.rs
index 1873734753..a636bd2a97 100644
--- a/libs/postgres_ffi/src/xlog_utils.rs
+++ b/libs/postgres_ffi/src/xlog_utils.rs
@@ -26,6 +26,7 @@ use bytes::{Buf, Bytes};
 use log::*;
 
 use serde::Serialize;
+use std::ffi::OsStr;
 use std::fs::File;
 use std::io::prelude::*;
 use std::io::ErrorKind;
@@ -78,19 +79,34 @@ pub fn XLogFileName(tli: TimeLineID, logSegNo: XLogSegNo, wal_segsz_bytes: usize
     )
 }
 
-pub fn XLogFromFileName(fname: &str, wal_seg_size: usize) -> (XLogSegNo, TimeLineID) {
-    let tli = u32::from_str_radix(&fname[0..8], 16).unwrap();
-    let log = u32::from_str_radix(&fname[8..16], 16).unwrap() as XLogSegNo;
-    let seg = u32::from_str_radix(&fname[16..24], 16).unwrap() as XLogSegNo;
-    (log * XLogSegmentsPerXLogId(wal_seg_size) + seg, tli)
+pub fn XLogFromFileName(
+    fname: &OsStr,
+    wal_seg_size: usize,
+) -> anyhow::Result<(XLogSegNo, TimeLineID)> {
+    if let Some(fname_str) = fname.to_str() {
+        let tli = u32::from_str_radix(&fname_str[0..8], 16)?;
+        let log = u32::from_str_radix(&fname_str[8..16], 16)? as XLogSegNo;
+        let seg = u32::from_str_radix(&fname_str[16..24], 16)? as XLogSegNo;
+        Ok((log * XLogSegmentsPerXLogId(wal_seg_size) + seg, tli))
+    } else {
+        anyhow::bail!("non-ut8 filename: {:?}", fname);
+    }
 }
 
-pub fn IsXLogFileName(fname: &str) -> bool {
-    return fname.len() == XLOG_FNAME_LEN && fname.chars().all(|c| c.is_ascii_hexdigit());
+pub fn IsXLogFileName(fname: &OsStr) -> bool {
+    if let Some(fname) = fname.to_str() {
+        fname.len() == XLOG_FNAME_LEN && fname.chars().all(|c| c.is_ascii_hexdigit())
+    } else {
+        false
+    }
 }
 
-pub fn IsPartialXLogFileName(fname: &str) -> bool {
-    fname.ends_with(".partial") && IsXLogFileName(&fname[0..fname.len() - 8])
+pub fn IsPartialXLogFileName(fname: &OsStr) -> bool {
+    if let Some(fname) = fname.to_str() {
+        fname.ends_with(".partial") && IsXLogFileName(OsStr::new(&fname[0..fname.len() - 8]))
+    } else {
+        false
+    }
 }
 
 /// If LSN points to the beginning of the page, then shift it to first record,
diff --git a/libs/postgres_ffi/wal_craft/src/lib.rs b/libs/postgres_ffi/wal_craft/src/lib.rs
index ddaafe65f1..5c0abda522 100644
--- a/libs/postgres_ffi/wal_craft/src/lib.rs
+++ b/libs/postgres_ffi/wal_craft/src/lib.rs
@@ -7,6 +7,7 @@ use postgres_ffi::{WAL_SEGMENT_SIZE, XLOG_BLCKSZ};
 use postgres_ffi::{
     XLOG_SIZE_OF_XLOG_LONG_PHD, XLOG_SIZE_OF_XLOG_RECORD, XLOG_SIZE_OF_XLOG_SHORT_PHD,
 };
+use std::ffi::OsStr;
 use std::path::{Path, PathBuf};
 use std::process::Command;
 use std::time::{Duration, Instant};
@@ -135,8 +136,8 @@ impl Conf {
 
     pub fn pg_waldump(
         &self,
-        first_segment_name: &str,
-        last_segment_name: &str,
+        first_segment_name: &OsStr,
+        last_segment_name: &OsStr,
     ) -> anyhow::Result<std::process::Output> {
         let first_segment_file = self.datadir.join(first_segment_name);
         let last_segment_file = self.datadir.join(last_segment_name);
diff --git a/libs/postgres_ffi/wal_craft/src/xlog_utils_test.rs b/libs/postgres_ffi/wal_craft/src/xlog_utils_test.rs
index 79d45de67a..9eb3f0e95a 100644
--- a/libs/postgres_ffi/wal_craft/src/xlog_utils_test.rs
+++ b/libs/postgres_ffi/wal_craft/src/xlog_utils_test.rs
@@ -4,6 +4,7 @@ use super::*;
 use crate::{error, info};
 use regex::Regex;
 use std::cmp::min;
+use std::ffi::OsStr;
 use std::fs::{self, File};
 use std::io::Write;
 use std::{env, str::FromStr};
@@ -54,7 +55,7 @@ fn test_end_of_wal<C: crate::Crafter>(test_name: &str) {
         .wal_dir()
         .read_dir()
         .unwrap()
-        .map(|f| f.unwrap().file_name().into_string().unwrap())
+        .map(|f| f.unwrap().file_name())
         .filter(|fname| IsXLogFileName(fname))
         .max()
         .unwrap();
@@ -70,11 +71,11 @@ fn test_end_of_wal<C: crate::Crafter>(test_name: &str) {
             start_lsn
         );
         for file in fs::read_dir(cfg.wal_dir()).unwrap().flatten() {
-            let fname = file.file_name().into_string().unwrap();
+            let fname = file.file_name();
             if !IsXLogFileName(&fname) {
                 continue;
             }
-            let (segno, _) = XLogFromFileName(&fname, WAL_SEGMENT_SIZE);
+            let (segno, _) = XLogFromFileName(&fname, WAL_SEGMENT_SIZE).unwrap();
             let seg_start_lsn = XLogSegNoOffsetToRecPtr(segno, 0, WAL_SEGMENT_SIZE);
             if seg_start_lsn > u64::from(*start_lsn) {
                 continue;
@@ -93,10 +94,10 @@ fn test_end_of_wal<C: crate::Crafter>(test_name: &str) {
     }
 }
 
-fn find_pg_waldump_end_of_wal(cfg: &crate::Conf, last_segment: &str) -> Lsn {
+fn find_pg_waldump_end_of_wal(cfg: &crate::Conf, last_segment: &OsStr) -> Lsn {
     // Get the actual end of WAL by pg_waldump
     let waldump_output = cfg
-        .pg_waldump("000000010000000000000001", last_segment)
+        .pg_waldump(OsStr::new("000000010000000000000001"), last_segment)
         .unwrap()
         .stderr;
     let waldump_output = std::str::from_utf8(&waldump_output).unwrap();
@@ -117,7 +118,7 @@ fn find_pg_waldump_end_of_wal(cfg: &crate::Conf, last_segment: &str) -> Lsn {
 
 fn check_end_of_wal(
     cfg: &crate::Conf,
-    last_segment: &str,
+    last_segment: &OsStr,
     start_lsn: Lsn,
     expected_end_of_wal: Lsn,
 ) {
@@ -132,7 +133,8 @@ fn check_end_of_wal(
     // Rename file to partial to actually find last valid lsn, then rename it back.
     fs::rename(
         cfg.wal_dir().join(last_segment),
-        cfg.wal_dir().join(format!("{}.partial", last_segment)),
+        cfg.wal_dir()
+            .join(format!("{}.partial", last_segment.to_str().unwrap())),
     )
     .unwrap();
     let wal_end = find_end_of_wal(&cfg.wal_dir(), WAL_SEGMENT_SIZE, start_lsn).unwrap();
@@ -142,7 +144,8 @@ fn check_end_of_wal(
     );
     assert_eq!(wal_end, expected_end_of_wal);
     fs::rename(
-        cfg.wal_dir().join(format!("{}.partial", last_segment)),
+        cfg.wal_dir()
+            .join(format!("{}.partial", last_segment.to_str().unwrap())),
         cfg.wal_dir().join(last_segment),
     )
     .unwrap();
diff --git a/safekeeper/src/debug_dump.rs b/safekeeper/src/debug_dump.rs
index 15b0272cd9..589536c7a8 100644
--- a/safekeeper/src/debug_dump.rs
+++ b/safekeeper/src/debug_dump.rs
@@ -17,6 +17,7 @@ use postgres_ffi::MAX_SEND_SIZE;
 use serde::Deserialize;
 use serde::Serialize;
 
+use postgres_ffi::v14::xlog_utils::{IsPartialXLogFileName, IsXLogFileName};
 use sha2::{Digest, Sha256};
 use utils::id::NodeId;
 use utils::id::TenantTimelineId;
@@ -51,6 +52,9 @@ pub struct Args {
     /// Dump full term history. True by default.
     pub dump_term_history: bool,
 
+    /// Dump last modified time of WAL segments. Uses value of `dump_all` by default.
+    pub dump_wal_last_modified: bool,
+
     /// Filter timelines by tenant_id.
     pub tenant_id: Option<TenantId>,
 
@@ -128,12 +132,19 @@ async fn build_from_tli_dump(
         None
     };
 
+    let wal_last_modified = if args.dump_wal_last_modified {
+        get_wal_last_modified(timeline_dir).ok().flatten()
+    } else {
+        None
+    };
+
     Timeline {
         tenant_id: timeline.ttid.tenant_id,
         timeline_id: timeline.ttid.timeline_id,
         control_file,
         memory,
         disk_content,
+        wal_last_modified,
     }
 }
 
@@ -156,6 +167,7 @@ pub struct Timeline {
     pub control_file: Option<TimelinePersistentState>,
     pub memory: Option<Memory>,
     pub disk_content: Option<DiskContent>,
+    pub wal_last_modified: Option<DateTime<Utc>>,
 }
 
 #[derive(Debug, Serialize, Deserialize)]
@@ -302,6 +314,27 @@ fn build_file_info(entry: DirEntry) -> Result<FileInfo> {
     })
 }
 
+/// Get highest modified time of WAL segments in the directory.
+fn get_wal_last_modified(path: &Utf8Path) -> Result<Option<DateTime<Utc>>> {
+    let mut res = None;
+    for entry in fs::read_dir(path)? {
+        if entry.is_err() {
+            continue;
+        }
+        let entry = entry?;
+        /* Ignore files that are not XLOG segments */
+        let fname = entry.file_name();
+        if !IsXLogFileName(&fname) && !IsPartialXLogFileName(&fname) {
+            continue;
+        }
+
+        let metadata = entry.metadata()?;
+        let modified: DateTime<Utc> = DateTime::from(metadata.modified()?);
+        res = std::cmp::max(res, Some(modified));
+    }
+    Ok(res)
+}
+
 /// Converts SafeKeeperConf to Config, filtering out the fields that are not
 /// supposed to be exposed.
 fn build_config(config: SafeKeeperConf) -> Config {
diff --git a/safekeeper/src/http/routes.rs b/safekeeper/src/http/routes.rs
index e482edea55..b4590fe3e5 100644
--- a/safekeeper/src/http/routes.rs
+++ b/safekeeper/src/http/routes.rs
@@ -481,6 +481,7 @@ async fn dump_debug_handler(mut request: Request<Body>) -> Result<Response<Body>
     let mut dump_memory: Option<bool> = None;
     let mut dump_disk_content: Option<bool> = None;
     let mut dump_term_history: Option<bool> = None;
+    let mut dump_wal_last_modified: Option<bool> = None;
     let mut tenant_id: Option<TenantId> = None;
     let mut timeline_id: Option<TimelineId> = None;
 
@@ -494,6 +495,7 @@ async fn dump_debug_handler(mut request: Request<Body>) -> Result<Response<Body>
             "dump_memory" => dump_memory = Some(parse_kv_str(&k, &v)?),
             "dump_disk_content" => dump_disk_content = Some(parse_kv_str(&k, &v)?),
             "dump_term_history" => dump_term_history = Some(parse_kv_str(&k, &v)?),
+            "dump_wal_last_modified" => dump_wal_last_modified = Some(parse_kv_str(&k, &v)?),
             "tenant_id" => tenant_id = Some(parse_kv_str(&k, &v)?),
             "timeline_id" => timeline_id = Some(parse_kv_str(&k, &v)?),
             _ => Err(ApiError::BadRequest(anyhow::anyhow!(
@@ -508,6 +510,7 @@ async fn dump_debug_handler(mut request: Request<Body>) -> Result<Response<Body>
     let dump_memory = dump_memory.unwrap_or(dump_all);
     let dump_disk_content = dump_disk_content.unwrap_or(dump_all);
     let dump_term_history = dump_term_history.unwrap_or(true);
+    let dump_wal_last_modified = dump_wal_last_modified.unwrap_or(dump_all);
 
     let args = debug_dump::Args {
         dump_all,
@@ -515,6 +518,7 @@ async fn dump_debug_handler(mut request: Request<Body>) -> Result<Response<Body>
         dump_memory,
         dump_disk_content,
         dump_term_history,
+        dump_wal_last_modified,
         tenant_id,
         timeline_id,
     };
diff --git a/safekeeper/src/wal_storage.rs b/safekeeper/src/wal_storage.rs
index 46c260901d..6e7da94973 100644
--- a/safekeeper/src/wal_storage.rs
+++ b/safekeeper/src/wal_storage.rs
@@ -539,20 +539,17 @@ async fn remove_segments_from_disk(
     while let Some(entry) = entries.next_entry().await? {
         let entry_path = entry.path();
         let fname = entry_path.file_name().unwrap();
-
-        if let Some(fname_str) = fname.to_str() {
-            /* Ignore files that are not XLOG segments */
-            if !IsXLogFileName(fname_str) && !IsPartialXLogFileName(fname_str) {
-                continue;
-            }
-            let (segno, _) = XLogFromFileName(fname_str, wal_seg_size);
-            if remove_predicate(segno) {
-                remove_file(entry_path).await?;
-                n_removed += 1;
-                min_removed = min(min_removed, segno);
-                max_removed = max(max_removed, segno);
-                REMOVED_WAL_SEGMENTS.inc();
-            }
+        /* Ignore files that are not XLOG segments */
+        if !IsXLogFileName(fname) && !IsPartialXLogFileName(fname) {
+            continue;
+        }
+        let (segno, _) = XLogFromFileName(fname, wal_seg_size)?;
+        if remove_predicate(segno) {
+            remove_file(entry_path).await?;
+            n_removed += 1;
+            min_removed = min(min_removed, segno);
+            max_removed = max(max_removed, segno);
+            REMOVED_WAL_SEGMENTS.inc();
         }
     }
 
diff --git a/test_runner/regress/test_wal_acceptor.py b/test_runner/regress/test_wal_acceptor.py
index 4bf8cfe88f..8ee548bdb0 100644
--- a/test_runner/regress/test_wal_acceptor.py
+++ b/test_runner/regress/test_wal_acceptor.py
@@ -892,6 +892,7 @@ def test_timeline_status(neon_env_builder: NeonEnvBuilder, auth_enabled: bool):
     log.info(f"debug_dump before reboot {debug_dump_0}")
     assert debug_dump_0["timelines_count"] == 1
     assert debug_dump_0["timelines"][0]["timeline_id"] == str(timeline_id)
+    assert debug_dump_0["timelines"][0]["wal_last_modified"] != ""
 
     endpoint.safe_psql("create table t(i int)")
 

From 21eeafaaa58326bea8411b5b28db58fa0755e47e Mon Sep 17 00:00:00 2001
From: Vlad Lazar <vlad@neon.tech>
Date: Thu, 19 Sep 2024 14:51:00 +0100
Subject: [PATCH 37/49] pageserver: simple fix for vectored read image layer
 skip (#9026)

## Problem

Different keyspaces may require different floor LSNs in vectored
delta layer visits. This patch adds support for such cases.

## Summary of changes

Different keyspaces wishing to read the same layer might
require different stop lsns (or lsn floor). The start LSN
of the read (or the lsn ceil) will always be the same.

With this observation, we fix skipping of image layers by
indexing the fringe by layer id plus lsn floor.

This is very simple, but means that we can visit delta layers twice
in certain cases. Still, I think it's very unlikely for any extra
merging to have taken place in this case, so perhaps it makes sense to go
with the simpler patch.

Fixes https://github.com/neondatabase/neon/issues/9012
Alternative to https://github.com/neondatabase/neon/pull/9025
---
 pageserver/src/tenant.rs               | 146 ++++++++++++++++++++++++-
 pageserver/src/tenant/storage_layer.rs |  52 +++++----
 2 files changed, 176 insertions(+), 22 deletions(-)

diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index c6f0e48101..14cb6f508d 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -4164,9 +4164,18 @@ pub(crate) mod harness {
             let records_neon = records.iter().all(|r| apply_neon::can_apply_in_neon(&r.1));
             if records_neon {
                 // For Neon wal records, we can decode without spawning postgres, so do so.
-                let base_img = base_img.expect("Neon WAL redo requires base image").1;
-                let mut page = BytesMut::new();
-                page.extend_from_slice(&base_img);
+                let mut page = match (base_img, records.first()) {
+                    (Some((_lsn, img)), _) => {
+                        let mut page = BytesMut::new();
+                        page.extend_from_slice(&img);
+                        page
+                    }
+                    (_, Some((_lsn, rec))) if rec.will_init() => BytesMut::new(),
+                    _ => {
+                        panic!("Neon WAL redo requires base image or will init record");
+                    }
+                };
+
                 for (record_lsn, record) in records {
                     apply_neon::apply_in_neon(&record, record_lsn, key, &mut page)?;
                 }
@@ -8470,4 +8479,135 @@ mod tests {
 
         Ok(())
     }
+
+    // Regression test for https://github.com/neondatabase/neon/issues/9012
+    // Create an image arrangement where we have to read at different LSN ranges
+    // from a delta layer. This is achieved by overlapping an image layer on top of
+    // a delta layer. Like so:
+    //
+    //     A      B
+    // +----------------+ -> delta_layer
+    // |                |                           ^ lsn
+    // |       =========|-> nested_image_layer      |
+    // |       C        |                           |
+    // +----------------+                           |
+    // ======== -> baseline_image_layer             +-------> key
+    //
+    //
+    // When querying the key range [A, B) we need to read at different LSN ranges
+    // for [A, C) and [C, B). This test checks that the described edge case is handled correctly.
+    #[tokio::test]
+    async fn test_vectored_read_with_nested_image_layer() -> anyhow::Result<()> {
+        let harness = TenantHarness::create("test_vectored_read_with_nested_image_layer").await?;
+        let (tenant, ctx) = harness.load().await;
+
+        let will_init_keys = [2, 6];
+        fn get_key(id: u32) -> Key {
+            let mut key = Key::from_hex("110000000033333333444444445500000000").unwrap();
+            key.field6 = id;
+            key
+        }
+
+        let mut expected_key_values = HashMap::new();
+
+        let baseline_image_layer_lsn = Lsn(0x10);
+        let mut baseline_img_layer = Vec::new();
+        for i in 0..5 {
+            let key = get_key(i);
+            let value = format!("value {i}@{baseline_image_layer_lsn}");
+
+            let removed = expected_key_values.insert(key, value.clone());
+            assert!(removed.is_none());
+
+            baseline_img_layer.push((key, Bytes::from(value)));
+        }
+
+        let nested_image_layer_lsn = Lsn(0x50);
+        let mut nested_img_layer = Vec::new();
+        for i in 5..10 {
+            let key = get_key(i);
+            let value = format!("value {i}@{nested_image_layer_lsn}");
+
+            let removed = expected_key_values.insert(key, value.clone());
+            assert!(removed.is_none());
+
+            nested_img_layer.push((key, Bytes::from(value)));
+        }
+
+        let mut delta_layer_spec = Vec::default();
+        let delta_layer_start_lsn = Lsn(0x20);
+        let mut delta_layer_end_lsn = delta_layer_start_lsn;
+
+        for i in 0..10 {
+            let key = get_key(i);
+            let key_in_nested = nested_img_layer
+                .iter()
+                .any(|(key_with_img, _)| *key_with_img == key);
+            let lsn = {
+                if key_in_nested {
+                    Lsn(nested_image_layer_lsn.0 + 0x10)
+                } else {
+                    delta_layer_start_lsn
+                }
+            };
+
+            let will_init = will_init_keys.contains(&i);
+            if will_init {
+                delta_layer_spec.push((key, lsn, Value::WalRecord(NeonWalRecord::wal_init())));
+
+                expected_key_values.insert(key, "".to_string());
+            } else {
+                let delta = format!("@{lsn}");
+                delta_layer_spec.push((
+                    key,
+                    lsn,
+                    Value::WalRecord(NeonWalRecord::wal_append(&delta)),
+                ));
+
+                expected_key_values
+                    .get_mut(&key)
+                    .expect("An image exists for each key")
+                    .push_str(delta.as_str());
+            }
+            delta_layer_end_lsn = std::cmp::max(delta_layer_start_lsn, lsn);
+        }
+
+        delta_layer_end_lsn = Lsn(delta_layer_end_lsn.0 + 1);
+
+        assert!(
+            nested_image_layer_lsn > delta_layer_start_lsn
+                && nested_image_layer_lsn < delta_layer_end_lsn
+        );
+
+        let tline = tenant
+            .create_test_timeline_with_layers(
+                TIMELINE_ID,
+                baseline_image_layer_lsn,
+                DEFAULT_PG_VERSION,
+                &ctx,
+                vec![DeltaLayerTestDesc::new_with_inferred_key_range(
+                    delta_layer_start_lsn..delta_layer_end_lsn,
+                    delta_layer_spec,
+                )], // delta layers
+                vec![
+                    (baseline_image_layer_lsn, baseline_img_layer),
+                    (nested_image_layer_lsn, nested_img_layer),
+                ], // image layers
+                delta_layer_end_lsn,
+            )
+            .await?;
+
+        let keyspace = KeySpace::single(get_key(0)..get_key(10));
+        let results = tline
+            .get_vectored(keyspace, delta_layer_end_lsn, &ctx)
+            .await
+            .expect("No vectored errors");
+        for (key, res) in results {
+            let value = res.expect("No key errors");
+            let expected_value = expected_key_values.remove(&key).expect("No unknown keys");
+            assert_eq!(value, Bytes::from(expected_value));
+        }
+
+        Ok(())
+    }
 }
diff --git a/pageserver/src/tenant/storage_layer.rs b/pageserver/src/tenant/storage_layer.rs
index dac6b2f893..cd252aa371 100644
--- a/pageserver/src/tenant/storage_layer.rs
+++ b/pageserver/src/tenant/storage_layer.rs
@@ -276,6 +276,16 @@ pub(crate) enum LayerId {
     InMemoryLayerId(InMemoryLayerFileId),
 }
 
+/// Uniquely identify a layer visit by the layer
+/// and LSN floor (or start LSN) of the reads.
+/// The layer itself is not enough since we may
+/// have different LSN lower bounds for delta layer reads.
+#[derive(Debug, PartialEq, Eq, Clone, Hash)]
+struct LayerToVisitId {
+    layer_id: LayerId,
+    lsn_floor: Lsn,
+}
+
 /// Layer wrapper for the read path. Note that it is valid
 /// to use these layers even after external operations have
 /// been performed on them (compaction, freeze, etc.).
@@ -287,9 +297,9 @@ pub(crate) enum ReadableLayer {
 
 /// A partial description of a read to be done.
 #[derive(Debug, Clone)]
-struct ReadDesc {
+struct LayerVisit {
     /// An id used to resolve the readable layer within the fringe
-    layer_id: LayerId,
+    layer_to_visit_id: LayerToVisitId,
     /// Lsn range for the read, used for selecting the next read
     lsn_range: Range<Lsn>,
 }
@@ -303,12 +313,12 @@ struct ReadDesc {
 /// a two layer indexing scheme.
 #[derive(Debug)]
 pub(crate) struct LayerFringe {
-    planned_reads_by_lsn: BinaryHeap<ReadDesc>,
-    layers: HashMap<LayerId, LayerKeyspace>,
+    planned_visits_by_lsn: BinaryHeap<LayerVisit>,
+    visit_reads: HashMap<LayerToVisitId, LayerVisitReads>,
 }
 
 #[derive(Debug)]
-struct LayerKeyspace {
+struct LayerVisitReads {
     layer: ReadableLayer,
     target_keyspace: KeySpaceRandomAccum,
 }
@@ -316,23 +326,23 @@ struct LayerKeyspace {
 impl LayerFringe {
     pub(crate) fn new() -> Self {
         LayerFringe {
-            planned_reads_by_lsn: BinaryHeap::new(),
-            layers: HashMap::new(),
+            planned_visits_by_lsn: BinaryHeap::new(),
+            visit_reads: HashMap::new(),
         }
     }
 
     pub(crate) fn next_layer(&mut self) -> Option<(ReadableLayer, KeySpace, Range<Lsn>)> {
-        let read_desc = match self.planned_reads_by_lsn.pop() {
+        let read_desc = match self.planned_visits_by_lsn.pop() {
             Some(desc) => desc,
             None => return None,
         };
 
-        let removed = self.layers.remove_entry(&read_desc.layer_id);
+        let removed = self.visit_reads.remove_entry(&read_desc.layer_to_visit_id);
 
         match removed {
             Some((
                 _,
-                LayerKeyspace {
+                LayerVisitReads {
                     layer,
                     mut target_keyspace,
                 },
@@ -351,20 +361,24 @@ impl LayerFringe {
         keyspace: KeySpace,
         lsn_range: Range<Lsn>,
     ) {
-        let layer_id = layer.id();
-        let entry = self.layers.entry(layer_id.clone());
+        let layer_to_visit_id = LayerToVisitId {
+            layer_id: layer.id(),
+            lsn_floor: lsn_range.start,
+        };
+
+        let entry = self.visit_reads.entry(layer_to_visit_id.clone());
         match entry {
             Entry::Occupied(mut entry) => {
                 entry.get_mut().target_keyspace.add_keyspace(keyspace);
             }
             Entry::Vacant(entry) => {
-                self.planned_reads_by_lsn.push(ReadDesc {
+                self.planned_visits_by_lsn.push(LayerVisit {
                     lsn_range,
-                    layer_id: layer_id.clone(),
+                    layer_to_visit_id: layer_to_visit_id.clone(),
                 });
                 let mut accum = KeySpaceRandomAccum::new();
                 accum.add_keyspace(keyspace);
-                entry.insert(LayerKeyspace {
+                entry.insert(LayerVisitReads {
                     layer,
                     target_keyspace: accum,
                 });
@@ -379,7 +393,7 @@ impl Default for LayerFringe {
     }
 }
 
-impl Ord for ReadDesc {
+impl Ord for LayerVisit {
     fn cmp(&self, other: &Self) -> Ordering {
         let ord = self.lsn_range.end.cmp(&other.lsn_range.end);
         if ord == std::cmp::Ordering::Equal {
@@ -390,19 +404,19 @@ impl Ord for ReadDesc {
     }
 }
 
-impl PartialOrd for ReadDesc {
+impl PartialOrd for LayerVisit {
     fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
         Some(self.cmp(other))
     }
 }
 
-impl PartialEq for ReadDesc {
+impl PartialEq for LayerVisit {
     fn eq(&self, other: &Self) -> bool {
         self.lsn_range == other.lsn_range
     }
 }
 
-impl Eq for ReadDesc {}
+impl Eq for LayerVisit {}
 
 impl ReadableLayer {
     pub(crate) fn id(&self) -> LayerId {

From ff9f065c4386496193d62f1ff1fadd28cce92910 Mon Sep 17 00:00:00 2001
From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com>
Date: Thu, 19 Sep 2024 10:43:12 -0400
Subject: [PATCH 38/49] impr(pageserver): log image layer creation (#9050)

https://github.com/neondatabase/neon/pull/9028 changed the image layer
creation log into trace level. However, I personally find logging image
layer creation useful when reading the logs -- it makes it clear that
the image layer creation is happening and gives a clear idea of the
progress. Therefore, I propose to continue logging them for
create_image_layers set of functions.

## Summary of changes

* Add info logging for all image layers created in legacy compaction.
* Add info logging for all layers creation in testing functions.

Signed-off-by: Alex Chi Z <chi@neon.tech>
---
 pageserver/src/tenant/timeline.rs | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index f66491d962..a06cea2c66 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -4015,6 +4015,7 @@ impl Timeline {
             // partition, so flush it to disk.
             let (desc, path) = image_layer_writer.finish(ctx).await?;
             let image_layer = Layer::finish_creating(self.conf, self, desc, &path)?;
+            info!("created image layer for rel {}", image_layer.local_path());
             Ok(ImageLayerCreationOutcome {
                 image: Some(image_layer),
                 next_start_key: img_range.end,
@@ -4104,6 +4105,10 @@ impl Timeline {
             // partition, so flush it to disk.
             let (desc, path) = image_layer_writer.finish(ctx).await?;
             let image_layer = Layer::finish_creating(self.conf, self, desc, &path)?;
+            info!(
+                "created image layer for metadata {}",
+                image_layer.local_path()
+            );
             Ok(ImageLayerCreationOutcome {
                 image: Some(image_layer),
                 next_start_key: img_range.end,
@@ -5407,7 +5412,7 @@ impl Timeline {
         }
         let (desc, path) = image_layer_writer.finish(ctx).await?;
         let image_layer = Layer::finish_creating(self.conf, self, desc, &path)?;
-
+        info!("force created image layer {}", image_layer.local_path());
         {
             let mut guard = self.layers.write().await;
             guard.open_mut().unwrap().force_insert_layer(image_layer);
@@ -5486,7 +5491,7 @@ impl Timeline {
         }
         let (desc, path) = delta_layer_writer.finish(deltas.key_range.end, ctx).await?;
         let delta_layer = Layer::finish_creating(self.conf, self, desc, &path)?;
-
+        info!("force created delta layer {}", delta_layer.local_path());
         {
             let mut guard = self.layers.write().await;
             guard.open_mut().unwrap().force_insert_layer(delta_layer);

From 0a1ca7670cbaddb2e83b4e41142b8a7f5fcf0aef Mon Sep 17 00:00:00 2001
From: Conrad Ludgate <conrad@neon.tech>
Date: Thu, 19 Sep 2024 16:09:30 +0100
Subject: [PATCH 39/49] proxy: remove auth info from http conn info & fixup jwt
 api trait (#9047)

misc changes split out from #8855

- **allow cloning the request context in a read-only fashion for
background tasks**
- **propagate endpoint and request context through the jwk cache**
- **only allow password based auth for md5 during testing**
- **remove auth info from conn info**
---
 proxy/src/auth/backend.rs             | 15 ++++------
 proxy/src/auth/backend/hacks.rs       | 14 ++++-----
 proxy/src/auth/backend/jwt.rs         | 43 ++++++++++++++++++++++-----
 proxy/src/auth/backend/local.rs       | 10 +++++--
 proxy/src/console/provider.rs         |  1 +
 proxy/src/context.rs                  | 34 +++++++++++++++++++++
 proxy/src/metrics.rs                  | 20 +++++++++++++
 proxy/src/serverless/backend.rs       |  9 +-----
 proxy/src/serverless/conn_pool.rs     |  9 ++++--
 proxy/src/serverless/sql_over_http.rs | 21 ++++++-------
 test_runner/fixtures/neon_fixtures.py |  3 --
 11 files changed, 127 insertions(+), 52 deletions(-)

diff --git a/proxy/src/auth/backend.rs b/proxy/src/auth/backend.rs
index 5561c9c56d..5bc2f2ff65 100644
--- a/proxy/src/auth/backend.rs
+++ b/proxy/src/auth/backend.rs
@@ -163,6 +163,7 @@ impl ComputeUserInfo {
 }
 
 pub(crate) enum ComputeCredentialKeys {
+    #[cfg(any(test, feature = "testing"))]
     Password(Vec<u8>),
     AuthKeys(AuthKeys),
     None,
@@ -293,16 +294,10 @@ async fn auth_quirks(
     // We now expect to see a very specific payload in the place of password.
     let (info, unauthenticated_password) = match user_info.try_into() {
         Err(info) => {
-            let res = hacks::password_hack_no_authentication(ctx, info, client).await?;
-
-            ctx.set_endpoint_id(res.info.endpoint.clone());
-            let password = match res.keys {
-                ComputeCredentialKeys::Password(p) => p,
-                ComputeCredentialKeys::AuthKeys(_) | ComputeCredentialKeys::None => {
-                    unreachable!("password hack should return a password")
-                }
-            };
-            (res.info, Some(password))
+            let (info, password) =
+                hacks::password_hack_no_authentication(ctx, info, client).await?;
+            ctx.set_endpoint_id(info.endpoint.clone());
+            (info, Some(password))
         }
         Ok(info) => (info, None),
     };
diff --git a/proxy/src/auth/backend/hacks.rs b/proxy/src/auth/backend/hacks.rs
index e9019ce2cf..15123a2623 100644
--- a/proxy/src/auth/backend/hacks.rs
+++ b/proxy/src/auth/backend/hacks.rs
@@ -1,6 +1,4 @@
-use super::{
-    ComputeCredentialKeys, ComputeCredentials, ComputeUserInfo, ComputeUserInfoNoEndpoint,
-};
+use super::{ComputeCredentials, ComputeUserInfo, ComputeUserInfoNoEndpoint};
 use crate::{
     auth::{self, AuthFlow},
     config::AuthenticationConfig,
@@ -63,7 +61,7 @@ pub(crate) async fn password_hack_no_authentication(
     ctx: &RequestMonitoring,
     info: ComputeUserInfoNoEndpoint,
     client: &mut stream::PqStream<Stream<impl AsyncRead + AsyncWrite + Unpin>>,
-) -> auth::Result<ComputeCredentials> {
+) -> auth::Result<(ComputeUserInfo, Vec<u8>)> {
     warn!("project not specified, resorting to the password hack auth flow");
     ctx.set_auth_method(crate::context::AuthMethod::Cleartext);
 
@@ -79,12 +77,12 @@ pub(crate) async fn password_hack_no_authentication(
     info!(project = &*payload.endpoint, "received missing parameter");
 
     // Report tentative success; compute node will check the password anyway.
-    Ok(ComputeCredentials {
-        info: ComputeUserInfo {
+    Ok((
+        ComputeUserInfo {
             user: info.user,
             options: info.options,
             endpoint: payload.endpoint,
         },
-        keys: ComputeCredentialKeys::Password(payload.password),
-    })
+        payload.password,
+    ))
 }
diff --git a/proxy/src/auth/backend/jwt.rs b/proxy/src/auth/backend/jwt.rs
index 1f44e4af5d..94e5999a5f 100644
--- a/proxy/src/auth/backend/jwt.rs
+++ b/proxy/src/auth/backend/jwt.rs
@@ -25,6 +25,8 @@ const MAX_JWK_BODY_SIZE: usize = 64 * 1024;
 pub(crate) trait FetchAuthRules: Clone + Send + Sync + 'static {
     fn fetch_auth_rules(
         &self,
+        ctx: &RequestMonitoring,
+        endpoint: EndpointId,
         role_name: RoleName,
     ) -> impl Future<Output = anyhow::Result<Vec<AuthRule>>> + Send;
 }
@@ -101,7 +103,9 @@ impl JwkCacheEntryLock {
     async fn renew_jwks<F: FetchAuthRules>(
         &self,
         _permit: JwkRenewalPermit<'_>,
+        ctx: &RequestMonitoring,
         client: &reqwest::Client,
+        endpoint: EndpointId,
         role_name: RoleName,
         auth_rules: &F,
     ) -> anyhow::Result<Arc<JwkCacheEntry>> {
@@ -115,7 +119,9 @@ impl JwkCacheEntryLock {
             }
         }
 
-        let rules = auth_rules.fetch_auth_rules(role_name).await?;
+        let rules = auth_rules
+            .fetch_auth_rules(ctx, endpoint, role_name)
+            .await?;
         let mut key_sets =
             ahash::HashMap::with_capacity_and_hasher(rules.len(), ahash::RandomState::new());
         // TODO(conrad): run concurrently
@@ -166,6 +172,7 @@ impl JwkCacheEntryLock {
         self: &Arc<Self>,
         ctx: &RequestMonitoring,
         client: &reqwest::Client,
+        endpoint: EndpointId,
         role_name: RoleName,
         fetch: &F,
     ) -> Result<Arc<JwkCacheEntry>, anyhow::Error> {
@@ -176,7 +183,9 @@ impl JwkCacheEntryLock {
         let Some(cached) = guard else {
             let _paused = ctx.latency_timer_pause(crate::metrics::Waiting::Compute);
             let permit = self.acquire_permit().await;
-            return self.renew_jwks(permit, client, role_name, fetch).await;
+            return self
+                .renew_jwks(permit, ctx, client, endpoint, role_name, fetch)
+                .await;
         };
 
         let last_update = now.duration_since(cached.last_retrieved);
@@ -187,7 +196,9 @@ impl JwkCacheEntryLock {
             let permit = self.acquire_permit().await;
 
             // it's been too long since we checked the keys. wait for them to update.
-            return self.renew_jwks(permit, client, role_name, fetch).await;
+            return self
+                .renew_jwks(permit, ctx, client, endpoint, role_name, fetch)
+                .await;
         }
 
         // every 5 minutes we should spawn a job to eagerly update the token.
@@ -198,8 +209,12 @@ impl JwkCacheEntryLock {
                 let entry = self.clone();
                 let client = client.clone();
                 let fetch = fetch.clone();
+                let ctx = ctx.clone();
                 tokio::spawn(async move {
-                    if let Err(e) = entry.renew_jwks(permit, &client, role_name, &fetch).await {
+                    if let Err(e) = entry
+                        .renew_jwks(permit, &ctx, &client, endpoint, role_name, &fetch)
+                        .await
+                    {
                         tracing::warn!(error=?e, "could not fetch JWKs in background job");
                     }
                 });
@@ -216,6 +231,7 @@ impl JwkCacheEntryLock {
         ctx: &RequestMonitoring,
         jwt: &str,
         client: &reqwest::Client,
+        endpoint: EndpointId,
         role_name: RoleName,
         fetch: &F,
     ) -> Result<(), anyhow::Error> {
@@ -242,7 +258,7 @@ impl JwkCacheEntryLock {
         let kid = header.key_id.context("missing key id")?;
 
         let mut guard = self
-            .get_or_update_jwk_cache(ctx, client, role_name.clone(), fetch)
+            .get_or_update_jwk_cache(ctx, client, endpoint.clone(), role_name.clone(), fetch)
             .await?;
 
         // get the key from the JWKs if possible. If not, wait for the keys to update.
@@ -254,7 +270,14 @@ impl JwkCacheEntryLock {
 
                     let permit = self.acquire_permit().await;
                     guard = self
-                        .renew_jwks(permit, client, role_name.clone(), fetch)
+                        .renew_jwks(
+                            permit,
+                            ctx,
+                            client,
+                            endpoint.clone(),
+                            role_name.clone(),
+                            fetch,
+                        )
                         .await?;
                 }
                 _ => {
@@ -318,7 +341,7 @@ impl JwkCache {
         jwt: &str,
     ) -> Result<(), anyhow::Error> {
         // try with just a read lock first
-        let key = (endpoint, role_name.clone());
+        let key = (endpoint.clone(), role_name.clone());
         let entry = self.map.get(&key).as_deref().map(Arc::clone);
         let entry = entry.unwrap_or_else(|| {
             // acquire a write lock after to insert.
@@ -327,7 +350,7 @@ impl JwkCache {
         });
 
         entry
-            .check_jwt(ctx, jwt, &self.client, role_name, fetch)
+            .check_jwt(ctx, jwt, &self.client, endpoint, role_name, fetch)
             .await
     }
 }
@@ -688,6 +711,8 @@ X0n5X2/pBLJzxZc62ccvZYVnctBiFs6HbSnxpuMQCfkt/BcR/ttIepBQQIW86wHL
         impl FetchAuthRules for Fetch {
             async fn fetch_auth_rules(
                 &self,
+                _ctx: &RequestMonitoring,
+                _endpoint: EndpointId,
                 _role_name: RoleName,
             ) -> anyhow::Result<Vec<AuthRule>> {
                 Ok(vec![
@@ -706,6 +731,7 @@ X0n5X2/pBLJzxZc62ccvZYVnctBiFs6HbSnxpuMQCfkt/BcR/ttIepBQQIW86wHL
         }
 
         let role_name = RoleName::from("user");
+        let endpoint = EndpointId::from("ep");
 
         let jwk_cache = Arc::new(JwkCacheEntryLock::default());
 
@@ -715,6 +741,7 @@ X0n5X2/pBLJzxZc62ccvZYVnctBiFs6HbSnxpuMQCfkt/BcR/ttIepBQQIW86wHL
                     &RequestMonitoring::test(),
                     &token,
                     &client,
+                    endpoint.clone(),
                     role_name.clone(),
                     &Fetch(addr),
                 )
diff --git a/proxy/src/auth/backend/local.rs b/proxy/src/auth/backend/local.rs
index 8124f568cf..2ff2ca00f0 100644
--- a/proxy/src/auth/backend/local.rs
+++ b/proxy/src/auth/backend/local.rs
@@ -9,8 +9,9 @@ use crate::{
         messages::{ColdStartInfo, EndpointJwksResponse, MetricsAuxInfo},
         NodeInfo,
     },
+    context::RequestMonitoring,
     intern::{BranchIdInt, BranchIdTag, EndpointIdTag, InternId, ProjectIdInt, ProjectIdTag},
-    RoleName,
+    EndpointId, RoleName,
 };
 
 use super::jwt::{AuthRule, FetchAuthRules, JwkCache};
@@ -57,7 +58,12 @@ pub struct JwksRoleSettings {
 }
 
 impl FetchAuthRules for StaticAuthRules {
-    async fn fetch_auth_rules(&self, role_name: RoleName) -> anyhow::Result<Vec<AuthRule>> {
+    async fn fetch_auth_rules(
+        &self,
+        _ctx: &RequestMonitoring,
+        _endpoint: EndpointId,
+        role_name: RoleName,
+    ) -> anyhow::Result<Vec<AuthRule>> {
         let mappings = JWKS_ROLE_MAP.load();
         let role_mappings = mappings
             .as_deref()
diff --git a/proxy/src/console/provider.rs b/proxy/src/console/provider.rs
index 12a6e2f12a..16e8da605b 100644
--- a/proxy/src/console/provider.rs
+++ b/proxy/src/console/provider.rs
@@ -303,6 +303,7 @@ impl NodeInfo {
 
     pub(crate) fn set_keys(&mut self, keys: &ComputeCredentialKeys) {
         match keys {
+            #[cfg(any(test, feature = "testing"))]
             ComputeCredentialKeys::Password(password) => self.config.password(password),
             ComputeCredentialKeys::AuthKeys(auth_keys) => self.config.auth_keys(*auth_keys),
             ComputeCredentialKeys::None => &mut self.config,
diff --git a/proxy/src/context.rs b/proxy/src/context.rs
index c013218ad9..021659e175 100644
--- a/proxy/src/context.rs
+++ b/proxy/src/context.rs
@@ -79,6 +79,40 @@ pub(crate) enum AuthMethod {
     Cleartext,
 }
 
+impl Clone for RequestMonitoring {
+    fn clone(&self) -> Self {
+        let inner = self.0.try_lock().expect("should not deadlock");
+        let new = RequestMonitoringInner {
+            peer_addr: inner.peer_addr,
+            session_id: inner.session_id,
+            protocol: inner.protocol,
+            first_packet: inner.first_packet,
+            region: inner.region,
+            span: info_span!("background_task"),
+
+            project: inner.project,
+            branch: inner.branch,
+            endpoint_id: inner.endpoint_id.clone(),
+            dbname: inner.dbname.clone(),
+            user: inner.user.clone(),
+            application: inner.application.clone(),
+            error_kind: inner.error_kind,
+            auth_method: inner.auth_method.clone(),
+            success: inner.success,
+            rejected: inner.rejected,
+            cold_start_info: inner.cold_start_info,
+            pg_options: inner.pg_options.clone(),
+
+            sender: None,
+            disconnect_sender: None,
+            latency_timer: LatencyTimer::noop(inner.protocol),
+            disconnect_timestamp: inner.disconnect_timestamp,
+        };
+
+        Self(TryLock::new(new))
+    }
+}
+
 impl RequestMonitoring {
     pub fn new(
         session_id: Uuid,
diff --git a/proxy/src/metrics.rs b/proxy/src/metrics.rs
index 2da7eac580..c2567e083a 100644
--- a/proxy/src/metrics.rs
+++ b/proxy/src/metrics.rs
@@ -397,6 +397,8 @@ pub struct LatencyTimer {
     protocol: Protocol,
     cold_start_info: ColdStartInfo,
     outcome: ConnectOutcome,
+
+    skip_reporting: bool,
 }
 
 impl LatencyTimer {
@@ -409,6 +411,20 @@ impl LatencyTimer {
             cold_start_info: ColdStartInfo::Unknown,
             // assume failed unless otherwise specified
             outcome: ConnectOutcome::Failed,
+            skip_reporting: false,
+        }
+    }
+
+    pub(crate) fn noop(protocol: Protocol) -> Self {
+        Self {
+            start: time::Instant::now(),
+            stop: None,
+            accumulated: Accumulated::default(),
+            protocol,
+            cold_start_info: ColdStartInfo::Unknown,
+            // assume failed unless otherwise specified
+            outcome: ConnectOutcome::Failed,
+            skip_reporting: true,
         }
     }
 
@@ -443,6 +459,10 @@ pub enum ConnectOutcome {
 
 impl Drop for LatencyTimer {
     fn drop(&mut self) {
+        if self.skip_reporting {
+            return;
+        }
+
         let duration = self
             .stop
             .unwrap_or_else(time::Instant::now)
diff --git a/proxy/src/serverless/backend.rs b/proxy/src/serverless/backend.rs
index d163878528..aa236907db 100644
--- a/proxy/src/serverless/backend.rs
+++ b/proxy/src/serverless/backend.rs
@@ -27,7 +27,7 @@ use crate::{
     Host,
 };
 
-use super::conn_pool::{poll_client, AuthData, Client, ConnInfo, GlobalConnPool};
+use super::conn_pool::{poll_client, Client, ConnInfo, GlobalConnPool};
 
 pub(crate) struct PoolingBackend {
     pub(crate) pool: Arc<GlobalConnPool<tokio_postgres::Client>>,
@@ -274,13 +274,6 @@ impl ConnectMechanism for TokioMechanism {
             .dbname(&self.conn_info.dbname)
             .connect_timeout(timeout);
 
-        match &self.conn_info.auth {
-            AuthData::Jwt(_) => {}
-            AuthData::Password(pw) => {
-                config.password(pw);
-            }
-        }
-
         let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Compute);
         let res = config.connect(tokio_postgres::NoTls).await;
         drop(pause);
diff --git a/proxy/src/serverless/conn_pool.rs b/proxy/src/serverless/conn_pool.rs
index 6c32d5df0e..a850ecd2be 100644
--- a/proxy/src/serverless/conn_pool.rs
+++ b/proxy/src/serverless/conn_pool.rs
@@ -29,11 +29,16 @@ use tracing::{info, info_span, Instrument};
 
 use super::backend::HttpConnError;
 
+#[derive(Debug, Clone)]
+pub(crate) struct ConnInfoWithAuth {
+    pub(crate) conn_info: ConnInfo,
+    pub(crate) auth: AuthData,
+}
+
 #[derive(Debug, Clone)]
 pub(crate) struct ConnInfo {
     pub(crate) user_info: ComputeUserInfo,
     pub(crate) dbname: DbName,
-    pub(crate) auth: AuthData,
 }
 
 #[derive(Debug, Clone)]
@@ -787,7 +792,6 @@ mod tests {
                 options: NeonOptions::default(),
             },
             dbname: "dbname".into(),
-            auth: AuthData::Password("password".as_bytes().into()),
         };
         let ep_pool = Arc::downgrade(
             &pool.get_or_create_endpoint_pool(&conn_info.endpoint_cache_key().unwrap()),
@@ -845,7 +849,6 @@ mod tests {
                 options: NeonOptions::default(),
             },
             dbname: "dbname".into(),
-            auth: AuthData::Password("password".as_bytes().into()),
         };
         let ep_pool = Arc::downgrade(
             &pool.get_or_create_endpoint_pool(&conn_info.endpoint_cache_key().unwrap()),
diff --git a/proxy/src/serverless/sql_over_http.rs b/proxy/src/serverless/sql_over_http.rs
index 06e540d149..7c78439a0a 100644
--- a/proxy/src/serverless/sql_over_http.rs
+++ b/proxy/src/serverless/sql_over_http.rs
@@ -60,6 +60,7 @@ use super::backend::PoolingBackend;
 use super::conn_pool::AuthData;
 use super::conn_pool::Client;
 use super::conn_pool::ConnInfo;
+use super::conn_pool::ConnInfoWithAuth;
 use super::http_util::json_response;
 use super::json::json_to_pg_text;
 use super::json::pg_text_row_to_json;
@@ -148,7 +149,7 @@ fn get_conn_info(
     ctx: &RequestMonitoring,
     headers: &HeaderMap,
     tls: Option<&TlsConfig>,
-) -> Result<ConnInfo, ConnInfoError> {
+) -> Result<ConnInfoWithAuth, ConnInfoError> {
     // HTTP only uses cleartext (for now and likely always)
     ctx.set_auth_method(crate::context::AuthMethod::Cleartext);
 
@@ -235,11 +236,8 @@ fn get_conn_info(
         options: options.unwrap_or_default(),
     };
 
-    Ok(ConnInfo {
-        user_info,
-        dbname,
-        auth,
-    })
+    let conn_info = ConnInfo { user_info, dbname };
+    Ok(ConnInfoWithAuth { conn_info, auth })
 }
 
 // TODO: return different http error codes
@@ -523,7 +521,10 @@ async fn handle_inner(
 
     // TLS config should be there.
     let conn_info = get_conn_info(ctx, headers, config.tls_config.as_ref())?;
-    info!(user = conn_info.user_info.user.as_str(), "credentials");
+    info!(
+        user = conn_info.conn_info.user_info.user.as_str(),
+        "credentials"
+    );
 
     // Allow connection pooling only if explicitly requested
     // or if we have decided that http pool is no longer opt-in
@@ -568,20 +569,20 @@ async fn handle_inner(
                         .authenticate_with_password(
                             ctx,
                             &config.authentication_config,
-                            &conn_info.user_info,
+                            &conn_info.conn_info.user_info,
                             pw,
                         )
                         .await?
                 }
                 AuthData::Jwt(jwt) => {
                     backend
-                        .authenticate_with_jwt(ctx, &conn_info.user_info, jwt)
+                        .authenticate_with_jwt(ctx, &conn_info.conn_info.user_info, jwt)
                         .await?
                 }
             };
 
             let client = backend
-                .connect_to_compute(ctx, conn_info, keys, !allow_pool)
+                .connect_to_compute(ctx, conn_info.conn_info, keys, !allow_pool)
                 .await?;
             // not strictly necessary to mark success here,
             // but it's just insurance for if we forget it somewhere else
diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index cbbb162cc6..fc83cf3f7c 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -3863,9 +3863,6 @@ def static_proxy(
     dbname = vanilla_pg.default_options["dbname"]
     auth_endpoint = f"postgres://proxy:password@{host}:{port}/{dbname}"
 
-    # require password for 'http_auth' user
-    vanilla_pg.edit_hba([f"host {dbname} http_auth {host} password"])
-
     # For simplicity, we use the same user for both `--auth-endpoint` and `safe_psql`
     vanilla_pg.start()
     vanilla_pg.safe_psql("create user proxy with login superuser password 'password'")

From 1708743e786cde41eb1bba51d4e5267895d8227d Mon Sep 17 00:00:00 2001
From: Yuchen Liang <70461588+yliang412@users.noreply.github.com>
Date: Thu, 19 Sep 2024 12:27:10 -0400
Subject: [PATCH 40/49] pageserver: wait for lsn lease duration after
 transition into AttachedSingle (#9024)

Part of #7497, closes https://github.com/neondatabase/neon/issues/8890.

## Problem

Since leases are in-memory objects, we need to take special care of them
after pageserver restarts and while doing a live migration. The approach
we took for pageserver restart is to wait for at least lease duration
before doing first GC. We want to do the same for live migration. Since
we do not do any GC when a tenant is in `AttachedStale` or
`AttachedMulti` mode, only the transition from `AttachedMulti` to
`AttachedSingle` requires this treatment.

## Summary of changes

- Added `lsn_lease_deadline` field in `GcBlock::reasons`: the tenant is
temporarily blocked from GC until we reach the deadline. This
information does not persist to S3.
- In `GCBlock::start`, skip the GC iteration if we are blocked by the
lsn lease deadline.
- In `TenantManager::upsert_location`, set the lsn_lease_deadline to
`Instant::now() + lsn_lease_length` so the granted leases have a chance
to be renewed before we run GC for the first time after transitioned
from AttachedMulti to AttachedSingle.

Signed-off-by: Yuchen Liang <yuchen@neon.tech>
Co-authored-by: Joonas Koivunen <joonas@neon.tech>
---
 pageserver/src/tenant/gc_block.rs             | 83 ++++++++++++++-----
 pageserver/src/tenant/mgr.rs                  |  6 ++
 pageserver/src/tenant/tasks.rs                | 18 +---
 test_runner/regress/test_branch_and_gc.py     |  1 +
 test_runner/regress/test_branch_behind.py     |  4 +-
 test_runner/regress/test_branching.py         |  2 +-
 test_runner/regress/test_compaction.py        |  1 +
 test_runner/regress/test_hot_standby.py       |  2 +-
 test_runner/regress/test_layer_eviction.py    |  1 +
 .../regress/test_pageserver_generations.py    |  1 +
 test_runner/regress/test_remote_storage.py    |  2 +
 test_runner/regress/test_sharding.py          |  1 +
 .../regress/test_storage_controller.py        |  2 +-
 test_runner/regress/test_storage_scrubber.py  |  1 +
 test_runner/regress/test_tenant_detach.py     |  4 +-
 .../regress/test_timeline_gc_blocking.py      |  5 +-
 16 files changed, 91 insertions(+), 43 deletions(-)

diff --git a/pageserver/src/tenant/gc_block.rs b/pageserver/src/tenant/gc_block.rs
index 8b41ba1746..1271d25b76 100644
--- a/pageserver/src/tenant/gc_block.rs
+++ b/pageserver/src/tenant/gc_block.rs
@@ -1,11 +1,29 @@
-use std::collections::HashMap;
-
-use utils::id::TimelineId;
+use std::{collections::HashMap, time::Duration};
 
 use super::remote_timeline_client::index::GcBlockingReason;
+use tokio::time::Instant;
+use utils::id::TimelineId;
 
-type Storage = HashMap<TimelineId, enumset::EnumSet<GcBlockingReason>>;
+type TimelinesBlocked = HashMap<TimelineId, enumset::EnumSet<GcBlockingReason>>;
 
+#[derive(Default)]
+struct Storage {
+    timelines_blocked: TimelinesBlocked,
+    /// The deadline before which we are blocked from GC so that
+    /// leases have a chance to be renewed.
+    lsn_lease_deadline: Option<Instant>,
+}
+
+impl Storage {
+    fn is_blocked_by_lsn_lease_deadline(&self) -> bool {
+        self.lsn_lease_deadline
+            .map(|d| Instant::now() < d)
+            .unwrap_or(false)
+    }
+}
+
+/// GcBlock provides persistent (per-timeline) gc blocking and facilitates transient time based gc
+/// blocking.
 #[derive(Default)]
 pub(crate) struct GcBlock {
     /// The timelines which have current reasons to block gc.
@@ -13,6 +31,12 @@ pub(crate) struct GcBlock {
     /// LOCK ORDER: this is held locked while scheduling the next index_part update. This is done
     /// to keep the this field up to date with RemoteTimelineClient `upload_queue.dirty`.
     reasons: std::sync::Mutex<Storage>,
+
+    /// GC background task or manually run `Tenant::gc_iteration` holds a lock on this.
+    ///
+    /// Do not add any more features taking and forbidding taking this lock. It should be
+    /// `tokio::sync::Notify`, but that is rarely used. On the other side, [`GcBlock::insert`]
+    /// synchronizes with gc attempts by locking and unlocking this mutex.
     blocking: tokio::sync::Mutex<()>,
 }
 
@@ -42,6 +66,20 @@ impl GcBlock {
         }
     }
 
+    /// Sets a deadline before which we cannot proceed to GC due to lsn lease.
+    ///
+    /// We do this as the leases mapping are not persisted to disk. By delaying GC by lease
+    /// length, we guarantee that all the leases we granted before will have a chance to renew
+    /// when we run GC for the first time after restart / transition from AttachedMulti to AttachedSingle.
+    pub(super) fn set_lsn_lease_deadline(&self, lsn_lease_length: Duration) {
+        let deadline = Instant::now() + lsn_lease_length;
+        let mut g = self.reasons.lock().unwrap();
+        g.lsn_lease_deadline = Some(deadline);
+    }
+
+    /// Describe the current gc blocking reasons.
+    ///
+    /// TODO: make this json serializable.
     pub(crate) fn summary(&self) -> Option<BlockingReasons> {
         let g = self.reasons.lock().unwrap();
 
@@ -64,7 +102,7 @@ impl GcBlock {
     ) -> anyhow::Result<bool> {
         let (added, uploaded) = {
             let mut g = self.reasons.lock().unwrap();
-            let set = g.entry(timeline.timeline_id).or_default();
+            let set = g.timelines_blocked.entry(timeline.timeline_id).or_default();
             let added = set.insert(reason);
 
             // LOCK ORDER: intentionally hold the lock, see self.reasons.
@@ -95,7 +133,7 @@ impl GcBlock {
 
         let (remaining_blocks, uploaded) = {
             let mut g = self.reasons.lock().unwrap();
-            match g.entry(timeline.timeline_id) {
+            match g.timelines_blocked.entry(timeline.timeline_id) {
                 Entry::Occupied(mut oe) => {
                     let set = oe.get_mut();
                     set.remove(reason);
@@ -109,7 +147,7 @@ impl GcBlock {
                 }
             }
 
-            let remaining_blocks = g.len();
+            let remaining_blocks = g.timelines_blocked.len();
 
             // LOCK ORDER: intentionally hold the lock while scheduling; see self.reasons
             let uploaded = timeline
@@ -134,11 +172,11 @@ impl GcBlock {
     pub(crate) fn before_delete(&self, timeline: &super::Timeline) {
         let unblocked = {
             let mut g = self.reasons.lock().unwrap();
-            if g.is_empty() {
+            if g.timelines_blocked.is_empty() {
                 return;
             }
 
-            g.remove(&timeline.timeline_id);
+            g.timelines_blocked.remove(&timeline.timeline_id);
 
             BlockingReasons::clean_and_summarize(g).is_none()
         };
@@ -149,10 +187,11 @@ impl GcBlock {
     }
 
     /// Initialize with the non-deleted timelines of this tenant.
-    pub(crate) fn set_scanned(&self, scanned: Storage) {
+    pub(crate) fn set_scanned(&self, scanned: TimelinesBlocked) {
         let mut g = self.reasons.lock().unwrap();
-        assert!(g.is_empty());
-        g.extend(scanned.into_iter().filter(|(_, v)| !v.is_empty()));
+        assert!(g.timelines_blocked.is_empty());
+        g.timelines_blocked
+            .extend(scanned.into_iter().filter(|(_, v)| !v.is_empty()));
 
         if let Some(reasons) = BlockingReasons::clean_and_summarize(g) {
             tracing::info!(summary=?reasons, "initialized with gc blocked");
@@ -166,6 +205,7 @@ pub(super) struct Guard<'a> {
 
 #[derive(Debug)]
 pub(crate) struct BlockingReasons {
+    tenant_blocked_by_lsn_lease_deadline: bool,
     timelines: usize,
     reasons: enumset::EnumSet<GcBlockingReason>,
 }
@@ -174,8 +214,8 @@ impl std::fmt::Display for BlockingReasons {
     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
         write!(
             f,
-            "{} timelines block for {:?}",
-            self.timelines, self.reasons
+            "tenant_blocked_by_lsn_lease_deadline: {}, {} timelines block for {:?}",
+            self.tenant_blocked_by_lsn_lease_deadline, self.timelines, self.reasons
         )
     }
 }
@@ -183,13 +223,15 @@ impl std::fmt::Display for BlockingReasons {
 impl BlockingReasons {
     fn clean_and_summarize(mut g: std::sync::MutexGuard<'_, Storage>) -> Option<Self> {
         let mut reasons = enumset::EnumSet::empty();
-        g.retain(|_key, value| {
+        g.timelines_blocked.retain(|_key, value| {
             reasons = reasons.union(*value);
             !value.is_empty()
         });
-        if !g.is_empty() {
+        let blocked_by_lsn_lease_deadline = g.is_blocked_by_lsn_lease_deadline();
+        if !g.timelines_blocked.is_empty() || blocked_by_lsn_lease_deadline {
             Some(BlockingReasons {
-                timelines: g.len(),
+                tenant_blocked_by_lsn_lease_deadline: blocked_by_lsn_lease_deadline,
+                timelines: g.timelines_blocked.len(),
                 reasons,
             })
         } else {
@@ -198,14 +240,17 @@ impl BlockingReasons {
     }
 
     fn summarize(g: &std::sync::MutexGuard<'_, Storage>) -> Option<Self> {
-        if g.is_empty() {
+        let blocked_by_lsn_lease_deadline = g.is_blocked_by_lsn_lease_deadline();
+        if g.timelines_blocked.is_empty() && !blocked_by_lsn_lease_deadline {
             None
         } else {
             let reasons = g
+                .timelines_blocked
                 .values()
                 .fold(enumset::EnumSet::empty(), |acc, next| acc.union(*next));
             Some(BlockingReasons {
-                timelines: g.len(),
+                tenant_blocked_by_lsn_lease_deadline: blocked_by_lsn_lease_deadline,
+                timelines: g.timelines_blocked.len(),
                 reasons,
             })
         }
diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs
index 2104f41531..1e7c1e10a5 100644
--- a/pageserver/src/tenant/mgr.rs
+++ b/pageserver/src/tenant/mgr.rs
@@ -949,6 +949,12 @@ impl TenantManager {
                 (LocationMode::Attached(attach_conf), Some(TenantSlot::Attached(tenant))) => {
                     match attach_conf.generation.cmp(&tenant.generation) {
                         Ordering::Equal => {
+                            if attach_conf.attach_mode == AttachmentMode::Single {
+                                tenant
+                                    .gc_block
+                                    .set_lsn_lease_deadline(tenant.get_lsn_lease_length());
+                            }
+
                             // A transition from Attached to Attached in the same generation, we may
                             // take our fast path and just provide the updated configuration
                             // to the tenant.
diff --git a/pageserver/src/tenant/tasks.rs b/pageserver/src/tenant/tasks.rs
index 478e9bb4f0..57f0123d8f 100644
--- a/pageserver/src/tenant/tasks.rs
+++ b/pageserver/src/tenant/tasks.rs
@@ -346,6 +346,7 @@ async fn gc_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
             RequestContext::todo_child(TaskKind::GarbageCollector, DownloadBehavior::Download);
 
         let mut first = true;
+        tenant.gc_block.set_lsn_lease_deadline(tenant.get_lsn_lease_length());
         loop {
             tokio::select! {
                 _ = cancel.cancelled() => {
@@ -363,7 +364,6 @@ async fn gc_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
                 first = false;
 
                 let delays = async {
-                    delay_by_lease_length(tenant.get_lsn_lease_length(), &cancel).await?;
                     random_init_delay(period, &cancel).await?;
                     Ok::<_, Cancelled>(())
                 };
@@ -538,28 +538,12 @@ pub(crate) async fn random_init_delay(
         let mut rng = rand::thread_rng();
         rng.gen_range(Duration::ZERO..=period)
     };
-
     match tokio::time::timeout(d, cancel.cancelled()).await {
         Ok(_) => Err(Cancelled),
         Err(_) => Ok(()),
     }
 }
 
-/// Delays GC by defaul lease length at restart.
-///
-/// We do this as the leases mapping are not persisted to disk. By delaying GC by default
-/// length, we gurantees that all the leases we granted before the restart will expire
-/// when we run GC for the first time after the restart.
-pub(crate) async fn delay_by_lease_length(
-    length: Duration,
-    cancel: &CancellationToken,
-) -> Result<(), Cancelled> {
-    match tokio::time::timeout(length, cancel.cancelled()).await {
-        Ok(_) => Err(Cancelled),
-        Err(_) => Ok(()),
-    }
-}
-
 struct Iteration {
     started_at: Instant,
     period: Duration,
diff --git a/test_runner/regress/test_branch_and_gc.py b/test_runner/regress/test_branch_and_gc.py
index f2e3855c12..d7c4cf059a 100644
--- a/test_runner/regress/test_branch_and_gc.py
+++ b/test_runner/regress/test_branch_and_gc.py
@@ -142,6 +142,7 @@ def test_branch_creation_before_gc(neon_simple_env: NeonEnv):
             "image_creation_threshold": "1",
             # set PITR interval to be small, so we can do GC
             "pitr_interval": "0 s",
+            "lsn_lease_length": "0s",
         }
     )
 
diff --git a/test_runner/regress/test_branch_behind.py b/test_runner/regress/test_branch_behind.py
index 0a5336f5a2..2bf7041cf1 100644
--- a/test_runner/regress/test_branch_behind.py
+++ b/test_runner/regress/test_branch_behind.py
@@ -11,7 +11,9 @@ from fixtures.utils import print_gc_result, query_scalar
 #
 def test_branch_behind(neon_env_builder: NeonEnvBuilder):
     # Disable pitr, because here we want to test branch creation after GC
-    env = neon_env_builder.init_start(initial_tenant_conf={"pitr_interval": "0 sec"})
+    env = neon_env_builder.init_start(
+        initial_tenant_conf={"pitr_interval": "0 sec", "lsn_lease_length": "0s"}
+    )
 
     error_regexes = [
         ".*invalid branch start lsn.*",
diff --git a/test_runner/regress/test_branching.py b/test_runner/regress/test_branching.py
index 1729e2fc98..3d5c34a595 100644
--- a/test_runner/regress/test_branching.py
+++ b/test_runner/regress/test_branching.py
@@ -419,7 +419,7 @@ def test_duplicate_creation(neon_env_builder: NeonEnvBuilder):
 
 
 def test_branching_while_stuck_find_gc_cutoffs(neon_env_builder: NeonEnvBuilder):
-    env = neon_env_builder.init_start()
+    env = neon_env_builder.init_start(initial_tenant_conf={"lsn_lease_length": "0s"})
 
     client = env.pageserver.http_client()
 
diff --git a/test_runner/regress/test_compaction.py b/test_runner/regress/test_compaction.py
index be787e0642..cb34551b53 100644
--- a/test_runner/regress/test_compaction.py
+++ b/test_runner/regress/test_compaction.py
@@ -240,6 +240,7 @@ def test_uploads_and_deletions(
         "image_creation_threshold": "1",
         "image_layer_creation_check_threshold": "0",
         "compaction_algorithm": json.dumps({"kind": compaction_algorithm.value}),
+        "lsn_lease_length": "0s",
     }
     env = neon_env_builder.init_start(initial_tenant_conf=tenant_conf)
 
diff --git a/test_runner/regress/test_hot_standby.py b/test_runner/regress/test_hot_standby.py
index d94704012f..35e0c0decb 100644
--- a/test_runner/regress/test_hot_standby.py
+++ b/test_runner/regress/test_hot_standby.py
@@ -222,7 +222,7 @@ def pgbench_accounts_initialized(ep):
 # Without hs feedback enabled we'd see 'User query might have needed to see row
 # versions that must be removed.' errors.
 def test_hot_standby_feedback(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
-    env = neon_env_builder.init_start()
+    env = neon_env_builder.init_start(initial_tenant_conf={"lsn_lease_length": "0s"})
     agressive_vacuum_conf = [
         "log_autovacuum_min_duration = 0",
         "autovacuum_naptime = 10s",
diff --git a/test_runner/regress/test_layer_eviction.py b/test_runner/regress/test_layer_eviction.py
index 193149ea03..97093ea535 100644
--- a/test_runner/regress/test_layer_eviction.py
+++ b/test_runner/regress/test_layer_eviction.py
@@ -173,6 +173,7 @@ def test_gc_of_remote_layers(neon_env_builder: NeonEnvBuilder):
         # "image_creation_threshold": set at runtime
         "compaction_target_size": f"{128 * (1024**2)}",  # make it so that we only have 1 partition => image coverage for delta layers => enables gc of delta layers
         "image_layer_creation_check_threshold": "0",  # always check if a new image layer can be created
+        "lsn_lease_length": "0s",
     }
 
     def tenant_update_config(changes):
diff --git a/test_runner/regress/test_pageserver_generations.py b/test_runner/regress/test_pageserver_generations.py
index c923713432..519994f774 100644
--- a/test_runner/regress/test_pageserver_generations.py
+++ b/test_runner/regress/test_pageserver_generations.py
@@ -53,6 +53,7 @@ TENANT_CONF = {
     # create image layers eagerly, so that GC can remove some layers
     "image_creation_threshold": "1",
     "image_layer_creation_check_threshold": "0",
+    "lsn_lease_length": "0s",
 }
 
 
diff --git a/test_runner/regress/test_remote_storage.py b/test_runner/regress/test_remote_storage.py
index 2e5260ca78..0a57fc9605 100644
--- a/test_runner/regress/test_remote_storage.py
+++ b/test_runner/regress/test_remote_storage.py
@@ -244,6 +244,7 @@ def test_remote_storage_upload_queue_retries(
             # create image layers eagerly, so that GC can remove some layers
             "image_creation_threshold": "1",
             "image_layer_creation_check_threshold": "0",
+            "lsn_lease_length": "0s",
         }
     )
 
@@ -391,6 +392,7 @@ def test_remote_timeline_client_calls_started_metric(
             # disable background compaction and GC. We invoke it manually when we want it to happen.
             "gc_period": "0s",
             "compaction_period": "0s",
+            "lsn_lease_length": "0s",
         }
     )
 
diff --git a/test_runner/regress/test_sharding.py b/test_runner/regress/test_sharding.py
index 4a84dca399..1eb33b2d39 100644
--- a/test_runner/regress/test_sharding.py
+++ b/test_runner/regress/test_sharding.py
@@ -200,6 +200,7 @@ def test_sharding_split_compaction(neon_env_builder: NeonEnvBuilder, failpoint:
         # Disable automatic creation of image layers, as we will create them explicitly when we want them
         "image_creation_threshold": 9999,
         "image_layer_creation_check_threshold": 0,
+        "lsn_lease_length": "0s",
     }
 
     neon_env_builder.storage_controller_config = {
diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py
index dc90a6e9a0..4106efd4f9 100644
--- a/test_runner/regress/test_storage_controller.py
+++ b/test_runner/regress/test_storage_controller.py
@@ -485,7 +485,7 @@ def test_storage_controller_compute_hook(
     httpserver.expect_request("/notify", method="PUT").respond_with_handler(handler)
 
     # Start running
-    env = neon_env_builder.init_start()
+    env = neon_env_builder.init_start(initial_tenant_conf={"lsn_lease_length": "0s"})
 
     # Initial notification from tenant creation
     assert len(notifications) == 1
diff --git a/test_runner/regress/test_storage_scrubber.py b/test_runner/regress/test_storage_scrubber.py
index 848e214c5e..b6c19f03f6 100644
--- a/test_runner/regress/test_storage_scrubber.py
+++ b/test_runner/regress/test_storage_scrubber.py
@@ -204,6 +204,7 @@ def test_scrubber_physical_gc_ancestors(
             # No PITR, so that as soon as child shards generate an image layer, it covers ancestor deltas
             # and makes them GC'able
             "pitr_interval": "0s",
+            "lsn_lease_length": "0s",
         },
     )
 
diff --git a/test_runner/regress/test_tenant_detach.py b/test_runner/regress/test_tenant_detach.py
index b165588636..e7c6d5a4c3 100644
--- a/test_runner/regress/test_tenant_detach.py
+++ b/test_runner/regress/test_tenant_detach.py
@@ -266,13 +266,13 @@ def test_tenant_reattach_while_busy(
 
 
 def test_tenant_detach_smoke(neon_env_builder: NeonEnvBuilder):
-    env = neon_env_builder.init_start()
+    env = neon_env_builder.init_start(initial_tenant_conf={"lsn_lease_length": "0s"})
     pageserver_http = env.pageserver.http_client()
 
     env.pageserver.allowed_errors.extend(PERMIT_PAGE_SERVICE_ERRORS)
 
     # create new nenant
-    tenant_id, timeline_id = env.neon_cli.create_tenant()
+    tenant_id, timeline_id = env.initial_tenant, env.initial_timeline
 
     # assert tenant exists on disk
     assert env.pageserver.tenant_dir(tenant_id).exists()
diff --git a/test_runner/regress/test_timeline_gc_blocking.py b/test_runner/regress/test_timeline_gc_blocking.py
index ddfe9b911f..765c72cf2a 100644
--- a/test_runner/regress/test_timeline_gc_blocking.py
+++ b/test_runner/regress/test_timeline_gc_blocking.py
@@ -45,7 +45,10 @@ def test_gc_blocking_by_timeline(neon_env_builder: NeonEnvBuilder, sharded: bool
     tenant_after = http.tenant_status(env.initial_tenant)
     assert tenant_before != tenant_after
     gc_blocking = tenant_after["gc_blocking"]
-    assert gc_blocking == "BlockingReasons { timelines: 1, reasons: EnumSet(Manual) }"
+    assert (
+        gc_blocking
+        == "BlockingReasons { tenant_blocked_by_lsn_lease_deadline: false, timelines: 1, reasons: EnumSet(Manual) }"
+    )
 
     wait_for_another_gc_round()
     pss.assert_log_contains(gc_skipped_line)

From d0cbfda15c916ee75066919940d0c3da714c5b95 Mon Sep 17 00:00:00 2001
From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com>
Date: Thu, 19 Sep 2024 16:29:28 -0400
Subject: [PATCH 41/49] refactor(pageserver): check layer map valid in one
 place (#9051)

We have 3 places where we implement layer map checks.

## Summary of changes

Now we have a single check function being called in all places.

---------

Signed-off-by: Alex Chi Z <chi@neon.tech>
---
 pageserver/src/tenant.rs                     | 31 +++++++++++
 pageserver/src/tenant/checks.rs              | 55 +++++++++++++++++++
 pageserver/src/tenant/timeline.rs            | 33 ++----------
 pageserver/src/tenant/timeline/compaction.rs | 21 +++-----
 storage_scrubber/src/checks.rs               | 56 ++------------------
 5 files changed, 101 insertions(+), 95 deletions(-)
 create mode 100644 pageserver/src/tenant/checks.rs

diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index 14cb6f508d..d699d56075 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -140,6 +140,7 @@ pub mod metadata;
 pub mod remote_timeline_client;
 pub mod storage_layer;
 
+pub mod checks;
 pub mod config;
 pub mod mgr;
 pub mod secondary;
@@ -1573,6 +1574,9 @@ impl Tenant {
         image_layer_desc: Vec<(Lsn, Vec<(pageserver_api::key::Key, bytes::Bytes)>)>,
         end_lsn: Lsn,
     ) -> anyhow::Result<Arc<Timeline>> {
+        use checks::check_valid_layermap;
+        use itertools::Itertools;
+
         let tline = self
             .create_test_timeline(new_timeline_id, initdb_lsn, pg_version, ctx)
             .await?;
@@ -1587,6 +1591,18 @@ impl Tenant {
                 .force_create_image_layer(lsn, images, Some(initdb_lsn), ctx)
                 .await?;
         }
+        let layer_names = tline
+            .layers
+            .read()
+            .await
+            .layer_map()
+            .unwrap()
+            .iter_historic_layers()
+            .map(|layer| layer.layer_name())
+            .collect_vec();
+        if let Some(err) = check_valid_layermap(&layer_names) {
+            bail!("invalid layermap: {err}");
+        }
         Ok(tline)
     }
 
@@ -3197,6 +3213,9 @@ impl Tenant {
         image_layer_desc: Vec<(Lsn, Vec<(pageserver_api::key::Key, bytes::Bytes)>)>,
         end_lsn: Lsn,
     ) -> anyhow::Result<Arc<Timeline>> {
+        use checks::check_valid_layermap;
+        use itertools::Itertools;
+
         let tline = self
             .branch_timeline_test(src_timeline, dst_id, ancestor_lsn, ctx)
             .await?;
@@ -3217,6 +3236,18 @@ impl Tenant {
                 .force_create_image_layer(lsn, images, Some(ancestor_lsn), ctx)
                 .await?;
         }
+        let layer_names = tline
+            .layers
+            .read()
+            .await
+            .layer_map()
+            .unwrap()
+            .iter_historic_layers()
+            .map(|layer| layer.layer_name())
+            .collect_vec();
+        if let Some(err) = check_valid_layermap(&layer_names) {
+            bail!("invalid layermap: {err}");
+        }
         Ok(tline)
     }
 
diff --git a/pageserver/src/tenant/checks.rs b/pageserver/src/tenant/checks.rs
new file mode 100644
index 0000000000..8eaa8a001c
--- /dev/null
+++ b/pageserver/src/tenant/checks.rs
@@ -0,0 +1,55 @@
+use std::collections::BTreeSet;
+
+use itertools::Itertools;
+
+use super::storage_layer::LayerName;
+
+/// Checks whether a layer map is valid (i.e., is a valid result of the current compaction algorithm if nothing goes wrong).
+/// The function checks if we can split the LSN range of a delta layer only at the LSNs of the delta layers. For example,
+///
+/// ```plain
+/// |       |                 |       |
+/// |   1   |    |   2   |    |   3   |
+/// |       |    |       |    |       |
+/// ```
+///
+/// This is not a valid layer map because the LSN range of layer 1 intersects with the LSN range of layer 2. 1 and 2 should have
+/// the same LSN range.
+///
+/// The exception is that when layer 2 only contains a single key, it could be split over the LSN range. For example,
+///
+/// ```plain
+/// |       |    |   2   |    |       |
+/// |   1   |    |-------|    |   3   |
+/// |       |    |   4   |    |       |
+///
+/// If layer 2 and 4 contain the same single key, this is also a valid layer map.
+pub fn check_valid_layermap(metadata: &[LayerName]) -> Option<String> {
+    let mut lsn_split_point = BTreeSet::new(); // TODO: use a better data structure (range tree / range set?)
+    let mut all_delta_layers = Vec::new();
+    for name in metadata {
+        if let LayerName::Delta(layer) = name {
+            if layer.key_range.start.next() != layer.key_range.end {
+                all_delta_layers.push(layer.clone());
+            }
+        }
+    }
+    for layer in &all_delta_layers {
+        let lsn_range = &layer.lsn_range;
+        lsn_split_point.insert(lsn_range.start);
+        lsn_split_point.insert(lsn_range.end);
+    }
+    for layer in &all_delta_layers {
+        let lsn_range = layer.lsn_range.clone();
+        let intersects = lsn_split_point.range(lsn_range).collect_vec();
+        if intersects.len() > 1 {
+            let err = format!(
+                "layer violates the layer map LSN split assumption: layer {} intersects with LSN [{}]",
+                layer,
+                intersects.into_iter().map(|lsn| lsn.to_string()).join(", ")
+            );
+            return Some(err);
+        }
+    }
+    None
+}
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index a06cea2c66..f08f5caf95 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -5378,7 +5378,8 @@ impl Timeline {
     /// Force create an image layer and place it into the layer map.
     ///
     /// DO NOT use this function directly. Use [`Tenant::branch_timeline_test_with_layers`]
-    /// or [`Tenant::create_test_timeline_with_layers`] to ensure all these layers are placed into the layer map in one run.
+    /// or [`Tenant::create_test_timeline_with_layers`] to ensure all these layers are
+    /// placed into the layer map in one run AND be validated.
     #[cfg(test)]
     pub(super) async fn force_create_image_layer(
         self: &Arc<Timeline>,
@@ -5424,7 +5425,8 @@ impl Timeline {
     /// Force create a delta layer and place it into the layer map.
     ///
     /// DO NOT use this function directly. Use [`Tenant::branch_timeline_test_with_layers`]
-    /// or [`Tenant::create_test_timeline_with_layers`] to ensure all these layers are placed into the layer map in one run.
+    /// or [`Tenant::create_test_timeline_with_layers`] to ensure all these layers are
+    /// placed into the layer map in one run AND be validated.
     #[cfg(test)]
     pub(super) async fn force_create_delta_layer(
         self: &Arc<Timeline>,
@@ -5450,33 +5452,6 @@ impl Timeline {
         if let Some(check_start_lsn) = check_start_lsn {
             assert!(deltas.lsn_range.start >= check_start_lsn);
         }
-        // check if the delta layer does not violate the LSN invariant, the legacy compaction should always produce a batch of
-        // layers of the same start/end LSN, and so should the force inserted layer
-        {
-            /// Checks if a overlaps with b, assume a/b = [start, end).
-            pub fn overlaps_with<T: Ord>(a: &Range<T>, b: &Range<T>) -> bool {
-                !(a.end <= b.start || b.end <= a.start)
-            }
-
-            if deltas.key_range.start.next() != deltas.key_range.end {
-                let guard = self.layers.read().await;
-                let mut invalid_layers =
-                    guard.layer_map()?.iter_historic_layers().filter(|layer| {
-                        layer.is_delta()
-                        && overlaps_with(&layer.lsn_range, &deltas.lsn_range)
-                        && layer.lsn_range != deltas.lsn_range
-                        // skip single-key layer files
-                        && layer.key_range.start.next() != layer.key_range.end
-                    });
-                if let Some(layer) = invalid_layers.next() {
-                    // If a delta layer overlaps with another delta layer AND their LSN range is not the same, panic
-                    panic!(
-                        "inserted layer violates delta layer LSN invariant: current_lsn_range={}..{}, conflict_lsn_range={}..{}",
-                        deltas.lsn_range.start, deltas.lsn_range.end, layer.lsn_range.start, layer.lsn_range.end
-                    );
-                }
-            }
-        }
         let mut delta_layer_writer = DeltaLayerWriter::new(
             self.conf,
             self.timeline_id,
diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs
index d1f06e3480..d1567b6b39 100644
--- a/pageserver/src/tenant/timeline/compaction.rs
+++ b/pageserver/src/tenant/timeline/compaction.rs
@@ -29,6 +29,7 @@ use utils::id::TimelineId;
 
 use crate::context::{AccessStatsBehavior, RequestContext, RequestContextBuilder};
 use crate::page_cache;
+use crate::tenant::checks::check_valid_layermap;
 use crate::tenant::remote_timeline_client::WaitCompletionError;
 use crate::tenant::storage_layer::merge_iterator::MergeIterator;
 use crate::tenant::storage_layer::split_writer::{
@@ -1788,20 +1789,12 @@ impl Timeline {
                 stat.visit_image_layer(desc.file_size());
             }
         }
-        for layer in &layer_selection {
-            let desc = layer.layer_desc();
-            let key_range = &desc.key_range;
-            if desc.is_delta() && key_range.start.next() != key_range.end {
-                let lsn_range = desc.lsn_range.clone();
-                let intersects = lsn_split_point.range(lsn_range).collect_vec();
-                if intersects.len() > 1 {
-                    bail!(
-                        "cannot run gc-compaction because it violates the layer map LSN split assumption: layer {} intersects with LSN [{}]",
-                        desc.key(),
-                        intersects.into_iter().map(|lsn| lsn.to_string()).join(", ")
-                    );
-                }
-            }
+        let layer_names: Vec<crate::tenant::storage_layer::LayerName> = layer_selection
+            .iter()
+            .map(|layer| layer.layer_desc().layer_name())
+            .collect_vec();
+        if let Some(err) = check_valid_layermap(&layer_names) {
+            bail!("cannot run gc-compaction because {}", err);
         }
         // The maximum LSN we are processing in this compaction loop
         let end_lsn = layer_selection
diff --git a/storage_scrubber/src/checks.rs b/storage_scrubber/src/checks.rs
index 15dfb101b5..de6918b3da 100644
--- a/storage_scrubber/src/checks.rs
+++ b/storage_scrubber/src/checks.rs
@@ -1,7 +1,8 @@
-use std::collections::{BTreeSet, HashMap, HashSet};
+use std::collections::{HashMap, HashSet};
 
 use anyhow::Context;
 use itertools::Itertools;
+use pageserver::tenant::checks::check_valid_layermap;
 use pageserver::tenant::layer_map::LayerMap;
 use pageserver::tenant::remote_timeline_client::index::LayerFileMetadata;
 use pageserver_api::shard::ShardIndex;
@@ -48,56 +49,6 @@ impl TimelineAnalysis {
     }
 }
 
-/// Checks whether a layer map is valid (i.e., is a valid result of the current compaction algorithm if nothing goes wrong).
-/// The function checks if we can split the LSN range of a delta layer only at the LSNs of the delta layers. For example,
-///
-/// ```plain
-/// |       |                 |       |
-/// |   1   |    |   2   |    |   3   |
-/// |       |    |       |    |       |
-/// ```
-///
-/// This is not a valid layer map because the LSN range of layer 1 intersects with the LSN range of layer 2. 1 and 2 should have
-/// the same LSN range.
-///
-/// The exception is that when layer 2 only contains a single key, it could be split over the LSN range. For example,
-///
-/// ```plain
-/// |       |    |   2   |    |       |
-/// |   1   |    |-------|    |   3   |
-/// |       |    |   4   |    |       |
-///
-/// If layer 2 and 4 contain the same single key, this is also a valid layer map.
-fn check_valid_layermap(metadata: &HashMap<LayerName, LayerFileMetadata>) -> Option<String> {
-    let mut lsn_split_point = BTreeSet::new(); // TODO: use a better data structure (range tree / range set?)
-    let mut all_delta_layers = Vec::new();
-    for (name, _) in metadata.iter() {
-        if let LayerName::Delta(layer) = name {
-            if layer.key_range.start.next() != layer.key_range.end {
-                all_delta_layers.push(layer.clone());
-            }
-        }
-    }
-    for layer in &all_delta_layers {
-        let lsn_range = &layer.lsn_range;
-        lsn_split_point.insert(lsn_range.start);
-        lsn_split_point.insert(lsn_range.end);
-    }
-    for layer in &all_delta_layers {
-        let lsn_range = layer.lsn_range.clone();
-        let intersects = lsn_split_point.range(lsn_range).collect_vec();
-        if intersects.len() > 1 {
-            let err = format!(
-                        "layer violates the layer map LSN split assumption: layer {} intersects with LSN [{}]",
-                        layer,
-                        intersects.into_iter().map(|lsn| lsn.to_string()).join(", ")
-                    );
-            return Some(err);
-        }
-    }
-    None
-}
-
 pub(crate) async fn branch_cleanup_and_check_errors(
     remote_client: &GenericRemoteStorage,
     id: &TenantShardTimelineId,
@@ -177,7 +128,8 @@ pub(crate) async fn branch_cleanup_and_check_errors(
                         }
                     }
 
-                    if let Some(err) = check_valid_layermap(&index_part.layer_metadata) {
+                    let layer_names = index_part.layer_metadata.keys().cloned().collect_vec();
+                    if let Some(err) = check_valid_layermap(&layer_names) {
                         result.errors.push(format!(
                             "index_part.json contains invalid layer map structure: {err}"
                         ));

From f2c08195f068ec4445347641cdde67eb170e60cc Mon Sep 17 00:00:00 2001
From: Arseny Sher <sher-ars@yandex.ru>
Date: Tue, 17 Sep 2024 11:32:49 +0300
Subject: [PATCH 42/49] Bump vendor/postgres.

Includes PRs:
- ERROR out instead of segfaulting when walsender slots are full.
- logical worker: respond to publisher even under dense stream.
---
 vendor/postgres-v14   | 2 +-
 vendor/postgres-v15   | 2 +-
 vendor/postgres-v16   | 2 +-
 vendor/postgres-v17   | 2 +-
 vendor/revisions.json | 8 ++++----
 5 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/vendor/postgres-v14 b/vendor/postgres-v14
index a317b9b5b9..87cb68f899 160000
--- a/vendor/postgres-v14
+++ b/vendor/postgres-v14
@@ -1 +1 @@
-Subproject commit a317b9b5b96978b49e78986697f3dd80d06f99a7
+Subproject commit 87cb68f899db434cd6f1908cf0ac8fdeafdd88c1
diff --git a/vendor/postgres-v15 b/vendor/postgres-v15
index 6f6d77fb59..72b904c0b3 160000
--- a/vendor/postgres-v15
+++ b/vendor/postgres-v15
@@ -1 +1 @@
-Subproject commit 6f6d77fb5960602fcd3fd130aca9f99ecb1619c9
+Subproject commit 72b904c0b3ac43bd74d1e8e6d772e2c476ae25b1
diff --git a/vendor/postgres-v16 b/vendor/postgres-v16
index 0baa7346df..3ec6e2496f 160000
--- a/vendor/postgres-v16
+++ b/vendor/postgres-v16
@@ -1 +1 @@
-Subproject commit 0baa7346dfd42d61912eeca554c9bb0a190f0a1e
+Subproject commit 3ec6e2496f64c6fec35c67cb82efd6490a6a4738
diff --git a/vendor/postgres-v17 b/vendor/postgres-v17
index 9156d63ce2..5bbb9bd93d 160000
--- a/vendor/postgres-v17
+++ b/vendor/postgres-v17
@@ -1 +1 @@
-Subproject commit 9156d63ce253bed9d1f76355ceec610e444eaffa
+Subproject commit 5bbb9bd93dd805e90bd8af15d00080363d18ec68
diff --git a/vendor/revisions.json b/vendor/revisions.json
index c2c34962bb..6289a53670 100644
--- a/vendor/revisions.json
+++ b/vendor/revisions.json
@@ -1,18 +1,18 @@
 {
   "v17": [
     "17rc1",
-    "9156d63ce253bed9d1f76355ceec610e444eaffa"
+    "5bbb9bd93dd805e90bd8af15d00080363d18ec68"
   ],
   "v16": [
     "16.4",
-    "0baa7346dfd42d61912eeca554c9bb0a190f0a1e"
+    "3ec6e2496f64c6fec35c67cb82efd6490a6a4738"
   ],
   "v15": [
     "15.8",
-    "6f6d77fb5960602fcd3fd130aca9f99ecb1619c9"
+    "72b904c0b3ac43bd74d1e8e6d772e2c476ae25b1"
   ],
   "v14": [
     "14.13",
-    "a317b9b5b96978b49e78986697f3dd80d06f99a7"
+    "87cb68f899db434cd6f1908cf0ac8fdeafdd88c1"
   ]
 }

From 3104f0f250e99dea03817c8ee6fd4022844db6ea Mon Sep 17 00:00:00 2001
From: Alexander Bayandin <alexander@neon.tech>
Date: Fri, 20 Sep 2024 12:00:05 +0100
Subject: [PATCH 43/49] Safekeeper: fix OpenAPI spec (#9066)

## Problem

Safekeeper's OpenAPI spec is incorrect:

```
Semantic error at paths./v1/tenant/{tenant_id}/timeline/{timeline_id}.get.responses.404.content.application/json.schema.$ref
$refs must reference a valid location in the document
Jump to line 126
```
Checked on https://editor.swagger.io

## Summary of changes
- Add `NotFoundError`
- Add `description` and `license` fields to make Cloud OpenAPI spec
linter happy
---
 safekeeper/src/http/openapi_spec.yaml | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/safekeeper/src/http/openapi_spec.yaml b/safekeeper/src/http/openapi_spec.yaml
index 70999853c2..3f14075345 100644
--- a/safekeeper/src/http/openapi_spec.yaml
+++ b/safekeeper/src/http/openapi_spec.yaml
@@ -1,7 +1,11 @@
 openapi: "3.0.2"
 info:
   title: Safekeeper control API
+  description: Neon Safekeeper API
   version: "1.0"
+  license:
+    name: "Apache"
+    url: https://github.com/neondatabase/neon/blob/main/LICENSE
 
 
 servers:
@@ -386,6 +390,12 @@ components:
         msg:
           type: string
 
+    NotFoundError:
+      type: object
+      properties:
+        msg:
+          type: string
+
   responses:
 
     #

From c45b56e0bb34b70c770eb23b6d13156a0d4f9913 Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Fri, 20 Sep 2024 15:55:50 +0200
Subject: [PATCH 44/49] pageserver: add counters for started smgr/getpage
 requests (#9069)

After this PR

```
curl localhost:9898/metrics | grep smgr_ | grep start
```

```
pageserver_smgr_query_started_count{shard_id="0000",smgr_query_type="get_page_at_lsn",tenant_id="...",timeline_id="..."} 0
pageserver_smgr_query_started_global_count{smgr_query_type="get_db_size"} 0
pageserver_smgr_query_started_global_count{smgr_query_type="get_page_at_lsn"} 0
pageserver_smgr_query_started_global_count{smgr_query_type="get_rel_exists"} 0
pageserver_smgr_query_started_global_count{smgr_query_type="get_rel_size"} 0
pageserver_smgr_query_started_global_count{smgr_query_type="get_slru_segment"} 0
```

We instantiate the per-tenant counter only for `get_page_at_lsn`.
---
 pageserver/src/metrics.rs       | 102 +++++++++++++++++++++++++-------
 test_runner/fixtures/metrics.py |   2 +
 2 files changed, 81 insertions(+), 23 deletions(-)

diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs
index 72229d80be..abd814f928 100644
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -1177,10 +1177,10 @@ pub(crate) mod virtual_file_io_engine {
 }
 
 struct GlobalAndPerTimelineHistogramTimer<'a, 'c> {
-    global_metric: &'a Histogram,
+    global_latency_histo: &'a Histogram,
 
     // Optional because not all op types are tracked per-timeline
-    timeline_metric: Option<&'a Histogram>,
+    per_timeline_latency_histo: Option<&'a Histogram>,
 
     ctx: &'c RequestContext,
     start: std::time::Instant,
@@ -1212,9 +1212,10 @@ impl<'a, 'c> Drop for GlobalAndPerTimelineHistogramTimer<'a, 'c> {
                 elapsed
             }
         };
-        self.global_metric.observe(ex_throttled.as_secs_f64());
-        if let Some(timeline_metric) = self.timeline_metric {
-            timeline_metric.observe(ex_throttled.as_secs_f64());
+        self.global_latency_histo
+            .observe(ex_throttled.as_secs_f64());
+        if let Some(per_timeline_getpage_histo) = self.per_timeline_latency_histo {
+            per_timeline_getpage_histo.observe(ex_throttled.as_secs_f64());
         }
     }
 }
@@ -1240,10 +1241,32 @@ pub enum SmgrQueryType {
 
 #[derive(Debug)]
 pub(crate) struct SmgrQueryTimePerTimeline {
-    global_metrics: [Histogram; SmgrQueryType::COUNT],
-    per_timeline_getpage: Histogram,
+    global_started: [IntCounter; SmgrQueryType::COUNT],
+    global_latency: [Histogram; SmgrQueryType::COUNT],
+    per_timeline_getpage_started: IntCounter,
+    per_timeline_getpage_latency: Histogram,
 }
 
+static SMGR_QUERY_STARTED_GLOBAL: Lazy<IntCounterVec> = Lazy::new(|| {
+    register_int_counter_vec!(
+        // it's a counter, but, name is prepared to extend it to a histogram of queue depth
+        "pageserver_smgr_query_started_global_count",
+        "Number of smgr queries started, aggregated by query type.",
+        &["smgr_query_type"],
+    )
+    .expect("failed to define a metric")
+});
+
+static SMGR_QUERY_STARTED_PER_TENANT_TIMELINE: Lazy<IntCounterVec> = Lazy::new(|| {
+    register_int_counter_vec!(
+        // it's a counter, but, name is prepared to extend it to a histogram of queue depth
+        "pageserver_smgr_query_started_count",
+        "Number of smgr queries started, aggregated by query type and tenant/timeline.",
+        &["smgr_query_type", "tenant_id", "shard_id", "timeline_id"],
+    )
+    .expect("failed to define a metric")
+});
+
 static SMGR_QUERY_TIME_PER_TENANT_TIMELINE: Lazy<HistogramVec> = Lazy::new(|| {
     register_histogram_vec!(
         "pageserver_smgr_query_seconds",
@@ -1319,14 +1342,20 @@ impl SmgrQueryTimePerTimeline {
         let tenant_id = tenant_shard_id.tenant_id.to_string();
         let shard_slug = format!("{}", tenant_shard_id.shard_slug());
         let timeline_id = timeline_id.to_string();
-        let global_metrics = std::array::from_fn(|i| {
+        let global_started = std::array::from_fn(|i| {
+            let op = SmgrQueryType::from_repr(i).unwrap();
+            SMGR_QUERY_STARTED_GLOBAL
+                .get_metric_with_label_values(&[op.into()])
+                .unwrap()
+        });
+        let global_latency = std::array::from_fn(|i| {
             let op = SmgrQueryType::from_repr(i).unwrap();
             SMGR_QUERY_TIME_GLOBAL
                 .get_metric_with_label_values(&[op.into()])
                 .unwrap()
         });
 
-        let per_timeline_getpage = SMGR_QUERY_TIME_PER_TENANT_TIMELINE
+        let per_timeline_getpage_started = SMGR_QUERY_STARTED_PER_TENANT_TIMELINE
             .get_metric_with_label_values(&[
                 SmgrQueryType::GetPageAtLsn.into(),
                 &tenant_id,
@@ -1334,9 +1363,20 @@ impl SmgrQueryTimePerTimeline {
                 &timeline_id,
             ])
             .unwrap();
+        let per_timeline_getpage_latency = SMGR_QUERY_TIME_PER_TENANT_TIMELINE
+            .get_metric_with_label_values(&[
+                SmgrQueryType::GetPageAtLsn.into(),
+                &tenant_id,
+                &shard_slug,
+                &timeline_id,
+            ])
+            .unwrap();
+
         Self {
-            global_metrics,
-            per_timeline_getpage,
+            global_started,
+            global_latency,
+            per_timeline_getpage_latency,
+            per_timeline_getpage_started,
         }
     }
     pub(crate) fn start_timer<'c: 'a, 'a>(
@@ -1344,8 +1384,11 @@ impl SmgrQueryTimePerTimeline {
         op: SmgrQueryType,
         ctx: &'c RequestContext,
     ) -> Option<impl Drop + '_> {
-        let global_metric = &self.global_metrics[op as usize];
         let start = Instant::now();
+
+        self.global_started[op as usize].inc();
+
+        // We subtract time spent throttled from the observed latency.
         match ctx.micros_spent_throttled.open() {
             Ok(()) => (),
             Err(error) => {
@@ -1364,15 +1407,16 @@ impl SmgrQueryTimePerTimeline {
             }
         }
 
-        let timeline_metric = if matches!(op, SmgrQueryType::GetPageAtLsn) {
-            Some(&self.per_timeline_getpage)
+        let per_timeline_latency_histo = if matches!(op, SmgrQueryType::GetPageAtLsn) {
+            self.per_timeline_getpage_started.inc();
+            Some(&self.per_timeline_getpage_latency)
         } else {
             None
         };
 
         Some(GlobalAndPerTimelineHistogramTimer {
-            global_metric,
-            timeline_metric,
+            global_latency_histo: &self.global_latency[op as usize],
+            per_timeline_latency_histo,
             ctx,
             start,
             op,
@@ -1423,9 +1467,12 @@ mod smgr_query_time_tests {
             let get_counts = || {
                 let global: u64 = ops
                     .iter()
-                    .map(|op| metrics.global_metrics[*op as usize].get_sample_count())
+                    .map(|op| metrics.global_latency[*op as usize].get_sample_count())
                     .sum();
-                (global, metrics.per_timeline_getpage.get_sample_count())
+                (
+                    global,
+                    metrics.per_timeline_getpage_latency.get_sample_count(),
+                )
             };
 
             let (pre_global, pre_per_tenant_timeline) = get_counts();
@@ -2576,6 +2623,12 @@ impl TimelineMetrics {
             let _ = STORAGE_IO_SIZE.remove_label_values(&[op, tenant_id, shard_id, timeline_id]);
         }
 
+        let _ = SMGR_QUERY_STARTED_PER_TENANT_TIMELINE.remove_label_values(&[
+            SmgrQueryType::GetPageAtLsn.into(),
+            tenant_id,
+            shard_id,
+            timeline_id,
+        ]);
         let _ = SMGR_QUERY_TIME_PER_TENANT_TIMELINE.remove_label_values(&[
             SmgrQueryType::GetPageAtLsn.into(),
             tenant_id,
@@ -3227,11 +3280,14 @@ pub fn preinitialize_metrics() {
     }
 
     // countervecs
-    [&BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT]
-        .into_iter()
-        .for_each(|c| {
-            Lazy::force(c);
-        });
+    [
+        &BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT,
+        &SMGR_QUERY_STARTED_GLOBAL,
+    ]
+    .into_iter()
+    .for_each(|c| {
+        Lazy::force(c);
+    });
 
     // gauges
     WALRECEIVER_ACTIVE_MANAGERS.get();
diff --git a/test_runner/fixtures/metrics.py b/test_runner/fixtures/metrics.py
index cda70be8da..d2db40897e 100644
--- a/test_runner/fixtures/metrics.py
+++ b/test_runner/fixtures/metrics.py
@@ -132,6 +132,7 @@ PAGESERVER_GLOBAL_METRICS: Tuple[str, ...] = (
     *histogram("pageserver_wait_lsn_seconds"),
     *histogram("pageserver_remote_operation_seconds"),
     *histogram("pageserver_io_operations_seconds"),
+    "pageserver_smgr_query_started_global_count_total",
     "pageserver_tenant_states_count",
     "pageserver_circuit_breaker_broken_total",
     "pageserver_circuit_breaker_unbroken_total",
@@ -146,6 +147,7 @@ PAGESERVER_PER_TENANT_METRICS: Tuple[str, ...] = (
     "pageserver_smgr_query_seconds_bucket",
     "pageserver_smgr_query_seconds_count",
     "pageserver_smgr_query_seconds_sum",
+    "pageserver_smgr_query_started_count_total",
     "pageserver_archive_size",
     "pageserver_pitr_history_size",
     "pageserver_layer_bytes",

From 797aa4ffaaacba20a1b27344d28d1c1a0c287e8c Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki@neon.tech>
Date: Fri, 20 Sep 2024 17:22:58 +0300
Subject: [PATCH 45/49] Skip running clippy in --release mode. (#9073)

It's pretty expensive to run, and there is very little difference
between debug and release builds that could lead to different clippy
warnings.

This is extracted from PR #8912. That PR wandered off into various
improvements we could make, but we seem to have consensus on this part
at least.
---
 .github/workflows/build_and_test.yml | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml
index a210c962cb..c1ec3f207b 100644
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -159,6 +159,10 @@ jobs:
       # This will catch compiler & clippy warnings in all feature combinations.
       # TODO: use cargo hack for build and test as well, but, that's quite expensive.
       # NB: keep clippy args in sync with ./run_clippy.sh
+      #
+      # The only difference between "clippy --debug" and "clippy --release" is that in --release mode,
+      # #[cfg(debug_assertions)] blocks are not built. It's not worth building everything for second
+      # time just for that, so skip "clippy --release".
       - run: |
           CLIPPY_COMMON_ARGS="$( source .neon_clippy_args; echo "$CLIPPY_COMMON_ARGS")"
           if [ "$CLIPPY_COMMON_ARGS" = "" ]; then
@@ -168,8 +172,6 @@ jobs:
           echo "CLIPPY_COMMON_ARGS=${CLIPPY_COMMON_ARGS}" >> $GITHUB_ENV
       - name: Run cargo clippy (debug)
         run: cargo hack --feature-powerset clippy $CLIPPY_COMMON_ARGS
-      - name: Run cargo clippy (release)
-        run: cargo hack --feature-powerset clippy --release $CLIPPY_COMMON_ARGS
 
       - name: Check documentation generation
         run: cargo doc --workspace --no-deps --document-private-items

From 6b9323027085c771ad71686b39f43f19f1a702f6 Mon Sep 17 00:00:00 2001
From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com>
Date: Fri, 20 Sep 2024 10:37:28 -0400
Subject: [PATCH 46/49] fix(pageserver): receive body error now 500 (#9052)

close https://github.com/neondatabase/neon/issues/8903

In https://github.com/neondatabase/neon/issues/8903 we observed JSON
decoding error to have the following error message in the log:

```
Error processing HTTP request: Resource temporarily unavailable: 3956 (pageserver-6.ap-southeast-1.aws.neon.tech) error receiving body: error decoding response body
```

This is hard to understand. In this patch, we make the error message
more reasonable.

## Summary of changes

* receive body error is now an internal server error, passthrough the
`reqwest::Error` (only decoding error) as `anyhow::Error`.
* instead of formatting the error using `to_string`, we use the
alternative `anyhow::Error` formatting, so that it prints out the cause
of the error (i.e., what exactly cannot serde decode).

I would expect seeing something like `error receiving body: error
decoding response body: XXX field not found` after this patch, though I
didn't set up a testing environment to observe the exact behavior.

---------

Signed-off-by: Alex Chi Z <chi@neon.tech>
---
 libs/utils/src/http/error.rs      |  2 +-
 storage_controller/src/service.rs | 14 +++++++++++---
 2 files changed, 12 insertions(+), 4 deletions(-)

diff --git a/libs/utils/src/http/error.rs b/libs/utils/src/http/error.rs
index 3d863a6518..5e05e4e713 100644
--- a/libs/utils/src/http/error.rs
+++ b/libs/utils/src/http/error.rs
@@ -82,7 +82,7 @@ impl ApiError {
                 StatusCode::INTERNAL_SERVER_ERROR,
             ),
             ApiError::InternalServerError(err) => HttpErrorBody::response_from_msg_and_status(
-                err.to_string(),
+                format!("{err:#}"), // use alternative formatting so that we give the cause without backtrace
                 StatusCode::INTERNAL_SERVER_ERROR,
             ),
         }
diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs
index be3efaf688..957f633feb 100644
--- a/storage_controller/src/service.rs
+++ b/storage_controller/src/service.rs
@@ -3,6 +3,7 @@ use std::{
     borrow::Cow,
     cmp::Ordering,
     collections::{BTreeMap, HashMap, HashSet},
+    error::Error,
     ops::Deref,
     path::PathBuf,
     str::FromStr,
@@ -218,9 +219,16 @@ fn passthrough_api_error(node: &Node, e: mgmt_api::Error) -> ApiError {
                 format!("{node} error receiving error body: {str}").into(),
             )
         }
-        mgmt_api::Error::ReceiveBody(str) => {
-            // Presume errors receiving body are connectivity/availability issues
-            ApiError::ResourceUnavailable(format!("{node} error receiving body: {str}").into())
+        mgmt_api::Error::ReceiveBody(err) if err.is_decode() => {
+            // Return 500 for decoding errors.
+            ApiError::InternalServerError(anyhow::Error::from(err).context("error decoding body"))
+        }
+        mgmt_api::Error::ReceiveBody(err) => {
+            // Presume errors receiving body are connectivity/availability issues except for decoding errors
+            let src_str = err.source().map(|e| e.to_string()).unwrap_or_default();
+            ApiError::ResourceUnavailable(
+                format!("{node} error receiving error body: {err} {}", src_str).into(),
+            )
         }
         mgmt_api::Error::ApiError(StatusCode::NOT_FOUND, msg) => {
             ApiError::NotFound(anyhow::anyhow!(format!("{node}: {msg}")).into())

From e675a21346812b109eddc86c5729ec83e25f845d Mon Sep 17 00:00:00 2001
From: Conrad Ludgate <conrad@neon.tech>
Date: Fri, 20 Sep 2024 16:09:39 +0100
Subject: [PATCH 47/49] utils: leaky bucket should only report throttled if the
 notify queue is blocked on sleep (#9072)

## Problem

Seems that PS might be too eager in reporting throttled tasks

## Summary of changes

Introduce a sleep counter. If the sleep counter increases, then the
acquire tasks was throttled.
---
 libs/utils/src/leaky_bucket.rs | 34 ++++++++++++++++++++++++++++------
 1 file changed, 28 insertions(+), 6 deletions(-)

diff --git a/libs/utils/src/leaky_bucket.rs b/libs/utils/src/leaky_bucket.rs
index a120dc0ac5..0cc58738c0 100644
--- a/libs/utils/src/leaky_bucket.rs
+++ b/libs/utils/src/leaky_bucket.rs
@@ -21,7 +21,13 @@
 //!
 //! Another explaination can be found here: <https://brandur.org/rate-limiting>
 
-use std::{sync::Mutex, time::Duration};
+use std::{
+    sync::{
+        atomic::{AtomicU64, Ordering},
+        Mutex,
+    },
+    time::Duration,
+};
 
 use tokio::{sync::Notify, time::Instant};
 
@@ -128,6 +134,7 @@ impl LeakyBucketState {
 
 pub struct RateLimiter {
     pub config: LeakyBucketConfig,
+    pub sleep_counter: AtomicU64,
     pub state: Mutex<LeakyBucketState>,
     /// a queue to provide this fair ordering.
     pub queue: Notify,
@@ -144,6 +151,7 @@ impl Drop for Requeue<'_> {
 impl RateLimiter {
     pub fn with_initial_tokens(config: LeakyBucketConfig, initial_tokens: f64) -> Self {
         RateLimiter {
+            sleep_counter: AtomicU64::new(0),
             state: Mutex::new(LeakyBucketState::with_initial_tokens(
                 &config,
                 initial_tokens,
@@ -163,15 +171,16 @@ impl RateLimiter {
 
     /// returns true if we did throttle
     pub async fn acquire(&self, count: usize) -> bool {
-        let mut throttled = false;
-
         let start = tokio::time::Instant::now();
 
+        let start_count = self.sleep_counter.load(Ordering::Acquire);
+        let mut end_count = start_count;
+
         // wait until we are the first in the queue
         let mut notified = std::pin::pin!(self.queue.notified());
         if !notified.as_mut().enable() {
-            throttled = true;
             notified.await;
+            end_count = self.sleep_counter.load(Ordering::Acquire);
         }
 
         // notify the next waiter in the queue when we are done.
@@ -184,9 +193,22 @@ impl RateLimiter {
                 .unwrap()
                 .add_tokens(&self.config, start, count as f64);
             match res {
-                Ok(()) => return throttled,
+                Ok(()) => return end_count > start_count,
                 Err(ready_at) => {
-                    throttled = true;
+                    struct Increment<'a>(&'a AtomicU64);
+
+                    impl Drop for Increment<'_> {
+                        fn drop(&mut self) {
+                            self.0.fetch_add(1, Ordering::AcqRel);
+                        }
+                    }
+
+                    // increment the counter after we finish sleeping (or cancel this task).
+                    // this ensures that tasks that have already started the acquire will observe
+                    // the new sleep count when they are allowed to resume on the notify.
+                    let _inc = Increment(&self.sleep_counter);
+                    end_count += 1;
+
                     tokio::time::sleep_until(ready_at).await;
                 }
             }

From 6014f15157b3a47f122b814b3cf109e9d1851abd Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Fri, 20 Sep 2024 17:07:09 +0100
Subject: [PATCH 48/49] pageserver: suppress noisy "layer became visible" logs
 (#9064)

## Problem

When layer visibility was added, an info log was included for the
situation where actual access to a layer disagrees with the visibility
calculation. This situation is safe, but I was interested in seeing when
it happens.

The log is pretty high volume, so this PR refines it to fire less often.

## Summary of changes

- For cases where accessing non-visible layers is normal, don't log at
all.
- Extend a unit test to increase confidence that the updates to
visibility on access are working as expected
- During compaction, only call the visibility calculation routine if
some image layers were created: previously, frequent calls resulted in
the visibility of layers getting reset every time we passed through
create_image_layers.
---
 pageserver/src/tenant/storage_layer/layer.rs  | 29 +++++++++++++++----
 .../src/tenant/storage_layer/layer/tests.rs   |  9 ++++++
 pageserver/src/tenant/timeline.rs             |  4 ++-
 3 files changed, 36 insertions(+), 6 deletions(-)

diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs
index b15cd4da39..f0e2ca5c83 100644
--- a/pageserver/src/tenant/storage_layer/layer.rs
+++ b/pageserver/src/tenant/storage_layer/layer.rs
@@ -439,11 +439,30 @@ impl Layer {
 
     fn record_access(&self, ctx: &RequestContext) {
         if self.0.access_stats.record_access(ctx) {
-            // Visibility was modified to Visible
-            tracing::info!(
-                "Layer {} became visible as a result of access",
-                self.0.desc.key()
-            );
+            // Visibility was modified to Visible: maybe log about this
+            match ctx.task_kind() {
+                TaskKind::CalculateSyntheticSize
+                | TaskKind::GarbageCollector
+                | TaskKind::MgmtRequest => {
+                    // This situation is expected in code paths do binary searches of the LSN space to resolve
+                    // an LSN to a timestamp, which happens during GC, during GC cutoff calculations in synthetic size,
+                    // and on-demand for certain HTTP API requests.
+                }
+                _ => {
+                    // In all other contexts, it is unusual to do I/O involving layers which are not visible at
+                    // some branch tip, so we log the fact that we are accessing something that the visibility
+                    // calculation thought should not be visible.
+                    //
+                    // This case is legal in brief time windows: for example an in-flight getpage request can hold on to a layer object
+                    // which was covered by a concurrent compaction.
+                    tracing::info!(
+                        "Layer {} became visible as a result of access",
+                        self.0.desc.key()
+                    );
+                }
+            }
+
+            // Update the timeline's visible bytes count
             if let Some(tl) = self.0.timeline.upgrade() {
                 tl.metrics
                     .visible_physical_size_gauge
diff --git a/pageserver/src/tenant/storage_layer/layer/tests.rs b/pageserver/src/tenant/storage_layer/layer/tests.rs
index 0b9bde4f57..9de70f14ee 100644
--- a/pageserver/src/tenant/storage_layer/layer/tests.rs
+++ b/pageserver/src/tenant/storage_layer/layer/tests.rs
@@ -1025,6 +1025,15 @@ fn access_stats() {
     assert_eq!(access_stats.latest_activity(), lowres_time(atime));
     access_stats.set_visibility(LayerVisibilityHint::Visible);
     assert_eq!(access_stats.latest_activity(), lowres_time(atime));
+
+    // Recording access implicitly makes layer visible, if it wasn't already
+    let atime = UNIX_EPOCH + Duration::from_secs(2200000000);
+    access_stats.set_visibility(LayerVisibilityHint::Covered);
+    assert_eq!(access_stats.visibility(), LayerVisibilityHint::Covered);
+    assert!(access_stats.record_access_at(atime));
+    access_stats.set_visibility(LayerVisibilityHint::Visible);
+    assert!(!access_stats.record_access_at(atime));
+    access_stats.set_visibility(LayerVisibilityHint::Visible);
 }
 
 #[test]
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index f08f5caf95..114a6dd468 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -4316,7 +4316,9 @@ impl Timeline {
         timer.stop_and_record();
 
         // Creating image layers may have caused some previously visible layers to be covered
-        self.update_layer_visibility().await?;
+        if !image_layers.is_empty() {
+            self.update_layer_visibility().await?;
+        }
 
         Ok(image_layers)
     }

From ec5dce04ebfa51b727dfc9bc04ebb1e68aef6434 Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Fri, 20 Sep 2024 18:48:26 +0200
Subject: [PATCH 49/49] pageserver: throttling: per-tenant metrics + more
 metrics to help understand throttle queue depth (#9077)

---
 pageserver/src/metrics.rs         | 198 +++++++++++++++++++++++++-----
 pageserver/src/tenant.rs          |   4 +-
 pageserver/src/tenant/tasks.rs    |  68 +++++-----
 pageserver/src/tenant/throttle.rs |  34 +++--
 pageserver/src/tenant/timeline.rs |  10 +-
 test_runner/fixtures/metrics.py   |  13 ++
 6 files changed, 250 insertions(+), 77 deletions(-)

diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs
index abd814f928..078d12f934 100644
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -2645,6 +2645,8 @@ pub(crate) fn remove_tenant_metrics(tenant_shard_id: &TenantShardId) {
         let _ = TENANT_SYNTHETIC_SIZE_METRIC.remove_label_values(&[&tid]);
     }
 
+    tenant_throttling::remove_tenant_metrics(tenant_shard_id);
+
     // we leave the BROKEN_TENANTS_SET entry if any
 }
 
@@ -3108,41 +3110,180 @@ pub mod tokio_epoll_uring {
 pub(crate) mod tenant_throttling {
     use metrics::{register_int_counter_vec, IntCounter};
     use once_cell::sync::Lazy;
+    use utils::shard::TenantShardId;
 
     use crate::tenant::{self, throttle::Metric};
 
-    pub(crate) struct TimelineGet {
-        wait_time: IntCounter,
-        count: IntCounter,
+    struct GlobalAndPerTenantIntCounter {
+        global: IntCounter,
+        per_tenant: IntCounter,
     }
 
-    pub(crate) static TIMELINE_GET: Lazy<TimelineGet> = Lazy::new(|| {
-        static WAIT_USECS: Lazy<metrics::IntCounterVec> = Lazy::new(|| {
-            register_int_counter_vec!(
-            "pageserver_tenant_throttling_wait_usecs_sum_global",
-            "Sum of microseconds that tenants spent waiting for a tenant throttle of a given kind.",
+    impl GlobalAndPerTenantIntCounter {
+        #[inline(always)]
+        pub(crate) fn inc(&self) {
+            self.inc_by(1)
+        }
+        #[inline(always)]
+        pub(crate) fn inc_by(&self, n: u64) {
+            self.global.inc_by(n);
+            self.per_tenant.inc_by(n);
+        }
+    }
+
+    pub(crate) struct TimelineGet {
+        count_accounted_start: GlobalAndPerTenantIntCounter,
+        count_accounted_finish: GlobalAndPerTenantIntCounter,
+        wait_time: GlobalAndPerTenantIntCounter,
+        count_throttled: GlobalAndPerTenantIntCounter,
+    }
+
+    static COUNT_ACCOUNTED_START: Lazy<metrics::IntCounterVec> = Lazy::new(|| {
+        register_int_counter_vec!(
+            "pageserver_tenant_throttling_count_accounted_start_global",
+            "Count of tenant throttling starts, by kind of throttle.",
             &["kind"]
         )
-            .unwrap()
-        });
-
-        static WAIT_COUNT: Lazy<metrics::IntCounterVec> = Lazy::new(|| {
-            register_int_counter_vec!(
-                "pageserver_tenant_throttling_count_global",
-                "Count of tenant throttlings, by kind of throttle.",
-                &["kind"]
-            )
-            .unwrap()
-        });
-
-        let kind = "timeline_get";
-        TimelineGet {
-            wait_time: WAIT_USECS.with_label_values(&[kind]),
-            count: WAIT_COUNT.with_label_values(&[kind]),
-        }
+        .unwrap()
+    });
+    static COUNT_ACCOUNTED_START_PER_TENANT: Lazy<metrics::IntCounterVec> = Lazy::new(|| {
+        register_int_counter_vec!(
+            "pageserver_tenant_throttling_count_accounted_start",
+            "Count of tenant throttling starts, by kind of throttle.",
+            &["kind", "tenant_id", "shard_id"]
+        )
+        .unwrap()
+    });
+    static COUNT_ACCOUNTED_FINISH: Lazy<metrics::IntCounterVec> = Lazy::new(|| {
+        register_int_counter_vec!(
+            "pageserver_tenant_throttling_count_accounted_finish_global",
+            "Count of tenant throttling finishes, by kind of throttle.",
+            &["kind"]
+        )
+        .unwrap()
+    });
+    static COUNT_ACCOUNTED_FINISH_PER_TENANT: Lazy<metrics::IntCounterVec> = Lazy::new(|| {
+        register_int_counter_vec!(
+            "pageserver_tenant_throttling_count_accounted_finish",
+            "Count of tenant throttling finishes, by kind of throttle.",
+            &["kind", "tenant_id", "shard_id"]
+        )
+        .unwrap()
+    });
+    static WAIT_USECS: Lazy<metrics::IntCounterVec> = Lazy::new(|| {
+        register_int_counter_vec!(
+            "pageserver_tenant_throttling_wait_usecs_sum_global",
+            "Sum of microseconds that spent waiting throttle by kind of throttle.",
+            &["kind"]
+        )
+        .unwrap()
+    });
+    static WAIT_USECS_PER_TENANT: Lazy<metrics::IntCounterVec> = Lazy::new(|| {
+        register_int_counter_vec!(
+            "pageserver_tenant_throttling_wait_usecs_sum",
+            "Sum of microseconds that spent waiting throttle by kind of throttle.",
+            &["kind", "tenant_id", "shard_id"]
+        )
+        .unwrap()
     });
 
-    impl Metric for &'static TimelineGet {
+    static WAIT_COUNT: Lazy<metrics::IntCounterVec> = Lazy::new(|| {
+        register_int_counter_vec!(
+            "pageserver_tenant_throttling_count_global",
+            "Count of tenant throttlings, by kind of throttle.",
+            &["kind"]
+        )
+        .unwrap()
+    });
+    static WAIT_COUNT_PER_TENANT: Lazy<metrics::IntCounterVec> = Lazy::new(|| {
+        register_int_counter_vec!(
+            "pageserver_tenant_throttling_count",
+            "Count of tenant throttlings, by kind of throttle.",
+            &["kind", "tenant_id", "shard_id"]
+        )
+        .unwrap()
+    });
+
+    const KIND: &str = "timeline_get";
+
+    impl TimelineGet {
+        pub(crate) fn new(tenant_shard_id: &TenantShardId) -> Self {
+            TimelineGet {
+                count_accounted_start: {
+                    GlobalAndPerTenantIntCounter {
+                        global: COUNT_ACCOUNTED_START.with_label_values(&[KIND]),
+                        per_tenant: COUNT_ACCOUNTED_START_PER_TENANT.with_label_values(&[
+                            KIND,
+                            &tenant_shard_id.tenant_id.to_string(),
+                            &tenant_shard_id.shard_slug().to_string(),
+                        ]),
+                    }
+                },
+                count_accounted_finish: {
+                    GlobalAndPerTenantIntCounter {
+                        global: COUNT_ACCOUNTED_FINISH.with_label_values(&[KIND]),
+                        per_tenant: COUNT_ACCOUNTED_FINISH_PER_TENANT.with_label_values(&[
+                            KIND,
+                            &tenant_shard_id.tenant_id.to_string(),
+                            &tenant_shard_id.shard_slug().to_string(),
+                        ]),
+                    }
+                },
+                wait_time: {
+                    GlobalAndPerTenantIntCounter {
+                        global: WAIT_USECS.with_label_values(&[KIND]),
+                        per_tenant: WAIT_USECS_PER_TENANT.with_label_values(&[
+                            KIND,
+                            &tenant_shard_id.tenant_id.to_string(),
+                            &tenant_shard_id.shard_slug().to_string(),
+                        ]),
+                    }
+                },
+                count_throttled: {
+                    GlobalAndPerTenantIntCounter {
+                        global: WAIT_COUNT.with_label_values(&[KIND]),
+                        per_tenant: WAIT_COUNT_PER_TENANT.with_label_values(&[
+                            KIND,
+                            &tenant_shard_id.tenant_id.to_string(),
+                            &tenant_shard_id.shard_slug().to_string(),
+                        ]),
+                    }
+                },
+            }
+        }
+    }
+
+    pub(crate) fn preinitialize_global_metrics() {
+        Lazy::force(&COUNT_ACCOUNTED_START);
+        Lazy::force(&COUNT_ACCOUNTED_FINISH);
+        Lazy::force(&WAIT_USECS);
+        Lazy::force(&WAIT_COUNT);
+    }
+
+    pub(crate) fn remove_tenant_metrics(tenant_shard_id: &TenantShardId) {
+        for m in &[
+            &COUNT_ACCOUNTED_START_PER_TENANT,
+            &COUNT_ACCOUNTED_FINISH_PER_TENANT,
+            &WAIT_USECS_PER_TENANT,
+            &WAIT_COUNT_PER_TENANT,
+        ] {
+            let _ = m.remove_label_values(&[
+                KIND,
+                &tenant_shard_id.tenant_id.to_string(),
+                &tenant_shard_id.shard_slug().to_string(),
+            ]);
+        }
+    }
+
+    impl Metric for TimelineGet {
+        #[inline(always)]
+        fn accounting_start(&self) {
+            self.count_accounted_start.inc();
+        }
+        #[inline(always)]
+        fn accounting_finish(&self) {
+            self.count_accounted_finish.inc();
+        }
         #[inline(always)]
         fn observe_throttling(
             &self,
@@ -3150,7 +3291,7 @@ pub(crate) mod tenant_throttling {
         ) {
             let val = u64::try_from(wait_time.as_micros()).unwrap();
             self.wait_time.inc_by(val);
-            self.count.inc();
+            self.count_throttled.inc();
         }
     }
 }
@@ -3309,7 +3450,8 @@ pub fn preinitialize_metrics() {
 
     // Custom
     Lazy::force(&RECONSTRUCT_TIME);
-    Lazy::force(&tenant_throttling::TIMELINE_GET);
     Lazy::force(&BASEBACKUP_QUERY_TIME);
     Lazy::force(&COMPUTE_COMMANDS_COUNTERS);
+
+    tenant_throttling::preinitialize_global_metrics();
 }
diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index d699d56075..e328cd2044 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -302,7 +302,7 @@ pub struct Tenant {
     /// Throttle applied at the top of [`Timeline::get`].
     /// All [`Tenant::timelines`] of a given [`Tenant`] instance share the same [`throttle::Throttle`] instance.
     pub(crate) timeline_get_throttle:
-        Arc<throttle::Throttle<&'static crate::metrics::tenant_throttling::TimelineGet>>,
+        Arc<throttle::Throttle<crate::metrics::tenant_throttling::TimelineGet>>,
 
     /// An ongoing timeline detach concurrency limiter.
     ///
@@ -2831,7 +2831,7 @@ impl Tenant {
             gate: Gate::default(),
             timeline_get_throttle: Arc::new(throttle::Throttle::new(
                 Tenant::get_timeline_get_throttle_config(conf, &attached_conf.tenant_conf),
-                &crate::metrics::tenant_throttling::TIMELINE_GET,
+                crate::metrics::tenant_throttling::TimelineGet::new(&tenant_shard_id),
             )),
             tenant_conf: Arc::new(ArcSwap::from_pointee(attached_conf)),
             ongoing_timeline_detach: std::sync::Mutex::default(),
diff --git a/pageserver/src/tenant/tasks.rs b/pageserver/src/tenant/tasks.rs
index 57f0123d8f..341febb30a 100644
--- a/pageserver/src/tenant/tasks.rs
+++ b/pageserver/src/tenant/tasks.rs
@@ -163,8 +163,6 @@ async fn compaction_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
     // How many errors we have seen consequtively
     let mut error_run_count = 0;
 
-    let mut last_throttle_flag_reset_at = Instant::now();
-
     TENANT_TASK_EVENTS.with_label_values(&["start"]).inc();
     async {
         let ctx = RequestContext::todo_child(TaskKind::Compaction, DownloadBehavior::Download);
@@ -191,8 +189,6 @@ async fn compaction_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
                 }
             }
 
-
-
             let sleep_duration;
             if period == Duration::ZERO {
                 #[cfg(not(feature = "testing"))]
@@ -207,12 +203,18 @@ async fn compaction_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
                 };
 
                 // Run compaction
-                let IterationResult { output, elapsed } = iteration.run(tenant.compaction_iteration(&cancel, &ctx)).await;
+                let IterationResult { output, elapsed } = iteration
+                    .run(tenant.compaction_iteration(&cancel, &ctx))
+                    .await;
                 match output {
                     Ok(has_pending_task) => {
                         error_run_count = 0;
                         // schedule the next compaction immediately in case there is a pending compaction task
-                        sleep_duration = if has_pending_task { Duration::ZERO } else { period };
+                        sleep_duration = if has_pending_task {
+                            Duration::ZERO
+                        } else {
+                            period
+                        };
                     }
                     Err(e) => {
                         let wait_duration = backoff::exponential_backoff_duration_seconds(
@@ -233,38 +235,20 @@ async fn compaction_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
                 }
 
                 // the duration is recorded by performance tests by enabling debug in this function
-                tracing::debug!(elapsed_ms=elapsed.as_millis(), "compaction iteration complete");
+                tracing::debug!(
+                    elapsed_ms = elapsed.as_millis(),
+                    "compaction iteration complete"
+                );
             };
 
-
             // Perhaps we did no work and the walredo process has been idle for some time:
             // give it a chance to shut down to avoid leaving walredo process running indefinitely.
+            // TODO: move this to a separate task (housekeeping loop) that isn't affected by the back-off,
+            // so we get some upper bound guarantee on when walredo quiesce / this throttling reporting here happens.
             if let Some(walredo_mgr) = &tenant.walredo_mgr {
                 walredo_mgr.maybe_quiesce(period * 10);
             }
 
-            // TODO: move this (and walredo quiesce) to a separate task that isn't affected by the back-off,
-            // so we get some upper bound guarantee on when walredo quiesce / this throttling reporting here happens.
-            info_span!(parent: None, "timeline_get_throttle", tenant_id=%tenant.tenant_shard_id, shard_id=%tenant.tenant_shard_id.shard_slug()).in_scope(|| {
-                let now = Instant::now();
-                let prev = std::mem::replace(&mut last_throttle_flag_reset_at, now);
-                let Stats { count_accounted, count_throttled, sum_throttled_usecs } = tenant.timeline_get_throttle.reset_stats();
-                if count_throttled == 0 {
-                    return;
-                }
-                let allowed_rps = tenant.timeline_get_throttle.steady_rps();
-                let delta = now - prev;
-                info!(
-                    n_seconds=%format_args!("{:.3}",
-                    delta.as_secs_f64()),
-                    count_accounted,
-                    count_throttled,
-                    sum_throttled_usecs,
-                    allowed_rps=%format_args!("{allowed_rps:.0}"),
-                    "shard was throttled in the last n_seconds"
-                );
-            });
-
             // Sleep
             if tokio::time::timeout(sleep_duration, cancel.cancelled())
                 .await
@@ -437,6 +421,7 @@ async fn gc_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
 async fn ingest_housekeeping_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
     TENANT_TASK_EVENTS.with_label_values(&["start"]).inc();
     async {
+    let mut last_throttle_flag_reset_at = Instant::now();
         loop {
             tokio::select! {
                 _ = cancel.cancelled() => {
@@ -483,6 +468,29 @@ async fn ingest_housekeeping_loop(tenant: Arc<Tenant>, cancel: CancellationToken
                 kind: BackgroundLoopKind::IngestHouseKeeping,
             };
             iteration.run(tenant.ingest_housekeeping()).await;
+
+            // TODO: rename the background loop kind to something more generic, like, tenant housekeeping.
+            // Or just spawn another background loop for this throttle, it's not like it's super costly.
+            info_span!(parent: None, "timeline_get_throttle", tenant_id=%tenant.tenant_shard_id, shard_id=%tenant.tenant_shard_id.shard_slug()).in_scope(|| {
+                let now = Instant::now();
+                let prev = std::mem::replace(&mut last_throttle_flag_reset_at, now);
+                let Stats { count_accounted_start, count_accounted_finish, count_throttled, sum_throttled_usecs} = tenant.timeline_get_throttle.reset_stats();
+                if count_throttled == 0 {
+                    return;
+                }
+                let allowed_rps = tenant.timeline_get_throttle.steady_rps();
+                let delta = now - prev;
+                info!(
+                    n_seconds=%format_args!("{:.3}",
+                    delta.as_secs_f64()),
+                    count_accounted = count_accounted_finish,  // don't break existing log scraping
+                    count_throttled,
+                    sum_throttled_usecs,
+                    count_accounted_start, // log after pre-existing fields to not break existing log scraping
+                    allowed_rps=%format_args!("{allowed_rps:.0}"),
+                    "shard was throttled in the last n_seconds"
+                );
+            });
         }
     }
     .await;
diff --git a/pageserver/src/tenant/throttle.rs b/pageserver/src/tenant/throttle.rs
index f222e708e1..6a80953901 100644
--- a/pageserver/src/tenant/throttle.rs
+++ b/pageserver/src/tenant/throttle.rs
@@ -24,8 +24,10 @@ use crate::{context::RequestContext, task_mgr::TaskKind};
 pub struct Throttle<M: Metric> {
     inner: ArcSwap<Inner>,
     metric: M,
-    /// will be turned into [`Stats::count_accounted`]
-    count_accounted: AtomicU64,
+    /// will be turned into [`Stats::count_accounted_start`]
+    count_accounted_start: AtomicU64,
+    /// will be turned into [`Stats::count_accounted_finish`]
+    count_accounted_finish: AtomicU64,
     /// will be turned into [`Stats::count_throttled`]
     count_throttled: AtomicU64,
     /// will be turned into [`Stats::sum_throttled_usecs`]
@@ -43,17 +45,21 @@ pub struct Observation {
     pub wait_time: Duration,
 }
 pub trait Metric {
+    fn accounting_start(&self);
+    fn accounting_finish(&self);
     fn observe_throttling(&self, observation: &Observation);
 }
 
 /// See [`Throttle::reset_stats`].
 pub struct Stats {
-    // Number of requests that were subject to throttling, i.e., requests of the configured [`Config::task_kinds`].
-    pub count_accounted: u64,
-    // Subset of the `accounted` requests that were actually throttled.
-    // Note that the numbers are stored as two independent atomics, so, there might be a slight drift.
+    /// Number of requests that started [`Throttle::throttle`] calls.
+    pub count_accounted_start: u64,
+    /// Number of requests that finished [`Throttle::throttle`] calls.
+    pub count_accounted_finish: u64,
+    /// Subset of the `accounted` requests that were actually throttled.
+    /// Note that the numbers are stored as two independent atomics, so, there might be a slight drift.
     pub count_throttled: u64,
-    // Sum of microseconds that throttled requests spent waiting for throttling.
+    /// Sum of microseconds that throttled requests spent waiting for throttling.
     pub sum_throttled_usecs: u64,
 }
 
@@ -65,7 +71,8 @@ where
         Self {
             inner: ArcSwap::new(Arc::new(Self::new_inner(config))),
             metric,
-            count_accounted: AtomicU64::new(0),
+            count_accounted_start: AtomicU64::new(0),
+            count_accounted_finish: AtomicU64::new(0),
             count_throttled: AtomicU64::new(0),
             sum_throttled_usecs: AtomicU64::new(0),
         }
@@ -117,11 +124,13 @@ where
     /// This method allows retrieving & resetting that flag.
     /// Useful for periodic reporting.
     pub fn reset_stats(&self) -> Stats {
-        let count_accounted = self.count_accounted.swap(0, Ordering::Relaxed);
+        let count_accounted_start = self.count_accounted_start.swap(0, Ordering::Relaxed);
+        let count_accounted_finish = self.count_accounted_finish.swap(0, Ordering::Relaxed);
         let count_throttled = self.count_throttled.swap(0, Ordering::Relaxed);
         let sum_throttled_usecs = self.sum_throttled_usecs.swap(0, Ordering::Relaxed);
         Stats {
-            count_accounted,
+            count_accounted_start,
+            count_accounted_finish,
             count_throttled,
             sum_throttled_usecs,
         }
@@ -139,9 +148,12 @@ where
         };
         let start = std::time::Instant::now();
 
+        self.metric.accounting_start();
+        self.count_accounted_start.fetch_add(1, Ordering::Relaxed);
         let did_throttle = inner.rate_limiter.acquire(key_count).await;
+        self.count_accounted_finish.fetch_add(1, Ordering::Relaxed);
+        self.metric.accounting_finish();
 
-        self.count_accounted.fetch_add(1, Ordering::Relaxed);
         if did_throttle {
             self.count_throttled.fetch_add(1, Ordering::Relaxed);
             let now = Instant::now();
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index 114a6dd468..c98efd5f71 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -196,9 +196,8 @@ fn drop_wlock<T>(rlock: tokio::sync::RwLockWriteGuard<'_, T>) {
 /// The outward-facing resources required to build a Timeline
 pub struct TimelineResources {
     pub remote_client: RemoteTimelineClient,
-    pub timeline_get_throttle: Arc<
-        crate::tenant::throttle::Throttle<&'static crate::metrics::tenant_throttling::TimelineGet>,
-    >,
+    pub timeline_get_throttle:
+        Arc<crate::tenant::throttle::Throttle<crate::metrics::tenant_throttling::TimelineGet>>,
     pub l0_flush_global_state: l0_flush::L0FlushGlobalState,
 }
 
@@ -406,9 +405,8 @@ pub struct Timeline {
     gc_lock: tokio::sync::Mutex<()>,
 
     /// Cloned from [`super::Tenant::timeline_get_throttle`] on construction.
-    timeline_get_throttle: Arc<
-        crate::tenant::throttle::Throttle<&'static crate::metrics::tenant_throttling::TimelineGet>,
-    >,
+    timeline_get_throttle:
+        Arc<crate::tenant::throttle::Throttle<crate::metrics::tenant_throttling::TimelineGet>>,
 
     /// Keep aux directory cache to avoid it's reconstruction on each update
     pub(crate) aux_files: tokio::sync::Mutex<AuxFilesState>,
diff --git a/test_runner/fixtures/metrics.py b/test_runner/fixtures/metrics.py
index d2db40897e..005dc6cb0d 100644
--- a/test_runner/fixtures/metrics.py
+++ b/test_runner/fixtures/metrics.py
@@ -102,6 +102,11 @@ def histogram(prefix_without_trailing_underscore: str) -> List[str]:
     return [f"{prefix_without_trailing_underscore}_{x}" for x in ["bucket", "count", "sum"]]
 
 
+def counter(name: str) -> str:
+    # the prometheus_client package appends _total to all counters client-side
+    return f"{name}_total"
+
+
 PAGESERVER_PER_TENANT_REMOTE_TIMELINE_CLIENT_METRICS: Tuple[str, ...] = (
     "pageserver_remote_timeline_client_calls_started_total",
     "pageserver_remote_timeline_client_calls_finished_total",
@@ -136,6 +141,10 @@ PAGESERVER_GLOBAL_METRICS: Tuple[str, ...] = (
     "pageserver_tenant_states_count",
     "pageserver_circuit_breaker_broken_total",
     "pageserver_circuit_breaker_unbroken_total",
+    counter("pageserver_tenant_throttling_count_accounted_start_global"),
+    counter("pageserver_tenant_throttling_count_accounted_finish_global"),
+    counter("pageserver_tenant_throttling_wait_usecs_sum_global"),
+    counter("pageserver_tenant_throttling_count_global"),
 )
 
 PAGESERVER_PER_TENANT_METRICS: Tuple[str, ...] = (
@@ -159,6 +168,10 @@ PAGESERVER_PER_TENANT_METRICS: Tuple[str, ...] = (
     "pageserver_evictions_with_low_residence_duration_total",
     "pageserver_aux_file_estimated_size",
     "pageserver_valid_lsn_lease_count",
+    counter("pageserver_tenant_throttling_count_accounted_start"),
+    counter("pageserver_tenant_throttling_count_accounted_finish"),
+    counter("pageserver_tenant_throttling_wait_usecs_sum"),
+    counter("pageserver_tenant_throttling_count"),
     *PAGESERVER_PER_TENANT_REMOTE_TIMELINE_CLIENT_METRICS,
     # "pageserver_directory_entries_count", -- only used if above a certain threshold
     # "pageserver_broken_tenants_count" -- used only for broken