From fef77b0cc981f71238e1117d392ea55ec867e61f Mon Sep 17 00:00:00 2001
From: Vlad Lazar <vlad@neon.tech>
Date: Thu, 15 Aug 2024 09:02:33 +0100
Subject: [PATCH 01/44] safekeeper: consider partial uploads when pulling
 timeline (#8628)

## Problem
The control file records the uploaded partial segment, whose object name
contains the id of the safekeeper that uploaded it. Previously, when a
snapshot of the control file was sent to another sk, that segment would
eventually be gc-ed by the receiving sk. This is incorrect because the
original sk might still need it later.

## Summary of Changes
When sending a snapshot and the control file contains an uploaded
segment:
* Create a copy of the segment in s3 with the destination sk in the
  object name
* Tweak the streamed control file to point to the object created in the
  previous step

Note that the snapshot endpoint now has to know the id of the requestor,
so the api has been extended to include the node id of the destination
sk.

Closes https://github.com/neondatabase/neon/issues/8542
---
 safekeeper/src/control_file.rs           |  42 +++---
 safekeeper/src/http/client.rs            |   7 +-
 safekeeper/src/http/routes.rs            |  11 +-
 safekeeper/src/pull_timeline.rs          |  64 +++++++---
 safekeeper/src/wal_backup.rs             |  10 ++
 safekeeper/src/wal_backup_partial.rs     |  57 ++++++++-
 test_runner/fixtures/neon_fixtures.py    |  23 +++-
 test_runner/regress/test_wal_acceptor.py | 155 ++++++++++++++++++++++-
 8 files changed, 327 insertions(+), 42 deletions(-)
diff --git a/safekeeper/src/control_file.rs b/safekeeper/src/control_file.rs
index d574bb438f..c551cd3122 100644
--- a/safekeeper/src/control_file.rs
+++ b/safekeeper/src/control_file.rs
@@ -164,6 +164,30 @@ impl Deref for FileStorage {
     }
 }
 
+impl TimelinePersistentState {
+    pub(crate) fn write_to_buf(&self) -> Result<Vec<u8>> {
+        let mut buf: Vec<u8> = Vec::new();
+        WriteBytesExt::write_u32::<LittleEndian>(&mut buf, SK_MAGIC)?;
+
+        if self.eviction_state == EvictionState::Present {
+            // temp hack for forward compatibility
+            const PREV_FORMAT_VERSION: u32 = 8;
+            let prev = downgrade_v9_to_v8(self);
+            WriteBytesExt::write_u32::<LittleEndian>(&mut buf, PREV_FORMAT_VERSION)?;
+            prev.ser_into(&mut buf)?;
+        } else {
+            // otherwise, we write the current format version
+            WriteBytesExt::write_u32::<LittleEndian>(&mut buf, SK_FORMAT_VERSION)?;
+            self.ser_into(&mut buf)?;
+        }
+
+        // calculate checksum before resize
+        let checksum = crc32c::crc32c(&buf);
+        buf.extend_from_slice(&checksum.to_le_bytes());
+        Ok(buf)
+    }
+}
+
 #[async_trait::async_trait]
 impl Storage for FileStorage {
     /// Persists state durably to the underlying storage.
@@ -180,24 +204,8 @@ impl Storage for FileStorage {
                 &control_partial_path
             )
         })?;
-        let mut buf: Vec<u8> = Vec::new();
-        WriteBytesExt::write_u32::<LittleEndian>(&mut buf, SK_MAGIC)?;
 
-        if s.eviction_state == EvictionState::Present {
-            // temp hack for forward compatibility
-            const PREV_FORMAT_VERSION: u32 = 8;
-            let prev = downgrade_v9_to_v8(s);
-            WriteBytesExt::write_u32::<LittleEndian>(&mut buf, PREV_FORMAT_VERSION)?;
-            prev.ser_into(&mut buf)?;
-        } else {
-            // otherwise, we write the current format version
-            WriteBytesExt::write_u32::<LittleEndian>(&mut buf, SK_FORMAT_VERSION)?;
-            s.ser_into(&mut buf)?;
-        }
-
-        // calculate checksum before resize
-        let checksum = crc32c::crc32c(&buf);
-        buf.extend_from_slice(&checksum.to_le_bytes());
+        let buf: Vec<u8> = s.write_to_buf()?;
 
         control_partial.write_all(&buf).await.with_context(|| {
             format!(
diff --git a/safekeeper/src/http/client.rs b/safekeeper/src/http/client.rs
index 0bb31c200d..c56f7880d4 100644
--- a/safekeeper/src/http/client.rs
+++ b/safekeeper/src/http/client.rs
@@ -10,7 +10,7 @@
 use reqwest::{IntoUrl, Method, StatusCode};
 use utils::{
     http::error::HttpErrorBody,
-    id::{TenantId, TimelineId},
+    id::{NodeId, TenantId, TimelineId},
     logging::SecretString,
 };
 
@@ -97,10 +97,11 @@ impl Client {
         &self,
         tenant_id: TenantId,
         timeline_id: TimelineId,
+        stream_to: NodeId,
     ) -> Result<reqwest::Response> {
         let uri = format!(
-            "{}/v1/tenant/{}/timeline/{}/snapshot",
-            self.mgmt_api_endpoint, tenant_id, timeline_id
+            "{}/v1/tenant/{}/timeline/{}/snapshot/{}",
+            self.mgmt_api_endpoint, tenant_id, timeline_id, stream_to.0
         );
         self.get(&uri).await
     }
diff --git a/safekeeper/src/http/routes.rs b/safekeeper/src/http/routes.rs
index fe6d325cee..c9defb0bcf 100644
--- a/safekeeper/src/http/routes.rs
+++ b/safekeeper/src/http/routes.rs
@@ -205,6 +205,7 @@ async fn timeline_pull_handler(mut request: Request<Body>) -> Result<Response<Bo
 
 /// Stream tar archive with all timeline data.
 async fn timeline_snapshot_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
+    let destination = parse_request_param(&request, "destination_id")?;
     let ttid = TenantTimelineId::new(
         parse_request_param(&request, "tenant_id")?,
         parse_request_param(&request, "timeline_id")?,
@@ -225,7 +226,13 @@ async fn timeline_snapshot_handler(request: Request<Body>) -> Result<Response<Bo
     // so create the chan and write to it in another task.
     let (tx, rx) = mpsc::channel(1);
 
-    task::spawn(pull_timeline::stream_snapshot(tli, tx));
+    let conf = get_conf(&request);
+    task::spawn(pull_timeline::stream_snapshot(
+        tli,
+        conf.my_id,
+        destination,
+        tx,
+    ));
 
     let rx_stream = ReceiverStream::new(rx);
     let body = Body::wrap_stream(rx_stream);
@@ -565,7 +572,7 @@ pub fn make_router(conf: SafeKeeperConf) -> RouterBuilder<hyper::Body, ApiError>
             request_span(r, tenant_delete_handler)
         })
         .get(
-            "/v1/tenant/:tenant_id/timeline/:timeline_id/snapshot",
+            "/v1/tenant/:tenant_id/timeline/:timeline_id/snapshot/:destination_id",
             |r| request_span(r, timeline_snapshot_handler),
         )
         .post("/v1/pull_timeline", |r| {
diff --git a/safekeeper/src/pull_timeline.rs b/safekeeper/src/pull_timeline.rs
index 618c6b278f..1eacec9981 100644
--- a/safekeeper/src/pull_timeline.rs
+++ b/safekeeper/src/pull_timeline.rs
@@ -11,13 +11,8 @@ use std::{
     io::{self, ErrorKind},
     sync::Arc,
 };
-use tokio::{
-    fs::{File, OpenOptions},
-    io::AsyncWrite,
-    sync::mpsc,
-    task,
-};
-use tokio_tar::{Archive, Builder};
+use tokio::{fs::OpenOptions, io::AsyncWrite, sync::mpsc, task};
+use tokio_tar::{Archive, Builder, Header};
 use tokio_util::{
     io::{CopyToBytes, SinkWriter},
     sync::PollSender,
@@ -32,13 +27,15 @@ use crate::{
         routes::TimelineStatus,
     },
     safekeeper::Term,
+    state::TimelinePersistentState,
     timeline::{get_tenant_dir, get_timeline_dir, Timeline, TimelineError, WalResidentTimeline},
+    wal_backup,
     wal_storage::{self, open_wal_file, Storage},
     GlobalTimelines, SafeKeeperConf,
 };
 use utils::{
     crashsafe::{durable_rename, fsync_async_opt},
-    id::{TenantId, TenantTimelineId, TimelineId},
+    id::{NodeId, TenantId, TenantTimelineId, TimelineId},
     logging::SecretString,
     lsn::Lsn,
     pausable_failpoint,
@@ -46,8 +43,13 @@ use utils::{
 
 /// Stream tar archive of timeline to tx.
 #[instrument(name = "snapshot", skip_all, fields(ttid = %tli.ttid))]
-pub async fn stream_snapshot(tli: WalResidentTimeline, tx: mpsc::Sender<Result<Bytes>>) {
-    if let Err(e) = stream_snapshot_guts(tli, tx.clone()).await {
+pub async fn stream_snapshot(
+    tli: WalResidentTimeline,
+    source: NodeId,
+    destination: NodeId,
+    tx: mpsc::Sender<Result<Bytes>>,
+) {
+    if let Err(e) = stream_snapshot_guts(tli, source, destination, tx.clone()).await {
         // Error type/contents don't matter as they won't can't reach the client
         // (hyper likely doesn't do anything with it), but http stream will be
         // prematurely terminated. It would be nice to try to send the error in
@@ -81,6 +83,8 @@ impl Drop for SnapshotContext {
 
 pub async fn stream_snapshot_guts(
     tli: WalResidentTimeline,
+    source: NodeId,
+    destination: NodeId,
     tx: mpsc::Sender<Result<Bytes>>,
 ) -> Result<()> {
     // tokio-tar wants Write implementor, but we have mpsc tx <Result<Bytes>>;
@@ -104,7 +108,7 @@ pub async fn stream_snapshot_guts(
     // which is also likely suboptimal.
     let mut ar = Builder::new_non_terminated(pinned_writer);
 
-    let bctx = tli.start_snapshot(&mut ar).await?;
+    let bctx = tli.start_snapshot(&mut ar, source, destination).await?;
     pausable_failpoint!("sk-snapshot-after-list-pausable");
 
     let tli_dir = tli.get_timeline_dir();
@@ -158,13 +162,43 @@ impl WalResidentTimeline {
     async fn start_snapshot<W: AsyncWrite + Unpin + Send>(
         &self,
         ar: &mut tokio_tar::Builder<W>,
+        source: NodeId,
+        destination: NodeId,
     ) -> Result<SnapshotContext> {
         let mut shared_state = self.write_shared_state().await;
         let wal_seg_size = shared_state.get_wal_seg_size();
 
-        let cf_path = self.get_timeline_dir().join(CONTROL_FILE_NAME);
-        let mut cf = File::open(cf_path).await?;
-        ar.append_file(CONTROL_FILE_NAME, &mut cf).await?;
+        let mut control_store = TimelinePersistentState::clone(shared_state.sk.state());
+        // Modify the partial segment of the in-memory copy for the control file to
+        // point to the destination safekeeper.
+        let replace = control_store
+            .partial_backup
+            .replace_uploaded_segment(source, destination)?;
+
+        if let Some(replace) = replace {
+            // The deserialized control file has an uploaded partial. We upload a copy
+            // of it to object storage for the destination safekeeper and send an updated
+            // control file in the snapshot.
+            tracing::info!(
+                "Replacing uploaded partial segment in in-mem control file: {replace:?}"
+            );
+
+            let remote_timeline_path = wal_backup::remote_timeline_path(&self.tli.ttid)?;
+            wal_backup::copy_partial_segment(
+                &replace.previous.remote_path(&remote_timeline_path),
+                &replace.current.remote_path(&remote_timeline_path),
+            )
+            .await?;
+        }
+
+        let buf = control_store
+            .write_to_buf()
+            .with_context(|| "failed to serialize control store")?;
+        let mut header = Header::new_gnu();
+        header.set_size(buf.len().try_into().expect("never breaches u64"));
+        ar.append_data(&mut header, CONTROL_FILE_NAME, buf.as_slice())
+            .await
+            .with_context(|| "failed to append to archive")?;
 
         // We need to stream since the oldest segment someone (s3 or pageserver)
         // still needs. This duplicates calc_horizon_lsn logic.
@@ -342,7 +376,7 @@ async fn pull_timeline(
     let client = Client::new(host.clone(), sk_auth_token.clone());
     // Request stream with basebackup archive.
     let bb_resp = client
-        .snapshot(status.tenant_id, status.timeline_id)
+        .snapshot(status.tenant_id, status.timeline_id, conf.my_id)
         .await?;
 
     // Make Stream of Bytes from it...
diff --git a/safekeeper/src/wal_backup.rs b/safekeeper/src/wal_backup.rs
index 234273e133..aa1a6696a1 100644
--- a/safekeeper/src/wal_backup.rs
+++ b/safekeeper/src/wal_backup.rs
@@ -483,6 +483,16 @@ pub(crate) async fn backup_partial_segment(
         .await
 }
 
+pub(crate) async fn copy_partial_segment(
+    source: &RemotePath,
+    destination: &RemotePath,
+) -> Result<()> {
+    let storage = get_configured_remote_storage();
+    let cancel = CancellationToken::new();
+
+    storage.copy_object(source, destination, &cancel).await
+}
+
 pub async fn read_object(
     file_path: &RemotePath,
     offset: u64,
diff --git a/safekeeper/src/wal_backup_partial.rs b/safekeeper/src/wal_backup_partial.rs
index 52765b0e98..675a051887 100644
--- a/safekeeper/src/wal_backup_partial.rs
+++ b/safekeeper/src/wal_backup_partial.rs
@@ -17,14 +17,13 @@
 //! file. Code updates state in the control file before doing any S3 operations.
 //! This way control file stores information about all potentially existing
 //! remote partial segments and can clean them up after uploading a newer version.
-
 use camino::Utf8PathBuf;
 use postgres_ffi::{XLogFileName, XLogSegNo, PG_TLI};
 use remote_storage::RemotePath;
 use serde::{Deserialize, Serialize};
 
 use tracing::{debug, error, info, instrument, warn};
-use utils::lsn::Lsn;
+use utils::{id::NodeId, lsn::Lsn};
 
 use crate::{
     metrics::{MISC_OPERATION_SECONDS, PARTIAL_BACKUP_UPLOADED_BYTES, PARTIAL_BACKUP_UPLOADS},
@@ -82,6 +81,12 @@ pub struct State {
     pub segments: Vec<PartialRemoteSegment>,
 }
 
+#[derive(Debug)]
+pub(crate) struct ReplaceUploadedSegment {
+    pub(crate) previous: PartialRemoteSegment,
+    pub(crate) current: PartialRemoteSegment,
+}
+
 impl State {
     /// Find an Uploaded segment. There should be only one Uploaded segment at a time.
     pub(crate) fn uploaded_segment(&self) -> Option<PartialRemoteSegment> {
@@ -90,6 +95,54 @@ impl State {
             .find(|seg| seg.status == UploadStatus::Uploaded)
             .cloned()
     }
+
+    /// Replace the name of the Uploaded segment (if one exists) in order to match
+    /// it with `destination` safekeeper. Returns a description of the change or None
+    /// wrapped in anyhow::Result.
+    pub(crate) fn replace_uploaded_segment(
+        &mut self,
+        source: NodeId,
+        destination: NodeId,
+    ) -> anyhow::Result<Option<ReplaceUploadedSegment>> {
+        let current = self
+            .segments
+            .iter_mut()
+            .find(|seg| seg.status == UploadStatus::Uploaded);
+
+        let current = match current {
+            Some(some) => some,
+            None => {
+                return anyhow::Ok(None);
+            }
+        };
+
+        // Sanity check that the partial segment we are replacing belongs
+        // to the `source` SK.
+        if !current
+            .name
+            .ends_with(format!("sk{}.partial", source.0).as_str())
+        {
+            anyhow::bail!(
+                "Partial segment name ({}) doesn't match self node id ({})",
+                current.name,
+                source
+            );
+        }
+
+        let previous = current.clone();
+
+        let new_name = current.name.replace(
+            format!("_sk{}", source.0).as_str(),
+            format!("_sk{}", destination.0).as_str(),
+        );
+
+        current.name = new_name;
+
+        anyhow::Ok(Some(ReplaceUploadedSegment {
+            previous,
+            current: current.clone(),
+        }))
+    }
 }
 
 struct PartialBackup {
diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index aaa1f21997..b76432127d 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -67,6 +67,7 @@ from fixtures.pageserver.utils import (
 from fixtures.pg_version import PgVersion
 from fixtures.port_distributor import PortDistributor
 from fixtures.remote_storage import (
+    LocalFsStorage,
     MockS3Server,
     RemoteStorage,
     RemoteStorageKind,
@@ -4425,14 +4426,32 @@ class Safekeeper(LogUtils):
     def timeline_dir(self, tenant_id, timeline_id) -> Path:
         return self.data_dir / str(tenant_id) / str(timeline_id)
 
+    def list_uploaded_segments(self, tenant_id: TenantId, timeline_id: TimelineId):
+        tline_path = (
+            self.env.repo_dir
+            / "local_fs_remote_storage"
+            / "safekeeper"
+            / str(tenant_id)
+            / str(timeline_id)
+        )
+        assert isinstance(self.env.safekeepers_remote_storage, LocalFsStorage)
+        return self._list_segments_in_dir(
+            tline_path, lambda name: ".metadata" not in name and ".___temp" not in name
+        )
+
     def list_segments(self, tenant_id, timeline_id) -> List[str]:
         """
         Get list of segment names of the given timeline.
         """
         tli_dir = self.timeline_dir(tenant_id, timeline_id)
+        return self._list_segments_in_dir(
+            tli_dir, lambda name: not name.startswith("safekeeper.control")
+        )
+
+    def _list_segments_in_dir(self, path: Path, keep_filter: Callable[[str], bool]) -> list[str]:
         segments = []
-        for _, _, filenames in os.walk(tli_dir):
-            segments.extend([f for f in filenames if not f.startswith("safekeeper.control")])
+        for _, _, filenames in os.walk(path):
+            segments.extend([f for f in filenames if keep_filter(f)])
         segments.sort()
         return segments
 
diff --git a/test_runner/regress/test_wal_acceptor.py b/test_runner/regress/test_wal_acceptor.py
index bf7829fc84..5d3b263936 100644
--- a/test_runner/regress/test_wal_acceptor.py
+++ b/test_runner/regress/test_wal_acceptor.py
@@ -49,7 +49,13 @@ from fixtures.remote_storage import (
 )
 from fixtures.safekeeper.http import SafekeeperHttpClient
 from fixtures.safekeeper.utils import are_walreceivers_absent
-from fixtures.utils import PropagatingThread, get_dir_size, query_scalar, start_in_background
+from fixtures.utils import (
+    PropagatingThread,
+    get_dir_size,
+    query_scalar,
+    start_in_background,
+    wait_until,
+)
 
 
 def wait_lsn_force_checkpoint(
@@ -63,6 +69,18 @@ def wait_lsn_force_checkpoint(
     lsn = Lsn(endpoint.safe_psql("SELECT pg_current_wal_flush_lsn()")[0][0])
     log.info(f"pg_current_wal_flush_lsn is {lsn}, waiting for it on pageserver")
 
+    wait_lsn_force_checkpoint_at(lsn, tenant_id, timeline_id, ps, pageserver_conn_options)
+
+
+def wait_lsn_force_checkpoint_at(
+    lsn: Lsn,
+    tenant_id: TenantId,
+    timeline_id: TimelineId,
+    ps: NeonPageserver,
+    pageserver_conn_options=None,
+):
+    pageserver_conn_options = pageserver_conn_options or {}
+
     auth_token = None
     if "password" in pageserver_conn_options:
         auth_token = pageserver_conn_options["password"]
@@ -2304,3 +2322,138 @@ def test_s3_eviction(
     )
 
     assert event_metrics_seen
+
+
+def test_pull_timeline_partial_segment_integrity(neon_env_builder: NeonEnvBuilder):
+    """
+    Verify that pulling timeline from a SK with an uploaded partial segment
+    does not lead to consistency issues:
+    1. Start 3 SKs - only use two
+    2. Ingest a bit of WAL
+    3. Wait for partial to be uploaded
+    4. Pull timeline to the third SK
+    5. Wait for source SK to evict timeline
+    6. Replace source with destination SK and start compute
+    7. Go back to initial compute SK config and validate that
+    source SK can unevict the timeline (S3 state is consistent)
+    """
+    neon_env_builder.auth_enabled = True
+    neon_env_builder.num_safekeepers = 3
+    neon_env_builder.enable_safekeeper_remote_storage(default_remote_storage())
+
+    neon_env_builder.safekeeper_extra_opts = [
+        "--enable-offload",
+        "--delete-offloaded-wal",
+        "--partial-backup-timeout",
+        "500ms",
+        "--control-file-save-interval",
+        "500ms",
+        "--eviction-min-resident=500ms",
+    ]
+
+    env = neon_env_builder.init_start(initial_tenant_conf={"checkpoint_timeout": "100ms"})
+    tenant_id = env.initial_tenant
+    timeline_id = env.initial_timeline
+
+    (src_sk, dst_sk) = (env.safekeepers[0], env.safekeepers[2])
+
+    log.info("use only first 2 safekeepers, 3rd will be seeded")
+    endpoint = env.endpoints.create("main")
+    endpoint.active_safekeepers = [1, 2]
+    endpoint.start()
+    endpoint.safe_psql("create table t(key int, value text)")
+    endpoint.safe_psql("insert into t select generate_series(1, 180000), 'papaya'")
+
+    endpoint.stop()
+
+    def source_partial_segment_uploaded():
+        first_segment_name = "000000010000000000000001"
+        segs = src_sk.list_uploaded_segments(tenant_id, timeline_id)
+
+        candidate_seg = None
+        for seg in segs:
+            if "partial" in seg and "sk1" in seg and not seg.startswith(first_segment_name):
+                candidate_seg = seg
+
+        if candidate_seg is not None:
+            # The term might change, causing the segment to be gc-ed shortly after,
+            # so give it a bit of time to make sure it's stable.
+            time.sleep(2)
+
+            segs = src_sk.list_uploaded_segments(tenant_id, timeline_id)
+            assert candidate_seg in segs
+            return candidate_seg
+
+        raise Exception("Partial segment not uploaded yet")
+
+    source_partial_segment = wait_until(15, 1, source_partial_segment_uploaded)
+    log.info(
+        f"Uploaded segments before pull are {src_sk.list_uploaded_segments(tenant_id, timeline_id)}"
+    )
+    log.info(f"Tracking source partial segment: {source_partial_segment}")
+
+    src_flush_lsn = src_sk.get_flush_lsn(tenant_id, timeline_id)
+    log.info(f"flush_lsn on src before pull_timeline: {src_flush_lsn}")
+
+    pageserver_conn_options = {"password": env.auth_keys.generate_tenant_token(tenant_id)}
+    wait_lsn_force_checkpoint_at(
+        src_flush_lsn, tenant_id, timeline_id, env.pageserver, pageserver_conn_options
+    )
+
+    dst_sk.pull_timeline([src_sk], tenant_id, timeline_id)
+
+    def evicted():
+        evictions = src_sk.http_client().get_metric_value(
+            "safekeeper_eviction_events_completed_total", {"kind": "evict"}
+        )
+
+        if evictions is None or evictions == 0:
+            raise Exception("Eviction did not happen on source safekeeper yet")
+
+    wait_until(30, 1, evicted)
+
+    endpoint.start(safekeepers=[2, 3])
+
+    def new_partial_segment_uploaded():
+        segs = src_sk.list_uploaded_segments(tenant_id, timeline_id)
+        for seg in segs:
+            if "partial" in seg and "sk3" in seg:
+                return seg
+
+        raise Exception("Partial segment not uploaded yet")
+
+    log.info(
+        f"Uploaded segments before post-pull ingest are {src_sk.list_uploaded_segments(tenant_id, timeline_id)}"
+    )
+
+    endpoint.safe_psql("insert into t select generate_series(1, 1000), 'pear'")
+    wait_until(15, 1, new_partial_segment_uploaded)
+
+    log.info(
+        f"Uploaded segments after post-pull ingest are {src_sk.list_uploaded_segments(tenant_id, timeline_id)}"
+    )
+
+    # Allow for some gc iterations to happen and assert that the original
+    # uploaded partial segment remains in place.
+    time.sleep(5)
+    segs = src_sk.list_uploaded_segments(tenant_id, timeline_id)
+    assert source_partial_segment in segs
+
+    log.info(
+        f"Uploaded segments at the end are {src_sk.list_uploaded_segments(tenant_id, timeline_id)}"
+    )
+
+    # Restart the endpoint in order to check that the source safekeeper
+    # can unevict the timeline
+    endpoint.stop()
+    endpoint.start(safekeepers=[1, 2])
+
+    def unevicted():
+        unevictions = src_sk.http_client().get_metric_value(
+            "safekeeper_eviction_events_completed_total", {"kind": "restore"}
+        )
+
+        if unevictions is None or unevictions == 0:
+            raise Exception("Uneviction did not happen on source safekeeper yet")
+
+    wait_until(10, 1, unevicted)

From a9c28be7d02226032f153edf6c7b527aec9fa5db Mon Sep 17 00:00:00 2001
From: Alexander Bayandin <alexander@neon.tech>
Date: Thu, 15 Aug 2024 10:06:28 +0100
Subject: [PATCH 02/44] fix(pageserver): allow unused_imports in download.rs on
 macOS (#8733)

## Problem

On macOS, clippy fails with the following error:

```
error: unused import: `crate::virtual_file::owned_buffers_io::io_buf_ext::IoBufExt`
  --> pageserver/src/tenant/remote_timeline_client/download.rs:26:5
   |
26 | use crate::virtual_file::owned_buffers_io::io_buf_ext::IoBufExt;
   |     ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
   |
   = note: `-D unused-imports` implied by `-D warnings`
   = help: to override `-D warnings` add `#[allow(unused_imports)]`
```

Introduced in https://github.com/neondatabase/neon/pull/8717

## Summary of changes
- allow `unused_imports` for
`crate::virtual_file::owned_buffers_io::io_buf_ext::IoBufExt` on macOS
in download.rs
---
 pageserver/src/tenant/remote_timeline_client/download.rs | 1 +
 1 file changed, 1 insertion(+)

diff --git a/pageserver/src/tenant/remote_timeline_client/download.rs b/pageserver/src/tenant/remote_timeline_client/download.rs
index 8199218c3c..d9725ad756 100644
--- a/pageserver/src/tenant/remote_timeline_client/download.rs
+++ b/pageserver/src/tenant/remote_timeline_client/download.rs
@@ -23,6 +23,7 @@ use crate::span::debug_assert_current_span_has_tenant_and_timeline_id;
 use crate::tenant::remote_timeline_client::{remote_layer_path, remote_timelines_path};
 use crate::tenant::storage_layer::LayerName;
 use crate::tenant::Generation;
+#[cfg_attr(target_os = "macos", allow(unused_imports))]
 use crate::virtual_file::owned_buffers_io::io_buf_ext::IoBufExt;
 use crate::virtual_file::{on_fatal_io_error, MaybeFatalIo, VirtualFile};
 use crate::TEMP_FILE_SUFFIX;

From d9a57aeed9ca9b0e2134e7183355d52fb6a089d1 Mon Sep 17 00:00:00 2001
From: Joonas Koivunen <joonas@neon.tech>
Date: Thu, 15 Aug 2024 12:54:05 +0300
Subject: [PATCH 03/44] storcon: deny external node configuration if an
 operation is ongoing (#8727)

Per #8674, disallow node configuration while drain/fill are ongoing.
Implement it by adding an HTTP-only wrapper
`Service::external_node_configure` which checks for an ongoing operation
before configuring.

Additionally:
- allow cancelling drain/fill after a pageserver has restarted and
transitioned to WarmingUp

Fixes: #8674
---
 libs/pageserver_api/src/controller_api.rs     |  3 --
 storage_controller/src/http.rs                |  2 +-
 storage_controller/src/service.rs             | 42 +++++++++++-------
 .../regress/test_storage_controller.py        | 44 +++++++++++++++++++
 4 files changed, 70 insertions(+), 21 deletions(-)

diff --git a/libs/pageserver_api/src/controller_api.rs b/libs/pageserver_api/src/controller_api.rs
index a5b452da83..a50707a1b8 100644
--- a/libs/pageserver_api/src/controller_api.rs
+++ b/libs/pageserver_api/src/controller_api.rs
@@ -313,20 +313,17 @@ pub struct MetadataHealthUpdateRequest {
 pub struct MetadataHealthUpdateResponse {}
 
 #[derive(Serialize, Deserialize, Debug)]
-
 pub struct MetadataHealthListUnhealthyResponse {
     pub unhealthy_tenant_shards: Vec<TenantShardId>,
 }
 
 #[derive(Serialize, Deserialize, Debug)]
-
 pub struct MetadataHealthListOutdatedRequest {
     #[serde(with = "humantime_serde")]
     pub not_scrubbed_for: Duration,
 }
 
 #[derive(Serialize, Deserialize, Debug)]
-
 pub struct MetadataHealthListOutdatedResponse {
     pub health_records: Vec<MetadataHealthRecord>,
 }
diff --git a/storage_controller/src/http.rs b/storage_controller/src/http.rs
index e8513b31eb..e755aaed19 100644
--- a/storage_controller/src/http.rs
+++ b/storage_controller/src/http.rs
@@ -500,7 +500,7 @@ async fn handle_node_configure(mut req: Request<Body>) -> Result<Response<Body>,
         StatusCode::OK,
         state
             .service
-            .node_configure(
+            .external_node_configure(
                 config_req.node_id,
                 config_req.availability.map(NodeAvailability::from),
                 config_req.scheduling,
diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs
index ef4cd91efd..d717924ae6 100644
--- a/storage_controller/src/service.rs
+++ b/storage_controller/src/service.rs
@@ -4912,6 +4912,26 @@ impl Service {
         Ok(())
     }
 
+    /// Wrapper around [`Self::node_configure`] which only allows changes while there is no ongoing
+    /// operation for HTTP api.
+    pub(crate) async fn external_node_configure(
+        &self,
+        node_id: NodeId,
+        availability: Option<NodeAvailability>,
+        scheduling: Option<NodeSchedulingPolicy>,
+    ) -> Result<(), ApiError> {
+        {
+            let locked = self.inner.read().unwrap();
+            if let Some(op) = locked.ongoing_operation.as_ref().map(|op| op.operation) {
+                return Err(ApiError::PreconditionFailed(
+                    format!("Ongoing background operation forbids configuring: {op}").into(),
+                ));
+            }
+        }
+
+        self.node_configure(node_id, availability, scheduling).await
+    }
+
     pub(crate) async fn start_node_drain(
         self: &Arc<Self>,
         node_id: NodeId,
@@ -5017,14 +5037,14 @@ impl Service {
     }
 
     pub(crate) async fn cancel_node_drain(&self, node_id: NodeId) -> Result<(), ApiError> {
-        let (node_available, node_policy) = {
+        let node_available = {
             let locked = self.inner.read().unwrap();
             let nodes = &locked.nodes;
             let node = nodes.get(&node_id).ok_or(ApiError::NotFound(
                 anyhow::anyhow!("Node {} not registered", node_id).into(),
             ))?;
 
-            (node.is_available(), node.get_scheduling())
+            node.is_available()
         };
 
         if !node_available {
@@ -5033,12 +5053,6 @@ impl Service {
             ));
         }
 
-        if !matches!(node_policy, NodeSchedulingPolicy::Draining) {
-            return Err(ApiError::PreconditionFailed(
-                format!("Node {node_id} has no drain in progress").into(),
-            ));
-        }
-
         if let Some(op_handler) = self.inner.read().unwrap().ongoing_operation.as_ref() {
             if let Operation::Drain(drain) = op_handler.operation {
                 if drain.node_id == node_id {
@@ -5152,14 +5166,14 @@ impl Service {
     }
 
     pub(crate) async fn cancel_node_fill(&self, node_id: NodeId) -> Result<(), ApiError> {
-        let (node_available, node_policy) = {
+        let node_available = {
             let locked = self.inner.read().unwrap();
             let nodes = &locked.nodes;
             let node = nodes.get(&node_id).ok_or(ApiError::NotFound(
                 anyhow::anyhow!("Node {} not registered", node_id).into(),
             ))?;
 
-            (node.is_available(), node.get_scheduling())
+            node.is_available()
         };
 
         if !node_available {
@@ -5168,12 +5182,6 @@ impl Service {
             ));
         }
 
-        if !matches!(node_policy, NodeSchedulingPolicy::Filling) {
-            return Err(ApiError::PreconditionFailed(
-                format!("Node {node_id} has no fill in progress").into(),
-            ));
-        }
-
         if let Some(op_handler) = self.inner.read().unwrap().ongoing_operation.as_ref() {
             if let Operation::Fill(fill) = op_handler.operation {
                 if fill.node_id == node_id {
@@ -5982,7 +5990,7 @@ impl Service {
                 .await_waiters_remainder(waiters, SHORT_RECONCILE_TIMEOUT)
                 .await;
 
-            failpoint_support::sleep_millis_async!("sleepy-drain-loop");
+            failpoint_support::sleep_millis_async!("sleepy-drain-loop", &cancel);
         }
 
         while !waiters.is_empty() {
diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py
index 9b2557a165..7d98ff2923 100644
--- a/test_runner/regress/test_storage_controller.py
+++ b/test_runner/regress/test_storage_controller.py
@@ -2091,3 +2091,47 @@ def test_storage_controller_step_down(neon_env_builder: NeonEnvBuilder):
         )
         == 0
     )
+
+
+def test_storage_controller_ps_restarted_during_drain(neon_env_builder: NeonEnvBuilder):
+    # single unsharded tenant, two locations
+    neon_env_builder.num_pageservers = 2
+
+    env = neon_env_builder.init_start()
+
+    env.storage_controller.tenant_policy_update(env.initial_tenant, {"placement": {"Attached": 1}})
+    env.storage_controller.reconcile_until_idle()
+
+    attached_id = int(env.storage_controller.locate(env.initial_tenant)[0]["node_id"])
+    attached = next((ps for ps in env.pageservers if ps.id == attached_id))
+
+    def attached_is_draining():
+        details = env.storage_controller.node_status(attached.id)
+        assert details["scheduling"] == "Draining"
+
+    env.storage_controller.configure_failpoints(("sleepy-drain-loop", "return(10000)"))
+    env.storage_controller.node_drain(attached.id)
+
+    wait_until(10, 0.5, attached_is_draining)
+
+    attached.restart()
+
+    # we are unable to reconfigure node while the operation is still ongoing
+    with pytest.raises(
+        StorageControllerApiException,
+        match="Precondition failed: Ongoing background operation forbids configuring: drain.*",
+    ):
+        env.storage_controller.node_configure(attached.id, {"scheduling": "Pause"})
+    with pytest.raises(
+        StorageControllerApiException,
+        match="Precondition failed: Ongoing background operation forbids configuring: drain.*",
+    ):
+        env.storage_controller.node_configure(attached.id, {"availability": "Offline"})
+
+    env.storage_controller.cancel_node_drain(attached.id)
+
+    def reconfigure_node_again():
+        env.storage_controller.node_configure(attached.id, {"scheduling": "Pause"})
+
+    # allow for small delay between actually having cancelled and being able reconfigure again
+    wait_until(4, 0.5, reconfigure_node_again)

From 52641eb8533ec0bdd70523f2595a0265c9208dc7 Mon Sep 17 00:00:00 2001
From: Joonas Koivunen <joonas@neon.tech>
Date: Thu, 15 Aug 2024 15:30:04 +0300
Subject: [PATCH 04/44] storcon: add spans to drain/fill ops (#8735)

this way we do not need to repeat the %node_id everywhere, and we get no
stray messages in logs from within the op.
---
 storage_controller/src/service.rs | 24 ++++++++++++++----------
 1 file changed, 14 insertions(+), 10 deletions(-)

diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs
index d717924ae6..84db088a42 100644
--- a/storage_controller/src/service.rs
+++ b/storage_controller/src/service.rs
@@ -4989,6 +4989,8 @@ impl Service {
                     cancel: cancel.clone(),
                 });
 
+                let span = tracing::info_span!(parent: None, "drain_node", %node_id);
+
                 tokio::task::spawn({
                     let service = self.clone();
                     let cancel = cancel.clone();
@@ -5005,21 +5007,21 @@ impl Service {
                             }
                         }
 
-                        tracing::info!(%node_id, "Drain background operation starting");
+                        tracing::info!("Drain background operation starting");
                         let res = service.drain_node(node_id, cancel).await;
                         match res {
                             Ok(()) => {
-                                tracing::info!(%node_id, "Drain background operation completed successfully");
+                                tracing::info!("Drain background operation completed successfully");
                             }
                             Err(OperationError::Cancelled) => {
-                                tracing::info!(%node_id, "Drain background operation was cancelled");
+                                tracing::info!("Drain background operation was cancelled");
                             }
                             Err(err) => {
-                                tracing::error!(%node_id, "Drain background operation encountered: {err}")
+                                tracing::error!("Drain background operation encountered: {err}")
                             }
                         }
                     }
-                });
+                }.instrument(span));
             }
             NodeSchedulingPolicy::Draining => {
                 return Err(ApiError::Conflict(format!(
@@ -5118,6 +5120,8 @@ impl Service {
                     cancel: cancel.clone(),
                 });
 
+                let span = tracing::info_span!(parent: None, "fill_node", %node_id);
+
                 tokio::task::spawn({
                     let service = self.clone();
                     let cancel = cancel.clone();
@@ -5134,21 +5138,21 @@ impl Service {
                             }
                         }
 
-                        tracing::info!(%node_id, "Fill background operation starting");
+                        tracing::info!("Fill background operation starting");
                         let res = service.fill_node(node_id, cancel).await;
                         match res {
                             Ok(()) => {
-                                tracing::info!(%node_id, "Fill background operation completed successfully");
+                                tracing::info!("Fill background operation completed successfully");
                             }
                             Err(OperationError::Cancelled) => {
-                                tracing::info!(%node_id, "Fill background operation was cancelled");
+                                tracing::info!("Fill background operation was cancelled");
                             }
                             Err(err) => {
-                                tracing::error!(%node_id, "Fill background operation encountered: {err}")
+                                tracing::error!("Fill background operation encountered: {err}")
                             }
                         }
                     }
-                });
+                }.instrument(span));
             }
             NodeSchedulingPolicy::Filling => {
                 return Err(ApiError::Conflict(format!(

From 24d347f50b15bb8ba44f0b25589e180e6482e1a8 Mon Sep 17 00:00:00 2001
From: Joonas Koivunen <joonas@neon.tech>
Date: Thu, 15 Aug 2024 16:27:07 +0300
Subject: [PATCH 05/44] storcon: use tracing for logging panics (#8734)

this gives spans for panics, and does not clobber loki output by writing
to stderr while all of the other logging is to stdout.

See: #3475
---
 storage_controller/src/main.rs | 28 +++++++++++++++++-----------
 1 file changed, 17 insertions(+), 11 deletions(-)

diff --git a/storage_controller/src/main.rs b/storage_controller/src/main.rs
index 5a68799141..7387d36690 100644
--- a/storage_controller/src/main.rs
+++ b/storage_controller/src/main.rs
@@ -196,14 +196,26 @@ async fn migration_run(database_url: &str) -> anyhow::Result<()> {
 }
 
 fn main() -> anyhow::Result<()> {
-    let default_panic = std::panic::take_hook();
-    std::panic::set_hook(Box::new(move |info| {
-        default_panic(info);
-        std::process::exit(1);
-    }));
+    logging::init(
+        LogFormat::Plain,
+        logging::TracingErrorLayerEnablement::Disabled,
+        logging::Output::Stdout,
+    )?;
+
+    // log using tracing so we don't get confused output by default hook writing to stderr
+    utils::logging::replace_panic_hook_with_tracing_panic_hook().forget();
 
     let _sentry_guard = init_sentry(Some(GIT_VERSION.into()), &[]);
 
+    let hook = std::panic::take_hook();
+    std::panic::set_hook(Box::new(move |info| {
+        // let sentry send a message (and flush)
+        // and trace the error
+        hook(info);
+
+        std::process::exit(1);
+    }));
+
     tokio::runtime::Builder::new_current_thread()
         // We use spawn_blocking for database operations, so require approximately
         // as many blocking threads as we will open database connections.
@@ -217,12 +229,6 @@ fn main() -> anyhow::Result<()> {
 async fn async_main() -> anyhow::Result<()> {
     let launch_ts = Box::leak(Box::new(LaunchTimestamp::generate()));
 
-    logging::init(
-        LogFormat::Plain,
-        logging::TracingErrorLayerEnablement::Disabled,
-        logging::Output::Stdout,
-    )?;
-
     preinitialize_metrics();
 
     let args = Cli::parse();

From f087423a0111d4fb5ac1e12007447c56b2a1c2a6 Mon Sep 17 00:00:00 2001
From: Konstantin Knizhnik <knizhnik@garret.ru>
Date: Thu, 15 Aug 2024 16:28:25 +0300
Subject: [PATCH 06/44] Handle reload config file request in LR monitor (#8732)

## Problem

Logical replication BGW checking replication lag is not reloading config

## Summary of changes

Add handling of reload config request

## Checklist before requesting a review

- [ ] I have performed a self-review of my code.
- [ ] If it is a core feature, I have added thorough tests.
- [ ] Do we need to implement analytics? if so did you add the relevant
metrics to the dashboard?
- [ ] If this PR requires public announcement, mark it with
/release-notes label and add several sentences in this section.

## Checklist before merging

- [ ] Do not forget to reformat commit message to not include the above
checklist

Co-authored-by: Konstantin Knizhnik <knizhnik@neon.tech>
---
 pgxn/neon/neon.c | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/pgxn/neon/neon.c b/pgxn/neon/neon.c
index 784d0f1da3..fe8e276d1c 100644
--- a/pgxn/neon/neon.c
+++ b/pgxn/neon/neon.c
@@ -192,6 +192,13 @@ LogicalSlotsMonitorMain(Datum main_arg)
 	{
 		XLogRecPtr	cutoff_lsn;
 
+		/* In case of a SIGHUP, just reload the configuration. */
+		if (ConfigReloadPending)
+		{
+			ConfigReloadPending = false;
+			ProcessConfigFile(PGC_SIGHUP);
+		}
+
 		/*
 		 * If there are too many .snap files, just drop all logical slots to
 		 * prevent aux files bloat.

From 4e58fd93216c5274e49488de161dc9ce12abd82d Mon Sep 17 00:00:00 2001
From: Alexander Bayandin <alexander@neon.tech>
Date: Thu, 15 Aug 2024 18:37:15 +0100
Subject: [PATCH 07/44] CI(label-for-external-users): use CI_ACCESS_TOKEN
 (#8738)

## Problem

`secrets.GITHUB_TOKEN` (with any permissions) is not enough to get
a user's membership info if they decide to hide it.

## Summary of changes
- Use `secrets.CI_ACCESS_TOKEN` for `gh api` call
- Use `pull_request_target` instead of `pull_request` event to get
access to secrets
---
 .github/workflows/label-for-external-users.yml | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/.github/workflows/label-for-external-users.yml b/.github/workflows/label-for-external-users.yml
index 7cf5ee254c..585d118dfb 100644
--- a/.github/workflows/label-for-external-users.yml
+++ b/.github/workflows/label-for-external-users.yml
@@ -4,7 +4,7 @@ on:
   issues:
     types:
       - opened
-  pull_request:
+  pull_request_target:
     types:
       - opened
 
@@ -25,7 +25,7 @@ jobs:
     - name: Check whether `${{ github.actor }}` is a member of `${{ github.repository_owner }}`
       id: check-user
       env:
-        GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+        GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }}
       run: |
         if gh api -H "Accept: application/vnd.github+json" -H "X-GitHub-Api-Version: 2022-11-28" "/orgs/${GITHUB_REPOSITORY_OWNER}/members/${GITHUB_ACTOR}"; then
           is_member=true
@@ -45,10 +45,10 @@ jobs:
       issues: write        # for `gh issue edit`
 
     steps:
-    - name: Label new ${{ github.event_name }}
+    - name: Add `${{ env.LABEL }}` label
       env:
         GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-        ITEM_NUMBER: ${{ github.event[github.event_name == 'pull_request' && 'pull_request' || 'issue'].number }}
-        GH_CLI_COMMAND: ${{ github.event_name == 'pull_request' && 'pr' || 'issue' }}
+        ITEM_NUMBER: ${{ github.event[github.event_name == 'pull_request_target' && 'pull_request' || 'issue'].number }}
+        GH_CLI_COMMAND: ${{ github.event_name == 'pull_request_target' && 'pr' || 'issue' }}
       run: |
         gh ${GH_CLI_COMMAND} --repo ${GITHUB_REPOSITORY} edit --add-label=${LABEL} ${ITEM_NUMBER}

From 69cb1ee479ecdc99dd117fe4149b59dd54676fea Mon Sep 17 00:00:00 2001
From: Alexander Bayandin <alexander@neon.tech>
Date: Thu, 15 Aug 2024 22:41:58 +0100
Subject: [PATCH 08/44] CI(replication-tests): store test results & change
 notification channel (#8687)

## Problem

We want to store Nightly Replication test results in the database and
notify the relevant Slack channel about failures

## Summary of changes
- Store test results in the database
- Notify `on-call-compute-staging-stream` about failures
---
 .github/workflows/benchmarking.yml | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/benchmarking.yml b/.github/workflows/benchmarking.yml
index f99a037489..a4a597acde 100644
--- a/.github/workflows/benchmarking.yml
+++ b/.github/workflows/benchmarking.yml
@@ -222,13 +222,20 @@ jobs:
       id: create-allure-report
       if: ${{ !cancelled() }}
       uses: ./.github/actions/allure-report-generate
+      with:
+        store-test-results-into-db: true
+      env:
+        REGRESS_TEST_RESULT_CONNSTR_NEW: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}
 
     - name: Post to a Slack channel
       if: ${{ github.event.schedule && failure() }}
       uses: slackapi/slack-github-action@v1
       with:
-        channel-id: "C033QLM5P7D" # dev-staging-stream
-        slack-message: "Periodic replication testing: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
+        channel-id: "C06T9AMNDQQ" # on-call-compute-staging-stream
+        slack-message: |
+          Periodic replication testing: ${{ job.status }}
+          <${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|GitHub Run>
+          <${{ steps.create-allure-report.outputs.report-url }}|Allure report>
       env:
         SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
 
@@ -330,7 +337,7 @@ jobs:
   prepare_AWS_RDS_databases:
     uses: ./.github/workflows/_benchmarking_preparation.yml
     secrets: inherit
-  
+
   pgbench-compare:
     if: ${{ github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null }}
     needs: [ generate-matrices, prepare_AWS_RDS_databases ]

From df086cd139ee5ecc82bf096fc3fc6ee4397ac983 Mon Sep 17 00:00:00 2001
From: Sasha Krassovsky <sasha@neon.tech>
Date: Thu, 15 Aug 2024 15:34:45 -0700
Subject: [PATCH 09/44] Add logical replication test to exercise snapfiles
 (#8364)

---
 .../performance/test_logical_replication.py   | 82 +++++++++++++++++++
 1 file changed, 82 insertions(+)

diff --git a/test_runner/performance/test_logical_replication.py b/test_runner/performance/test_logical_replication.py
index 4b4ffc1fee..c4e42a7834 100644
--- a/test_runner/performance/test_logical_replication.py
+++ b/test_runner/performance/test_logical_replication.py
@@ -262,3 +262,85 @@ def test_publisher_restart(
             sub_workload.terminate()
     finally:
         pub_workload.terminate()
+
+
+@pytest.mark.remote_cluster
+@pytest.mark.timeout(2 * 60 * 60)
+def test_snap_files(
+    pg_bin: PgBin,
+    benchmark_project_pub: NeonApiEndpoint,
+    zenbenchmark: NeonBenchmarker,
+):
+    """
+    Creates a node with a replication slot. Generates pgbench into the replication slot,
+    then runs pgbench inserts while generating large numbers of snapfiles. Then restarts
+    the node and tries to peek the replication changes.
+    """
+    test_duration_min = 60
+    test_interval_min = 5
+    pgbench_duration = f"-T{test_duration_min * 60 * 2}"
+
+    env = benchmark_project_pub.pgbench_env
+    connstr = benchmark_project_pub.connstr
+    pg_bin.run_capture(["pgbench", "-i", "-s100"], env=env)
+
+    with psycopg2.connect(connstr) as conn:
+        conn.autocommit = True
+        with conn.cursor() as cur:
+            cur.execute("SELECT rolsuper FROM pg_roles WHERE rolname = 'neondb_owner'")
+            is_super = cur.fetchall()[0]
+            assert is_super, "This benchmark won't work if we don't have superuser"
+
+    conn = psycopg2.connect(connstr)
+    conn.autocommit = True
+    cur = conn.cursor()
+    cur.execute("ALTER SYSTEM SET neon.logical_replication_max_snap_files = -1")
+
+    with psycopg2.connect(connstr) as conn:
+        conn.autocommit = True
+        with conn.cursor() as cur:
+            cur.execute("SELECT pg_reload_conf()")
+
+    with psycopg2.connect(connstr) as conn:
+        conn.autocommit = True
+        with conn.cursor() as cur:
+            cur.execute(
+                """
+                DO $$
+                    BEGIN
+                    IF EXISTS (
+                        SELECT 1
+                        FROM pg_replication_slots
+                        WHERE slot_name = 'slotter'
+                    ) THEN
+                        PERFORM pg_drop_replication_slot('slotter');
+                    END IF;
+                END $$;
+            """
+            )
+            cur.execute("SELECT pg_create_logical_replication_slot('slotter', 'test_decoding')")
+
+    workload = pg_bin.run_nonblocking(["pgbench", "-c10", pgbench_duration, "-Mprepared"], env=env)
+    try:
+        start = time.time()
+        prev_measurement = time.time()
+        while time.time() - start < test_duration_min * 60:
+            with psycopg2.connect(connstr) as conn:
+                with conn.cursor() as cur:
+                    cur.execute(
+                        "SELECT count(*) FROM (SELECT pg_log_standby_snapshot() FROM generate_series(1, 10000) g) s"
+                    )
+                    check_pgbench_still_running(workload)
+                    cur.execute(
+                        "SELECT pg_replication_slot_advance('slotter', pg_current_wal_lsn())"
+                    )
+
+            # Measure storage
+            if time.time() - prev_measurement > test_interval_min * 60:
+                storage = benchmark_project_pub.get_synthetic_storage_size()
+                zenbenchmark.record("storage", storage, "B", MetricReport.LOWER_IS_BETTER)
+                prev_measurement = time.time()
+            time.sleep(test_interval_min * 60 / 3)
+
+    finally:
+        workload.terminate()

From 4763a960d103a27250eadd6892368ae77a3d66c4 Mon Sep 17 00:00:00 2001
From: Joonas Koivunen <joonas@neon.tech>
Date: Fri, 16 Aug 2024 08:10:05 +0300
Subject: [PATCH 10/44] chore: log if we have an open layer or any frozen on
 shutdown (#8740)

Some benchmarks are failing with a "long" flushing, which might be
because there is a queue of in-memory layers, or something else. Add
logging to narrow it down.

Private slack DM ref:
https://neondb.slack.com/archives/D049K7HJ9JM/p1723727305238099
---
 pageserver/src/tenant/timeline.rs | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index b4d908b130..01e77fa1b1 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -1645,6 +1645,20 @@ impl Timeline {
         self.last_record_lsn.shutdown();
 
         if try_freeze_and_flush {
+            if let Some((open, frozen)) = self
+                .layers
+                .read()
+                .await
+                .layer_map()
+                .map(|lm| (lm.open_layer.is_some(), lm.frozen_layers.len()))
+                .ok()
+                .filter(|(open, frozen)| *open || *frozen > 0)
+            {
+                tracing::info!(?open, frozen, "flushing and freezing on shutdown");
+            } else {
+                // this is double-shutdown, ignore it
+            }
+
             // we shut down walreceiver above, so, we won't add anything more
             // to the InMemoryLayer; freeze it and wait for all frozen layers
             // to reach the disk & upload queue, then shut the upload queue and

From 7fdc3ea16296ae7ac6f74ed2843ecee454391276 Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki@neon.tech>
Date: Fri, 16 Aug 2024 13:30:53 +0300
Subject: [PATCH 11/44] Add retroactive RFC about physical replication (#8546)

We've had physical replication support for a long time, but we never
created an RFC for the feature. This RFC does that after the fact. Even
though we've already implemented the feature, let's have a design
discussion as if we hadn't done that. It can still be quite insightful.

This is written from a pretty compute-centric viewpoint, not much
on how it works in the control plane.
---
 docs/rfcs/036-physical-replication.md | 265 ++++++++++++++++++++++++++
 1 file changed, 265 insertions(+)
 create mode 100644 docs/rfcs/036-physical-replication.md

diff --git a/docs/rfcs/036-physical-replication.md b/docs/rfcs/036-physical-replication.md
new file mode 100644
index 0000000000..41aced0545
--- /dev/null
+++ b/docs/rfcs/036-physical-replication.md
@@ -0,0 +1,265 @@
+# Physical Replication
+
+This RFC is a bit special in that we have already implemented physical
+replication a long time ago. However, we never properly wrote down all
+the decisions and assumptions, and in the last months when more users
+have started to use the feature, numerous issues have surfaced.
+
+This RFC documents the design decisions that have been made.
+
+## Summary
+
+PostgreSQL has a feature called streaming replication, where a replica
+streams WAL from the primary and continuously applies it. It is also
+known as "physical replication", to distinguish it from logical
+replication.  In PostgreSQL, a replica is initialized by taking a
+physical backup of the primary. In Neon, the replica is initialized
+from a slim "base backup" from the pageserver, just like a primary,
+and the primary and the replicas connect to the same pageserver,
+sharing the storage.
+
+There are two kinds of read-only replicas in Neon:
+- replicas that follow the primary, and
+- "static" replicas that are pinned at a particular LSN.
+
+A static replica is useful e.g. for performing time-travel queries and
+running one-off slow queries without affecting the primary. A replica
+that follows the primary can be used e.g. to scale out read-only
+workloads.
+
+## Motivation
+
+Read-only replicas allow offloading read-only queries. It's useful for
+isolation, if you want to make sure that read-only queries don't
+affect the primary, and it's also an easy way to provide guaranteed
+read-only access to an application, without having to mess with access
+controls.
+
+## Non Goals (if relevant)
+
+This RFC is all about WAL-based *physical* replication. Logical
+replication is a different feature.
+
+Neon also has the capability to launch "static" read-only nodes which
+do not follow the primary, but are pinned to a particular LSN. They
+can be used for long-running one-off queries, or for Point-in-time
+queries. They work similarly to read replicas that follow the primary,
+but some things are simpler: there are no concerns about cache
+invalidation when the data changes on the primary, or worrying about
+transactions that are in-progress on the primary.
+
+## Impacted components (e.g. pageserver, safekeeper, console, etc)
+
+- Control plane launches the replica
+- Replica Postgres instance connects to the safekeepers, to stream the WAL
+- The primary does not know about the standby, except for the hot standby feedback
+- The primary and replicas all connect to the same pageservers
+
+
+# Context
+
+Some useful things to know about hot standby and replicas in
+PostgreSQL.
+
+## PostgreSQL startup sequence
+
+The terms "running" and "start up" are a little imprecise. PostgreSQL
+replica startup goes through several stages:
+
+1. First, the process is started up, and various initialization steps
+   are performed, like initializing shared memory. If you try to
+   connect to the server in this stage, you get an error: ERROR: the
+   database system is starting up. This stage happens very quickly.
+
+2. Then the server reads the checkpoint record from the WAL and starts
+   the WAL replay starting from the checkpoint. This works differently
+   in Neon: we start the WAL replay at the basebackup LSN, not from a
+   checkpoint! If you connect to the server in this state, you get an
+   error: ERROR: the database system is not yet accepting
+   connections. We proceed to the next stage, when the WAL replay sees
+   a running-xacts record. Or in Neon, the "CLOG scanning" mechanism
+   can allow us to move directly to next stage, with all the caveats
+   listed in this RFC.
+
+3. When the running-xacts information is established, the server
+   starts to accept connections normally.
+
+From PostgreSQL's point of view, the server is already running in
+stage 2, even though it's not accepting connections yet. Our
+`compute_ctl` does not consider it as running until stage 3. If the
+transition from stage 2 to 3 doesn't happen fast enough, the control
+plane will mark the start operation as failed.
+
+
+## Decisions, Issues
+
+### Cache invalidation in replica
+
+When a read replica follows the primary in PostgreSQL, it needs to
+stream all the WAL from the primary and apply all the records, to keep
+the local copy of the data consistent with the primary. In Neon, the
+replica can fetch the updated page versions from the pageserver, so
+it's not necessary to apply all the WAL. However, it needs to ensure
+that any pages that are currently in the Postgres buffer cache, or the
+Local File Cache, are either updated, or thrown away so that the next
+read of the page will fetch the latest version.
+
+We choose to apply the WAL records for pages that are already in the
+buffer cache, and skip records for other pages. Somewhat arbitrarily,
+we also apply records affecting catalog relations, fetching the old
+page version from the pageserver if necessary first. See
+`neon_redo_read_buffer_filter()` function.
+
+The replica wouldn't necessarily need to see all the WAL records, only
+the records that apply to cached pages. For simplicity, we do stream
+all the WAL to the replica, and the replica simply ignores WAL records
+that require no action.
+
+Like in PostgreSQL, the read replica maintains a "replay LSN", which
+is the LSN up to which the replica has received and replayed the
+WAL. The replica can lag behind the primary, if it cannot quite keep
+up with the primary, or if a long-running query conflicts with changes
+that are about to be applied, or even intentionally if the user wishes
+to see delayed data (see recovery_min_apply_delay). It's important
+that the replica sees a consistent view of the whole cluster at the
+replay LSN, when it's lagging behind.
+
+In Neon, the replica connects to a safekeeper to get the WAL
+stream. That means that the safekeepers must be able to regurgitate
+the original WAL as far back as the replay LSN of any running read
+replica. (A static read-only node that does not follow the primary
+does not require a WAL stream however). The primary does not need to
+be running, and when it is, the replicas don't incur any extra
+overhead to the primary (see hot standby feedback though).
+
+### In-progress transactions
+
+In PostgreSQL, when a hot standby server starts up, it cannot
+immediately open up for queries (see [PostgreSQL startup
+sequence](#postgresql-startup-sequence)). It first needs to establish a complete list of in-progress
+transactions, including subtransactions, that are running at the
+primary, at the current replay LSN. Normally that happens quickly,
+when the replica sees a "running-xacts" WAL record, because the
+primary writes a running-xacts WAL record at every checkpoint, and in
+PostgreSQL the replica always starts the WAL replay from a checkpoint
+REDO point. (A shutdown checkpoint WAL record also implies that all
+the non-prepared transactions have ended.) If there are a lot of
+subtransactions in progress, however, the standby might need to wait
+for old transactions to complete before it can open up for queries.
+
+In Neon that problem is worse: a replica can start at any LSN, so
+there's no guarantee that it will see a running-xacts record any time
+soon. In particular, if the primary is not running when the replica is
+started, it might never see a running-xacts record.
+
+To make things worse, we initially missed this issue, and always
+started accepting queries at replica startup, even if it didn't have
+the transaction information. That could lead to incorrect query
+results and data corruption later. However, as we fixed that, we
+introduced a new problem compared to what we had before: previously
+the replica would always start up, but after fixing that bug, it might
+not. In a superficial way, the old behavior was better (but could lead
+to serious issues later!). That made fixing that bug very hard,
+because as we fixed it, we made things (superficially) worse for
+others.
+
+See https://github.com/neondatabase/neon/pull/7288 which fixed the
+bug, and follow-up PRs https://github.com/neondatabase/neon/pull/8323
+and https://github.com/neondatabase/neon/pull/8484 to try to claw back
+the cases that started to cause trouble after that fix. As of this
+writing, there are still cases where a replica might not immediately
+start up, causing the control plane operation to fail. The remaining
+issues are tracked in https://github.com/neondatabase/neon/issues/6211.
+
+One long-term fix for this is to switch to using so-called CSN
+snapshots in read replica. That would make it unnecessary to have the
+full in-progress transaction list in the replica at startup time. See
+https://commitfest.postgresql.org/48/4912/ for a work-in-progress
+patch to upstream to implement that.
+
+Another thing we could do is to teach the control plane about that
+distinction between "starting up" and "running but haven't received
+running-xacts information yet", so that we could keep the replica
+waiting longer in that stage, and also give any client connections the
+same `ERROR: the database system is not yet accepting connections`
+error that you get in standalone PostgreSQL in that state.
+
+
+### Recovery conflicts and Hot standby feedback
+
+It's possible that a tuple version is vacuumed away in the primary,
+even though it is still needed by a running transaction in the
+replica. This is called a "recovery conflict", and PostgreSQL provides
+various options for dealing with it. By default, the WAL replay will
+wait up to 30 s for the conflicting query to finish. After that, it
+will kill the running query, so that the WAL replay can proceed.
+
+Another way to avoid the situation is to enable the
+[`hot_standby_feedback`](https://www.postgresql.org/docs/current/runtime-config-replication.html#GUC-HOT-STANDBY-FEEDBACK)
+option. When it is enabled, the primary will refrain from vacuuming
+tuples that are still needed in the replica. That means potentially
+bloating the primary, which violates the usual rule that read replicas
+don't affect the operations on the primary, which is why it's off by
+default. We leave it to users to decide if they want to turn it on,
+same as PostgreSQL.
+
+Neon supports `hot_standby_feedback` by passing the feedback messages
+from the replica to the safekeepers, and from safekeepers to the
+primary.
+
+### Relationship of settings between primary and replica
+
+In order to enter hot standby mode, some configuration options need to
+be set to the same or larger values in the standby, compared to the
+primary.  See [explanation in the PostgreSQL
+docs](https://www.postgresql.org/docs/current/hot-standby.html#HOT-STANDBY-ADMIN)
+
+In Neon, we have this problem too. To prevent customers from hitting
+it, the control plane automatically adjusts the settings of a replica,
+so that they match or exceed the primary's settings (see
+https://github.com/neondatabase/cloud/issues/14903). However, you
+can still hit the issue if the primary is restarted with larger
+settings, while the replica is running.
+
+
+### Interaction with Pageserver GC
+
+The read replica can lag behind the primary. If there are recovery
+conflicts or the replica cannot keep up for some reason, the lag can
+in principle grow indefinitely. The replica will issue all GetPage
+requests to the pageservers at the current replay LSN, and needs to
+see the old page versions.
+
+If the retention period in the pageserver is set to be small, it may
+have already garbage collected away the old page versions. That will
+cause read errors in the compute, and can mean that the replica cannot
+make progress with the replication anymore.
+
+There is a mechanism for the replica to pass information about its replay
+LSN to the pageserver, so that the pageserver refrains from GC'ing
+data that is still needed by the standby. It's called
+'standby_horizon' in the pageserver code, see
+https://github.com/neondatabase/neon/pull/7368. A separate "lease"
+mechanism also is in the works, where the replica could hold a lease
+on the old LSN, preventing the pageserver from advancing the GC
+horizon past that point. The difference is that the standby_horizon
+mechanism relies on a feedback message from replica to safekeeper,
+while the lease API is exposed directly from the pageserver. A static
+read-only node is not connected to safekeepers, so it cannot use the
+standby_horizon mechanism.
+
+
+### Synchronous replication
+
+We haven't put any effort into synchronous replication yet.
+
+PostgreSQL provides multiple levels of synchronicity. In the weaker
+levels, a transaction is not acknowledged as committed to the client
+in the primary until the WAL has been streamed to a replica or flushed
+to disk there. Those modes don't make sense in Neon, because the
+safekeepers handle durability.
+
+`synchronous_commit=remote_apply` mode would make sense. In that mode,
+the commit is not acknowledged to the client until it has been
+replayed in the replica. That ensures that after commit, you can see
+the commit in the replica too (a.k.a. read-your-writes consistency).

From 3f91ea28d997a23b899ef0c3ce237e7ae85f2916 Mon Sep 17 00:00:00 2001
From: Vlad Lazar <vlad@neon.tech>
Date: Fri, 16 Aug 2024 13:05:04 +0100
Subject: [PATCH 12/44] tests: add infra and test for storcon leadership
 transfer (#8587)

## Problem
https://github.com/neondatabase/neon/pull/8588 implemented the mechanism
for storage controller
leadership transfers. However, there are no tests that exercise the
behaviour.

## Summary of changes
1. Teach `neon_local` how to handle multiple storage controller
instances. Each storage controller
instance gets its own subdirectory (`storage_controller_1, ...`).
`storage_controller start|stop` subcommands
have also been extended to optionally accept an instance id.
2. Add a storage controller proxy test fixture. It's a basic HTTP server
that forwards requests from pageserver
and test env to the currently configured storage controller.
3. Add a test which exercises storage controller leadership transfer.
4. Finally fix a couple bugs that the test surfaced
---
 control_plane/src/background_process.rs       |   2 +-
 control_plane/src/bin/neon_local.rs           |  86 +++-
 control_plane/src/local_env.rs                |  37 ++
 control_plane/src/storage_controller.rs       | 396 ++++++++++++------
 storage_controller/src/http.rs                |  16 +
 storage_controller/src/peer_client.rs         |   4 +-
 storage_controller/src/service.rs             | 114 ++---
 test_runner/conftest.py                       |   1 +
 test_runner/fixtures/neon_fixtures.py         | 232 +++++++---
 .../fixtures/storage_controller_proxy.py      |  73 ++++
 test_runner/fixtures/utils.py                 |   2 +-
 .../regress/test_storage_controller.py        | 129 ++++++
 12 files changed, 841 insertions(+), 251 deletions(-)
 create mode 100644 test_runner/fixtures/storage_controller_proxy.py

diff --git a/control_plane/src/background_process.rs b/control_plane/src/background_process.rs
index bf8a27e550..619c5bce3e 100644
--- a/control_plane/src/background_process.rs
+++ b/control_plane/src/background_process.rs
@@ -379,7 +379,7 @@ where
     }
 }
 
-fn process_has_stopped(pid: Pid) -> anyhow::Result<bool> {
+pub(crate) fn process_has_stopped(pid: Pid) -> anyhow::Result<bool> {
     match kill(pid, None) {
         // Process exists, keep waiting
         Ok(_) => Ok(false),
diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs
index 51e9a51a57..edd88dc71c 100644
--- a/control_plane/src/bin/neon_local.rs
+++ b/control_plane/src/bin/neon_local.rs
@@ -15,7 +15,9 @@ use control_plane::local_env::{
 };
 use control_plane::pageserver::PageServerNode;
 use control_plane::safekeeper::SafekeeperNode;
-use control_plane::storage_controller::StorageController;
+use control_plane::storage_controller::{
+    NeonStorageControllerStartArgs, NeonStorageControllerStopArgs, StorageController,
+};
 use control_plane::{broker, local_env};
 use pageserver_api::config::{
     DEFAULT_HTTP_LISTEN_PORT as DEFAULT_PAGESERVER_HTTP_PORT,
@@ -1052,6 +1054,36 @@ fn get_start_timeout(args: &ArgMatches) -> &Duration {
     humantime_duration.as_ref()
 }
 
+fn storage_controller_start_args(args: &ArgMatches) -> NeonStorageControllerStartArgs {
+    let maybe_instance_id = args.get_one::<u8>("instance-id");
+
+    let base_port = args.get_one::<u16>("base-port");
+
+    if maybe_instance_id.is_some() && base_port.is_none() {
+        panic!("storage-controller start specificied instance-id but did not provide base-port");
+    }
+
+    let start_timeout = args
+        .get_one::<humantime::Duration>("start-timeout")
+        .expect("invalid value for start-timeout");
+
+    NeonStorageControllerStartArgs {
+        instance_id: maybe_instance_id.copied().unwrap_or(1),
+        base_port: base_port.copied(),
+        start_timeout: *start_timeout,
+    }
+}
+
+fn storage_controller_stop_args(args: &ArgMatches) -> NeonStorageControllerStopArgs {
+    let maybe_instance_id = args.get_one::<u8>("instance-id");
+    let immediate = args.get_one::<String>("stop-mode").map(|s| s.as_str()) == Some("immediate");
+
+    NeonStorageControllerStopArgs {
+        instance_id: maybe_instance_id.copied().unwrap_or(1),
+        immediate,
+    }
+}
+
 async fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
     match sub_match.subcommand() {
         Some(("start", subcommand_args)) => {
@@ -1113,19 +1145,14 @@ async fn handle_storage_controller(
     let svc = StorageController::from_env(env);
     match sub_match.subcommand() {
         Some(("start", start_match)) => {
-            if let Err(e) = svc.start(get_start_timeout(start_match)).await {
+            if let Err(e) = svc.start(storage_controller_start_args(start_match)).await {
                 eprintln!("start failed: {e}");
                 exit(1);
             }
         }
 
         Some(("stop", stop_match)) => {
-            let immediate = stop_match
-                .get_one::<String>("stop-mode")
-                .map(|s| s.as_str())
-                == Some("immediate");
-
-            if let Err(e) = svc.stop(immediate).await {
+            if let Err(e) = svc.stop(storage_controller_stop_args(stop_match)).await {
                 eprintln!("stop failed: {}", e);
                 exit(1);
             }
@@ -1228,7 +1255,12 @@ async fn handle_start_all(
     // Only start the storage controller if the pageserver is configured to need it
     if env.control_plane_api.is_some() {
         let storage_controller = StorageController::from_env(env);
-        if let Err(e) = storage_controller.start(retry_timeout).await {
+        if let Err(e) = storage_controller
+            .start(NeonStorageControllerStartArgs::with_default_instance_id(
+                (*retry_timeout).into(),
+            ))
+            .await
+        {
             eprintln!("storage_controller start failed: {:#}", e);
             try_stop_all(env, true).await;
             exit(1);
@@ -1358,10 +1390,21 @@ async fn try_stop_all(env: &local_env::LocalEnv, immediate: bool) {
         eprintln!("neon broker stop failed: {e:#}");
     }
 
-    if env.control_plane_api.is_some() {
+    // Stop all storage controller instances. In the most common case there's only one,
+    // but iterate through the base data directory in order to discover the instances.
+    let storcon_instances = env
+        .storage_controller_instances()
+        .await
+        .expect("Must inspect data dir");
+    for (instance_id, _instance_dir_path) in storcon_instances {
         let storage_controller = StorageController::from_env(env);
-        if let Err(e) = storage_controller.stop(immediate).await {
-            eprintln!("storage controller stop failed: {e:#}");
+        let stop_args = NeonStorageControllerStopArgs {
+            instance_id,
+            immediate,
+        };
+
+        if let Err(e) = storage_controller.stop(stop_args).await {
+            eprintln!("Storage controller instance {instance_id} stop failed: {e:#}");
         }
     }
 }
@@ -1501,6 +1544,18 @@ fn cli() -> Command {
         .action(ArgAction::SetTrue)
         .required(false);
 
+    let instance_id = Arg::new("instance-id")
+        .long("instance-id")
+        .help("Identifier used to distinguish storage controller instances (default 1)")
+        .value_parser(value_parser!(u8))
+        .required(false);
+
+    let base_port = Arg::new("base-port")
+        .long("base-port")
+        .help("Base port for the storage controller instance idenfified by instance-id (defaults to pagserver cplane api)")
+        .value_parser(value_parser!(u16))
+        .required(false);
+
     Command::new("Neon CLI")
         .arg_required_else_help(true)
         .version(GIT_VERSION)
@@ -1609,9 +1664,12 @@ fn cli() -> Command {
                 .arg_required_else_help(true)
                 .about("Manage storage_controller")
                 .subcommand(Command::new("start").about("Start storage controller")
-                            .arg(timeout_arg.clone()))
+                            .arg(timeout_arg.clone())
+                            .arg(instance_id.clone())
+                            .arg(base_port))
                 .subcommand(Command::new("stop").about("Stop storage controller")
-                            .arg(stop_mode_arg.clone()))
+                            .arg(stop_mode_arg.clone())
+                            .arg(instance_id))
         )
         .subcommand(
             Command::new("safekeeper")
diff --git a/control_plane/src/local_env.rs b/control_plane/src/local_env.rs
index 15bbac702f..807519c88d 100644
--- a/control_plane/src/local_env.rs
+++ b/control_plane/src/local_env.rs
@@ -156,6 +156,11 @@ pub struct NeonStorageControllerConf {
     #[serde(with = "humantime_serde")]
     pub max_warming_up: Duration,
 
+    pub start_as_candidate: bool,
+
+    /// Database url used when running multiple storage controller instances
+    pub database_url: Option<SocketAddr>,
+
     /// Threshold for auto-splitting a tenant into shards
     pub split_threshold: Option<u64>,
 
@@ -174,6 +179,8 @@ impl Default for NeonStorageControllerConf {
         Self {
             max_offline: Self::DEFAULT_MAX_OFFLINE_INTERVAL,
             max_warming_up: Self::DEFAULT_MAX_WARMING_UP_INTERVAL,
+            start_as_candidate: false,
+            database_url: None,
             split_threshold: None,
             max_secondary_lag_bytes: None,
         }
@@ -392,6 +399,36 @@ impl LocalEnv {
         }
     }
 
+    /// Inspect the base data directory and extract the instance id and instance directory path
+    /// for all storage controller instances
+    pub async fn storage_controller_instances(&self) -> std::io::Result<Vec<(u8, PathBuf)>> {
+        let mut instances = Vec::default();
+
+        let dir = std::fs::read_dir(self.base_data_dir.clone())?;
+        for dentry in dir {
+            let dentry = dentry?;
+            let is_dir = dentry.metadata()?.is_dir();
+            let filename = dentry.file_name().into_string().unwrap();
+            let parsed_instance_id = match filename.strip_prefix("storage_controller_") {
+                Some(suffix) => suffix.parse::<u8>().ok(),
+                None => None,
+            };
+
+            let is_instance_dir = is_dir && parsed_instance_id.is_some();
+
+            if !is_instance_dir {
+                continue;
+            }
+
+            instances.push((
+                parsed_instance_id.expect("Checked previously"),
+                dentry.path(),
+            ));
+        }
+
+        Ok(instances)
+    }
+
     pub fn register_branch_mapping(
         &mut self,
         branch_name: String,
diff --git a/control_plane/src/storage_controller.rs b/control_plane/src/storage_controller.rs
index f180e922e8..2c077595a1 100644
--- a/control_plane/src/storage_controller.rs
+++ b/control_plane/src/storage_controller.rs
@@ -3,6 +3,8 @@ use crate::{
     local_env::{LocalEnv, NeonStorageControllerConf},
 };
 use camino::{Utf8Path, Utf8PathBuf};
+use hyper::Uri;
+use nix::unistd::Pid;
 use pageserver_api::{
     controller_api::{
         NodeConfigureRequest, NodeDescribeResponse, NodeRegisterRequest, TenantCreateRequest,
@@ -18,7 +20,7 @@ use pageserver_client::mgmt_api::ResponseErrorMessageExt;
 use postgres_backend::AuthType;
 use reqwest::Method;
 use serde::{de::DeserializeOwned, Deserialize, Serialize};
-use std::{fs, str::FromStr, time::Duration};
+use std::{fs, net::SocketAddr, path::PathBuf, str::FromStr, sync::OnceLock};
 use tokio::process::Command;
 use tracing::instrument;
 use url::Url;
@@ -29,12 +31,14 @@ use utils::{
 
 pub struct StorageController {
     env: LocalEnv,
-    listen: String,
     private_key: Option<Vec<u8>>,
     public_key: Option<String>,
-    postgres_port: u16,
     client: reqwest::Client,
     config: NeonStorageControllerConf,
+
+    // The listen address is learned when starting the storage controller,
+    // hence the use of OnceLock to init it at the right time.
+    listen: OnceLock<SocketAddr>,
 }
 
 const COMMAND: &str = "storage_controller";
@@ -43,6 +47,36 @@ const STORAGE_CONTROLLER_POSTGRES_VERSION: u32 = 16;
 
 const DB_NAME: &str = "storage_controller";
 
+pub struct NeonStorageControllerStartArgs {
+    pub instance_id: u8,
+    pub base_port: Option<u16>,
+    pub start_timeout: humantime::Duration,
+}
+
+impl NeonStorageControllerStartArgs {
+    pub fn with_default_instance_id(start_timeout: humantime::Duration) -> Self {
+        Self {
+            instance_id: 1,
+            base_port: None,
+            start_timeout,
+        }
+    }
+}
+
+pub struct NeonStorageControllerStopArgs {
+    pub instance_id: u8,
+    pub immediate: bool,
+}
+
+impl NeonStorageControllerStopArgs {
+    pub fn with_default_instance_id(immediate: bool) -> Self {
+        Self {
+            instance_id: 1,
+            immediate,
+        }
+    }
+}
+
 #[derive(Serialize, Deserialize)]
 pub struct AttachHookRequest {
     pub tenant_shard_id: TenantShardId,
@@ -67,23 +101,6 @@ pub struct InspectResponse {
 
 impl StorageController {
     pub fn from_env(env: &LocalEnv) -> Self {
-        // Makes no sense to construct this if pageservers aren't going to use it: assume
-        // pageservers have control plane API set
-        let listen_url = env.control_plane_api.clone().unwrap();
-
-        let listen = format!(
-            "{}:{}",
-            listen_url.host_str().unwrap(),
-            listen_url.port().unwrap()
-        );
-
-        // Convention: NeonEnv in python tests reserves the next port after the control_plane_api
-        // port, for use by our captive postgres.
-        let postgres_port = listen_url
-            .port()
-            .expect("Control plane API setting should always have a port")
-            + 1;
-
         // Assume all pageservers have symmetric auth configuration: this service
         // expects to use one JWT token to talk to all of them.
         let ps_conf = env
@@ -126,20 +143,28 @@ impl StorageController {
 
         Self {
             env: env.clone(),
-            listen,
             private_key,
             public_key,
-            postgres_port,
             client: reqwest::ClientBuilder::new()
                 .build()
                 .expect("Failed to construct http client"),
             config: env.storage_controller.clone(),
+            listen: OnceLock::default(),
         }
     }
 
-    fn pid_file(&self) -> Utf8PathBuf {
-        Utf8PathBuf::from_path_buf(self.env.base_data_dir.join("storage_controller.pid"))
-            .expect("non-Unicode path")
+    fn storage_controller_instance_dir(&self, instance_id: u8) -> PathBuf {
+        self.env
+            .base_data_dir
+            .join(format!("storage_controller_{}", instance_id))
+    }
+
+    fn pid_file(&self, instance_id: u8) -> Utf8PathBuf {
+        Utf8PathBuf::from_path_buf(
+            self.storage_controller_instance_dir(instance_id)
+                .join("storage_controller.pid"),
+        )
+        .expect("non-Unicode path")
     }
 
     /// PIDFile for the postgres instance used to store storage controller state
@@ -184,9 +209,9 @@ impl StorageController {
     }
 
     /// Readiness check for our postgres process
-    async fn pg_isready(&self, pg_bin_dir: &Utf8Path) -> anyhow::Result<bool> {
+    async fn pg_isready(&self, pg_bin_dir: &Utf8Path, postgres_port: u16) -> anyhow::Result<bool> {
         let bin_path = pg_bin_dir.join("pg_isready");
-        let args = ["-h", "localhost", "-p", &format!("{}", self.postgres_port)];
+        let args = ["-h", "localhost", "-p", &format!("{}", postgres_port)];
         let exitcode = Command::new(bin_path).args(args).spawn()?.wait().await?;
 
         Ok(exitcode.success())
@@ -199,8 +224,8 @@ impl StorageController {
     /// who just want to run `cargo neon_local` without knowing about diesel.
     ///
     /// Returns the database url
-    pub async fn setup_database(&self) -> anyhow::Result<String> {
-        let database_url = format!("postgresql://localhost:{}/{DB_NAME}", self.postgres_port);
+    pub async fn setup_database(&self, postgres_port: u16) -> anyhow::Result<String> {
+        let database_url = format!("postgresql://localhost:{}/{DB_NAME}", postgres_port);
 
         let pg_bin_dir = self.get_pg_bin_dir().await?;
         let createdb_path = pg_bin_dir.join("createdb");
@@ -209,7 +234,7 @@ impl StorageController {
                 "-h",
                 "localhost",
                 "-p",
-                &format!("{}", self.postgres_port),
+                &format!("{}", postgres_port),
                 DB_NAME,
             ])
             .output()
@@ -230,13 +255,14 @@ impl StorageController {
 
     pub async fn connect_to_database(
         &self,
+        postgres_port: u16,
     ) -> anyhow::Result<(
         tokio_postgres::Client,
         tokio_postgres::Connection<tokio_postgres::Socket, tokio_postgres::tls::NoTlsStream>,
     )> {
         tokio_postgres::Config::new()
             .host("localhost")
-            .port(self.postgres_port)
+            .port(postgres_port)
             // The user is the ambient operating system user name.
             // That is an impurity which we want to fix in => TODO https://github.com/neondatabase/neon/issues/8400
             //
@@ -252,72 +278,115 @@ impl StorageController {
             .map_err(anyhow::Error::new)
     }
 
-    pub async fn start(&self, retry_timeout: &Duration) -> anyhow::Result<()> {
-        // Start a vanilla Postgres process used by the storage controller for persistence.
-        let pg_data_path = Utf8PathBuf::from_path_buf(self.env.base_data_dir.clone())
-            .unwrap()
-            .join("storage_controller_db");
-        let pg_bin_dir = self.get_pg_bin_dir().await?;
-        let pg_lib_dir = self.get_pg_lib_dir().await?;
-        let pg_log_path = pg_data_path.join("postgres.log");
+    pub async fn start(&self, start_args: NeonStorageControllerStartArgs) -> anyhow::Result<()> {
+        let instance_dir = self.storage_controller_instance_dir(start_args.instance_id);
+        if let Err(err) = tokio::fs::create_dir(&instance_dir).await {
+            if err.kind() != std::io::ErrorKind::AlreadyExists {
+                panic!("Failed to create instance dir {instance_dir:?}");
+            }
+        }
 
-        if !tokio::fs::try_exists(&pg_data_path).await? {
-            // Initialize empty database
-            let initdb_path = pg_bin_dir.join("initdb");
-            let mut child = Command::new(&initdb_path)
-                .envs(vec![
-                    ("LD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()),
-                    ("DYLD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()),
-                ])
-                .args(["-D", pg_data_path.as_ref()])
-                .spawn()
-                .expect("Failed to spawn initdb");
-            let status = child.wait().await?;
-            if !status.success() {
-                anyhow::bail!("initdb failed with status {status}");
+        let (listen, postgres_port) = {
+            if let Some(base_port) = start_args.base_port {
+                (
+                    format!("127.0.0.1:{base_port}"),
+                    self.config
+                        .database_url
+                        .expect("--base-port requires NeonStorageControllerConf::database_url")
+                        .port(),
+                )
+            } else {
+                let listen_url = self.env.control_plane_api.clone().unwrap();
+
+                let listen = format!(
+                    "{}:{}",
+                    listen_url.host_str().unwrap(),
+                    listen_url.port().unwrap()
+                );
+
+                (listen, listen_url.port().unwrap() + 1)
             }
         };
 
-        // Write a minimal config file:
-        // - Specify the port, since this is chosen dynamically
-        // - Switch off fsync, since we're running on lightweight test environments and when e.g. scale testing
-        //   the storage controller we don't want a slow local disk to interfere with that.
-        //
-        // NB: it's important that we rewrite this file on each start command so we propagate changes
-        // from `LocalEnv`'s config file (`.neon/config`).
-        tokio::fs::write(
-            &pg_data_path.join("postgresql.conf"),
-            format!("port = {}\nfsync=off\n", self.postgres_port),
-        )
-        .await?;
+        let socket_addr = listen
+            .parse()
+            .expect("listen address is a valid socket address");
+        self.listen
+            .set(socket_addr)
+            .expect("StorageController::listen is only set here");
 
-        println!("Starting storage controller database...");
-        let db_start_args = [
-            "-w",
-            "-D",
-            pg_data_path.as_ref(),
-            "-l",
-            pg_log_path.as_ref(),
-            "start",
-        ];
+        // Do we remove the pid file on stop?
+        let pg_started = self.is_postgres_running().await?;
+        let pg_lib_dir = self.get_pg_lib_dir().await?;
 
-        background_process::start_process(
-            "storage_controller_db",
-            &self.env.base_data_dir,
-            pg_bin_dir.join("pg_ctl").as_std_path(),
-            db_start_args,
-            vec![
-                ("LD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()),
-                ("DYLD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()),
-            ],
-            background_process::InitialPidFile::Create(self.postgres_pid_file()),
-            retry_timeout,
-            || self.pg_isready(&pg_bin_dir),
-        )
-        .await?;
+        if !pg_started {
+            // Start a vanilla Postgres process used by the storage controller for persistence.
+            let pg_data_path = Utf8PathBuf::from_path_buf(self.env.base_data_dir.clone())
+                .unwrap()
+                .join("storage_controller_db");
+            let pg_bin_dir = self.get_pg_bin_dir().await?;
+            let pg_log_path = pg_data_path.join("postgres.log");
 
-        // Run migrations on every startup, in case something changed.
-        let database_url = self.setup_database().await?;
+            if !tokio::fs::try_exists(&pg_data_path).await? {
+                // Initialize empty database
+                let initdb_path = pg_bin_dir.join("initdb");
+                let mut child = Command::new(&initdb_path)
+                    .envs(vec![
+                        ("LD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()),
+                        ("DYLD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()),
+                    ])
+                    .args(["-D", pg_data_path.as_ref()])
+                    .spawn()
+                    .expect("Failed to spawn initdb");
+                let status = child.wait().await?;
+                if !status.success() {
+                    anyhow::bail!("initdb failed with status {status}");
+                }
+            };
+
+            // Write a minimal config file:
+            // - Specify the port, since this is chosen dynamically
+            // - Switch off fsync, since we're running on lightweight test environments and when e.g. scale testing
+            //   the storage controller we don't want a slow local disk to interfere with that.
+            //
+            // NB: it's important that we rewrite this file on each start command so we propagate changes
+            // from `LocalEnv`'s config file (`.neon/config`).
+            tokio::fs::write(
+                &pg_data_path.join("postgresql.conf"),
+                format!("port = {}\nfsync=off\n", postgres_port),
+            )
+            .await?;
+
+            println!("Starting storage controller database...");
+            let db_start_args = [
+                "-w",
+                "-D",
+                pg_data_path.as_ref(),
+                "-l",
+                pg_log_path.as_ref(),
+                "start",
+            ];
+
+            background_process::start_process(
+                "storage_controller_db",
+                &self.env.base_data_dir,
+                pg_bin_dir.join("pg_ctl").as_std_path(),
+                db_start_args,
+                vec![
+                    ("LD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()),
+                    ("DYLD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()),
+                ],
+                background_process::InitialPidFile::Create(self.postgres_pid_file()),
+                &start_args.start_timeout,
+                || self.pg_isready(&pg_bin_dir, postgres_port),
+            )
+            .await?;
+
+            // Run migrations on every startup, in case something changed.
+            self.setup_database(postgres_port).await?;
+        }
+
+        let database_url = format!("postgresql://localhost:{}/{DB_NAME}", postgres_port);
 
         // We support running a startup SQL script to fiddle with the database before we launch storcon.
         // This is used by the test suite.
@@ -339,7 +408,7 @@ impl StorageController {
                 }
             }
         };
-        let (mut client, conn) = self.connect_to_database().await?;
+        let (mut client, conn) = self.connect_to_database(postgres_port).await?;
         let conn = tokio::spawn(conn);
         let tx = client.build_transaction();
         let tx = tx.start().await?;
@@ -348,9 +417,20 @@ impl StorageController {
         drop(client);
         conn.await??;
 
+        let listen = self
+            .listen
+            .get()
+            .expect("cell is set earlier in this function");
+        let address_for_peers = Uri::builder()
+            .scheme("http")
+            .authority(format!("{}:{}", listen.ip(), listen.port()))
+            .path_and_query("")
+            .build()
+            .unwrap();
+
         let mut args = vec![
             "-l",
-            &self.listen,
+            &listen.to_string(),
             "--dev",
             "--database-url",
             &database_url,
@@ -358,10 +438,17 @@ impl StorageController {
             &humantime::Duration::from(self.config.max_offline).to_string(),
             "--max-warming-up-interval",
             &humantime::Duration::from(self.config.max_warming_up).to_string(),
+            "--address-for-peers",
+            &address_for_peers.to_string(),
         ]
         .into_iter()
         .map(|s| s.to_string())
         .collect::<Vec<_>>();
+
+        if self.config.start_as_candidate {
+            args.push("--start-as-candidate".to_string());
+        }
+
         if let Some(private_key) = &self.private_key {
             let claims = Claims::new(None, Scope::PageServerApi);
             let jwt_token =
@@ -394,15 +481,15 @@ impl StorageController {
 
         background_process::start_process(
             COMMAND,
-            &self.env.base_data_dir,
+            &instance_dir,
             &self.env.storage_controller_bin(),
             args,
             vec![
                 ("LD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()),
                 ("DYLD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()),
             ],
-            background_process::InitialPidFile::Create(self.pid_file()),
-            retry_timeout,
+            background_process::InitialPidFile::Create(self.pid_file(start_args.instance_id)),
+            &start_args.start_timeout,
             || async {
                 match self.ready().await {
                     Ok(_) => Ok(true),
@@ -415,8 +502,35 @@ impl StorageController {
         Ok(())
     }
 
-    pub async fn stop(&self, immediate: bool) -> anyhow::Result<()> {
-        background_process::stop_process(immediate, COMMAND, &self.pid_file())?;
+    pub async fn stop(&self, stop_args: NeonStorageControllerStopArgs) -> anyhow::Result<()> {
+        background_process::stop_process(
+            stop_args.immediate,
+            COMMAND,
+            &self.pid_file(stop_args.instance_id),
+        )?;
+
+        let storcon_instances = self.env.storage_controller_instances().await?;
+        for (instance_id, instanced_dir_path) in storcon_instances {
+            if instance_id == stop_args.instance_id {
+                continue;
+            }
+
+            let pid_file = instanced_dir_path.join("storage_controller.pid");
+            let pid = tokio::fs::read_to_string(&pid_file)
+                .await
+                .map_err(|err| {
+                    anyhow::anyhow!("Failed to read storcon pid file at {pid_file:?}: {err}")
+                })?
+                .parse::<i32>()
+                .expect("pid is valid i32");
+
+            let other_proc_alive = !background_process::process_has_stopped(Pid::from_raw(pid))?;
+            if other_proc_alive {
+                // There is another storage controller instance running, so we return
+                // and leave the database running.
+                return Ok(());
+            }
+        }
 
         let pg_data_path = self.env.base_data_dir.join("storage_controller_db");
         let pg_bin_dir = self.get_pg_bin_dir().await?;
@@ -429,27 +543,51 @@ impl StorageController {
             .wait()
             .await?;
         if !stop_status.success() {
-            let pg_status_args = ["-D", &pg_data_path.to_string_lossy(), "status"];
-            let status_exitcode = Command::new(pg_bin_dir.join("pg_ctl"))
-                .args(pg_status_args)
-                .spawn()?
-                .wait()
-                .await?;
-
-            // pg_ctl status returns this exit code if postgres is not running: in this case it is
-            // fine that stop failed.  Otherwise it is an error that stop failed.
-            const PG_STATUS_NOT_RUNNING: i32 = 3;
-            if Some(PG_STATUS_NOT_RUNNING) == status_exitcode.code() {
-                println!("Storage controller database is already stopped");
-                return Ok(());
-            } else {
-                anyhow::bail!("Failed to stop storage controller database: {stop_status}")
+            match self.is_postgres_running().await {
+                Ok(false) => {
+                    println!("Storage controller database is already stopped");
+                    return Ok(());
+                }
+                Ok(true) => {
+                    anyhow::bail!("Failed to stop storage controller database");
+                }
+                Err(err) => {
+                    anyhow::bail!("Failed to stop storage controller database: {err}");
+                }
             }
         }
 
         Ok(())
     }
 
+    async fn is_postgres_running(&self) -> anyhow::Result<bool> {
+        let pg_data_path = self.env.base_data_dir.join("storage_controller_db");
+        let pg_bin_dir = self.get_pg_bin_dir().await?;
+
+        let pg_status_args = ["-D", &pg_data_path.to_string_lossy(), "status"];
+        let status_exitcode = Command::new(pg_bin_dir.join("pg_ctl"))
+            .args(pg_status_args)
+            .spawn()?
+            .wait()
+            .await?;
+
+        // pg_ctl status returns this exit code if postgres is not running: in this case it is
+        // fine that stop failed.  Otherwise it is an error that stop failed.
+        const PG_STATUS_NOT_RUNNING: i32 = 3;
+        const PG_NO_DATA_DIR: i32 = 4;
+        const PG_STATUS_RUNNING: i32 = 0;
+        match status_exitcode.code() {
+            Some(PG_STATUS_NOT_RUNNING) => Ok(false),
+            Some(PG_NO_DATA_DIR) => Ok(false),
+            Some(PG_STATUS_RUNNING) => Ok(true),
+            Some(code) => Err(anyhow::anyhow!(
+                "pg_ctl status returned unexpected status code: {:?}",
+                code
+            )),
+            None => Err(anyhow::anyhow!("pg_ctl status returned no status code")),
+        }
+    }
+
     fn get_claims_for_path(path: &str) -> anyhow::Result<Option<Claims>> {
         let category = match path.find('/') {
             Some(idx) => &path[..idx],
@@ -475,15 +613,31 @@ impl StorageController {
         RQ: Serialize + Sized,
         RS: DeserializeOwned + Sized,
     {
-        // The configured URL has the /upcall path prefix for pageservers to use: we will strip that out
-        // for general purpose API access.
-        let listen_url = self.env.control_plane_api.clone().unwrap();
-        let url = Url::from_str(&format!(
-            "http://{}:{}/{path}",
-            listen_url.host_str().unwrap(),
-            listen_url.port().unwrap()
-        ))
-        .unwrap();
+        // In the special case of the `storage_controller start` subcommand, we wish
+        // to use the API endpoint of the newly started storage controller in order
+        // to pass the readiness check. In this scenario [`Self::listen`] will be set
+        // (see [`Self::start`]).
+        //
+        // Otherwise, we infer the storage controller api endpoint from the configured
+        // control plane API.
+        let url = if let Some(socket_addr) = self.listen.get() {
+            Url::from_str(&format!(
+                "http://{}:{}/{path}",
+                socket_addr.ip().to_canonical(),
+                socket_addr.port()
+            ))
+            .unwrap()
+        } else {
+            // The configured URL has the /upcall path prefix for pageservers to use: we will strip that out
+            // for general purpose API access.
+            let listen_url = self.env.control_plane_api.clone().unwrap();
+            Url::from_str(&format!(
+                "http://{}:{}/{path}",
+                listen_url.host_str().unwrap(),
+                listen_url.port().unwrap()
+            ))
+            .unwrap()
+        };
 
         let mut builder = self.client.request(method, url);
         if let Some(body) = body {
diff --git a/storage_controller/src/http.rs b/storage_controller/src/http.rs
index e755aaed19..7bbd1541cf 100644
--- a/storage_controller/src/http.rs
+++ b/storage_controller/src/http.rs
@@ -520,6 +520,19 @@ async fn handle_node_status(req: Request<Body>) -> Result<Response<Body>, ApiErr
     json_response(StatusCode::OK, node_status)
 }
 
+async fn handle_get_leader(req: Request<Body>) -> Result<Response<Body>, ApiError> {
+    check_permissions(&req, Scope::Admin)?;
+
+    let state = get_state(&req);
+    let leader = state.service.get_leader().await.map_err(|err| {
+        ApiError::InternalServerError(anyhow::anyhow!(
+            "Failed to read leader from database: {err}"
+        ))
+    })?;
+
+    json_response(StatusCode::OK, leader)
+}
+
 async fn handle_node_drain(req: Request<Body>) -> Result<Response<Body>, ApiError> {
     check_permissions(&req, Scope::Admin)?;
 
@@ -1016,6 +1029,9 @@ pub fn make_router(
         .get("/control/v1/node/:node_id", |r| {
             named_request_span(r, handle_node_status, RequestName("control_v1_node_status"))
         })
+        .get("/control/v1/leader", |r| {
+            named_request_span(r, handle_get_leader, RequestName("control_v1_get_leader"))
+        })
         .put("/control/v1/node/:node_id/drain", |r| {
             named_request_span(r, handle_node_drain, RequestName("control_v1_node_drain"))
         })
diff --git a/storage_controller/src/peer_client.rs b/storage_controller/src/peer_client.rs
index ebb59a1720..3f8520fe55 100644
--- a/storage_controller/src/peer_client.rs
+++ b/storage_controller/src/peer_client.rs
@@ -1,7 +1,7 @@
 use crate::tenant_shard::ObservedState;
 use pageserver_api::shard::TenantShardId;
 use serde::{Deserialize, Serialize};
-use std::collections::HashMap;
+use std::{collections::HashMap, time::Duration};
 use tokio_util::sync::CancellationToken;
 
 use hyper::Uri;
@@ -69,6 +69,8 @@ impl PeerClient {
             req
         };
 
+        let req = req.timeout(Duration::from_secs(2));
+
         let res = req
             .send()
             .await
diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs
index 84db088a42..3459b44774 100644
--- a/storage_controller/src/service.rs
+++ b/storage_controller/src/service.rs
@@ -20,7 +20,8 @@ use crate::{
     metrics,
     peer_client::{GlobalObservedState, PeerClient},
     persistence::{
-        AbortShardSplitStatus, ControllerPersistence, MetadataHealthPersistence, TenantFilter,
+        AbortShardSplitStatus, ControllerPersistence, DatabaseResult, MetadataHealthPersistence,
+        TenantFilter,
     },
     reconciler::{ReconcileError, ReconcileUnits, ReconcilerConfig, ReconcilerConfigBuilder},
     scheduler::{MaySchedule, ScheduleContext, ScheduleMode},
@@ -489,11 +490,6 @@ pub(crate) enum ReconcileResultRequest {
     Stop,
 }
 
-struct LeaderStepDownState {
-    observed: GlobalObservedState,
-    leader: ControllerPersistence,
-}
-
 impl Service {
     pub fn get_config(&self) -> &Config {
         &self.config
@@ -504,7 +500,8 @@ impl Service {
     #[instrument(skip_all)]
     async fn startup_reconcile(
         self: &Arc<Service>,
-        leader_step_down_state: Option<LeaderStepDownState>,
+        current_leader: Option<ControllerPersistence>,
+        leader_step_down_state: Option<GlobalObservedState>,
         bg_compute_notify_result_tx: tokio::sync::mpsc::Sender<
             Result<(), (TenantShardId, NotifyError)>,
         >,
@@ -522,17 +519,15 @@ impl Service {
             .checked_add(STARTUP_RECONCILE_TIMEOUT / 2)
             .expect("Reconcile timeout is a modest constant");
 
-        let (observed, current_leader) = if let Some(state) = leader_step_down_state {
+        let observed = if let Some(state) = leader_step_down_state {
             tracing::info!(
                 "Using observed state received from leader at {}",
-                state.leader.address,
+                current_leader.as_ref().unwrap().address
             );
-            (state.observed, Some(state.leader))
+
+            state
         } else {
-            (
-                self.build_global_observed_state(node_scan_deadline).await,
-                None,
-            )
+            self.build_global_observed_state(node_scan_deadline).await
         };
 
         // Accumulate a list of any tenant locations that ought to be detached
@@ -1382,13 +1377,32 @@ impl Service {
                 };
 
                 let leadership_status = this.inner.read().unwrap().get_leadership_status();
-                let peer_observed_state = match leadership_status {
-                    LeadershipStatus::Candidate => this.request_step_down().await,
+                let leader = match this.get_leader().await {
+                    Ok(ok) => ok,
+                    Err(err) => {
+                        tracing::error!(
+                            "Failed to query database for current leader: {err}. Aborting start-up ..."
+                        );
+                        std::process::exit(1);
+                    }
+                };
+
+                let leader_step_down_state = match leadership_status {
+                    LeadershipStatus::Candidate => {
+                        if let Some(ref leader) = leader {
+                            this.request_step_down(leader).await
+                        } else {
+                            tracing::info!(
+                                "No leader found to request step down from. Will build observed state."
+                            );
+                            None
+                        }
+                    }
                     LeadershipStatus::Leader => None,
                     LeadershipStatus::SteppedDown => unreachable!(),
                 };
 
-                this.startup_reconcile(peer_observed_state, bg_compute_notify_result_tx)
+                this.startup_reconcile(leader, leader_step_down_state, bg_compute_notify_result_tx)
                     .await;
 
                 drop(startup_completion);
@@ -4650,6 +4664,10 @@ impl Service {
             ))
     }
 
+    pub(crate) async fn get_leader(&self) -> DatabaseResult<Option<ControllerPersistence>> {
+        self.persistence.get_leader().await
+    }
+
     pub(crate) async fn node_register(
         &self,
         register_req: NodeRegisterRequest,
@@ -6342,6 +6360,7 @@ impl Service {
 
     pub(crate) async fn step_down(&self) -> GlobalObservedState {
         tracing::info!("Received step down request from peer");
+        failpoint_support::sleep_millis_async!("sleep-on-step-down-handling");
 
         self.inner.write().unwrap().step_down();
         // TODO: would it make sense to have a time-out for this?
@@ -6367,50 +6386,31 @@ impl Service {
     ///
     /// On failures to query the database or step down error responses the process is killed
     /// and we rely on k8s to retry.
-    async fn request_step_down(&self) -> Option<LeaderStepDownState> {
-        let leader = match self.persistence.get_leader().await {
-            Ok(leader) => leader,
+    async fn request_step_down(
+        &self,
+        leader: &ControllerPersistence,
+    ) -> Option<GlobalObservedState> {
+        tracing::info!("Sending step down request to {leader:?}");
+
+        // TODO: jwt token
+        let client = PeerClient::new(
+            Uri::try_from(leader.address.as_str()).expect("Failed to build leader URI"),
+            self.config.jwt_token.clone(),
+        );
+        let state = client.step_down(&self.cancel).await;
+        match state {
+            Ok(state) => Some(state),
             Err(err) => {
+                // TODO: Make leaders periodically update a timestamp field in the
+                // database and, if the leader is not reachable from the current instance,
+                // but inferred as alive from the timestamp, abort start-up. This avoids
+                // a potential scenario in which we have two controllers acting as leaders.
                 tracing::error!(
-                    "Failed to query database for current leader: {err}. Aborting start-up ..."
+                    "Leader ({}) did not respond to step-down request: {}",
+                    leader.address,
+                    err
                 );
-                std::process::exit(1);
-            }
-        };
 
-        match leader {
-            Some(leader) => {
-                tracing::info!("Sending step down request to {leader:?}");
-
-                // TODO: jwt token
-                let client = PeerClient::new(
-                    Uri::try_from(leader.address.as_str()).expect("Failed to build leader URI"),
-                    self.config.jwt_token.clone(),
-                );
-                let state = client.step_down(&self.cancel).await;
-                match state {
-                    Ok(state) => Some(LeaderStepDownState {
-                        observed: state,
-                        leader: leader.clone(),
-                    }),
-                    Err(err) => {
-                        // TODO: Make leaders periodically update a timestamp field in the
-                        // database and, if the leader is not reachable from the current instance,
-                        // but inferred as alive from the timestamp, abort start-up. This avoids
-                        // a potential scenario in which we have two controllers acting as leaders.
-                        tracing::error!(
-                            "Leader ({}) did not respond to step-down request: {}",
-                            leader.address,
-                            err
-                        );
-                        None
-                    }
-                }
-            }
-            None => {
-                tracing::info!(
-                    "No leader found to request step down from. Will build observed state."
-                );
                 None
             }
         }
diff --git a/test_runner/conftest.py b/test_runner/conftest.py
index 4b0c9ac71d..996ca4d652 100644
--- a/test_runner/conftest.py
+++ b/test_runner/conftest.py
@@ -3,6 +3,7 @@ pytest_plugins = (
     "fixtures.parametrize",
     "fixtures.httpserver",
     "fixtures.compute_reconfigure",
+    "fixtures.storage_controller_proxy",
     "fixtures.neon_fixtures",
     "fixtures.benchmark_fixture",
     "fixtures.pg_stats",
diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index b76432127d..ec5a83601e 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -497,6 +497,7 @@ class NeonEnvBuilder:
         pageserver_aux_file_policy: Optional[AuxFileStore] = None,
         pageserver_default_tenant_config_compaction_algorithm: Optional[Dict[str, Any]] = None,
         safekeeper_extra_opts: Optional[list[str]] = None,
+        storage_controller_port_override: Optional[int] = None,
     ):
         self.repo_dir = repo_dir
         self.rust_log_override = rust_log_override
@@ -549,6 +550,8 @@ class NeonEnvBuilder:
 
         self.safekeeper_extra_opts = safekeeper_extra_opts
 
+        self.storage_controller_port_override = storage_controller_port_override
+
         assert test_name.startswith(
             "test_"
         ), "Unexpectedly instantiated from outside a test function"
@@ -1054,6 +1057,7 @@ class NeonEnv:
     """
 
     BASE_PAGESERVER_ID = 1
+    storage_controller: NeonStorageController | NeonProxiedStorageController
 
     def __init__(self, config: NeonEnvBuilder):
         self.repo_dir = config.repo_dir
@@ -1084,27 +1088,41 @@ class NeonEnv:
         self.initial_tenant = config.initial_tenant
         self.initial_timeline = config.initial_timeline
 
-        # Find two adjacent ports for storage controller and its postgres DB.  This
-        # loop would eventually throw from get_port() if we run out of ports (extremely
-        # unlikely): usually we find two adjacent free ports on the first iteration.
-        while True:
-            self.storage_controller_port = self.port_distributor.get_port()
-            storage_controller_pg_port = self.port_distributor.get_port()
-            if storage_controller_pg_port == self.storage_controller_port + 1:
-                break
-
         # The URL for the pageserver to use as its control_plane_api config
-        self.control_plane_api: str = f"http://127.0.0.1:{self.storage_controller_port}/upcall/v1"
-        # The base URL of the storage controller
-        self.storage_controller_api: str = f"http://127.0.0.1:{self.storage_controller_port}"
+        if config.storage_controller_port_override is not None:
+            log.info(
+                f"Using storage controller api override {config.storage_controller_port_override}"
+            )
+
+            self.storage_controller_port = config.storage_controller_port_override
+            self.storage_controller = NeonProxiedStorageController(
+                self, config.storage_controller_port_override, config.auth_enabled
+            )
+        else:
+            # Find two adjacent ports for storage controller and its postgres DB.  This
+            # loop would eventually throw from get_port() if we run out of ports (extremely
+            # unlikely): usually we find two adjacent free ports on the first iteration.
+            while True:
+                storage_controller_port = self.port_distributor.get_port()
+                storage_controller_pg_port = self.port_distributor.get_port()
+                if storage_controller_pg_port == storage_controller_port + 1:
+                    break
+
+            self.storage_controller_port = storage_controller_port
+            self.storage_controller = NeonStorageController(
+                self, storage_controller_port, config.auth_enabled
+            )
+
+            log.info(
+                f"Using generated control_plane_api: {self.storage_controller.upcall_api_endpoint()}"
+            )
+
+        self.storage_controller_api: str = self.storage_controller.api_root()
+        self.control_plane_api: str = self.storage_controller.upcall_api_endpoint()
 
         # For testing this with a fake HTTP server, enable passing through a URL from config
         self.control_plane_compute_hook_api = config.control_plane_compute_hook_api
 
-        self.storage_controller: NeonStorageController = NeonStorageController(
-            self, config.auth_enabled
-        )
-
         self.pageserver_virtual_file_io_engine = config.pageserver_virtual_file_io_engine
         self.pageserver_aux_file_policy = config.pageserver_aux_file_policy
 
@@ -1869,16 +1887,24 @@ class NeonCli(AbstractNeonCli):
     def storage_controller_start(
         self,
         timeout_in_seconds: Optional[int] = None,
+        instance_id: Optional[int] = None,
+        base_port: Optional[int] = None,
     ):
         cmd = ["storage_controller", "start"]
         if timeout_in_seconds is not None:
             cmd.append(f"--start-timeout={timeout_in_seconds}s")
+        if instance_id is not None:
+            cmd.append(f"--instance-id={instance_id}")
+        if base_port is not None:
+            cmd.append(f"--base-port={base_port}")
         return self.raw_cli(cmd)
 
-    def storage_controller_stop(self, immediate: bool):
+    def storage_controller_stop(self, immediate: bool, instance_id: Optional[int] = None):
         cmd = ["storage_controller", "stop"]
         if immediate:
             cmd.extend(["-m", "immediate"])
+        if instance_id is not None:
+            cmd.append(f"--instance-id={instance_id}")
         return self.raw_cli(cmd)
 
     def pageserver_start(
@@ -2189,17 +2215,30 @@ class PageserverSchedulingPolicy(str, Enum):
     PAUSE_FOR_RESTART = "PauseForRestart"
 
 
+class StorageControllerLeadershipStatus(str, Enum):
+    LEADER = "leader"
+    STEPPED_DOWN = "stepped_down"
+    CANDIDATE = "candidate"
+
+
 class NeonStorageController(MetricsGetter, LogUtils):
-    def __init__(self, env: NeonEnv, auth_enabled: bool):
+    def __init__(self, env: NeonEnv, port: int, auth_enabled: bool):
         self.env = env
+        self.port: int = port
+        self.api: str = f"http://127.0.0.1:{port}"
         self.running = False
         self.auth_enabled = auth_enabled
         self.allowed_errors: list[str] = DEFAULT_STORAGE_CONTROLLER_ALLOWED_ERRORS
-        self.logfile = self.workdir / "storage_controller.log"
+        self.logfile = self.env.repo_dir / "storage_controller_1" / "storage_controller.log"
 
-    def start(self, timeout_in_seconds: Optional[int] = None):
+    def start(
+        self,
+        timeout_in_seconds: Optional[int] = None,
+        instance_id: Optional[int] = None,
+        base_port: Optional[int] = None,
+    ):
         assert not self.running
-        self.env.neon_cli.storage_controller_start(timeout_in_seconds)
+        self.env.neon_cli.storage_controller_start(timeout_in_seconds, instance_id, base_port)
         self.running = True
         return self
 
@@ -2209,6 +2248,12 @@ class NeonStorageController(MetricsGetter, LogUtils):
             self.running = False
         return self
 
+    def upcall_api_endpoint(self) -> str:
+        return f"{self.api}/upcall/v1"
+
+    def api_root(self) -> str:
+        return self.api
+
     @staticmethod
     def retryable_node_operation(op, ps_id, max_attempts, backoff):
         while max_attempts > 0:
@@ -2237,7 +2282,9 @@ class NeonStorageController(MetricsGetter, LogUtils):
 
     def assert_no_errors(self):
         assert_no_errors(
-            self.env.repo_dir / "storage_controller.log", "storage_controller", self.allowed_errors
+            self.logfile,
+            "storage_controller",
+            self.allowed_errors,
         )
 
     def pageserver_api(self) -> PageserverHttpClient:
@@ -2249,7 +2296,7 @@ class NeonStorageController(MetricsGetter, LogUtils):
         auth_token = None
         if self.auth_enabled:
             auth_token = self.env.auth_keys.generate_token(scope=TokenScope.PAGE_SERVER_API)
-        return PageserverHttpClient(self.env.storage_controller_port, lambda: True, auth_token)
+        return PageserverHttpClient(self.port, lambda: True, auth_token)
 
     def request(self, method, *args, **kwargs) -> requests.Response:
         resp = requests.request(method, *args, **kwargs)
@@ -2266,13 +2313,13 @@ class NeonStorageController(MetricsGetter, LogUtils):
         return headers
 
     def get_metrics(self) -> Metrics:
-        res = self.request("GET", f"{self.env.storage_controller_api}/metrics")
+        res = self.request("GET", f"{self.api}/metrics")
         return parse_metrics(res.text)
 
     def ready(self) -> bool:
         status = None
         try:
-            resp = self.request("GET", f"{self.env.storage_controller_api}/ready")
+            resp = self.request("GET", f"{self.api}/ready")
             status = resp.status_code
         except StorageControllerApiException as e:
             status = e.status_code
@@ -2305,7 +2352,7 @@ class NeonStorageController(MetricsGetter, LogUtils):
 
         response = self.request(
             "POST",
-            f"{self.env.storage_controller_api}/debug/v1/attach-hook",
+            f"{self.api}/debug/v1/attach-hook",
             json=body,
             headers=self.headers(TokenScope.ADMIN),
         )
@@ -2316,7 +2363,7 @@ class NeonStorageController(MetricsGetter, LogUtils):
     def attach_hook_drop(self, tenant_shard_id: Union[TenantId, TenantShardId]):
         self.request(
             "POST",
-            f"{self.env.storage_controller_api}/debug/v1/attach-hook",
+            f"{self.api}/debug/v1/attach-hook",
             json={"tenant_shard_id": str(tenant_shard_id), "node_id": None},
             headers=self.headers(TokenScope.ADMIN),
         )
@@ -2327,7 +2374,7 @@ class NeonStorageController(MetricsGetter, LogUtils):
         """
         response = self.request(
             "POST",
-            f"{self.env.storage_controller_api}/debug/v1/inspect",
+            f"{self.api}/debug/v1/inspect",
             json={"tenant_shard_id": str(tenant_shard_id)},
             headers=self.headers(TokenScope.ADMIN),
         )
@@ -2350,7 +2397,7 @@ class NeonStorageController(MetricsGetter, LogUtils):
         log.info(f"node_register({body})")
         self.request(
             "POST",
-            f"{self.env.storage_controller_api}/control/v1/node",
+            f"{self.api}/control/v1/node",
             json=body,
             headers=self.headers(TokenScope.ADMIN),
         )
@@ -2359,7 +2406,7 @@ class NeonStorageController(MetricsGetter, LogUtils):
         log.info(f"node_delete({node_id})")
         self.request(
             "DELETE",
-            f"{self.env.storage_controller_api}/control/v1/node/{node_id}",
+            f"{self.api}/control/v1/node/{node_id}",
             headers=self.headers(TokenScope.ADMIN),
         )
 
@@ -2367,7 +2414,7 @@ class NeonStorageController(MetricsGetter, LogUtils):
         log.info(f"node_drain({node_id})")
         self.request(
             "PUT",
-            f"{self.env.storage_controller_api}/control/v1/node/{node_id}/drain",
+            f"{self.api}/control/v1/node/{node_id}/drain",
             headers=self.headers(TokenScope.ADMIN),
         )
 
@@ -2375,7 +2422,7 @@ class NeonStorageController(MetricsGetter, LogUtils):
         log.info(f"cancel_node_drain({node_id})")
         self.request(
             "DELETE",
-            f"{self.env.storage_controller_api}/control/v1/node/{node_id}/drain",
+            f"{self.api}/control/v1/node/{node_id}/drain",
             headers=self.headers(TokenScope.ADMIN),
         )
 
@@ -2383,7 +2430,7 @@ class NeonStorageController(MetricsGetter, LogUtils):
         log.info(f"node_fill({node_id})")
         self.request(
             "PUT",
-            f"{self.env.storage_controller_api}/control/v1/node/{node_id}/fill",
+            f"{self.api}/control/v1/node/{node_id}/fill",
             headers=self.headers(TokenScope.ADMIN),
         )
 
@@ -2391,14 +2438,22 @@ class NeonStorageController(MetricsGetter, LogUtils):
         log.info(f"cancel_node_fill({node_id})")
         self.request(
             "DELETE",
-            f"{self.env.storage_controller_api}/control/v1/node/{node_id}/fill",
+            f"{self.api}/control/v1/node/{node_id}/fill",
             headers=self.headers(TokenScope.ADMIN),
         )
 
     def node_status(self, node_id):
         response = self.request(
             "GET",
-            f"{self.env.storage_controller_api}/control/v1/node/{node_id}",
+            f"{self.api}/control/v1/node/{node_id}",
+            headers=self.headers(TokenScope.ADMIN),
+        )
+        return response.json()
+
+    def get_leader(self):
+        response = self.request(
+            "GET",
+            f"{self.api}/control/v1/leader",
             headers=self.headers(TokenScope.ADMIN),
         )
         return response.json()
@@ -2406,7 +2461,7 @@ class NeonStorageController(MetricsGetter, LogUtils):
     def node_list(self):
         response = self.request(
             "GET",
-            f"{self.env.storage_controller_api}/control/v1/node",
+            f"{self.api}/control/v1/node",
             headers=self.headers(TokenScope.ADMIN),
         )
         return response.json()
@@ -2414,7 +2469,7 @@ class NeonStorageController(MetricsGetter, LogUtils):
     def tenant_list(self):
         response = self.request(
             "GET",
-            f"{self.env.storage_controller_api}/debug/v1/tenant",
+            f"{self.api}/debug/v1/tenant",
             headers=self.headers(TokenScope.ADMIN),
         )
         return response.json()
@@ -2424,7 +2479,7 @@ class NeonStorageController(MetricsGetter, LogUtils):
         body["node_id"] = node_id
         self.request(
             "PUT",
-            f"{self.env.storage_controller_api}/control/v1/node/{node_id}/config",
+            f"{self.api}/control/v1/node/{node_id}/config",
             json=body,
             headers=self.headers(TokenScope.ADMIN),
         )
@@ -2459,7 +2514,7 @@ class NeonStorageController(MetricsGetter, LogUtils):
 
         response = self.request(
             "POST",
-            f"{self.env.storage_controller_api}/v1/tenant",
+            f"{self.api}/v1/tenant",
             json=body,
             headers=self.headers(TokenScope.PAGE_SERVER_API),
         )
@@ -2472,7 +2527,7 @@ class NeonStorageController(MetricsGetter, LogUtils):
         """
         response = self.request(
             "GET",
-            f"{self.env.storage_controller_api}/debug/v1/tenant/{tenant_id}/locate",
+            f"{self.api}/debug/v1/tenant/{tenant_id}/locate",
             headers=self.headers(TokenScope.ADMIN),
         )
         body = response.json()
@@ -2485,7 +2540,7 @@ class NeonStorageController(MetricsGetter, LogUtils):
         """
         response = self.request(
             "GET",
-            f"{self.env.storage_controller_api}/control/v1/tenant/{tenant_id}",
+            f"{self.api}/control/v1/tenant/{tenant_id}",
             headers=self.headers(TokenScope.ADMIN),
         )
         response.raise_for_status()
@@ -2496,7 +2551,7 @@ class NeonStorageController(MetricsGetter, LogUtils):
     ) -> list[TenantShardId]:
         response = self.request(
             "PUT",
-            f"{self.env.storage_controller_api}/control/v1/tenant/{tenant_id}/shard_split",
+            f"{self.api}/control/v1/tenant/{tenant_id}/shard_split",
             json={"new_shard_count": shard_count, "new_stripe_size": shard_stripe_size},
             headers=self.headers(TokenScope.ADMIN),
         )
@@ -2508,7 +2563,7 @@ class NeonStorageController(MetricsGetter, LogUtils):
     def tenant_shard_migrate(self, tenant_shard_id: TenantShardId, dest_ps_id: int):
         self.request(
             "PUT",
-            f"{self.env.storage_controller_api}/control/v1/tenant/{tenant_shard_id}/migrate",
+            f"{self.api}/control/v1/tenant/{tenant_shard_id}/migrate",
             json={"tenant_shard_id": str(tenant_shard_id), "node_id": dest_ps_id},
             headers=self.headers(TokenScope.ADMIN),
         )
@@ -2519,7 +2574,7 @@ class NeonStorageController(MetricsGetter, LogUtils):
         log.info(f"tenant_policy_update({tenant_id}, {body})")
         self.request(
             "PUT",
-            f"{self.env.storage_controller_api}/control/v1/tenant/{tenant_id}/policy",
+            f"{self.api}/control/v1/tenant/{tenant_id}/policy",
             json=body,
             headers=self.headers(TokenScope.ADMIN),
         )
@@ -2527,14 +2582,14 @@ class NeonStorageController(MetricsGetter, LogUtils):
     def tenant_import(self, tenant_id: TenantId):
         self.request(
             "POST",
-            f"{self.env.storage_controller_api}/debug/v1/tenant/{tenant_id}/import",
+            f"{self.api}/debug/v1/tenant/{tenant_id}/import",
             headers=self.headers(TokenScope.ADMIN),
         )
 
     def reconcile_all(self):
         r = self.request(
             "POST",
-            f"{self.env.storage_controller_api}/debug/v1/reconcile_all",
+            f"{self.api}/debug/v1/reconcile_all",
             headers=self.headers(TokenScope.ADMIN),
         )
         r.raise_for_status()
@@ -2567,7 +2622,7 @@ class NeonStorageController(MetricsGetter, LogUtils):
         """
         self.request(
             "POST",
-            f"{self.env.storage_controller_api}/debug/v1/consistency_check",
+            f"{self.api}/debug/v1/consistency_check",
             headers=self.headers(TokenScope.ADMIN),
         )
         log.info("storage controller passed consistency check")
@@ -2640,7 +2695,7 @@ class NeonStorageController(MetricsGetter, LogUtils):
 
         self.request(
             "POST",
-            f"{self.env.storage_controller_api}/control/v1/metadata_health/update",
+            f"{self.api}/control/v1/metadata_health/update",
             json=body,
             headers=self.headers(TokenScope.SCRUBBER),
         )
@@ -2648,7 +2703,7 @@ class NeonStorageController(MetricsGetter, LogUtils):
     def metadata_health_list_unhealthy(self):
         response = self.request(
             "GET",
-            f"{self.env.storage_controller_api}/control/v1/metadata_health/unhealthy",
+            f"{self.api}/control/v1/metadata_health/unhealthy",
             headers=self.headers(TokenScope.ADMIN),
         )
         return response.json()
@@ -2658,7 +2713,7 @@ class NeonStorageController(MetricsGetter, LogUtils):
 
         response = self.request(
             "POST",
-            f"{self.env.storage_controller_api}/control/v1/metadata_health/outdated",
+            f"{self.api}/control/v1/metadata_health/outdated",
             json=body,
             headers=self.headers(TokenScope.ADMIN),
         )
@@ -2681,7 +2736,7 @@ class NeonStorageController(MetricsGetter, LogUtils):
         log.info("Asking storage controller to step down")
         response = self.request(
             "PUT",
-            f"{self.env.storage_controller_api}/control/v1/step_down",
+            f"{self.api}/control/v1/step_down",
             headers=self.headers(TokenScope.ADMIN),
         )
 
@@ -2698,7 +2753,7 @@ class NeonStorageController(MetricsGetter, LogUtils):
 
         res = self.request(
             "PUT",
-            f"{self.env.storage_controller_api}/debug/v1/failpoints",
+            f"{self.api}/debug/v1/failpoints",
             json=[{"name": name, "actions": actions} for name, actions in pairs],
             headers=self.headers(TokenScope.ADMIN),
         )
@@ -2768,9 +2823,21 @@ class NeonStorageController(MetricsGetter, LogUtils):
                 parsed_tid, wait_ms=250
             )
 
-    @property
-    def workdir(self) -> Path:
-        return self.env.repo_dir
+    def get_leadership_status(self) -> StorageControllerLeadershipStatus:
+        metric_values = {}
+        for status in StorageControllerLeadershipStatus:
+            metric_value = self.get_metric_value(
+                "storage_controller_leadership_status", filter={"status": status}
+            )
+            metric_values[status] = metric_value
+
+        assert list(metric_values.values()).count(1) == 1
+
+        for status, metric_value in metric_values.items():
+            if metric_value == 1:
+                return status
+
+        raise AssertionError("unreachable")
 
     def __enter__(self) -> "NeonStorageController":
         return self
@@ -2784,6 +2851,59 @@ class NeonStorageController(MetricsGetter, LogUtils):
         self.stop(immediate=True)
 
 
+class NeonProxiedStorageController(NeonStorageController):
+    def __init__(self, env: NeonEnv, proxy_port: int, auth_enabled: bool):
+        super(NeonProxiedStorageController, self).__init__(env, proxy_port, auth_enabled)
+        self.instances: dict[int, dict[str, Any]] = {}
+
+    def start(
+        self,
+        timeout_in_seconds: Optional[int] = None,
+        instance_id: Optional[int] = None,
+        base_port: Optional[int] = None,
+    ):
+        assert instance_id is not None and base_port is not None
+
+        self.env.neon_cli.storage_controller_start(timeout_in_seconds, instance_id, base_port)
+        self.instances[instance_id] = {"running": True}
+
+        self.running = True
+        return self
+
+    def stop_instance(
+        self, immediate: bool = False, instance_id: Optional[int] = None
+    ) -> "NeonStorageController":
+        assert instance_id in self.instances
+        if self.instances[instance_id]["running"]:
+            self.env.neon_cli.storage_controller_stop(immediate, instance_id)
+            self.instances[instance_id]["running"] = False
+
+        self.running = any(meta["running"] for meta in self.instances.values())
+        return self
+
+    def stop(self, immediate: bool = False) -> "NeonStorageController":
+        for iid, details in self.instances.items():
+            if details["running"]:
+                self.env.neon_cli.storage_controller_stop(immediate, iid)
+                self.instances[iid]["running"] = False
+
+        self.running = False
+        return self
+
+    def assert_no_errors(self):
+        for instance_id in self.instances.keys():
+            assert_no_errors(
+                self.env.repo_dir / f"storage_controller_{instance_id}" / "storage_controller.log",
+                "storage_controller",
+                self.allowed_errors,
+            )
+
+    def log_contains(
+        self, pattern: str, offset: None | LogCursor = None
+    ) -> Optional[Tuple[str, LogCursor]]:
+        raise NotImplementedError()
+
+
 @dataclass
 class LogCursor:
     _line_no: int
@@ -4520,7 +4640,7 @@ class StorageScrubber:
 
         base_args = [
             str(self.env.neon_binpath / "storage_scrubber"),
-            f"--controller-api={self.env.storage_controller_api}",
+            f"--controller-api={self.env.storage_controller.api_root()}",
         ]
         args = base_args + args
 
diff --git a/test_runner/fixtures/storage_controller_proxy.py b/test_runner/fixtures/storage_controller_proxy.py
new file mode 100644
index 0000000000..3477f8b1f2
--- /dev/null
+++ b/test_runner/fixtures/storage_controller_proxy.py
@@ -0,0 +1,73 @@
+import re
+from typing import Any, Optional
+
+import pytest
+import requests
+from pytest_httpserver import HTTPServer
+from werkzeug.datastructures import Headers
+from werkzeug.wrappers.request import Request
+from werkzeug.wrappers.response import Response
+
+from fixtures.log_helper import log
+
+
+class StorageControllerProxy:
+    def __init__(self, server: HTTPServer):
+        self.server: HTTPServer = server
+        self.listen: str = f"http://{server.host}:{server.port}"
+        self.routing_to: Optional[str] = None
+
+    def route_to(self, storage_controller_api: str):
+        self.routing_to = storage_controller_api
+
+    def port(self) -> int:
+        return self.server.port
+
+    def upcall_api_endpoint(self) -> str:
+        return f"{self.listen}/upcall/v1"
+
+
+def proxy_request(method: str, url: str, **kwargs) -> requests.Response:
+    return requests.request(method, url, **kwargs)
+
+
+@pytest.fixture(scope="function")
+def storage_controller_proxy(make_httpserver):
+    """
+    Proxies requests into the storage controller to the currently
+    selected storage controller instance via `StorageControllerProxy.route_to`.
+
+    This fixture is intended for tests that need to run multiple instances
+    of the storage controller at the same time.
+    """
+    server = make_httpserver
+
+    self = StorageControllerProxy(server)
+
+    log.info(f"Storage controller proxy listening on {self.listen}")
+
+    def handler(request: Request):
+        if self.routing_to is None:  # no backend selected via route_to() yet
+            log.info(f"Storage controller proxy has no routing configured for {request.url}")
+            return Response("Routing not configured", status=503)
+
+        route_to_url = f"{self.routing_to}{request.path}"
+
+        log.info(f"Routing {request.url} to {route_to_url}")
+
+        args: dict[str, Any] = {"headers": request.headers}
+        if request.is_json:
+            args["json"] = request.json
+
+        response = proxy_request(request.method, route_to_url, **args)
+
+        headers = Headers()
+        for key, value in response.headers.items():
+            headers.add(key, value)
+
+        return Response(response.content, headers=headers, status=response.status_code)
+
+    self.server.expect_request(re.compile(".*")).respond_with_handler(handler)
+
+    yield self
+    server.clear()
diff --git a/test_runner/fixtures/utils.py b/test_runner/fixtures/utils.py
index 4dc9f7caae..80f1c9e4e3 100644
--- a/test_runner/fixtures/utils.py
+++ b/test_runner/fixtures/utils.py
@@ -403,7 +403,7 @@ def wait_until(
         try:
             res = func()
         except Exception as e:
-            log.info("waiting for %s iteration %s failed", func, i + 1)
+            log.info("waiting for %s iteration %s failed: %s", func, i + 1, e)
             last_exception = e
             if show_intermediate_error:
                 log.info(e)
diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py
index 7d98ff2923..95c35e9641 100644
--- a/test_runner/regress/test_storage_controller.py
+++ b/test_runner/regress/test_storage_controller.py
@@ -1,3 +1,4 @@
+import concurrent.futures
 import json
 import threading
 import time
@@ -16,6 +17,7 @@ from fixtures.neon_fixtures import (
     PageserverSchedulingPolicy,
     PgBin,
     StorageControllerApiException,
+    StorageControllerLeadershipStatus,
     TokenScope,
     last_flush_lsn_upload,
 )
@@ -30,7 +32,9 @@ from fixtures.pageserver.utils import (
     timeline_delete_wait_completed,
 )
 from fixtures.pg_version import PgVersion
+from fixtures.port_distributor import PortDistributor
 from fixtures.remote_storage import RemoteStorageKind, s3_storage
+from fixtures.storage_controller_proxy import StorageControllerProxy
 from fixtures.utils import run_pg_bench_small, subprocess_capture, wait_until
 from fixtures.workload import Workload
 from mypy_boto3_s3.type_defs import (
@@ -2093,6 +2097,131 @@ def test_storage_controller_step_down(neon_env_builder: NeonEnvBuilder):
     )
 
 
+# This is a copy of NeonEnv.start which injects the instance id and port
+# into the call to NeonStorageController.start
+def start_env(env: NeonEnv, storage_controller_port: int):
+    timeout_in_seconds = 30
+
+    # Storage controller starts first, so that pageserver /re-attach calls don't
+    # bounce through retries on startup
+    env.storage_controller.start(timeout_in_seconds, 1, storage_controller_port)
+
+    # Wait for storage controller readiness to prevent unnecessary post start-up
+    # reconcile.
+    env.storage_controller.wait_until_ready()
+
+    # Start up broker, pageserver and all safekeepers
+    futs = []
+    with concurrent.futures.ThreadPoolExecutor(
+        max_workers=2 + len(env.pageservers) + len(env.safekeepers)
+    ) as executor:
+        futs.append(
+            executor.submit(lambda: env.broker.try_start() or None)
+        )  # The `or None` is for the linter
+
+        for pageserver in env.pageservers:
+            futs.append(
+                executor.submit(
+                    lambda ps=pageserver: ps.start(timeout_in_seconds=timeout_in_seconds)
+                )
+            )
+
+        for safekeeper in env.safekeepers:
+            futs.append(
+                executor.submit(
+                    lambda sk=safekeeper: sk.start(timeout_in_seconds=timeout_in_seconds)
+                )
+            )
+
+    for f in futs:
+        f.result()
+
+
+@pytest.mark.parametrize("step_down_times_out", [False, True])
+def test_storage_controller_leadership_transfer(
+    neon_env_builder: NeonEnvBuilder,
+    storage_controller_proxy: StorageControllerProxy,
+    port_distributor: PortDistributor,
+    step_down_times_out: bool,
+):
+    neon_env_builder.num_pageservers = 3
+
+    neon_env_builder.storage_controller_config = {
+        "database_url": f"127.0.0.1:{port_distributor.get_port()}",
+        "start_as_candidate": True,
+    }
+
+    neon_env_builder.storage_controller_port_override = storage_controller_proxy.port()
+
+    storage_controller_1_port = port_distributor.get_port()
+    storage_controller_2_port = port_distributor.get_port()
+
+    storage_controller_proxy.route_to(f"http://127.0.0.1:{storage_controller_1_port}")
+
+    env = neon_env_builder.init_configs()
+    start_env(env, storage_controller_1_port)
+
+    assert (
+        env.storage_controller.get_leadership_status() == StorageControllerLeadershipStatus.LEADER
+    )
+    leader = env.storage_controller.get_leader()
+    assert leader["address"] == f"http://127.0.0.1:{storage_controller_1_port}/"
+
+    if step_down_times_out:
+        env.storage_controller.configure_failpoints(
+            ("sleep-on-step-down-handling", "return(10000)")
+        )
+        env.storage_controller.allowed_errors.append(".*request was dropped before completing.*")
+
+    tenant_count = 2
+    shard_count = 4
+    tenants = set(TenantId.generate() for _ in range(0, tenant_count))
+
+    for tid in tenants:
+        env.storage_controller.tenant_create(
+            tid, shard_count=shard_count, placement_policy={"Attached": 1}
+        )
+    env.storage_controller.reconcile_until_idle()
+
+    env.storage_controller.start(
+        timeout_in_seconds=30, instance_id=2, base_port=storage_controller_2_port
+    )
+
+    if not step_down_times_out:
+
+        def previous_stepped_down():
+            assert (
+                env.storage_controller.get_leadership_status()
+                == StorageControllerLeadershipStatus.STEPPED_DOWN
+            )
+
+        wait_until(5, 1, previous_stepped_down)
+
+    storage_controller_proxy.route_to(f"http://127.0.0.1:{storage_controller_2_port}")
+
+    def new_becomes_leader():
+        assert (
+            env.storage_controller.get_leadership_status()
+            == StorageControllerLeadershipStatus.LEADER
+        )
+
+    wait_until(15, 1, new_becomes_leader)
+    leader = env.storage_controller.get_leader()
+    assert leader["address"] == f"http://127.0.0.1:{storage_controller_2_port}/"
+
+    env.storage_controller.wait_until_ready()
+    env.storage_controller.consistency_check()
+
+    if step_down_times_out:
+        env.storage_controller.allowed_errors.extend(
+            [
+                ".*Leader.*did not respond to step-down request.*",
+                ".*Send step down request failed.*",
+                ".*Send step down request still failed.*",
+            ]
+        )
+
+
 def test_storage_controller_ps_restarted_during_drain(neon_env_builder: NeonEnvBuilder):
     # single unsharded tenant, two locations
     neon_env_builder.num_pageservers = 2

From 25e7d321f474e5cbc5ac53ed42de697a48db50db Mon Sep 17 00:00:00 2001
From: Arseny Sher <sher-ars@yandex.ru>
Date: Tue, 6 Aug 2024 12:51:05 +0300
Subject: [PATCH 13/44] safekeeper: cross check divergence point in
 ProposerElected handling.

Previously, we protected from multiple ProposerElected messages from the same
walproposer with the following condition:

msg.term == self.get_last_log_term() && self.flush_lsn() >
msg.start_streaming_at

It is not exhaustive, i.e. we could still proceed to truncating WAL even though
safekeeper inserted something since the divergence point has been
calculated. While it was most likely safe because walproposer can't use
safekeeper position to commit WAL until last_log_term reaches the current
walproposer term, let's be more careful and properly calculate the divergence
point like walproposer does.
---
 safekeeper/src/safekeeper.rs | 62 +++++++++++++++++++++++++-----------
 1 file changed, 43 insertions(+), 19 deletions(-)

diff --git a/safekeeper/src/safekeeper.rs b/safekeeper/src/safekeeper.rs
index 33ec39b852..0814d9ba67 100644
--- a/safekeeper/src/safekeeper.rs
+++ b/safekeeper/src/safekeeper.rs
@@ -92,7 +92,7 @@ impl TermHistory {
     }
 
     /// Find point of divergence between leader (walproposer) term history and
-    /// safekeeper. Arguments are not symmetrics as proposer history ends at
+    /// safekeeper. Arguments are not symmetric as proposer history ends at
     /// +infinity while safekeeper at flush_lsn.
     /// C version is at walproposer SendProposerElected.
     pub fn find_highest_common_point(
@@ -701,7 +701,13 @@ where
             .with_label_values(&["handle_elected"])
             .start_timer();
 
-        info!("received ProposerElected {:?}", msg);
+        info!(
+            "received ProposerElected {:?}, term={}, last_log_term={}, flush_lsn={}",
+            msg,
+            self.state.acceptor_state.term,
+            self.get_last_log_term(),
+            self.flush_lsn()
+        );
         if self.state.acceptor_state.term < msg.term {
             let mut state = self.state.start_change();
             state.acceptor_state.term = msg.term;
@@ -713,22 +719,43 @@ where
             return Ok(None);
         }
 
-        // This might happen in a rare race when another (old) connection from
-        // the same walproposer writes + flushes WAL after this connection
-        // already sent flush_lsn in VoteRequest. It is generally safe to
-        // proceed, but to prevent commit_lsn surprisingly going down we should
-        // either refuse the session (simpler) or skip the part we already have
-        // from the stream (can be implemented).
-        if msg.term == self.get_last_log_term() && self.flush_lsn() > msg.start_streaming_at {
-            bail!("refusing ProposerElected which is going to overwrite correct WAL: term={}, flush_lsn={}, start_streaming_at={}; restarting the handshake should help",
-                   msg.term, self.flush_lsn(), msg.start_streaming_at)
+        // Before truncating WAL, cross-check the divergence point received
+        // from the walproposer.
+        let sk_th = self.get_term_history();
+        let last_common_point = match TermHistory::find_highest_common_point(
+            &msg.term_history,
+            &sk_th,
+            self.flush_lsn(),
+        ) {
+            // No common point. Expect streaming from the beginning of the
+            // history like walproposer while we don't have proper init.
+            None => *msg.term_history.0.first().ok_or(anyhow::anyhow!(
+                "empty walproposer term history {:?}",
+                msg.term_history
+            ))?,
+            Some(lcp) => lcp,
+        };
+        // This is expected to happen in a rare race when another connection
+        // from the same walproposer writes + flushes WAL after this connection
+        // sent flush_lsn in VoteRequest; for instance, very late
+        // ProposerElected message delivery after another connection was
+        // established and wrote WAL. In such cases error is transient;
+        // reconnection makes safekeeper send newest term history and flush_lsn
+        // and walproposer recalculates the streaming point. OTOH repeating
+        // error indicates a serious bug.
+        if last_common_point.lsn != msg.start_streaming_at {
+            bail!("refusing ProposerElected with unexpected truncation point: lcp={:?} start_streaming_at={}, term={}, sk_th={:?} flush_lsn={}, wp_th={:?}",
+                    last_common_point, msg.start_streaming_at,
+                    self.state.acceptor_state.term, sk_th, self.flush_lsn(), msg.term_history,
+            );
         }
-        // Otherwise we must never attempt to truncate committed data.
+
+        // We are also expected to never attempt to truncate committed data.
         assert!(
             msg.start_streaming_at >= self.state.inmem.commit_lsn,
-            "attempt to truncate committed data: start_streaming_at={}, commit_lsn={}",
-            msg.start_streaming_at,
-            self.state.inmem.commit_lsn
+            "attempt to truncate committed data: start_streaming_at={}, commit_lsn={}, term={}, sk_th={:?} flush_lsn={}, wp_th={:?}",
+            msg.start_streaming_at, self.state.inmem.commit_lsn,
+            self.state.acceptor_state.term, sk_th, self.flush_lsn(), msg.term_history,
         );
 
         // Before first WAL write initialize its segment. It makes first segment
@@ -743,9 +770,6 @@ where
                 .await?;
         }
 
-        // TODO: cross check divergence point, check if msg.start_streaming_at corresponds to
-        // intersection of our history and history from msg
-
         // truncate wal, update the LSNs
         self.wal_store.truncate_wal(msg.start_streaming_at).await?;
 
@@ -1069,7 +1093,7 @@ mod tests {
 
         let pem = ProposerElected {
             term: 1,
-            start_streaming_at: Lsn(1),
+            start_streaming_at: Lsn(3),
             term_history: TermHistory(vec![TermLsn {
                 term: 1,
                 lsn: Lsn(3),

From e2d89f7991bc9cea88661e50722a02346b7b6485 Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Fri, 16 Aug 2024 13:35:02 +0100
Subject: [PATCH 14/44] pageserver: prioritize secondary downloads to get most
 recent layers first, except l0s (#8729)

## Problem

When a secondary location is trying to catch up while a tenant is
receiving new writes, it can become quite wasteful:
- Downloading L0s which are soon destroyed by compaction to L1s
- Downloading older layer files which are soon made irrelevant when
covered by image layers.

## Summary of changes

Sort the layer files in the heatmap:
- L0 layers are the lowest priority
- Other layers are sorted to download the highest LSNs first.
---
 pageserver/src/tenant/secondary/heatmap.rs |   8 +-
 pageserver/src/tenant/timeline.rs          | 134 +++++++++++++++++++--
 2 files changed, 130 insertions(+), 12 deletions(-)

diff --git a/pageserver/src/tenant/secondary/heatmap.rs b/pageserver/src/tenant/secondary/heatmap.rs
index 166483ba5d..4a8e66d38a 100644
--- a/pageserver/src/tenant/secondary/heatmap.rs
+++ b/pageserver/src/tenant/secondary/heatmap.rs
@@ -29,16 +29,16 @@ pub(super) struct HeatMapTenant {
 #[derive(Serialize, Deserialize)]
 pub(crate) struct HeatMapTimeline {
     #[serde_as(as = "DisplayFromStr")]
-    pub(super) timeline_id: TimelineId,
+    pub(crate) timeline_id: TimelineId,
 
-    pub(super) layers: Vec<HeatMapLayer>,
+    pub(crate) layers: Vec<HeatMapLayer>,
 }
 
 #[serde_as]
 #[derive(Serialize, Deserialize)]
 pub(crate) struct HeatMapLayer {
-    pub(super) name: LayerName,
-    pub(super) metadata: LayerFileMetadata,
+    pub(crate) name: LayerName,
+    pub(crate) metadata: LayerFileMetadata,
 
     #[serde_as(as = "TimestampSeconds<i64>")]
     pub(super) access_time: SystemTime,
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index 01e77fa1b1..26dc87c373 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -2977,11 +2977,7 @@ impl Timeline {
                 LayerVisibilityHint::Visible => {
                     // Layer is visible to one or more read LSNs: elegible for inclusion in layer map
                     let last_activity_ts = layer.latest_activity();
-                    Some(HeatMapLayer::new(
-                        layer.layer_desc().layer_name(),
-                        layer.metadata(),
-                        last_activity_ts,
-                    ))
+                    Some((layer.layer_desc(), layer.metadata(), last_activity_ts))
                 }
                 LayerVisibilityHint::Covered => {
                     // Layer is resident but unlikely to be read: not elegible for inclusion in heatmap.
@@ -2990,7 +2986,23 @@ impl Timeline {
             }
         });
 
-        let layers = resident.collect();
+        let mut layers = resident.collect::<Vec<_>>();
+
+        // Sort layers in order of which to download first.  For a large set of layers to download, we
+        // want to prioritize those layers which are most likely to still be resident many minutes
+        // or hours later:
+        // - Download L0s last, because they churn the fastest: L0s on a fast-writing tenant might
+        //   only exist for a few minutes before being compacted into L1s.
+        // - For L1 & image layers, download most recent LSNs first: the older the LSN, the sooner
+        //   the layer is likely to be covered by an image layer during compaction.
+        layers.sort_by_key(|(desc, _meta, _atime)| {
+            std::cmp::Reverse((!LayerMap::is_l0(&desc.key_range), desc.lsn_range.end))
+        });
+
+        let layers = layers
+            .into_iter()
+            .map(|(desc, meta, atime)| HeatMapLayer::new(desc.layer_name(), meta, atime))
+            .collect();
 
         Some(HeatMapTimeline::new(self.timeline_id, layers))
     }
@@ -4516,6 +4528,7 @@ impl DurationRecorder {
 /// the layer descriptor requires the user to provide the ranges, which should cover all
 /// keys specified in the `data` field.
 #[cfg(test)]
+#[derive(Clone)]
 pub struct DeltaLayerTestDesc {
     pub lsn_range: Range<Lsn>,
     pub key_range: Range<Key>,
@@ -4545,6 +4558,13 @@ impl DeltaLayerTestDesc {
             data,
         }
     }
+
+    pub(crate) fn layer_name(&self) -> LayerName {
+        LayerName::Delta(super::storage_layer::DeltaLayerName {
+            key_range: self.key_range.clone(),
+            lsn_range: self.lsn_range.clone(),
+        })
+    }
 }
 
 impl Timeline {
@@ -5768,12 +5788,110 @@ fn is_send() {
 
 #[cfg(test)]
 mod tests {
+    use pageserver_api::key::Key;
     use utils::{id::TimelineId, lsn::Lsn};
 
-    use crate::tenant::{
-        harness::TenantHarness, storage_layer::Layer, timeline::EvictionError, Timeline,
+    use crate::{
+        repository::Value,
+        tenant::{
+            harness::{test_img, TenantHarness},
+            layer_map::LayerMap,
+            storage_layer::{Layer, LayerName},
+            timeline::{DeltaLayerTestDesc, EvictionError},
+            Timeline,
+        },
     };
 
+    #[tokio::test]
+    async fn test_heatmap_generation() {
+        let harness = TenantHarness::create("heatmap_generation").await.unwrap();
+
+        let covered_delta = DeltaLayerTestDesc::new_with_inferred_key_range(
+            Lsn(0x10)..Lsn(0x20),
+            vec![(
+                Key::from_hex("620000000033333333444444445500000000").unwrap(),
+                Lsn(0x11),
+                Value::Image(test_img("foo")),
+            )],
+        );
+        let visible_delta = DeltaLayerTestDesc::new_with_inferred_key_range(
+            Lsn(0x10)..Lsn(0x20),
+            vec![(
+                Key::from_hex("720000000033333333444444445500000000").unwrap(),
+                Lsn(0x11),
+                Value::Image(test_img("foo")),
+            )],
+        );
+        let l0_delta = DeltaLayerTestDesc::new(
+            Lsn(0x20)..Lsn(0x30),
+            Key::from_hex("000000000000000000000000000000000000").unwrap()
+                ..Key::from_hex("FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF").unwrap(),
+            vec![(
+                Key::from_hex("720000000033333333444444445500000000").unwrap(),
+                Lsn(0x25),
+                Value::Image(test_img("foo")),
+            )],
+        );
+        let delta_layers = vec![
+            covered_delta.clone(),
+            visible_delta.clone(),
+            l0_delta.clone(),
+        ];
+
+        let image_layer = (
+            Lsn(0x40),
+            vec![(
+                Key::from_hex("620000000033333333444444445500000000").unwrap(),
+                test_img("bar"),
+            )],
+        );
+        let image_layers = vec![image_layer];
+
+        let (tenant, ctx) = harness.load().await;
+        let timeline = tenant
+            .create_test_timeline_with_layers(
+                TimelineId::generate(),
+                Lsn(0x10),
+                14,
+                &ctx,
+                delta_layers,
+                image_layers,
+                Lsn(0x100),
+            )
+            .await
+            .unwrap();
+
+        // Layer visibility is an input to heatmap generation, so refresh it first
+        timeline.update_layer_visibility().await.unwrap();
+
+        let heatmap = timeline
+            .generate_heatmap()
+            .await
+            .expect("Infallible while timeline is not shut down");
+
+        assert_eq!(heatmap.timeline_id, timeline.timeline_id);
+
+        // L0 should come last
+        assert_eq!(heatmap.layers.last().unwrap().name, l0_delta.layer_name());
+
+        let mut last_lsn = Lsn::MAX;
+        for layer in heatmap.layers {
+            // Covered layer should be omitted
+            assert!(layer.name != covered_delta.layer_name());
+
+            let layer_lsn = match &layer.name {
+                LayerName::Delta(d) => d.lsn_range.end,
+                LayerName::Image(i) => i.lsn,
+            };
+
+            // Apart from L0s, newest Layers should come first
+            if !LayerMap::is_l0(layer.name.key_range()) {
+                assert!(layer_lsn <= last_lsn);
+                last_lsn = layer_lsn;
+            }
+        }
+    }
+
     #[tokio::test]
     async fn two_layer_eviction_attempts_at_the_same_time() {
         let harness = TenantHarness::create("two_layer_eviction_attempts_at_the_same_time")

From c6b6b7700a31dc945276ccd091d33373548f518c Mon Sep 17 00:00:00 2001
From: Sasha Krassovsky <sasha@neon.tech>
Date: Fri, 16 Aug 2024 11:13:18 -0700
Subject: [PATCH 15/44] Fix superuser check in test_snap_files (#8749)

## Problem
The current superuser check always passes because `cur.fetchall()[0]` returns
a row tuple like `(False,)`, which is non-empty and therefore truthy, so the
`assert is_super` never fails.

## Summary of changes
Fixes the issue by unwrapping the tuple. Verified that it works against
a project where I don't have superuser.
---
 test_runner/performance/test_logical_replication.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/test_runner/performance/test_logical_replication.py b/test_runner/performance/test_logical_replication.py
index c4e42a7834..077f73ac06 100644
--- a/test_runner/performance/test_logical_replication.py
+++ b/test_runner/performance/test_logical_replication.py
@@ -282,15 +282,16 @@ def test_snap_files(
 
     env = benchmark_project_pub.pgbench_env
     connstr = benchmark_project_pub.connstr
-    pg_bin.run_capture(["pgbench", "-i", "-s100"], env=env)
 
     with psycopg2.connect(connstr) as conn:
         conn.autocommit = True
         with conn.cursor() as cur:
             cur.execute("SELECT rolsuper FROM pg_roles WHERE rolname = 'neondb_owner'")
-            is_super = cur.fetchall()[0]
+            is_super = cur.fetchall()[0][0]
             assert is_super, "This benchmark won't work if we don't have superuser"
 
+    pg_bin.run_capture(["pgbench", "-i", "-s100"], env=env)
+
     conn = psycopg2.connect(connstr)
     conn.autocommit = True
     cur = conn.cursor()

From 2be69af6c3a595c90f747dabe44fe898b59375c9 Mon Sep 17 00:00:00 2001
From: Konstantin Knizhnik <knizhnik@garret.ru>
Date: Fri, 16 Aug 2024 22:19:44 +0300
Subject: [PATCH 16/44] Track holes to be able to reuse them once LFC limit is
 increased (#8575)

## Problem

Repeatedly increasing and decreasing the LFC limit may cause unlimited growth
of the LFC file, because holes punched while shrinking the LFC are not reused
when the LFC is extended again.

## Summary of changes

Keep track of holes and reuse them when the LFC size is increased.

## Checklist before requesting a review

- [ ] I have performed a self-review of my code.
- [ ] If it is a core feature, I have added thorough tests.
- [ ] Do we need to implement analytics? if so did you add the relevant
metrics to the dashboard?
- [ ] If this PR requires public announcement, mark it with
/release-notes label and add several sentences in this section.

## Checklist before merging

- [ ] Do not forget to reformat commit message to not include the above
checklist

---------

Co-authored-by: Konstantin Knizhnik <knizhnik@neon.tech>
---
 pgxn/neon/file_cache.c                 | 117 ++++++++++++++++++++-----
 pgxn/neon/neon_pgversioncompat.h       |   4 +
 test_runner/regress/test_lfc_resize.py |  28 ++++--
 3 files changed, 119 insertions(+), 30 deletions(-)

diff --git a/pgxn/neon/file_cache.c b/pgxn/neon/file_cache.c
index 1894e8c72a..479209a537 100644
--- a/pgxn/neon/file_cache.c
+++ b/pgxn/neon/file_cache.c
@@ -41,6 +41,8 @@
 
 #include "hll.h"
 
+#define CriticalAssert(cond) do if (!(cond)) elog(PANIC, "Assertion %s failed at %s:%d: ", #cond, __FILE__, __LINE__); while (0)
+
 /*
  * Local file cache is used to temporary store relations pages in local file system.
  * All blocks of all relations are stored inside one file and addressed using shared hash map.
@@ -51,19 +53,43 @@
  *
  * Cache is always reconstructed at node startup, so we do not need to save mapping somewhere and worry about
  * its consistency.
+
+ *
+ * ## Holes
+ *
+ * The LFC can be resized on the fly, up to a maximum size that's determined
+ * at server startup (neon.max_file_cache_size). After server startup, we
+ * expand the underlying file when needed, until it reaches the soft limit
+ * (neon.file_cache_size_limit). If the soft limit is later reduced, we shrink
+ * the LFC by punching holes in the underlying file with a
+ * fallocate(FALLOC_FL_PUNCH_HOLE) call. The nominal size of the file doesn't
+ * shrink, but the disk space it uses does.
+ *
+ * Each hole is tracked by a dummy FileCacheEntry, which are kept in the
+ * 'holes' linked list. They are entered into the chunk hash table, with a
+ * special key where the blockNumber is used to store the 'offset' of the
+ * hole, and all other fields are zero. Holes are never looked up in the hash
+ * table, we only enter them there to have a FileCacheEntry that we can keep
+ * in the linked list. If the soft limit is raised again, we reuse the holes
+ * before extending the nominal size of the file.
  */
 
 /* Local file storage allocation chunk.
- * Should be power of two and not less than 32. Using larger than page chunks can
+ * Should be power of two. Using larger than page chunks can
  * 1. Reduce hash-map memory footprint: 8TB database contains billion pages
  *    and size of hash entry is 40 bytes, so we need 40Gb just for hash map.
  *    1Mb chunks can reduce hash map size to 320Mb.
  * 2. Improve access locality, subsequent pages will be allocated together improving seqscan speed
  */
 #define BLOCKS_PER_CHUNK	128 /* 1Mb chunk */
+/*
+ * Smaller chunk seems to be better for OLTP workload
+ */
+// #define BLOCKS_PER_CHUNK	8 /* 64kb chunk */
 #define MB					((uint64)1024*1024)
 
 #define SIZE_MB_TO_CHUNKS(size) ((uint32)((size) * MB / BLCKSZ / BLOCKS_PER_CHUNK))
+#define CHUNK_BITMAP_SIZE ((BLOCKS_PER_CHUNK + 31) / 32)
 
 typedef struct FileCacheEntry
 {
@@ -71,8 +97,8 @@ typedef struct FileCacheEntry
 	uint32		hash;
 	uint32		offset;
 	uint32		access_count;
-	uint32		bitmap[BLOCKS_PER_CHUNK / 32];
-	dlist_node	lru_node;		/* LRU list node */
+	uint32		bitmap[CHUNK_BITMAP_SIZE];
+	dlist_node	list_node;		/* LRU/holes list node */
 } FileCacheEntry;
 
 typedef struct FileCacheControl
@@ -87,6 +113,7 @@ typedef struct FileCacheControl
 	uint64		writes;
 	dlist_head	lru;			/* double linked list for LRU replacement
 								 * algorithm */
+	dlist_head  holes;          /* double linked list of punched holes */
 	HyperLogLogState wss_estimation; /* estimation of working set size */
 } FileCacheControl;
 
@@ -135,6 +162,7 @@ lfc_disable(char const *op)
 		lfc_ctl->used = 0;
 		lfc_ctl->limit = 0;
 		dlist_init(&lfc_ctl->lru);
+		dlist_init(&lfc_ctl->holes);
 
 		if (lfc_desc > 0)
 		{
@@ -214,18 +242,18 @@ lfc_shmem_startup(void)
 	if (!found)
 	{
 		int			fd;
-		uint32		lfc_size = SIZE_MB_TO_CHUNKS(lfc_max_size);
+		uint32		n_chunks = SIZE_MB_TO_CHUNKS(lfc_max_size);
 
 		lfc_lock = (LWLockId) GetNamedLWLockTranche("lfc_lock");
 		info.keysize = sizeof(BufferTag);
 		info.entrysize = sizeof(FileCacheEntry);
 
 		/*
-		 * lfc_size+1 because we add new element to hash table before eviction
+		 * n_chunks+1 because we add new element to hash table before eviction
 		 * of victim
 		 */
 		lfc_hash = ShmemInitHash("lfc_hash",
-								 lfc_size + 1, lfc_size + 1,
+								 n_chunks + 1, n_chunks + 1,
 								 &info,
 								 HASH_ELEM | HASH_BLOBS);
 		lfc_ctl->generation = 0;
@@ -235,6 +263,7 @@ lfc_shmem_startup(void)
 		lfc_ctl->misses = 0;
 		lfc_ctl->writes = 0;
 		dlist_init(&lfc_ctl->lru);
+		dlist_init(&lfc_ctl->holes);
 
 		/* Initialize hyper-log-log structure for estimating working set size */
 		initSHLL(&lfc_ctl->wss_estimation);
@@ -310,14 +339,31 @@ lfc_change_limit_hook(int newval, void *extra)
 		 * Shrink cache by throwing away least recently accessed chunks and
 		 * returning their space to file system
 		 */
-		FileCacheEntry *victim = dlist_container(FileCacheEntry, lru_node, dlist_pop_head_node(&lfc_ctl->lru));
+		FileCacheEntry *victim = dlist_container(FileCacheEntry, list_node, dlist_pop_head_node(&lfc_ctl->lru));
+		FileCacheEntry *hole;
+		uint32		offset = victim->offset;
+		uint32		hash;
+		bool		found;
+		BufferTag	holetag;
 
-		Assert(victim->access_count == 0);
+		CriticalAssert(victim->access_count == 0);
 #ifdef FALLOC_FL_PUNCH_HOLE
 		if (fallocate(lfc_desc, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, (off_t) victim->offset * BLOCKS_PER_CHUNK * BLCKSZ, BLOCKS_PER_CHUNK * BLCKSZ) < 0)
 			neon_log(LOG, "Failed to punch hole in file: %m");
 #endif
+		/* We remove the old entry, and re-enter a hole to the hash table */
 		hash_search_with_hash_value(lfc_hash, &victim->key, victim->hash, HASH_REMOVE, NULL);
+
+		memset(&holetag, 0, sizeof(holetag));
+		holetag.blockNum = offset;
+		hash = get_hash_value(lfc_hash, &holetag);
+		hole = hash_search_with_hash_value(lfc_hash, &holetag, hash, HASH_ENTER, &found);
+		hole->hash = hash;
+		hole->offset = offset;
+		hole->access_count = 0;
+		CriticalAssert(!found);
+		dlist_push_tail(&lfc_ctl->holes, &hole->list_node);
+
 		lfc_ctl->used -= 1;
 	}
 	lfc_ctl->limit = new_size;
@@ -409,6 +455,8 @@ lfc_cache_contains(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno)
 	CopyNRelFileInfoToBufTag(tag, rinfo);
 	tag.forkNum = forkNum;
 	tag.blockNum = blkno & ~(BLOCKS_PER_CHUNK - 1);
+
+	CriticalAssert(BufTagGetRelNumber(&tag) != InvalidRelFileNumber);
 	hash = get_hash_value(lfc_hash, &tag);
 
 	LWLockAcquire(lfc_lock, LW_SHARED);
@@ -440,6 +488,7 @@ lfc_evict(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno)
 	tag.forkNum = forkNum;
 	tag.blockNum = (blkno & ~(BLOCKS_PER_CHUNK - 1));
 
+	CriticalAssert(BufTagGetRelNumber(&tag) != InvalidRelFileNumber);
 	hash = get_hash_value(lfc_hash, &tag);
 
 	LWLockAcquire(lfc_lock, LW_EXCLUSIVE);
@@ -470,7 +519,7 @@ lfc_evict(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno)
 	{
 		bool		has_remaining_pages;
 
-		for (int i = 0; i < (BLOCKS_PER_CHUNK / 32); i++)
+		for (int i = 0; i < CHUNK_BITMAP_SIZE; i++)
 		{
 			if (entry->bitmap[i] != 0)
 			{
@@ -485,8 +534,8 @@ lfc_evict(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno)
 		 */
 		if (!has_remaining_pages)
 		{
-			dlist_delete(&entry->lru_node);
-			dlist_push_head(&lfc_ctl->lru, &entry->lru_node);
+			dlist_delete(&entry->list_node);
+			dlist_push_head(&lfc_ctl->lru, &entry->list_node);
 		}
 	}
 
@@ -525,6 +574,8 @@ lfc_read(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
 	CopyNRelFileInfoToBufTag(tag, rinfo);
 	tag.forkNum = forkNum;
 	tag.blockNum = blkno & ~(BLOCKS_PER_CHUNK - 1);
+
+	CriticalAssert(BufTagGetRelNumber(&tag) != InvalidRelFileNumber);
 	hash = get_hash_value(lfc_hash, &tag);
 
 	LWLockAcquire(lfc_lock, LW_EXCLUSIVE);
@@ -551,7 +602,7 @@ lfc_read(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
 	}
 	/* Unlink entry from LRU list to pin it for the duration of IO operation */
 	if (entry->access_count++ == 0)
-		dlist_delete(&entry->lru_node);
+		dlist_delete(&entry->list_node);
 	generation = lfc_ctl->generation;
 	entry_offset = entry->offset;
 
@@ -569,12 +620,12 @@ lfc_read(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
 
 	if (lfc_ctl->generation == generation)
 	{
-		Assert(LFC_ENABLED());
+		CriticalAssert(LFC_ENABLED());
 		lfc_ctl->hits += 1;
 		pgBufferUsage.file_cache.hits += 1;
-		Assert(entry->access_count > 0);
+		CriticalAssert(entry->access_count > 0);
 		if (--entry->access_count == 0)
-			dlist_push_tail(&lfc_ctl->lru, &entry->lru_node);
+			dlist_push_tail(&lfc_ctl->lru, &entry->list_node);
 	}
 	else
 		result = false;
@@ -613,6 +664,8 @@ lfc_write(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, const void
 	tag.forkNum = forkNum;
 	tag.blockNum = blkno & ~(BLOCKS_PER_CHUNK - 1);
 	CopyNRelFileInfoToBufTag(tag, rinfo);
+
+	CriticalAssert(BufTagGetRelNumber(&tag) != InvalidRelFileNumber);
 	hash = get_hash_value(lfc_hash, &tag);
 
 	LWLockAcquire(lfc_lock, LW_EXCLUSIVE);
@@ -632,7 +685,7 @@ lfc_write(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, const void
 		 * operation
 		 */
 		if (entry->access_count++ == 0)
-			dlist_delete(&entry->lru_node);
+			dlist_delete(&entry->list_node);
 	}
 	else
 	{
@@ -655,13 +708,26 @@ lfc_write(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, const void
 		if (lfc_ctl->used >= lfc_ctl->limit && !dlist_is_empty(&lfc_ctl->lru))
 		{
 			/* Cache overflow: evict least recently used chunk */
-			FileCacheEntry *victim = dlist_container(FileCacheEntry, lru_node, dlist_pop_head_node(&lfc_ctl->lru));
+			FileCacheEntry *victim = dlist_container(FileCacheEntry, list_node, dlist_pop_head_node(&lfc_ctl->lru));
 
-			Assert(victim->access_count == 0);
+			CriticalAssert(victim->access_count == 0);
 			entry->offset = victim->offset; /* grab victim's chunk */
 			hash_search_with_hash_value(lfc_hash, &victim->key, victim->hash, HASH_REMOVE, NULL);
 			neon_log(DEBUG2, "Swap file cache page");
 		}
+		else if (!dlist_is_empty(&lfc_ctl->holes))
+		{
+			/* We can reuse a hole that was left behind when the LFC was shrunk previously */
+			FileCacheEntry *hole = dlist_container(FileCacheEntry, list_node, dlist_pop_head_node(&lfc_ctl->holes));
+			uint32		offset = hole->offset;
+			bool		found;
+
+			hash_search_with_hash_value(lfc_hash, &hole->key, hole->hash, HASH_REMOVE, &found);
+			CriticalAssert(found);
+
+			lfc_ctl->used += 1;
+			entry->offset = offset;	/* reuse the hole */
+		}
 		else
 		{
 			lfc_ctl->used += 1;
@@ -689,11 +755,11 @@ lfc_write(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, const void
 
 		if (lfc_ctl->generation == generation)
 		{
-			Assert(LFC_ENABLED());
+			CriticalAssert(LFC_ENABLED());
 			/* Place entry to the head of LRU list */
-			Assert(entry->access_count > 0);
+			CriticalAssert(entry->access_count > 0);
 			if (--entry->access_count == 0)
-				dlist_push_tail(&lfc_ctl->lru, &entry->lru_node);
+				dlist_push_tail(&lfc_ctl->lru, &entry->list_node);
 
 			entry->bitmap[chunk_offs >> 5] |= (1 << (chunk_offs & 31));
 		}
@@ -708,7 +774,6 @@ typedef struct
 } NeonGetStatsCtx;
 
 #define NUM_NEON_GET_STATS_COLS	2
-#define NUM_NEON_GET_STATS_ROWS	3
 
 PG_FUNCTION_INFO_V1(neon_get_lfc_stats);
 Datum
@@ -744,7 +809,6 @@ neon_get_lfc_stats(PG_FUNCTION_ARGS)
 						   INT8OID, -1, 0);
 
 		fctx->tupdesc = BlessTupleDesc(tupledesc);
-		funcctx->max_calls = NUM_NEON_GET_STATS_ROWS;
 		funcctx->user_fctx = fctx;
 
 		/* Return to original context when allocating transient memory */
@@ -778,6 +842,11 @@ neon_get_lfc_stats(PG_FUNCTION_ARGS)
 			if (lfc_ctl)
 				value = lfc_ctl->writes;
 			break;
+		case 4:
+			key = "file_cache_size";
+			if (lfc_ctl)
+				value = lfc_ctl->size;
+			break;
 		default:
 			SRF_RETURN_DONE(funcctx);
 	}
@@ -901,7 +970,7 @@ local_cache_pages(PG_FUNCTION_ARGS)
 				hash_seq_init(&status, lfc_hash);
 				while ((entry = hash_seq_search(&status)) != NULL)
 				{
-					for (int i = 0; i < BLOCKS_PER_CHUNK / 32; i++)
+					for (int i = 0; i < CHUNK_BITMAP_SIZE; i++)
 						n_pages += pg_popcount32(entry->bitmap[i]);
 				}
 			}
diff --git a/pgxn/neon/neon_pgversioncompat.h b/pgxn/neon/neon_pgversioncompat.h
index f19732cbbb..addb6ccce6 100644
--- a/pgxn/neon/neon_pgversioncompat.h
+++ b/pgxn/neon/neon_pgversioncompat.h
@@ -54,6 +54,10 @@
 
 #define BufTagGetNRelFileInfo(tag) tag.rnode
 
+#define BufTagGetRelNumber(tagp) ((tagp)->rnode.relNode)
+
+#define InvalidRelFileNumber InvalidOid
+
 #define SMgrRelGetRelInfo(reln) \
 	(reln->smgr_rnode.node)
 
diff --git a/test_runner/regress/test_lfc_resize.py b/test_runner/regress/test_lfc_resize.py
index 2a3442448a..1b2c7f808f 100644
--- a/test_runner/regress/test_lfc_resize.py
+++ b/test_runner/regress/test_lfc_resize.py
@@ -1,3 +1,7 @@
+import os
+import random
+import re
+import subprocess
 import threading
 import time
 
@@ -17,17 +21,17 @@ def test_lfc_resize(neon_simple_env: NeonEnv, pg_bin: PgBin):
         "test_lfc_resize",
         config_lines=[
             "neon.file_cache_path='file.cache'",
-            "neon.max_file_cache_size=1GB",
-            "neon.file_cache_size_limit=1GB",
+            "neon.max_file_cache_size=512MB",
+            "neon.file_cache_size_limit=512MB",
         ],
     )
     n_resize = 10
-    scale = 10
+    scale = 100
 
     def run_pgbench(connstr: str):
         log.info(f"Start a pgbench workload on pg {connstr}")
         pg_bin.run_capture(["pgbench", "-i", f"-s{scale}", connstr])
-        pg_bin.run_capture(["pgbench", "-c4", f"-T{n_resize}", "-Mprepared", connstr])
+        pg_bin.run_capture(["pgbench", "-c10", f"-T{n_resize}", "-Mprepared", "-S", connstr])
 
     thread = threading.Thread(target=run_pgbench, args=(endpoint.connstr(),), daemon=True)
     thread.start()
@@ -35,9 +39,21 @@ def test_lfc_resize(neon_simple_env: NeonEnv, pg_bin: PgBin):
     conn = endpoint.connect()
     cur = conn.cursor()
 
-    for i in range(n_resize):
-        cur.execute(f"alter system set neon.file_cache_size_limit='{i*10}MB'")
+    for _ in range(n_resize):
+        size = random.randint(1, 512)
+        cur.execute(f"alter system set neon.file_cache_size_limit='{size}MB'")
         cur.execute("select pg_reload_conf()")
         time.sleep(1)
 
+    cur.execute("alter system set neon.file_cache_size_limit='100MB'")
+    cur.execute("select pg_reload_conf()")
+
     thread.join()
+
+    lfc_file_path = f"{endpoint.pg_data_dir_path()}/file.cache"
+    lfc_file_size = os.path.getsize(lfc_file_path)
+    res = subprocess.run(["ls", "-sk", lfc_file_path], check=True, text=True, capture_output=True)
+    lfc_file_blocks = re.findall("([0-9A-F]+)", res.stdout)[0]
+    log.info(f"Size of LFC file {lfc_file_size}, blocks {lfc_file_blocks}")
+    assert lfc_file_size <= 512 * 1024 * 1024
+    assert int(lfc_file_blocks) <= 128 * 1024

From 7131ac4730f7268a8624a9c7345c23938cc8b6a6 Mon Sep 17 00:00:00 2001
From: Yuchen Liang <70461588+yliang412@users.noreply.github.com>
Date: Sat, 17 Aug 2024 11:48:53 -0400
Subject: [PATCH 17/44] refactor(scrubber): add unified command suitable for
 cron job (#8635)

Part of #8128.

## Description

This PR creates a unified command to run both physical gc and metadata
health check as a cron job. This also enables us to add additional tasks
to the cron job in the future.

Signed-off-by: Yuchen Liang <yuchen@neon.tech>
---
 storage_scrubber/src/main.rs                  | 240 ++++++++++++------
 .../src/pageserver_physical_gc.rs             |  16 +-
 .../src/scan_pageserver_metadata.rs           |   2 +-
 3 files changed, 175 insertions(+), 83 deletions(-)

diff --git a/storage_scrubber/src/main.rs b/storage_scrubber/src/main.rs
index cbc836755a..3935e513e3 100644
--- a/storage_scrubber/src/main.rs
+++ b/storage_scrubber/src/main.rs
@@ -3,9 +3,10 @@ use camino::Utf8PathBuf;
 use pageserver_api::controller_api::{MetadataHealthUpdateRequest, MetadataHealthUpdateResponse};
 use pageserver_api::shard::TenantShardId;
 use reqwest::{Method, Url};
+use storage_controller_client::control_api;
 use storage_scrubber::garbage::{find_garbage, purge_garbage, PurgeMode};
 use storage_scrubber::pageserver_physical_gc::GcMode;
-use storage_scrubber::scan_pageserver_metadata::scan_metadata;
+use storage_scrubber::scan_pageserver_metadata::scan_pageserver_metadata;
 use storage_scrubber::tenant_snapshot::SnapshotDownloader;
 use storage_scrubber::{find_large_objects, ControllerClientConfig};
 use storage_scrubber::{
@@ -68,7 +69,7 @@ enum Command {
         #[arg(long = "tenant-id", num_args = 0..)]
         tenant_ids: Vec<TenantShardId>,
         #[arg(long = "post", default_value_t = false)]
-        post_to_storage_controller: bool,
+        post_to_storcon: bool,
         #[arg(long, default_value = None)]
         /// For safekeeper node_kind only, points to db with debug dump
         dump_db_connstr: Option<String>,
@@ -100,6 +101,16 @@ enum Command {
         #[arg(long = "concurrency", short = 'j', default_value_t = 64)]
         concurrency: usize,
     },
+    CronJob {
+        // PageserverPhysicalGc
+        #[arg(long = "min-age")]
+        gc_min_age: humantime::Duration,
+        #[arg(short, long, default_value_t = GcMode::IndicesOnly)]
+        gc_mode: GcMode,
+        // ScanMetadata
+        #[arg(long = "post", default_value_t = false)]
+        post_to_storcon: bool,
+    },
 }
 
 #[tokio::main]
@@ -117,6 +128,7 @@ async fn main() -> anyhow::Result<()> {
         Command::TenantSnapshot { .. } => "tenant-snapshot",
         Command::PageserverPhysicalGc { .. } => "pageserver-physical-gc",
         Command::FindLargeObjects { .. } => "find-large-objects",
+        Command::CronJob { .. } => "cron-job",
     };
     let _guard = init_logging(&format!(
         "{}_{}_{}_{}.log",
@@ -126,12 +138,13 @@ async fn main() -> anyhow::Result<()> {
         chrono::Utc::now().format("%Y_%m_%d__%H_%M_%S")
     ));
 
-    let controller_client_conf = cli.controller_api.map(|controller_api| {
+    let controller_client = cli.controller_api.map(|controller_api| {
         ControllerClientConfig {
             controller_api,
             // Default to no key: this is a convenience when working in a development environment
             controller_jwt: cli.controller_jwt.unwrap_or("".to_owned()),
         }
+        .build_client()
     });
 
     match cli.command {
@@ -139,7 +152,7 @@ async fn main() -> anyhow::Result<()> {
             json,
             tenant_ids,
             node_kind,
-            post_to_storage_controller,
+            post_to_storcon,
             dump_db_connstr,
             dump_db_table,
         } => {
@@ -178,53 +191,14 @@ async fn main() -> anyhow::Result<()> {
                 }
                 Ok(())
             } else {
-                if controller_client_conf.is_none() && post_to_storage_controller {
-                    return Err(anyhow!("Posting pageserver scan health status to storage controller requires `--controller-api` and `--controller-jwt` to run"));
-                }
-                match scan_metadata(bucket_config.clone(), tenant_ids).await {
-                    Err(e) => {
-                        tracing::error!("Failed: {e}");
-                        Err(e)
-                    }
-                    Ok(summary) => {
-                        if json {
-                            println!("{}", serde_json::to_string(&summary).unwrap())
-                        } else {
-                            println!("{}", summary.summary_string());
-                        }
-
-                        if post_to_storage_controller {
-                            if let Some(conf) = controller_client_conf {
-                                let controller_client = conf.build_client();
-                                let body = summary.build_health_update_request();
-                                controller_client
-                                    .dispatch::<MetadataHealthUpdateRequest, MetadataHealthUpdateResponse>(
-                                        Method::POST,
-                                        "control/v1/metadata_health/update".to_string(),
-                                        Some(body),
-                                    )
-                                    .await?;
-                            }
-                        }
-
-                        if summary.is_fatal() {
-                            tracing::error!("Fatal scrub errors detected");
-                        } else if summary.is_empty() {
-                            // Strictly speaking an empty bucket is a valid bucket, but if someone ran the
-                            // scrubber they were likely expecting to scan something, and if we see no timelines
-                            // at all then it's likely due to some configuration issues like a bad prefix
-                            tracing::error!(
-                                "No timelines found in bucket {} prefix {}",
-                                bucket_config.bucket,
-                                bucket_config
-                                    .prefix_in_bucket
-                                    .unwrap_or("<none>".to_string())
-                            );
-                        }
-
-                        Ok(())
-                    }
-                }
+                scan_pageserver_metadata_cmd(
+                    bucket_config,
+                    controller_client.as_ref(),
+                    tenant_ids,
+                    json,
+                    post_to_storcon,
+                )
+                .await
             }
         }
         Command::FindGarbage {
@@ -254,31 +228,14 @@ async fn main() -> anyhow::Result<()> {
             min_age,
             mode,
         } => {
-            match (&controller_client_conf, mode) {
-                (Some(_), _) => {
-                    // Any mode may run when controller API is set
-                }
-                (None, GcMode::Full) => {
-                    // The part of physical GC where we erase ancestor layers cannot be done safely without
-                    // confirming the most recent complete shard split with the controller.  Refuse to run, rather
-                    // than doing it unsafely.
-                    return Err(anyhow!("Full physical GC requires `--controller-api` and `--controller-jwt` to run"));
-                }
-                (None, GcMode::DryRun | GcMode::IndicesOnly) => {
-                    // These GcModes do not require the controller to run.
-                }
-            }
-
-            let summary = pageserver_physical_gc(
-                bucket_config,
-                controller_client_conf,
+            pageserver_physical_gc_cmd(
+                &bucket_config,
+                controller_client.as_ref(),
                 tenant_ids,
-                min_age.into(),
+                min_age,
                 mode,
             )
-            .await?;
-            println!("{}", serde_json::to_string(&summary).unwrap());
-            Ok(())
+            .await
         }
         Command::FindLargeObjects {
             min_size,
@@ -295,5 +252,142 @@ async fn main() -> anyhow::Result<()> {
             println!("{}", serde_json::to_string(&summary).unwrap());
             Ok(())
         }
+        Command::CronJob {
+            gc_min_age,
+            gc_mode,
+            post_to_storcon,
+        } => {
+            run_cron_job(
+                bucket_config,
+                controller_client.as_ref(),
+                gc_min_age,
+                gc_mode,
+                post_to_storcon,
+            )
+            .await
+        }
+    }
+}
+
+/// Runs the scrubber cron job.
+/// 1. Do pageserver physical gc
+/// 2. Scan pageserver metadata
+pub async fn run_cron_job(
+    bucket_config: BucketConfig,
+    controller_client: Option<&control_api::Client>,
+    gc_min_age: humantime::Duration,
+    gc_mode: GcMode,
+    post_to_storcon: bool,
+) -> anyhow::Result<()> {
+    tracing::info!(%gc_min_age, %gc_mode, "Running pageserver-physical-gc");
+    pageserver_physical_gc_cmd(
+        &bucket_config,
+        controller_client,
+        Vec::new(),
+        gc_min_age,
+        gc_mode,
+    )
+    .await?;
+    tracing::info!(%post_to_storcon, node_kind = %NodeKind::Pageserver, "Running scan-metadata");
+    scan_pageserver_metadata_cmd(
+        bucket_config,
+        controller_client,
+        Vec::new(),
+        true,
+        post_to_storcon,
+    )
+    .await?;
+
+    Ok(())
+}
+
+pub async fn pageserver_physical_gc_cmd(
+    bucket_config: &BucketConfig,
+    controller_client: Option<&control_api::Client>,
+    tenant_shard_ids: Vec<TenantShardId>,
+    min_age: humantime::Duration,
+    mode: GcMode,
+) -> anyhow::Result<()> {
+    match (controller_client, mode) {
+        (Some(_), _) => {
+            // Any mode may run when controller API is set
+        }
+        (None, GcMode::Full) => {
+            // The part of physical GC where we erase ancestor layers cannot be done safely without
+            // confirming the most recent complete shard split with the controller.  Refuse to run, rather
+            // than doing it unsafely.
+            return Err(anyhow!(
+                "Full physical GC requires `--controller-api` and `--controller-jwt` to run"
+            ));
+        }
+        (None, GcMode::DryRun | GcMode::IndicesOnly) => {
+            // These GcModes do not require the controller to run.
+        }
+    }
+
+    let summary = pageserver_physical_gc(
+        bucket_config,
+        controller_client,
+        tenant_shard_ids,
+        min_age.into(),
+        mode,
+    )
+    .await?;
+    println!("{}", serde_json::to_string(&summary).unwrap());
+    Ok(())
+}
+
+pub async fn scan_pageserver_metadata_cmd(
+    bucket_config: BucketConfig,
+    controller_client: Option<&control_api::Client>,
+    tenant_shard_ids: Vec<TenantShardId>,
+    json: bool,
+    post_to_storcon: bool,
+) -> anyhow::Result<()> {
+    if controller_client.is_none() && post_to_storcon {
+        return Err(anyhow!("Posting pageserver scan health status to storage controller requires `--controller-api` and `--controller-jwt` to run"));
+    }
+    match scan_pageserver_metadata(bucket_config.clone(), tenant_shard_ids).await {
+        Err(e) => {
+            tracing::error!("Failed: {e}");
+            Err(e)
+        }
+        Ok(summary) => {
+            if json {
+                println!("{}", serde_json::to_string(&summary).unwrap())
+            } else {
+                println!("{}", summary.summary_string());
+            }
+
+            if post_to_storcon {
+                if let Some(client) = controller_client {
+                    let body = summary.build_health_update_request();
+                    client
+                        .dispatch::<MetadataHealthUpdateRequest, MetadataHealthUpdateResponse>(
+                            Method::POST,
+                            "control/v1/metadata_health/update".to_string(),
+                            Some(body),
+                        )
+                        .await?;
+                }
+            }
+
+            if summary.is_fatal() {
+                tracing::error!("Fatal scrub errors detected");
+            } else if summary.is_empty() {
+                // Strictly speaking an empty bucket is a valid bucket, but if someone ran the
+                // scrubber they were likely expecting to scan something, and if we see no timelines
+                // at all then it's likely due to some configuration issues like a bad prefix
+                tracing::error!(
+                    "No timelines found in bucket {} prefix {}",
+                    bucket_config.bucket,
+                    bucket_config
+                        .prefix_in_bucket
+                        .unwrap_or("<none>".to_string())
+                );
+            }
+
+            Ok(())
+        }
     }
 }
diff --git a/storage_scrubber/src/pageserver_physical_gc.rs b/storage_scrubber/src/pageserver_physical_gc.rs
index c8b1ed49f4..20d9bd6dd4 100644
--- a/storage_scrubber/src/pageserver_physical_gc.rs
+++ b/storage_scrubber/src/pageserver_physical_gc.rs
@@ -4,9 +4,7 @@ use std::time::{Duration, SystemTime};
 
 use crate::checks::{list_timeline_blobs, BlobDataParseResult};
 use crate::metadata_stream::{stream_tenant_timelines, stream_tenants};
-use crate::{
-    init_remote, BucketConfig, ControllerClientConfig, NodeKind, RootTarget, TenantShardTimelineId,
-};
+use crate::{init_remote, BucketConfig, NodeKind, RootTarget, TenantShardTimelineId};
 use aws_sdk_s3::Client;
 use futures_util::{StreamExt, TryStreamExt};
 use pageserver::tenant::remote_timeline_client::index::LayerFileMetadata;
@@ -473,8 +471,8 @@ async fn gc_ancestor(
 /// This type of GC is not necessary for correctness: rather it serves to reduce wasted storage capacity, and
 /// make sure that object listings don't get slowed down by large numbers of garbage objects.
 pub async fn pageserver_physical_gc(
-    bucket_config: BucketConfig,
-    controller_client_conf: Option<ControllerClientConfig>,
+    bucket_config: &BucketConfig,
+    controller_client: Option<&control_api::Client>,
     tenant_shard_ids: Vec<TenantShardId>,
     min_age: Duration,
     mode: GcMode,
@@ -558,7 +556,7 @@ pub async fn pageserver_physical_gc(
         let timelines = timelines.map_ok(|ttid| {
             gc_timeline(
                 &s3_client,
-                &bucket_config,
+                bucket_config,
                 &min_age,
                 &target,
                 mode,
@@ -574,7 +572,7 @@ pub async fn pageserver_physical_gc(
     }
 
     // Execute cross-shard GC, using the accumulator's full view of all the shards built in the per-shard GC
-    let Some(controller_client) = controller_client_conf.map(|c| c.build_client()) else {
+    let Some(client) = controller_client else {
         tracing::info!("Skipping ancestor layer GC, because no `--controller-api` was specified");
         return Ok(summary);
     };
@@ -583,13 +581,13 @@ pub async fn pageserver_physical_gc(
         .unwrap()
         .into_inner()
         .unwrap()
-        .into_gc_ancestors(&controller_client, &mut summary)
+        .into_gc_ancestors(client, &mut summary)
         .await;
 
     for ancestor_shard in ancestor_shards {
         gc_ancestor(
             &s3_client,
-            &bucket_config,
+            bucket_config,
             &target,
             &min_age,
             ancestor_shard,
diff --git a/storage_scrubber/src/scan_pageserver_metadata.rs b/storage_scrubber/src/scan_pageserver_metadata.rs
index b9630056e1..2409b7b132 100644
--- a/storage_scrubber/src/scan_pageserver_metadata.rs
+++ b/storage_scrubber/src/scan_pageserver_metadata.rs
@@ -116,7 +116,7 @@ Index versions: {version_summary}
 }
 
 /// Scan the pageserver metadata in an S3 bucket, reporting errors and statistics.
-pub async fn scan_metadata(
+pub async fn scan_pageserver_metadata(
     bucket_config: BucketConfig,
     tenant_ids: Vec<TenantShardId>,
 ) -> anyhow::Result<MetadataSummary> {

From 188bde7f0776636310260cbf636922d1029add7e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Arpad=20M=C3=BCller?= <arpad-m@users.noreply.github.com>
Date: Sun, 18 Aug 2024 19:32:10 +0200
Subject: [PATCH 18/44] Default image compression to zstd at level 1 (#8677)

After the rollout has succeeded, we now set the default image
compression to be enabled.

We also remove its explicit mention from `neon_fixtures.py` added in
#8368 as it is now the default (and we switch to `zstd(1)` which is a
bit nicer on CPU time).

Part of https://github.com/neondatabase/neon/issues/5431
---
 pageserver/src/config.rs                            | 12 +++++-------
 pageserver/src/tenant/storage_layer/split_writer.rs |  7 ++++++-
 test_runner/fixtures/neon_fixtures.py               |  1 -
 3 files changed, 11 insertions(+), 9 deletions(-)

diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs
index 3ac5ac539f..0ebaf78840 100644
--- a/pageserver/src/config.rs
+++ b/pageserver/src/config.rs
@@ -50,7 +50,6 @@ pub mod defaults {
         DEFAULT_HTTP_LISTEN_ADDR, DEFAULT_HTTP_LISTEN_PORT, DEFAULT_PG_LISTEN_ADDR,
         DEFAULT_PG_LISTEN_PORT,
     };
-    use pageserver_api::models::ImageCompressionAlgorithm;
     pub use storage_broker::DEFAULT_ENDPOINT as BROKER_DEFAULT_ENDPOINT;
 
     pub const DEFAULT_WAIT_LSN_TIMEOUT: &str = "300 s";
@@ -90,8 +89,7 @@ pub mod defaults {
 
     pub const DEFAULT_MAX_VECTORED_READ_BYTES: usize = 128 * 1024; // 128 KiB
 
-    pub const DEFAULT_IMAGE_COMPRESSION: ImageCompressionAlgorithm =
-        ImageCompressionAlgorithm::Disabled;
+    pub const DEFAULT_IMAGE_COMPRESSION: &str = "zstd(1)";
 
     pub const DEFAULT_VALIDATE_VECTORED_GET: bool = false;
 
@@ -478,7 +476,7 @@ impl PageServerConfigBuilder {
             max_vectored_read_bytes: Set(MaxVectoredReadBytes(
                 NonZeroUsize::new(DEFAULT_MAX_VECTORED_READ_BYTES).unwrap(),
             )),
-            image_compression: Set(DEFAULT_IMAGE_COMPRESSION),
+            image_compression: Set(DEFAULT_IMAGE_COMPRESSION.parse().unwrap()),
             ephemeral_bytes_per_memory_kb: Set(DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB),
             l0_flush: Set(L0FlushConfig::default()),
             compact_level0_phase1_value_access: Set(CompactL0Phase1ValueAccess::default()),
@@ -1065,7 +1063,7 @@ impl PageServerConf {
                 NonZeroUsize::new(defaults::DEFAULT_MAX_VECTORED_READ_BYTES)
                     .expect("Invalid default constant"),
             ),
-            image_compression: defaults::DEFAULT_IMAGE_COMPRESSION,
+            image_compression: defaults::DEFAULT_IMAGE_COMPRESSION.parse().unwrap(),
             ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB,
             l0_flush: L0FlushConfig::default(),
             compact_level0_phase1_value_access: CompactL0Phase1ValueAccess::default(),
@@ -1305,7 +1303,7 @@ background_task_maximum_delay = '334 s'
                     NonZeroUsize::new(defaults::DEFAULT_MAX_VECTORED_READ_BYTES)
                         .expect("Invalid default constant")
                 ),
-                image_compression: defaults::DEFAULT_IMAGE_COMPRESSION,
+                image_compression: defaults::DEFAULT_IMAGE_COMPRESSION.parse().unwrap(),
                 ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB,
                 l0_flush: L0FlushConfig::default(),
                 compact_level0_phase1_value_access: CompactL0Phase1ValueAccess::default(),
@@ -1378,7 +1376,7 @@ background_task_maximum_delay = '334 s'
                     NonZeroUsize::new(defaults::DEFAULT_MAX_VECTORED_READ_BYTES)
                         .expect("Invalid default constant")
                 ),
-                image_compression: defaults::DEFAULT_IMAGE_COMPRESSION,
+                image_compression: defaults::DEFAULT_IMAGE_COMPRESSION.parse().unwrap(),
                 ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB,
                 l0_flush: L0FlushConfig::default(),
                 compact_level0_phase1_value_access: CompactL0Phase1ValueAccess::default(),
diff --git a/pageserver/src/tenant/storage_layer/split_writer.rs b/pageserver/src/tenant/storage_layer/split_writer.rs
index d7bfe48c60..e12e29cd45 100644
--- a/pageserver/src/tenant/storage_layer/split_writer.rs
+++ b/pageserver/src/tenant/storage_layer/split_writer.rs
@@ -208,6 +208,8 @@ impl SplitDeltaLayerWriter {
 
 #[cfg(test)]
 mod tests {
+    use rand::{RngCore, SeedableRng};
+
     use crate::{
         tenant::{
             harness::{TenantHarness, TIMELINE_ID},
@@ -229,7 +231,10 @@ mod tests {
     }
 
     fn get_large_img() -> Bytes {
-        vec![0; 8192].into()
+        let mut rng = rand::rngs::SmallRng::seed_from_u64(42);
+        let mut data = vec![0; 8192];
+        rng.fill_bytes(&mut data);
+        data.into()
     }
 
     #[tokio::test]
diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index ec5a83601e..ba6fbc003a 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -1162,7 +1162,6 @@ class NeonEnv:
                 "listen_http_addr": f"localhost:{pageserver_port.http}",
                 "pg_auth_type": pg_auth_type,
                 "http_auth_type": http_auth_type,
-                "image_compression": "zstd",
             }
             if self.pageserver_virtual_file_io_engine is not None:
                 ps_cfg["virtual_file_io_engine"] = self.pageserver_virtual_file_io_engine

From f246aa3ca7f19993e0582dfd8069375c09c5158c Mon Sep 17 00:00:00 2001
From: Folke Behrens <folke@neon.tech>
Date: Mon, 19 Aug 2024 10:33:46 +0200
Subject: [PATCH 19/44] proxy: Fix some warnings by extended clippy checks
 (#8748)

* Missing blank lifetimes (`<'_>`), the omission of which is now deprecated.
* Matching on unqualified enum variants that could act like variable bindings.
* Missing semicolons.
---
 proxy/src/auth.rs                             |  46 +++--
 proxy/src/auth/backend.rs                     |  72 +++-----
 proxy/src/auth/backend/jwt.rs                 |   6 +-
 proxy/src/auth/credentials.rs                 |  17 +-
 proxy/src/cache/common.rs                     |   2 +-
 proxy/src/cache/timed_lru.rs                  |   2 +-
 proxy/src/compute.rs                          |  23 ++-
 proxy/src/config.rs                           |   6 +-
 proxy/src/console/messages.rs                 |  22 ++-
 proxy/src/console/mgmt.rs                     |   3 +-
 proxy/src/console/provider.rs                 | 170 +++++++++---------
 proxy/src/context.rs                          |   8 +-
 proxy/src/context/parquet.rs                  |   2 +-
 proxy/src/intern.rs                           |   2 +-
 proxy/src/metrics.rs                          |   2 +-
 proxy/src/proxy/copy_bidirectional.rs         |   2 +-
 proxy/src/proxy/handshake.rs                  |  15 +-
 proxy/src/proxy/tests/mitm.rs                 |   4 +-
 proxy/src/rate_limiter/limit_algorithm.rs     |   4 +-
 .../src/rate_limiter/limit_algorithm/aimd.rs  |   5 +-
 .../connection_with_credentials_provider.rs   |   2 +-
 proxy/src/redis/notifications.rs              |  23 ++-
 proxy/src/sasl.rs                             |   5 +-
 proxy/src/sasl/channel_binding.rs             |  21 +--
 proxy/src/sasl/messages.rs                    |   5 +-
 proxy/src/scram.rs                            |   4 +-
 proxy/src/scram/countmin.rs                   |   2 -
 proxy/src/scram/exchange.rs                   |  22 +--
 proxy/src/scram/messages.rs                   |   6 +-
 proxy/src/scram/pbkdf2.rs                     |   2 +-
 proxy/src/scram/threadpool.rs                 |   4 +-
 proxy/src/serverless.rs                       |   4 +-
 proxy/src/serverless/conn_pool.rs             |   6 +-
 proxy/src/stream.rs                           |   2 +-
 proxy/src/url.rs                              |   2 +-
 proxy/src/waiters.rs                          |   2 +-
 36 files changed, 246 insertions(+), 279 deletions(-)

diff --git a/proxy/src/auth.rs b/proxy/src/auth.rs
index 8c44823c98..3b3c571129 100644
--- a/proxy/src/auth.rs
+++ b/proxy/src/auth.rs
@@ -113,38 +113,36 @@ impl<E: Into<AuthErrorImpl>> From<E> for AuthError {
 
 impl UserFacingError for AuthError {
     fn to_string_client(&self) -> String {
-        use AuthErrorImpl::*;
         match self.0.as_ref() {
-            Link(e) => e.to_string_client(),
-            GetAuthInfo(e) => e.to_string_client(),
-            Sasl(e) => e.to_string_client(),
-            AuthFailed(_) => self.to_string(),
-            BadAuthMethod(_) => self.to_string(),
-            MalformedPassword(_) => self.to_string(),
-            MissingEndpointName => self.to_string(),
-            Io(_) => "Internal error".to_string(),
-            IpAddressNotAllowed(_) => self.to_string(),
-            TooManyConnections => self.to_string(),
-            UserTimeout(_) => self.to_string(),
+            AuthErrorImpl::Link(e) => e.to_string_client(),
+            AuthErrorImpl::GetAuthInfo(e) => e.to_string_client(),
+            AuthErrorImpl::Sasl(e) => e.to_string_client(),
+            AuthErrorImpl::AuthFailed(_) => self.to_string(),
+            AuthErrorImpl::BadAuthMethod(_) => self.to_string(),
+            AuthErrorImpl::MalformedPassword(_) => self.to_string(),
+            AuthErrorImpl::MissingEndpointName => self.to_string(),
+            AuthErrorImpl::Io(_) => "Internal error".to_string(),
+            AuthErrorImpl::IpAddressNotAllowed(_) => self.to_string(),
+            AuthErrorImpl::TooManyConnections => self.to_string(),
+            AuthErrorImpl::UserTimeout(_) => self.to_string(),
         }
     }
 }
 
 impl ReportableError for AuthError {
     fn get_error_kind(&self) -> crate::error::ErrorKind {
-        use AuthErrorImpl::*;
         match self.0.as_ref() {
-            Link(e) => e.get_error_kind(),
-            GetAuthInfo(e) => e.get_error_kind(),
-            Sasl(e) => e.get_error_kind(),
-            AuthFailed(_) => crate::error::ErrorKind::User,
-            BadAuthMethod(_) => crate::error::ErrorKind::User,
-            MalformedPassword(_) => crate::error::ErrorKind::User,
-            MissingEndpointName => crate::error::ErrorKind::User,
-            Io(_) => crate::error::ErrorKind::ClientDisconnect,
-            IpAddressNotAllowed(_) => crate::error::ErrorKind::User,
-            TooManyConnections => crate::error::ErrorKind::RateLimit,
-            UserTimeout(_) => crate::error::ErrorKind::User,
+            AuthErrorImpl::Link(e) => e.get_error_kind(),
+            AuthErrorImpl::GetAuthInfo(e) => e.get_error_kind(),
+            AuthErrorImpl::Sasl(e) => e.get_error_kind(),
+            AuthErrorImpl::AuthFailed(_) => crate::error::ErrorKind::User,
+            AuthErrorImpl::BadAuthMethod(_) => crate::error::ErrorKind::User,
+            AuthErrorImpl::MalformedPassword(_) => crate::error::ErrorKind::User,
+            AuthErrorImpl::MissingEndpointName => crate::error::ErrorKind::User,
+            AuthErrorImpl::Io(_) => crate::error::ErrorKind::ClientDisconnect,
+            AuthErrorImpl::IpAddressNotAllowed(_) => crate::error::ErrorKind::User,
+            AuthErrorImpl::TooManyConnections => crate::error::ErrorKind::RateLimit,
+            AuthErrorImpl::UserTimeout(_) => crate::error::ErrorKind::User,
         }
     }
 }
diff --git a/proxy/src/auth/backend.rs b/proxy/src/auth/backend.rs
index c6a0b2af5a..7592d076ec 100644
--- a/proxy/src/auth/backend.rs
+++ b/proxy/src/auth/backend.rs
@@ -80,9 +80,8 @@ pub trait TestBackend: Send + Sync + 'static {
 
 impl std::fmt::Display for BackendType<'_, (), ()> {
     fn fmt(&self, fmt: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        use BackendType::*;
         match self {
-            Console(api, _) => match &**api {
+            Self::Console(api, _) => match &**api {
                 ConsoleBackend::Console(endpoint) => {
                     fmt.debug_tuple("Console").field(&endpoint.url()).finish()
                 }
@@ -93,7 +92,7 @@ impl std::fmt::Display for BackendType<'_, (), ()> {
                 #[cfg(test)]
                 ConsoleBackend::Test(_) => fmt.debug_tuple("Test").finish(),
             },
-            Link(url, _) => fmt.debug_tuple("Link").field(&url.as_str()).finish(),
+            Self::Link(url, _) => fmt.debug_tuple("Link").field(&url.as_str()).finish(),
         }
     }
 }
@@ -102,10 +101,9 @@ impl<T, D> BackendType<'_, T, D> {
     /// Very similar to [`std::option::Option::as_ref`].
     /// This helps us pass structured config to async tasks.
     pub fn as_ref(&self) -> BackendType<'_, &T, &D> {
-        use BackendType::*;
         match self {
-            Console(c, x) => Console(MaybeOwned::Borrowed(c), x),
-            Link(c, x) => Link(MaybeOwned::Borrowed(c), x),
+            Self::Console(c, x) => BackendType::Console(MaybeOwned::Borrowed(c), x),
+            Self::Link(c, x) => BackendType::Link(MaybeOwned::Borrowed(c), x),
         }
     }
 }
@@ -115,10 +113,9 @@ impl<'a, T, D> BackendType<'a, T, D> {
     /// Maps [`BackendType<T>`] to [`BackendType<R>`] by applying
     /// a function to a contained value.
     pub fn map<R>(self, f: impl FnOnce(T) -> R) -> BackendType<'a, R, D> {
-        use BackendType::*;
         match self {
-            Console(c, x) => Console(c, f(x)),
-            Link(c, x) => Link(c, x),
+            Self::Console(c, x) => BackendType::Console(c, f(x)),
+            Self::Link(c, x) => BackendType::Link(c, x),
         }
     }
 }
@@ -126,10 +123,9 @@ impl<'a, T, D, E> BackendType<'a, Result<T, E>, D> {
     /// Very similar to [`std::option::Option::transpose`].
     /// This is most useful for error handling.
     pub fn transpose(self) -> Result<BackendType<'a, T, D>, E> {
-        use BackendType::*;
         match self {
-            Console(c, x) => x.map(|x| Console(c, x)),
-            Link(c, x) => Ok(Link(c, x)),
+            Self::Console(c, x) => x.map(|x| BackendType::Console(c, x)),
+            Self::Link(c, x) => Ok(BackendType::Link(c, x)),
         }
     }
 }
@@ -293,7 +289,9 @@ async fn auth_quirks(
             ctx.set_endpoint_id(res.info.endpoint.clone());
             let password = match res.keys {
                 ComputeCredentialKeys::Password(p) => p,
-                _ => unreachable!("password hack should return a password"),
+                ComputeCredentialKeys::AuthKeys(_) => {
+                    unreachable!("password hack should return a password")
+                }
             };
             (res.info, Some(password))
         }
@@ -400,21 +398,17 @@ async fn authenticate_with_secret(
 impl<'a> BackendType<'a, ComputeUserInfoMaybeEndpoint, &()> {
     /// Get compute endpoint name from the credentials.
     pub fn get_endpoint(&self) -> Option<EndpointId> {
-        use BackendType::*;
-
         match self {
-            Console(_, user_info) => user_info.endpoint_id.clone(),
-            Link(_, _) => Some("link".into()),
+            Self::Console(_, user_info) => user_info.endpoint_id.clone(),
+            Self::Link(_, _) => Some("link".into()),
         }
     }
 
     /// Get username from the credentials.
     pub fn get_user(&self) -> &str {
-        use BackendType::*;
-
         match self {
-            Console(_, user_info) => &user_info.user,
-            Link(_, _) => "link",
+            Self::Console(_, user_info) => &user_info.user,
+            Self::Link(_, _) => "link",
         }
     }
 
@@ -428,10 +422,8 @@ impl<'a> BackendType<'a, ComputeUserInfoMaybeEndpoint, &()> {
         config: &'static AuthenticationConfig,
         endpoint_rate_limiter: Arc<EndpointRateLimiter>,
     ) -> auth::Result<BackendType<'a, ComputeCredentials, NodeInfo>> {
-        use BackendType::*;
-
         let res = match self {
-            Console(api, user_info) => {
+            Self::Console(api, user_info) => {
                 info!(
                     user = &*user_info.user,
                     project = user_info.endpoint(),
@@ -451,7 +443,7 @@ impl<'a> BackendType<'a, ComputeUserInfoMaybeEndpoint, &()> {
                 BackendType::Console(api, credentials)
             }
             // NOTE: this auth backend doesn't use client credentials.
-            Link(url, _) => {
+            Self::Link(url, _) => {
                 info!("performing link authentication");
 
                 let info = link::authenticate(ctx, &url, client).await?;
@@ -470,10 +462,9 @@ impl BackendType<'_, ComputeUserInfo, &()> {
         &self,
         ctx: &RequestMonitoring,
     ) -> Result<CachedRoleSecret, GetAuthInfoError> {
-        use BackendType::*;
         match self {
-            Console(api, user_info) => api.get_role_secret(ctx, user_info).await,
-            Link(_, _) => Ok(Cached::new_uncached(None)),
+            Self::Console(api, user_info) => api.get_role_secret(ctx, user_info).await,
+            Self::Link(_, _) => Ok(Cached::new_uncached(None)),
         }
     }
 
@@ -481,10 +472,9 @@ impl BackendType<'_, ComputeUserInfo, &()> {
         &self,
         ctx: &RequestMonitoring,
     ) -> Result<(CachedAllowedIps, Option<CachedRoleSecret>), GetAuthInfoError> {
-        use BackendType::*;
         match self {
-            Console(api, user_info) => api.get_allowed_ips_and_secret(ctx, user_info).await,
-            Link(_, _) => Ok((Cached::new_uncached(Arc::new(vec![])), None)),
+            Self::Console(api, user_info) => api.get_allowed_ips_and_secret(ctx, user_info).await,
+            Self::Link(_, _) => Ok((Cached::new_uncached(Arc::new(vec![])), None)),
         }
     }
 }
@@ -495,18 +485,16 @@ impl ComputeConnectBackend for BackendType<'_, ComputeCredentials, NodeInfo> {
         &self,
         ctx: &RequestMonitoring,
     ) -> Result<CachedNodeInfo, console::errors::WakeComputeError> {
-        use BackendType::*;
-
         match self {
-            Console(api, creds) => api.wake_compute(ctx, &creds.info).await,
-            Link(_, info) => Ok(Cached::new_uncached(info.clone())),
+            Self::Console(api, creds) => api.wake_compute(ctx, &creds.info).await,
+            Self::Link(_, info) => Ok(Cached::new_uncached(info.clone())),
         }
     }
 
     fn get_keys(&self) -> Option<&ComputeCredentialKeys> {
         match self {
-            BackendType::Console(_, creds) => Some(&creds.keys),
-            BackendType::Link(_, _) => None,
+            Self::Console(_, creds) => Some(&creds.keys),
+            Self::Link(_, _) => None,
         }
     }
 }
@@ -517,18 +505,16 @@ impl ComputeConnectBackend for BackendType<'_, ComputeCredentials, &()> {
         &self,
         ctx: &RequestMonitoring,
     ) -> Result<CachedNodeInfo, console::errors::WakeComputeError> {
-        use BackendType::*;
-
         match self {
-            Console(api, creds) => api.wake_compute(ctx, &creds.info).await,
-            Link(_, _) => unreachable!("link auth flow doesn't support waking the compute"),
+            Self::Console(api, creds) => api.wake_compute(ctx, &creds.info).await,
+            Self::Link(_, _) => unreachable!("link auth flow doesn't support waking the compute"),
         }
     }
 
     fn get_keys(&self) -> Option<&ComputeCredentialKeys> {
         match self {
-            BackendType::Console(_, creds) => Some(&creds.keys),
-            BackendType::Link(_, _) => None,
+            Self::Console(_, creds) => Some(&creds.keys),
+            Self::Link(_, _) => None,
         }
     }
 }
diff --git a/proxy/src/auth/backend/jwt.rs b/proxy/src/auth/backend/jwt.rs
index 0c2ca8fb97..e021a7e23f 100644
--- a/proxy/src/auth/backend/jwt.rs
+++ b/proxy/src/auth/backend/jwt.rs
@@ -195,7 +195,7 @@ impl JwkCacheEntryLock {
 
         let header = base64::decode_config(header, base64::URL_SAFE_NO_PAD)
             .context("Provided authentication token is not a valid JWT encoding")?;
-        let header = serde_json::from_slice::<JWTHeader>(&header)
+        let header = serde_json::from_slice::<JWTHeader<'_>>(&header)
             .context("Provided authentication token is not a valid JWT encoding")?;
 
         let sig = base64::decode_config(signature, base64::URL_SAFE_NO_PAD)
@@ -340,7 +340,7 @@ impl JwkRenewalPermit<'_> {
         }
     }
 
-    async fn acquire_permit(from: &Arc<JwkCacheEntryLock>) -> JwkRenewalPermit {
+    async fn acquire_permit(from: &Arc<JwkCacheEntryLock>) -> JwkRenewalPermit<'_> {
         match from.lookup.acquire().await {
             Ok(permit) => {
                 permit.forget();
@@ -352,7 +352,7 @@ impl JwkRenewalPermit<'_> {
         }
     }
 
-    fn try_acquire_permit(from: &Arc<JwkCacheEntryLock>) -> Option<JwkRenewalPermit> {
+    fn try_acquire_permit(from: &Arc<JwkCacheEntryLock>) -> Option<JwkRenewalPermit<'_>> {
         match from.lookup.try_acquire() {
             Ok(permit) => {
                 permit.forget();
diff --git a/proxy/src/auth/credentials.rs b/proxy/src/auth/credentials.rs
index 8f4a392131..849e7d65e8 100644
--- a/proxy/src/auth/credentials.rs
+++ b/proxy/src/auth/credentials.rs
@@ -89,10 +89,12 @@ impl ComputeUserInfoMaybeEndpoint {
         sni: Option<&str>,
         common_names: Option<&HashSet<String>>,
     ) -> Result<Self, ComputeUserInfoParseError> {
-        use ComputeUserInfoParseError::*;
-
         // Some parameters are stored in the startup message.
-        let get_param = |key| params.get(key).ok_or(MissingKey(key));
+        let get_param = |key| {
+            params
+                .get(key)
+                .ok_or(ComputeUserInfoParseError::MissingKey(key))
+        };
         let user: RoleName = get_param("user")?.into();
 
         // Project name might be passed via PG's command-line options.
@@ -122,11 +124,14 @@ impl ComputeUserInfoMaybeEndpoint {
         let endpoint = match (endpoint_option, endpoint_from_domain) {
             // Invariant: if we have both project name variants, they should match.
             (Some(option), Some(domain)) if option != domain => {
-                Some(Err(InconsistentProjectNames { domain, option }))
+                Some(Err(ComputeUserInfoParseError::InconsistentProjectNames {
+                    domain,
+                    option,
+                }))
             }
             // Invariant: project name may not contain certain characters.
             (a, b) => a.or(b).map(|name| match project_name_valid(name.as_ref()) {
-                false => Err(MalformedProjectName(name)),
+                false => Err(ComputeUserInfoParseError::MalformedProjectName(name)),
                 true => Ok(name),
             }),
         }
@@ -186,7 +191,7 @@ impl<'de> serde::de::Deserialize<'de> for IpPattern {
         impl<'de> serde::de::Visitor<'de> for StrVisitor {
             type Value = IpPattern;
 
-            fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result {
+            fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
                 write!(formatter, "comma separated list with ip address, ip address range, or ip address subnet mask")
             }
 
diff --git a/proxy/src/cache/common.rs b/proxy/src/cache/common.rs
index 4e393fddb2..82c78e3eb2 100644
--- a/proxy/src/cache/common.rs
+++ b/proxy/src/cache/common.rs
@@ -24,7 +24,7 @@ impl<C: Cache> Cache for &C {
     type LookupInfo<Key> = C::LookupInfo<Key>;
 
     fn invalidate(&self, info: &Self::LookupInfo<Self::Key>) {
-        C::invalidate(self, info)
+        C::invalidate(self, info);
     }
 }
 
diff --git a/proxy/src/cache/timed_lru.rs b/proxy/src/cache/timed_lru.rs
index c5c4f6a1ed..07fad56643 100644
--- a/proxy/src/cache/timed_lru.rs
+++ b/proxy/src/cache/timed_lru.rs
@@ -58,7 +58,7 @@ impl<K: Hash + Eq, V> Cache for TimedLru<K, V> {
     type LookupInfo<Key> = LookupInfo<Key>;
 
     fn invalidate(&self, info: &Self::LookupInfo<K>) {
-        self.invalidate_raw(info)
+        self.invalidate_raw(info);
     }
 }
 
diff --git a/proxy/src/compute.rs b/proxy/src/compute.rs
index 18c82fe379..c071a59d58 100644
--- a/proxy/src/compute.rs
+++ b/proxy/src/compute.rs
@@ -44,11 +44,10 @@ pub enum ConnectionError {
 
 impl UserFacingError for ConnectionError {
     fn to_string_client(&self) -> String {
-        use ConnectionError::*;
         match self {
             // This helps us drop irrelevant library-specific prefixes.
             // TODO: propagate severity level and other parameters.
-            Postgres(err) => match err.as_db_error() {
+            ConnectionError::Postgres(err) => match err.as_db_error() {
                 Some(err) => {
                     let msg = err.message();
 
@@ -62,8 +61,8 @@ impl UserFacingError for ConnectionError {
                 }
                 None => err.to_string(),
             },
-            WakeComputeError(err) => err.to_string_client(),
-            TooManyConnectionAttempts(_) => {
+            ConnectionError::WakeComputeError(err) => err.to_string_client(),
+            ConnectionError::TooManyConnectionAttempts(_) => {
                 "Failed to acquire permit to connect to the database. Too many database connection attempts are currently ongoing.".to_owned()
             }
             _ => COULD_NOT_CONNECT.to_owned(),
@@ -366,16 +365,16 @@ static TLS_ROOTS: OnceCell<Arc<rustls::RootCertStore>> = OnceCell::new();
 struct AcceptEverythingVerifier;
 impl ServerCertVerifier for AcceptEverythingVerifier {
     fn supported_verify_schemes(&self) -> Vec<rustls::SignatureScheme> {
-        use rustls::SignatureScheme::*;
+        use rustls::SignatureScheme;
         // The schemes for which `SignatureScheme::supported_in_tls13` returns true.
         vec![
-            ECDSA_NISTP521_SHA512,
-            ECDSA_NISTP384_SHA384,
-            ECDSA_NISTP256_SHA256,
-            RSA_PSS_SHA512,
-            RSA_PSS_SHA384,
-            RSA_PSS_SHA256,
-            ED25519,
+            SignatureScheme::ECDSA_NISTP521_SHA512,
+            SignatureScheme::ECDSA_NISTP384_SHA384,
+            SignatureScheme::ECDSA_NISTP256_SHA256,
+            SignatureScheme::RSA_PSS_SHA512,
+            SignatureScheme::RSA_PSS_SHA384,
+            SignatureScheme::RSA_PSS_SHA256,
+            SignatureScheme::ED25519,
         ]
     }
     fn verify_server_cert(
diff --git a/proxy/src/config.rs b/proxy/src/config.rs
index 1412095505..36d04924f2 100644
--- a/proxy/src/config.rs
+++ b/proxy/src/config.rs
@@ -155,7 +155,7 @@ pub enum TlsServerEndPoint {
 }
 
 impl TlsServerEndPoint {
-    pub fn new(cert: &CertificateDer) -> anyhow::Result<Self> {
+    pub fn new(cert: &CertificateDer<'_>) -> anyhow::Result<Self> {
         let sha256_oids = [
             // I'm explicitly not adding MD5 or SHA1 here... They're bad.
             oid_registry::OID_SIG_ECDSA_WITH_SHA256,
@@ -278,7 +278,7 @@ impl CertResolver {
 impl rustls::server::ResolvesServerCert for CertResolver {
     fn resolve(
         &self,
-        client_hello: rustls::server::ClientHello,
+        client_hello: rustls::server::ClientHello<'_>,
     ) -> Option<Arc<rustls::sign::CertifiedKey>> {
         self.resolve(client_hello.server_name()).map(|x| x.0)
     }
@@ -559,7 +559,7 @@ impl RetryConfig {
             match key {
                 "num_retries" => num_retries = Some(value.parse()?),
                 "base_retry_wait_duration" => {
-                    base_retry_wait_duration = Some(humantime::parse_duration(value)?)
+                    base_retry_wait_duration = Some(humantime::parse_duration(value)?);
                 }
                 "retry_wait_exponent_base" => retry_wait_exponent_base = Some(value.parse()?),
                 unknown => bail!("unknown key: {unknown}"),
diff --git a/proxy/src/console/messages.rs b/proxy/src/console/messages.rs
index 9abf24ab7f..ac66e116d0 100644
--- a/proxy/src/console/messages.rs
+++ b/proxy/src/console/messages.rs
@@ -22,16 +22,15 @@ impl ConsoleError {
         self.status
             .as_ref()
             .and_then(|s| s.details.error_info.as_ref())
-            .map(|e| e.reason)
-            .unwrap_or(Reason::Unknown)
+            .map_or(Reason::Unknown, |e| e.reason)
     }
+
     pub fn get_user_facing_message(&self) -> String {
         use super::provider::errors::REQUEST_FAILED;
         self.status
             .as_ref()
             .and_then(|s| s.details.user_facing_message.as_ref())
-            .map(|m| m.message.clone().into())
-            .unwrap_or_else(|| {
+            .map_or_else(|| {
                 // Ask @neondatabase/control-plane for review before adding more.
                 match self.http_status_code {
                     http::StatusCode::NOT_FOUND => {
@@ -48,19 +47,18 @@ impl ConsoleError {
                     }
                     _ => REQUEST_FAILED.to_owned(),
                 }
-            })
+            }, |m| m.message.clone().into())
     }
 }
 
 impl Display for ConsoleError {
-    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
-        let msg = self
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        let msg: &str = self
             .status
             .as_ref()
             .and_then(|s| s.details.user_facing_message.as_ref())
-            .map(|m| m.message.as_ref())
-            .unwrap_or_else(|| &self.error);
-        write!(f, "{}", msg)
+            .map_or_else(|| self.error.as_ref(), |m| m.message.as_ref());
+        write!(f, "{msg}")
     }
 }
 
@@ -286,7 +284,7 @@ pub struct DatabaseInfo {
 
 // Manually implement debug to omit sensitive info.
 impl fmt::Debug for DatabaseInfo {
-    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
         f.debug_struct("DatabaseInfo")
             .field("host", &self.host)
             .field("port", &self.port)
@@ -373,7 +371,7 @@ mod tests {
                 }
             }
         });
-        let _: KickSession = serde_json::from_str(&json.to_string())?;
+        let _: KickSession<'_> = serde_json::from_str(&json.to_string())?;
 
         Ok(())
     }
diff --git a/proxy/src/console/mgmt.rs b/proxy/src/console/mgmt.rs
index befe7d7510..82d5033aab 100644
--- a/proxy/src/console/mgmt.rs
+++ b/proxy/src/console/mgmt.rs
@@ -93,7 +93,8 @@ impl postgres_backend::Handler<tokio::net::TcpStream> for MgmtHandler {
 }
 
 fn try_process_query(pgb: &mut PostgresBackendTCP, query: &str) -> Result<(), QueryError> {
-    let resp: KickSession = serde_json::from_str(query).context("Failed to parse query as json")?;
+    let resp: KickSession<'_> =
+        serde_json::from_str(query).context("Failed to parse query as json")?;
 
     let span = info_span!("event", session_id = resp.session_id);
     let _enter = span.enter();
diff --git a/proxy/src/console/provider.rs b/proxy/src/console/provider.rs
index 15fc0134b3..cc2ee10062 100644
--- a/proxy/src/console/provider.rs
+++ b/proxy/src/console/provider.rs
@@ -26,7 +26,7 @@ use tracing::info;
 pub mod errors {
     use crate::{
         console::messages::{self, ConsoleError, Reason},
-        error::{io_error, ReportableError, UserFacingError},
+        error::{io_error, ErrorKind, ReportableError, UserFacingError},
         proxy::retry::CouldRetry,
     };
     use thiserror::Error;
@@ -51,21 +51,19 @@ pub mod errors {
     impl ApiError {
         /// Returns HTTP status code if it's the reason for failure.
         pub fn get_reason(&self) -> messages::Reason {
-            use ApiError::*;
             match self {
-                Console(e) => e.get_reason(),
-                _ => messages::Reason::Unknown,
+                ApiError::Console(e) => e.get_reason(),
+                ApiError::Transport(_) => messages::Reason::Unknown,
             }
         }
     }
 
     impl UserFacingError for ApiError {
         fn to_string_client(&self) -> String {
-            use ApiError::*;
             match self {
                 // To minimize risks, only select errors are forwarded to users.
-                Console(c) => c.get_user_facing_message(),
-                _ => REQUEST_FAILED.to_owned(),
+                ApiError::Console(c) => c.get_user_facing_message(),
+                ApiError::Transport(_) => REQUEST_FAILED.to_owned(),
             }
         }
     }
@@ -73,57 +71,53 @@ pub mod errors {
     impl ReportableError for ApiError {
         fn get_error_kind(&self) -> crate::error::ErrorKind {
             match self {
-                ApiError::Console(e) => {
-                    use crate::error::ErrorKind::*;
-                    match e.get_reason() {
-                        Reason::RoleProtected => User,
-                        Reason::ResourceNotFound => User,
-                        Reason::ProjectNotFound => User,
-                        Reason::EndpointNotFound => User,
-                        Reason::BranchNotFound => User,
-                        Reason::RateLimitExceeded => ServiceRateLimit,
-                        Reason::NonDefaultBranchComputeTimeExceeded => User,
-                        Reason::ActiveTimeQuotaExceeded => User,
-                        Reason::ComputeTimeQuotaExceeded => User,
-                        Reason::WrittenDataQuotaExceeded => User,
-                        Reason::DataTransferQuotaExceeded => User,
-                        Reason::LogicalSizeQuotaExceeded => User,
-                        Reason::ConcurrencyLimitReached => ControlPlane,
-                        Reason::LockAlreadyTaken => ControlPlane,
-                        Reason::RunningOperations => ControlPlane,
-                        Reason::Unknown => match &e {
-                            ConsoleError {
-                                http_status_code:
-                                    http::StatusCode::NOT_FOUND | http::StatusCode::NOT_ACCEPTABLE,
-                                ..
-                            } => crate::error::ErrorKind::User,
-                            ConsoleError {
-                                http_status_code: http::StatusCode::UNPROCESSABLE_ENTITY,
-                                error,
-                                ..
-                            } if error.contains(
-                                "compute time quota of non-primary branches is exceeded",
-                            ) =>
-                            {
-                                crate::error::ErrorKind::User
-                            }
-                            ConsoleError {
-                                http_status_code: http::StatusCode::LOCKED,
-                                error,
-                                ..
-                            } if error.contains("quota exceeded")
-                                || error.contains("the limit for current plan reached") =>
-                            {
-                                crate::error::ErrorKind::User
-                            }
-                            ConsoleError {
-                                http_status_code: http::StatusCode::TOO_MANY_REQUESTS,
-                                ..
-                            } => crate::error::ErrorKind::ServiceRateLimit,
-                            ConsoleError { .. } => crate::error::ErrorKind::ControlPlane,
-                        },
-                    }
-                }
+                ApiError::Console(e) => match e.get_reason() {
+                    Reason::RoleProtected => ErrorKind::User,
+                    Reason::ResourceNotFound => ErrorKind::User,
+                    Reason::ProjectNotFound => ErrorKind::User,
+                    Reason::EndpointNotFound => ErrorKind::User,
+                    Reason::BranchNotFound => ErrorKind::User,
+                    Reason::RateLimitExceeded => ErrorKind::ServiceRateLimit,
+                    Reason::NonDefaultBranchComputeTimeExceeded => ErrorKind::User,
+                    Reason::ActiveTimeQuotaExceeded => ErrorKind::User,
+                    Reason::ComputeTimeQuotaExceeded => ErrorKind::User,
+                    Reason::WrittenDataQuotaExceeded => ErrorKind::User,
+                    Reason::DataTransferQuotaExceeded => ErrorKind::User,
+                    Reason::LogicalSizeQuotaExceeded => ErrorKind::User,
+                    Reason::ConcurrencyLimitReached => ErrorKind::ControlPlane,
+                    Reason::LockAlreadyTaken => ErrorKind::ControlPlane,
+                    Reason::RunningOperations => ErrorKind::ControlPlane,
+                    Reason::Unknown => match &e {
+                        ConsoleError {
+                            http_status_code:
+                                http::StatusCode::NOT_FOUND | http::StatusCode::NOT_ACCEPTABLE,
+                            ..
+                        } => crate::error::ErrorKind::User,
+                        ConsoleError {
+                            http_status_code: http::StatusCode::UNPROCESSABLE_ENTITY,
+                            error,
+                            ..
+                        } if error
+                            .contains("compute time quota of non-primary branches is exceeded") =>
+                        {
+                            crate::error::ErrorKind::User
+                        }
+                        ConsoleError {
+                            http_status_code: http::StatusCode::LOCKED,
+                            error,
+                            ..
+                        } if error.contains("quota exceeded")
+                            || error.contains("the limit for current plan reached") =>
+                        {
+                            crate::error::ErrorKind::User
+                        }
+                        ConsoleError {
+                            http_status_code: http::StatusCode::TOO_MANY_REQUESTS,
+                            ..
+                        } => crate::error::ErrorKind::ServiceRateLimit,
+                        ConsoleError { .. } => crate::error::ErrorKind::ControlPlane,
+                    },
+                },
                 ApiError::Transport(_) => crate::error::ErrorKind::ControlPlane,
             }
         }
@@ -170,12 +164,11 @@ pub mod errors {
 
     impl UserFacingError for GetAuthInfoError {
         fn to_string_client(&self) -> String {
-            use GetAuthInfoError::*;
             match self {
                 // We absolutely should not leak any secrets!
-                BadSecret => REQUEST_FAILED.to_owned(),
+                Self::BadSecret => REQUEST_FAILED.to_owned(),
                 // However, API might return a meaningful error.
-                ApiError(e) => e.to_string_client(),
+                Self::ApiError(e) => e.to_string_client(),
             }
         }
     }
@@ -183,8 +176,8 @@ pub mod errors {
     impl ReportableError for GetAuthInfoError {
         fn get_error_kind(&self) -> crate::error::ErrorKind {
             match self {
-                GetAuthInfoError::BadSecret => crate::error::ErrorKind::ControlPlane,
-                GetAuthInfoError::ApiError(_) => crate::error::ErrorKind::ControlPlane,
+                Self::BadSecret => crate::error::ErrorKind::ControlPlane,
+                Self::ApiError(_) => crate::error::ErrorKind::ControlPlane,
             }
         }
     }
@@ -213,17 +206,16 @@ pub mod errors {
 
     impl UserFacingError for WakeComputeError {
         fn to_string_client(&self) -> String {
-            use WakeComputeError::*;
             match self {
                 // We shouldn't show user the address even if it's broken.
                 // Besides, user is unlikely to care about this detail.
-                BadComputeAddress(_) => REQUEST_FAILED.to_owned(),
+                Self::BadComputeAddress(_) => REQUEST_FAILED.to_owned(),
                 // However, API might return a meaningful error.
-                ApiError(e) => e.to_string_client(),
+                Self::ApiError(e) => e.to_string_client(),
 
-                TooManyConnections => self.to_string(),
+                Self::TooManyConnections => self.to_string(),
 
-                TooManyConnectionAttempts(_) => {
+                Self::TooManyConnectionAttempts(_) => {
                     "Failed to acquire permit to connect to the database. Too many database connection attempts are currently ongoing.".to_owned()
                 }
             }
@@ -233,10 +225,10 @@ pub mod errors {
     impl ReportableError for WakeComputeError {
         fn get_error_kind(&self) -> crate::error::ErrorKind {
             match self {
-                WakeComputeError::BadComputeAddress(_) => crate::error::ErrorKind::ControlPlane,
-                WakeComputeError::ApiError(e) => e.get_error_kind(),
-                WakeComputeError::TooManyConnections => crate::error::ErrorKind::RateLimit,
-                WakeComputeError::TooManyConnectionAttempts(e) => e.get_error_kind(),
+                Self::BadComputeAddress(_) => crate::error::ErrorKind::ControlPlane,
+                Self::ApiError(e) => e.get_error_kind(),
+                Self::TooManyConnections => crate::error::ErrorKind::RateLimit,
+                Self::TooManyConnectionAttempts(e) => e.get_error_kind(),
             }
         }
     }
@@ -244,10 +236,10 @@ pub mod errors {
     impl CouldRetry for WakeComputeError {
         fn could_retry(&self) -> bool {
             match self {
-                WakeComputeError::BadComputeAddress(_) => false,
-                WakeComputeError::ApiError(e) => e.could_retry(),
-                WakeComputeError::TooManyConnections => false,
-                WakeComputeError::TooManyConnectionAttempts(_) => false,
+                Self::BadComputeAddress(_) => false,
+                Self::ApiError(e) => e.could_retry(),
+                Self::TooManyConnections => false,
+                Self::TooManyConnectionAttempts(_) => false,
             }
         }
     }
@@ -366,13 +358,14 @@ impl Api for ConsoleBackend {
         ctx: &RequestMonitoring,
         user_info: &ComputeUserInfo,
     ) -> Result<CachedRoleSecret, errors::GetAuthInfoError> {
-        use ConsoleBackend::*;
         match self {
-            Console(api) => api.get_role_secret(ctx, user_info).await,
+            Self::Console(api) => api.get_role_secret(ctx, user_info).await,
             #[cfg(any(test, feature = "testing"))]
-            Postgres(api) => api.get_role_secret(ctx, user_info).await,
+            Self::Postgres(api) => api.get_role_secret(ctx, user_info).await,
             #[cfg(test)]
-            Test(_) => unreachable!("this function should never be called in the test backend"),
+            Self::Test(_) => {
+                unreachable!("this function should never be called in the test backend")
+            }
         }
     }
 
@@ -381,13 +374,12 @@ impl Api for ConsoleBackend {
         ctx: &RequestMonitoring,
         user_info: &ComputeUserInfo,
     ) -> Result<(CachedAllowedIps, Option<CachedRoleSecret>), errors::GetAuthInfoError> {
-        use ConsoleBackend::*;
         match self {
-            Console(api) => api.get_allowed_ips_and_secret(ctx, user_info).await,
+            Self::Console(api) => api.get_allowed_ips_and_secret(ctx, user_info).await,
             #[cfg(any(test, feature = "testing"))]
-            Postgres(api) => api.get_allowed_ips_and_secret(ctx, user_info).await,
+            Self::Postgres(api) => api.get_allowed_ips_and_secret(ctx, user_info).await,
             #[cfg(test)]
-            Test(api) => api.get_allowed_ips_and_secret(),
+            Self::Test(api) => api.get_allowed_ips_and_secret(),
         }
     }
 
@@ -396,14 +388,12 @@ impl Api for ConsoleBackend {
         ctx: &RequestMonitoring,
         user_info: &ComputeUserInfo,
     ) -> Result<CachedNodeInfo, errors::WakeComputeError> {
-        use ConsoleBackend::*;
-
         match self {
-            Console(api) => api.wake_compute(ctx, user_info).await,
+            Self::Console(api) => api.wake_compute(ctx, user_info).await,
             #[cfg(any(test, feature = "testing"))]
-            Postgres(api) => api.wake_compute(ctx, user_info).await,
+            Self::Postgres(api) => api.wake_compute(ctx, user_info).await,
             #[cfg(test)]
-            Test(api) => api.wake_compute(),
+            Self::Test(api) => api.wake_compute(),
         }
     }
 }
@@ -549,7 +539,7 @@ impl WakeComputePermit {
         !self.permit.is_disabled()
     }
     pub fn release(self, outcome: Outcome) {
-        self.permit.release(outcome)
+        self.permit.release(outcome);
     }
     pub fn release_result<T, E>(self, res: Result<T, E>) -> Result<T, E> {
         match res {
diff --git a/proxy/src/context.rs b/proxy/src/context.rs
index e925f67233..cafbdedc15 100644
--- a/proxy/src/context.rs
+++ b/proxy/src/context.rs
@@ -166,7 +166,7 @@ impl RequestMonitoring {
     pub fn set_project(&self, x: MetricsAuxInfo) {
         let mut this = self.0.try_lock().expect("should not deadlock");
         if this.endpoint_id.is_none() {
-            this.set_endpoint_id(x.endpoint_id.as_str().into())
+            this.set_endpoint_id(x.endpoint_id.as_str().into());
         }
         this.branch = Some(x.branch_id);
         this.project = Some(x.project_id);
@@ -260,7 +260,7 @@ impl RequestMonitoring {
             .cold_start_info
     }
 
-    pub fn latency_timer_pause(&self, waiting_for: Waiting) -> LatencyTimerPause {
+    pub fn latency_timer_pause(&self, waiting_for: Waiting) -> LatencyTimerPause<'_> {
         LatencyTimerPause {
             ctx: self,
             start: tokio::time::Instant::now(),
@@ -273,7 +273,7 @@ impl RequestMonitoring {
             .try_lock()
             .expect("should not deadlock")
             .latency_timer
-            .success()
+            .success();
     }
 }
 
@@ -328,7 +328,7 @@ impl RequestMonitoringInner {
     fn has_private_peer_addr(&self) -> bool {
         match self.peer_addr {
             IpAddr::V4(ip) => ip.is_private(),
-            _ => false,
+            IpAddr::V6(_) => false,
         }
     }
 
diff --git a/proxy/src/context/parquet.rs b/proxy/src/context/parquet.rs
index bb02a476fc..e5962b35fa 100644
--- a/proxy/src/context/parquet.rs
+++ b/proxy/src/context/parquet.rs
@@ -736,7 +736,7 @@ mod tests {
                 while let Some(r) = s.next().await {
                     tx.send(r).unwrap();
                 }
-                time::sleep(time::Duration::from_secs(70)).await
+                time::sleep(time::Duration::from_secs(70)).await;
             }
         });
 
diff --git a/proxy/src/intern.rs b/proxy/src/intern.rs
index e38135dd22..d418caa511 100644
--- a/proxy/src/intern.rs
+++ b/proxy/src/intern.rs
@@ -56,7 +56,7 @@ impl<'de, Id: InternId> serde::de::Deserialize<'de> for InternedString<Id> {
         impl<'de, Id: InternId> serde::de::Visitor<'de> for Visitor<Id> {
             type Value = InternedString<Id>;
 
-            fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result {
+            fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
                 formatter.write_str("a string")
             }
 
diff --git a/proxy/src/metrics.rs b/proxy/src/metrics.rs
index 0167553e30..ccef88231b 100644
--- a/proxy/src/metrics.rs
+++ b/proxy/src/metrics.rs
@@ -252,7 +252,7 @@ impl Drop for HttpEndpointPoolsGuard<'_> {
 }
 
 impl HttpEndpointPools {
-    pub fn guard(&self) -> HttpEndpointPoolsGuard {
+    pub fn guard(&self) -> HttpEndpointPoolsGuard<'_> {
         self.http_pool_endpoints_registered_total.inc();
         HttpEndpointPoolsGuard {
             dec: &self.http_pool_endpoints_unregistered_total,
diff --git a/proxy/src/proxy/copy_bidirectional.rs b/proxy/src/proxy/copy_bidirectional.rs
index 3c45fff969..048523f69c 100644
--- a/proxy/src/proxy/copy_bidirectional.rs
+++ b/proxy/src/proxy/copy_bidirectional.rs
@@ -184,7 +184,7 @@ impl CopyBuffer {
                 }
                 Poll::Pending
             }
-            res => res.map_err(ErrorDirection::Write),
+            res @ Poll::Ready(_) => res.map_err(ErrorDirection::Write),
         }
     }
 
diff --git a/proxy/src/proxy/handshake.rs b/proxy/src/proxy/handshake.rs
index c65a5558d9..27a72f8072 100644
--- a/proxy/src/proxy/handshake.rs
+++ b/proxy/src/proxy/handshake.rs
@@ -82,9 +82,8 @@ pub async fn handshake<S: AsyncRead + AsyncWrite + Unpin>(
     let mut stream = PqStream::new(Stream::from_raw(stream));
     loop {
         let msg = stream.read_startup_packet().await?;
-        use FeStartupPacket::*;
         match msg {
-            SslRequest { direct } => match stream.get_ref() {
+            FeStartupPacket::SslRequest { direct } => match stream.get_ref() {
                 Stream::Raw { .. } if !tried_ssl => {
                     tried_ssl = true;
 
@@ -139,7 +138,7 @@ pub async fn handshake<S: AsyncRead + AsyncWrite + Unpin>(
 
                         let tls_stream = accept.await.inspect_err(|_| {
                             if record_handshake_error {
-                                Metrics::get().proxy.tls_handshake_failures.inc()
+                                Metrics::get().proxy.tls_handshake_failures.inc();
                             }
                         })?;
 
@@ -182,7 +181,7 @@ pub async fn handshake<S: AsyncRead + AsyncWrite + Unpin>(
                 }
                 _ => return Err(HandshakeError::ProtocolViolation),
             },
-            GssEncRequest => match stream.get_ref() {
+            FeStartupPacket::GssEncRequest => match stream.get_ref() {
                 Stream::Raw { .. } if !tried_gss => {
                     tried_gss = true;
 
@@ -191,7 +190,7 @@ pub async fn handshake<S: AsyncRead + AsyncWrite + Unpin>(
                 }
                 _ => return Err(HandshakeError::ProtocolViolation),
             },
-            StartupMessage { params, version }
+            FeStartupPacket::StartupMessage { params, version }
                 if PG_PROTOCOL_EARLIEST <= version && version <= PG_PROTOCOL_LATEST =>
             {
                 // Check that the config has been consumed during upgrade
@@ -211,7 +210,7 @@ pub async fn handshake<S: AsyncRead + AsyncWrite + Unpin>(
                 break Ok(HandshakeData::Startup(stream, params));
             }
             // downgrade protocol version
-            StartupMessage { params, version }
+            FeStartupPacket::StartupMessage { params, version }
                 if version.major() == 3 && version > PG_PROTOCOL_LATEST =>
             {
                 warn!(?version, "unsupported minor version");
@@ -241,7 +240,7 @@ pub async fn handshake<S: AsyncRead + AsyncWrite + Unpin>(
                 );
                 break Ok(HandshakeData::Startup(stream, params));
             }
-            StartupMessage { version, .. } => {
+            FeStartupPacket::StartupMessage { version, .. } => {
                 warn!(
                     ?version,
                     session_type = "normal",
@@ -249,7 +248,7 @@ pub async fn handshake<S: AsyncRead + AsyncWrite + Unpin>(
                 );
                 return Err(HandshakeError::ProtocolViolation);
             }
-            CancelRequest(cancel_key_data) => {
+            FeStartupPacket::CancelRequest(cancel_key_data) => {
                 info!(session_type = "cancellation", "successful handshake");
                 break Ok(HandshakeData::Cancel(cancel_key_data));
             }
diff --git a/proxy/src/proxy/tests/mitm.rs b/proxy/src/proxy/tests/mitm.rs
index c8ec2b2db6..2d752b9183 100644
--- a/proxy/src/proxy/tests/mitm.rs
+++ b/proxy/src/proxy/tests/mitm.rs
@@ -68,7 +68,7 @@ async fn proxy_mitm(
                                 end_client.send(Bytes::from_static(b"R\0\0\0\x17\0\0\0\x0aSCRAM-SHA-256\0\0")).await.unwrap();
                                 continue;
                             }
-                            end_client.send(message).await.unwrap()
+                            end_client.send(message).await.unwrap();
                         }
                         _ => break,
                     }
@@ -88,7 +88,7 @@ async fn proxy_mitm(
                                 end_server.send(buf.freeze()).await.unwrap();
                                 continue;
                             }
-                            end_server.send(message).await.unwrap()
+                            end_server.send(message).await.unwrap();
                         }
                         _ => break,
                     }
diff --git a/proxy/src/rate_limiter/limit_algorithm.rs b/proxy/src/rate_limiter/limit_algorithm.rs
index 3842ce269e..80a62b2a76 100644
--- a/proxy/src/rate_limiter/limit_algorithm.rs
+++ b/proxy/src/rate_limiter/limit_algorithm.rs
@@ -237,7 +237,7 @@ impl Token {
     }
 
     pub fn release(mut self, outcome: Outcome) {
-        self.release_mut(Some(outcome))
+        self.release_mut(Some(outcome));
     }
 
     pub fn release_mut(&mut self, outcome: Option<Outcome>) {
@@ -249,7 +249,7 @@ impl Token {
 
 impl Drop for Token {
     fn drop(&mut self) {
-        self.release_mut(None)
+        self.release_mut(None);
     }
 }
 
diff --git a/proxy/src/rate_limiter/limit_algorithm/aimd.rs b/proxy/src/rate_limiter/limit_algorithm/aimd.rs
index b39740bb21..d669492fa6 100644
--- a/proxy/src/rate_limiter/limit_algorithm/aimd.rs
+++ b/proxy/src/rate_limiter/limit_algorithm/aimd.rs
@@ -25,9 +25,8 @@ pub struct Aimd {
 
 impl LimitAlgorithm for Aimd {
     fn update(&self, old_limit: usize, sample: Sample) -> usize {
-        use Outcome::*;
         match sample.outcome {
-            Success => {
+            Outcome::Success => {
                 let utilisation = sample.in_flight as f32 / old_limit as f32;
 
                 if utilisation > self.utilisation {
@@ -42,7 +41,7 @@ impl LimitAlgorithm for Aimd {
                     old_limit
                 }
             }
-            Overload => {
+            Outcome::Overload => {
                 let limit = old_limit as f32 * self.dec;
 
                 // Floor instead of round, so the limit reduces even with small numbers.
diff --git a/proxy/src/redis/connection_with_credentials_provider.rs b/proxy/src/redis/connection_with_credentials_provider.rs
index b02ce472c0..c78ee166f1 100644
--- a/proxy/src/redis/connection_with_credentials_provider.rs
+++ b/proxy/src/redis/connection_with_credentials_provider.rs
@@ -98,7 +98,7 @@ impl ConnectionWithCredentialsProvider {
         info!("Establishing a new connection...");
         self.con = None;
         if let Some(f) = self.refresh_token_task.take() {
-            f.abort()
+            f.abort();
         }
         let mut con = self
             .get_client()
diff --git a/proxy/src/redis/notifications.rs b/proxy/src/redis/notifications.rs
index efd7437d5d..ad69246443 100644
--- a/proxy/src/redis/notifications.rs
+++ b/proxy/src/redis/notifications.rs
@@ -108,7 +108,6 @@ impl<C: ProjectInfoCache + Send + Sync + 'static> MessageHandler<C> {
     }
     #[tracing::instrument(skip(self, msg), fields(session_id = tracing::field::Empty))]
     async fn handle_message(&self, msg: redis::Msg) -> anyhow::Result<()> {
-        use Notification::*;
         let payload: String = msg.get_payload()?;
         tracing::debug!(?payload, "received a message payload");
 
@@ -124,7 +123,7 @@ impl<C: ProjectInfoCache + Send + Sync + 'static> MessageHandler<C> {
         };
         tracing::debug!(?msg, "received a message");
         match msg {
-            Cancel(cancel_session) => {
+            Notification::Cancel(cancel_session) => {
                 tracing::Span::current().record(
                     "session_id",
                     tracing::field::display(cancel_session.session_id),
@@ -153,12 +152,12 @@ impl<C: ProjectInfoCache + Send + Sync + 'static> MessageHandler<C> {
             }
             _ => {
                 invalidate_cache(self.cache.clone(), msg.clone());
-                if matches!(msg, AllowedIpsUpdate { .. }) {
+                if matches!(msg, Notification::AllowedIpsUpdate { .. }) {
                     Metrics::get()
                         .proxy
                         .redis_events_count
                         .inc(RedisEventsCount::AllowedIpsUpdate);
-                } else if matches!(msg, PasswordUpdate { .. }) {
+                } else if matches!(msg, Notification::PasswordUpdate { .. }) {
                     Metrics::get()
                         .proxy
                         .redis_events_count
@@ -180,16 +179,16 @@ impl<C: ProjectInfoCache + Send + Sync + 'static> MessageHandler<C> {
 }
 
 fn invalidate_cache<C: ProjectInfoCache>(cache: Arc<C>, msg: Notification) {
-    use Notification::*;
     match msg {
-        AllowedIpsUpdate { allowed_ips_update } => {
-            cache.invalidate_allowed_ips_for_project(allowed_ips_update.project_id)
+        Notification::AllowedIpsUpdate { allowed_ips_update } => {
+            cache.invalidate_allowed_ips_for_project(allowed_ips_update.project_id);
         }
-        PasswordUpdate { password_update } => cache.invalidate_role_secret_for_project(
-            password_update.project_id,
-            password_update.role_name,
-        ),
-        Cancel(_) => unreachable!("cancel message should be handled separately"),
+        Notification::PasswordUpdate { password_update } => cache
+            .invalidate_role_secret_for_project(
+                password_update.project_id,
+                password_update.role_name,
+            ),
+        Notification::Cancel(_) => unreachable!("cancel message should be handled separately"),
     }
 }
 
diff --git a/proxy/src/sasl.rs b/proxy/src/sasl.rs
index 0811416ca2..60207fc824 100644
--- a/proxy/src/sasl.rs
+++ b/proxy/src/sasl.rs
@@ -42,10 +42,9 @@ pub enum Error {
 
 impl UserFacingError for Error {
     fn to_string_client(&self) -> String {
-        use Error::*;
         match self {
-            ChannelBindingFailed(m) => m.to_string(),
-            ChannelBindingBadMethod(m) => format!("unsupported channel binding method {m}"),
+            Self::ChannelBindingFailed(m) => (*m).to_string(),
+            Self::ChannelBindingBadMethod(m) => format!("unsupported channel binding method {m}"),
             _ => "authentication protocol violation".to_string(),
         }
     }
diff --git a/proxy/src/sasl/channel_binding.rs b/proxy/src/sasl/channel_binding.rs
index 13d681de6d..6e2d3057ce 100644
--- a/proxy/src/sasl/channel_binding.rs
+++ b/proxy/src/sasl/channel_binding.rs
@@ -13,11 +13,10 @@ pub enum ChannelBinding<T> {
 
 impl<T> ChannelBinding<T> {
     pub fn and_then<R, E>(self, f: impl FnOnce(T) -> Result<R, E>) -> Result<ChannelBinding<R>, E> {
-        use ChannelBinding::*;
         Ok(match self {
-            NotSupportedClient => NotSupportedClient,
-            NotSupportedServer => NotSupportedServer,
-            Required(x) => Required(f(x)?),
+            Self::NotSupportedClient => ChannelBinding::NotSupportedClient,
+            Self::NotSupportedServer => ChannelBinding::NotSupportedServer,
+            Self::Required(x) => ChannelBinding::Required(f(x)?),
         })
     }
 }
@@ -25,11 +24,10 @@ impl<T> ChannelBinding<T> {
 impl<'a> ChannelBinding<&'a str> {
     // NB: FromStr doesn't work with lifetimes
     pub fn parse(input: &'a str) -> Option<Self> {
-        use ChannelBinding::*;
         Some(match input {
-            "n" => NotSupportedClient,
-            "y" => NotSupportedServer,
-            other => Required(other.strip_prefix("p=")?),
+            "n" => Self::NotSupportedClient,
+            "y" => Self::NotSupportedServer,
+            other => Self::Required(other.strip_prefix("p=")?),
         })
     }
 }
@@ -40,17 +38,16 @@ impl<T: std::fmt::Display> ChannelBinding<T> {
         &self,
         get_cbind_data: impl FnOnce(&T) -> Result<&'a [u8], E>,
     ) -> Result<std::borrow::Cow<'static, str>, E> {
-        use ChannelBinding::*;
         Ok(match self {
-            NotSupportedClient => {
+            Self::NotSupportedClient => {
                 // base64::encode("n,,")
                 "biws".into()
             }
-            NotSupportedServer => {
+            Self::NotSupportedServer => {
                 // base64::encode("y,,")
                 "eSws".into()
             }
-            Required(mode) => {
+            Self::Required(mode) => {
                 use std::io::Write;
                 let mut cbind_input = vec![];
                 write!(&mut cbind_input, "p={mode},,",).unwrap();
diff --git a/proxy/src/sasl/messages.rs b/proxy/src/sasl/messages.rs
index b9208f6f1f..2b5ae1785d 100644
--- a/proxy/src/sasl/messages.rs
+++ b/proxy/src/sasl/messages.rs
@@ -42,10 +42,9 @@ pub(super) enum ServerMessage<T> {
 
 impl<'a> ServerMessage<&'a str> {
     pub(super) fn to_reply(&self) -> BeMessage<'a> {
-        use BeAuthenticationSaslMessage::*;
         BeMessage::AuthenticationSasl(match self {
-            ServerMessage::Continue(s) => Continue(s.as_bytes()),
-            ServerMessage::Final(s) => Final(s.as_bytes()),
+            ServerMessage::Continue(s) => BeAuthenticationSaslMessage::Continue(s.as_bytes()),
+            ServerMessage::Final(s) => BeAuthenticationSaslMessage::Final(s.as_bytes()),
         })
     }
 }
diff --git a/proxy/src/scram.rs b/proxy/src/scram.rs
index 862facb4e5..145e727a74 100644
--- a/proxy/src/scram.rs
+++ b/proxy/src/scram.rs
@@ -137,12 +137,12 @@ mod tests {
 
     #[tokio::test]
     async fn round_trip() {
-        run_round_trip_test("pencil", "pencil").await
+        run_round_trip_test("pencil", "pencil").await;
     }
 
     #[tokio::test]
     #[should_panic(expected = "password doesn't match")]
     async fn failure() {
-        run_round_trip_test("pencil", "eraser").await
+        run_round_trip_test("pencil", "eraser").await;
     }
 }
diff --git a/proxy/src/scram/countmin.rs b/proxy/src/scram/countmin.rs
index e8e7ef5c86..944bb3c83e 100644
--- a/proxy/src/scram/countmin.rs
+++ b/proxy/src/scram/countmin.rs
@@ -98,8 +98,6 @@ mod tests {
         // q% of counts will be within p of the actual value
         let mut sketch = CountMinSketch::with_params(p / N as f64, 1.0 - q);
 
-        dbg!(sketch.buckets.len());
-
         // insert a bunch of entries in a random order
         let mut ids2 = ids.clone();
         while !ids2.is_empty() {
diff --git a/proxy/src/scram/exchange.rs b/proxy/src/scram/exchange.rs
index d0adbc780e..f2494379a5 100644
--- a/proxy/src/scram/exchange.rs
+++ b/proxy/src/scram/exchange.rs
@@ -210,23 +210,23 @@ impl sasl::Mechanism for Exchange<'_> {
     type Output = super::ScramKey;
 
     fn exchange(mut self, input: &str) -> sasl::Result<sasl::Step<Self, Self::Output>> {
-        use {sasl::Step::*, ExchangeState::*};
+        use {sasl::Step, ExchangeState};
         match &self.state {
-            Initial(init) => {
+            ExchangeState::Initial(init) => {
                 match init.transition(self.secret, &self.tls_server_end_point, input)? {
-                    Continue(sent, msg) => {
-                        self.state = SaltSent(sent);
-                        Ok(Continue(self, msg))
+                    Step::Continue(sent, msg) => {
+                        self.state = ExchangeState::SaltSent(sent);
+                        Ok(Step::Continue(self, msg))
                     }
-                    Success(x, _) => match x {},
-                    Failure(msg) => Ok(Failure(msg)),
+                    Step::Success(x, _) => match x {},
+                    Step::Failure(msg) => Ok(Step::Failure(msg)),
                 }
             }
-            SaltSent(sent) => {
+            ExchangeState::SaltSent(sent) => {
                 match sent.transition(self.secret, &self.tls_server_end_point, input)? {
-                    Success(keys, msg) => Ok(Success(keys, msg)),
-                    Continue(x, _) => match x {},
-                    Failure(msg) => Ok(Failure(msg)),
+                    Step::Success(keys, msg) => Ok(Step::Success(keys, msg)),
+                    Step::Continue(x, _) => match x {},
+                    Step::Failure(msg) => Ok(Step::Failure(msg)),
                 }
             }
         }
diff --git a/proxy/src/scram/messages.rs b/proxy/src/scram/messages.rs
index cf677a3334..5ecbbf7004 100644
--- a/proxy/src/scram/messages.rs
+++ b/proxy/src/scram/messages.rs
@@ -59,7 +59,7 @@ impl<'a> ClientFirstMessage<'a> {
 
         // https://github.com/postgres/postgres/blob/f83908798f78c4cafda217ca875602c88ea2ae28/src/backend/libpq/auth-scram.c#L13-L14
         if !username.is_empty() {
-            tracing::warn!(username, "scram username provided, but is not expected")
+            tracing::warn!(username, "scram username provided, but is not expected");
             // TODO(conrad):
             // return None;
         }
@@ -137,7 +137,7 @@ impl<'a> ClientFinalMessage<'a> {
     /// Build a response to [`ClientFinalMessage`].
     pub fn build_server_final_message(
         &self,
-        signature_builder: SignatureBuilder,
+        signature_builder: SignatureBuilder<'_>,
         server_key: &ScramKey,
     ) -> String {
         let mut buf = String::from("v=");
@@ -212,7 +212,7 @@ mod tests {
 
     #[test]
     fn parse_client_first_message_with_invalid_gs2_authz() {
-        assert!(ClientFirstMessage::parse("n,authzid,n=,r=nonce").is_none())
+        assert!(ClientFirstMessage::parse("n,authzid,n=,r=nonce").is_none());
     }
 
     #[test]
diff --git a/proxy/src/scram/pbkdf2.rs b/proxy/src/scram/pbkdf2.rs
index a803ba7e1b..f690cc7738 100644
--- a/proxy/src/scram/pbkdf2.rs
+++ b/proxy/src/scram/pbkdf2.rs
@@ -84,6 +84,6 @@ mod tests {
         };
 
         let expected = pbkdf2_hmac_array::<Sha256, 32>(pass, salt, 600000);
-        assert_eq!(hash, expected)
+        assert_eq!(hash, expected);
     }
 }
diff --git a/proxy/src/scram/threadpool.rs b/proxy/src/scram/threadpool.rs
index 7701b869a3..fa3d3ccca2 100644
--- a/proxy/src/scram/threadpool.rs
+++ b/proxy/src/scram/threadpool.rs
@@ -270,7 +270,7 @@ fn thread_rt(pool: Arc<ThreadPool>, worker: Worker<JobSpec>, index: usize) {
                         .inc(ThreadPoolWorkerId(index));
 
                     // skip for now
-                    worker.push(job)
+                    worker.push(job);
                 }
             }
 
@@ -316,6 +316,6 @@ mod tests {
             10, 114, 73, 188, 140, 222, 196, 156, 214, 184, 79, 157, 119, 242, 16, 31, 53, 242,
             178, 43, 95, 8, 225, 182, 122, 40, 219, 21, 89, 147, 64, 140,
         ];
-        assert_eq!(actual, expected)
+        assert_eq!(actual, expected);
     }
 }
diff --git a/proxy/src/serverless.rs b/proxy/src/serverless.rs
index 115bef7375..5416d63b5b 100644
--- a/proxy/src/serverless.rs
+++ b/proxy/src/serverless.rs
@@ -120,7 +120,7 @@ pub async fn task_main(
             tracing::trace!("attempting to cancel a random connection");
             if let Some(token) = config.http_config.cancel_set.take() {
                 tracing::debug!("cancelling a random connection");
-                token.cancel()
+                token.cancel();
             }
         }
 
@@ -198,7 +198,7 @@ async fn connection_startup(
     let peer_addr = peer.unwrap_or(peer_addr).ip();
     let has_private_peer_addr = match peer_addr {
         IpAddr::V4(ip) => ip.is_private(),
-        _ => false,
+        IpAddr::V6(_) => false,
     };
     info!(?session_id, %peer_addr, "accepted new TCP connection");
 
diff --git a/proxy/src/serverless/conn_pool.rs b/proxy/src/serverless/conn_pool.rs
index e1dc44dc1c..9ede659cc4 100644
--- a/proxy/src/serverless/conn_pool.rs
+++ b/proxy/src/serverless/conn_pool.rs
@@ -390,7 +390,7 @@ impl<C: ClientInnerExt> GlobalConnPool<C> {
             .write()
             .get_conn_entry(conn_info.db_and_user())
         {
-            client = Some(entry.conn)
+            client = Some(entry.conn);
         }
         let endpoint_pool = Arc::downgrade(&endpoint_pool);
 
@@ -662,13 +662,13 @@ impl<C: ClientInnerExt> Discard<'_, C> {
     pub fn check_idle(&mut self, status: ReadyForQueryStatus) {
         let conn_info = &self.conn_info;
         if status != ReadyForQueryStatus::Idle && std::mem::take(self.pool).strong_count() > 0 {
-            info!("pool: throwing away connection '{conn_info}' because connection is not idle")
+            info!("pool: throwing away connection '{conn_info}' because connection is not idle");
         }
     }
     pub fn discard(&mut self) {
         let conn_info = &self.conn_info;
         if std::mem::take(self.pool).strong_count() > 0 {
-            info!("pool: throwing away connection '{conn_info}' because connection is potentially in a broken state")
+            info!("pool: throwing away connection '{conn_info}' because connection is potentially in a broken state");
         }
     }
 }
diff --git a/proxy/src/stream.rs b/proxy/src/stream.rs
index 690e92ffb1..7809d2e574 100644
--- a/proxy/src/stream.rs
+++ b/proxy/src/stream.rs
@@ -234,7 +234,7 @@ impl<S: AsyncRead + AsyncWrite + Unpin> Stream<S> {
                 .await
                 .inspect_err(|_| {
                     if record_handshake_error {
-                        Metrics::get().proxy.tls_handshake_failures.inc()
+                        Metrics::get().proxy.tls_handshake_failures.inc();
                     }
                 })?),
             Stream::Tls { .. } => Err(StreamUpgradeError::AlreadyTls),
diff --git a/proxy/src/url.rs b/proxy/src/url.rs
index 92c64bb8ad..202fe8de1f 100644
--- a/proxy/src/url.rs
+++ b/proxy/src/url.rs
@@ -12,7 +12,7 @@ impl ApiUrl {
     }
 
     /// See [`url::Url::path_segments_mut`].
-    pub fn path_segments_mut(&mut self) -> url::PathSegmentsMut {
+    pub fn path_segments_mut(&mut self) -> url::PathSegmentsMut<'_> {
         // We've already verified that it works during construction.
         self.0.path_segments_mut().expect("bad API url")
     }
diff --git a/proxy/src/waiters.rs b/proxy/src/waiters.rs
index 888ad38048..3bd8f4c8ef 100644
--- a/proxy/src/waiters.rs
+++ b/proxy/src/waiters.rs
@@ -36,7 +36,7 @@ impl<T> Default for Waiters<T> {
 }
 
 impl<T> Waiters<T> {
-    pub fn register(&self, key: String) -> Result<Waiter<T>, RegisterError> {
+    pub fn register(&self, key: String) -> Result<Waiter<'_, T>, RegisterError> {
         let (tx, rx) = oneshot::channel();
 
         self.0

From eb7241c798d445cd7bcb52d14fbf6c59f4a54d32 Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Mon, 19 Aug 2024 16:35:34 +0200
Subject: [PATCH 20/44] l0_flush: remove support for mode `page-cached` (#8739)

Mode `direct` has been rolled out everywhere, and no configs reference
`page-cached` anymore.

All code that's made dead by the removal of the config option is removed
as part of this PR.

The `page_caching::PreWarmingWriter` in `::No` mode is equivalent to a
`size_tracking_writer`, so, use that.

part of https://github.com/neondatabase/neon/issues/7418
---
 pageserver/src/l0_flush.rs                    |  19 +-
 pageserver/src/tenant/ephemeral_file.rs       |   5 +-
 .../src/tenant/ephemeral_file/page_caching.rs | 169 ++----------------
 .../tenant/storage_layer/inmemory_layer.rs    |  68 +------
 4 files changed, 20 insertions(+), 241 deletions(-)

diff --git a/pageserver/src/l0_flush.rs b/pageserver/src/l0_flush.rs
index 10187f2ba3..313a7961a6 100644
--- a/pageserver/src/l0_flush.rs
+++ b/pageserver/src/l0_flush.rs
@@ -1,15 +1,10 @@
 use std::{num::NonZeroUsize, sync::Arc};
 
-use crate::tenant::ephemeral_file;
-
 #[derive(Debug, PartialEq, Eq, Clone, serde::Deserialize)]
 #[serde(tag = "mode", rename_all = "kebab-case", deny_unknown_fields)]
 pub enum L0FlushConfig {
-    PageCached,
     #[serde(rename_all = "snake_case")]
-    Direct {
-        max_concurrency: NonZeroUsize,
-    },
+    Direct { max_concurrency: NonZeroUsize },
 }
 
 impl Default for L0FlushConfig {
@@ -25,14 +20,12 @@ impl Default for L0FlushConfig {
 pub struct L0FlushGlobalState(Arc<Inner>);
 
 pub enum Inner {
-    PageCached,
     Direct { semaphore: tokio::sync::Semaphore },
 }
 
 impl L0FlushGlobalState {
     pub fn new(config: L0FlushConfig) -> Self {
         match config {
-            L0FlushConfig::PageCached => Self(Arc::new(Inner::PageCached)),
             L0FlushConfig::Direct { max_concurrency } => {
                 let semaphore = tokio::sync::Semaphore::new(max_concurrency.get());
                 Self(Arc::new(Inner::Direct { semaphore }))
@@ -44,13 +37,3 @@ impl L0FlushGlobalState {
         &self.0
     }
 }
-
-impl L0FlushConfig {
-    pub(crate) fn prewarm_on_write(&self) -> ephemeral_file::PrewarmPageCacheOnWrite {
-        use L0FlushConfig::*;
-        match self {
-            PageCached => ephemeral_file::PrewarmPageCacheOnWrite::Yes,
-            Direct { .. } => ephemeral_file::PrewarmPageCacheOnWrite::No,
-        }
-    }
-}
diff --git a/pageserver/src/tenant/ephemeral_file.rs b/pageserver/src/tenant/ephemeral_file.rs
index 770f3ca5f0..3eb8384d05 100644
--- a/pageserver/src/tenant/ephemeral_file.rs
+++ b/pageserver/src/tenant/ephemeral_file.rs
@@ -21,7 +21,6 @@ pub struct EphemeralFile {
 }
 
 mod page_caching;
-pub(crate) use page_caching::PrewarmOnWrite as PrewarmPageCacheOnWrite;
 mod zero_padded_read_write;
 
 impl EphemeralFile {
@@ -52,12 +51,10 @@ impl EphemeralFile {
         )
         .await?;
 
-        let prewarm = conf.l0_flush.prewarm_on_write();
-
         Ok(EphemeralFile {
             _tenant_shard_id: tenant_shard_id,
             _timeline_id: timeline_id,
-            rw: page_caching::RW::new(file, prewarm, gate_guard),
+            rw: page_caching::RW::new(file, gate_guard),
         })
     }
 
diff --git a/pageserver/src/tenant/ephemeral_file/page_caching.rs b/pageserver/src/tenant/ephemeral_file/page_caching.rs
index 7355b3b5a3..48926354f1 100644
--- a/pageserver/src/tenant/ephemeral_file/page_caching.rs
+++ b/pageserver/src/tenant/ephemeral_file/page_caching.rs
@@ -1,15 +1,15 @@
 //! Wrapper around [`super::zero_padded_read_write::RW`] that uses the
 //! [`crate::page_cache`] to serve reads that need to go to the underlying [`VirtualFile`].
+//!
+//! Subject to removal in <https://github.com/neondatabase/neon/pull/8537>
 
 use crate::context::RequestContext;
 use crate::page_cache::{self, PAGE_SZ};
 use crate::tenant::block_io::BlockLease;
-use crate::virtual_file::owned_buffers_io::io_buf_ext::FullSlice;
+use crate::virtual_file::owned_buffers_io::util::size_tracking_writer;
 use crate::virtual_file::VirtualFile;
 
-use once_cell::sync::Lazy;
-use std::io::{self, ErrorKind};
-use std::ops::{Deref, Range};
+use std::io::{self};
 use tokio_epoll_uring::BoundedBuf;
 use tracing::*;
 
@@ -18,33 +18,17 @@ use super::zero_padded_read_write;
 /// See module-level comment.
 pub struct RW {
     page_cache_file_id: page_cache::FileId,
-    rw: super::zero_padded_read_write::RW<PreWarmingWriter>,
+    rw: super::zero_padded_read_write::RW<size_tracking_writer::Writer<VirtualFile>>,
     /// Gate guard is held on as long as we need to do operations in the path (delete on drop).
     _gate_guard: utils::sync::gate::GateGuard,
 }
 
-/// When we flush a block to the underlying [`crate::virtual_file::VirtualFile`],
-/// should we pre-warm the [`crate::page_cache`] with the contents?
-#[derive(Clone, Copy)]
-pub enum PrewarmOnWrite {
-    Yes,
-    No,
-}
-
 impl RW {
-    pub fn new(
-        file: VirtualFile,
-        prewarm_on_write: PrewarmOnWrite,
-        _gate_guard: utils::sync::gate::GateGuard,
-    ) -> Self {
+    pub fn new(file: VirtualFile, _gate_guard: utils::sync::gate::GateGuard) -> Self {
         let page_cache_file_id = page_cache::next_file_id();
         Self {
             page_cache_file_id,
-            rw: super::zero_padded_read_write::RW::new(PreWarmingWriter::new(
-                page_cache_file_id,
-                file,
-                prewarm_on_write,
-            )),
+            rw: super::zero_padded_read_write::RW::new(size_tracking_writer::Writer::new(file)),
             _gate_guard,
         }
     }
@@ -84,10 +68,10 @@ impl RW {
         let vec = Vec::with_capacity(size);
 
         // read from disk what we've already flushed
-        let writer = self.rw.as_writer();
-        let flushed_range = writer.written_range();
-        let mut vec = writer
-            .file
+        let file_size_tracking_writer = self.rw.as_writer();
+        let flushed_range = 0..usize::try_from(file_size_tracking_writer.bytes_written()).unwrap();
+        let mut vec = file_size_tracking_writer
+            .as_inner()
             .read_exact_at(
                 vec.slice(0..(flushed_range.end - flushed_range.start)),
                 u64::try_from(flushed_range.start).unwrap(),
@@ -122,7 +106,7 @@ impl RW {
                             format!(
                                 "ephemeral file: read immutable page #{}: {}: {:#}",
                                 blknum,
-                                self.rw.as_writer().file.path,
+                                self.rw.as_writer().as_inner().path,
                                 e,
                             ),
                         )
@@ -132,7 +116,7 @@ impl RW {
                     }
                     page_cache::ReadBufResult::NotFound(write_guard) => {
                         let write_guard = writer
-                            .file
+                            .as_inner()
                             .read_exact_at_page(write_guard, blknum as u64 * PAGE_SZ as u64, ctx)
                             .await?;
                         let read_guard = write_guard.mark_valid();
@@ -154,137 +138,16 @@ impl Drop for RW {
 
         // unlink the file
         // we are clear to do this, because we have entered a gate
-        let res = std::fs::remove_file(&self.rw.as_writer().file.path);
+        let path = &self.rw.as_writer().as_inner().path;
+        let res = std::fs::remove_file(path);
         if let Err(e) = res {
             if e.kind() != std::io::ErrorKind::NotFound {
                 // just never log the not found errors, we cannot do anything for them; on detach
                 // the tenant directory is already gone.
                 //
                 // not found files might also be related to https://github.com/neondatabase/neon/issues/2442
-                error!(
-                    "could not remove ephemeral file '{}': {}",
-                    self.rw.as_writer().file.path,
-                    e
-                );
+                error!("could not remove ephemeral file '{path}': {e}");
             }
         }
     }
 }
-
-struct PreWarmingWriter {
-    prewarm_on_write: PrewarmOnWrite,
-    nwritten_blocks: u32,
-    page_cache_file_id: page_cache::FileId,
-    file: VirtualFile,
-}
-
-impl PreWarmingWriter {
-    fn new(
-        page_cache_file_id: page_cache::FileId,
-        file: VirtualFile,
-        prewarm_on_write: PrewarmOnWrite,
-    ) -> Self {
-        Self {
-            prewarm_on_write,
-            nwritten_blocks: 0,
-            page_cache_file_id,
-            file,
-        }
-    }
-
-    /// Return the byte range within `file` that has been written though `write_all`.
-    ///
-    /// The returned range would be invalidated by another `write_all`. To prevent that, we capture `&_`.
-    fn written_range(&self) -> (impl Deref<Target = Range<usize>> + '_) {
-        let nwritten_blocks = usize::try_from(self.nwritten_blocks).unwrap();
-        struct Wrapper(Range<usize>);
-        impl Deref for Wrapper {
-            type Target = Range<usize>;
-            fn deref(&self) -> &Range<usize> {
-                &self.0
-            }
-        }
-        Wrapper(0..nwritten_blocks * PAGE_SZ)
-    }
-}
-
-impl crate::virtual_file::owned_buffers_io::write::OwnedAsyncWriter for PreWarmingWriter {
-    async fn write_all<Buf: tokio_epoll_uring::IoBuf + Send>(
-        &mut self,
-        buf: FullSlice<Buf>,
-        ctx: &RequestContext,
-    ) -> std::io::Result<(usize, FullSlice<Buf>)> {
-        let buflen = buf.len();
-        assert_eq!(
-            buflen % PAGE_SZ,
-            0,
-            "{buflen} ; we know TAIL_SZ is a PAGE_SZ multiple, and write_buffered_borrowed is used"
-        );
-
-        // Do the IO.
-        let buf = match self.file.write_all(buf, ctx).await {
-            (buf, Ok(nwritten)) => {
-                assert_eq!(nwritten, buflen);
-                buf
-            }
-            (_, Err(e)) => {
-                return Err(std::io::Error::new(
-                    ErrorKind::Other,
-                    // order error before path because path is long and error is short
-                    format!(
-                        "ephemeral_file: write_blob: write-back tail self.nwritten_blocks={}, buflen={}, {:#}: {}",
-                        self.nwritten_blocks, buflen, e, self.file.path,
-                    ),
-                ));
-            }
-        };
-
-        let nblocks = buflen / PAGE_SZ;
-        let nblocks32 = u32::try_from(nblocks).unwrap();
-
-        if matches!(self.prewarm_on_write, PrewarmOnWrite::Yes) {
-            // Pre-warm page cache with the contents.
-            // At least in isolated bulk ingest benchmarks (test_bulk_insert.py), the pre-warming
-            // benefits the code that writes InMemoryLayer=>L0 layers.
-
-            let cache = page_cache::get();
-            static CTX: Lazy<RequestContext> = Lazy::new(|| {
-                RequestContext::new(
-                    crate::task_mgr::TaskKind::EphemeralFilePreWarmPageCache,
-                    crate::context::DownloadBehavior::Error,
-                )
-            });
-            for blknum_in_buffer in 0..nblocks {
-                let blk_in_buffer =
-                    &buf[blknum_in_buffer * PAGE_SZ..(blknum_in_buffer + 1) * PAGE_SZ];
-                let blknum = self
-                    .nwritten_blocks
-                    .checked_add(blknum_in_buffer as u32)
-                    .unwrap();
-                match cache
-                    .read_immutable_buf(self.page_cache_file_id, blknum, &CTX)
-                    .await
-                {
-                    Err(e) => {
-                        error!("ephemeral_file write_blob failed to get immutable buf to pre-warm page cache: {e:?}");
-                        // fail gracefully, it's not the end of the world if we can't pre-warm the cache here
-                    }
-                    Ok(v) => match v {
-                        page_cache::ReadBufResult::Found(_guard) => {
-                            // This function takes &mut self, so, it shouldn't be possible to reach this point.
-                            unreachable!("we just wrote block {blknum} to the VirtualFile, which is owned by Self, \
-                                      and this function takes &mut self, so, no concurrent read_blk is possible");
-                        }
-                        page_cache::ReadBufResult::NotFound(mut write_guard) => {
-                            write_guard.copy_from_slice(blk_in_buffer);
-                            let _ = write_guard.mark_valid();
-                        }
-                    },
-                }
-            }
-        }
-
-        self.nwritten_blocks = self.nwritten_blocks.checked_add(nblocks32).unwrap();
-        Ok((buflen, buf))
-    }
-}
diff --git a/pageserver/src/tenant/storage_layer/inmemory_layer.rs b/pageserver/src/tenant/storage_layer/inmemory_layer.rs
index 748d79c149..130d1002a0 100644
--- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs
+++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs
@@ -13,7 +13,7 @@ use crate::tenant::ephemeral_file::EphemeralFile;
 use crate::tenant::timeline::GetVectoredError;
 use crate::tenant::PageReconstructError;
 use crate::virtual_file::owned_buffers_io::io_buf_ext::IoBufExt;
-use crate::{l0_flush, page_cache, walrecord};
+use crate::{l0_flush, page_cache};
 use anyhow::{anyhow, Result};
 use camino::Utf8PathBuf;
 use pageserver_api::key::CompactKey;
@@ -249,9 +249,7 @@ impl InMemoryLayer {
     /// debugging function to print out the contents of the layer
     ///
     /// this is likely completly unused
-    pub async fn dump(&self, verbose: bool, ctx: &RequestContext) -> Result<()> {
-        let inner = self.inner.read().await;
-
+    pub async fn dump(&self, _verbose: bool, _ctx: &RequestContext) -> Result<()> {
         let end_str = self.end_lsn_or_max();
 
         println!(
@@ -259,39 +257,6 @@ impl InMemoryLayer {
             self.timeline_id, self.start_lsn, end_str,
         );
 
-        if !verbose {
-            return Ok(());
-        }
-
-        let cursor = inner.file.block_cursor();
-        let mut buf = Vec::new();
-        for (key, vec_map) in inner.index.iter() {
-            for (lsn, pos) in vec_map.as_slice() {
-                let mut desc = String::new();
-                cursor.read_blob_into_buf(*pos, &mut buf, ctx).await?;
-                let val = Value::des(&buf);
-                match val {
-                    Ok(Value::Image(img)) => {
-                        write!(&mut desc, " img {} bytes", img.len())?;
-                    }
-                    Ok(Value::WalRecord(rec)) => {
-                        let wal_desc = walrecord::describe_wal_record(&rec).unwrap();
-                        write!(
-                            &mut desc,
-                            " rec {} bytes will_init: {} {}",
-                            buf.len(),
-                            rec.will_init(),
-                            wal_desc
-                        )?;
-                    }
-                    Err(err) => {
-                        write!(&mut desc, " DESERIALIZATION ERROR: {}", err)?;
-                    }
-                }
-                println!("  key {} at {}: {}", key, lsn, desc);
-            }
-        }
-
         Ok(())
     }
 
@@ -536,7 +501,6 @@ impl InMemoryLayer {
 
         use l0_flush::Inner;
         let _concurrency_permit = match l0_flush_global_state {
-            Inner::PageCached => None,
             Inner::Direct { semaphore, .. } => Some(semaphore.acquire().await),
         };
 
@@ -568,34 +532,6 @@ impl InMemoryLayer {
         .await?;
 
         match l0_flush_global_state {
-            l0_flush::Inner::PageCached => {
-                let ctx = RequestContextBuilder::extend(ctx)
-                    .page_content_kind(PageContentKind::InMemoryLayer)
-                    .build();
-
-                let mut buf = Vec::new();
-
-                let cursor = inner.file.block_cursor();
-
-                for (key, vec_map) in inner.index.iter() {
-                    // Write all page versions
-                    for (lsn, pos) in vec_map.as_slice() {
-                        cursor.read_blob_into_buf(*pos, &mut buf, &ctx).await?;
-                        let will_init = Value::des(&buf)?.will_init();
-                        let (tmp, res) = delta_layer_writer
-                            .put_value_bytes(
-                                Key::from_compact(*key),
-                                *lsn,
-                                buf.slice_len(),
-                                will_init,
-                                &ctx,
-                            )
-                            .await;
-                        res?;
-                        buf = tmp.into_raw_slice().into_inner();
-                    }
-                }
-            }
             l0_flush::Inner::Direct { .. } => {
                 let file_contents: Vec<u8> = inner.file.load_to_vec(ctx).await?;
                 assert_eq!(

From 3b8ca477ab6852143f8acb5b8217e5f24e9e8605 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Arpad=20M=C3=BCller?= <arpad-m@users.noreply.github.com>
Date: Mon, 19 Aug 2024 16:39:44 +0200
Subject: [PATCH 21/44] Migrate physical GC and scan_metadata to remote_storage
 (#8673)

Migrates most of the remaining parts of the scrubber to remote_storage:

* `pageserver_physical_gc`
* `scan_metadata` for pageservers (safekeepers were done in #8595)
* `download()` in `tenant_snapshot`. The main `tenant_snapshot` is not
migrated as it uses version history to be able to work in the face of
ongoing changes.

Part of #7547
---
 libs/remote_storage/src/azure_blob.rs         |  42 +++++
 libs/remote_storage/src/lib.rs                |  24 ++-
 libs/remote_storage/src/local_fs.rs           |  14 ++
 libs/remote_storage/src/metrics.rs            |   9 +-
 libs/remote_storage/src/s3_bucket.rs          |  74 +++++++-
 libs/remote_storage/src/simulate_failures.rs  |  11 ++
 storage_scrubber/src/checks.rs                |  85 ++++-----
 storage_scrubber/src/lib.rs                   |  69 ++++---
 storage_scrubber/src/metadata_stream.rs       | 170 ++++--------------
 .../src/pageserver_physical_gc.rs             | 134 +++++---------
 .../src/scan_pageserver_metadata.rs           |  40 ++---
 .../src/scan_safekeeper_metadata.rs           |   7 +-
 storage_scrubber/src/tenant_snapshot.rs       |  27 +--
 test_runner/fixtures/neon_fixtures.py         |   1 +
 14 files changed, 366 insertions(+), 341 deletions(-)

diff --git a/libs/remote_storage/src/azure_blob.rs b/libs/remote_storage/src/azure_blob.rs
index 3c77d5a227..cb7479f6cd 100644
--- a/libs/remote_storage/src/azure_blob.rs
+++ b/libs/remote_storage/src/azure_blob.rs
@@ -383,6 +383,48 @@ impl RemoteStorage for AzureBlobStorage {
         }
     }
 
+    async fn head_object(
+        &self,
+        key: &RemotePath,
+        cancel: &CancellationToken,
+    ) -> Result<ListingObject, DownloadError> {
+        let kind = RequestKind::Head;
+        let _permit = self.permit(kind, cancel).await?;
+
+        let started_at = start_measuring_requests(kind);
+
+        let blob_client = self.client.blob_client(self.relative_path_to_name(key));
+        let properties_future = blob_client.get_properties().into_future();
+
+        let properties_future = tokio::time::timeout(self.timeout, properties_future);
+
+        let res = tokio::select! {
+            res = properties_future => res,
+            _ = cancel.cancelled() => return Err(TimeoutOrCancel::Cancel.into()),
+        };
+
+        if let Ok(inner) = &res {
+            // do not incl. timeouts as errors in metrics but cancellations
+            let started_at = ScopeGuard::into_inner(started_at);
+            crate::metrics::BUCKET_METRICS
+                .req_seconds
+                .observe_elapsed(kind, inner, started_at);
+        }
+
+        let data = match res {
+            Ok(Ok(data)) => Ok(data),
+            Ok(Err(sdk)) => Err(to_download_error(sdk)),
+            Err(_timeout) => Err(DownloadError::Timeout),
+        }?;
+
+        let properties = data.blob.properties;
+        Ok(ListingObject {
+            key: key.to_owned(),
+            last_modified: SystemTime::from(properties.last_modified),
+            size: properties.content_length,
+        })
+    }
+
     async fn upload(
         &self,
         from: impl Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static,
diff --git a/libs/remote_storage/src/lib.rs b/libs/remote_storage/src/lib.rs
index 2c9e298f79..cc1d3e0ae4 100644
--- a/libs/remote_storage/src/lib.rs
+++ b/libs/remote_storage/src/lib.rs
@@ -150,7 +150,7 @@ pub enum ListingMode {
     NoDelimiter,
 }
 
-#[derive(PartialEq, Eq, Debug)]
+#[derive(PartialEq, Eq, Debug, Clone)]
 pub struct ListingObject {
     pub key: RemotePath,
     pub last_modified: SystemTime,
@@ -215,6 +215,13 @@ pub trait RemoteStorage: Send + Sync + 'static {
         Ok(combined)
     }
 
+    /// Obtain metadata information about an object.
+    async fn head_object(
+        &self,
+        key: &RemotePath,
+        cancel: &CancellationToken,
+    ) -> Result<ListingObject, DownloadError>;
+
     /// Streams the local file contents into remote into the remote storage entry.
     ///
     /// If the operation fails because of timeout or cancellation, the root cause of the error will be
@@ -363,6 +370,20 @@ impl<Other: RemoteStorage> GenericRemoteStorage<Arc<Other>> {
         }
     }
 
+    /// See [`RemoteStorage::head_object`].
+    pub async fn head_object(
+        &self,
+        key: &RemotePath,
+        cancel: &CancellationToken,
+    ) -> Result<ListingObject, DownloadError> {
+        match self {
+            Self::LocalFs(s) => s.head_object(key, cancel).await,
+            Self::AwsS3(s) => s.head_object(key, cancel).await,
+            Self::AzureBlob(s) => s.head_object(key, cancel).await,
+            Self::Unreliable(s) => s.head_object(key, cancel).await,
+        }
+    }
+
     /// See [`RemoteStorage::upload`]
     pub async fn upload(
         &self,
@@ -598,6 +619,7 @@ impl ConcurrencyLimiter {
             RequestKind::Delete => &self.write,
             RequestKind::Copy => &self.write,
             RequestKind::TimeTravel => &self.write,
+            RequestKind::Head => &self.read,
         }
     }
 
diff --git a/libs/remote_storage/src/local_fs.rs b/libs/remote_storage/src/local_fs.rs
index 99b4aa4061..c3ef18cab1 100644
--- a/libs/remote_storage/src/local_fs.rs
+++ b/libs/remote_storage/src/local_fs.rs
@@ -445,6 +445,20 @@ impl RemoteStorage for LocalFs {
         }
     }
 
+    async fn head_object(
+        &self,
+        key: &RemotePath,
+        _cancel: &CancellationToken,
+    ) -> Result<ListingObject, DownloadError> {
+        let target_file_path = key.with_base(&self.storage_root);
+        let metadata = file_metadata(&target_file_path).await?;
+        Ok(ListingObject {
+            key: key.clone(),
+            last_modified: metadata.modified()?,
+            size: metadata.len(),
+        })
+    }
+
     async fn upload(
         &self,
         data: impl Stream<Item = std::io::Result<Bytes>> + Send + Sync,
diff --git a/libs/remote_storage/src/metrics.rs b/libs/remote_storage/src/metrics.rs
index bbb51590f3..f1aa4c433b 100644
--- a/libs/remote_storage/src/metrics.rs
+++ b/libs/remote_storage/src/metrics.rs
@@ -13,6 +13,7 @@ pub(crate) enum RequestKind {
     List = 3,
     Copy = 4,
     TimeTravel = 5,
+    Head = 6,
 }
 
 use scopeguard::ScopeGuard;
@@ -27,6 +28,7 @@ impl RequestKind {
             List => "list_objects",
             Copy => "copy_object",
             TimeTravel => "time_travel_recover",
+            Head => "head_object",
         }
     }
     const fn as_index(&self) -> usize {
@@ -34,7 +36,8 @@ impl RequestKind {
     }
 }
 
-pub(crate) struct RequestTyped<C>([C; 6]);
+const REQUEST_KIND_COUNT: usize = 7;
+pub(crate) struct RequestTyped<C>([C; REQUEST_KIND_COUNT]);
 
 impl<C> RequestTyped<C> {
     pub(crate) fn get(&self, kind: RequestKind) -> &C {
@@ -43,8 +46,8 @@ impl<C> RequestTyped<C> {
 
     fn build_with(mut f: impl FnMut(RequestKind) -> C) -> Self {
         use RequestKind::*;
-        let mut it = [Get, Put, Delete, List, Copy, TimeTravel].into_iter();
-        let arr = std::array::from_fn::<C, 6, _>(|index| {
+        let mut it = [Get, Put, Delete, List, Copy, TimeTravel, Head].into_iter();
+        let arr = std::array::from_fn::<C, REQUEST_KIND_COUNT, _>(|index| {
             let next = it.next().unwrap();
             assert_eq!(index, next.as_index());
             f(next)
diff --git a/libs/remote_storage/src/s3_bucket.rs b/libs/remote_storage/src/s3_bucket.rs
index 1f25da813d..11f6598cbf 100644
--- a/libs/remote_storage/src/s3_bucket.rs
+++ b/libs/remote_storage/src/s3_bucket.rs
@@ -23,7 +23,7 @@ use aws_config::{
 use aws_sdk_s3::{
     config::{AsyncSleep, IdentityCache, Region, SharedAsyncSleep},
     error::SdkError,
-    operation::get_object::GetObjectError,
+    operation::{get_object::GetObjectError, head_object::HeadObjectError},
     types::{Delete, DeleteMarkerEntry, ObjectIdentifier, ObjectVersion, StorageClass},
     Client,
 };
@@ -604,6 +604,78 @@ impl RemoteStorage for S3Bucket {
         }
     }
 
+    /// Fetch the size and last-modified time of `key` with an S3 HEAD request.
+    ///
+    /// Returns [`DownloadError::NotFound`] if the object does not exist and
+    /// [`DownloadError::Timeout`] if no response arrives within `self.timeout`.
+    async fn head_object(
+        &self,
+        key: &RemotePath,
+        cancel: &CancellationToken,
+    ) -> Result<ListingObject, DownloadError> {
+        let kind = RequestKind::Head;
+        let _permit = self.permit(kind, cancel).await?;
+
+        let started_at = start_measuring_requests(kind);
+
+        let head_future = self
+            .client
+            .head_object()
+            .bucket(self.bucket_name())
+            .key(self.relative_path_to_s3_object(key))
+            .send();
+
+        let head_future = tokio::time::timeout(self.timeout, head_future);
+
+        let res = tokio::select! {
+            res = head_future => res,
+            _ = cancel.cancelled() => return Err(TimeoutOrCancel::Cancel.into()),
+        };
+
+        let res = res.map_err(|_e| DownloadError::Timeout)?;
+
+        // Timeouts and cancellations returned above and are not recorded as errors here.
+        let started_at = ScopeGuard::into_inner(started_at);
+        let not_found = matches!(
+            &res,
+            Err(SdkError::ServiceError(e)) if matches!(e.err(), HeadObjectError::NotFound(_))
+        );
+        // Count 404 in the AttemptOutcome::Ok bucket, because it is not an error: we expect
+        // to sometimes probe for an object and find it missing, e.g. timeline indices.
+        let outcome = if res.is_ok() || not_found {
+            AttemptOutcome::Ok
+        } else {
+            AttemptOutcome::Err
+        };
+        // Record the request exactly once, whatever its outcome.
+        crate::metrics::BUCKET_METRICS
+            .req_seconds
+            .observe_elapsed(kind, outcome, started_at);
+
+        let data = match res {
+            Ok(object_output) => object_output,
+            Err(_) if not_found => return Err(DownloadError::NotFound),
+            Err(e) => {
+                return Err(DownloadError::Other(
+                    anyhow::Error::new(e).context("s3 head object"),
+                ));
+            }
+        };
+
+        let (Some(last_modified), Some(size)) = (data.last_modified, data.content_length) else {
+            return Err(DownloadError::Other(anyhow!(
+                "head_object doesn't contain last_modified or content_length"
+            )));
+        };
+        Ok(ListingObject {
+            key: key.to_owned(),
+            last_modified: SystemTime::try_from(last_modified).map_err(|e| {
+                DownloadError::Other(anyhow!("can't convert time '{last_modified}': {e}"))
+            })?,
+            size: size as u64,
+        })
+    }
+
     async fn upload(
         &self,
         from: impl Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static,
diff --git a/libs/remote_storage/src/simulate_failures.rs b/libs/remote_storage/src/simulate_failures.rs
index 13f873dcdb..c7eb634af3 100644
--- a/libs/remote_storage/src/simulate_failures.rs
+++ b/libs/remote_storage/src/simulate_failures.rs
@@ -30,6 +30,7 @@ pub struct UnreliableWrapper {
 #[derive(Debug, Hash, Eq, PartialEq)]
 enum RemoteOp {
     ListPrefixes(Option<RemotePath>),
+    HeadObject(RemotePath),
     Upload(RemotePath),
     Download(RemotePath),
     Delete(RemotePath),
@@ -137,6 +138,16 @@ impl RemoteStorage for UnreliableWrapper {
         self.inner.list(prefix, mode, max_keys, cancel).await
     }
 
+    async fn head_object(
+        &self,
+        key: &RemotePath,
+        cancel: &CancellationToken,
+    ) -> Result<crate::ListingObject, DownloadError> {
+        let op = RemoteOp::HeadObject(key.clone());
+        self.attempt(op).map_err(DownloadError::Other)?;
+        self.inner.head_object(key, cancel).await
+    }
+
     async fn upload(
         &self,
         data: impl Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static,
diff --git a/storage_scrubber/src/checks.rs b/storage_scrubber/src/checks.rs
index 35ec69fd50..9063b3c197 100644
--- a/storage_scrubber/src/checks.rs
+++ b/storage_scrubber/src/checks.rs
@@ -1,22 +1,22 @@
 use std::collections::{HashMap, HashSet};
 
 use anyhow::Context;
-use aws_sdk_s3::Client;
 use pageserver::tenant::layer_map::LayerMap;
 use pageserver::tenant::remote_timeline_client::index::LayerFileMetadata;
 use pageserver_api::shard::ShardIndex;
+use tokio_util::sync::CancellationToken;
 use tracing::{error, info, warn};
 use utils::generation::Generation;
 use utils::id::TimelineId;
 
 use crate::cloud_admin_api::BranchData;
-use crate::metadata_stream::stream_listing;
-use crate::{download_object_with_retries, RootTarget, TenantShardTimelineId};
+use crate::metadata_stream::stream_listing_generic;
+use crate::{download_object_with_retries_generic, RootTarget, TenantShardTimelineId};
 use futures_util::StreamExt;
 use pageserver::tenant::remote_timeline_client::{parse_remote_index_path, remote_layer_path};
 use pageserver::tenant::storage_layer::LayerName;
 use pageserver::tenant::IndexPart;
-use remote_storage::RemotePath;
+use remote_storage::{GenericRemoteStorage, ListingObject, RemotePath};
 
 pub(crate) struct TimelineAnalysis {
     /// Anomalies detected
@@ -48,13 +48,12 @@ impl TimelineAnalysis {
 }
 
 pub(crate) async fn branch_cleanup_and_check_errors(
-    s3_client: &Client,
-    target: &RootTarget,
+    remote_client: &GenericRemoteStorage,
     id: &TenantShardTimelineId,
     tenant_objects: &mut TenantObjectListing,
     s3_active_branch: Option<&BranchData>,
     console_branch: Option<BranchData>,
-    s3_data: Option<S3TimelineBlobData>,
+    s3_data: Option<RemoteTimelineBlobData>,
 ) -> TimelineAnalysis {
     let mut result = TimelineAnalysis::new();
 
@@ -78,7 +77,9 @@ pub(crate) async fn branch_cleanup_and_check_errors(
 
     match s3_data {
         Some(s3_data) => {
-            result.garbage_keys.extend(s3_data.unknown_keys);
+            result
+                .garbage_keys
+                .extend(s3_data.unknown_keys.into_iter().map(|k| k.key.to_string()));
 
             match s3_data.blob_data {
                 BlobDataParseResult::Parsed {
@@ -143,11 +144,8 @@ pub(crate) async fn branch_cleanup_and_check_errors(
 
                             // HEAD request used here to address a race condition  when an index was uploaded concurrently
                             // with our scan. We check if the object is uploaded to S3 after taking the listing snapshot.
-                            let response = s3_client
-                                .head_object()
-                                .bucket(target.bucket_name())
-                                .key(path.get_path().as_str())
-                                .send()
+                            let response = remote_client
+                                .head_object(&path, &CancellationToken::new())
                                 .await;
 
                             if response.is_err() {
@@ -284,14 +282,14 @@ impl TenantObjectListing {
 }
 
 #[derive(Debug)]
-pub(crate) struct S3TimelineBlobData {
+pub(crate) struct RemoteTimelineBlobData {
     pub(crate) blob_data: BlobDataParseResult,
 
     // Index objects that were not used when loading `blob_data`, e.g. those from old generations
-    pub(crate) unused_index_keys: Vec<String>,
+    pub(crate) unused_index_keys: Vec<ListingObject>,
 
     // Objects whose keys were not recognized at all, i.e. not layer files, not indices
-    pub(crate) unknown_keys: Vec<String>,
+    pub(crate) unknown_keys: Vec<ListingObject>,
 }
 
 #[derive(Debug)]
@@ -322,11 +320,11 @@ pub(crate) fn parse_layer_object_name(name: &str) -> Result<(LayerName, Generati
     }
 }
 
-pub(crate) async fn list_timeline_blobs(
-    s3_client: &Client,
+pub(crate) async fn list_timeline_blobs_generic(
+    remote_client: &GenericRemoteStorage,
     id: TenantShardTimelineId,
     s3_root: &RootTarget,
-) -> anyhow::Result<S3TimelineBlobData> {
+) -> anyhow::Result<RemoteTimelineBlobData> {
     let mut s3_layers = HashSet::new();
 
     let mut errors = Vec::new();
@@ -335,19 +333,25 @@ pub(crate) async fn list_timeline_blobs(
     let mut timeline_dir_target = s3_root.timeline_root(&id);
     timeline_dir_target.delimiter = String::new();
 
-    let mut index_part_keys: Vec<String> = Vec::new();
+    let mut index_part_keys: Vec<ListingObject> = Vec::new();
     let mut initdb_archive: bool = false;
 
-    let mut stream = std::pin::pin!(stream_listing(s3_client, &timeline_dir_target));
-    while let Some(obj) = stream.next().await {
-        let obj = obj?;
-        let key = obj.key();
+    let prefix_str = &timeline_dir_target
+        .prefix_in_bucket
+        .strip_prefix("/")
+        .unwrap_or(&timeline_dir_target.prefix_in_bucket);
 
-        let blob_name = key.strip_prefix(&timeline_dir_target.prefix_in_bucket);
+    let mut stream = std::pin::pin!(stream_listing_generic(remote_client, &timeline_dir_target));
+    while let Some(obj) = stream.next().await {
+        let (key, Some(obj)) = obj? else {
+            panic!("ListingObject not specified");
+        };
+
+        let blob_name = key.get_path().as_str().strip_prefix(prefix_str);
         match blob_name {
             Some(name) if name.starts_with("index_part.json") => {
                 tracing::debug!("Index key {key}");
-                index_part_keys.push(key.to_owned())
+                index_part_keys.push(obj)
             }
             Some("initdb.tar.zst") => {
                 tracing::debug!("initdb archive {key}");
@@ -358,7 +362,7 @@ pub(crate) async fn list_timeline_blobs(
             }
             Some(maybe_layer_name) => match parse_layer_object_name(maybe_layer_name) {
                 Ok((new_layer, gen)) => {
-                    tracing::debug!("Parsed layer key: {} {:?}", new_layer, gen);
+                    tracing::debug!("Parsed layer key: {new_layer} {gen:?}");
                     s3_layers.insert((new_layer, gen));
                 }
                 Err(e) => {
@@ -366,13 +370,13 @@ pub(crate) async fn list_timeline_blobs(
                     errors.push(
                         format!("S3 list response got an object with key {key} that is not a layer name: {e}"),
                     );
-                    unknown_keys.push(key.to_string());
+                    unknown_keys.push(obj);
                 }
             },
             None => {
-                tracing::warn!("Unknown key {}", key);
+                tracing::warn!("Unknown key {key}");
                 errors.push(format!("S3 list response got an object with odd key {key}"));
-                unknown_keys.push(key.to_string());
+                unknown_keys.push(obj);
             }
         }
     }
@@ -381,7 +385,7 @@ pub(crate) async fn list_timeline_blobs(
         tracing::debug!(
             "Timeline is empty apart from initdb archive: expected post-deletion state."
         );
-        return Ok(S3TimelineBlobData {
+        return Ok(RemoteTimelineBlobData {
             blob_data: BlobDataParseResult::Relic,
             unused_index_keys: index_part_keys,
             unknown_keys: Vec::new(),
@@ -395,13 +399,13 @@ pub(crate) async fn list_timeline_blobs(
             // Stripping the index key to the last part, because RemotePath doesn't
             // like absolute paths, and depending on prefix_in_bucket it's possible
             // for the keys we read back to start with a slash.
-            let basename = key.rsplit_once('/').unwrap().1;
+            let basename = key.key.get_path().as_str().rsplit_once('/').unwrap().1;
             parse_remote_index_path(RemotePath::from_string(basename).unwrap()).map(|g| (key, g))
         })
         .max_by_key(|i| i.1)
         .map(|(k, g)| (k.clone(), g))
     {
-        Some((key, gen)) => (Some(key), gen),
+        Some((key, gen)) => (Some::<ListingObject>(key.to_owned()), gen),
         None => {
             // Legacy/missing case: one or zero index parts, which did not have a generation
             (index_part_keys.pop(), Generation::none())
@@ -416,17 +420,14 @@ pub(crate) async fn list_timeline_blobs(
     }
 
     if let Some(index_part_object_key) = index_part_object.as_ref() {
-        let index_part_bytes = download_object_with_retries(
-            s3_client,
-            &timeline_dir_target.bucket_name,
-            index_part_object_key,
-        )
-        .await
-        .context("index_part.json download")?;
+        let index_part_bytes =
+            download_object_with_retries_generic(remote_client, &index_part_object_key.key)
+                .await
+                .context("index_part.json download")?;
 
         match serde_json::from_slice(&index_part_bytes) {
             Ok(index_part) => {
-                return Ok(S3TimelineBlobData {
+                return Ok(RemoteTimelineBlobData {
                     blob_data: BlobDataParseResult::Parsed {
                         index_part: Box::new(index_part),
                         index_part_generation,
@@ -448,7 +449,7 @@ pub(crate) async fn list_timeline_blobs(
         );
     }
 
-    Ok(S3TimelineBlobData {
+    Ok(RemoteTimelineBlobData {
         blob_data: BlobDataParseResult::Incorrect { errors, s3_layers },
         unused_index_keys: index_part_keys,
         unknown_keys,
diff --git a/storage_scrubber/src/lib.rs b/storage_scrubber/src/lib.rs
index 1fc94cc174..3183bc3c64 100644
--- a/storage_scrubber/src/lib.rs
+++ b/storage_scrubber/src/lib.rs
@@ -452,23 +452,26 @@ fn stream_objects_with_retries<'a>(
         let mut list_stream =
             storage_client.list_streaming(Some(&prefix), listing_mode, None, &cancel);
         while let Some(res) = list_stream.next().await {
-            if let Err(err) = res {
-                let yield_err = if err.is_permanent() {
-                    true
-                } else {
-                    let backoff_time = 1 << trial.max(5);
-                    tokio::time::sleep(Duration::from_secs(backoff_time)).await;
-                    trial += 1;
-                    trial == MAX_RETRIES - 1
-                };
-                if yield_err {
-                    yield Err(err)
-                        .with_context(|| format!("Failed to list objects {MAX_RETRIES} times"));
-                    break;
+            match res {
+                Err(err) => {
+                    let yield_err = if err.is_permanent() {
+                        true
+                    } else {
+                        let backoff_time = 1 << trial.min(5); // capped at 32s
+                        tokio::time::sleep(Duration::from_secs(backoff_time)).await;
+                        trial += 1;
+                        trial == MAX_RETRIES - 1
+                    };
+                    if yield_err {
+                        yield Err(err)
+                            .with_context(|| format!("Failed to list objects {MAX_RETRIES} times"));
+                        break;
+                    }
+                }
+                Ok(res) => {
+                    trial = 0;
+                    yield Ok(res);
                 }
-            } else {
-                trial = 0;
-                yield res.map_err(anyhow::Error::from);
             }
         }
     }
@@ -513,41 +516,35 @@ async fn list_objects_with_retries_generic(
     panic!("MAX_RETRIES is not allowed to be 0");
 }
 
-async fn download_object_with_retries(
-    s3_client: &Client,
-    bucket_name: &str,
-    key: &str,
+async fn download_object_with_retries_generic(
+    remote_client: &GenericRemoteStorage,
+    key: &RemotePath,
 ) -> anyhow::Result<Vec<u8>> {
-    for _ in 0..MAX_RETRIES {
-        let mut body_buf = Vec::new();
-        let response_stream = match s3_client
-            .get_object()
-            .bucket(bucket_name)
-            .key(key)
-            .send()
-            .await
-        {
+    let cancel = CancellationToken::new();
+    for trial in 0..MAX_RETRIES {
+        let mut buf = Vec::new();
+        let download = match remote_client.download(key, &cancel).await {
             Ok(response) => response,
             Err(e) => {
                 error!("Failed to download object for key {key}: {e}");
-                tokio::time::sleep(Duration::from_secs(1)).await;
+                let backoff_time = 1 << trial.min(5); // 1s, 2s, 4s, ... capped at 32s
+                tokio::time::sleep(Duration::from_secs(backoff_time)).await;
                 continue;
             }
         };
 
-        match response_stream
-            .body
-            .into_async_read()
-            .read_to_end(&mut body_buf)
+        match tokio_util::io::StreamReader::new(download.download_stream)
+            .read_to_end(&mut buf)
             .await
         {
             Ok(bytes_read) => {
                 tracing::debug!("Downloaded {bytes_read} bytes for object {key}");
-                return Ok(body_buf);
+                return Ok(buf);
             }
             Err(e) => {
                 error!("Failed to stream object body for key {key}: {e}");
-                tokio::time::sleep(Duration::from_secs(1)).await;
+                let backoff_time = 1 << trial.min(5); // same capped backoff as above
+                tokio::time::sleep(Duration::from_secs(backoff_time)).await;
             }
         }
     }
diff --git a/storage_scrubber/src/metadata_stream.rs b/storage_scrubber/src/metadata_stream.rs
index 54812ffc94..eca774413a 100644
--- a/storage_scrubber/src/metadata_stream.rs
+++ b/storage_scrubber/src/metadata_stream.rs
@@ -2,14 +2,14 @@ use std::str::FromStr;
 
 use anyhow::{anyhow, Context};
 use async_stream::{stream, try_stream};
-use aws_sdk_s3::{types::ObjectIdentifier, Client};
+use aws_sdk_s3::Client;
 use futures::StreamExt;
 use remote_storage::{GenericRemoteStorage, ListingMode, ListingObject, RemotePath};
 use tokio_stream::Stream;
 
 use crate::{
-    list_objects_with_retries, stream_objects_with_retries, RootTarget, S3Target,
-    TenantShardTimelineId,
+    list_objects_with_retries, list_objects_with_retries_generic, stream_objects_with_retries,
+    RootTarget, S3Target, TenantShardTimelineId,
 };
 use pageserver_api::shard::TenantShardId;
 use utils::id::{TenantId, TimelineId};
@@ -75,53 +75,38 @@ pub fn stream_tenants<'a>(
 }
 
 pub async fn stream_tenant_shards<'a>(
-    s3_client: &'a Client,
+    remote_client: &'a GenericRemoteStorage,
     target: &'a RootTarget,
     tenant_id: TenantId,
 ) -> anyhow::Result<impl Stream<Item = Result<TenantShardId, anyhow::Error>> + 'a> {
-    let mut tenant_shard_ids: Vec<Result<TenantShardId, anyhow::Error>> = Vec::new();
-    let mut continuation_token = None;
     let shards_target = target.tenant_shards_prefix(&tenant_id);
 
-    loop {
-        tracing::info!("Listing in {}", shards_target.prefix_in_bucket);
-        let fetch_response =
-            list_objects_with_retries(s3_client, &shards_target, continuation_token.clone()).await;
-        let fetch_response = match fetch_response {
-            Err(e) => {
-                tenant_shard_ids.push(Err(e));
-                break;
-            }
-            Ok(r) => r,
-        };
+    let strip_prefix = target.tenants_root().prefix_in_bucket;
+    let prefix_str = &strip_prefix.strip_prefix("/").unwrap_or(&strip_prefix);
 
-        let new_entry_ids = fetch_response
-            .common_prefixes()
-            .iter()
-            .filter_map(|prefix| prefix.prefix())
-            .filter_map(|prefix| -> Option<&str> {
-                prefix
-                    .strip_prefix(&target.tenants_root().prefix_in_bucket)?
-                    .strip_suffix('/')
-            })
-            .map(|entry_id_str| {
-                let first_part = entry_id_str.split('/').next().unwrap();
+    tracing::info!("Listing shards in {}", shards_target.prefix_in_bucket);
+    let listing = list_objects_with_retries_generic(
+        remote_client,
+        ListingMode::WithDelimiter,
+        &shards_target,
+    )
+    .await?;
 
-                first_part
-                    .parse::<TenantShardId>()
-                    .with_context(|| format!("Incorrect entry id str: {first_part}"))
-            });
+    let tenant_shard_ids = listing
+        .prefixes
+        .iter()
+        .map(|prefix| prefix.get_path().as_str())
+        .filter_map(|prefix| -> Option<&str> { prefix.strip_prefix(prefix_str) })
+        .map(|entry_id_str| {
+            let first_part = entry_id_str.split('/').next().unwrap();
 
-        for i in new_entry_ids {
-            tenant_shard_ids.push(i);
-        }
-
-        match fetch_response.next_continuation_token {
-            Some(new_token) => continuation_token = Some(new_token),
-            None => break,
-        }
-    }
+            first_part
+                .parse::<TenantShardId>()
+                .with_context(|| format!("Incorrect entry id str: {first_part}"))
+        })
+        .collect::<Vec<_>>();
 
+    tracing::debug!("Yielding {} shards for {tenant_id}", tenant_shard_ids.len());
     Ok(stream! {
         for i in tenant_shard_ids {
             let id = i?;
@@ -130,65 +115,6 @@ pub async fn stream_tenant_shards<'a>(
     })
 }
 
-/// Given a TenantShardId, output a stream of the timelines within that tenant, discovered
-/// using ListObjectsv2.  The listing is done before the stream is built, so that this
-/// function can be used to generate concurrency on a stream using buffer_unordered.
-pub async fn stream_tenant_timelines<'a>(
-    s3_client: &'a Client,
-    target: &'a RootTarget,
-    tenant: TenantShardId,
-) -> anyhow::Result<impl Stream<Item = Result<TenantShardTimelineId, anyhow::Error>> + 'a> {
-    let mut timeline_ids: Vec<Result<TimelineId, anyhow::Error>> = Vec::new();
-    let mut continuation_token = None;
-    let timelines_target = target.timelines_root(&tenant);
-
-    loop {
-        tracing::debug!("Listing in {}", tenant);
-        let fetch_response =
-            list_objects_with_retries(s3_client, &timelines_target, continuation_token.clone())
-                .await;
-        let fetch_response = match fetch_response {
-            Err(e) => {
-                timeline_ids.push(Err(e));
-                break;
-            }
-            Ok(r) => r,
-        };
-
-        let new_entry_ids = fetch_response
-            .common_prefixes()
-            .iter()
-            .filter_map(|prefix| prefix.prefix())
-            .filter_map(|prefix| -> Option<&str> {
-                prefix
-                    .strip_prefix(&timelines_target.prefix_in_bucket)?
-                    .strip_suffix('/')
-            })
-            .map(|entry_id_str| {
-                entry_id_str
-                    .parse::<TimelineId>()
-                    .with_context(|| format!("Incorrect entry id str: {entry_id_str}"))
-            });
-
-        for i in new_entry_ids {
-            timeline_ids.push(i);
-        }
-
-        match fetch_response.next_continuation_token {
-            Some(new_token) => continuation_token = Some(new_token),
-            None => break,
-        }
-    }
-
-    tracing::debug!("Yielding for {}", tenant);
-    Ok(stream! {
-        for i in timeline_ids {
-            let id = i?;
-            yield Ok(TenantShardTimelineId::new(tenant, id));
-        }
-    })
-}
-
 /// Given a `TenantShardId`, output a stream of the timelines within that tenant, discovered
 /// using a listing. The listing is done before the stream is built, so that this
 /// function can be used to generate concurrency on a stream using buffer_unordered.
@@ -200,6 +126,11 @@ pub async fn stream_tenant_timelines_generic<'a>(
     let mut timeline_ids: Vec<Result<TimelineId, anyhow::Error>> = Vec::new();
     let timelines_target = target.timelines_root(&tenant);
 
+    let prefix_str = &timelines_target
+        .prefix_in_bucket
+        .strip_prefix("/")
+        .unwrap_or(&timelines_target.prefix_in_bucket);
+
     let mut objects_stream = std::pin::pin!(stream_objects_with_retries(
         remote_client,
         ListingMode::WithDelimiter,
@@ -220,11 +151,7 @@ pub async fn stream_tenant_timelines_generic<'a>(
             .prefixes
             .iter()
             .filter_map(|prefix| -> Option<&str> {
-                prefix
-                    .get_path()
-                    .as_str()
-                    .strip_prefix(&timelines_target.prefix_in_bucket)?
-                    .strip_suffix('/')
+                prefix.get_path().as_str().strip_prefix(prefix_str)
             })
             .map(|entry_id_str| {
                 entry_id_str
@@ -237,7 +164,7 @@ pub async fn stream_tenant_timelines_generic<'a>(
         }
     }
 
-    tracing::debug!("Yielding for {}", tenant);
+    tracing::debug!("Yielding {} timelines for {}", timeline_ids.len(), tenant);
     Ok(stream! {
         for i in timeline_ids {
             let id = i?;
@@ -246,37 +173,6 @@ pub async fn stream_tenant_timelines_generic<'a>(
     })
 }
 
-pub(crate) fn stream_listing<'a>(
-    s3_client: &'a Client,
-    target: &'a S3Target,
-) -> impl Stream<Item = anyhow::Result<ObjectIdentifier>> + 'a {
-    try_stream! {
-        let mut continuation_token = None;
-        loop {
-            let fetch_response =
-                list_objects_with_retries(s3_client, target, continuation_token.clone()).await?;
-
-            if target.delimiter.is_empty() {
-                for object_key in fetch_response.contents().iter().filter_map(|object| object.key())
-                {
-                    let object_id = ObjectIdentifier::builder().key(object_key).build()?;
-                    yield object_id;
-                }
-            } else {
-                for prefix in fetch_response.common_prefixes().iter().filter_map(|p| p.prefix()) {
-                    let object_id = ObjectIdentifier::builder().key(prefix).build()?;
-                    yield object_id;
-                }
-            }
-
-            match fetch_response.next_continuation_token {
-                Some(new_token) => continuation_token = Some(new_token),
-                None => break,
-            }
-        }
-    }
-}
-
 pub(crate) fn stream_listing_generic<'a>(
     remote_client: &'a GenericRemoteStorage,
     target: &'a S3Target,
diff --git a/storage_scrubber/src/pageserver_physical_gc.rs b/storage_scrubber/src/pageserver_physical_gc.rs
index 20d9bd6dd4..6828081128 100644
--- a/storage_scrubber/src/pageserver_physical_gc.rs
+++ b/storage_scrubber/src/pageserver_physical_gc.rs
@@ -1,11 +1,10 @@
 use std::collections::{BTreeMap, BTreeSet, HashMap};
 use std::sync::Arc;
-use std::time::{Duration, SystemTime};
+use std::time::Duration;
 
-use crate::checks::{list_timeline_blobs, BlobDataParseResult};
-use crate::metadata_stream::{stream_tenant_timelines, stream_tenants};
-use crate::{init_remote, BucketConfig, NodeKind, RootTarget, TenantShardTimelineId};
-use aws_sdk_s3::Client;
+use crate::checks::{list_timeline_blobs_generic, BlobDataParseResult};
+use crate::metadata_stream::{stream_tenant_timelines_generic, stream_tenants_generic};
+use crate::{init_remote_generic, BucketConfig, NodeKind, RootTarget, TenantShardTimelineId};
 use futures_util::{StreamExt, TryStreamExt};
 use pageserver::tenant::remote_timeline_client::index::LayerFileMetadata;
 use pageserver::tenant::remote_timeline_client::{parse_remote_index_path, remote_layer_path};
@@ -13,10 +12,11 @@ use pageserver::tenant::storage_layer::LayerName;
 use pageserver::tenant::IndexPart;
 use pageserver_api::controller_api::TenantDescribeResponse;
 use pageserver_api::shard::{ShardIndex, TenantShardId};
-use remote_storage::RemotePath;
+use remote_storage::{GenericRemoteStorage, ListingObject, RemotePath};
 use reqwest::Method;
 use serde::Serialize;
 use storage_controller_client::control_api;
+use tokio_util::sync::CancellationToken;
 use tracing::{info_span, Instrument};
 use utils::generation::Generation;
 use utils::id::{TenantId, TenantTimelineId};
@@ -240,38 +240,13 @@ impl TenantRefAccumulator {
     }
 }
 
-async fn is_old_enough(
-    s3_client: &Client,
-    bucket_config: &BucketConfig,
-    min_age: &Duration,
-    key: &str,
-    summary: &mut GcSummary,
-) -> bool {
+fn is_old_enough(min_age: &Duration, key: &ListingObject, summary: &mut GcSummary) -> bool {
     // Validation: we will only GC indices & layers after a time threshold (e.g. one week) so that during an incident
     // it is easier to read old data for analysis, and easier to roll back shard splits without having to un-delete any objects.
-    let age: Duration = match s3_client
-        .head_object()
-        .bucket(&bucket_config.bucket)
-        .key(key)
-        .send()
-        .await
-    {
-        Ok(response) => match response.last_modified {
-            None => {
-                tracing::warn!("Missing last_modified");
-                summary.remote_storage_errors += 1;
-                return false;
-            }
-            Some(last_modified) => match SystemTime::try_from(last_modified).map(|t| t.elapsed()) {
-                Ok(Ok(e)) => e,
-                Err(_) | Ok(Err(_)) => {
-                    tracing::warn!("Bad last_modified time: {last_modified:?}");
-                    return false;
-                }
-            },
-        },
-        Err(e) => {
-            tracing::warn!("Failed to HEAD {key}: {e}");
+    let age = match key.last_modified.elapsed() {
+        Ok(e) => e,
+        Err(_) => {
+            tracing::warn!("Bad last_modified time: {:?}", key.last_modified);
             summary.remote_storage_errors += 1;
             return false;
         }
@@ -289,17 +264,40 @@ async fn is_old_enough(
     old_enough
 }
 
+/// Same as [`is_old_enough`], but doesn't require a [`ListingObject`] passed to it: the object
+/// metadata is fetched with a HEAD request instead.
+///
+/// Returns `None` if the HEAD request fails; the failure is logged and counted in
+/// `summary.remote_storage_errors` rather than silently dropped.
+async fn check_is_old_enough(
+    remote_client: &GenericRemoteStorage,
+    key: &RemotePath,
+    min_age: &Duration,
+    summary: &mut GcSummary,
+) -> Option<bool> {
+    match remote_client
+        .head_object(key, &CancellationToken::new())
+        .await
+    {
+        Ok(listing_object) => Some(is_old_enough(min_age, &listing_object, summary)),
+        Err(e) => {
+            tracing::warn!("Failed to HEAD {key}: {e}");
+            summary.remote_storage_errors += 1;
+            None
+        }
+    }
+}
+
 async fn maybe_delete_index(
-    s3_client: &Client,
-    bucket_config: &BucketConfig,
+    remote_client: &GenericRemoteStorage,
     min_age: &Duration,
     latest_gen: Generation,
-    key: &str,
+    obj: &ListingObject,
     mode: GcMode,
     summary: &mut GcSummary,
 ) {
     // Validation: we will only delete things that parse cleanly
-    let basename = key.rsplit_once('/').unwrap().1;
+    let basename = obj.key.get_path().file_name().unwrap();
     let candidate_generation =
         match parse_remote_index_path(RemotePath::from_string(basename).unwrap()) {
             Some(g) => g,
@@ -328,7 +316,7 @@ async fn maybe_delete_index(
         return;
     }
 
-    if !is_old_enough(s3_client, bucket_config, min_age, key, summary).await {
+    if !is_old_enough(min_age, obj, summary) {
         return;
     }
 
@@ -338,11 +326,8 @@ async fn maybe_delete_index(
     }
 
     // All validations passed: erase the object
-    match s3_client
-        .delete_object()
-        .bucket(&bucket_config.bucket)
-        .key(key)
-        .send()
+    match remote_client
+        .delete(&obj.key, &CancellationToken::new())
         .await
     {
         Ok(_) => {
@@ -358,8 +343,7 @@ async fn maybe_delete_index(
 
 #[allow(clippy::too_many_arguments)]
 async fn gc_ancestor(
-    s3_client: &Client,
-    bucket_config: &BucketConfig,
+    remote_client: &GenericRemoteStorage,
     root_target: &RootTarget,
     min_age: &Duration,
     ancestor: TenantShardId,
@@ -368,7 +352,7 @@ async fn gc_ancestor(
     summary: &mut GcSummary,
 ) -> anyhow::Result<()> {
     // Scan timelines in the ancestor
-    let timelines = stream_tenant_timelines(s3_client, root_target, ancestor).await?;
+    let timelines = stream_tenant_timelines_generic(remote_client, root_target, ancestor).await?;
     let mut timelines = std::pin::pin!(timelines);
 
     // Build a list of keys to retain
@@ -376,7 +360,7 @@ async fn gc_ancestor(
     while let Some(ttid) = timelines.next().await {
         let ttid = ttid?;
 
-        let data = list_timeline_blobs(s3_client, ttid, root_target).await?;
+        let data = list_timeline_blobs_generic(remote_client, ttid, root_target).await?;
 
         let s3_layers = match data.blob_data {
             BlobDataParseResult::Parsed {
@@ -427,7 +411,8 @@ async fn gc_ancestor(
 
             // We apply a time threshold to GCing objects that are un-referenced: this preserves our ability
             // to roll back a shard split if we have to, by avoiding deleting ancestor layers right away
-            if !is_old_enough(s3_client, bucket_config, min_age, &key, summary).await {
+            let path = RemotePath::from_string(key.strip_prefix("/").unwrap_or(&key)).unwrap();
+            if check_is_old_enough(remote_client, &path, min_age, summary).await != Some(true) {
                 continue;
             }
 
@@ -437,13 +422,7 @@ async fn gc_ancestor(
             }
 
             // All validations passed: erase the object
-            match s3_client
-                .delete_object()
-                .bucket(&bucket_config.bucket)
-                .key(&key)
-                .send()
-                .await
-            {
+            match remote_client.delete(&path, &CancellationToken::new()).await {
                 Ok(_) => {
                     tracing::info!("Successfully deleted unreferenced ancestor layer {key}");
                     summary.ancestor_layers_deleted += 1;
@@ -477,10 +456,11 @@ pub async fn pageserver_physical_gc(
     min_age: Duration,
     mode: GcMode,
 ) -> anyhow::Result<GcSummary> {
-    let (s3_client, target) = init_remote(bucket_config.clone(), NodeKind::Pageserver).await?;
+    let (remote_client, target) =
+        init_remote_generic(bucket_config.clone(), NodeKind::Pageserver).await?;
 
     let tenants = if tenant_shard_ids.is_empty() {
-        futures::future::Either::Left(stream_tenants(&s3_client, &target))
+        futures::future::Either::Left(stream_tenants_generic(&remote_client, &target))
     } else {
         futures::future::Either::Right(futures::stream::iter(tenant_shard_ids.into_iter().map(Ok)))
     };
@@ -493,14 +473,13 @@ pub async fn pageserver_physical_gc(
     let accumulator = Arc::new(std::sync::Mutex::new(TenantRefAccumulator::default()));
 
     // Generate a stream of TenantTimelineId
-    let timelines = tenants.map_ok(|t| stream_tenant_timelines(&s3_client, &target, t));
+    let timelines = tenants.map_ok(|t| stream_tenant_timelines_generic(&remote_client, &target, t));
     let timelines = timelines.try_buffered(CONCURRENCY);
     let timelines = timelines.try_flatten();
 
     // Generate a stream of S3TimelineBlobData
     async fn gc_timeline(
-        s3_client: &Client,
-        bucket_config: &BucketConfig,
+        remote_client: &GenericRemoteStorage,
         min_age: &Duration,
         target: &RootTarget,
         mode: GcMode,
@@ -508,7 +487,7 @@ pub async fn pageserver_physical_gc(
         accumulator: &Arc<std::sync::Mutex<TenantRefAccumulator>>,
     ) -> anyhow::Result<GcSummary> {
         let mut summary = GcSummary::default();
-        let data = list_timeline_blobs(s3_client, ttid, target).await?;
+        let data = list_timeline_blobs_generic(remote_client, ttid, target).await?;
 
         let (index_part, latest_gen, candidates) = match &data.blob_data {
             BlobDataParseResult::Parsed {
@@ -533,17 +512,9 @@ pub async fn pageserver_physical_gc(
         accumulator.lock().unwrap().update(ttid, index_part);
 
         for key in candidates {
-            maybe_delete_index(
-                s3_client,
-                bucket_config,
-                min_age,
-                latest_gen,
-                &key,
-                mode,
-                &mut summary,
-            )
-            .instrument(info_span!("maybe_delete_index", %ttid, ?latest_gen, key))
-            .await;
+            maybe_delete_index(remote_client, min_age, latest_gen, &key, mode, &mut summary)
+                .instrument(info_span!("maybe_delete_index", %ttid, ?latest_gen, %key.key))
+                .await;
         }
 
         Ok(summary)
@@ -554,15 +525,7 @@ pub async fn pageserver_physical_gc(
     // Drain futures for per-shard GC, populating accumulator as a side effect
     {
         let timelines = timelines.map_ok(|ttid| {
-            gc_timeline(
-                &s3_client,
-                bucket_config,
-                &min_age,
-                &target,
-                mode,
-                ttid,
-                &accumulator,
-            )
+            gc_timeline(&remote_client, &min_age, &target, mode, ttid, &accumulator)
         });
         let mut timelines = std::pin::pin!(timelines.try_buffered(CONCURRENCY));
 
@@ -586,8 +549,7 @@ pub async fn pageserver_physical_gc(
 
     for ancestor_shard in ancestor_shards {
         gc_ancestor(
-            &s3_client,
-            bucket_config,
+            &remote_client,
             &target,
             &min_age,
             ancestor_shard,
diff --git a/storage_scrubber/src/scan_pageserver_metadata.rs b/storage_scrubber/src/scan_pageserver_metadata.rs
index 2409b7b132..e89e97ccb6 100644
--- a/storage_scrubber/src/scan_pageserver_metadata.rs
+++ b/storage_scrubber/src/scan_pageserver_metadata.rs
@@ -1,16 +1,16 @@
 use std::collections::{HashMap, HashSet};
 
 use crate::checks::{
-    branch_cleanup_and_check_errors, list_timeline_blobs, BlobDataParseResult, S3TimelineBlobData,
-    TenantObjectListing, TimelineAnalysis,
+    branch_cleanup_and_check_errors, list_timeline_blobs_generic, BlobDataParseResult,
+    RemoteTimelineBlobData, TenantObjectListing, TimelineAnalysis,
 };
-use crate::metadata_stream::{stream_tenant_timelines, stream_tenants};
-use crate::{init_remote, BucketConfig, NodeKind, RootTarget, TenantShardTimelineId};
-use aws_sdk_s3::Client;
+use crate::metadata_stream::{stream_tenant_timelines_generic, stream_tenants_generic};
+use crate::{init_remote_generic, BucketConfig, NodeKind, RootTarget, TenantShardTimelineId};
 use futures_util::{StreamExt, TryStreamExt};
 use pageserver::tenant::remote_timeline_client::remote_layer_path;
 use pageserver_api::controller_api::MetadataHealthUpdateRequest;
 use pageserver_api::shard::TenantShardId;
+use remote_storage::GenericRemoteStorage;
 use serde::Serialize;
 use utils::id::TenantId;
 use utils::shard::ShardCount;
@@ -36,7 +36,7 @@ impl MetadataSummary {
         Self::default()
     }
 
-    fn update_data(&mut self, data: &S3TimelineBlobData) {
+    fn update_data(&mut self, data: &RemoteTimelineBlobData) {
         self.timeline_shard_count += 1;
         if let BlobDataParseResult::Parsed {
             index_part,
@@ -120,10 +120,10 @@ pub async fn scan_pageserver_metadata(
     bucket_config: BucketConfig,
     tenant_ids: Vec<TenantShardId>,
 ) -> anyhow::Result<MetadataSummary> {
-    let (s3_client, target) = init_remote(bucket_config, NodeKind::Pageserver).await?;
+    let (remote_client, target) = init_remote_generic(bucket_config, NodeKind::Pageserver).await?;
 
     let tenants = if tenant_ids.is_empty() {
-        futures::future::Either::Left(stream_tenants(&s3_client, &target))
+        futures::future::Either::Left(stream_tenants_generic(&remote_client, &target))
     } else {
         futures::future::Either::Right(futures::stream::iter(tenant_ids.into_iter().map(Ok)))
     };
@@ -133,20 +133,20 @@ pub async fn scan_pageserver_metadata(
     const CONCURRENCY: usize = 32;
 
     // Generate a stream of TenantTimelineId
-    let timelines = tenants.map_ok(|t| stream_tenant_timelines(&s3_client, &target, t));
+    let timelines = tenants.map_ok(|t| stream_tenant_timelines_generic(&remote_client, &target, t));
     let timelines = timelines.try_buffered(CONCURRENCY);
     let timelines = timelines.try_flatten();
 
     // Generate a stream of S3TimelineBlobData
     async fn report_on_timeline(
-        s3_client: &Client,
+        remote_client: &GenericRemoteStorage,
         target: &RootTarget,
         ttid: TenantShardTimelineId,
-    ) -> anyhow::Result<(TenantShardTimelineId, S3TimelineBlobData)> {
-        let data = list_timeline_blobs(s3_client, ttid, target).await?;
+    ) -> anyhow::Result<(TenantShardTimelineId, RemoteTimelineBlobData)> {
+        let data = list_timeline_blobs_generic(remote_client, ttid, target).await?;
         Ok((ttid, data))
     }
-    let timelines = timelines.map_ok(|ttid| report_on_timeline(&s3_client, &target, ttid));
+    let timelines = timelines.map_ok(|ttid| report_on_timeline(&remote_client, &target, ttid));
     let mut timelines = std::pin::pin!(timelines.try_buffered(CONCURRENCY));
 
     // We must gather all the TenantShardTimelineId->S3TimelineBlobData for each tenant, because different
@@ -157,12 +157,11 @@ pub async fn scan_pageserver_metadata(
     let mut tenant_timeline_results = Vec::new();
 
     async fn analyze_tenant(
-        s3_client: &Client,
-        target: &RootTarget,
+        remote_client: &GenericRemoteStorage,
         tenant_id: TenantId,
         summary: &mut MetadataSummary,
         mut tenant_objects: TenantObjectListing,
-        timelines: Vec<(TenantShardTimelineId, S3TimelineBlobData)>,
+        timelines: Vec<(TenantShardTimelineId, RemoteTimelineBlobData)>,
         highest_shard_count: ShardCount,
     ) {
         summary.tenant_count += 1;
@@ -191,8 +190,7 @@ pub async fn scan_pageserver_metadata(
                 // Apply checks to this timeline shard's metadata, and in the process update `tenant_objects`
                 // reference counts for layers across the tenant.
                 let analysis = branch_cleanup_and_check_errors(
-                    s3_client,
-                    target,
+                    remote_client,
                     &ttid,
                     &mut tenant_objects,
                     None,
@@ -273,8 +271,7 @@ pub async fn scan_pageserver_metadata(
                     let tenant_objects = std::mem::take(&mut tenant_objects);
                     let timelines = std::mem::take(&mut tenant_timeline_results);
                     analyze_tenant(
-                        &s3_client,
-                        &target,
+                        &remote_client,
                         prev_tenant_id,
                         &mut summary,
                         tenant_objects,
@@ -311,8 +308,7 @@ pub async fn scan_pageserver_metadata(
 
     if !tenant_timeline_results.is_empty() {
         analyze_tenant(
-            &s3_client,
-            &target,
+            &remote_client,
             tenant_id.expect("Must be set if results are present"),
             &mut summary,
             tenant_objects,
diff --git a/storage_scrubber/src/scan_safekeeper_metadata.rs b/storage_scrubber/src/scan_safekeeper_metadata.rs
index 08a4541c5c..f20fa27d13 100644
--- a/storage_scrubber/src/scan_safekeeper_metadata.rs
+++ b/storage_scrubber/src/scan_safekeeper_metadata.rs
@@ -188,6 +188,11 @@ async fn check_timeline(
     // we need files, so unset it.
     timeline_dir_target.delimiter = String::new();
 
+    let prefix_str = &timeline_dir_target
+        .prefix_in_bucket
+        .strip_prefix("/")
+        .unwrap_or(&timeline_dir_target.prefix_in_bucket);
+
     let mut stream = std::pin::pin!(stream_listing_generic(remote_client, &timeline_dir_target));
     while let Some(obj) = stream.next().await {
         let (key, _obj) = obj?;
@@ -195,7 +200,7 @@ async fn check_timeline(
         let seg_name = key
             .get_path()
             .as_str()
-            .strip_prefix(&timeline_dir_target.prefix_in_bucket)
+            .strip_prefix(prefix_str)
             .expect("failed to extract segment name");
         expected_segfiles.remove(seg_name);
     }
diff --git a/storage_scrubber/src/tenant_snapshot.rs b/storage_scrubber/src/tenant_snapshot.rs
index 1866e6ec80..fc3a973922 100644
--- a/storage_scrubber/src/tenant_snapshot.rs
+++ b/storage_scrubber/src/tenant_snapshot.rs
@@ -1,10 +1,11 @@
 use std::collections::HashMap;
 use std::sync::Arc;
 
-use crate::checks::{list_timeline_blobs, BlobDataParseResult, S3TimelineBlobData};
-use crate::metadata_stream::{stream_tenant_shards, stream_tenant_timelines};
+use crate::checks::{list_timeline_blobs_generic, BlobDataParseResult, RemoteTimelineBlobData};
+use crate::metadata_stream::{stream_tenant_shards, stream_tenant_timelines_generic};
 use crate::{
-    download_object_to_file, init_remote, BucketConfig, NodeKind, RootTarget, TenantShardTimelineId,
+    download_object_to_file, init_remote, init_remote_generic, BucketConfig, NodeKind, RootTarget,
+    TenantShardTimelineId,
 };
 use anyhow::Context;
 use async_stream::stream;
@@ -15,6 +16,7 @@ use pageserver::tenant::remote_timeline_client::index::LayerFileMetadata;
 use pageserver::tenant::storage_layer::LayerName;
 use pageserver::tenant::IndexPart;
 use pageserver_api::shard::TenantShardId;
+use remote_storage::GenericRemoteStorage;
 use utils::generation::Generation;
 use utils::id::TenantId;
 
@@ -215,11 +217,11 @@ impl SnapshotDownloader {
     }
 
     pub async fn download(&self) -> anyhow::Result<()> {
-        let (s3_client, target) =
-            init_remote(self.bucket_config.clone(), NodeKind::Pageserver).await?;
+        let (remote_client, target) =
+            init_remote_generic(self.bucket_config.clone(), NodeKind::Pageserver).await?;
 
         // Generate a stream of TenantShardId
-        let shards = stream_tenant_shards(&s3_client, &target, self.tenant_id).await?;
+        let shards = stream_tenant_shards(&remote_client, &target, self.tenant_id).await?;
         let shards: Vec<TenantShardId> = shards.try_collect().await?;
 
         // Only read from shards that have the highest count: avoids redundantly downloading
@@ -237,18 +239,19 @@ impl SnapshotDownloader {
 
         for shard in shards.into_iter().filter(|s| s.shard_count == shard_count) {
             // Generate a stream of TenantTimelineId
-            let timelines = stream_tenant_timelines(&s3_client, &self.s3_root, shard).await?;
+            let timelines = stream_tenant_timelines_generic(&remote_client, &target, shard).await?;
 
             // Generate a stream of S3TimelineBlobData
             async fn load_timeline_index(
-                s3_client: &Client,
+                remote_client: &GenericRemoteStorage,
                 target: &RootTarget,
                 ttid: TenantShardTimelineId,
-            ) -> anyhow::Result<(TenantShardTimelineId, S3TimelineBlobData)> {
-                let data = list_timeline_blobs(s3_client, ttid, target).await?;
+            ) -> anyhow::Result<(TenantShardTimelineId, RemoteTimelineBlobData)> {
+                let data = list_timeline_blobs_generic(remote_client, ttid, target).await?;
                 Ok((ttid, data))
             }
-            let timelines = timelines.map_ok(|ttid| load_timeline_index(&s3_client, &target, ttid));
+            let timelines =
+                timelines.map_ok(|ttid| load_timeline_index(&remote_client, &target, ttid));
             let mut timelines = std::pin::pin!(timelines.try_buffered(8));
 
             while let Some(i) = timelines.next().await {
@@ -278,7 +281,7 @@ impl SnapshotDownloader {
 
         for (ttid, layers) in ancestor_layers.into_iter() {
             tracing::info!(
-                "Downloading {} layers from ancvestor timeline {ttid}...",
+                "Downloading {} layers from ancestor timeline {ttid}...",
                 layers.len()
             );
 
diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index ba6fbc003a..9aa275d343 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -4643,6 +4643,7 @@ class StorageScrubber:
         ]
         args = base_args + args
 
+        log.info(f"Invoking scrubber command {args} with env: {env}")
         (output_path, stdout, status_code) = subprocess_capture(
             self.log_dir,
             args,

From 6949b45e1795816507f5025a474e15d718e07456 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Arpad=20M=C3=BCller?= <arpad-m@users.noreply.github.com>
Date: Mon, 19 Aug 2024 17:44:10 +0200
Subject: [PATCH 22/44] Update aws -> infra for repo rename (#8755)

See slack thread:
https://neondb.slack.com/archives/C039YKBRZB4/p1722501766006179
---
 .github/workflows/build_and_test.yml               | 10 +++++-----
 docs/rfcs/033-storage-controller-drain-and-fill.md |  2 +-
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml
index ee6d3ba005..92fff4ffbc 100644
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -985,10 +985,10 @@ jobs:
           GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }}
         run: |
           if [[ "$GITHUB_REF_NAME" == "main" ]]; then
-            gh workflow --repo neondatabase/aws run deploy-dev.yml --ref main -f branch=main -f dockerTag=${{needs.tag.outputs.build-tag}} -f deployPreprodRegion=false
+            gh workflow --repo neondatabase/infra run deploy-dev.yml --ref main -f branch=main -f dockerTag=${{needs.tag.outputs.build-tag}} -f deployPreprodRegion=false
             gh workflow --repo neondatabase/azure run deploy.yml -f dockerTag=${{needs.tag.outputs.build-tag}}
           elif [[ "$GITHUB_REF_NAME" == "release" ]]; then
-            gh workflow --repo neondatabase/aws run deploy-dev.yml --ref main \
+            gh workflow --repo neondatabase/infra run deploy-dev.yml --ref main \
               -f deployPgSniRouter=false \
               -f deployProxy=false \
               -f deployStorage=true \
@@ -998,14 +998,14 @@ jobs:
               -f dockerTag=${{needs.tag.outputs.build-tag}} \
               -f deployPreprodRegion=true
 
-            gh workflow --repo neondatabase/aws run deploy-prod.yml --ref main \
+            gh workflow --repo neondatabase/infra run deploy-prod.yml --ref main \
               -f deployStorage=true \
               -f deployStorageBroker=true \
               -f deployStorageController=true \
               -f branch=main \
               -f dockerTag=${{needs.tag.outputs.build-tag}}
           elif [[ "$GITHUB_REF_NAME" == "release-proxy" ]]; then
-            gh workflow --repo neondatabase/aws run deploy-dev.yml --ref main \
+            gh workflow --repo neondatabase/infra run deploy-dev.yml --ref main \
               -f deployPgSniRouter=true \
               -f deployProxy=true \
               -f deployStorage=false \
@@ -1015,7 +1015,7 @@ jobs:
               -f dockerTag=${{needs.tag.outputs.build-tag}} \
               -f deployPreprodRegion=true
 
-            gh workflow --repo neondatabase/aws run deploy-proxy-prod.yml --ref main \
+            gh workflow --repo neondatabase/infra run deploy-proxy-prod.yml --ref main \
               -f deployPgSniRouter=true \
               -f deployProxy=true \
               -f branch=main \
diff --git a/docs/rfcs/033-storage-controller-drain-and-fill.md b/docs/rfcs/033-storage-controller-drain-and-fill.md
index 77c84cd2a5..733f7c0bd8 100644
--- a/docs/rfcs/033-storage-controller-drain-and-fill.md
+++ b/docs/rfcs/033-storage-controller-drain-and-fill.md
@@ -14,7 +14,7 @@ picked tenant (which requested on-demand activation) for around 30 seconds
 during the restart at 2024-04-03 16:37 UTC.
 
 Note that lots of shutdowns on loaded pageservers do not finish within the
-[10 second systemd enforced timeout](https://github.com/neondatabase/aws/blob/0a5280b383e43c063d43cbf87fa026543f6d6ad4/.github/ansible/systemd/pageserver.service#L16). This means we are shutting down without flushing ephemeral layers
+[10 second systemd enforced timeout](https://github.com/neondatabase/infra/blob/0a5280b383e43c063d43cbf87fa026543f6d6ad4/.github/ansible/systemd/pageserver.service#L16). This means we are shutting down without flushing ephemeral layers
 and have to reingest data in order to serve requests after restarting, potentially making first request latencies worse.
 
 This problem is not yet very acutely felt in storage controller managed pageservers since

From 4b26783c94b582dad20efb49ca2ca842c6f944b9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Arpad=20M=C3=BCller?= <arpad-m@users.noreply.github.com>
Date: Mon, 19 Aug 2024 23:58:47 +0200
Subject: [PATCH 23/44] scrubber: remove _generic postfix and two unused
 functions (#8761)

Removes the `_generic` postfix from the APIs that use `GenericRemoteStorage`,
as `remote_storage` is the "default" now, and adds a `_s3` postfix
to the remaining APIs that use the S3 SDK (only in tenant snapshot). Also
removes two unused functions: `list_objects_with_retries` and
`stream_tenants`.

Part of https://github.com/neondatabase/neon/issues/7547
---
 storage_scrubber/src/checks.rs                | 14 ++---
 storage_scrubber/src/find_large_objects.rs    |  7 +--
 storage_scrubber/src/garbage.rs               | 26 ++++-----
 storage_scrubber/src/lib.rs                   | 51 +++-------------
 storage_scrubber/src/metadata_stream.rs       | 58 +++----------------
 .../src/pageserver_physical_gc.rs             | 19 +++---
 .../src/scan_pageserver_metadata.rs           | 14 ++---
 .../src/scan_safekeeper_metadata.rs           |  9 ++-
 storage_scrubber/src/tenant_snapshot.rs       | 17 +++---
 9 files changed, 67 insertions(+), 148 deletions(-)

diff --git a/storage_scrubber/src/checks.rs b/storage_scrubber/src/checks.rs
index 9063b3c197..b35838bcf7 100644
--- a/storage_scrubber/src/checks.rs
+++ b/storage_scrubber/src/checks.rs
@@ -10,8 +10,8 @@ use utils::generation::Generation;
 use utils::id::TimelineId;
 
 use crate::cloud_admin_api::BranchData;
-use crate::metadata_stream::stream_listing_generic;
-use crate::{download_object_with_retries_generic, RootTarget, TenantShardTimelineId};
+use crate::metadata_stream::stream_listing;
+use crate::{download_object_with_retries, RootTarget, TenantShardTimelineId};
 use futures_util::StreamExt;
 use pageserver::tenant::remote_timeline_client::{parse_remote_index_path, remote_layer_path};
 use pageserver::tenant::storage_layer::LayerName;
@@ -320,17 +320,17 @@ pub(crate) fn parse_layer_object_name(name: &str) -> Result<(LayerName, Generati
     }
 }
 
-pub(crate) async fn list_timeline_blobs_generic(
+pub(crate) async fn list_timeline_blobs(
     remote_client: &GenericRemoteStorage,
     id: TenantShardTimelineId,
-    s3_root: &RootTarget,
+    root_target: &RootTarget,
 ) -> anyhow::Result<RemoteTimelineBlobData> {
     let mut s3_layers = HashSet::new();
 
     let mut errors = Vec::new();
     let mut unknown_keys = Vec::new();
 
-    let mut timeline_dir_target = s3_root.timeline_root(&id);
+    let mut timeline_dir_target = root_target.timeline_root(&id);
     timeline_dir_target.delimiter = String::new();
 
     let mut index_part_keys: Vec<ListingObject> = Vec::new();
@@ -341,7 +341,7 @@ pub(crate) async fn list_timeline_blobs_generic(
         .strip_prefix("/")
         .unwrap_or(&timeline_dir_target.prefix_in_bucket);
 
-    let mut stream = std::pin::pin!(stream_listing_generic(remote_client, &timeline_dir_target));
+    let mut stream = std::pin::pin!(stream_listing(remote_client, &timeline_dir_target));
     while let Some(obj) = stream.next().await {
         let (key, Some(obj)) = obj? else {
             panic!("ListingObject not specified");
@@ -421,7 +421,7 @@ pub(crate) async fn list_timeline_blobs_generic(
 
     if let Some(index_part_object_key) = index_part_object.as_ref() {
         let index_part_bytes =
-            download_object_with_retries_generic(remote_client, &index_part_object_key.key)
+            download_object_with_retries(remote_client, &index_part_object_key.key)
                 .await
                 .context("index_part.json download")?;
 
diff --git a/storage_scrubber/src/find_large_objects.rs b/storage_scrubber/src/find_large_objects.rs
index f5bb7e088a..88e36af560 100644
--- a/storage_scrubber/src/find_large_objects.rs
+++ b/storage_scrubber/src/find_large_objects.rs
@@ -6,7 +6,7 @@ use remote_storage::ListingMode;
 use serde::{Deserialize, Serialize};
 
 use crate::{
-    checks::parse_layer_object_name, init_remote_generic, metadata_stream::stream_tenants_generic,
+    checks::parse_layer_object_name, init_remote, metadata_stream::stream_tenants,
     stream_objects_with_retries, BucketConfig, NodeKind,
 };
 
@@ -50,9 +50,8 @@ pub async fn find_large_objects(
     ignore_deltas: bool,
     concurrency: usize,
 ) -> anyhow::Result<LargeObjectListing> {
-    let (remote_client, target) =
-        init_remote_generic(bucket_config.clone(), NodeKind::Pageserver).await?;
-    let tenants = pin!(stream_tenants_generic(&remote_client, &target));
+    let (remote_client, target) = init_remote(bucket_config.clone(), NodeKind::Pageserver).await?;
+    let tenants = pin!(stream_tenants(&remote_client, &target));
 
     let objects_stream = tenants.map_ok(|tenant_shard_id| {
         let mut tenant_root = target.tenant_root(&tenant_shard_id);
diff --git a/storage_scrubber/src/garbage.rs b/storage_scrubber/src/garbage.rs
index d6a73bf366..3e22960f8d 100644
--- a/storage_scrubber/src/garbage.rs
+++ b/storage_scrubber/src/garbage.rs
@@ -19,8 +19,8 @@ use utils::id::TenantId;
 
 use crate::{
     cloud_admin_api::{CloudAdminApiClient, MaybeDeleted, ProjectData},
-    init_remote_generic, list_objects_with_retries_generic,
-    metadata_stream::{stream_tenant_timelines_generic, stream_tenants_generic},
+    init_remote, list_objects_with_retries,
+    metadata_stream::{stream_tenant_timelines, stream_tenants},
     BucketConfig, ConsoleConfig, NodeKind, TenantShardTimelineId, TraversingDepth,
 };
 
@@ -153,7 +153,7 @@ async fn find_garbage_inner(
     node_kind: NodeKind,
 ) -> anyhow::Result<GarbageList> {
     // Construct clients for S3 and for Console API
-    let (remote_client, target) = init_remote_generic(bucket_config.clone(), node_kind).await?;
+    let (remote_client, target) = init_remote(bucket_config.clone(), node_kind).await?;
     let cloud_admin_api_client = Arc::new(CloudAdminApiClient::new(console_config));
 
     // Build a set of console-known tenants, for quickly eliminating known-active tenants without having
@@ -179,7 +179,7 @@ async fn find_garbage_inner(
 
     // Enumerate Tenants in S3, and check if each one exists in Console
     tracing::info!("Finding all tenants in bucket {}...", bucket_config.bucket);
-    let tenants = stream_tenants_generic(&remote_client, &target);
+    let tenants = stream_tenants(&remote_client, &target);
     let tenants_checked = tenants.map_ok(|t| {
         let api_client = cloud_admin_api_client.clone();
         let console_cache = console_cache.clone();
@@ -237,14 +237,13 @@ async fn find_garbage_inner(
         // Special case: If it's missing in console, check for known bugs that would enable us to conclusively
         // identify it as purge-able anyway
         if console_result.is_none() {
-            let timelines =
-                stream_tenant_timelines_generic(&remote_client, &target, tenant_shard_id)
-                    .await?
-                    .collect::<Vec<_>>()
-                    .await;
+            let timelines = stream_tenant_timelines(&remote_client, &target, tenant_shard_id)
+                .await?
+                .collect::<Vec<_>>()
+                .await;
             if timelines.is_empty() {
                 // No timelines, but a heatmap: the deletion bug where we deleted everything but heatmaps
-                let tenant_objects = list_objects_with_retries_generic(
+                let tenant_objects = list_objects_with_retries(
                     &remote_client,
                     ListingMode::WithDelimiter,
                     &target.tenant_root(&tenant_shard_id),
@@ -265,7 +264,7 @@ async fn find_garbage_inner(
 
                 for timeline_r in timelines {
                     let timeline = timeline_r?;
-                    let timeline_objects = list_objects_with_retries_generic(
+                    let timeline_objects = list_objects_with_retries(
                         &remote_client,
                         ListingMode::WithDelimiter,
                         &target.timeline_root(&timeline),
@@ -331,8 +330,7 @@ async fn find_garbage_inner(
 
     // Construct a stream of all timelines within active tenants
     let active_tenants = tokio_stream::iter(active_tenants.iter().map(Ok));
-    let timelines =
-        active_tenants.map_ok(|t| stream_tenant_timelines_generic(&remote_client, &target, *t));
+    let timelines = active_tenants.map_ok(|t| stream_tenant_timelines(&remote_client, &target, *t));
     let timelines = timelines.try_buffer_unordered(S3_CONCURRENCY);
     let timelines = timelines.try_flatten();
 
@@ -507,7 +505,7 @@ pub async fn purge_garbage(
     );
 
     let (remote_client, _target) =
-        init_remote_generic(garbage_list.bucket_config.clone(), garbage_list.node_kind).await?;
+        init_remote(garbage_list.bucket_config.clone(), garbage_list.node_kind).await?;
 
     assert_eq!(
         &garbage_list.bucket_config.bucket,
diff --git a/storage_scrubber/src/lib.rs b/storage_scrubber/src/lib.rs
index 3183bc3c64..112f052e07 100644
--- a/storage_scrubber/src/lib.rs
+++ b/storage_scrubber/src/lib.rs
@@ -15,7 +15,7 @@ use std::fmt::Display;
 use std::sync::Arc;
 use std::time::Duration;
 
-use anyhow::{anyhow, Context};
+use anyhow::Context;
 use aws_config::retry::{RetryConfigBuilder, RetryMode};
 use aws_sdk_s3::config::Region;
 use aws_sdk_s3::error::DisplayErrorContext;
@@ -352,7 +352,7 @@ fn make_root_target(
     }
 }
 
-async fn init_remote(
+async fn init_remote_s3(
     bucket_config: BucketConfig,
     node_kind: NodeKind,
 ) -> anyhow::Result<(Arc<Client>, RootTarget)> {
@@ -369,7 +369,7 @@ async fn init_remote(
     Ok((s3_client, s3_root))
 }
 
-async fn init_remote_generic(
+async fn init_remote(
     bucket_config: BucketConfig,
     node_kind: NodeKind,
 ) -> anyhow::Result<(GenericRemoteStorage, RootTarget)> {
@@ -394,45 +394,10 @@ async fn init_remote_generic(
 
     // We already pass the prefix to the remote client above
     let prefix_in_root_target = String::new();
-    let s3_root = make_root_target(bucket_config.bucket, prefix_in_root_target, node_kind);
+    let root_target = make_root_target(bucket_config.bucket, prefix_in_root_target, node_kind);
 
     let client = GenericRemoteStorage::from_config(&storage_config).await?;
-    Ok((client, s3_root))
-}
-
-async fn list_objects_with_retries(
-    s3_client: &Client,
-    s3_target: &S3Target,
-    continuation_token: Option<String>,
-) -> anyhow::Result<aws_sdk_s3::operation::list_objects_v2::ListObjectsV2Output> {
-    for trial in 0..MAX_RETRIES {
-        match s3_client
-            .list_objects_v2()
-            .bucket(&s3_target.bucket_name)
-            .prefix(&s3_target.prefix_in_bucket)
-            .delimiter(&s3_target.delimiter)
-            .set_continuation_token(continuation_token.clone())
-            .send()
-            .await
-        {
-            Ok(response) => return Ok(response),
-            Err(e) => {
-                if trial == MAX_RETRIES - 1 {
-                    return Err(e)
-                        .with_context(|| format!("Failed to list objects {MAX_RETRIES} times"));
-                }
-                error!(
-                    "list_objects_v2 query failed: bucket_name={}, prefix={}, delimiter={}, error={}",
-                    s3_target.bucket_name,
-                    s3_target.prefix_in_bucket,
-                    s3_target.delimiter,
-                    DisplayErrorContext(e),
-                );
-                tokio::time::sleep(Duration::from_secs(1)).await;
-            }
-        }
-    }
-    Err(anyhow!("unreachable unless MAX_RETRIES==0"))
+    Ok((client, root_target))
 }
 
 /// Listing possibly large amounts of keys in a streaming fashion.
@@ -479,7 +444,7 @@ fn stream_objects_with_retries<'a>(
 
 /// If you want to list a bounded amount of prefixes or keys. For larger numbers of keys/prefixes,
 /// use [`stream_objects_with_retries`] instead.
-async fn list_objects_with_retries_generic(
+async fn list_objects_with_retries(
     remote_client: &GenericRemoteStorage,
     listing_mode: ListingMode,
     s3_target: &S3Target,
@@ -516,7 +481,7 @@ async fn list_objects_with_retries_generic(
     panic!("MAX_RETRIES is not allowed to be 0");
 }
 
-async fn download_object_with_retries_generic(
+async fn download_object_with_retries(
     remote_client: &GenericRemoteStorage,
     key: &RemotePath,
 ) -> anyhow::Result<Vec<u8>> {
@@ -552,7 +517,7 @@ async fn download_object_with_retries_generic(
     anyhow::bail!("Failed to download objects with key {key} {MAX_RETRIES} times")
 }
 
-async fn download_object_to_file(
+async fn download_object_to_file_s3(
     s3_client: &Client,
     bucket_name: &str,
     key: &str,
diff --git a/storage_scrubber/src/metadata_stream.rs b/storage_scrubber/src/metadata_stream.rs
index eca774413a..10d77937f1 100644
--- a/storage_scrubber/src/metadata_stream.rs
+++ b/storage_scrubber/src/metadata_stream.rs
@@ -2,20 +2,19 @@ use std::str::FromStr;
 
 use anyhow::{anyhow, Context};
 use async_stream::{stream, try_stream};
-use aws_sdk_s3::Client;
 use futures::StreamExt;
 use remote_storage::{GenericRemoteStorage, ListingMode, ListingObject, RemotePath};
 use tokio_stream::Stream;
 
 use crate::{
-    list_objects_with_retries, list_objects_with_retries_generic, stream_objects_with_retries,
-    RootTarget, S3Target, TenantShardTimelineId,
+    list_objects_with_retries, stream_objects_with_retries, RootTarget, S3Target,
+    TenantShardTimelineId,
 };
 use pageserver_api::shard::TenantShardId;
 use utils::id::{TenantId, TimelineId};
 
 /// Given a remote storage and a target, output a stream of TenantIds discovered via listing prefixes
-pub fn stream_tenants_generic<'a>(
+pub fn stream_tenants<'a>(
     remote_client: &'a GenericRemoteStorage,
     target: &'a RootTarget,
 ) -> impl Stream<Item = anyhow::Result<TenantShardId>> + 'a {
@@ -36,44 +35,6 @@ pub fn stream_tenants_generic<'a>(
     }
 }
 
-/// Given an S3 bucket, output a stream of TenantIds discovered via ListObjectsv2
-pub fn stream_tenants<'a>(
-    s3_client: &'a Client,
-    target: &'a RootTarget,
-) -> impl Stream<Item = anyhow::Result<TenantShardId>> + 'a {
-    try_stream! {
-        let mut continuation_token = None;
-        let tenants_target = target.tenants_root();
-        loop {
-            let fetch_response =
-                list_objects_with_retries(s3_client, &tenants_target, continuation_token.clone()).await?;
-
-            let new_entry_ids = fetch_response
-                .common_prefixes()
-                .iter()
-                .filter_map(|prefix| prefix.prefix())
-                .filter_map(|prefix| -> Option<&str> {
-                    prefix
-                        .strip_prefix(&tenants_target.prefix_in_bucket)?
-                        .strip_suffix('/')
-                }).map(|entry_id_str| {
-                entry_id_str
-                    .parse()
-                    .with_context(|| format!("Incorrect entry id str: {entry_id_str}"))
-            });
-
-            for i in new_entry_ids {
-                yield i?;
-            }
-
-            match fetch_response.next_continuation_token {
-                Some(new_token) => continuation_token = Some(new_token),
-                None => break,
-            }
-        }
-    }
-}
-
 pub async fn stream_tenant_shards<'a>(
     remote_client: &'a GenericRemoteStorage,
     target: &'a RootTarget,
@@ -85,12 +46,9 @@ pub async fn stream_tenant_shards<'a>(
     let prefix_str = &strip_prefix.strip_prefix("/").unwrap_or(&strip_prefix);
 
     tracing::info!("Listing shards in {}", shards_target.prefix_in_bucket);
-    let listing = list_objects_with_retries_generic(
-        remote_client,
-        ListingMode::WithDelimiter,
-        &shards_target,
-    )
-    .await?;
+    let listing =
+        list_objects_with_retries(remote_client, ListingMode::WithDelimiter, &shards_target)
+            .await?;
 
     let tenant_shard_ids = listing
         .prefixes
@@ -118,7 +76,7 @@ pub async fn stream_tenant_shards<'a>(
 /// Given a `TenantShardId`, output a stream of the timelines within that tenant, discovered
 /// using a listing. The listing is done before the stream is built, so that this
 /// function can be used to generate concurrency on a stream using buffer_unordered.
-pub async fn stream_tenant_timelines_generic<'a>(
+pub async fn stream_tenant_timelines<'a>(
     remote_client: &'a GenericRemoteStorage,
     target: &'a RootTarget,
     tenant: TenantShardId,
@@ -173,7 +131,7 @@ pub async fn stream_tenant_timelines_generic<'a>(
     })
 }
 
-pub(crate) fn stream_listing_generic<'a>(
+pub(crate) fn stream_listing<'a>(
     remote_client: &'a GenericRemoteStorage,
     target: &'a S3Target,
 ) -> impl Stream<Item = anyhow::Result<(RemotePath, Option<ListingObject>)>> + 'a {
diff --git a/storage_scrubber/src/pageserver_physical_gc.rs b/storage_scrubber/src/pageserver_physical_gc.rs
index 6828081128..88681e38c2 100644
--- a/storage_scrubber/src/pageserver_physical_gc.rs
+++ b/storage_scrubber/src/pageserver_physical_gc.rs
@@ -2,9 +2,9 @@ use std::collections::{BTreeMap, BTreeSet, HashMap};
 use std::sync::Arc;
 use std::time::Duration;
 
-use crate::checks::{list_timeline_blobs_generic, BlobDataParseResult};
-use crate::metadata_stream::{stream_tenant_timelines_generic, stream_tenants_generic};
-use crate::{init_remote_generic, BucketConfig, NodeKind, RootTarget, TenantShardTimelineId};
+use crate::checks::{list_timeline_blobs, BlobDataParseResult};
+use crate::metadata_stream::{stream_tenant_timelines, stream_tenants};
+use crate::{init_remote, BucketConfig, NodeKind, RootTarget, TenantShardTimelineId};
 use futures_util::{StreamExt, TryStreamExt};
 use pageserver::tenant::remote_timeline_client::index::LayerFileMetadata;
 use pageserver::tenant::remote_timeline_client::{parse_remote_index_path, remote_layer_path};
@@ -352,7 +352,7 @@ async fn gc_ancestor(
     summary: &mut GcSummary,
 ) -> anyhow::Result<()> {
     // Scan timelines in the ancestor
-    let timelines = stream_tenant_timelines_generic(remote_client, root_target, ancestor).await?;
+    let timelines = stream_tenant_timelines(remote_client, root_target, ancestor).await?;
     let mut timelines = std::pin::pin!(timelines);
 
     // Build a list of keys to retain
@@ -360,7 +360,7 @@ async fn gc_ancestor(
     while let Some(ttid) = timelines.next().await {
         let ttid = ttid?;
 
-        let data = list_timeline_blobs_generic(remote_client, ttid, root_target).await?;
+        let data = list_timeline_blobs(remote_client, ttid, root_target).await?;
 
         let s3_layers = match data.blob_data {
             BlobDataParseResult::Parsed {
@@ -456,11 +456,10 @@ pub async fn pageserver_physical_gc(
     min_age: Duration,
     mode: GcMode,
 ) -> anyhow::Result<GcSummary> {
-    let (remote_client, target) =
-        init_remote_generic(bucket_config.clone(), NodeKind::Pageserver).await?;
+    let (remote_client, target) = init_remote(bucket_config.clone(), NodeKind::Pageserver).await?;
 
     let tenants = if tenant_shard_ids.is_empty() {
-        futures::future::Either::Left(stream_tenants_generic(&remote_client, &target))
+        futures::future::Either::Left(stream_tenants(&remote_client, &target))
     } else {
         futures::future::Either::Right(futures::stream::iter(tenant_shard_ids.into_iter().map(Ok)))
     };
@@ -473,7 +472,7 @@ pub async fn pageserver_physical_gc(
     let accumulator = Arc::new(std::sync::Mutex::new(TenantRefAccumulator::default()));
 
     // Generate a stream of TenantTimelineId
-    let timelines = tenants.map_ok(|t| stream_tenant_timelines_generic(&remote_client, &target, t));
+    let timelines = tenants.map_ok(|t| stream_tenant_timelines(&remote_client, &target, t));
     let timelines = timelines.try_buffered(CONCURRENCY);
     let timelines = timelines.try_flatten();
 
@@ -487,7 +486,7 @@ pub async fn pageserver_physical_gc(
         accumulator: &Arc<std::sync::Mutex<TenantRefAccumulator>>,
     ) -> anyhow::Result<GcSummary> {
         let mut summary = GcSummary::default();
-        let data = list_timeline_blobs_generic(remote_client, ttid, target).await?;
+        let data = list_timeline_blobs(remote_client, ttid, target).await?;
 
         let (index_part, latest_gen, candidates) = match &data.blob_data {
             BlobDataParseResult::Parsed {
diff --git a/storage_scrubber/src/scan_pageserver_metadata.rs b/storage_scrubber/src/scan_pageserver_metadata.rs
index e89e97ccb6..151ef27672 100644
--- a/storage_scrubber/src/scan_pageserver_metadata.rs
+++ b/storage_scrubber/src/scan_pageserver_metadata.rs
@@ -1,11 +1,11 @@
 use std::collections::{HashMap, HashSet};
 
 use crate::checks::{
-    branch_cleanup_and_check_errors, list_timeline_blobs_generic, BlobDataParseResult,
+    branch_cleanup_and_check_errors, list_timeline_blobs, BlobDataParseResult,
     RemoteTimelineBlobData, TenantObjectListing, TimelineAnalysis,
 };
-use crate::metadata_stream::{stream_tenant_timelines_generic, stream_tenants_generic};
-use crate::{init_remote_generic, BucketConfig, NodeKind, RootTarget, TenantShardTimelineId};
+use crate::metadata_stream::{stream_tenant_timelines, stream_tenants};
+use crate::{init_remote, BucketConfig, NodeKind, RootTarget, TenantShardTimelineId};
 use futures_util::{StreamExt, TryStreamExt};
 use pageserver::tenant::remote_timeline_client::remote_layer_path;
 use pageserver_api::controller_api::MetadataHealthUpdateRequest;
@@ -120,10 +120,10 @@ pub async fn scan_pageserver_metadata(
     bucket_config: BucketConfig,
     tenant_ids: Vec<TenantShardId>,
 ) -> anyhow::Result<MetadataSummary> {
-    let (remote_client, target) = init_remote_generic(bucket_config, NodeKind::Pageserver).await?;
+    let (remote_client, target) = init_remote(bucket_config, NodeKind::Pageserver).await?;
 
     let tenants = if tenant_ids.is_empty() {
-        futures::future::Either::Left(stream_tenants_generic(&remote_client, &target))
+        futures::future::Either::Left(stream_tenants(&remote_client, &target))
     } else {
         futures::future::Either::Right(futures::stream::iter(tenant_ids.into_iter().map(Ok)))
     };
@@ -133,7 +133,7 @@ pub async fn scan_pageserver_metadata(
     const CONCURRENCY: usize = 32;
 
     // Generate a stream of TenantTimelineId
-    let timelines = tenants.map_ok(|t| stream_tenant_timelines_generic(&remote_client, &target, t));
+    let timelines = tenants.map_ok(|t| stream_tenant_timelines(&remote_client, &target, t));
     let timelines = timelines.try_buffered(CONCURRENCY);
     let timelines = timelines.try_flatten();
 
@@ -143,7 +143,7 @@ pub async fn scan_pageserver_metadata(
         target: &RootTarget,
         ttid: TenantShardTimelineId,
     ) -> anyhow::Result<(TenantShardTimelineId, RemoteTimelineBlobData)> {
-        let data = list_timeline_blobs_generic(remote_client, ttid, target).await?;
+        let data = list_timeline_blobs(remote_client, ttid, target).await?;
         Ok((ttid, data))
     }
     let timelines = timelines.map_ok(|ttid| report_on_timeline(&remote_client, &target, ttid));
diff --git a/storage_scrubber/src/scan_safekeeper_metadata.rs b/storage_scrubber/src/scan_safekeeper_metadata.rs
index f20fa27d13..1a9f3d0ef5 100644
--- a/storage_scrubber/src/scan_safekeeper_metadata.rs
+++ b/storage_scrubber/src/scan_safekeeper_metadata.rs
@@ -14,9 +14,8 @@ use utils::{
 };
 
 use crate::{
-    cloud_admin_api::CloudAdminApiClient, init_remote_generic,
-    metadata_stream::stream_listing_generic, BucketConfig, ConsoleConfig, NodeKind, RootTarget,
-    TenantShardTimelineId,
+    cloud_admin_api::CloudAdminApiClient, init_remote, metadata_stream::stream_listing,
+    BucketConfig, ConsoleConfig, NodeKind, RootTarget, TenantShardTimelineId,
 };
 
 /// Generally we should ask safekeepers, but so far we use everywhere default 16MB.
@@ -107,7 +106,7 @@ pub async fn scan_safekeeper_metadata(
     let timelines = client.query(&query, &[]).await?;
     info!("loaded {} timelines", timelines.len());
 
-    let (remote_client, target) = init_remote_generic(bucket_config, NodeKind::Safekeeper).await?;
+    let (remote_client, target) = init_remote(bucket_config, NodeKind::Safekeeper).await?;
     let console_config = ConsoleConfig::from_env()?;
     let cloud_admin_api_client = CloudAdminApiClient::new(console_config);
 
@@ -193,7 +192,7 @@ async fn check_timeline(
         .strip_prefix("/")
         .unwrap_or(&timeline_dir_target.prefix_in_bucket);
 
-    let mut stream = std::pin::pin!(stream_listing_generic(remote_client, &timeline_dir_target));
+    let mut stream = std::pin::pin!(stream_listing(remote_client, &timeline_dir_target));
     while let Some(obj) = stream.next().await {
         let (key, _obj) = obj?;
 
diff --git a/storage_scrubber/src/tenant_snapshot.rs b/storage_scrubber/src/tenant_snapshot.rs
index fc3a973922..bb4079b5f4 100644
--- a/storage_scrubber/src/tenant_snapshot.rs
+++ b/storage_scrubber/src/tenant_snapshot.rs
@@ -1,10 +1,10 @@
 use std::collections::HashMap;
 use std::sync::Arc;
 
-use crate::checks::{list_timeline_blobs_generic, BlobDataParseResult, RemoteTimelineBlobData};
-use crate::metadata_stream::{stream_tenant_shards, stream_tenant_timelines_generic};
+use crate::checks::{list_timeline_blobs, BlobDataParseResult, RemoteTimelineBlobData};
+use crate::metadata_stream::{stream_tenant_shards, stream_tenant_timelines};
 use crate::{
-    download_object_to_file, init_remote, init_remote_generic, BucketConfig, NodeKind, RootTarget,
+    download_object_to_file_s3, init_remote, init_remote_s3, BucketConfig, NodeKind, RootTarget,
     TenantShardTimelineId,
 };
 use anyhow::Context;
@@ -36,7 +36,8 @@ impl SnapshotDownloader {
         output_path: Utf8PathBuf,
         concurrency: usize,
     ) -> anyhow::Result<Self> {
-        let (s3_client, s3_root) = init_remote(bucket_config.clone(), NodeKind::Pageserver).await?;
+        let (s3_client, s3_root) =
+            init_remote_s3(bucket_config.clone(), NodeKind::Pageserver).await?;
         Ok(Self {
             s3_client,
             s3_root,
@@ -93,7 +94,7 @@ impl SnapshotDownloader {
             let Some(version) = versions.versions.as_ref().and_then(|v| v.first()) else {
                 return Err(anyhow::anyhow!("No versions found for {remote_layer_path}"));
             };
-            download_object_to_file(
+            download_object_to_file_s3(
                 &self.s3_client,
                 &self.bucket_config.bucket,
                 &remote_layer_path,
@@ -218,7 +219,7 @@ impl SnapshotDownloader {
 
     pub async fn download(&self) -> anyhow::Result<()> {
         let (remote_client, target) =
-            init_remote_generic(self.bucket_config.clone(), NodeKind::Pageserver).await?;
+            init_remote(self.bucket_config.clone(), NodeKind::Pageserver).await?;
 
         // Generate a stream of TenantShardId
         let shards = stream_tenant_shards(&remote_client, &target, self.tenant_id).await?;
@@ -239,7 +240,7 @@ impl SnapshotDownloader {
 
         for shard in shards.into_iter().filter(|s| s.shard_count == shard_count) {
             // Generate a stream of TenantTimelineId
-            let timelines = stream_tenant_timelines_generic(&remote_client, &target, shard).await?;
+            let timelines = stream_tenant_timelines(&remote_client, &target, shard).await?;
 
             // Generate a stream of S3TimelineBlobData
             async fn load_timeline_index(
@@ -247,7 +248,7 @@ impl SnapshotDownloader {
                 target: &RootTarget,
                 ttid: TenantShardTimelineId,
             ) -> anyhow::Result<(TenantShardTimelineId, RemoteTimelineBlobData)> {
-                let data = list_timeline_blobs_generic(remote_client, ttid, target).await?;
+                let data = list_timeline_blobs(remote_client, ttid, target).await?;
                 Ok((ttid, data))
             }
             let timelines =

From 4c5a0fdc75c16b007ed9c042d41ec37bae1c0f75 Mon Sep 17 00:00:00 2001
From: Arseny Sher <sher-ars@yandex.ru>
Date: Wed, 7 Aug 2024 19:26:06 +0300
Subject: [PATCH 24/44] safekeeper: check for non-consecutive writes in
 safekeeper.rs

wal_storage.rs already checks this, but since this is quite a legitimate
scenario, check it in safekeeper.rs (at the consensus level) as well.

ref https://github.com/neondatabase/neon/issues/8212
---
 safekeeper/src/safekeeper.rs                  | 27 +++++++++++++++++++
 safekeeper/src/wal_storage.rs                 |  6 +++++
 .../tests/walproposer_sim/safekeeper_disk.rs  |  4 +++
 3 files changed, 37 insertions(+)

diff --git a/safekeeper/src/safekeeper.rs b/safekeeper/src/safekeeper.rs
index 0814d9ba67..9d103887ae 100644
--- a/safekeeper/src/safekeeper.rs
+++ b/safekeeper/src/safekeeper.rs
@@ -875,6 +875,29 @@ where
             return Ok(Some(AcceptorProposerMessage::AppendResponse(resp)));
         }
 
+        // Disallow any non-sequential writes, which can result in gaps or
+        // overwrites. If the write pointer needs to move, the ProposerElected
+        // message should have truncated the WAL accordingly first. Note that
+        // the first condition (WAL rewrite) is quite expected in the real
+        // world: it happens when walproposer reconnects to the safekeeper and
+        // writes some more data while the first connection still delivers
+        // packets later. It might be better not to log this as error! above.
+        let write_lsn = self.wal_store.write_lsn();
+        if write_lsn > msg.h.begin_lsn {
+            bail!(
+                "append request rewrites WAL written before, write_lsn={}, msg lsn={}",
+                write_lsn,
+                msg.h.begin_lsn
+            );
+        }
+        if write_lsn < msg.h.begin_lsn && write_lsn != Lsn(0) {
+            bail!(
+                "append request creates gap in written WAL, write_lsn={}, msg lsn={}",
+                write_lsn,
+                msg.h.begin_lsn,
+            );
+        }
+
         // Now we know that we are in the same term as the proposer,
         // processing the message.
 
@@ -1005,6 +1028,10 @@ mod tests {
 
     #[async_trait::async_trait]
     impl wal_storage::Storage for DummyWalStore {
+        fn write_lsn(&self) -> Lsn {
+            self.lsn
+        }
+
         fn flush_lsn(&self) -> Lsn {
             self.lsn
         }
diff --git a/safekeeper/src/wal_storage.rs b/safekeeper/src/wal_storage.rs
index ded8571a3e..5136bdb9d8 100644
--- a/safekeeper/src/wal_storage.rs
+++ b/safekeeper/src/wal_storage.rs
@@ -37,6 +37,8 @@ use utils::{id::TenantTimelineId, lsn::Lsn};
 
 #[async_trait::async_trait]
 pub trait Storage {
+    /// Last written LSN.
+    fn write_lsn(&self) -> Lsn;
     /// LSN of last durably stored WAL record.
     fn flush_lsn(&self) -> Lsn;
 
@@ -327,6 +329,10 @@ impl PhysicalStorage {
 
 #[async_trait::async_trait]
 impl Storage for PhysicalStorage {
+    // Last written LSN.
+    fn write_lsn(&self) -> Lsn {
+        self.write_lsn
+    }
     /// flush_lsn returns LSN of last durably stored WAL record.
     fn flush_lsn(&self) -> Lsn {
         self.flush_record_lsn
diff --git a/safekeeper/tests/walproposer_sim/safekeeper_disk.rs b/safekeeper/tests/walproposer_sim/safekeeper_disk.rs
index c2db9de78a..be56e86562 100644
--- a/safekeeper/tests/walproposer_sim/safekeeper_disk.rs
+++ b/safekeeper/tests/walproposer_sim/safekeeper_disk.rs
@@ -177,6 +177,10 @@ impl DiskWALStorage {
 
 #[async_trait::async_trait]
 impl wal_storage::Storage for DiskWALStorage {
+    // Last written LSN.
+    fn write_lsn(&self) -> Lsn {
+        self.write_lsn
+    }
     /// LSN of last durably stored WAL record.
     fn flush_lsn(&self) -> Lsn {
         self.flush_record_lsn

From ef57e73fbf4ab4972d07e598d0b1ab3139a5abbf Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Tue, 20 Aug 2024 10:26:44 +0200
Subject: [PATCH 25/44] task_mgr::spawn: require a `TenantId` (#8462)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

… to dis-incentivize global tasks via task_mgr in the future

(As of https://github.com/neondatabase/neon/pull/8339 all remaining
task_mgr usage is tenant or timeline scoped.)
---
 pageserver/src/task_mgr.rs                      | 15 +++++----------
 pageserver/src/tenant.rs                        |  2 +-
 pageserver/src/tenant/remote_timeline_client.rs |  2 +-
 pageserver/src/tenant/tasks.rs                  |  6 +++---
 pageserver/src/tenant/timeline.rs               |  8 ++++----
 pageserver/src/tenant/timeline/delete.rs        |  2 +-
 pageserver/src/tenant/timeline/eviction_task.rs |  2 +-
 7 files changed, 16 insertions(+), 21 deletions(-)

diff --git a/pageserver/src/task_mgr.rs b/pageserver/src/task_mgr.rs
index 5cd78874c1..ed9e001fd2 100644
--- a/pageserver/src/task_mgr.rs
+++ b/pageserver/src/task_mgr.rs
@@ -393,7 +393,7 @@ struct PageServerTask {
 
     /// Tasks may optionally be launched for a particular tenant/timeline, enabling
     /// later cancelling tasks for that tenant/timeline in [`shutdown_tasks`]
-    tenant_shard_id: Option<TenantShardId>,
+    tenant_shard_id: TenantShardId,
     timeline_id: Option<TimelineId>,
 
     mutable: Mutex<MutableTaskState>,
@@ -405,7 +405,7 @@ struct PageServerTask {
 pub fn spawn<F>(
     runtime: &tokio::runtime::Handle,
     kind: TaskKind,
-    tenant_shard_id: Option<TenantShardId>,
+    tenant_shard_id: TenantShardId,
     timeline_id: Option<TimelineId>,
     name: &str,
     future: F,
@@ -550,7 +550,7 @@ pub async fn shutdown_tasks(
         let tasks = TASKS.lock().unwrap();
         for task in tasks.values() {
             if (kind.is_none() || Some(task.kind) == kind)
-                && (tenant_shard_id.is_none() || task.tenant_shard_id == tenant_shard_id)
+                && (tenant_shard_id.is_none() || Some(task.tenant_shard_id) == tenant_shard_id)
                 && (timeline_id.is_none() || task.timeline_id == timeline_id)
             {
                 task.cancel.cancel();
@@ -573,13 +573,8 @@ pub async fn shutdown_tasks(
         };
         if let Some(mut join_handle) = join_handle {
             if log_all {
-                if tenant_shard_id.is_none() {
-                    // there are quite few of these
-                    info!(name = task.name, kind = ?task_kind, "stopping global task");
-                } else {
-                    // warn to catch these in tests; there shouldn't be any
-                    warn!(name = task.name, tenant_shard_id = ?tenant_shard_id, timeline_id = ?timeline_id, kind = ?task_kind, "stopping left-over");
-                }
+                // warn to catch these in tests; there shouldn't be any
+                warn!(name = task.name, tenant_shard_id = ?tenant_shard_id, timeline_id = ?timeline_id, kind = ?task_kind, "stopping left-over");
             }
             if tokio::time::timeout(std::time::Duration::from_secs(1), &mut join_handle)
                 .await
diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index 8ab8d08ce1..65a7504b74 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -798,7 +798,7 @@ impl Tenant {
         task_mgr::spawn(
             &tokio::runtime::Handle::current(),
             TaskKind::Attach,
-            Some(tenant_shard_id),
+            tenant_shard_id,
             None,
             "attach tenant",
             async move {
diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs
index b4d7ad1e97..71b766e4c7 100644
--- a/pageserver/src/tenant/remote_timeline_client.rs
+++ b/pageserver/src/tenant/remote_timeline_client.rs
@@ -1728,7 +1728,7 @@ impl RemoteTimelineClient {
             task_mgr::spawn(
                 &self.runtime,
                 TaskKind::RemoteUploadTask,
-                Some(self.tenant_shard_id),
+                self.tenant_shard_id,
                 Some(self.timeline_id),
                 "remote upload",
                 async move {
diff --git a/pageserver/src/tenant/tasks.rs b/pageserver/src/tenant/tasks.rs
index dbcd704b4e..3972685a8e 100644
--- a/pageserver/src/tenant/tasks.rs
+++ b/pageserver/src/tenant/tasks.rs
@@ -98,7 +98,7 @@ pub fn start_background_loops(
     task_mgr::spawn(
         BACKGROUND_RUNTIME.handle(),
         TaskKind::Compaction,
-        Some(tenant_shard_id),
+        tenant_shard_id,
         None,
         &format!("compactor for tenant {tenant_shard_id}"),
         {
@@ -121,7 +121,7 @@ pub fn start_background_loops(
     task_mgr::spawn(
         BACKGROUND_RUNTIME.handle(),
         TaskKind::GarbageCollector,
-        Some(tenant_shard_id),
+        tenant_shard_id,
         None,
         &format!("garbage collector for tenant {tenant_shard_id}"),
         {
@@ -144,7 +144,7 @@ pub fn start_background_loops(
     task_mgr::spawn(
         BACKGROUND_RUNTIME.handle(),
         TaskKind::IngestHousekeeping,
-        Some(tenant_shard_id),
+        tenant_shard_id,
         None,
         &format!("ingest housekeeping for tenant {tenant_shard_id}"),
         {
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index 26dc87c373..9732cf8b50 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -2281,7 +2281,7 @@ impl Timeline {
         task_mgr::spawn(
             task_mgr::BACKGROUND_RUNTIME.handle(),
             task_mgr::TaskKind::LayerFlushTask,
-            Some(self.tenant_shard_id),
+            self.tenant_shard_id,
             Some(self.timeline_id),
             "layer flush task",
             async move {
@@ -2635,7 +2635,7 @@ impl Timeline {
         task_mgr::spawn(
             task_mgr::BACKGROUND_RUNTIME.handle(),
             task_mgr::TaskKind::InitialLogicalSizeCalculation,
-            Some(self.tenant_shard_id),
+            self.tenant_shard_id,
             Some(self.timeline_id),
             "initial size calculation",
             // NB: don't log errors here, task_mgr will do that.
@@ -2803,7 +2803,7 @@ impl Timeline {
         task_mgr::spawn(
             task_mgr::BACKGROUND_RUNTIME.handle(),
             task_mgr::TaskKind::OndemandLogicalSizeCalculation,
-            Some(self.tenant_shard_id),
+            self.tenant_shard_id,
             Some(self.timeline_id),
             "ondemand logical size calculation",
             async move {
@@ -5162,7 +5162,7 @@ impl Timeline {
         let task_id = task_mgr::spawn(
             task_mgr::BACKGROUND_RUNTIME.handle(),
             task_mgr::TaskKind::DownloadAllRemoteLayers,
-            Some(self.tenant_shard_id),
+            self.tenant_shard_id,
             Some(self.timeline_id),
             "download all remote layers task",
             async move {
diff --git a/pageserver/src/tenant/timeline/delete.rs b/pageserver/src/tenant/timeline/delete.rs
index b03dbb092e..dc4118bb4a 100644
--- a/pageserver/src/tenant/timeline/delete.rs
+++ b/pageserver/src/tenant/timeline/delete.rs
@@ -395,7 +395,7 @@ impl DeleteTimelineFlow {
         task_mgr::spawn(
             task_mgr::BACKGROUND_RUNTIME.handle(),
             TaskKind::TimelineDeletionWorker,
-            Some(tenant_shard_id),
+            tenant_shard_id,
             Some(timeline_id),
             "timeline_delete",
             async move {
diff --git a/pageserver/src/tenant/timeline/eviction_task.rs b/pageserver/src/tenant/timeline/eviction_task.rs
index eaa9c0ff62..2f6cb4d73a 100644
--- a/pageserver/src/tenant/timeline/eviction_task.rs
+++ b/pageserver/src/tenant/timeline/eviction_task.rs
@@ -60,7 +60,7 @@ impl Timeline {
         task_mgr::spawn(
             BACKGROUND_RUNTIME.handle(),
             TaskKind::Eviction,
-            Some(self.tenant_shard_id),
+            self.tenant_shard_id,
             Some(self.timeline_id),
             &format!(
                 "layer eviction for {}/{}",

From c96593b473a22e76ce6dae912177128c3ec21867 Mon Sep 17 00:00:00 2001
From: Alexander Bayandin <alexander@neon.tech>
Date: Tue, 20 Aug 2024 10:46:58 +0100
Subject: [PATCH 26/44] Make Postgres 16 default version (#8745)

## Problem

The default Postgres version is set to 15 in code, while we use 16 in
most of the other places (and Postgres 17 is coming)

## Summary of changes
- Run `benchmarks` job with Postgres 16 (instead of Postgres 14)
- Set `DEFAULT_PG_VERSION` to 16 in all places
- Remove deprecated `--pg-version` pytest argument
- Update `test_metadata_bincode_serde_ensure_roundtrip` for Postgres 16
---
 .github/actions/run-python-test-set/action.yml |  2 +-
 .github/workflows/build_and_test.yml           |  1 +
 README.md                                      |  2 +-
 control_plane/src/bin/neon_local.rs            |  2 +-
 control_plane/src/local_env.rs                 |  2 +-
 pageserver/src/lib.rs                          |  2 +-
 pageserver/src/tenant/metadata.rs              |  4 ++--
 scripts/ps_ec2_setup_instance_store            |  2 +-
 test_runner/README.md                          |  3 +--
 test_runner/fixtures/pg_version.py             | 18 +-----------------
 test_runner/performance/README.md              |  2 +-
 test_runner/performance/pageserver/README.md   |  2 +-
 .../interactive/test_many_small_tenants.py     |  2 +-
 test_runner/regress/test_compatibility.py      |  4 ++--
 14 files changed, 16 insertions(+), 32 deletions(-)

diff --git a/.github/actions/run-python-test-set/action.yml b/.github/actions/run-python-test-set/action.yml
index 4ccf190c6a..814067fb8e 100644
--- a/.github/actions/run-python-test-set/action.yml
+++ b/.github/actions/run-python-test-set/action.yml
@@ -43,7 +43,7 @@ inputs:
   pg_version:
     description: 'Postgres version to use for tests'
     required: false
-    default: 'v14'
+    default: 'v16'
   benchmark_durations:
     description: 'benchmark durations JSON'
     required: false
diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml
index 92fff4ffbc..715f1af656 100644
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -280,6 +280,7 @@ jobs:
           save_perf_report: ${{ github.ref_name == 'main' }}
           extra_params: --splits 5 --group ${{ matrix.pytest_split_group }}
           benchmark_durations: ${{ needs.get-benchmarks-durations.outputs.json }}
+          pg_version: v16
         env:
           VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
           PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
diff --git a/README.md b/README.md
index ea0a289502..f01442da5d 100644
--- a/README.md
+++ b/README.md
@@ -262,7 +262,7 @@ By default, this runs both debug and release modes, and all supported postgres v
 testing locally, it is convenient to run just one set of permutations, like this:
 
 ```sh
-DEFAULT_PG_VERSION=15 BUILD_TYPE=release ./scripts/pytest
+DEFAULT_PG_VERSION=16 BUILD_TYPE=release ./scripts/pytest
 ```
 
 ## Flamegraphs
diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs
index edd88dc71c..1d66532d49 100644
--- a/control_plane/src/bin/neon_local.rs
+++ b/control_plane/src/bin/neon_local.rs
@@ -54,7 +54,7 @@ const DEFAULT_PAGESERVER_ID: NodeId = NodeId(1);
 const DEFAULT_BRANCH_NAME: &str = "main";
 project_git_version!(GIT_VERSION);
 
-const DEFAULT_PG_VERSION: &str = "15";
+const DEFAULT_PG_VERSION: &str = "16";
 
 const DEFAULT_PAGESERVER_CONTROL_PLANE_API: &str = "http://127.0.0.1:1234/upcall/v1/";
 
diff --git a/control_plane/src/local_env.rs b/control_plane/src/local_env.rs
index 807519c88d..74caba2b56 100644
--- a/control_plane/src/local_env.rs
+++ b/control_plane/src/local_env.rs
@@ -27,7 +27,7 @@ use crate::pageserver::PageServerNode;
 use crate::pageserver::PAGESERVER_REMOTE_STORAGE_DIR;
 use crate::safekeeper::SafekeeperNode;
 
-pub const DEFAULT_PG_VERSION: u32 = 15;
+pub const DEFAULT_PG_VERSION: u32 = 16;
 
 //
 // This data structures represents neon_local CLI config
diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs
index 5aee13cfc6..5829a1c188 100644
--- a/pageserver/src/lib.rs
+++ b/pageserver/src/lib.rs
@@ -49,7 +49,7 @@ use tracing::{info, info_span};
 /// backwards-compatible changes to the metadata format.
 pub const STORAGE_FORMAT_VERSION: u16 = 3;
 
-pub const DEFAULT_PG_VERSION: u32 = 15;
+pub const DEFAULT_PG_VERSION: u32 = 16;
 
 // Magic constants used to identify different kinds of files
 pub const IMAGE_FILE_MAGIC: u16 = 0x5A60;
diff --git a/pageserver/src/tenant/metadata.rs b/pageserver/src/tenant/metadata.rs
index 6073abc8c3..190316df42 100644
--- a/pageserver/src/tenant/metadata.rs
+++ b/pageserver/src/tenant/metadata.rs
@@ -565,7 +565,7 @@ mod tests {
         );
         let expected_bytes = vec![
             /* TimelineMetadataHeader */
-            4, 37, 101, 34, 0, 70, 0, 4, // checksum, size, format_version (4 + 2 + 2)
+            74, 104, 158, 105, 0, 70, 0, 4, // checksum, size, format_version (4 + 2 + 2)
             /* TimelineMetadataBodyV2 */
             0, 0, 0, 0, 0, 0, 2, 0, // disk_consistent_lsn (8 bytes)
             1, 0, 0, 0, 0, 0, 0, 1, 0, // prev_record_lsn (9 bytes)
@@ -574,7 +574,7 @@ mod tests {
             0, 0, 0, 0, 0, 0, 0, 0, // ancestor_lsn (8 bytes)
             0, 0, 0, 0, 0, 0, 0, 0, // latest_gc_cutoff_lsn (8 bytes)
             0, 0, 0, 0, 0, 0, 0, 0, // initdb_lsn (8 bytes)
-            0, 0, 0, 15, // pg_version (4 bytes)
+            0, 0, 0, 16, // pg_version (4 bytes)
             /* padding bytes */
             0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
             0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
diff --git a/scripts/ps_ec2_setup_instance_store b/scripts/ps_ec2_setup_instance_store
index 1f88f252eb..7c383e322f 100755
--- a/scripts/ps_ec2_setup_instance_store
+++ b/scripts/ps_ec2_setup_instance_store
@@ -44,7 +44,7 @@ run the following commands from the top of the neon.git checkout
 
     # test suite run
     export TEST_OUTPUT="$TEST_OUTPUT"
-    DEFAULT_PG_VERSION=15 BUILD_TYPE=release ./scripts/pytest test_runner/performance/test_latency.py
+    DEFAULT_PG_VERSION=16 BUILD_TYPE=release ./scripts/pytest test_runner/performance/test_latency.py
 
     # for interactive use
     export NEON_REPO_DIR="$NEON_REPO_DIR"
diff --git a/test_runner/README.md b/test_runner/README.md
index e2f26a19ce..73aa29d4bb 100644
--- a/test_runner/README.md
+++ b/test_runner/README.md
@@ -71,8 +71,7 @@ a subdirectory for each version with naming convention `v{PG_VERSION}/`.
 Inside that dir, a `bin/postgres` binary should be present.
 `DEFAULT_PG_VERSION`: The version of Postgres to use,
 This is used to construct full path to the postgres binaries.
-Format is 2-digit major version nubmer, i.e. `DEFAULT_PG_VERSION="14"`. Alternatively,
-you can use `--pg-version` argument.
+Format is 2-digit major version number, e.g. `DEFAULT_PG_VERSION=16`.
 `TEST_OUTPUT`: Set the directory where test state and test output files
 should go.
 `TEST_SHARED_FIXTURES`: Try to re-use a single pageserver for all the tests.
diff --git a/test_runner/fixtures/pg_version.py b/test_runner/fixtures/pg_version.py
index 941889a2f5..e12c8e5f4a 100644
--- a/test_runner/fixtures/pg_version.py
+++ b/test_runner/fixtures/pg_version.py
@@ -3,8 +3,6 @@ import os
 from typing import Optional
 
 import pytest
-from _pytest.config import Config
-from _pytest.config.argparsing import Parser
 
 """
 This fixture is used to determine which version of Postgres to use for tests.
@@ -52,7 +50,7 @@ class PgVersion(str, enum.Enum):
         return None
 
 
-DEFAULT_VERSION: PgVersion = PgVersion.V15
+DEFAULT_VERSION: PgVersion = PgVersion.V16
 
 
 def skip_on_postgres(version: PgVersion, reason: str):
@@ -69,22 +67,8 @@ def xfail_on_postgres(version: PgVersion, reason: str):
     )
 
 
-def pytest_addoption(parser: Parser):
-    parser.addoption(
-        "--pg-version",
-        action="store",
-        type=PgVersion,
-        help="DEPRECATED: Postgres version to use for tests",
-    )
-
-
 def run_only_on_default_postgres(reason: str):
     return pytest.mark.skipif(
         PgVersion(os.environ.get("DEFAULT_PG_VERSION", DEFAULT_VERSION)) is not DEFAULT_VERSION,
         reason=reason,
     )
-
-
-def pytest_configure(config: Config):
-    if config.getoption("--pg-version"):
-        raise Exception("--pg-version is deprecated, use DEFAULT_PG_VERSION env var instead")
diff --git a/test_runner/performance/README.md b/test_runner/performance/README.md
index 7ad65821d4..70d75a6dcf 100644
--- a/test_runner/performance/README.md
+++ b/test_runner/performance/README.md
@@ -7,7 +7,7 @@ easier to see if you have compile errors without scrolling up.
 You may also need to run `./scripts/pysync`.
 
 Then run the tests
-`DEFAULT_PG_VERSION=15 NEON_BIN=./target/release poetry run pytest test_runner/performance`
+`DEFAULT_PG_VERSION=16 NEON_BIN=./target/release poetry run pytest test_runner/performance`
 
 Some handy pytest flags for local development:
 - `-x` tells pytest to stop on first error
diff --git a/test_runner/performance/pageserver/README.md b/test_runner/performance/pageserver/README.md
index fdd09cd946..56ffad9963 100644
--- a/test_runner/performance/pageserver/README.md
+++ b/test_runner/performance/pageserver/README.md
@@ -11,6 +11,6 @@ It supports mounting snapshots using overlayfs, which improves iteration time.
 Here's a full command line.
 
 ```
-RUST_BACKTRACE=1 NEON_ENV_BUILDER_USE_OVERLAYFS_FOR_SNAPSHOTS=1 DEFAULT_PG_VERSION=15 BUILD_TYPE=release \
+RUST_BACKTRACE=1 NEON_ENV_BUILDER_USE_OVERLAYFS_FOR_SNAPSHOTS=1 DEFAULT_PG_VERSION=16 BUILD_TYPE=release \
     ./scripts/pytest test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py
 ````
diff --git a/test_runner/performance/pageserver/interactive/test_many_small_tenants.py b/test_runner/performance/pageserver/interactive/test_many_small_tenants.py
index 33848b06d3..8d781c1609 100644
--- a/test_runner/performance/pageserver/interactive/test_many_small_tenants.py
+++ b/test_runner/performance/pageserver/interactive/test_many_small_tenants.py
@@ -14,7 +14,7 @@ from performance.pageserver.util import ensure_pageserver_ready_for_benchmarking
 
 """
 Usage:
-DEFAULT_PG_VERSION=15 BUILD_TYPE=debug NEON_ENV_BUILDER_USE_OVERLAYFS_FOR_SNAPSHOTS=1 INTERACTIVE=true \
+DEFAULT_PG_VERSION=16 BUILD_TYPE=debug NEON_ENV_BUILDER_USE_OVERLAYFS_FOR_SNAPSHOTS=1 INTERACTIVE=true \
     ./scripts/pytest --timeout 0 test_runner/performance/pageserver/interactive/test_many_small_tenants.py
 """
 
diff --git a/test_runner/regress/test_compatibility.py b/test_runner/regress/test_compatibility.py
index afa5f6873c..30ff40b7df 100644
--- a/test_runner/regress/test_compatibility.py
+++ b/test_runner/regress/test_compatibility.py
@@ -39,7 +39,7 @@ from fixtures.workload import Workload
 #
 # How to run `test_backward_compatibility` locally:
 #
-#    export DEFAULT_PG_VERSION=15
+#    export DEFAULT_PG_VERSION=16
 #    export BUILD_TYPE=release
 #    export CHECK_ONDISK_DATA_COMPATIBILITY=true
 #    export COMPATIBILITY_SNAPSHOT_DIR=test_output/compatibility_snapshot_pgv${DEFAULT_PG_VERSION}
@@ -61,7 +61,7 @@ from fixtures.workload import Workload
 #
 # How to run `test_forward_compatibility` locally:
 #
-#    export DEFAULT_PG_VERSION=15
+#    export DEFAULT_PG_VERSION=16
 #    export BUILD_TYPE=release
 #    export CHECK_ONDISK_DATA_COMPATIBILITY=true
 #    export COMPATIBILITY_NEON_BIN=neon_previous/target/${BUILD_TYPE}

From 02a28c01ca1083e4fb2fc28b2db761220c161ff7 Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Tue, 20 Aug 2024 12:34:53 +0100
Subject: [PATCH 27/44] Revert "safekeeper: check for non-consecutive writes in
 safekeeper.rs" (#8771)

Reverts neondatabase/neon#8640

This broke `test_last_log_term_switch` via a merge race of some kind.
---
 safekeeper/src/safekeeper.rs                  | 27 -------------------
 safekeeper/src/wal_storage.rs                 |  6 -----
 .../tests/walproposer_sim/safekeeper_disk.rs  |  4 ---
 3 files changed, 37 deletions(-)

diff --git a/safekeeper/src/safekeeper.rs b/safekeeper/src/safekeeper.rs
index 9d103887ae..0814d9ba67 100644
--- a/safekeeper/src/safekeeper.rs
+++ b/safekeeper/src/safekeeper.rs
@@ -875,29 +875,6 @@ where
             return Ok(Some(AcceptorProposerMessage::AppendResponse(resp)));
         }
 
-        // Disallow any non-sequential writes, which can result in gaps or
-        // overwrites. If we need to move the pointer, ProposerElected message
-        // should have truncated WAL first accordingly. Note that the first
-        // condition (WAL rewrite) is quite expected in real world; it happens
-        // when walproposer reconnects to safekeeper and writes some more data
-        // while first connection still gets some packets later. It might be
-        // better to not log this as error! above.
-        let write_lsn = self.wal_store.write_lsn();
-        if write_lsn > msg.h.begin_lsn {
-            bail!(
-                "append request rewrites WAL written before, write_lsn={}, msg lsn={}",
-                write_lsn,
-                msg.h.begin_lsn
-            );
-        }
-        if write_lsn < msg.h.begin_lsn && write_lsn != Lsn(0) {
-            bail!(
-                "append request creates gap in written WAL, write_lsn={}, msg lsn={}",
-                write_lsn,
-                msg.h.begin_lsn,
-            );
-        }
-
         // Now we know that we are in the same term as the proposer,
         // processing the message.
 
@@ -1028,10 +1005,6 @@ mod tests {
 
     #[async_trait::async_trait]
     impl wal_storage::Storage for DummyWalStore {
-        fn write_lsn(&self) -> Lsn {
-            self.lsn
-        }
-
         fn flush_lsn(&self) -> Lsn {
             self.lsn
         }
diff --git a/safekeeper/src/wal_storage.rs b/safekeeper/src/wal_storage.rs
index 5136bdb9d8..ded8571a3e 100644
--- a/safekeeper/src/wal_storage.rs
+++ b/safekeeper/src/wal_storage.rs
@@ -37,8 +37,6 @@ use utils::{id::TenantTimelineId, lsn::Lsn};
 
 #[async_trait::async_trait]
 pub trait Storage {
-    // Last written LSN.
-    fn write_lsn(&self) -> Lsn;
     /// LSN of last durably stored WAL record.
     fn flush_lsn(&self) -> Lsn;
 
@@ -329,10 +327,6 @@ impl PhysicalStorage {
 
 #[async_trait::async_trait]
 impl Storage for PhysicalStorage {
-    // Last written LSN.
-    fn write_lsn(&self) -> Lsn {
-        self.write_lsn
-    }
     /// flush_lsn returns LSN of last durably stored WAL record.
     fn flush_lsn(&self) -> Lsn {
         self.flush_record_lsn
diff --git a/safekeeper/tests/walproposer_sim/safekeeper_disk.rs b/safekeeper/tests/walproposer_sim/safekeeper_disk.rs
index be56e86562..c2db9de78a 100644
--- a/safekeeper/tests/walproposer_sim/safekeeper_disk.rs
+++ b/safekeeper/tests/walproposer_sim/safekeeper_disk.rs
@@ -177,10 +177,6 @@ impl DiskWALStorage {
 
 #[async_trait::async_trait]
 impl wal_storage::Storage for DiskWALStorage {
-    // Last written LSN.
-    fn write_lsn(&self) -> Lsn {
-        self.write_lsn
-    }
     /// LSN of last durably stored WAL record.
     fn flush_lsn(&self) -> Lsn {
         self.flush_record_lsn

From 1c96957e85700eaa0333fb0230f5135e7a982d1e Mon Sep 17 00:00:00 2001
From: Vlad Lazar <vlad@neon.tech>
Date: Tue, 20 Aug 2024 14:00:36 +0100
Subject: [PATCH 28/44] storcon: run db migrations after step down sequence
 (#8756)

## Problem

Previously, we would run db migrations before doing the step-down
sequence. This meant that the current leader would have to deal with
the schema changes and that's generally not safe.

## Summary of changes

Push the step-down procedure earlier in start-up and
do db migrations right after it (but before we load the in-memory
state from the db).

Epic: https://github.com/neondatabase/cloud/issues/14701
---
 control_plane/src/storage_controller.rs |   3 +-
 storage_controller/src/leadership.rs    | 136 ++++++++++++++++++++++++
 storage_controller/src/lib.rs           |   1 +
 storage_controller/src/main.rs          |  24 +----
 storage_controller/src/metrics.rs       |   2 +
 storage_controller/src/persistence.rs   |  18 ++++
 storage_controller/src/service.rs       | 102 ++++--------------
 7 files changed, 180 insertions(+), 106 deletions(-)
 create mode 100644 storage_controller/src/leadership.rs

diff --git a/control_plane/src/storage_controller.rs b/control_plane/src/storage_controller.rs
index 2c077595a1..f6539ad5b0 100644
--- a/control_plane/src/storage_controller.rs
+++ b/control_plane/src/storage_controller.rs
@@ -217,7 +217,7 @@ impl StorageController {
         Ok(exitcode.success())
     }
 
-    /// Create our database if it doesn't exist, and run migrations.
+    /// Create our database if it doesn't exist
     ///
     /// This function is equivalent to the `diesel setup` command in the diesel CLI.  We implement
     /// the same steps by hand to avoid imposing a dependency on installing diesel-cli for developers
@@ -382,7 +382,6 @@ impl StorageController {
             )
             .await?;
 
-            // Run migrations on every startup, in case something changed.
             self.setup_database(postgres_port).await?;
         }
 
diff --git a/storage_controller/src/leadership.rs b/storage_controller/src/leadership.rs
new file mode 100644
index 0000000000..a171bab451
--- /dev/null
+++ b/storage_controller/src/leadership.rs
@@ -0,0 +1,136 @@
+use std::sync::Arc;
+
+use hyper::Uri;
+use tokio_util::sync::CancellationToken;
+
+use crate::{
+    peer_client::{GlobalObservedState, PeerClient},
+    persistence::{ControllerPersistence, DatabaseError, DatabaseResult, Persistence},
+    service::Config,
+};
+
+/// Helper for storage controller leadership acquisition
+pub(crate) struct Leadership {
+    persistence: Arc<Persistence>,
+    config: Config,
+    cancel: CancellationToken,
+}
+
+#[derive(thiserror::Error, Debug)]
+pub(crate) enum Error {
+    #[error(transparent)]
+    Database(#[from] DatabaseError),
+}
+
+pub(crate) type Result<T> = std::result::Result<T, Error>;
+
+impl Leadership {
+    pub(crate) fn new(
+        persistence: Arc<Persistence>,
+        config: Config,
+        cancel: CancellationToken,
+    ) -> Self {
+        Self {
+            persistence,
+            config,
+            cancel,
+        }
+    }
+
+    /// Find the current leader in the database and request it to step down if required.
+    /// Should be called early on within the start-up sequence.
+    ///
+    /// Returns a tuple of two optionals: the current leader and its observed state
+    pub(crate) async fn step_down_current_leader(
+        &self,
+    ) -> Result<(Option<ControllerPersistence>, Option<GlobalObservedState>)> {
+        let leader = self.current_leader().await?;
+        let leader_step_down_state = if let Some(ref leader) = leader {
+            if self.config.start_as_candidate {
+                self.request_step_down(leader).await
+            } else {
+                None
+            }
+        } else {
+            tracing::info!("No leader found to request step down from. Will build observed state.");
+            None
+        };
+
+        Ok((leader, leader_step_down_state))
+    }
+
+    /// Mark the current storage controller instance as the leader in the database
+    pub(crate) async fn become_leader(
+        &self,
+        current_leader: Option<ControllerPersistence>,
+    ) -> Result<()> {
+        if let Some(address_for_peers) = &self.config.address_for_peers {
+            // TODO: `address-for-peers` can become a mandatory cli arg
+            // after we update the k8s setup
+            let proposed_leader = ControllerPersistence {
+                address: address_for_peers.to_string(),
+                started_at: chrono::Utc::now(),
+            };
+
+            self.persistence
+                .update_leader(current_leader, proposed_leader)
+                .await
+                .map_err(Error::Database)
+        } else {
+            tracing::info!("No address-for-peers provided. Skipping leader persistence.");
+            Ok(())
+        }
+    }
+
+    async fn current_leader(&self) -> DatabaseResult<Option<ControllerPersistence>> {
+        let res = self.persistence.get_leader().await;
+        if let Err(DatabaseError::Query(diesel::result::Error::DatabaseError(_kind, ref err))) = res
+        {
+            const REL_NOT_FOUND_MSG: &str = "relation \"controllers\" does not exist";
+            if err.message().trim() == REL_NOT_FOUND_MSG {
+                // Special case: if this is a brand new storage controller, migrations will not
+                // have run at this point yet, and, hence, the controllers table does not exist.
+                // Detect this case via the error string (diesel doesn't type it) and allow it.
+                tracing::info!("Detected first storage controller start-up. Allowing missing controllers table ...");
+                return Ok(None);
+            }
+        }
+
+        res
+    }
+
+    /// Request step down from the currently registered leader in the database
+    ///
+    /// On success, returns the observed state reported by the leader that
+    /// was asked to step down. If the request fails, the error is logged
+    /// and None is returned rather than an error.
+    async fn request_step_down(
+        &self,
+        leader: &ControllerPersistence,
+    ) -> Option<GlobalObservedState> {
+        tracing::info!("Sending step down request to {leader:?}");
+
+        // TODO: jwt token
+        let client = PeerClient::new(
+            Uri::try_from(leader.address.as_str()).expect("Failed to build leader URI"),
+            self.config.jwt_token.clone(),
+        );
+        let state = client.step_down(&self.cancel).await;
+        match state {
+            Ok(state) => Some(state),
+            Err(err) => {
+                // TODO: Make leaders periodically update a timestamp field in the
+                // database and, if the leader is not reachable from the current instance,
+                // but inferred as alive from the timestamp, abort start-up. This avoids
+                // a potential scenario in which we have two controllers acting as leaders.
+                tracing::error!(
+                    "Leader ({}) did not respond to step-down request: {}",
+                    leader.address,
+                    err
+                );
+
+                None
+            }
+        }
+    }
+}
diff --git a/storage_controller/src/lib.rs b/storage_controller/src/lib.rs
index 2034addbe1..60e613bb5c 100644
--- a/storage_controller/src/lib.rs
+++ b/storage_controller/src/lib.rs
@@ -8,6 +8,7 @@ mod drain_utils;
 mod heartbeater;
 pub mod http;
 mod id_lock_map;
+mod leadership;
 pub mod metrics;
 mod node;
 mod pageserver_client;
diff --git a/storage_controller/src/main.rs b/storage_controller/src/main.rs
index 7387d36690..17685b1140 100644
--- a/storage_controller/src/main.rs
+++ b/storage_controller/src/main.rs
@@ -1,6 +1,5 @@
 use anyhow::{anyhow, Context};
 use clap::Parser;
-use diesel::Connection;
 use hyper::Uri;
 use metrics::launch_timestamp::LaunchTimestamp;
 use metrics::BuildInfo;
@@ -27,9 +26,6 @@ use utils::{project_build_tag, project_git_version, tcp_listener};
 project_git_version!(GIT_VERSION);
 project_build_tag!(BUILD_TAG);
 
-use diesel_migrations::{embed_migrations, EmbeddedMigrations};
-pub const MIGRATIONS: EmbeddedMigrations = embed_migrations!("./migrations");
-
 #[derive(Parser)]
 #[command(author, version, about, long_about = None)]
 #[command(arg_required_else_help(true))]
@@ -181,20 +177,6 @@ impl Secrets {
     }
 }
 
-/// Execute the diesel migrations that are built into this binary
-async fn migration_run(database_url: &str) -> anyhow::Result<()> {
-    use diesel::PgConnection;
-    use diesel_migrations::{HarnessWithOutput, MigrationHarness};
-    let mut conn = PgConnection::establish(database_url)?;
-
-    HarnessWithOutput::write_to_stdout(&mut conn)
-        .run_pending_migrations(MIGRATIONS)
-        .map(|_| ())
-        .map_err(|e| anyhow::anyhow!(e))?;
-
-    Ok(())
-}
-
 fn main() -> anyhow::Result<()> {
     logging::init(
         LogFormat::Plain,
@@ -304,13 +286,9 @@ async fn async_main() -> anyhow::Result<()> {
         http_service_port: args.listen.port() as i32,
     };
 
-    // After loading secrets & config, but before starting anything else, apply database migrations
+    // Validate that we can connect to the database
     Persistence::await_connection(&secrets.database_url, args.db_connect_timeout.into()).await?;
 
-    migration_run(&secrets.database_url)
-        .await
-        .context("Running database migrations")?;
-
     let persistence = Arc::new(Persistence::new(secrets.database_url));
 
     let service = Service::spawn(config, persistence.clone()).await?;
diff --git a/storage_controller/src/metrics.rs b/storage_controller/src/metrics.rs
index c2303e7a7f..5cfcfb4b1f 100644
--- a/storage_controller/src/metrics.rs
+++ b/storage_controller/src/metrics.rs
@@ -230,6 +230,7 @@ pub(crate) enum DatabaseErrorLabel {
     Connection,
     ConnectionPool,
     Logical,
+    Migration,
 }
 
 impl DatabaseError {
@@ -239,6 +240,7 @@ impl DatabaseError {
             Self::Connection(_) => DatabaseErrorLabel::Connection,
             Self::ConnectionPool(_) => DatabaseErrorLabel::ConnectionPool,
             Self::Logical(_) => DatabaseErrorLabel::Logical,
+            Self::Migration(_) => DatabaseErrorLabel::Migration,
         }
     }
 }
diff --git a/storage_controller/src/persistence.rs b/storage_controller/src/persistence.rs
index aebbdec0d1..16df19026c 100644
--- a/storage_controller/src/persistence.rs
+++ b/storage_controller/src/persistence.rs
@@ -25,6 +25,9 @@ use crate::metrics::{
 };
 use crate::node::Node;
 
+use diesel_migrations::{embed_migrations, EmbeddedMigrations};
+const MIGRATIONS: EmbeddedMigrations = embed_migrations!("./migrations");
+
 /// ## What do we store?
 ///
 /// The storage controller service does not store most of its state durably.
@@ -72,6 +75,8 @@ pub(crate) enum DatabaseError {
     ConnectionPool(#[from] r2d2::Error),
     #[error("Logical error: {0}")]
     Logical(String),
+    #[error("Migration error: {0}")]
+    Migration(String),
 }
 
 #[derive(measured::FixedCardinalityLabel, Copy, Clone)]
@@ -167,6 +172,19 @@ impl Persistence {
         }
     }
 
+    /// Execute the diesel migrations that are built into this binary
+    pub(crate) async fn migration_run(&self) -> DatabaseResult<()> {
+        use diesel_migrations::{HarnessWithOutput, MigrationHarness};
+
+        self.with_conn(move |conn| -> DatabaseResult<()> {
+            HarnessWithOutput::write_to_stdout(conn)
+                .run_pending_migrations(MIGRATIONS)
+                .map(|_| ())
+                .map_err(|e| DatabaseError::Migration(e.to_string()))
+        })
+        .await
+    }
+
     /// Wraps `with_conn` in order to collect latency and error metrics
     async fn with_measured_conn<F, R>(&self, op: DatabaseOperation, func: F) -> DatabaseResult<R>
     where
diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs
index 3459b44774..780f4a7ee5 100644
--- a/storage_controller/src/service.rs
+++ b/storage_controller/src/service.rs
@@ -17,8 +17,9 @@ use crate::{
     compute_hook::NotifyError,
     drain_utils::{self, TenantShardDrain, TenantShardIterator},
     id_lock_map::{trace_exclusive_lock, trace_shared_lock, IdLockMap, TracingExclusiveGuard},
+    leadership::Leadership,
     metrics,
-    peer_client::{GlobalObservedState, PeerClient},
+    peer_client::GlobalObservedState,
     persistence::{
         AbortShardSplitStatus, ControllerPersistence, DatabaseResult, MetadataHealthPersistence,
         TenantFilter,
@@ -333,7 +334,7 @@ impl From<DatabaseError> for ApiError {
             DatabaseError::Connection(_) | DatabaseError::ConnectionPool(_) => {
                 ApiError::ShuttingDown
             }
-            DatabaseError::Logical(reason) => {
+            DatabaseError::Logical(reason) | DatabaseError::Migration(reason) => {
                 ApiError::InternalServerError(anyhow::anyhow!(reason))
             }
         }
@@ -606,22 +607,15 @@ impl Service {
 
         // Before making any obeservable changes to the cluster, persist self
         // as leader in database and memory.
-        if let Some(address_for_peers) = &self.config.address_for_peers {
-            // TODO: `address-for-peers` can become a mandatory cli arg
-            // after we update the k8s setup
-            let proposed_leader = ControllerPersistence {
-                address: address_for_peers.to_string(),
-                started_at: chrono::Utc::now(),
-            };
+        let leadership = Leadership::new(
+            self.persistence.clone(),
+            self.config.clone(),
+            self.cancel.child_token(),
+        );
 
-            if let Err(err) = self
-                .persistence
-                .update_leader(current_leader, proposed_leader)
-                .await
-            {
-                tracing::error!("Failed to persist self as leader: {err}. Aborting start-up ...");
-                std::process::exit(1);
-            }
+        if let Err(e) = leadership.become_leader(current_leader).await {
+            tracing::error!("Failed to persist self as leader: {e}. Aborting start-up ...");
+            std::process::exit(1);
         }
 
         self.inner.write().unwrap().become_leader();
@@ -1159,6 +1153,16 @@ impl Service {
         let (result_tx, result_rx) = tokio::sync::mpsc::unbounded_channel();
         let (abort_tx, abort_rx) = tokio::sync::mpsc::unbounded_channel();
 
+        let leadership_cancel = CancellationToken::new();
+        let leadership = Leadership::new(persistence.clone(), config.clone(), leadership_cancel);
+        let (leader, leader_step_down_state) = leadership.step_down_current_leader().await?;
+
+        // Apply the migrations **after** the current leader has stepped down
+        // (or we've given up waiting for it), but **before** reading from the
+        // database. The only exception is reading the current leader before
+        // migrating.
+        persistence.migration_run().await?;
+
         tracing::info!("Loading nodes from database...");
         let nodes = persistence
             .list_nodes()
@@ -1376,32 +1380,6 @@ impl Service {
                     return;
                 };
 
-                let leadership_status = this.inner.read().unwrap().get_leadership_status();
-                let leader = match this.get_leader().await {
-                    Ok(ok) => ok,
-                    Err(err) => {
-                        tracing::error!(
-                            "Failed to query database for current leader: {err}. Aborting start-up ..."
-                        );
-                        std::process::exit(1);
-                    }
-                };
-
-                let leader_step_down_state = match leadership_status {
-                    LeadershipStatus::Candidate => {
-                        if let Some(ref leader) = leader {
-                            this.request_step_down(leader).await
-                        } else {
-                            tracing::info!(
-                                "No leader found to request step down from. Will build observed state."
-                            );
-                            None
-                        }
-                    }
-                    LeadershipStatus::Leader => None,
-                    LeadershipStatus::SteppedDown => unreachable!(),
-                };
-
                 this.startup_reconcile(leader, leader_step_down_state, bg_compute_notify_result_tx)
                     .await;
 
@@ -6377,42 +6355,4 @@ impl Service {
 
         global_observed
     }
-
-    /// Request step down from the currently registered leader in the database
-    ///
-    /// If such an entry is persisted, the success path returns the observed
-    /// state and details of the leader. Otherwise, None is returned indicating
-    /// there is no leader currently.
-    ///
-    /// On failures to query the database or step down error responses the process is killed
-    /// and we rely on k8s to retry.
-    async fn request_step_down(
-        &self,
-        leader: &ControllerPersistence,
-    ) -> Option<GlobalObservedState> {
-        tracing::info!("Sending step down request to {leader:?}");
-
-        // TODO: jwt token
-        let client = PeerClient::new(
-            Uri::try_from(leader.address.as_str()).expect("Failed to build leader URI"),
-            self.config.jwt_token.clone(),
-        );
-        let state = client.step_down(&self.cancel).await;
-        match state {
-            Ok(state) => Some(state),
-            Err(err) => {
-                // TODO: Make leaders periodically update a timestamp field in the
-                // database and, if the leader is not reachable from the current instance,
-                // but inferred as alive from the timestamp, abort start-up. This avoids
-                // a potential scenario in which we have two controllers acting as leaders.
-                tracing::error!(
-                    "Leader ({}) did not respond to step-down request: {}",
-                    leader.address,
-                    err
-                );
-
-                None
-            }
-        }
-    }
 }

From 0170611a97fc233f4e3bcc56648a77fb3cf33a2c Mon Sep 17 00:00:00 2001
From: Conrad Ludgate <conrad@neon.tech>
Date: Tue, 20 Aug 2024 14:16:27 +0100
Subject: [PATCH 29/44] proxy: small changes (#8752)

## Problem

#8736 is getting too big. Splitting off some simple changes here.

## Summary of changes

Local proxy won't always be using TLS, so make it optional. Local proxy
won't be using WebSockets for now, so make them optional. Remove a dead config var.
---
 proxy/src/bin/proxy.rs                |  4 +-
 proxy/src/config.rs                   |  1 +
 proxy/src/serverless.rs               | 59 ++++++++++++++++++++-------
 proxy/src/serverless/conn_pool.rs     |  1 +
 proxy/src/serverless/sql_over_http.rs | 26 ++++++++----
 5 files changed, 65 insertions(+), 26 deletions(-)

diff --git a/proxy/src/bin/proxy.rs b/proxy/src/bin/proxy.rs
index b44e0ddd2f..d83a1f3bcf 100644
--- a/proxy/src/bin/proxy.rs
+++ b/proxy/src/bin/proxy.rs
@@ -173,9 +173,6 @@ struct ProxyCliArgs {
     /// cache for `role_secret` (use `size=0` to disable)
     #[clap(long, default_value = config::CacheOptions::CACHE_DEFAULT_OPTIONS)]
     role_secret_cache: String,
-    /// disable ip check for http requests. If it is too time consuming, it could be turned off.
-    #[clap(long, default_value_t = false, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)]
-    disable_ip_check_for_http: bool,
     /// redis url for notifications (if empty, redis_host:port will be used for both notifications and streaming connections)
     #[clap(long)]
     redis_notifications: Option<String>,
@@ -661,6 +658,7 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> {
     )?;
 
     let http_config = HttpConfig {
+        accept_websockets: true,
         pool_options: GlobalConnPoolOptions {
             max_conns_per_endpoint: args.sql_over_http.sql_over_http_pool_max_conns_per_endpoint,
             gc_epoch: args.sql_over_http.sql_over_http_pool_gc_epoch,
diff --git a/proxy/src/config.rs b/proxy/src/config.rs
index 36d04924f2..a280aa88ce 100644
--- a/proxy/src/config.rs
+++ b/proxy/src/config.rs
@@ -52,6 +52,7 @@ pub struct TlsConfig {
 }
 
 pub struct HttpConfig {
+    pub accept_websockets: bool,
     pub pool_options: GlobalConnPoolOptions,
     pub cancel_set: CancelSet,
     pub client_conn_threshold: u64,
diff --git a/proxy/src/serverless.rs b/proxy/src/serverless.rs
index 5416d63b5b..b2bf93dc6d 100644
--- a/proxy/src/serverless.rs
+++ b/proxy/src/serverless.rs
@@ -10,6 +10,7 @@ mod json;
 mod sql_over_http;
 mod websocket;
 
+use async_trait::async_trait;
 use atomic_take::AtomicTake;
 use bytes::Bytes;
 pub use conn_pool::GlobalConnPoolOptions;
@@ -26,8 +27,9 @@ use rand::rngs::StdRng;
 use rand::SeedableRng;
 pub use reqwest_middleware::{ClientWithMiddleware, Error};
 pub use reqwest_retry::{policies::ExponentialBackoff, RetryTransientMiddleware};
+use tokio::io::{AsyncRead, AsyncWrite};
 use tokio::time::timeout;
-use tokio_rustls::{server::TlsStream, TlsAcceptor};
+use tokio_rustls::TlsAcceptor;
 use tokio_util::task::TaskTracker;
 
 use crate::cancellation::CancellationHandlerMain;
@@ -41,7 +43,7 @@ use crate::serverless::backend::PoolingBackend;
 use crate::serverless::http_util::{api_error_into_response, json_response};
 
 use std::net::{IpAddr, SocketAddr};
-use std::pin::pin;
+use std::pin::{pin, Pin};
 use std::sync::Arc;
 use tokio::net::{TcpListener, TcpStream};
 use tokio_util::sync::CancellationToken;
@@ -86,18 +88,18 @@ pub async fn task_main(
         config,
         endpoint_rate_limiter: Arc::clone(&endpoint_rate_limiter),
     });
-
-    let tls_config = match config.tls_config.as_ref() {
-        Some(config) => config,
+    let tls_acceptor: Arc<dyn MaybeTlsAcceptor> = match config.tls_config.as_ref() {
+        Some(config) => {
+            let mut tls_server_config = rustls::ServerConfig::clone(&config.to_server_config());
+            // prefer http2, but support http/1.1
+            tls_server_config.alpn_protocols = vec![b"h2".to_vec(), b"http/1.1".to_vec()];
+            Arc::new(tls_server_config) as Arc<_>
+        }
         None => {
-            warn!("TLS config is missing, WebSocket Secure server will not be started");
-            return Ok(());
+            warn!("TLS config is missing");
+            Arc::new(NoTls) as Arc<_>
         }
     };
-    let mut tls_server_config = rustls::ServerConfig::clone(&tls_config.to_server_config());
-    // prefer http2, but support http/1.1
-    tls_server_config.alpn_protocols = vec![b"h2".to_vec(), b"http/1.1".to_vec()];
-    let tls_acceptor: tokio_rustls::TlsAcceptor = Arc::new(tls_server_config).into();
 
     let connections = tokio_util::task::task_tracker::TaskTracker::new();
     connections.close(); // allows `connections.wait to complete`
@@ -176,16 +178,41 @@ pub async fn task_main(
     Ok(())
 }
 
+pub trait AsyncReadWrite: AsyncRead + AsyncWrite + Send + 'static {}
+impl<T: AsyncRead + AsyncWrite + Send + 'static> AsyncReadWrite for T {}
+pub type AsyncRW = Pin<Box<dyn AsyncReadWrite>>;
+
+#[async_trait]
+trait MaybeTlsAcceptor: Send + Sync + 'static {
+    async fn accept(self: Arc<Self>, conn: ChainRW<TcpStream>) -> std::io::Result<AsyncRW>;
+}
+
+#[async_trait]
+impl MaybeTlsAcceptor for rustls::ServerConfig {
+    async fn accept(self: Arc<Self>, conn: ChainRW<TcpStream>) -> std::io::Result<AsyncRW> {
+        Ok(Box::pin(TlsAcceptor::from(self).accept(conn).await?))
+    }
+}
+
+struct NoTls;
+
+#[async_trait]
+impl MaybeTlsAcceptor for NoTls {
+    async fn accept(self: Arc<Self>, conn: ChainRW<TcpStream>) -> std::io::Result<AsyncRW> {
+        Ok(Box::pin(conn))
+    }
+}
+
 /// Handles the TCP startup lifecycle.
 /// 1. Parses PROXY protocol V2
 /// 2. Handles TLS handshake
 async fn connection_startup(
     config: &ProxyConfig,
-    tls_acceptor: TlsAcceptor,
+    tls_acceptor: Arc<dyn MaybeTlsAcceptor>,
     session_id: uuid::Uuid,
     conn: TcpStream,
     peer_addr: SocketAddr,
-) -> Option<(TlsStream<ChainRW<TcpStream>>, IpAddr)> {
+) -> Option<(AsyncRW, IpAddr)> {
     // handle PROXY protocol
     let (conn, peer) = match read_proxy_protocol(conn).await {
         Ok(c) => c,
@@ -241,7 +268,7 @@ async fn connection_handler(
     cancellation_handler: Arc<CancellationHandlerMain>,
     endpoint_rate_limiter: Arc<EndpointRateLimiter>,
     cancellation_token: CancellationToken,
-    conn: TlsStream<ChainRW<TcpStream>>,
+    conn: AsyncRW,
     peer_addr: IpAddr,
     session_id: uuid::Uuid,
 ) {
@@ -326,7 +353,9 @@ async fn request_handler(
         .map(|s| s.to_string());
 
     // Check if the request is a websocket upgrade request.
-    if framed_websockets::upgrade::is_upgrade_request(&request) {
+    if config.http_config.accept_websockets
+        && framed_websockets::upgrade::is_upgrade_request(&request)
+    {
         let ctx = RequestMonitoring::new(
             session_id,
             peer_addr,
diff --git a/proxy/src/serverless/conn_pool.rs b/proxy/src/serverless/conn_pool.rs
index 9ede659cc4..3478787995 100644
--- a/proxy/src/serverless/conn_pool.rs
+++ b/proxy/src/serverless/conn_pool.rs
@@ -758,6 +758,7 @@ mod tests {
     async fn test_pool() {
         let _ = env_logger::try_init();
         let config = Box::leak(Box::new(crate::config::HttpConfig {
+            accept_websockets: false,
             pool_options: GlobalConnPoolOptions {
                 max_conns_per_endpoint: 2,
                 gc_epoch: Duration::from_secs(1),
diff --git a/proxy/src/serverless/sql_over_http.rs b/proxy/src/serverless/sql_over_http.rs
index c41df07a4d..bbfed90f39 100644
--- a/proxy/src/serverless/sql_over_http.rs
+++ b/proxy/src/serverless/sql_over_http.rs
@@ -147,7 +147,7 @@ impl UserFacingError for ConnInfoError {
 fn get_conn_info(
     ctx: &RequestMonitoring,
     headers: &HeaderMap,
-    tls: &TlsConfig,
+    tls: Option<&TlsConfig>,
 ) -> Result<ConnInfo, ConnInfoError> {
     // HTTP only uses cleartext (for now and likely always)
     ctx.set_auth_method(crate::context::AuthMethod::Cleartext);
@@ -184,12 +184,22 @@ fn get_conn_info(
         .ok_or(ConnInfoError::MissingPassword)?;
     let password = urlencoding::decode_binary(password.as_bytes());
 
-    let hostname = connection_url
-        .host_str()
-        .ok_or(ConnInfoError::MissingHostname)?;
-
-    let endpoint =
-        endpoint_sni(hostname, &tls.common_names)?.ok_or(ConnInfoError::MalformedEndpoint)?;
+    let endpoint = match connection_url.host() {
+        Some(url::Host::Domain(hostname)) => {
+            if let Some(tls) = tls {
+                endpoint_sni(hostname, &tls.common_names)?
+                    .ok_or(ConnInfoError::MalformedEndpoint)?
+            } else {
+                hostname
+                    .split_once(".")
+                    .map_or(hostname, |(prefix, _)| prefix)
+                    .into()
+            }
+        }
+        Some(url::Host::Ipv4(_)) | Some(url::Host::Ipv6(_)) | None => {
+            return Err(ConnInfoError::MissingHostname)
+        }
+    };
     ctx.set_endpoint_id(endpoint.clone());
 
     let pairs = connection_url.query_pairs();
@@ -502,7 +512,7 @@ async fn handle_inner(
     let headers = request.headers();
 
     // TLS config should be there.
-    let conn_info = get_conn_info(ctx, headers, config.tls_config.as_ref().unwrap())?;
+    let conn_info = get_conn_info(ctx, headers, config.tls_config.as_ref())?;
     info!(user = conn_info.user_info.user.as_str(), "credentials");
 
     // Allow connection pooling only if explicitly requested

From fa0750a37e01cee2e909d91be9b556ee2f128406 Mon Sep 17 00:00:00 2001
From: Vlad Lazar <vlad@neon.tech>
Date: Tue, 20 Aug 2024 15:25:21 +0100
Subject: [PATCH 30/44] storcon: add peer jwt token (#8764)

## Problem

Storage controllers did not have the right token to speak to their peers
for leadership transitions.

## Summary of changes

Accept a peer jwt token for the storage controller.

Epic: https://github.com/neondatabase/cloud/issues/14701
---
 control_plane/src/storage_controller.rs       |  5 +++++
 storage_controller/src/leadership.rs          |  3 +--
 storage_controller/src/main.rs                | 20 +++++++++++--------
 storage_controller/src/service.rs             |  3 +++
 .../regress/test_storage_controller.py        |  2 ++
 5 files changed, 23 insertions(+), 10 deletions(-)

diff --git a/control_plane/src/storage_controller.rs b/control_plane/src/storage_controller.rs
index f6539ad5b0..27d8e2de0c 100644
--- a/control_plane/src/storage_controller.rs
+++ b/control_plane/src/storage_controller.rs
@@ -453,6 +453,11 @@ impl StorageController {
             let jwt_token =
                 encode_from_key_file(&claims, private_key).expect("failed to generate jwt token");
             args.push(format!("--jwt-token={jwt_token}"));
+
+            let peer_claims = Claims::new(None, Scope::Admin);
+            let peer_jwt_token = encode_from_key_file(&peer_claims, private_key)
+                .expect("failed to generate jwt token");
+            args.push(format!("--peer-jwt-token={peer_jwt_token}"));
         }
 
         if let Some(public_key) = &self.public_key {
diff --git a/storage_controller/src/leadership.rs b/storage_controller/src/leadership.rs
index a171bab451..5fae8991ec 100644
--- a/storage_controller/src/leadership.rs
+++ b/storage_controller/src/leadership.rs
@@ -110,10 +110,9 @@ impl Leadership {
     ) -> Option<GlobalObservedState> {
         tracing::info!("Sending step down request to {leader:?}");
 
-        // TODO: jwt token
         let client = PeerClient::new(
             Uri::try_from(leader.address.as_str()).expect("Failed to build leader URI"),
-            self.config.jwt_token.clone(),
+            self.config.peer_jwt_token.clone(),
         );
         let state = client.step_down(&self.cancel).await;
         match state {
diff --git a/storage_controller/src/main.rs b/storage_controller/src/main.rs
index 17685b1140..e3f29b84e7 100644
--- a/storage_controller/src/main.rs
+++ b/storage_controller/src/main.rs
@@ -47,6 +47,9 @@ struct Cli {
     #[arg(long)]
     control_plane_jwt_token: Option<String>,
 
+    #[arg(long)]
+    peer_jwt_token: Option<String>,
+
     /// URL to control plane compute notification endpoint
     #[arg(long)]
     compute_hook_url: Option<String>,
@@ -126,28 +129,28 @@ struct Secrets {
     public_key: Option<JwtAuth>,
     jwt_token: Option<String>,
     control_plane_jwt_token: Option<String>,
+    peer_jwt_token: Option<String>,
 }
 
 impl Secrets {
     const DATABASE_URL_ENV: &'static str = "DATABASE_URL";
     const PAGESERVER_JWT_TOKEN_ENV: &'static str = "PAGESERVER_JWT_TOKEN";
     const CONTROL_PLANE_JWT_TOKEN_ENV: &'static str = "CONTROL_PLANE_JWT_TOKEN";
+    const PEER_JWT_TOKEN_ENV: &'static str = "PEER_JWT_TOKEN";
     const PUBLIC_KEY_ENV: &'static str = "PUBLIC_KEY";
 
     /// Load secrets from, in order of preference:
     /// - CLI args if database URL is provided on the CLI
     /// - Environment variables if DATABASE_URL is set.
-    /// - AWS Secrets Manager secrets
     async fn load(args: &Cli) -> anyhow::Result<Self> {
-        let Some(database_url) =
-            Self::load_secret(&args.database_url, Self::DATABASE_URL_ENV).await
+        let Some(database_url) = Self::load_secret(&args.database_url, Self::DATABASE_URL_ENV)
         else {
             anyhow::bail!(
                 "Database URL is not set (set `--database-url`, or `DATABASE_URL` environment)"
             )
         };
 
-        let public_key = match Self::load_secret(&args.public_key, Self::PUBLIC_KEY_ENV).await {
+        let public_key = match Self::load_secret(&args.public_key, Self::PUBLIC_KEY_ENV) {
             Some(v) => Some(JwtAuth::from_key(v).context("Loading public key")?),
             None => None,
         };
@@ -155,18 +158,18 @@ impl Secrets {
         let this = Self {
             database_url,
             public_key,
-            jwt_token: Self::load_secret(&args.jwt_token, Self::PAGESERVER_JWT_TOKEN_ENV).await,
+            jwt_token: Self::load_secret(&args.jwt_token, Self::PAGESERVER_JWT_TOKEN_ENV),
             control_plane_jwt_token: Self::load_secret(
                 &args.control_plane_jwt_token,
                 Self::CONTROL_PLANE_JWT_TOKEN_ENV,
-            )
-            .await,
+            ),
+            peer_jwt_token: Self::load_secret(&args.peer_jwt_token, Self::PEER_JWT_TOKEN_ENV),
         };
 
         Ok(this)
     }
 
-    async fn load_secret(cli: &Option<String>, env_name: &str) -> Option<String> {
+    fn load_secret(cli: &Option<String>, env_name: &str) -> Option<String> {
         if let Some(v) = cli {
             Some(v.clone())
         } else if let Ok(v) = std::env::var(env_name) {
@@ -266,6 +269,7 @@ async fn async_main() -> anyhow::Result<()> {
     let config = Config {
         jwt_token: secrets.jwt_token,
         control_plane_jwt_token: secrets.control_plane_jwt_token,
+        peer_jwt_token: secrets.peer_jwt_token,
         compute_hook_url: args.compute_hook_url,
         max_offline_interval: args
             .max_offline_interval
diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs
index 780f4a7ee5..453e96bad3 100644
--- a/storage_controller/src/service.rs
+++ b/storage_controller/src/service.rs
@@ -288,6 +288,9 @@ pub struct Config {
     // This JWT token will be used to authenticate this service to the control plane.
     pub control_plane_jwt_token: Option<String>,
 
+    // This JWT token will be used to authenticate with other storage controller instances
+    pub peer_jwt_token: Option<String>,
+
     /// Where the compute hook should send notifications of pageserver attachment locations
     /// (this URL points to the control plane in prod). If this is None, the compute hook will
     /// assume it is running in a test environment and try to update neon_local.
diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py
index 95c35e9641..94d71a7677 100644
--- a/test_runner/regress/test_storage_controller.py
+++ b/test_runner/regress/test_storage_controller.py
@@ -2144,6 +2144,8 @@ def test_storage_controller_leadership_transfer(
     port_distributor: PortDistributor,
     step_down_times_out: bool,
 ):
+    neon_env_builder.auth_enabled = True
+
     neon_env_builder.num_pageservers = 3
 
     neon_env_builder.storage_controller_config = {

From beefc7a8108e5af333bc1e453749acf872f18fdd Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Tue, 20 Aug 2024 19:47:42 +0100
Subject: [PATCH 31/44] pageserver: add metric
 pageserver_secondary_heatmap_total_size (#8768)

## Problem

We don't have a convenient way for a human to ask "how far are secondary
downloads along for this tenant".

This is useful when driving migrations of tenants to the storage
controller, as we first create a secondary location and want to see it
warm up before we cut over. That can already be done via storcon_cli,
but we would like a way that doesn't require direct API access.

## Summary of changes

Add a metric that reports the total size of layers in the heatmap: this
may be used in conjunction with the existing
`pageserver_secondary_resident_physical_size` to estimate "warmth" of
the secondary location.
---
 pageserver/src/metrics.rs                     |  9 +++++++++
 pageserver/src/tenant/secondary.rs            | 10 ++++++++++
 pageserver/src/tenant/secondary/downloader.rs |  6 ++++++
 3 files changed, 25 insertions(+)

diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs
index cd2cd43f27..1bc9352256 100644
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -1803,6 +1803,15 @@ pub(crate) static SECONDARY_RESIDENT_PHYSICAL_SIZE: Lazy<UIntGaugeVec> = Lazy::n
     .expect("failed to define a metric")
 });
 
+pub(crate) static SECONDARY_HEATMAP_TOTAL_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
+    register_uint_gauge_vec!(
+        "pageserver_secondary_heatmap_total_size",
+        "The total size in bytes of all layers in the most recently downloaded heatmap.",
+        &["tenant_id", "shard_id"]
+    )
+    .expect("failed to define a metric")
+});
+
 #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
 pub enum RemoteOpKind {
     Upload,
diff --git a/pageserver/src/tenant/secondary.rs b/pageserver/src/tenant/secondary.rs
index 3132a28b12..1331c07d05 100644
--- a/pageserver/src/tenant/secondary.rs
+++ b/pageserver/src/tenant/secondary.rs
@@ -8,6 +8,7 @@ use std::{sync::Arc, time::SystemTime};
 use crate::{
     context::RequestContext,
     disk_usage_eviction_task::DiskUsageEvictionInfo,
+    metrics::SECONDARY_HEATMAP_TOTAL_SIZE,
     task_mgr::{self, TaskKind, BACKGROUND_RUNTIME},
 };
 
@@ -105,6 +106,9 @@ pub(crate) struct SecondaryTenant {
 
     // Sum of layer sizes on local disk
     pub(super) resident_size_metric: UIntGauge,
+
+    // Sum of layer sizes in the most recently downloaded heatmap
+    pub(super) heatmap_total_size_metric: UIntGauge,
 }
 
 impl Drop for SecondaryTenant {
@@ -112,6 +116,7 @@ impl Drop for SecondaryTenant {
         let tenant_id = self.tenant_shard_id.tenant_id.to_string();
         let shard_id = format!("{}", self.tenant_shard_id.shard_slug());
         let _ = SECONDARY_RESIDENT_PHYSICAL_SIZE.remove_label_values(&[&tenant_id, &shard_id]);
+        let _ = SECONDARY_HEATMAP_TOTAL_SIZE.remove_label_values(&[&tenant_id, &shard_id]);
     }
 }
 
@@ -128,6 +133,10 @@ impl SecondaryTenant {
             .get_metric_with_label_values(&[&tenant_id, &shard_id])
             .unwrap();
 
+        let heatmap_total_size_metric = SECONDARY_HEATMAP_TOTAL_SIZE
+            .get_metric_with_label_values(&[&tenant_id, &shard_id])
+            .unwrap();
+
         Arc::new(Self {
             tenant_shard_id,
             // todo: shall we make this a descendent of the
@@ -145,6 +154,7 @@ impl SecondaryTenant {
             progress: std::sync::Mutex::default(),
 
             resident_size_metric,
+            heatmap_total_size_metric,
         })
     }
 
diff --git a/pageserver/src/tenant/secondary/downloader.rs b/pageserver/src/tenant/secondary/downloader.rs
index 8cff1d2864..90e1c01dbd 100644
--- a/pageserver/src/tenant/secondary/downloader.rs
+++ b/pageserver/src/tenant/secondary/downloader.rs
@@ -829,6 +829,12 @@ impl<'a> TenantDownloader<'a> {
             layers_downloaded: 0,
             bytes_downloaded: 0,
         };
+
+        // Also expose heatmap bytes_total as a metric
+        self.secondary_state
+            .heatmap_total_size_metric
+            .set(heatmap_stats.bytes);
+
         // Accumulate list of things to delete while holding the detail lock, for execution after dropping the lock
         let mut delete_layers = Vec::new();
         let mut delete_timelines = Vec::new();

From c8b9116a97e047a5f349e69fda1fe96790797820 Mon Sep 17 00:00:00 2001
From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com>
Date: Tue, 20 Aug 2024 15:05:33 -0400
Subject: [PATCH 32/44] impr(pageserver): abort on fatal I/O writer error
 (#8777)

part of https://github.com/neondatabase/neon/issues/8140

The blob writer path now uses `maybe_fatal_err`

Signed-off-by: Alex Chi Z <chi@neon.tech>
---
 pageserver/src/virtual_file.rs | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/pageserver/src/virtual_file.rs b/pageserver/src/virtual_file.rs
index b4695e5f40..c0017280fd 100644
--- a/pageserver/src/virtual_file.rs
+++ b/pageserver/src/virtual_file.rs
@@ -756,11 +756,23 @@ impl VirtualFile {
         })
     }
 
+    /// The function aborts the process if the error is fatal.
     async fn write_at<B: IoBuf + Send>(
         &self,
         buf: FullSlice<B>,
         offset: u64,
         _ctx: &RequestContext, /* TODO: use for metrics: https://github.com/neondatabase/neon/issues/6107 */
+    ) -> (FullSlice<B>, Result<usize, Error>) {
+        let (slice, result) = self.write_at_inner(buf, offset, _ctx).await;
+        let result = result.maybe_fatal_err("write_at");
+        (slice, result)
+    }
+
+    async fn write_at_inner<B: IoBuf + Send>(
+        &self,
+        buf: FullSlice<B>,
+        offset: u64,
+        _ctx: &RequestContext, /* TODO: use for metrics: https://github.com/neondatabase/neon/issues/6107 */
     ) -> (FullSlice<B>, Result<usize, Error>) {
         let file_guard = match self.lock_file().await {
             Ok(file_guard) => file_guard,

From 6d8572ded607e6cb583ff6b9a1690ceecce5a407 Mon Sep 17 00:00:00 2001
From: Peter Bendel <peterbendel@neon.tech>
Date: Wed, 21 Aug 2024 09:08:49 +0200
Subject: [PATCH 33/44] Benchmarking: need to checkout actions to download Neon
 artifacts (#8770)

## Problem

The database preparation workflow needs Neon artifacts but does not check
out the repository containing the necessary download action.

We were lucky in a few runs like this one

https://github.com/neondatabase/neon/actions/runs/10413970941/job/28870668020

but this is flaky due to a race condition, which caused a failure here


https://github.com/neondatabase/neon/actions/runs/10446395644/job/28923749772#step:4:1



## Summary of changes

Checkout code (including actions) before invoking download action

Successful test run
https://github.com/neondatabase/neon/actions/runs/10469356296/job/28992200694
---
 .github/workflows/_benchmarking_preparation.yml | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/.github/workflows/_benchmarking_preparation.yml b/.github/workflows/_benchmarking_preparation.yml
index 7229776cd6..a52e43b4da 100644
--- a/.github/workflows/_benchmarking_preparation.yml
+++ b/.github/workflows/_benchmarking_preparation.yml
@@ -48,6 +48,8 @@ jobs:
 
         echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT  
 
+    - uses: actions/checkout@v4
+
     - name: Download Neon artifact
       uses: ./.github/actions/download
       with:

From 21b684718e1e3e18e687d095d79322c5db9a3992 Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Wed, 21 Aug 2024 12:55:01 +0200
Subject: [PATCH 34/44] pageserver: add counter for wait time on background
 loop semaphore (#8769)

## Problem

Compaction jobs and other background loops are concurrency-limited
through a global semaphore.

The current counters allow quantifying how _many_ tasks are waiting.
But there is no way to tell how _much_ delay is added by the semaphore.

So, add a counter that aggregates the wall-clock time (in seconds) spent
acquiring the semaphore.

The metrics can be used as follows:

* retroactively calculate average acquisition time in a given time range
* compare the degree of background loop backlog among pageservers

The metric is insufficient to calculate

* run-up of ongoing acquisitions that haven't finished acquiring yet
* Not easily feasible because ["Cancelling a call to acquire makes you
lose your place in the
queue"](https://docs.rs/tokio/latest/tokio/sync/struct.Semaphore.html#method.acquire)

## Summary of changes

* Refactor the metrics to follow the current best practice for typed
metrics in `metrics.rs`.
* Add the new counter.
---
 pageserver/src/metrics.rs      | 69 +++++++++++++++++++++++++++++-----
 pageserver/src/tenant/tasks.rs | 11 +-----
 2 files changed, 60 insertions(+), 20 deletions(-)

diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs
index 1bc9352256..0a1a22b6e8 100644
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -1862,16 +1862,64 @@ pub(crate) static TENANT_TASK_EVENTS: Lazy<IntCounterVec> = Lazy::new(|| {
     .expect("Failed to register tenant_task_events metric")
 });
 
-pub(crate) static BACKGROUND_LOOP_SEMAPHORE_WAIT_GAUGE: Lazy<IntCounterPairVec> = Lazy::new(|| {
-    register_int_counter_pair_vec!(
-        "pageserver_background_loop_semaphore_wait_start_count",
-        "Counter for background loop concurrency-limiting semaphore acquire calls started",
-        "pageserver_background_loop_semaphore_wait_finish_count",
-        "Counter for background loop concurrency-limiting semaphore acquire calls finished",
-        &["task"],
-    )
-    .unwrap()
-});
+pub struct BackgroundLoopSemaphoreMetrics {
+    counters: EnumMap<BackgroundLoopKind, IntCounterPair>,
+    durations: EnumMap<BackgroundLoopKind, Counter>,
+}
+
+pub(crate) static BACKGROUND_LOOP_SEMAPHORE: Lazy<BackgroundLoopSemaphoreMetrics> = Lazy::new(
+    || {
+        let counters = register_int_counter_pair_vec!(
+            "pageserver_background_loop_semaphore_wait_start_count",
+            "Counter for background loop concurrency-limiting semaphore acquire calls started",
+            "pageserver_background_loop_semaphore_wait_finish_count",
+            "Counter for background loop concurrency-limiting semaphore acquire calls finished",
+            &["task"],
+        )
+        .unwrap();
+
+        let durations = register_counter_vec!(
+            "pageserver_background_loop_semaphore_wait_duration_seconds",
+            "Sum of wall clock time spent waiting on the background loop concurrency-limiting semaphore acquire calls",
+            &["task"],
+        )
+        .unwrap();
+
+        BackgroundLoopSemaphoreMetrics {
+            counters: enum_map::EnumMap::from_array(std::array::from_fn(|i| {
+                let kind = <BackgroundLoopKind as enum_map::Enum>::from_usize(i);
+                counters.with_label_values(&[kind.into()])
+            })),
+            durations: enum_map::EnumMap::from_array(std::array::from_fn(|i| {
+                let kind = <BackgroundLoopKind as enum_map::Enum>::from_usize(i);
+                durations.with_label_values(&[kind.into()])
+            })),
+        }
+    },
+);
+
+impl BackgroundLoopSemaphoreMetrics {
+    pub(crate) fn measure_acquisition(&self, task: BackgroundLoopKind) -> impl Drop + '_ {
+        struct Record<'a> {
+            metrics: &'a BackgroundLoopSemaphoreMetrics,
+            task: BackgroundLoopKind,
+            _counter_guard: metrics::IntCounterPairGuard,
+            start: Instant,
+        }
+        impl Drop for Record<'_> {
+            fn drop(&mut self) {
+                let elapsed = self.start.elapsed().as_secs_f64();
+                self.metrics.durations[self.task].inc_by(elapsed);
+            }
+        }
+        Record {
+            metrics: self,
+            task,
+            _counter_guard: self.counters[task].guard(),
+            start: Instant::now(),
+        }
+    }
+}
 
 pub(crate) static BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT: Lazy<IntCounterVec> = Lazy::new(|| {
     register_int_counter_vec!(
@@ -2553,6 +2601,7 @@ use std::time::{Duration, Instant};
 use crate::context::{PageContentKind, RequestContext};
 use crate::task_mgr::TaskKind;
 use crate::tenant::mgr::TenantSlot;
+use crate::tenant::tasks::BackgroundLoopKind;
 
 /// Maintain a per timeline gauge in addition to the global gauge.
 pub(crate) struct PerTimelineRemotePhysicalSizeGauge {
diff --git a/pageserver/src/tenant/tasks.rs b/pageserver/src/tenant/tasks.rs
index 3972685a8e..12f080f3c1 100644
--- a/pageserver/src/tenant/tasks.rs
+++ b/pageserver/src/tenant/tasks.rs
@@ -61,21 +61,12 @@ impl BackgroundLoopKind {
     }
 }
 
-static PERMIT_GAUGES: once_cell::sync::Lazy<
-    enum_map::EnumMap<BackgroundLoopKind, metrics::IntCounterPair>,
-> = once_cell::sync::Lazy::new(|| {
-    enum_map::EnumMap::from_array(std::array::from_fn(|i| {
-        let kind = <BackgroundLoopKind as enum_map::Enum>::from_usize(i);
-        crate::metrics::BACKGROUND_LOOP_SEMAPHORE_WAIT_GAUGE.with_label_values(&[kind.into()])
-    }))
-});
-
 /// Cancellation safe.
 pub(crate) async fn concurrent_background_tasks_rate_limit_permit(
     loop_kind: BackgroundLoopKind,
     _ctx: &RequestContext,
 ) -> tokio::sync::SemaphorePermit<'static> {
-    let _guard = PERMIT_GAUGES[loop_kind].guard();
+    let _guard = crate::metrics::BACKGROUND_LOOP_SEMAPHORE.measure_acquisition(loop_kind);
 
     pausable_failpoint!(
         "initial-size-calculation-permit-pause",

From 477246f42cf984015d654521174fff763f9e1263 Mon Sep 17 00:00:00 2001
From: Joonas Koivunen <joonas@neon.tech>
Date: Wed, 21 Aug 2024 14:28:27 +0300
Subject: [PATCH 35/44] storcon: handle heartbeater shutdown gracefully (#8767)

If a heartbeat happens during shutdown, then the task is already
cancelled and will not be sending responses.

Fixes: #8766
---
 storage_controller/src/heartbeater.rs | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/storage_controller/src/heartbeater.rs b/storage_controller/src/heartbeater.rs
index 1bb9c17f30..c0e27bafdb 100644
--- a/storage_controller/src/heartbeater.rs
+++ b/storage_controller/src/heartbeater.rs
@@ -87,9 +87,12 @@ impl Heartbeater {
                 pageservers,
                 reply: sender,
             })
-            .unwrap();
+            .map_err(|_| HeartbeaterError::Cancel)?;
 
-        receiver.await.unwrap()
+        receiver
+            .await
+            .map_err(|_| HeartbeaterError::Cancel)
+            .and_then(|x| x)
     }
 }
 

From 3b8016488efc4cecee1a956285b3365162706894 Mon Sep 17 00:00:00 2001
From: Joonas Koivunen <joonas@neon.tech>
Date: Wed, 21 Aug 2024 14:51:08 +0300
Subject: [PATCH 36/44] test: test_timeline_ancestor_detach_errors rare
 allowed_error (#8782)

Add another allowed_error for this rarity.

Fixes: #8773
---
 test_runner/regress/test_timeline_detach_ancestor.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/test_runner/regress/test_timeline_detach_ancestor.py b/test_runner/regress/test_timeline_detach_ancestor.py
index 82fc26126d..d152d0f41f 100644
--- a/test_runner/regress/test_timeline_detach_ancestor.py
+++ b/test_runner/regress/test_timeline_detach_ancestor.py
@@ -639,8 +639,12 @@ def test_timeline_ancestor_detach_errors(neon_env_builder: NeonEnvBuilder, shard
 
     for ps in pageservers.values():
         ps.allowed_errors.extend(SHUTDOWN_ALLOWED_ERRORS)
-        ps.allowed_errors.append(
-            ".* WARN .* path=/v1/tenant/.*/timeline/.*/detach_ancestor request_id=.*: request was dropped before completing"
+        ps.allowed_errors.extend(
+            [
+                ".* WARN .* path=/v1/tenant/.*/timeline/.*/detach_ancestor request_id=.*: request was dropped before completing",
+                # rare error logging, which is hard to reproduce without instrumenting responding with random sleep
+                '.* ERROR .* path=/v1/tenant/.*/timeline/.*/detach_ancestor request_id=.*: Cancelled request finished with an error: Conflict\\("no ancestors"\\)',
+            ]
         )
 
     client = (

From 75175f3628bc88053e13555a3ada8082639b2db6 Mon Sep 17 00:00:00 2001
From: Alexander Bayandin <alexander@neon.tech>
Date: Wed, 21 Aug 2024 14:29:11 +0100
Subject: [PATCH 37/44] CI(build-and-test): run regression tests on arm (#8552)

## Problem

We want to run our regression test suite on ARM.

## Summary of changes
- run regression tests on release ARM builds
- run `build-neon` (including rust tests) on debug ARM builds
- add `arch` parameter to test to distinguish them in the allure report
and in a database
---
 .../actions/run-python-test-set/action.yml    |  4 +---
 .github/workflows/_build-and-test-locally.yml | 15 +++++++++----
 .github/workflows/build_and_test.yml          |  2 +-
 .../ingest_regress_test_result-new-format.py  | 14 ++++++++++++-
 test_runner/fixtures/parametrize.py           | 21 +++++++++++++++++++
 5 files changed, 47 insertions(+), 9 deletions(-)

diff --git a/.github/actions/run-python-test-set/action.yml b/.github/actions/run-python-test-set/action.yml
index 814067fb8e..6c2cee0971 100644
--- a/.github/actions/run-python-test-set/action.yml
+++ b/.github/actions/run-python-test-set/action.yml
@@ -169,10 +169,8 @@ runs:
           EXTRA_PARAMS="--durations-path $TEST_OUTPUT/benchmark_durations.json $EXTRA_PARAMS"
         fi
 
-        if [[ "${{ inputs.build_type }}" == "debug" ]]; then
+        if [[ $BUILD_TYPE == "debug" && $RUNNER_ARCH == 'X64' ]]; then
           cov_prefix=(scripts/coverage "--profraw-prefix=$GITHUB_JOB" --dir=/tmp/coverage run)
-        elif [[ "${{ inputs.build_type }}" == "release" ]]; then
-          cov_prefix=()
         else
           cov_prefix=()
         fi
diff --git a/.github/workflows/_build-and-test-locally.yml b/.github/workflows/_build-and-test-locally.yml
index af76e51ebc..5e9fff0e6a 100644
--- a/.github/workflows/_build-and-test-locally.yml
+++ b/.github/workflows/_build-and-test-locally.yml
@@ -94,11 +94,16 @@ jobs:
       # We run tests with addtional features, that are turned off by default (e.g. in release builds), see
       # corresponding Cargo.toml files for their descriptions.
       - name: Set env variables
+        env:
+          ARCH: ${{ inputs.arch }}
         run: |
           CARGO_FEATURES="--features testing"
-          if [[ $BUILD_TYPE == "debug" ]]; then
+          if [[ $BUILD_TYPE == "debug" && $ARCH == 'x64' ]]; then
             cov_prefix="scripts/coverage --profraw-prefix=$GITHUB_JOB --dir=/tmp/coverage run"
             CARGO_FLAGS="--locked"
+          elif [[ $BUILD_TYPE == "debug" ]]; then
+            cov_prefix=""
+            CARGO_FLAGS="--locked"
           elif [[ $BUILD_TYPE == "release" ]]; then
             cov_prefix=""
             CARGO_FLAGS="--locked --release"
@@ -158,6 +163,8 @@ jobs:
       # Do install *before* running rust tests because they might recompile the
       # binaries with different features/flags.
       - name: Install rust binaries
+        env:
+          ARCH: ${{ inputs.arch }}
         run: |
           # Install target binaries
           mkdir -p /tmp/neon/bin/
@@ -172,7 +179,7 @@ jobs:
           done
 
           # Install test executables and write list of all binaries (for code coverage)
-          if [[ $BUILD_TYPE == "debug" ]]; then
+          if [[ $BUILD_TYPE == "debug" && $ARCH == 'x64' ]]; then
             # Keep bloated coverage data files away from the rest of the artifact
             mkdir -p /tmp/coverage/
 
@@ -243,8 +250,8 @@ jobs:
         uses: ./.github/actions/save-coverage-data
 
   regress-tests:
-    # Run test on x64 only
-    if: inputs.arch == 'x64'
+    # Don't run regression tests on debug arm64 builds
+    if: inputs.build-type != 'debug' || inputs.arch != 'arm64'
     needs: [ build-neon ]
     runs-on: ${{ fromJson(format('["self-hosted", "{0}"]', inputs.arch == 'arm64' && 'large-arm64' || 'large')) }}
     container:
diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml
index 715f1af656..1e7f3598c2 100644
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -198,7 +198,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        arch: [ x64 ]
+        arch: [ x64, arm64 ]
         # Do not build or run tests in debug for release branches
         build-type: ${{ fromJson((startsWith(github.ref_name, 'release') && github.event_name == 'push') && '["release"]' || '["debug", "release"]') }}
         include:
diff --git a/scripts/ingest_regress_test_result-new-format.py b/scripts/ingest_regress_test_result-new-format.py
index cff1d9875f..40d7254e00 100644
--- a/scripts/ingest_regress_test_result-new-format.py
+++ b/scripts/ingest_regress_test_result-new-format.py
@@ -18,6 +18,7 @@ import psycopg2
 from psycopg2.extras import execute_values
 
 CREATE_TABLE = """
+CREATE TYPE arch AS ENUM ('ARM64', 'X64', 'UNKNOWN');
 CREATE TABLE IF NOT EXISTS results (
     id           BIGSERIAL PRIMARY KEY,
     parent_suite TEXT NOT NULL,
@@ -28,6 +29,7 @@ CREATE TABLE IF NOT EXISTS results (
     stopped_at   TIMESTAMPTZ NOT NULL,
     duration     INT NOT NULL,
     flaky        BOOLEAN NOT NULL,
+    arch         arch DEFAULT 'X64',
     build_type   TEXT NOT NULL,
     pg_version   INT NOT NULL,
     run_id       BIGINT NOT NULL,
@@ -35,7 +37,7 @@ CREATE TABLE IF NOT EXISTS results (
     reference    TEXT NOT NULL,
     revision     CHAR(40) NOT NULL,
     raw          JSONB COMPRESSION lz4 NOT NULL,
-    UNIQUE (parent_suite, suite, name, build_type, pg_version, started_at, stopped_at, run_id)
+    UNIQUE (parent_suite, suite, name, arch, build_type, pg_version, started_at, stopped_at, run_id)
 );
 """
 
@@ -50,6 +52,7 @@ class Row:
     stopped_at: datetime
     duration: int
     flaky: bool
+    arch: str
     build_type: str
     pg_version: int
     run_id: int
@@ -121,6 +124,14 @@ def ingest_test_result(
         raw.pop("labels")
         raw.pop("extra")
 
+        # All allure parameters are prefixed with "__", see test_runner/fixtures/parametrize.py
+        parameters = {
+            p["name"].removeprefix("__"): p["value"]
+            for p in test["parameters"]
+            if p["name"].startswith("__")
+        }
+        arch = parameters.get("arch", "UNKNOWN").strip("'")
+
         build_type, pg_version, unparametrized_name = parse_test_name(test["name"])
         labels = {label["name"]: label["value"] for label in test["labels"]}
         row = Row(
@@ -132,6 +143,7 @@ def ingest_test_result(
             stopped_at=datetime.fromtimestamp(test["time"]["stop"] / 1000, tz=timezone.utc),
             duration=test["time"]["duration"],
             flaky=test["flaky"] or test["retriesStatusChange"],
+            arch=arch,
             build_type=build_type,
             pg_version=pg_version,
             run_id=run_id,
diff --git a/test_runner/fixtures/parametrize.py b/test_runner/fixtures/parametrize.py
index 0227285822..92c98763e3 100644
--- a/test_runner/fixtures/parametrize.py
+++ b/test_runner/fixtures/parametrize.py
@@ -1,6 +1,7 @@
 import os
 from typing import Any, Dict, Optional
 
+import allure
 import pytest
 import toml
 from _pytest.python import Metafunc
@@ -91,3 +92,23 @@ def pytest_generate_tests(metafunc: Metafunc):
         and (platform := os.getenv("PLATFORM")) is not None
     ):
         metafunc.parametrize("platform", [platform.lower()])
+
+
+@pytest.hookimpl(hookwrapper=True, tryfirst=True)
+def pytest_runtest_makereport(*args, **kwargs):
+    # Add test parameters to the Allure report to distinguish the same tests with different parameters.
+    # Names have a `__` prefix to avoid conflicts with `pytest.mark.parametrize` parameters
+
+    # A mapping between `uname -m` and `RUNNER_ARCH` values.
+    # `RUNNER_ARCH` environment variable is set on GitHub Runners,
+    # possible values are X86, X64, ARM, or ARM64.
+    # See https://docs.github.com/en/actions/learn-github-actions/variables#default-environment-variables
+    uname_m = {
+        "aarch64": "ARM64",
+        "arm64": "ARM64",
+        "x86_64": "X64",
+    }.get(os.uname().machine, "UNKNOWN")
+    arch = os.getenv("RUNNER_ARCH", uname_m)
+    allure.dynamic.parameter("__arch", arch)
+
+    yield

From 428b105dde089d402b1de035a8cb43ebea930583 Mon Sep 17 00:00:00 2001
From: Conrad Ludgate <conrad@neon.tech>
Date: Wed, 21 Aug 2024 14:45:32 +0100
Subject: [PATCH 38/44] remove workspace hack from libs (#8780)

This removes workspace hack from all libs, not from any binaries. This
does not change the behaviour of the hack.

Running
```
cargo clean
cargo build --release --bin proxy
```

Before this change, the build took 5m16s; after this change, it took 3m3s.
This is because the change makes the build much more parallelisable.
---
 .config/hakari.toml                    | 28 ++++++++++++++++++++++----
 Cargo.lock                             | 18 -----------------
 libs/compute_api/Cargo.toml            |  2 --
 libs/consumption_metrics/Cargo.toml    |  4 +---
 libs/desim/Cargo.toml                  |  2 --
 libs/metrics/Cargo.toml                |  2 --
 libs/pageserver_api/Cargo.toml         |  4 +---
 libs/postgres_backend/Cargo.toml       |  1 -
 libs/postgres_connection/Cargo.toml    |  2 --
 libs/postgres_ffi/Cargo.toml           |  2 --
 libs/postgres_ffi/wal_craft/Cargo.toml |  2 --
 libs/pq_proto/Cargo.toml               |  4 +---
 libs/remote_storage/Cargo.toml         |  3 ++-
 libs/safekeeper_api/Cargo.toml         |  2 --
 libs/tenant_size_model/Cargo.toml      |  2 --
 libs/tracing-utils/Cargo.toml          |  2 --
 libs/utils/Cargo.toml                  |  4 ++--
 libs/walproposer/Cargo.toml            |  2 --
 workspace_hack/Cargo.toml              |  3 ---
 19 files changed, 31 insertions(+), 58 deletions(-)

diff --git a/.config/hakari.toml b/.config/hakari.toml
index 9913ecc9c0..b5990d090e 100644
--- a/.config/hakari.toml
+++ b/.config/hakari.toml
@@ -23,10 +23,30 @@ platforms = [
 ]
 
 [final-excludes]
-# vm_monitor benefits from the same Cargo.lock as the rest of our artifacts, but
-# it is built primarly in separate repo neondatabase/autoscaling and thus is excluded
-# from depending on workspace-hack because most of the dependencies are not used.
-workspace-members = ["vm_monitor"]
+workspace-members = [
+    # vm_monitor benefits from the same Cargo.lock as the rest of our artifacts, but
+    # it is built primarily in a separate repo, neondatabase/autoscaling, and thus is excluded
+    # from depending on workspace-hack because most of the dependencies are not used.
+    "vm_monitor",
+    # All of these exist in libs and are not usually built independently.
+    # Putting workspace hack there adds a bottleneck for cargo builds.
+    "compute_api",
+    "consumption_metrics",
+    "desim",
+    "metrics",
+    "pageserver_api",
+    "postgres_backend",
+    "postgres_connection",
+    "postgres_ffi",
+    "pq_proto",
+    "remote_storage",
+    "safekeeper_api",
+    "tenant_size_model",
+    "tracing-utils",
+    "utils",
+    "wal_craft",
+    "walproposer",
+]
 
 # Write out exact versions rather than a semver range. (Defaults to false.)
 # exact-versions = true
diff --git a/Cargo.lock b/Cargo.lock
index dee15b6aa7..a506da8c02 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1208,7 +1208,6 @@ dependencies = [
  "serde_json",
  "serde_with",
  "utils",
- "workspace_hack",
 ]
 
 [[package]]
@@ -1321,7 +1320,6 @@ dependencies = [
  "serde",
  "serde_with",
  "utils",
- "workspace_hack",
 ]
 
 [[package]]
@@ -1670,7 +1668,6 @@ dependencies = [
  "smallvec",
  "tracing",
  "utils",
- "workspace_hack",
 ]
 
 [[package]]
@@ -3147,7 +3144,6 @@ dependencies = [
  "rand 0.8.5",
  "rand_distr",
  "twox-hash",
- "workspace_hack",
 ]
 
 [[package]]
@@ -3791,7 +3787,6 @@ dependencies = [
  "strum_macros",
  "thiserror",
  "utils",
- "workspace_hack",
 ]
 
 [[package]]
@@ -4193,7 +4188,6 @@ dependencies = [
  "tokio-rustls 0.25.0",
  "tokio-util",
  "tracing",
- "workspace_hack",
 ]
 
 [[package]]
@@ -4206,7 +4200,6 @@ dependencies = [
  "postgres",
  "tokio-postgres",
  "url",
- "workspace_hack",
 ]
 
 [[package]]
@@ -4229,7 +4222,6 @@ dependencies = [
  "serde",
  "thiserror",
  "utils",
- "workspace_hack",
 ]
 
 [[package]]
@@ -4267,7 +4259,6 @@ dependencies = [
  "thiserror",
  "tokio",
  "tracing",
- "workspace_hack",
 ]
 
 [[package]]
@@ -4832,7 +4823,6 @@ dependencies = [
  "toml_edit 0.19.10",
  "tracing",
  "utils",
- "workspace_hack",
 ]
 
 [[package]]
@@ -5357,7 +5347,6 @@ dependencies = [
  "serde",
  "serde_with",
  "utils",
- "workspace_hack",
 ]
 
 [[package]]
@@ -6193,7 +6182,6 @@ dependencies = [
  "anyhow",
  "serde",
  "serde_json",
- "workspace_hack",
 ]
 
 [[package]]
@@ -6794,7 +6782,6 @@ dependencies = [
  "tracing",
  "tracing-opentelemetry",
  "tracing-subscriber",
- "workspace_hack",
 ]
 
 [[package]]
@@ -7012,7 +6999,6 @@ dependencies = [
  "url",
  "uuid",
  "walkdir",
- "workspace_hack",
 ]
 
 [[package]]
@@ -7091,7 +7077,6 @@ dependencies = [
  "postgres_ffi",
  "regex",
  "utils",
- "workspace_hack",
 ]
 
 [[package]]
@@ -7112,7 +7097,6 @@ dependencies = [
  "bindgen",
  "postgres_ffi",
  "utils",
- "workspace_hack",
 ]
 
 [[package]]
@@ -7669,8 +7653,6 @@ dependencies = [
  "tokio",
  "tokio-rustls 0.24.0",
  "tokio-util",
- "toml_datetime",
- "toml_edit 0.19.10",
  "tonic",
  "tower",
  "tracing",
diff --git a/libs/compute_api/Cargo.toml b/libs/compute_api/Cargo.toml
index b377bd2cce..8aaa481f8c 100644
--- a/libs/compute_api/Cargo.toml
+++ b/libs/compute_api/Cargo.toml
@@ -14,5 +14,3 @@ regex.workspace = true
 
 utils = { path = "../utils" }
 remote_storage = { version = "0.1", path = "../remote_storage/" }
-
-workspace_hack.workspace = true
diff --git a/libs/consumption_metrics/Cargo.toml b/libs/consumption_metrics/Cargo.toml
index 3f290821c2..a40b74b952 100644
--- a/libs/consumption_metrics/Cargo.toml
+++ b/libs/consumption_metrics/Cargo.toml
@@ -6,10 +6,8 @@ license = "Apache-2.0"
 
 [dependencies]
 anyhow.workspace = true
-chrono.workspace = true
+chrono = { workspace = true, features = ["serde"] }
 rand.workspace = true
 serde.workspace = true
 serde_with.workspace = true
 utils.workspace = true
-
-workspace_hack.workspace = true
diff --git a/libs/desim/Cargo.toml b/libs/desim/Cargo.toml
index 6f442d8243..0c4be90267 100644
--- a/libs/desim/Cargo.toml
+++ b/libs/desim/Cargo.toml
@@ -14,5 +14,3 @@ parking_lot.workspace = true
 hex.workspace = true
 scopeguard.workspace = true
 smallvec = { workspace = true, features = ["write"] }
-
-workspace_hack.workspace = true
diff --git a/libs/metrics/Cargo.toml b/libs/metrics/Cargo.toml
index 0bd804051c..f87e7b8e3a 100644
--- a/libs/metrics/Cargo.toml
+++ b/libs/metrics/Cargo.toml
@@ -12,8 +12,6 @@ chrono.workspace = true
 twox-hash.workspace = true
 measured.workspace = true
 
-workspace_hack.workspace = true
-
 [target.'cfg(target_os = "linux")'.dependencies]
 procfs.workspace = true
 measured-process.workspace = true
diff --git a/libs/pageserver_api/Cargo.toml b/libs/pageserver_api/Cargo.toml
index 3bba89c76d..cb28359ac3 100644
--- a/libs/pageserver_api/Cargo.toml
+++ b/libs/pageserver_api/Cargo.toml
@@ -21,11 +21,9 @@ hex.workspace = true
 humantime.workspace = true
 thiserror.workspace = true
 humantime-serde.workspace = true
-chrono.workspace = true
+chrono = { workspace = true, features = ["serde"] }
 itertools.workspace = true
 
-workspace_hack.workspace = true
-
 [dev-dependencies]
 bincode.workspace = true
 rand.workspace = true
diff --git a/libs/postgres_backend/Cargo.toml b/libs/postgres_backend/Cargo.toml
index c7611b9f21..f6854328fc 100644
--- a/libs/postgres_backend/Cargo.toml
+++ b/libs/postgres_backend/Cargo.toml
@@ -18,7 +18,6 @@ tokio-rustls.workspace = true
 tracing.workspace = true
 
 pq_proto.workspace = true
-workspace_hack.workspace = true
 
 [dev-dependencies]
 once_cell.workspace = true
diff --git a/libs/postgres_connection/Cargo.toml b/libs/postgres_connection/Cargo.toml
index fbfea80ae2..19027d13ff 100644
--- a/libs/postgres_connection/Cargo.toml
+++ b/libs/postgres_connection/Cargo.toml
@@ -11,7 +11,5 @@ postgres.workspace = true
 tokio-postgres.workspace = true
 url.workspace = true
 
-workspace_hack.workspace = true
-
 [dev-dependencies]
 once_cell.workspace = true
diff --git a/libs/postgres_ffi/Cargo.toml b/libs/postgres_ffi/Cargo.toml
index 86e72f6bdd..ee69878f69 100644
--- a/libs/postgres_ffi/Cargo.toml
+++ b/libs/postgres_ffi/Cargo.toml
@@ -19,8 +19,6 @@ thiserror.workspace = true
 serde.workspace = true
 utils.workspace = true
 
-workspace_hack.workspace = true
-
 [dev-dependencies]
 env_logger.workspace = true
 postgres.workspace = true
diff --git a/libs/postgres_ffi/wal_craft/Cargo.toml b/libs/postgres_ffi/wal_craft/Cargo.toml
index 0edc642402..29dd01a936 100644
--- a/libs/postgres_ffi/wal_craft/Cargo.toml
+++ b/libs/postgres_ffi/wal_craft/Cargo.toml
@@ -14,8 +14,6 @@ postgres.workspace = true
 postgres_ffi.workspace = true
 camino-tempfile.workspace = true
 
-workspace_hack.workspace = true
-
 [dev-dependencies]
 regex.workspace = true
 utils.workspace = true
diff --git a/libs/pq_proto/Cargo.toml b/libs/pq_proto/Cargo.toml
index 8afabe670e..66bbe03ebc 100644
--- a/libs/pq_proto/Cargo.toml
+++ b/libs/pq_proto/Cargo.toml
@@ -11,9 +11,7 @@ itertools.workspace = true
 pin-project-lite.workspace = true
 postgres-protocol.workspace = true
 rand.workspace = true
-tokio.workspace = true
+tokio = { workspace = true, features = ["io-util"] }
 tracing.workspace = true
 thiserror.workspace = true
 serde.workspace = true
-
-workspace_hack.workspace = true
diff --git a/libs/remote_storage/Cargo.toml b/libs/remote_storage/Cargo.toml
index 414bce1b26..02adee058f 100644
--- a/libs/remote_storage/Cargo.toml
+++ b/libs/remote_storage/Cargo.toml
@@ -32,7 +32,7 @@ scopeguard.workspace = true
 metrics.workspace = true
 utils.workspace = true
 pin-project-lite.workspace = true
-workspace_hack.workspace = true
+
 azure_core.workspace = true
 azure_identity.workspace = true
 azure_storage.workspace = true
@@ -46,3 +46,4 @@ sync_wrapper = { workspace = true, features = ["futures"] }
 camino-tempfile.workspace = true
 test-context.workspace = true
 rand.workspace = true
+tokio = { workspace = true, features = ["test-util"] }
diff --git a/libs/safekeeper_api/Cargo.toml b/libs/safekeeper_api/Cargo.toml
index 327d98ee77..e1f4bcca46 100644
--- a/libs/safekeeper_api/Cargo.toml
+++ b/libs/safekeeper_api/Cargo.toml
@@ -9,5 +9,3 @@ serde.workspace = true
 serde_with.workspace = true
 const_format.workspace = true
 utils.workspace = true
-
-workspace_hack.workspace = true
diff --git a/libs/tenant_size_model/Cargo.toml b/libs/tenant_size_model/Cargo.toml
index 15e78932a8..8aa3c54f62 100644
--- a/libs/tenant_size_model/Cargo.toml
+++ b/libs/tenant_size_model/Cargo.toml
@@ -9,5 +9,3 @@ license.workspace = true
 anyhow.workspace = true
 serde.workspace = true
 serde_json.workspace = true
-
-workspace_hack.workspace = true
diff --git a/libs/tracing-utils/Cargo.toml b/libs/tracing-utils/Cargo.toml
index 512a748124..5ea8db6b42 100644
--- a/libs/tracing-utils/Cargo.toml
+++ b/libs/tracing-utils/Cargo.toml
@@ -14,5 +14,3 @@ tokio = { workspace = true, features = ["rt", "rt-multi-thread"] }
 tracing.workspace = true
 tracing-opentelemetry.workspace = true
 tracing-subscriber.workspace = true
-
-workspace_hack.workspace = true
diff --git a/libs/utils/Cargo.toml b/libs/utils/Cargo.toml
index ec05f849cf..6e593eeac1 100644
--- a/libs/utils/Cargo.toml
+++ b/libs/utils/Cargo.toml
@@ -39,7 +39,7 @@ thiserror.workspace = true
 tokio.workspace = true
 tokio-tar.workspace = true
 tokio-util.workspace = true
-toml_edit.workspace = true
+toml_edit = { workspace = true, features = ["serde"] }
 tracing.workspace = true
 tracing-error.workspace = true
 tracing-subscriber = { workspace = true, features = ["json", "registry"] }
@@ -54,7 +54,6 @@ walkdir.workspace = true
 pq_proto.workspace = true
 postgres_connection.workspace = true
 metrics.workspace = true
-workspace_hack.workspace = true
 
 const_format.workspace = true
 
@@ -71,6 +70,7 @@ criterion.workspace = true
 hex-literal.workspace = true
 camino-tempfile.workspace = true
 serde_assert.workspace = true
+tokio = { workspace = true, features = ["test-util"] }
 
 [[bench]]
 name = "benchmarks"
diff --git a/libs/walproposer/Cargo.toml b/libs/walproposer/Cargo.toml
index 73aa073c44..2d442dc429 100644
--- a/libs/walproposer/Cargo.toml
+++ b/libs/walproposer/Cargo.toml
@@ -9,8 +9,6 @@ anyhow.workspace = true
 utils.workspace = true
 postgres_ffi.workspace = true
 
-workspace_hack.workspace = true
-
 [build-dependencies]
 anyhow.workspace = true
 bindgen.workspace = true
diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml
index 2d9b372654..20693ad63d 100644
--- a/workspace_hack/Cargo.toml
+++ b/workspace_hack/Cargo.toml
@@ -80,8 +80,6 @@ time = { version = "0.3", features = ["macros", "serde-well-known"] }
 tokio = { version = "1", features = ["fs", "io-std", "io-util", "macros", "net", "process", "rt-multi-thread", "signal", "test-util"] }
 tokio-rustls = { version = "0.24" }
 tokio-util = { version = "0.7", features = ["codec", "compat", "io", "rt"] }
-toml_datetime = { version = "0.6", default-features = false, features = ["serde"] }
-toml_edit = { version = "0.19", features = ["serde"] }
 tonic = { version = "0.9", features = ["tls-roots"] }
 tower = { version = "0.4", default-features = false, features = ["balance", "buffer", "limit", "log", "timeout", "util"] }
 tracing = { version = "0.1", features = ["log"] }
@@ -124,7 +122,6 @@ serde = { version = "1", features = ["alloc", "derive"] }
 syn-dff4ba8e3ae991db = { package = "syn", version = "1", features = ["extra-traits", "full", "visit"] }
 syn-f595c2ba2a3f28df = { package = "syn", version = "2", features = ["extra-traits", "fold", "full", "visit", "visit-mut"] }
 time-macros = { version = "0.2", default-features = false, features = ["formatting", "parsing", "serde"] }
-toml_datetime = { version = "0.6", default-features = false, features = ["serde"] }
 zstd = { version = "0.13" }
 zstd-safe = { version = "7", default-features = false, features = ["arrays", "legacy", "std", "zdict_builder"] }
 zstd-sys = { version = "2", default-features = false, features = ["legacy", "std", "zdict_builder"] }

From f4b3c317f394cb7f82c8c52754b290903957e85d Mon Sep 17 00:00:00 2001
From: Tristan Partin <tristan@neon.tech>
Date: Mon, 19 Aug 2024 16:34:04 -0500
Subject: [PATCH 39/44] Add compute_logical_snapshot_files metric

Track the number of logical snapshot files on an endpoint over time.

Signed-off-by: Tristan Partin <tristan@neon.tech>
---
 vm-image-spec.yaml | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/vm-image-spec.yaml b/vm-image-spec.yaml
index 41d6e11725..8c1c4512b4 100644
--- a/vm-image-spec.yaml
+++ b/vm-image-spec.yaml
@@ -312,6 +312,22 @@ files:
         query: |
           SELECT checkpoints_timed FROM pg_stat_bgwriter;
 
+      - metric_name: compute_logical_snapshot_files
+        type: gauge
+        help: 'Number of snapshot files in pg_logical/snapshot'
+        key_labels:
+          - tenant_id
+          - timeline_id
+        values: [num_logical_snapshot_files]
+        query: |
+          SELECT
+            (SELECT setting FROM pg_settings WHERE name = 'neon.tenant_id') AS tenant_id,
+            (SELECT setting FROM pg_settings WHERE name = 'neon.timeline_id') AS timeline_id,
+            -- Postgres creates temporary snapshot files of the form %X-%X.snap.%d.tmp. These
+            -- temporary snapshot files are renamed to the actual snapshot files after they are
+            -- completely built. We only WAL-log the completely built snapshot files.
+            (SELECT COUNT(*) FROM pg_ls_logicalsnapdir() WHERE name LIKE '%.snap') AS num_logical_snapshot_files;
+
       # In all the below metrics, we cast LSNs to floats because Prometheus only supports floats.
       # It's probably fine because float64 can store integers from -2^53 to +2^53 exactly.
 

From d919770c55b2a70fd0b19c888d3673b6fef2f889 Mon Sep 17 00:00:00 2001
From: Arseny Sher <sher-ars@yandex.ru>
Date: Wed, 21 Aug 2024 17:30:42 +0300
Subject: [PATCH 40/44] safekeeper: add listing timelines

Adds endpoint GET /v1/tenant/timeline listing all non-deleted timelines.
---
 safekeeper/src/http/routes.rs            | 13 +++++++++++++
 test_runner/fixtures/common_types.py     | 15 ++++++++++++++-
 test_runner/fixtures/safekeeper/http.py  |  8 +++++++-
 test_runner/regress/test_wal_acceptor.py |  4 ++++
 4 files changed, 38 insertions(+), 2 deletions(-)

diff --git a/safekeeper/src/http/routes.rs b/safekeeper/src/http/routes.rs
index c9defb0bcf..d11815f6ef 100644
--- a/safekeeper/src/http/routes.rs
+++ b/safekeeper/src/http/routes.rs
@@ -114,6 +114,16 @@ fn check_permission(request: &Request<Body>, tenant_id: Option<TenantId>) -> Res
     })
 }
 
+/// List all (not deleted) timelines.
+async fn timeline_list_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
+    check_permission(&request, None)?;
+    let res: Vec<TenantTimelineId> = GlobalTimelines::get_all()
+        .iter()
+        .map(|tli| tli.ttid)
+        .collect();
+    json_response(StatusCode::OK, res)
+}
+
 /// Report info about timeline.
 async fn timeline_status_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
     let ttid = TenantTimelineId::new(
@@ -562,6 +572,9 @@ pub fn make_router(conf: SafeKeeperConf) -> RouterBuilder<hyper::Body, ApiError>
         .post("/v1/tenant/timeline", |r| {
             request_span(r, timeline_create_handler)
         })
+        .get("/v1/tenant/timeline", |r| {
+            request_span(r, timeline_list_handler)
+        })
         .get("/v1/tenant/:tenant_id/timeline/:timeline_id", |r| {
             request_span(r, timeline_status_handler)
         })
diff --git a/test_runner/fixtures/common_types.py b/test_runner/fixtures/common_types.py
index b63dfd4e47..7cadcbb4c2 100644
--- a/test_runner/fixtures/common_types.py
+++ b/test_runner/fixtures/common_types.py
@@ -1,7 +1,7 @@
 import random
 from dataclasses import dataclass
 from functools import total_ordering
-from typing import Any, Type, TypeVar, Union
+from typing import Any, Dict, Type, TypeVar, Union
 
 T = TypeVar("T", bound="Id")
 
@@ -147,6 +147,19 @@ class TimelineId(Id):
         return self.id.hex()
 
 
+@dataclass
+class TenantTimelineId:
+    tenant_id: TenantId
+    timeline_id: TimelineId
+
+    @classmethod
+    def from_json(cls, d: Dict[str, Any]) -> "TenantTimelineId":
+        return TenantTimelineId(
+            tenant_id=TenantId(d["tenant_id"]),
+            timeline_id=TimelineId(d["timeline_id"]),
+        )
+
+
 # Workaround for compat with python 3.9, which does not have `typing.Self`
 TTenantShardId = TypeVar("TTenantShardId", bound="TenantShardId")
 
diff --git a/test_runner/fixtures/safekeeper/http.py b/test_runner/fixtures/safekeeper/http.py
index a51b89744b..dd3a0a3d54 100644
--- a/test_runner/fixtures/safekeeper/http.py
+++ b/test_runner/fixtures/safekeeper/http.py
@@ -5,7 +5,7 @@ from typing import Any, Dict, List, Optional, Tuple, Union
 import pytest
 import requests
 
-from fixtures.common_types import Lsn, TenantId, TimelineId
+from fixtures.common_types import Lsn, TenantId, TenantTimelineId, TimelineId
 from fixtures.log_helper import log
 from fixtures.metrics import Metrics, MetricsGetter, parse_metrics
 
@@ -144,6 +144,12 @@ class SafekeeperHttpClient(requests.Session, MetricsGetter):
         assert isinstance(res_json, dict)
         return res_json
 
+    def timeline_list(self) -> List[TenantTimelineId]:
+        res = self.get(f"http://localhost:{self.port}/v1/tenant/timeline")
+        res.raise_for_status()
+        resj = res.json()
+        return [TenantTimelineId.from_json(ttidj) for ttidj in resj]
+
     def timeline_create(
         self,
         tenant_id: TenantId,
diff --git a/test_runner/regress/test_wal_acceptor.py b/test_runner/regress/test_wal_acceptor.py
index 5d3b263936..bb3b16f3e1 100644
--- a/test_runner/regress/test_wal_acceptor.py
+++ b/test_runner/regress/test_wal_acceptor.py
@@ -254,6 +254,10 @@ def test_many_timelines(neon_env_builder: NeonEnvBuilder):
     assert max(init_m[2].flush_lsns) <= min(final_m[2].flush_lsns) < middle_lsn
     assert max(init_m[2].commit_lsns) <= min(final_m[2].commit_lsns) < middle_lsn
 
+    # Test timeline_list endpoint.
+    http_cli = env.safekeepers[0].http_client()
+    assert len(http_cli.timeline_list()) == 3
+
 
 # Check that dead minority doesn't prevent the commits: execute insert n_inserts
 # times, with fault_probability chance of getting a wal acceptor down or up

From b83d722369f1cb1d9a55ab8d39c36f30b0886ea4 Mon Sep 17 00:00:00 2001
From: Joonas Koivunen <joonas@neon.tech>
Date: Wed, 21 Aug 2024 19:22:47 +0300
Subject: [PATCH 41/44] test: fix more flaky due to graceful shutdown (#8787)

Going through the list of recent flaky tests, trying to fix those
related to graceful shutdown.

- test_forward_compatibility: flush and wait for uploads to avoid
graceful shutdown
- test_layer_bloating: in the end the endpoint and vanilla are still up
=> immediate shutdown
- test_lagging_sk: pageserver shutdown is not related to the test =>
immediate shutdown
- test_lsn_lease_size: pageserver flushing is not needed => immediate
shutdown

Additionally:
- remove `wait_for_upload` usage from workload fixture

Cc: #8708
Fixes: #8710
---
 test_runner/fixtures/neon_fixtures.py      | 14 +++---------
 test_runner/fixtures/workload.py           |  7 +++---
 test_runner/regress/test_compatibility.py  | 12 ++++------
 test_runner/regress/test_import.py         |  5 ++---
 test_runner/regress/test_layer_bloating.py | 26 +++++++++++++---------
 test_runner/regress/test_tenant_size.py    |  3 +++
 test_runner/regress/test_wal_acceptor.py   |  2 ++
 7 files changed, 34 insertions(+), 35 deletions(-)

diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index 9aa275d343..2bb698f175 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -61,8 +61,6 @@ from fixtures.pageserver.common_types import IndexPartDump, LayerName, parse_lay
 from fixtures.pageserver.http import PageserverHttpClient
 from fixtures.pageserver.utils import (
     wait_for_last_record_lsn,
-    wait_for_upload,
-    wait_for_upload_queue_empty,
 )
 from fixtures.pg_version import PgVersion
 from fixtures.port_distributor import PortDistributor
@@ -5347,9 +5345,7 @@ def last_flush_lsn_upload(
     for tenant_shard_id, pageserver in shards:
         ps_http = pageserver.http_client(auth_token=auth_token)
         wait_for_last_record_lsn(ps_http, tenant_shard_id, timeline_id, last_flush_lsn)
-        # force a checkpoint to trigger upload
-        ps_http.timeline_checkpoint(tenant_shard_id, timeline_id)
-        wait_for_upload(ps_http, tenant_shard_id, timeline_id, last_flush_lsn)
+        ps_http.timeline_checkpoint(tenant_shard_id, timeline_id, wait_until_uploaded=True)
     return last_flush_lsn
 
 
@@ -5434,9 +5430,5 @@ def generate_uploads_and_deletions(
         # ensures that the pageserver is in a fully idle state: there will be no more
         # background ingest, no more uploads pending, and therefore no non-determinism
         # in subsequent actions like pageserver restarts.
-        final_lsn = flush_ep_to_pageserver(env, endpoint, tenant_id, timeline_id, pageserver.id)
-        ps_http.timeline_checkpoint(tenant_id, timeline_id)
-        # Finish uploads
-        wait_for_upload(ps_http, tenant_id, timeline_id, final_lsn)
-        # Finish all remote writes (including deletions)
-        wait_for_upload_queue_empty(ps_http, tenant_id, timeline_id)
+        flush_ep_to_pageserver(env, endpoint, tenant_id, timeline_id, pageserver.id)
+        ps_http.timeline_checkpoint(tenant_id, timeline_id, wait_until_uploaded=True)
diff --git a/test_runner/fixtures/workload.py b/test_runner/fixtures/workload.py
index cc93762175..065a78bf9b 100644
--- a/test_runner/fixtures/workload.py
+++ b/test_runner/fixtures/workload.py
@@ -10,7 +10,7 @@ from fixtures.neon_fixtures import (
     tenant_get_shards,
     wait_for_last_flush_lsn,
 )
-from fixtures.pageserver.utils import wait_for_last_record_lsn, wait_for_upload
+from fixtures.pageserver.utils import wait_for_last_record_lsn
 
 # neon_local doesn't handle creating/modifying endpoints concurrently, so we use a mutex
 # to ensure we don't do that: this enables running lots of Workloads in parallel safely.
@@ -174,8 +174,9 @@ class Workload:
 
                 if upload:
                     # Wait for written data to be uploaded to S3 (force a checkpoint to trigger upload)
-                    ps_http.timeline_checkpoint(tenant_shard_id, self.timeline_id)
-                    wait_for_upload(ps_http, tenant_shard_id, self.timeline_id, last_flush_lsn)
+                    ps_http.timeline_checkpoint(
+                        tenant_shard_id, self.timeline_id, wait_until_uploaded=True
+                    )
                     log.info(f"Churn: waiting for remote LSN {last_flush_lsn}")
                 else:
                     log.info(f"Churn: not waiting for upload, disk LSN {last_flush_lsn}")
diff --git a/test_runner/regress/test_compatibility.py b/test_runner/regress/test_compatibility.py
index 30ff40b7df..de27191945 100644
--- a/test_runner/regress/test_compatibility.py
+++ b/test_runner/regress/test_compatibility.py
@@ -9,14 +9,12 @@ from typing import List, Optional
 
 import pytest
 import toml
-from fixtures.common_types import Lsn, TenantId, TimelineId
+from fixtures.common_types import TenantId, TimelineId
 from fixtures.log_helper import log
-from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder, PgBin
+from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder, PgBin, flush_ep_to_pageserver
 from fixtures.pageserver.http import PageserverApiException
 from fixtures.pageserver.utils import (
     timeline_delete_wait_completed,
-    wait_for_last_record_lsn,
-    wait_for_upload,
 )
 from fixtures.pg_version import PgVersion
 from fixtures.remote_storage import RemoteStorageKind, S3Storage, s3_storage
@@ -122,11 +120,9 @@ def test_create_snapshot(
     timeline_id = dict(snapshot_config["branch_name_mappings"]["main"])[tenant_id]
 
     pageserver_http = env.pageserver.http_client()
-    lsn = Lsn(endpoint.safe_psql("SELECT pg_current_wal_flush_lsn()")[0][0])
 
-    wait_for_last_record_lsn(pageserver_http, tenant_id, timeline_id, lsn)
-    pageserver_http.timeline_checkpoint(tenant_id, timeline_id)
-    wait_for_upload(pageserver_http, tenant_id, timeline_id, lsn)
+    flush_ep_to_pageserver(env, endpoint, tenant_id, timeline_id)
+    pageserver_http.timeline_checkpoint(tenant_id, timeline_id, wait_until_uploaded=True)
 
     env.endpoints.stop_all()
     for sk in env.safekeepers:
diff --git a/test_runner/regress/test_import.py b/test_runner/regress/test_import.py
index 4dae9176b8..4385cfca76 100644
--- a/test_runner/regress/test_import.py
+++ b/test_runner/regress/test_import.py
@@ -18,7 +18,6 @@ from fixtures.neon_fixtures import (
 from fixtures.pageserver.utils import (
     timeline_delete_wait_completed,
     wait_for_last_record_lsn,
-    wait_for_upload,
 )
 from fixtures.remote_storage import RemoteStorageKind
 from fixtures.utils import assert_pageserver_backups_equal, subprocess_capture
@@ -144,7 +143,7 @@ def test_import_from_vanilla(test_output_dir, pg_bin, vanilla_pg, neon_env_build
 
     # Wait for data to land in s3
     wait_for_last_record_lsn(client, tenant, timeline, Lsn(end_lsn))
-    wait_for_upload(client, tenant, timeline, Lsn(end_lsn))
+    client.timeline_checkpoint(tenant, timeline, compact=False, wait_until_uploaded=True)
 
     # Check it worked
     endpoint = env.endpoints.create_start(branch_name, tenant_id=tenant)
@@ -290,7 +289,7 @@ def _import(
 
     # Wait for data to land in s3
     wait_for_last_record_lsn(client, tenant, timeline, lsn)
-    wait_for_upload(client, tenant, timeline, lsn)
+    client.timeline_checkpoint(tenant, timeline, compact=False, wait_until_uploaded=True)
 
     # Check it worked
     endpoint = env.endpoints.create_start(branch_name, tenant_id=tenant, lsn=lsn)
diff --git a/test_runner/regress/test_layer_bloating.py b/test_runner/regress/test_layer_bloating.py
index 77dc8a35b5..b8126395fd 100644
--- a/test_runner/regress/test_layer_bloating.py
+++ b/test_runner/regress/test_layer_bloating.py
@@ -1,27 +1,31 @@
 import os
-import time
 
 import pytest
 from fixtures.log_helper import log
 from fixtures.neon_fixtures import (
-    NeonEnv,
+    NeonEnvBuilder,
     logical_replication_sync,
     wait_for_last_flush_lsn,
 )
 from fixtures.pg_version import PgVersion
 
 
-def test_layer_bloating(neon_simple_env: NeonEnv, vanilla_pg):
-    env = neon_simple_env
-
-    if env.pg_version != PgVersion.V16:
+def test_layer_bloating(neon_env_builder: NeonEnvBuilder, vanilla_pg):
+    if neon_env_builder.pg_version != PgVersion.V16:
         pytest.skip("pg_log_standby_snapshot() function is available only in PG16")
 
-    timeline = env.neon_cli.create_branch("test_logical_replication", "empty")
-    endpoint = env.endpoints.create_start(
-        "test_logical_replication", config_lines=["log_statement=all"]
+    env = neon_env_builder.init_start(
+        initial_tenant_conf={
+            "gc_period": "0s",
+            "compaction_period": "0s",
+            "compaction_threshold": 99999,
+            "image_creation_threshold": 99999,
+        }
     )
 
+    timeline = env.initial_timeline
+    endpoint = env.endpoints.create_start("main", config_lines=["log_statement=all"])
+
     pg_conn = endpoint.connect()
     cur = pg_conn.cursor()
 
@@ -54,7 +58,7 @@ def test_layer_bloating(neon_simple_env: NeonEnv, vanilla_pg):
     # Wait logical replication to sync
     logical_replication_sync(vanilla_pg, endpoint)
     wait_for_last_flush_lsn(env, endpoint, env.initial_tenant, timeline)
-    time.sleep(10)
+    env.pageserver.http_client().timeline_checkpoint(env.initial_tenant, timeline, compact=False)
 
     # Check layer file sizes
     timeline_path = f"{env.pageserver.workdir}/tenants/{env.initial_tenant}/timelines/{timeline}/"
@@ -63,3 +67,5 @@ def test_layer_bloating(neon_simple_env: NeonEnv, vanilla_pg):
         if filename.startswith("00000"):
             log.info(f"layer {filename} size is {os.path.getsize(timeline_path + filename)}")
             assert os.path.getsize(timeline_path + filename) < 512_000_000
+
+    env.stop(immediate=True)
diff --git a/test_runner/regress/test_tenant_size.py b/test_runner/regress/test_tenant_size.py
index b1ade77a14..f872116a1c 100644
--- a/test_runner/regress/test_tenant_size.py
+++ b/test_runner/regress/test_tenant_size.py
@@ -757,6 +757,9 @@ def test_lsn_lease_size(neon_env_builder: NeonEnvBuilder, test_output_dir: Path,
 
     assert_size_approx_equal_for_lease_test(lease_res, ro_branch_res)
 
+    # we are writing a lot, and flushing all of that to disk is not important for this test
+    env.stop(immediate=True)
+
 
 def insert_with_action(
     env: NeonEnv,
diff --git a/test_runner/regress/test_wal_acceptor.py b/test_runner/regress/test_wal_acceptor.py
index bb3b16f3e1..19df834b81 100644
--- a/test_runner/regress/test_wal_acceptor.py
+++ b/test_runner/regress/test_wal_acceptor.py
@@ -1300,6 +1300,8 @@ def test_lagging_sk(neon_env_builder: NeonEnvBuilder):
     # Check that WALs are the same.
     cmp_sk_wal([sk1, sk2, sk3], tenant_id, timeline_id)
 
+    env.stop(immediate=True)
+
 
 # Smaller version of test_one_sk_down testing peer recovery in isolation: that
 # it works without compute at all.

From 99c19cad24b5bb5974403a1e2541fe28ac4c0d53 Mon Sep 17 00:00:00 2001
From: Tristan Partin <tristan@neon.tech>
Date: Fri, 16 Aug 2024 12:44:12 -0500
Subject: [PATCH 42/44] Add compute_receive_lsn metric

Useful for dashboarding the replication metrics of a single endpoint.

Signed-off-by: Tristan Partin <tristan@neon.tech>
---
 vm-image-spec.yaml | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/vm-image-spec.yaml b/vm-image-spec.yaml
index 8c1c4512b4..d1cfbda15d 100644
--- a/vm-image-spec.yaml
+++ b/vm-image-spec.yaml
@@ -272,6 +272,19 @@ files:
               else (pg_current_wal_lsn() - '0/0')::FLOAT8
             end as lsn;
 
+      - metric_name: compute_receive_lsn
+        type: gauge
+        help: 'Returns the last write-ahead log location that has been received and synced to disk by streaming replication'
+        key_labels:
+        values: [lsn]
+        query: |
+          SELECT
+            CASE
+              WHEN pg_catalog.pg_is_in_recovery()
+              THEN (pg_last_wal_receive_lsn() - '0/0')::FLOAT8
+              ELSE 0
+            END AS lsn;
+
       - metric_name: replication_delay_bytes
         type: gauge
         help: 'Bytes between received and replayed LSN'

From 04752dfa757472062cb70f0fa1fa2e5ccff89225 Mon Sep 17 00:00:00 2001
From: Tristan Partin <tristan@neon.tech>
Date: Wed, 21 Aug 2024 11:15:18 -0500
Subject: [PATCH 43/44] Prefix current_lsn with compute_

---
 vm-image-spec.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vm-image-spec.yaml b/vm-image-spec.yaml
index d1cfbda15d..622004b931 100644
--- a/vm-image-spec.yaml
+++ b/vm-image-spec.yaml
@@ -259,7 +259,7 @@ files:
           from
             (values ('5m'),('15m'),('1h')) as t (x);
 
-      - metric_name: current_lsn
+      - metric_name: compute_current_lsn
         type: gauge
         help: 'Current LSN of the database'
         key_labels:

From 07b7c63975fbfaf60f28176b275c4d57e28a8e04 Mon Sep 17 00:00:00 2001
From: Joonas Koivunen <joonas@neon.tech>
Date: Wed, 21 Aug 2024 21:26:27 +0300
Subject: [PATCH 44/44] test: avoid some too long shutdowns by flushing before
 shutdown (#8772)

After #8655, we needed to mark some tests to shut down immediately. To
aid these tests, try the new pattern of `flush_ep_to_pageserver`
followed by a non-compacting checkpoint. This moves the general graceful
shutdown problem of having too much to flush at shutdown into the test.
Also, log how long the graceful shutdown took whenever it runs to
completion, so the duration is easy to spot when eyeballing the logs.

Fixes: #8712
Cc: #8715, #8708
---
 pageserver/src/lib.rs                         |  7 +++++-
 .../pagebench/test_ondemand_download_churn.py | 17 +++++++-------
 test_runner/performance/test_layer_map.py     | 23 +++++++++++--------
 test_runner/regress/test_combocid.py          | 20 +++++++++++++---
 4 files changed, 45 insertions(+), 22 deletions(-)

diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs
index 5829a1c188..dbfc9f3544 100644
--- a/pageserver/src/lib.rs
+++ b/pageserver/src/lib.rs
@@ -88,6 +88,8 @@ pub async fn shutdown_pageserver(
 ) {
     use std::time::Duration;
 
+    let started_at = std::time::Instant::now();
+
     // If the orderly shutdown below takes too long, we still want to make
     // sure that all walredo processes are killed and wait()ed on by us, not systemd.
     //
@@ -241,7 +243,10 @@ pub async fn shutdown_pageserver(
     walredo_extraordinary_shutdown_thread.join().unwrap();
     info!("walredo_extraordinary_shutdown_thread done");
 
-    info!("Shut down successfully completed");
+    info!(
+        elapsed_ms = started_at.elapsed().as_millis(),
+        "Shut down successfully completed"
+    );
     std::process::exit(exit_code);
 }
 
diff --git a/test_runner/performance/pageserver/pagebench/test_ondemand_download_churn.py b/test_runner/performance/pageserver/pagebench/test_ondemand_download_churn.py
index 0348b08f04..9ad6e7907c 100644
--- a/test_runner/performance/pageserver/pagebench/test_ondemand_download_churn.py
+++ b/test_runner/performance/pageserver/pagebench/test_ondemand_download_churn.py
@@ -5,8 +5,12 @@ from typing import Any, Dict, Tuple
 import pytest
 from fixtures.benchmark_fixture import MetricReport, NeonBenchmarker
 from fixtures.log_helper import log
-from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder, PgBin, wait_for_last_flush_lsn
-from fixtures.pageserver.utils import wait_for_upload_queue_empty
+from fixtures.neon_fixtures import (
+    NeonEnv,
+    NeonEnvBuilder,
+    PgBin,
+    flush_ep_to_pageserver,
+)
 from fixtures.remote_storage import s3_storage
 from fixtures.utils import humantime_to_ms
 
@@ -62,9 +66,6 @@ def test_download_churn(
 
     run_benchmark(env, pg_bin, record, io_engine, concurrency_per_target, duration)
 
-    # see https://github.com/neondatabase/neon/issues/8712
-    env.stop(immediate=True)
-
 
 def setup_env(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
     remote_storage_kind = s3_storage()
@@ -98,9 +99,9 @@ def setup_env(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
             f"INSERT INTO data SELECT lpad(i::text, {bytes_per_row}, '0') FROM generate_series(1, {int(nrows)})  as i",
             options="-c statement_timeout=0",
         )
-        wait_for_last_flush_lsn(env, ep, tenant_id, timeline_id)
-    # TODO: this is a bit imprecise, there could be frozen layers being written out that we don't observe here
-    wait_for_upload_queue_empty(client, tenant_id, timeline_id)
+        flush_ep_to_pageserver(env, ep, tenant_id, timeline_id)
+
+    client.timeline_checkpoint(tenant_id, timeline_id, compact=False, wait_until_uploaded=True)
 
     return env
 
diff --git a/test_runner/performance/test_layer_map.py b/test_runner/performance/test_layer_map.py
index 890b70b9fc..bc6d9de346 100644
--- a/test_runner/performance/test_layer_map.py
+++ b/test_runner/performance/test_layer_map.py
@@ -1,20 +1,21 @@
 import time
 
-from fixtures.neon_fixtures import NeonEnvBuilder
+from fixtures.neon_fixtures import NeonEnvBuilder, flush_ep_to_pageserver
 
 
-#
-# Benchmark searching the layer map, when there are a lot of small layer files.
-#
 def test_layer_map(neon_env_builder: NeonEnvBuilder, zenbenchmark):
-    env = neon_env_builder.init_start()
+    """Benchmark searching the layer map, when there are a lot of small layer files."""
+
+    env = neon_env_builder.init_configs()
     n_iters = 10
     n_records = 100000
 
+    env.start()
+
     # We want to have a lot of lot of layer files to exercise the layer map. Disable
     # GC, and make checkpoint_distance very small, so that we get a lot of small layer
     # files.
-    tenant, _ = env.neon_cli.create_tenant(
+    tenant, timeline = env.neon_cli.create_tenant(
         conf={
             "gc_period": "0s",
             "checkpoint_distance": "16384",
@@ -24,8 +25,7 @@ def test_layer_map(neon_env_builder: NeonEnvBuilder, zenbenchmark):
         }
     )
 
-    env.neon_cli.create_timeline("test_layer_map", tenant_id=tenant)
-    endpoint = env.endpoints.create_start("test_layer_map", tenant_id=tenant)
+    endpoint = env.endpoints.create_start("main", tenant_id=tenant)
     cur = endpoint.connect().cursor()
     cur.execute("create table t(x integer)")
     for _ in range(n_iters):
@@ -33,9 +33,12 @@ def test_layer_map(neon_env_builder: NeonEnvBuilder, zenbenchmark):
         time.sleep(1)
 
     cur.execute("vacuum t")
+
     with zenbenchmark.record_duration("test_query"):
         cur.execute("SELECT count(*) from t")
         assert cur.fetchone() == (n_iters * n_records,)
 
-    # see https://github.com/neondatabase/neon/issues/8712
-    env.stop(immediate=True)
+    flush_ep_to_pageserver(env, endpoint, tenant, timeline)
+    env.pageserver.http_client().timeline_checkpoint(
+        tenant, timeline, compact=False, wait_until_uploaded=True
+    )
diff --git a/test_runner/regress/test_combocid.py b/test_runner/regress/test_combocid.py
index 6d2567b7ee..41907b1f20 100644
--- a/test_runner/regress/test_combocid.py
+++ b/test_runner/regress/test_combocid.py
@@ -1,4 +1,4 @@
-from fixtures.neon_fixtures import NeonEnvBuilder
+from fixtures.neon_fixtures import NeonEnvBuilder, flush_ep_to_pageserver
 
 
 def do_combocid_op(neon_env_builder: NeonEnvBuilder, op):
@@ -34,7 +34,7 @@ def do_combocid_op(neon_env_builder: NeonEnvBuilder, op):
 
     # Clear the cache, so that we exercise reconstructing the pages
     # from WAL
-    cur.execute("SELECT clear_buffer_cache()")
+    endpoint.clear_shared_buffers()
 
     # Check that the cursor opened earlier still works. If the
     # combocids are not restored correctly, it won't.
@@ -43,6 +43,10 @@ def do_combocid_op(neon_env_builder: NeonEnvBuilder, op):
     assert len(rows) == 500
 
     cur.execute("rollback")
+    flush_ep_to_pageserver(env, endpoint, env.initial_tenant, env.initial_timeline)
+    env.pageserver.http_client().timeline_checkpoint(
+        env.initial_tenant, env.initial_timeline, compact=False, wait_until_uploaded=True
+    )
 
 
 def test_combocid_delete(neon_env_builder: NeonEnvBuilder):
@@ -92,7 +96,7 @@ def test_combocid_multi_insert(neon_env_builder: NeonEnvBuilder):
     cur.execute("delete from t")
     # Clear the cache, so that we exercise reconstructing the pages
     # from WAL
-    cur.execute("SELECT clear_buffer_cache()")
+    endpoint.clear_shared_buffers()
 
     # Check that the cursor opened earlier still works. If the
     # combocids are not restored correctly, it won't.
@@ -102,6 +106,11 @@ def test_combocid_multi_insert(neon_env_builder: NeonEnvBuilder):
 
     cur.execute("rollback")
 
+    flush_ep_to_pageserver(env, endpoint, env.initial_tenant, env.initial_timeline)
+    env.pageserver.http_client().timeline_checkpoint(
+        env.initial_tenant, env.initial_timeline, compact=False, wait_until_uploaded=True
+    )
+
 
 def test_combocid(neon_env_builder: NeonEnvBuilder):
     env = neon_env_builder.init_start()
@@ -137,3 +146,8 @@ def test_combocid(neon_env_builder: NeonEnvBuilder):
     assert cur.rowcount == n_records
 
     cur.execute("rollback")
+
+    flush_ep_to_pageserver(env, endpoint, env.initial_tenant, env.initial_timeline)
+    env.pageserver.http_client().timeline_checkpoint(
+        env.initial_tenant, env.initial_timeline, compact=False, wait_until_uploaded=True
+    )