fix vendor/revisions.json

Bump vendor/postgres
Add test_pg_waldump.py
2026-02-01 01:30:38 +00:00 · 2024-05-01 19:50:14 +01:00 · 2024-05-01 19:50:14 +01:00 · 2024-05-01 19:50:14 +01:00 · 2024-05-01 16:31:59 +00:00 · 2024-05-01 11:44:42 -04:00
52 changed files with 1971 additions and 525 deletions
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1319,6 +1319,7 @@ dependencies = [
 "git-version",
 "hex",
 "humantime",
+ "humantime-serde",
 "hyper 0.14.26",
 "nix 0.27.1",
 "once_cell",
@@ -3184,6 +3185,16 @@ dependencies = [
 "winapi",
 ]

+[[package]]
+name = "nu-ansi-term"
+version = "0.46.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "77a8165726e8236064dbb45459242600304b42a5ea24ee2948e18e023bf7ba84"
+dependencies = [
+ "overload",
+ "winapi",
+]
+
 [[package]]
 name = "num"
 version = "0.4.1"
@@ -3520,6 +3531,12 @@ version = "0.5.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "4030760ffd992bef45b0ae3f10ce1aba99e33464c90d14dd7c039884963ddc7a"

+[[package]]
+name = "overload"
+version = "0.1.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b15813163c1d831bf4a13c3610c05c0d03b39feb07f7e09fa234dac9b15aaf39"
+
 [[package]]
 name = "p256"
 version = "0.11.1"
@@ -5095,8 +5112,11 @@ dependencies = [
 "hex",
 "histogram",
 "itertools",
+ "native-tls",
 "pageserver",
 "pageserver_api",
+ "postgres-native-tls",
+ "postgres_ffi",
 "rand 0.8.5",
 "remote_storage",
 "reqwest",
@@ -5105,8 +5125,10 @@ dependencies = [
 "serde_with",
 "thiserror",
 "tokio",
+ "tokio-postgres",
 "tokio-rustls 0.25.0",
 "tokio-stream",
+ "tokio-util",
 "tracing",
 "tracing-appender",
 "tracing-subscriber",
@@ -6507,6 +6529,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "30a651bc37f915e81f087d86e62a18eec5f79550c7faff886f7090b4ea757c77"
 dependencies = [
 "matchers",
+ "nu-ansi-term",
 "once_cell",
 "regex",
 "serde",
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -180,7 +180,7 @@ tonic = {version = "0.9", features = ["tls", "tls-roots"]}
 tracing = "0.1"
 tracing-error = "0.2.0"
 tracing-opentelemetry = "0.20.0"
-tracing-subscriber = { version = "0.3", default_features = false, features = ["smallvec", "fmt", "tracing-log", "std", "env-filter", "json"] }
+tracing-subscriber = { version = "0.3", default_features = false, features = ["smallvec", "fmt", "tracing-log", "std", "env-filter", "json", "ansi"] }
 twox-hash = { version = "1.6.3", default-features = false }
 url = "2.2"
 urlencoding = "2.1"
--- a/18
+++ b/18
@@ -25,14 +25,16 @@ ifeq ($(UNAME_S),Linux)
 	# Seccomp BPF is only available for Linux
 	PG_CONFIGURE_OPTS += --with-libseccomp
 else ifeq ($(UNAME_S),Darwin)
-	# macOS with brew-installed openssl requires explicit paths
-	# It can be configured with OPENSSL_PREFIX variable
-	OPENSSL_PREFIX ?= $(shell brew --prefix openssl@3)
-	PG_CONFIGURE_OPTS += --with-includes=$(OPENSSL_PREFIX)/include --with-libraries=$(OPENSSL_PREFIX)/lib
-	PG_CONFIGURE_OPTS += PKG_CONFIG_PATH=$(shell brew --prefix icu4c)/lib/pkgconfig
-	# macOS already has bison and flex in the system, but they are old and result in postgres-v14 target failure
-	# brew formulae are keg-only and not symlinked into HOMEBREW_PREFIX, force their usage
-	EXTRA_PATH_OVERRIDES += $(shell brew --prefix bison)/bin/:$(shell brew --prefix flex)/bin/:
+	ifndef DISABLE_HOMEBREW
+		# macOS with brew-installed openssl requires explicit paths
+		# It can be configured with OPENSSL_PREFIX variable
+		OPENSSL_PREFIX ?= $(shell brew --prefix openssl@3)
+		PG_CONFIGURE_OPTS += --with-includes=$(OPENSSL_PREFIX)/include --with-libraries=$(OPENSSL_PREFIX)/lib
+		PG_CONFIGURE_OPTS += PKG_CONFIG_PATH=$(shell brew --prefix icu4c)/lib/pkgconfig
+		# macOS already has bison and flex in the system, but they are old and result in postgres-v14 target failure
+		# brew formulae are keg-only and not symlinked into HOMEBREW_PREFIX, force their usage
+		EXTRA_PATH_OVERRIDES += $(shell brew --prefix bison)/bin/:$(shell brew --prefix flex)/bin/:
+	endif
 endif

 # Use -C option so that when PostgreSQL "make install" installs the
--- a/control_plane/Cargo.toml
+++ b/control_plane/Cargo.toml
@@ -17,6 +17,7 @@ nix.workspace = true
 once_cell.workspace = true
 postgres.workspace = true
 hex.workspace = true
+humantime-serde.workspace = true
 hyper.workspace = true
 regex.workspace = true
 reqwest = { workspace = true, features = ["blocking", "json"] }
--- a/control_plane/src/bin/neon_local.rs
+++ b/control_plane/src/bin/neon_local.rs
@@ -1554,8 +1554,8 @@ fn cli() -> Command {
            Command::new("storage_controller")
                .arg_required_else_help(true)
                .about("Manage storage_controller")
-                .subcommand(Command::new("start").about("Start local pageserver").arg(pageserver_config_args.clone()))
-                .subcommand(Command::new("stop").about("Stop local pageserver")
+                .subcommand(Command::new("start").about("Start storage controller"))
+                .subcommand(Command::new("stop").about("Stop storage controller")
                            .arg(stop_mode_arg.clone()))
        )
        .subcommand(
--- a/control_plane/src/local_env.rs
+++ b/control_plane/src/local_env.rs
@@ -17,6 +17,7 @@ use std::net::Ipv4Addr;
 use std::net::SocketAddr;
 use std::path::{Path, PathBuf};
 use std::process::{Command, Stdio};
+use std::time::Duration;
 use utils::{
    auth::{encode_from_key_file, Claims},
    id::{NodeId, TenantId, TenantTimelineId, TimelineId},
@@ -66,6 +67,10 @@ pub struct LocalEnv {

    pub broker: NeonBroker,

+    // Configuration for the storage controller (1 per neon_local environment)
+    #[serde(default)]
+    pub storage_controller: NeonStorageControllerConf,
+
    /// This Vec must always contain at least one pageserver
    pub pageservers: Vec<PageServerConf>,

@@ -98,6 +103,29 @@ pub struct NeonBroker {
    pub listen_addr: SocketAddr,
 }

+/// Configuration for the storage controller.
+#[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)]
+#[serde(default)]
+pub struct NeonStorageControllerConf {
+    /// Heartbeat timeout before marking a node offline
+    #[serde(with = "humantime_serde")]
+    pub max_unavailable: Duration,
+}
+
+impl NeonStorageControllerConf {
+    // Use a shorter pageserver unavailability interval than the default to speed up tests.
+    const DEFAULT_MAX_UNAVAILABLE_INTERVAL: std::time::Duration =
+        std::time::Duration::from_secs(10);
+}
+
+impl Default for NeonStorageControllerConf {
+    fn default() -> Self {
+        Self {
+            max_unavailable: Self::DEFAULT_MAX_UNAVAILABLE_INTERVAL,
+        }
+    }
+}
+
 // Dummy Default impl to satisfy Deserialize derive.
 impl Default for NeonBroker {
    fn default() -> Self {
--- a/control_plane/src/storage_controller.rs
+++ b/control_plane/src/storage_controller.rs
@@ -1,4 +1,7 @@
-use crate::{background_process, local_env::LocalEnv};
+use crate::{
+    background_process,
+    local_env::{LocalEnv, NeonStorageControllerConf},
+};
 use camino::{Utf8Path, Utf8PathBuf};
 use hyper::Method;
 use pageserver_api::{
@@ -32,15 +35,13 @@ pub struct StorageController {
    public_key: Option<String>,
    postgres_port: u16,
    client: reqwest::Client,
+    config: NeonStorageControllerConf,
 }

 const COMMAND: &str = "storage_controller";

 const STORAGE_CONTROLLER_POSTGRES_VERSION: u32 = 16;

-// Use a shorter pageserver unavailability interval than the default to speed up tests.
-const NEON_LOCAL_MAX_UNAVAILABLE_INTERVAL: std::time::Duration = std::time::Duration::from_secs(10);
-
 #[derive(Serialize, Deserialize)]
 pub struct AttachHookRequest {
    pub tenant_shard_id: TenantShardId,
@@ -135,6 +136,7 @@ impl StorageController {
            client: reqwest::ClientBuilder::new()
                .build()
                .expect("Failed to construct http client"),
+            config: env.storage_controller.clone(),
        }
    }

@@ -272,8 +274,6 @@ impl StorageController {
        // Run migrations on every startup, in case something changed.
        let database_url = self.setup_database().await?;

-        let max_unavailable: humantime::Duration = NEON_LOCAL_MAX_UNAVAILABLE_INTERVAL.into();
-
        let mut args = vec![
            "-l",
            &self.listen,
@@ -283,7 +283,7 @@ impl StorageController {
            "--database-url",
            &database_url,
            "--max-unavailable-interval",
-            &max_unavailable.to_string(),
+            &humantime::Duration::from(self.config.max_unavailable).to_string(),
        ]
        .into_iter()
        .map(|s| s.to_string())
--- a/docs/storage_controller.md
+++ b/docs/storage_controller.md
@@ -30,7 +30,7 @@ The storage controller uses a postgres database to persist a subset of its state
 persist the _relationships_ between them: the attachment state of a tenant's shards to nodes is kept in memory and
 rebuilt on startup.

-The file `[persistence.rs](http://persistence.rs)` contains all the code for accessing the database, and has a large doc comment that goes into more detail about exactly what we persist and why.
+The file `persistence.rs` contains all the code for accessing the database, and has a large doc comment that goes into more detail about exactly what we persist and why.

 The `diesel` crate is used for defining models & migrations.

--- a/libs/pageserver_api/src/keyspace.rs
+++ b/libs/pageserver_api/src/keyspace.rs
@@ -17,6 +17,10 @@ pub struct KeySpace {
    pub ranges: Vec<Range<Key>>,
 }

+/// A wrapper type for sparse keyspaces.
+#[derive(Clone, Debug, Default, PartialEq, Eq)]
+pub struct SparseKeySpace(pub KeySpace);
+
 /// Represents a contiguous half-open range of the keyspace, masked according to a particular
 /// ShardNumber's stripes: within this range of keys, only some "belong" to the current
 /// shard.
@@ -435,10 +439,33 @@ pub struct KeyPartitioning {
    pub parts: Vec<KeySpace>,
 }

+/// Represents a partitioning of the sparse key space.
+#[derive(Clone, Debug, Default)]
+pub struct SparseKeyPartitioning {
+    pub parts: Vec<SparseKeySpace>,
+}
+
 impl KeyPartitioning {
    pub fn new() -> Self {
        KeyPartitioning { parts: Vec::new() }
    }
+
+    /// Convert a key partitioning to a sparse partition.
+    pub fn into_sparse(self) -> SparseKeyPartitioning {
+        SparseKeyPartitioning {
+            parts: self.parts.into_iter().map(SparseKeySpace).collect(),
+        }
+    }
+}
+
+impl SparseKeyPartitioning {
+    /// Note: use this function with caution. Attempting to handle a sparse keyspace in the same way as a dense keyspace
+    /// will cause very long or infinite loops.
+    pub fn into_dense(self) -> KeyPartitioning {
+        KeyPartitioning {
+            parts: self.parts.into_iter().map(|x| x.0).collect(),
+        }
+    }
 }

 ///
--- a/libs/pageserver_api/src/models/partitioning.rs
+++ b/libs/pageserver_api/src/models/partitioning.rs
@@ -1,9 +1,11 @@
 use utils::lsn::Lsn;

+use crate::keyspace::SparseKeySpace;
+
 #[derive(Debug, PartialEq, Eq)]
 pub struct Partitioning {
    pub keys: crate::keyspace::KeySpace,
-
+    pub sparse_keys: crate::keyspace::SparseKeySpace,
    pub at_lsn: Lsn,
 }

@@ -32,6 +34,8 @@ impl serde::Serialize for Partitioning {
        let mut map = serializer.serialize_map(Some(2))?;
        map.serialize_key("keys")?;
        map.serialize_value(&KeySpace(&self.keys))?;
+        map.serialize_key("sparse_keys")?;
+        map.serialize_value(&KeySpace(&self.sparse_keys.0))?;
        map.serialize_key("at_lsn")?;
        map.serialize_value(&WithDisplay(&self.at_lsn))?;
        map.end()
@@ -99,6 +103,7 @@ impl<'a> serde::Deserialize<'a> for Partitioning {
        #[derive(serde::Deserialize)]
        struct De {
            keys: KeySpace,
+            sparse_keys: KeySpace,
            #[serde_as(as = "serde_with::DisplayFromStr")]
            at_lsn: Lsn,
        }
@@ -107,6 +112,7 @@ impl<'a> serde::Deserialize<'a> for Partitioning {
        Ok(Self {
            at_lsn: de.at_lsn,
            keys: de.keys.0,
+            sparse_keys: SparseKeySpace(de.sparse_keys.0),
        })
    }
 }
@@ -133,6 +139,12 @@ mod tests {
                "030000000000000000000000000000000003"
              ]
            ],
+            "sparse_keys": [
+              [
+                "620000000000000000000000000000000000",
+                "620000000000000000000000000000000003"
+              ]
+            ],
            "at_lsn": "0/2240160"
        }
        "#;
--- a/libs/pageserver_api/src/shard.rs
+++ b/libs/pageserver_api/src/shard.rs
@@ -538,24 +538,6 @@ impl ShardIdentity {
        }
    }

-    /// Special case for issue `<https://github.com/neondatabase/neon/issues/7451>`
-    ///
-    /// When we fail to read a forknum block, this function tells us whether we may ignore the error
-    /// as a symptom of that issue.
-    pub fn is_key_buggy_forknum(&self, key: &Key) -> bool {
-        if !is_rel_block_key(key) || key.field5 != INIT_FORKNUM {
-            return false;
-        }
-
-        let mut hash = murmurhash32(key.field4);
-        hash = hash_combine(hash, murmurhash32(key.field6 / self.stripe_size.0));
-        let mapped_shard = ShardNumber((hash % self.count.0 as u32) as u8);
-
-        // The key may be affected by issue #7454: it is an initfork and it would not
-        // have mapped to shard 0 until we fixed that issue.
-        mapped_shard != ShardNumber(0)
-    }
-
    /// Return true if the key should be discarded if found in this shard's
    /// data store, e.g. during compaction after a split.
    ///
--- a/libs/utils/src/seqwait.rs
+++ b/libs/utils/src/seqwait.rs
@@ -2,11 +2,10 @@

 use std::cmp::{Eq, Ordering};
 use std::collections::BinaryHeap;
-use std::fmt::Debug;
 use std::mem;
 use std::sync::Mutex;
 use std::time::Duration;
-use tokio::sync::watch::{channel, Receiver, Sender};
+use tokio::sync::watch::{self, channel};
 use tokio::time::timeout;

 /// An error happened while waiting for a number
@@ -35,23 +34,73 @@ pub trait MonotonicCounter<V> {
    fn cnt_value(&self) -> V;
 }

-/// Internal components of a `SeqWait`
-struct SeqWaitInt<S, V>
+/// Heap of waiters, lowest numbers pop first.
+struct Waiters<V>
 where
-    S: MonotonicCounter<V>,
    V: Ord,
 {
-    waiters: BinaryHeap<Waiter<V>>,
-    current: S,
-    shutdown: bool,
+    heap: BinaryHeap<Waiter<V>>,
+    /// Number of the first waiter in the heap, or None if there are no waiters.
+    status_channel: watch::Sender<Option<V>>,
+}
+
+impl<V> Waiters<V>
+where
+    V: Ord + Copy,
+{
+    fn new() -> Self {
+        Waiters {
+            heap: BinaryHeap::new(),
+            status_channel: channel(None).0,
+        }
+    }
+
+    /// `status_channel` contains the number of the first waiter in the heap.
+    /// This function should be called whenever the waiters heap changes.
+    fn update_status(&self) {
+        let first_waiter = self.heap.peek().map(|w| w.wake_num);
+        let _ = self.status_channel.send_replace(first_waiter);
+    }
+
+    /// Add new waiter to the heap, return a channel that will be notified when the number arrives.
+    fn add(&mut self, num: V) -> watch::Receiver<()> {
+        let (tx, rx) = channel(());
+        self.heap.push(Waiter {
+            wake_num: num,
+            wake_channel: tx,
+        });
+        self.update_status();
+        rx
+    }
+
+    /// Pop all waiters <= num from the heap. Collect channels in a vector,
+    /// so that caller can wake them up.
+    fn pop_leq(&mut self, num: V) -> Vec<watch::Sender<()>> {
+        let mut wake_these = Vec::new();
+        while let Some(n) = self.heap.peek() {
+            if n.wake_num > num {
+                break;
+            }
+            wake_these.push(self.heap.pop().unwrap().wake_channel);
+        }
+        self.update_status();
+        wake_these
+    }
+
+    /// Used on shutdown to efficiently drop all waiters.
+    fn take_all(&mut self) -> BinaryHeap<Waiter<V>> {
+        let heap = mem::take(&mut self.heap);
+        self.update_status();
+        heap
+    }
 }

 struct Waiter<T>
 where
    T: Ord,
 {
-    wake_num: T,              // wake me when this number arrives ...
-    wake_channel: Sender<()>, // ... by sending a message to this channel
+    wake_num: T,                     // wake me when this number arrives ...
+    wake_channel: watch::Sender<()>, // ... by sending a message to this channel
 }

 // BinaryHeap is a max-heap, and we want a min-heap. Reverse the ordering here
@@ -76,6 +125,17 @@ impl<T: Ord> PartialEq for Waiter<T> {

 impl<T: Ord> Eq for Waiter<T> {}

+/// Internal components of a `SeqWait`
+struct SeqWaitInt<S, V>
+where
+    S: MonotonicCounter<V>,
+    V: Ord,
+{
+    waiters: Waiters<V>,
+    current: S,
+    shutdown: bool,
+}
+
 /// A tool for waiting on a sequence number
 ///
 /// This provides a way to wait the arrival of a number.
@@ -108,7 +168,7 @@ where
    /// Create a new `SeqWait`, initialized to a particular number
    pub fn new(starting_num: S) -> Self {
        let internal = SeqWaitInt {
-            waiters: BinaryHeap::new(),
+            waiters: Waiters::new(),
            current: starting_num,
            shutdown: false,
        };
@@ -128,9 +188,8 @@ where
            // Block any future waiters from starting
            internal.shutdown = true;

-            // This will steal the entire waiters map.
-            // When we drop it all waiters will be woken.
-            mem::take(&mut internal.waiters)
+            // Take all waiters to drop them later.
+            internal.waiters.take_all()

            // Drop the lock as we exit this scope.
        };
@@ -196,7 +255,7 @@ where

    /// Register and return a channel that will be notified when a number arrives,
    /// or None, if it has already arrived.
-    fn queue_for_wait(&self, num: V) -> Result<Option<Receiver<()>>, SeqWaitError> {
+    fn queue_for_wait(&self, num: V) -> Result<Option<watch::Receiver<()>>, SeqWaitError> {
        let mut internal = self.internal.lock().unwrap();
        if internal.current.cnt_value() >= num {
            return Ok(None);
@@ -205,12 +264,8 @@ where
            return Err(SeqWaitError::Shutdown);
        }

-        // Create a new channel.
-        let (tx, rx) = channel(());
-        internal.waiters.push(Waiter {
-            wake_num: num,
-            wake_channel: tx,
-        });
+        // Add waiter channel to the queue.
+        let rx = internal.waiters.add(num);
        // Drop the lock as we exit this scope.
        Ok(Some(rx))
    }
@@ -231,16 +286,8 @@ where
            }
            internal.current.cnt_advance(num);

-            // Pop all waiters <= num from the heap. Collect them in a vector, and
-            // wake them up after releasing the lock.
-            let mut wake_these = Vec::new();
-            while let Some(n) = internal.waiters.peek() {
-                if n.wake_num > num {
-                    break;
-                }
-                wake_these.push(internal.waiters.pop().unwrap().wake_channel);
-            }
-            wake_these
+            // Pop all waiters <= num from the heap.
+            internal.waiters.pop_leq(num)
        };

        for tx in wake_these {
@@ -255,6 +302,23 @@ where
    pub fn load(&self) -> S {
        self.internal.lock().unwrap().current
    }
+
+    /// Get a Receiver for the current status.
+    ///
+    /// The current status is the number of the first waiter in the queue,
+    /// or None if there are no waiters.
+    ///
+    /// This receiver will be notified whenever the status changes.
+    /// It is useful for receiving notifications when the first waiter
+    /// starts waiting for a number, or when there are no more waiters left.
+    pub fn status_receiver(&self) -> watch::Receiver<Option<V>> {
+        self.internal
+            .lock()
+            .unwrap()
+            .waiters
+            .status_channel
+            .subscribe()
+    }
 }

 #[cfg(test)]
--- a/pageserver/src/basebackup.rs
+++ b/pageserver/src/basebackup.rs
@@ -10,10 +10,10 @@
 //! This module is responsible for creation of such tarball
 //! from data stored in object storage.
 //!
-use anyhow::{anyhow, bail, ensure, Context};
+use anyhow::{anyhow, Context};
 use bytes::{BufMut, Bytes, BytesMut};
 use fail::fail_point;
-use pageserver_api::key::{key_to_slru_block, rel_block_to_key, Key};
+use pageserver_api::key::{key_to_slru_block, Key};
 use postgres_ffi::pg_constants;
 use std::fmt::Write as FmtWrite;
 use std::time::SystemTime;
@@ -38,6 +38,14 @@ use postgres_ffi::PG_TLI;
 use postgres_ffi::{BLCKSZ, RELSEG_SIZE, WAL_SEGMENT_SIZE};
 use utils::lsn::Lsn;

+#[derive(Debug, thiserror::Error)]
+pub enum BasebackupError {
+    #[error("basebackup pageserver error {0:#}")]
+    Server(#[from] anyhow::Error),
+    #[error("basebackup client error {0:#}")]
+    Client(#[source] io::Error),
+}
+
 /// Create basebackup with non-rel data in it.
 /// Only include relational data if 'full_backup' is true.
 ///
@@ -53,7 +61,7 @@ pub async fn send_basebackup_tarball<'a, W>(
    prev_lsn: Option<Lsn>,
    full_backup: bool,
    ctx: &'a RequestContext,
-) -> anyhow::Result<()>
+) -> Result<(), BasebackupError>
 where
    W: AsyncWrite + Send + Sync + Unpin,
 {
@@ -92,8 +100,10 @@ where

    // Consolidate the derived and the provided prev_lsn values
    let prev_lsn = if let Some(provided_prev_lsn) = prev_lsn {
-        if backup_prev != Lsn(0) {
-            ensure!(backup_prev == provided_prev_lsn);
+        if backup_prev != Lsn(0) && backup_prev != provided_prev_lsn {
+            return Err(BasebackupError::Server(anyhow!(
+                "backup_prev {backup_prev} != provided_prev_lsn {provided_prev_lsn}"
+            )));
        }
        provided_prev_lsn
    } else {
@@ -159,15 +169,26 @@ where
        }
    }

-    async fn add_block(&mut self, key: &Key, block: Bytes) -> anyhow::Result<()> {
+    async fn add_block(&mut self, key: &Key, block: Bytes) -> Result<(), BasebackupError> {
        let (kind, segno, _) = key_to_slru_block(*key)?;

        match kind {
            SlruKind::Clog => {
-                ensure!(block.len() == BLCKSZ as usize || block.len() == BLCKSZ as usize + 8);
+                if !(block.len() == BLCKSZ as usize || block.len() == BLCKSZ as usize + 8) {
+                    return Err(BasebackupError::Server(anyhow!(
+                        "invalid SlruKind::Clog record: block.len()={}",
+                        block.len()
+                    )));
+                }
            }
            SlruKind::MultiXactMembers | SlruKind::MultiXactOffsets => {
-                ensure!(block.len() == BLCKSZ as usize);
+                if block.len() != BLCKSZ as usize {
+                    return Err(BasebackupError::Server(anyhow!(
+                        "invalid {:?} record: block.len()={}",
+                        kind,
+                        block.len()
+                    )));
+                }
            }
        }

@@ -194,12 +215,15 @@ where
        Ok(())
    }

-    async fn flush(&mut self) -> anyhow::Result<()> {
+    async fn flush(&mut self) -> Result<(), BasebackupError> {
        let nblocks = self.buf.len() / BLCKSZ as usize;
        let (kind, segno) = self.current_segment.take().unwrap();
        let segname = format!("{}/{:>04X}", kind.to_str(), segno);
        let header = new_tar_header(&segname, self.buf.len() as u64)?;
-        self.ar.append(&header, self.buf.as_slice()).await?;
+        self.ar
+            .append(&header, self.buf.as_slice())
+            .await
+            .map_err(BasebackupError::Client)?;

        self.total_blocks += nblocks;
        debug!("Added to basebackup slru {} relsize {}", segname, nblocks);
@@ -209,7 +233,7 @@ where
        Ok(())
    }

-    async fn finish(mut self) -> anyhow::Result<()> {
+    async fn finish(mut self) -> Result<(), BasebackupError> {
        let res = if self.current_segment.is_none() || self.buf.is_empty() {
            Ok(())
        } else {
@@ -226,7 +250,7 @@ impl<'a, W> Basebackup<'a, W>
 where
    W: AsyncWrite + Send + Sync + Unpin,
 {
-    async fn send_tarball(mut self) -> anyhow::Result<()> {
+    async fn send_tarball(mut self) -> Result<(), BasebackupError> {
        // TODO include checksum

        let lazy_slru_download = self.timeline.get_lazy_slru_download() && !self.full_backup;
@@ -262,7 +286,8 @@ where
            let slru_partitions = self
                .timeline
                .get_slru_keyspace(Version::Lsn(self.lsn), self.ctx)
-                .await?
+                .await
+                .map_err(|e| BasebackupError::Server(e.into()))?
                .partition(
                    self.timeline.get_shard_identity(),
                    Timeline::MAX_GET_VECTORED_KEYS * BLCKSZ as u64,
@@ -271,10 +296,15 @@ where
            let mut slru_builder = SlruSegmentsBuilder::new(&mut self.ar);

            for part in slru_partitions.parts {
-                let blocks = self.timeline.get_vectored(part, self.lsn, self.ctx).await?;
+                let blocks = self
+                    .timeline
+                    .get_vectored(part, self.lsn, self.ctx)
+                    .await
+                    .map_err(|e| BasebackupError::Server(e.into()))?;

                for (key, block) in blocks {
-                    slru_builder.add_block(&key, block?).await?;
+                    let block = block.map_err(|e| BasebackupError::Server(e.into()))?;
+                    slru_builder.add_block(&key, block).await?;
                }
            }
            slru_builder.finish().await?;
@@ -282,8 +312,11 @@ where

        let mut min_restart_lsn: Lsn = Lsn::MAX;
        // Create tablespace directories
-        for ((spcnode, dbnode), has_relmap_file) in
-            self.timeline.list_dbdirs(self.lsn, self.ctx).await?
+        for ((spcnode, dbnode), has_relmap_file) in self
+            .timeline
+            .list_dbdirs(self.lsn, self.ctx)
+            .await
+            .map_err(|e| BasebackupError::Server(e.into()))?
        {
            self.add_dbdir(spcnode, dbnode, has_relmap_file).await?;

@@ -292,7 +325,8 @@ where
            let rels = self
                .timeline
                .list_rels(spcnode, dbnode, Version::Lsn(self.lsn), self.ctx)
-                .await?;
+                .await
+                .map_err(|e| BasebackupError::Server(e.into()))?;
            for &rel in rels.iter() {
                // Send init fork as main fork to provide well formed empty
                // contents of UNLOGGED relations. Postgres copies it in
@@ -300,20 +334,7 @@ where
                if rel.forknum == INIT_FORKNUM {
                    // I doubt we need _init fork itself, but having it at least
                    // serves as a marker relation is unlogged.
-                    if let Err(_e) = self.add_rel(rel, rel).await {
-                        if self
-                            .timeline
-                            .get_shard_identity()
-                            .is_key_buggy_forknum(&rel_block_to_key(rel, 0x0))
-                        {
-                            // Workaround https://github.com/neondatabase/neon/issues/7451 -- if we have an unlogged relation
-                            // whose INIT_FORKNUM is not correctly on shard zero, then omit it in the basebackup.  This allows
-                            // postgres to start up.  The relation won't work, but it will be possible to DROP TABLE on it and
-                            // recreate.
-                            tracing::warn!("Omitting relation {rel} for issue #7451: drop and recreate this unlogged relation");
-                            continue;
-                        }
-                    };
+                    self.add_rel(rel, rel).await?;
                    self.add_rel(rel, rel.with_forknum(MAIN_FORKNUM)).await?;
                    continue;
                }
@@ -328,7 +349,12 @@ where
                }
            }

-            for (path, content) in self.timeline.list_aux_files(self.lsn, self.ctx).await? {
+            for (path, content) in self
+                .timeline
+                .list_aux_files(self.lsn, self.ctx)
+                .await
+                .map_err(|e| BasebackupError::Server(e.into()))?
+            {
                if path.starts_with("pg_replslot") {
                    let offs = pg_constants::REPL_SLOT_ON_DISK_OFFSETOF_RESTART_LSN;
                    let restart_lsn = Lsn(u64::from_le_bytes(
@@ -359,34 +385,41 @@ where
        for xid in self
            .timeline
            .list_twophase_files(self.lsn, self.ctx)
-            .await?
+            .await
+            .map_err(|e| BasebackupError::Server(e.into()))?
        {
            self.add_twophase_file(xid).await?;
        }

        fail_point!("basebackup-before-control-file", |_| {
-            bail!("failpoint basebackup-before-control-file")
+            Err(BasebackupError::Server(anyhow!(
+                "failpoint basebackup-before-control-file"
+            )))
        });

        // Generate pg_control and bootstrap WAL segment.
        self.add_pgcontrol_file().await?;
-        self.ar.finish().await?;
+        self.ar.finish().await.map_err(BasebackupError::Client)?;
        debug!("all tarred up!");
        Ok(())
    }

    /// Add contents of relfilenode `src`, naming it as `dst`.
-    async fn add_rel(&mut self, src: RelTag, dst: RelTag) -> anyhow::Result<()> {
+    async fn add_rel(&mut self, src: RelTag, dst: RelTag) -> Result<(), BasebackupError> {
        let nblocks = self
            .timeline
            .get_rel_size(src, Version::Lsn(self.lsn), self.ctx)
-            .await?;
+            .await
+            .map_err(|e| BasebackupError::Server(e.into()))?;

        // If the relation is empty, create an empty file
        if nblocks == 0 {
            let file_name = dst.to_segfile_name(0);
            let header = new_tar_header(&file_name, 0)?;
-            self.ar.append(&header, &mut io::empty()).await?;
+            self.ar
+                .append(&header, &mut io::empty())
+                .await
+                .map_err(BasebackupError::Client)?;
            return Ok(());
        }

@@ -401,13 +434,17 @@ where
                let img = self
                    .timeline
                    .get_rel_page_at_lsn(src, blknum, Version::Lsn(self.lsn), self.ctx)
-                    .await?;
+                    .await
+                    .map_err(|e| BasebackupError::Server(e.into()))?;
                segment_data.extend_from_slice(&img[..]);
            }

            let file_name = dst.to_segfile_name(seg as u32);
            let header = new_tar_header(&file_name, segment_data.len() as u64)?;
-            self.ar.append(&header, segment_data.as_slice()).await?;
+            self.ar
+                .append(&header, segment_data.as_slice())
+                .await
+                .map_err(BasebackupError::Client)?;

            seg += 1;
            startblk = endblk;
@@ -427,20 +464,22 @@ where
        spcnode: u32,
        dbnode: u32,
        has_relmap_file: bool,
-    ) -> anyhow::Result<()> {
+    ) -> Result<(), BasebackupError> {
        let relmap_img = if has_relmap_file {
            let img = self
                .timeline
                .get_relmap_file(spcnode, dbnode, Version::Lsn(self.lsn), self.ctx)
-                .await?;
+                .await
+                .map_err(|e| BasebackupError::Server(e.into()))?;

-            ensure!(
-                img.len()
-                    == dispatch_pgversion!(
-                        self.timeline.pg_version,
-                        pgv::bindings::SIZEOF_RELMAPFILE
-                    )
-            );
+            if img.len()
+                != dispatch_pgversion!(self.timeline.pg_version, pgv::bindings::SIZEOF_RELMAPFILE)
+            {
+                return Err(BasebackupError::Server(anyhow!(
+                    "img.len() != SIZE_OF_RELMAPFILE, img.len()={}",
+                    img.len(),
+                )));
+            }

            Some(img)
        } else {
@@ -453,14 +492,20 @@ where
                ver => format!("{ver}\x0A"),
            };
            let header = new_tar_header("PG_VERSION", pg_version_str.len() as u64)?;
-            self.ar.append(&header, pg_version_str.as_bytes()).await?;
+            self.ar
+                .append(&header, pg_version_str.as_bytes())
+                .await
+                .map_err(BasebackupError::Client)?;

            info!("timeline.pg_version {}", self.timeline.pg_version);

            if let Some(img) = relmap_img {
                // filenode map for global tablespace
                let header = new_tar_header("global/pg_filenode.map", img.len() as u64)?;
-                self.ar.append(&header, &img[..]).await?;
+                self.ar
+                    .append(&header, &img[..])
+                    .await
+                    .map_err(BasebackupError::Client)?;
            } else {
                warn!("global/pg_filenode.map is missing");
            }
@@ -479,18 +524,26 @@ where
                && self
                    .timeline
                    .list_rels(spcnode, dbnode, Version::Lsn(self.lsn), self.ctx)
-                    .await?
+                    .await
+                    .map_err(|e| BasebackupError::Server(e.into()))?
                    .is_empty()
            {
                return Ok(());
            }
            // User defined tablespaces are not supported
-            ensure!(spcnode == DEFAULTTABLESPACE_OID);
+            if spcnode != DEFAULTTABLESPACE_OID {
+                return Err(BasebackupError::Server(anyhow!(
+                    "spcnode != DEFAULTTABLESPACE_OID, spcnode={spcnode}"
+                )));
+            }

            // Append dir path for each database
            let path = format!("base/{}", dbnode);
            let header = new_tar_header_dir(&path)?;
-            self.ar.append(&header, &mut io::empty()).await?;
+            self.ar
+                .append(&header, &mut io::empty())
+                .await
+                .map_err(BasebackupError::Client)?;

            if let Some(img) = relmap_img {
                let dst_path = format!("base/{}/PG_VERSION", dbnode);
@@ -500,11 +553,17 @@ where
                    ver => format!("{ver}\x0A"),
                };
                let header = new_tar_header(&dst_path, pg_version_str.len() as u64)?;
-                self.ar.append(&header, pg_version_str.as_bytes()).await?;
+                self.ar
+                    .append(&header, pg_version_str.as_bytes())
+                    .await
+                    .map_err(BasebackupError::Client)?;

                let relmap_path = format!("base/{}/pg_filenode.map", dbnode);
                let header = new_tar_header(&relmap_path, img.len() as u64)?;
-                self.ar.append(&header, &img[..]).await?;
+                self.ar
+                    .append(&header, &img[..])
+                    .await
+                    .map_err(BasebackupError::Client)?;
            }
        };
        Ok(())
@@ -513,11 +572,12 @@ where
    //
    // Extract twophase state files
    //
-    async fn add_twophase_file(&mut self, xid: TransactionId) -> anyhow::Result<()> {
+    async fn add_twophase_file(&mut self, xid: TransactionId) -> Result<(), BasebackupError> {
        let img = self
            .timeline
            .get_twophase_file(xid, self.lsn, self.ctx)
-            .await?;
+            .await
+            .map_err(|e| BasebackupError::Server(e.into()))?;

        let mut buf = BytesMut::new();
        buf.extend_from_slice(&img[..]);
@@ -525,7 +585,10 @@ where
        buf.put_u32_le(crc);
        let path = format!("pg_twophase/{:>08X}", xid);
        let header = new_tar_header(&path, buf.len() as u64)?;
-        self.ar.append(&header, &buf[..]).await?;
+        self.ar
+            .append(&header, &buf[..])
+            .await
+            .map_err(BasebackupError::Client)?;

        Ok(())
    }
@@ -534,24 +597,28 @@ where
    // Add generated pg_control file and bootstrap WAL segment.
    // Also send zenith.signal file with extra bootstrap data.
    //
-    async fn add_pgcontrol_file(&mut self) -> anyhow::Result<()> {
+    async fn add_pgcontrol_file(&mut self) -> Result<(), BasebackupError> {
        // add zenith.signal file
        let mut zenith_signal = String::new();
        if self.prev_record_lsn == Lsn(0) {
            if self.lsn == self.timeline.get_ancestor_lsn() {
-                write!(zenith_signal, "PREV LSN: none")?;
+                write!(zenith_signal, "PREV LSN: none")
+                    .map_err(|e| BasebackupError::Server(e.into()))?;
            } else {
-                write!(zenith_signal, "PREV LSN: invalid")?;
+                write!(zenith_signal, "PREV LSN: invalid")
+                    .map_err(|e| BasebackupError::Server(e.into()))?;
            }
        } else {
-            write!(zenith_signal, "PREV LSN: {}", self.prev_record_lsn)?;
+            write!(zenith_signal, "PREV LSN: {}", self.prev_record_lsn)
+                .map_err(|e| BasebackupError::Server(e.into()))?;
        }
        self.ar
            .append(
                &new_tar_header("zenith.signal", zenith_signal.len() as u64)?,
                zenith_signal.as_bytes(),
            )
-            .await?;
+            .await
+            .map_err(BasebackupError::Client)?;

        let checkpoint_bytes = self
            .timeline
@@ -573,7 +640,10 @@ where

        //send pg_control
        let header = new_tar_header("global/pg_control", pg_control_bytes.len() as u64)?;
-        self.ar.append(&header, &pg_control_bytes[..]).await?;
+        self.ar
+            .append(&header, &pg_control_bytes[..])
+            .await
+            .map_err(BasebackupError::Client)?;

        //send wal segment
        let segno = self.lsn.segment_number(WAL_SEGMENT_SIZE);
@@ -588,8 +658,16 @@ where
            self.lsn,
        )
        .map_err(|e| anyhow!(e).context("Failed generating wal segment"))?;
-        ensure!(wal_seg.len() == WAL_SEGMENT_SIZE);
-        self.ar.append(&header, &wal_seg[..]).await?;
+        if wal_seg.len() != WAL_SEGMENT_SIZE {
+            return Err(BasebackupError::Server(anyhow!(
+                "wal_seg.len() != WAL_SEGMENT_SIZE, wal_seg.len()={}",
+                wal_seg.len()
+            )));
+        }
+        self.ar
+            .append(&header, &wal_seg[..])
+            .await
+            .map_err(BasebackupError::Client)?;
        Ok(())
    }
 }
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -1918,12 +1918,14 @@ async fn timeline_collect_keyspace(
        let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
        let timeline = active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id).await?;
        let at_lsn = at_lsn.unwrap_or_else(|| timeline.get_last_record_lsn());
-        let keys = timeline
+        let (dense_ks, sparse_ks) = timeline
            .collect_keyspace(at_lsn, &ctx)
            .await
            .map_err(|e| ApiError::InternalServerError(e.into()))?;

-        let res = pageserver_api::models::partitioning::Partitioning { keys, at_lsn };
+        // This API is currently used by pagebench. Pagebench will iterate all keys within the keyspace.
+        // Therefore, we split dense/sparse keys in this API.
+        let res = pageserver_api::models::partitioning::Partitioning { keys: dense_ks, sparse_keys: sparse_ks, at_lsn };

        json_response(StatusCode::OK, res)
    }
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
@@ -48,6 +48,7 @@ use utils::{

 use crate::auth::check_permission;
 use crate::basebackup;
+use crate::basebackup::BasebackupError;
 use crate::config::PageServerConf;
 use crate::context::{DownloadBehavior, RequestContext};
 use crate::import_datadir::import_wal_from_tar;
@@ -1236,6 +1237,13 @@ impl PageServerHandler {
    where
        IO: AsyncRead + AsyncWrite + Send + Sync + Unpin,
    {
+        fn map_basebackup_error(err: BasebackupError) -> QueryError {
+            match err {
+                BasebackupError::Client(e) => QueryError::Disconnected(ConnectionError::Io(e)),
+                BasebackupError::Server(e) => QueryError::Other(e),
+            }
+        }
+
        let started = std::time::Instant::now();

        // check that the timeline exists
@@ -1261,7 +1269,8 @@ impl PageServerHandler {
        let lsn_awaited_after = started.elapsed();

        // switch client to COPYOUT
-        pgb.write_message_noflush(&BeMessage::CopyOutResponse)?;
+        pgb.write_message_noflush(&BeMessage::CopyOutResponse)
+            .map_err(QueryError::Disconnected)?;
        self.flush_cancellable(pgb, &timeline.cancel).await?;

        // Send a tarball of the latest layer on the timeline. Compress if not
@@ -1276,7 +1285,8 @@ impl PageServerHandler {
                full_backup,
                ctx,
            )
-            .await?;
+            .await
+            .map_err(map_basebackup_error)?;
        } else {
            let mut writer = pgb.copyout_writer();
            if gzip {
@@ -1297,9 +1307,13 @@ impl PageServerHandler {
                    full_backup,
                    ctx,
                )
-                .await?;
+                .await
+                .map_err(map_basebackup_error)?;
                // shutdown the encoder to ensure the gzip footer is written
-                encoder.shutdown().await?;
+                encoder
+                    .shutdown()
+                    .await
+                    .map_err(|e| QueryError::Disconnected(ConnectionError::Io(e)))?;
            } else {
                basebackup::send_basebackup_tarball(
                    &mut writer,
@@ -1309,11 +1323,13 @@ impl PageServerHandler {
                    full_backup,
                    ctx,
                )
-                .await?;
+                .await
+                .map_err(map_basebackup_error)?;
            }
        }

-        pgb.write_message_noflush(&BeMessage::CopyDone)?;
+        pgb.write_message_noflush(&BeMessage::CopyDone)
+            .map_err(QueryError::Disconnected)?;
        self.flush_cancellable(pgb, &timeline.cancel).await?;

        let basebackup_after = started
--- a/pageserver/src/pgdatadir_mapping.rs
+++ b/pageserver/src/pgdatadir_mapping.rs
@@ -23,6 +23,7 @@ use pageserver_api::key::{
    slru_segment_key_range, slru_segment_size_to_key, twophase_file_key, twophase_key_range,
    AUX_FILES_KEY, CHECKPOINT_KEY, CONTROLFILE_KEY, DBDIR_KEY, TWOPHASEDIR_KEY,
 };
+use pageserver_api::keyspace::SparseKeySpace;
 use pageserver_api::reltag::{BlockNumber, RelTag, SlruKind};
 use postgres_ffi::relfile_utils::{FSM_FORKNUM, VISIBILITYMAP_FORKNUM};
 use postgres_ffi::BLCKSZ;
@@ -730,11 +731,13 @@ impl Timeline {
    /// Get a KeySpace that covers all the Keys that are in use at the given LSN.
    /// Anything that's not listed maybe removed from the underlying storage (from
    /// that LSN forwards).
+    ///
+    /// The return value is (dense keyspace, sparse keyspace).
    pub(crate) async fn collect_keyspace(
        &self,
        lsn: Lsn,
        ctx: &RequestContext,
-    ) -> Result<KeySpace, CollectKeySpaceError> {
+    ) -> Result<(KeySpace, SparseKeySpace), CollectKeySpaceError> {
        // Iterate through key ranges, greedily packing them into partitions
        let mut result = KeySpaceAccum::new();

@@ -806,7 +809,12 @@ impl Timeline {
        if self.get(AUX_FILES_KEY, lsn, ctx).await.is_ok() {
            result.add_key(AUX_FILES_KEY);
        }
-        Ok(result.to_keyspace())
+
+        Ok((
+            result.to_keyspace(),
+            /* AUX sparse key space */
+            SparseKeySpace(KeySpace::single(Key::metadata_aux_key_range())),
+        ))
    }

    /// Get cached size of relation if it not updated after specified LSN
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -3873,6 +3873,7 @@ mod tests {
    use hex_literal::hex;
    use pageserver_api::key::NON_INHERITED_RANGE;
    use pageserver_api::keyspace::KeySpace;
+    use pageserver_api::models::CompactionAlgorithm;
    use rand::{thread_rng, Rng};
    use tests::storage_layer::ValuesReconstructState;
    use tests::timeline::{GetVectoredError, ShutdownMode};
@@ -4512,11 +4513,23 @@ mod tests {
    }

    async fn bulk_insert_compact_gc(
+        timeline: Arc<Timeline>,
+        ctx: &RequestContext,
+        lsn: Lsn,
+        repeat: usize,
+        key_count: usize,
+    ) -> anyhow::Result<()> {
+        let compact = true;
+        bulk_insert_maybe_compact_gc(timeline, ctx, lsn, repeat, key_count, compact).await
+    }
+
+    async fn bulk_insert_maybe_compact_gc(
        timeline: Arc<Timeline>,
        ctx: &RequestContext,
        mut lsn: Lsn,
        repeat: usize,
        key_count: usize,
+        compact: bool,
    ) -> anyhow::Result<()> {
        let mut test_key = Key::from_hex("010000000033333333444444445500000000").unwrap();
        let mut blknum = 0;
@@ -4557,9 +4570,11 @@ mod tests {
                )
                .await?;
            timeline.freeze_and_flush().await?;
-            timeline
-                .compact(&CancellationToken::new(), EnumSet::empty(), ctx)
-                .await?;
+            if compact {
+                timeline
+                    .compact(&CancellationToken::new(), EnumSet::empty(), ctx)
+                    .await?;
+            }
            timeline.gc().await?;
        }

@@ -5042,7 +5057,22 @@ mod tests {

    #[tokio::test]
    async fn test_random_updates() -> anyhow::Result<()> {
-        let harness = TenantHarness::create("test_random_updates")?;
+        let names_algorithms = [
+            ("test_random_updates_legacy", CompactionAlgorithm::Legacy),
+            ("test_random_updates_tiered", CompactionAlgorithm::Tiered),
+        ];
+        for (name, algorithm) in names_algorithms {
+            test_random_updates_algorithm(name, algorithm).await?;
+        }
+        Ok(())
+    }
+
+    async fn test_random_updates_algorithm(
+        name: &'static str,
+        compaction_algorithm: CompactionAlgorithm,
+    ) -> anyhow::Result<()> {
+        let mut harness = TenantHarness::create(name)?;
+        harness.tenant_conf.compaction_algorithm = compaction_algorithm;
        let (tenant, ctx) = harness.load().await;
        let tline = tenant
            .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
@@ -5107,7 +5137,7 @@ mod tests {
                );
            }

-            // Perform a cycle of flush, compact, and GC
+            // Perform a cycle of flush and GC
            let cutoff = tline.get_last_record_lsn();
            tline
                .update_gc_info(
@@ -5119,9 +5149,6 @@ mod tests {
                )
                .await?;
            tline.freeze_and_flush().await?;
-            tline
-                .compact(&CancellationToken::new(), EnumSet::empty(), &ctx)
-                .await?;
            tline.gc().await?;
        }

@@ -5402,19 +5429,36 @@ mod tests {

    #[tokio::test]
    async fn test_read_at_max_lsn() -> anyhow::Result<()> {
-        let harness = TenantHarness::create("test_read_at_max_lsn")?;
+        let names_algorithms = [
+            ("test_read_at_max_lsn_legacy", CompactionAlgorithm::Legacy),
+            ("test_read_at_max_lsn_tiered", CompactionAlgorithm::Tiered),
+        ];
+        for (name, algorithm) in names_algorithms {
+            test_read_at_max_lsn_algorithm(name, algorithm).await?;
+        }
+        Ok(())
+    }
+
+    async fn test_read_at_max_lsn_algorithm(
+        name: &'static str,
+        compaction_algorithm: CompactionAlgorithm,
+    ) -> anyhow::Result<()> {
+        let mut harness = TenantHarness::create(name)?;
+        harness.tenant_conf.compaction_algorithm = compaction_algorithm;
        let (tenant, ctx) = harness.load().await;
        let tline = tenant
            .create_test_timeline(TIMELINE_ID, Lsn(0x08), DEFAULT_PG_VERSION, &ctx)
            .await?;

        let lsn = Lsn(0x10);
-        bulk_insert_compact_gc(tline.clone(), &ctx, lsn, 50, 10000).await?;
+        let compact = false;
+        bulk_insert_maybe_compact_gc(tline.clone(), &ctx, lsn, 50, 10000, compact).await?;

        let test_key = Key::from_hex("010000000033333333444444445500000000").unwrap();
        let read_lsn = Lsn(u64::MAX - 1);

-        assert!(tline.get(test_key, read_lsn, &ctx).await.is_ok());
+        let result = tline.get(test_key, read_lsn, &ctx).await;
+        assert!(result.is_ok(), "result is not Ok: {}", result.unwrap_err());

        Ok(())
    }
--- a/pageserver/src/tenant/layer_map.rs
+++ b/pageserver/src/tenant/layer_map.rs
@@ -916,6 +916,7 @@ mod tests {
        assert_eq!(lhs, rhs);
    }

+    #[cfg(test)]
    fn brute_force_range_search(
        layer_map: &LayerMap,
        key_range: Range<Key>,
--- a/pageserver/src/tenant/mgr.rs
+++ b/pageserver/src/tenant/mgr.rs
@@ -2,6 +2,7 @@
 //! page server.

 use camino::{Utf8DirEntry, Utf8Path, Utf8PathBuf};
+use futures::StreamExt;
 use itertools::Itertools;
 use pageserver_api::key::Key;
 use pageserver_api::models::LocationConfigMode;
@@ -253,17 +254,15 @@ impl TenantsMap {
    }
 }

+/// Precursor to deletion of a tenant dir: we do a fast rename to a tmp path, and then
+/// the slower actual deletion in the background.
+///
 /// This is "safe" in that that it won't leave behind a partially deleted directory
 /// at the original path, because we rename with TEMP_FILE_SUFFIX before starting deleting
 /// the contents.
 ///
 /// This is pageserver-specific, as it relies on future processes after a crash to check
 /// for TEMP_FILE_SUFFIX when loading things.
-async fn safe_remove_tenant_dir_all(path: impl AsRef<Utf8Path>) -> std::io::Result<()> {
-    let tmp_path = safe_rename_tenant_dir(path).await?;
-    fs::remove_dir_all(tmp_path).await
-}
-
 async fn safe_rename_tenant_dir(path: impl AsRef<Utf8Path>) -> std::io::Result<Utf8PathBuf> {
    let parent = path
        .as_ref()
@@ -286,6 +285,28 @@ async fn safe_rename_tenant_dir(path: impl AsRef<Utf8Path>) -> std::io::Result<U
    Ok(tmp_path)
 }

+/// When we have moved a tenant's content to a temporary directory, we may delete it lazily in
+/// the background, and thereby avoid blocking any API requests on this deletion completing.
+fn spawn_background_purge(tmp_path: Utf8PathBuf) {
+    // Although we are cleaning up the tenant, this task is not meant to be bound by the lifetime of the tenant in memory.
+    // After a tenant is detached, there are no more task_mgr tasks for that tenant_id.
+    let task_tenant_id = None;
+
+    task_mgr::spawn(
+        task_mgr::BACKGROUND_RUNTIME.handle(),
+        TaskKind::MgmtRequest,
+        task_tenant_id,
+        None,
+        "tenant_files_delete",
+        false,
+        async move {
+            fs::remove_dir_all(tmp_path.as_path())
+                .await
+                .with_context(|| format!("tenant directory {:?} deletion", tmp_path))
+        },
+    );
+}
+
 static TENANTS: Lazy<std::sync::RwLock<TenantsMap>> =
    Lazy::new(|| std::sync::RwLock::new(TenantsMap::Initializing));

@@ -570,7 +591,11 @@ pub async fn init_tenant_mgr(
    );
    TENANT.startup_scheduled.inc_by(tenant_configs.len() as u64);

-    // Construct `Tenant` objects and start them running
+    // Accumulate futures for writing tenant configs, so that we can execute in parallel
+    let mut config_write_futs = Vec::new();
+
+    // Update the location configs according to the re-attach response and persist them to disk
+    tracing::info!("Updating {} location configs", tenant_configs.len());
    for (tenant_shard_id, location_conf) in tenant_configs {
        let tenant_dir_path = conf.tenant_path(&tenant_shard_id);

@@ -597,18 +622,22 @@ pub async fn init_tenant_mgr(
        const DEFAULT_SECONDARY_CONF: SecondaryLocationConfig =
            SecondaryLocationConfig { warm: true };

-        // Update the location config according to the re-attach response
        if let Some(tenant_modes) = &tenant_modes {
            // We have a generation map: treat it as the authority for whether
            // this tenant is really attached.
            match tenant_modes.get(&tenant_shard_id) {
                None => {
                    info!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), "Detaching tenant, control plane omitted it in re-attach response");
-                    if let Err(e) = safe_remove_tenant_dir_all(&tenant_dir_path).await {
-                        error!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(),
-                            "Failed to remove detached tenant directory '{tenant_dir_path}': {e:?}",
-                        );
-                    }
+
+                    match safe_rename_tenant_dir(&tenant_dir_path).await {
+                        Ok(tmp_path) => {
+                            spawn_background_purge(tmp_path);
+                        }
+                        Err(e) => {
+                            error!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(),
+                            "Failed to move detached tenant directory '{tenant_dir_path}': {e:?}");
+                        }
+                    };

                    // We deleted local content: move on to next tenant, don't try and spawn this one.
                    continue;
@@ -654,8 +683,32 @@ pub async fn init_tenant_mgr(

        // Presence of a generation number implies attachment: attach the tenant
        // if it wasn't already, and apply the generation number.
-        Tenant::persist_tenant_config(conf, &tenant_shard_id, &location_conf).await?;
+        config_write_futs.push(async move {
+            let r = Tenant::persist_tenant_config(conf, &tenant_shard_id, &location_conf).await;
+            (tenant_shard_id, location_conf, r)
+        });
+    }

+    // Execute config writes with concurrency, to avoid bottlenecking on local FS write latency
+    tracing::info!(
+        "Writing {} location config files...",
+        config_write_futs.len()
+    );
+    let config_write_results = futures::stream::iter(config_write_futs)
+        .buffer_unordered(16)
+        .collect::<Vec<_>>()
+        .await;
+
+    tracing::info!(
+        "Spawning {} tenant shard locations...",
+        config_write_results.len()
+    );
+    // For those shards that have live configurations, construct `Tenant` or `SecondaryTenant` objects and start them running
+    for (tenant_shard_id, location_conf, config_write_result) in config_write_results {
+        // Errors writing configs are fatal
+        config_write_result?;
+
+        let tenant_dir_path = conf.tenant_path(&tenant_shard_id);
        let shard_identity = location_conf.shard;
        let slot = match location_conf.mode {
            LocationMode::Attached(attached_conf) => {
@@ -1699,7 +1752,7 @@ impl TenantManager {
        let tmp_path = safe_rename_tenant_dir(&local_tenant_directory)
            .await
            .with_context(|| format!("local tenant directory {local_tenant_directory:?} rename"))?;
-        self.spawn_background_purge(tmp_path);
+        spawn_background_purge(tmp_path);

        fail::fail_point!("shard-split-pre-finish", |_| Err(anyhow::anyhow!(
            "failpoint"
@@ -1854,28 +1907,6 @@ impl TenantManager {
        shutdown_all_tenants0(self.tenants).await
    }

-    /// When we have moved a tenant's content to a temporary directory, we may delete it lazily in
-    /// the background, and thereby avoid blocking any API requests on this deletion completing.
-    fn spawn_background_purge(&self, tmp_path: Utf8PathBuf) {
-        // Although we are cleaning up the tenant, this task is not meant to be bound by the lifetime of the tenant in memory.
-        // After a tenant is detached, there are no more task_mgr tasks for that tenant_id.
-        let task_tenant_id = None;
-
-        task_mgr::spawn(
-            task_mgr::BACKGROUND_RUNTIME.handle(),
-            TaskKind::MgmtRequest,
-            task_tenant_id,
-            None,
-            "tenant_files_delete",
-            false,
-            async move {
-                fs::remove_dir_all(tmp_path.as_path())
-                    .await
-                    .with_context(|| format!("tenant directory {:?} deletion", tmp_path))
-            },
-        );
-    }
-
    pub(crate) async fn detach_tenant(
        &self,
        conf: &'static PageServerConf,
@@ -1892,7 +1923,7 @@ impl TenantManager {
                deletion_queue_client,
            )
            .await?;
-        self.spawn_background_purge(tmp_path);
+        spawn_background_purge(tmp_path);

        Ok(())
    }
--- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs
+++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs
@@ -597,14 +597,17 @@ impl InMemoryLayer {
        }
    }

-    /// Write this frozen in-memory layer to disk.
+    /// Write this frozen in-memory layer to disk. If `key_range` is set, the delta
+    /// layer will only contain keys within that range; `Ok(None)` is returned when
+    /// there are no matching keys.
    ///
    /// Returns a new delta layer with all the same data as this in-memory layer
    pub(crate) async fn write_to_disk(
        &self,
        timeline: &Arc<Timeline>,
        ctx: &RequestContext,
-    ) -> Result<ResidentLayer> {
+        key_range: Option<Range<Key>>,
+    ) -> Result<Option<ResidentLayer>> {
        // Grab the lock in read-mode. We hold it over the I/O, but because this
        // layer is not writeable anymore, no one should be trying to acquire the
        // write lock on it, so we shouldn't block anyone. There's one exception
@@ -618,6 +621,21 @@ impl InMemoryLayer {

        let end_lsn = *self.end_lsn.get().unwrap();

+        let keys: Vec<_> = if let Some(key_range) = key_range {
+            inner
+                .index
+                .iter()
+                .filter(|(k, _)| key_range.contains(k))
+                .map(|(k, m)| (k.to_i128(), m))
+                .collect()
+        } else {
+            inner.index.iter().map(|(k, m)| (k.to_i128(), m)).collect()
+        };
+
+        if keys.is_empty() {
+            return Ok(None);
+        }
+
        let mut delta_layer_writer = DeltaLayerWriter::new(
            self.conf,
            self.timeline_id,
@@ -649,6 +667,6 @@ impl InMemoryLayer {

        // MAX is used here because we identify L0 layers by full key range
        let delta_layer = delta_layer_writer.finish(Key::MAX, timeline).await?;
-        Ok(delta_layer)
+        Ok(Some(delta_layer))
    }
 }
--- a/pageserver/src/tenant/storage_layer/layer.rs
+++ b/pageserver/src/tenant/storage_layer/layer.rs
@@ -401,8 +401,8 @@ impl Layer {
        &self.0.path
    }

-    pub(crate) fn local_path_str(&self) -> &Arc<str> {
-        &self.0.path_str
+    pub(crate) fn debug_str(&self) -> &Arc<str> {
+        &self.0.debug_str
    }

    pub(crate) fn metadata(&self) -> LayerFileMetadata {
@@ -527,8 +527,8 @@ struct LayerInner {
    /// Full path to the file; unclear if this should exist anymore.
    path: Utf8PathBuf,

-    /// String representation of the full path, used for traversal id.
-    path_str: Arc<str>,
+    /// String representation of the layer, used for traversal id.
+    debug_str: Arc<str>,

    desc: PersistentLayerDesc,

@@ -735,7 +735,7 @@ impl LayerInner {

        LayerInner {
            conf,
-            path_str: path.to_string().into(),
+            debug_str: { format!("timelines/{}/{}", timeline.timeline_id, desc.filename()).into() },
            path,
            desc,
            timeline: Arc::downgrade(timeline),
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -17,7 +17,7 @@ use fail::fail_point;
 use once_cell::sync::Lazy;
 use pageserver_api::{
    key::{AUX_FILES_KEY, NON_INHERITED_RANGE},
-    keyspace::KeySpaceAccum,
+    keyspace::{KeySpaceAccum, SparseKeyPartitioning},
    models::{
        CompactionAlgorithm, DownloadRemoteLayersTaskInfo, DownloadRemoteLayersTaskSpawnRequest,
        EvictionPolicy, InMemoryLayerInfo, LayerMapInfo, TimelineState,
@@ -55,7 +55,6 @@ use std::{
    ops::ControlFlow,
 };

-use crate::deletion_queue::DeletionQueueClient;
 use crate::tenant::timeline::logical_size::CurrentLogicalSize;
 use crate::tenant::{
    layer_map::{LayerMap, SearchResult},
@@ -66,6 +65,7 @@ use crate::{
    disk_usage_eviction_task::DiskUsageEvictionInfo,
    pgdatadir_mapping::CollectKeySpaceError,
 };
+use crate::{deletion_queue::DeletionQueueClient, metrics::GetKind};
 use crate::{
    disk_usage_eviction_task::finite_f32,
    tenant::storage_layer::{
@@ -86,7 +86,7 @@ use crate::{
 use crate::config::PageServerConf;
 use crate::keyspace::{KeyPartitioning, KeySpace};
 use crate::metrics::{
-    GetKind, TimelineMetrics, MATERIALIZED_PAGE_CACHE_HIT, MATERIALIZED_PAGE_CACHE_HIT_DIRECT,
+    TimelineMetrics, MATERIALIZED_PAGE_CACHE_HIT, MATERIALIZED_PAGE_CACHE_HIT_DIRECT,
 };
 use crate::pgdatadir_mapping::CalculateLogicalSizeError;
 use crate::tenant::config::TenantConfOpt;
@@ -137,6 +137,25 @@ pub(super) enum FlushLoopState {
    Exited,
 }

+#[derive(Debug, Copy, Clone, PartialEq, Eq)]
+pub enum ImageLayerCreationMode {
+    /// Try to create image layers based on `time_for_new_image_layer`. Used in compaction code path.
+    Try,
+    /// Force creating the image layers if possible. For now, no image layers will be created
+    /// for metadata keys. Used in compaction code path with force flag enabled.
+    Force,
+    /// Initial ingestion of the data; no data should be dropped in this function. This
+    /// means that no metadata keys should be included in the partitions. Used in the
+    /// flush frozen layer code path.
+    Initial,
+}
+
+impl std::fmt::Display for ImageLayerCreationMode {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(f, "{:?}", self)
+    }
+}
+
 /// Wrapper for key range to provide reverse ordering by range length for BinaryHeap
 #[derive(Debug, Clone, PartialEq, Eq)]
 pub(crate) struct Hole {
@@ -317,7 +336,7 @@ pub struct Timeline {
    pub initdb_lsn: Lsn,

    /// When did we last calculate the partitioning?
-    partitioning: tokio::sync::Mutex<(KeyPartitioning, Lsn)>,
+    partitioning: tokio::sync::Mutex<((KeyPartitioning, SparseKeyPartitioning), Lsn)>,

    /// Configuration: how often should the partitioning be recalculated.
    repartition_threshold: u64,
@@ -1234,6 +1253,12 @@ impl Timeline {
        self.last_record_lsn.load()
    }

+    /// Subscribe to callers of wait_lsn(). The value of the channel is None if there are no
+    /// wait_lsn() calls in progress, and Some(Lsn) if there is an active waiter for wait_lsn().
+    pub(crate) fn subscribe_for_wait_lsn_updates(&self) -> watch::Receiver<Option<Lsn>> {
+        self.last_record_lsn.status_receiver()
+    }
+
    pub(crate) fn get_disk_consistent_lsn(&self) -> Lsn {
        self.disk_consistent_lsn.load()
    }
@@ -2104,7 +2129,10 @@ impl Timeline {
                    // initial logical size is 0.
                    LogicalSize::empty_initial()
                },
-                partitioning: tokio::sync::Mutex::new((KeyPartitioning::new(), Lsn(0))),
+                partitioning: tokio::sync::Mutex::new((
+                    (KeyPartitioning::new(), KeyPartitioning::new().into_sparse()),
+                    Lsn(0),
+                )),
                repartition_threshold: 0,
                last_image_layer_creation_check_at: AtomicLsn::new(0),

@@ -2920,7 +2948,7 @@ trait TraversalLayerExt {

 impl TraversalLayerExt for Layer {
    fn traversal_id(&self) -> TraversalId {
-        Arc::clone(self.local_path_str())
+        Arc::clone(self.debug_str())
    }
 }

@@ -3106,7 +3134,6 @@ impl Timeline {
            if let Some(SearchResult { lsn_floor, layer }) = layers.search(key, cont_lsn) {
                let layer = guard.get_from_desc(&layer);
                drop(guard);
-
                // Get all the data needed to reconstruct the page version from this layer.
                // But if we have an older cached page image, no need to go past that.
                let lsn_floor = max(cached_lsn + 1, lsn_floor);
@@ -3227,7 +3254,7 @@ impl Timeline {
        Ok(())
    }

-    /// Collect the reconstruct data for a ketspace from the specified timeline.
+    /// Collect the reconstruct data for a keyspace from the specified timeline.
    ///
    /// Maintain a fringe [`LayerFringe`] which tracks all the layers that intersect
    /// the current keyspace. The current keyspace of the search at any given timeline
@@ -3656,66 +3683,103 @@ impl Timeline {
        // files instead. This is possible as long as *all* the data imported into the
        // repository have the same LSN.
        let lsn_range = frozen_layer.get_lsn_range();
-        let (layers_to_upload, delta_layer_to_add) =
-            if lsn_range.start == self.initdb_lsn && lsn_range.end == Lsn(self.initdb_lsn.0 + 1) {
-                #[cfg(test)]
-                match &mut *self.flush_loop_state.lock().unwrap() {
-                    FlushLoopState::NotStarted | FlushLoopState::Exited => {
-                        panic!("flush loop not running")
-                    }
-                    FlushLoopState::Running {
-                        initdb_optimization_count,
-                        ..
-                    } => {
+
+        // Whether to directly create image layers for this flush, or flush them as delta layers
+        let create_image_layer =
+            lsn_range.start == self.initdb_lsn && lsn_range.end == Lsn(self.initdb_lsn.0 + 1);
+
+        #[cfg(test)]
+        {
+            match &mut *self.flush_loop_state.lock().unwrap() {
+                FlushLoopState::NotStarted | FlushLoopState::Exited => {
+                    panic!("flush loop not running")
+                }
+                FlushLoopState::Running {
+                    expect_initdb_optimization,
+                    initdb_optimization_count,
+                    ..
+                } => {
+                    if create_image_layer {
                        *initdb_optimization_count += 1;
-                    }
-                }
-                // Note: The 'ctx' in use here has DownloadBehavior::Error. We should not
-                // require downloading anything during initial import.
-                let (partitioning, _lsn) = self
-                    .repartition(
-                        self.initdb_lsn,
-                        self.get_compaction_target_size(),
-                        EnumSet::empty(),
-                        ctx,
-                    )
-                    .await?;
-
-                if self.cancel.is_cancelled() {
-                    return Err(FlushLayerError::Cancelled);
-                }
-
-                // For image layers, we add them immediately into the layer map.
-                (
-                    self.create_image_layers(&partitioning, self.initdb_lsn, true, ctx)
-                        .await?,
-                    None,
-                )
-            } else {
-                #[cfg(test)]
-                match &mut *self.flush_loop_state.lock().unwrap() {
-                    FlushLoopState::NotStarted | FlushLoopState::Exited => {
-                        panic!("flush loop not running")
-                    }
-                    FlushLoopState::Running {
-                        expect_initdb_optimization,
-                        ..
-                    } => {
+                    } else {
                        assert!(!*expect_initdb_optimization, "expected initdb optimization");
                    }
                }
-                // Normal case, write out a L0 delta layer file.
-                // `create_delta_layer` will not modify the layer map.
-                // We will remove frozen layer and add delta layer in one atomic operation later.
-                let layer = self.create_delta_layer(&frozen_layer, ctx).await?;
-                (
-                    // FIXME: even though we have a single image and single delta layer assumption
-                    // we push them to vec
-                    vec![layer.clone()],
-                    Some(layer),
+            }
+        }
+
+        let (layers_to_upload, delta_layer_to_add) = if create_image_layer {
+            // Note: The 'ctx' in use here has DownloadBehavior::Error. We should not
+            // require downloading anything during initial import.
+            let ((rel_partition, metadata_partition), _lsn) = self
+                .repartition(
+                    self.initdb_lsn,
+                    self.get_compaction_target_size(),
+                    EnumSet::empty(),
+                    ctx,
                )
+                .await?;
+
+            if self.cancel.is_cancelled() {
+                return Err(FlushLayerError::Cancelled);
+            }
+
+            // For metadata, always create delta layers.
+            let delta_layer = if !metadata_partition.parts.is_empty() {
+                assert_eq!(
+                    metadata_partition.parts.len(),
+                    1,
+                    "currently sparse keyspace should only contain a single aux file keyspace"
+                );
+                let metadata_keyspace = &metadata_partition.parts[0];
+                assert_eq!(
+                    metadata_keyspace.0.ranges.len(),
+                    1,
+                    "aux file keyspace should be a single range"
+                );
+                self.create_delta_layer(
+                    &frozen_layer,
+                    ctx,
+                    Some(metadata_keyspace.0.ranges[0].clone()),
+                )
+                .await?
+            } else {
+                None
            };

+            // For image layers, we add them immediately into the layer map.
+            // `create_image_layers` returns an owned Vec, which doubles as the list of
+            // layers to upload.
+            let mut layers_to_upload = self
+                .create_image_layers(
+                    &rel_partition,
+                    self.initdb_lsn,
+                    ImageLayerCreationMode::Initial,
+                    ctx,
+                )
+                .await?;
+
+            // The metadata delta layer (if one was written) has to be uploaded as well,
+            // and is also handed back separately as the delta layer to add.
+            if let Some(delta_layer) = &delta_layer {
+                layers_to_upload.push(delta_layer.clone());
+            }
+            (layers_to_upload, delta_layer)
+        } else {
+            // Normal case, write out a L0 delta layer file.
+            // `create_delta_layer` will not modify the layer map.
+            // We will remove frozen layer and add delta layer in one atomic operation later.
+            let Some(layer) = self.create_delta_layer(&frozen_layer, ctx, None).await? else {
+                panic!("delta layer cannot be empty if no filter is applied");
+            };
+            (
+                // FIXME: even though we have a single image and single delta layer assumption
+                // we push them to vec
+                vec![layer.clone()],
+                Some(layer),
+            )
+        };
+
        pausable_failpoint!("flush-layer-cancel-after-writing-layer-out-pausable");

        if self.cancel.is_cancelled() {
@@ -3835,12 +3899,18 @@ impl Timeline {
        self: &Arc<Self>,
        frozen_layer: &Arc<InMemoryLayer>,
        ctx: &RequestContext,
-    ) -> anyhow::Result<ResidentLayer> {
+        key_range: Option<Range<Key>>,
+    ) -> anyhow::Result<Option<ResidentLayer>> {
        let self_clone = Arc::clone(self);
        let frozen_layer = Arc::clone(frozen_layer);
        let ctx = ctx.attached_child();
        let work = async move {
-            let new_delta = frozen_layer.write_to_disk(&self_clone, &ctx).await?;
+            let Some(new_delta) = frozen_layer
+                .write_to_disk(&self_clone, &ctx, key_range)
+                .await?
+            else {
+                return Ok(None);
+            };
            // The write_to_disk() above calls writer.finish() which already did the fsync of the inodes.
            // We just need to fsync the directory in which these inodes are linked,
            // which we know to be the timeline directory.
@@ -3859,7 +3929,7 @@ impl Timeline {
                .sync_all()
                .await
                .fatal_err("VirtualFile::sync_all timeline dir");
-            anyhow::Ok(new_delta)
+            anyhow::Ok(Some(new_delta))
        };
        // Before tokio-epoll-uring, we ran write_to_disk & the sync_all inside spawn_blocking.
        // Preserve that behavior to maintain the same behavior for `virtual_file_io_engine=std-fs`.
@@ -3886,19 +3956,20 @@ impl Timeline {
        partition_size: u64,
        flags: EnumSet<CompactFlags>,
        ctx: &RequestContext,
-    ) -> anyhow::Result<(KeyPartitioning, Lsn)> {
+    ) -> anyhow::Result<((KeyPartitioning, SparseKeyPartitioning), Lsn)> {
        let Ok(mut partitioning_guard) = self.partitioning.try_lock() else {
            // NB: there are two callers, one is the compaction task, of which there is only one per struct Tenant and hence Timeline.
            // The other is the initdb optimization in flush_frozen_layer, used by `boostrap_timeline`, which runs before `.activate()`
            // and hence before the compaction task starts.
            anyhow::bail!("repartition() called concurrently, this should not happen");
        };
-        if lsn < partitioning_guard.1 {
+        let ((dense_partition, sparse_partition), partition_lsn) = &*partitioning_guard;
+        if lsn < *partition_lsn {
            anyhow::bail!("repartition() called with LSN going backwards, this should not happen");
        }

-        let distance = lsn.0 - partitioning_guard.1 .0;
-        if partitioning_guard.1 != Lsn(0)
+        let distance = lsn.0 - partition_lsn.0;
+        if *partition_lsn != Lsn(0)
            && distance <= self.repartition_threshold
            && !flags.contains(CompactFlags::ForceRepartition)
        {
@@ -3907,13 +3978,18 @@ impl Timeline {
                threshold = self.repartition_threshold,
                "no repartitioning needed"
            );
-            return Ok((partitioning_guard.0.clone(), partitioning_guard.1));
+            return Ok((
+                (dense_partition.clone(), sparse_partition.clone()),
+                *partition_lsn,
+            ));
        }

-        let keyspace = self.collect_keyspace(lsn, ctx).await?;
-        let partitioning = keyspace.partition(&self.shard_identity, partition_size);
-
-        *partitioning_guard = (partitioning, lsn);
+        let (dense_ks, sparse_ks) = self.collect_keyspace(lsn, ctx).await?;
+        let dense_partitioning = dense_ks.partition(&self.shard_identity, partition_size);
+        let sparse_partitioning = SparseKeyPartitioning {
+            parts: vec![sparse_ks],
+        }; // no partitioning for metadata keys for now
+        *partitioning_guard = ((dense_partitioning, sparse_partitioning), lsn);

        Ok((partitioning_guard.0.clone(), partitioning_guard.1))
    }
@@ -3969,12 +4045,12 @@ impl Timeline {
        false
    }

-    #[tracing::instrument(skip_all, fields(%lsn, %force))]
+    #[tracing::instrument(skip_all, fields(%lsn, %mode))]
    async fn create_image_layers(
        self: &Arc<Timeline>,
        partitioning: &KeyPartitioning,
        lsn: Lsn,
-        force: bool,
+        mode: ImageLayerCreationMode,
        ctx: &RequestContext,
    ) -> Result<Vec<ResidentLayer>, CreateImageLayersError> {
        let timer = self.metrics.create_images_time_histo.start_timer();
@@ -4011,19 +4087,26 @@ impl Timeline {
        for partition in partitioning.parts.iter() {
            let img_range = start..partition.ranges.last().unwrap().end;

-            let do_it = if force {
-                true
-            } else if check_for_image_layers {
-                // [`Self::time_for_new_image_layer`] is CPU expensive,
-                // so skip if we've not collected enough WAL since the last time
-                self.time_for_new_image_layer(partition, lsn).await
-            } else {
-                false
-            };
-
-            if !do_it {
-                start = img_range.end;
-                continue;
+            if partition.overlaps(&Key::metadata_key_range()) {
+                // TODO(chi): The next patch will correctly create image layers for metadata keys, and it would be a
+                // rather big change. Keep this patch small for now.
+                match mode {
+                    ImageLayerCreationMode::Force | ImageLayerCreationMode::Try => {
+                        // skip image layer creation anyways for metadata keys.
+                        start = img_range.end;
+                        continue;
+                    }
+                    ImageLayerCreationMode::Initial => {
+                        return Err(CreateImageLayersError::Other(anyhow::anyhow!("no image layer should be created for metadata keys when flushing frozen layers")));
+                    }
+                }
+            } else if let ImageLayerCreationMode::Try = mode {
+                // check_for_image_layers = false -> skip
+                // check_for_image_layers = true -> check time_for_new_image_layer -> skip/generate
+                if !check_for_image_layers || !self.time_for_new_image_layer(partition, lsn).await {
+                    start = img_range.end;
+                    continue;
+                }
            }

            let mut image_layer_writer = ImageLayerWriter::new(
--- a/pageserver/src/tenant/timeline/compaction.rs
+++ b/pageserver/src/tenant/timeline/compaction.rs
@@ -9,7 +9,7 @@ use std::ops::{Deref, Range};
 use std::sync::Arc;

 use super::layer_manager::LayerManager;
-use super::{CompactFlags, DurationRecorder, RecordedDuration, Timeline};
+use super::{CompactFlags, DurationRecorder, ImageLayerCreationMode, RecordedDuration, Timeline};

 use anyhow::{anyhow, Context};
 use enumset::EnumSet;
@@ -102,7 +102,7 @@ impl Timeline {
            )
            .await
        {
-            Ok((partitioning, lsn)) => {
+            Ok(((dense_partitioning, sparse_partitioning), lsn)) => {
                // Disables access_stats updates, so that the files we read remain candidates for eviction after we're done with them
                let image_ctx = RequestContextBuilder::extend(ctx)
                    .access_stats_behavior(AccessStatsBehavior::Skip)
@@ -115,17 +115,37 @@ impl Timeline {

                // 3. Create new image layers for partitions that have been modified
                // "enough".
-                let layers = self
+                let dense_layers = self
                    .create_image_layers(
-                        &partitioning,
+                        &dense_partitioning,
                        lsn,
-                        flags.contains(CompactFlags::ForceImageLayerCreation),
+                        if flags.contains(CompactFlags::ForceImageLayerCreation) {
+                            ImageLayerCreationMode::Force
+                        } else {
+                            ImageLayerCreationMode::Try
+                        },
                        &image_ctx,
                    )
                    .await
                    .map_err(anyhow::Error::from)?;

-                self.upload_new_image_layers(layers)?;
+                // For now, nothing will be produced for the sparse (metadata) partitioning...
+                let sparse_layers = self
+                    .create_image_layers(
+                        &sparse_partitioning.into_dense(),
+                        lsn,
+                        if flags.contains(CompactFlags::ForceImageLayerCreation) {
+                            ImageLayerCreationMode::Force
+                        } else {
+                            ImageLayerCreationMode::Try
+                        },
+                        &image_ctx,
+                    )
+                    .await
+                    .map_err(anyhow::Error::from)?;
+                assert!(sparse_layers.is_empty());
+
+                self.upload_new_image_layers(dense_layers)?;
            }
            Err(err) => {
                // no partitioning? This is normal, if the timeline was just created
@@ -758,8 +778,9 @@ impl Timeline {
            return Err(CompactionError::ShuttingDown);
        }

-        let keyspace = self.collect_keyspace(end_lsn, ctx).await?;
-        let mut adaptor = TimelineAdaptor::new(self, (end_lsn, keyspace));
+        let (dense_ks, _sparse_ks) = self.collect_keyspace(end_lsn, ctx).await?;
+        // TODO(chi): ignore sparse_keyspace for now, compact it in the future.
+        let mut adaptor = TimelineAdaptor::new(self, (end_lsn, dense_ks));

        pageserver_compaction::compact_tiered::compact_tiered(
            &mut adaptor,
--- a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs
+++ b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs
@@ -22,10 +22,12 @@ use crate::tenant::{debug_assert_current_span_has_tenant_and_timeline_id, Timeli
 use anyhow::Context;
 use chrono::{NaiveDateTime, Utc};
 use pageserver_api::models::TimelineState;
-use storage_broker::proto::subscribe_safekeeper_info_request::SubscriptionKey;
-use storage_broker::proto::SafekeeperTimelineInfo;
-use storage_broker::proto::SubscribeSafekeeperInfoRequest;
+
 use storage_broker::proto::TenantTimelineId as ProtoTenantTimelineId;
+use storage_broker::proto::{
+    FilterTenantTimelineId, MessageType, SafekeeperDiscoveryRequest, SafekeeperDiscoveryResponse,
+    SubscribeByFilterRequest, TypeSubscription, TypedMessage,
+};
 use storage_broker::{BrokerClientChannel, Code, Streaming};
 use tokio_util::sync::CancellationToken;
 use tracing::*;
@@ -89,6 +91,14 @@ pub(super) async fn connection_manager_loop_step(
        .timeline
        .subscribe_for_state_updates();

+    let mut wait_lsn_status = connection_manager_state
+        .timeline
+        .subscribe_for_wait_lsn_updates();
+
+    // TODO: create a separate config option for discovery request interval
+    let discovery_request_interval = connection_manager_state.conf.lagging_wal_timeout;
+    let mut last_discovery_ts: Option<std::time::Instant> = None;
+
    // Subscribe to the broker updates. Stream shares underlying TCP connection
    // with other streams on this client (other connection managers). When
    // object goes out of scope, stream finishes in drop() automatically.
@@ -97,10 +107,12 @@ pub(super) async fn connection_manager_loop_step(

    loop {
        let time_until_next_retry = connection_manager_state.time_until_next_retry();
+        let any_activity = connection_manager_state.wal_connection.is_some()
+            || !connection_manager_state.wal_stream_candidates.is_empty();

        // These things are happening concurrently:
        //
-        // - cancellation request
+        //  - cancellation request
        //  - keep receiving WAL on the current connection
        //      - if the shared state says we need to change connection, disconnect and return
        //      - this runs in a separate task and we receive updates via a watch channel
@@ -108,6 +120,7 @@ pub(super) async fn connection_manager_loop_step(
        //  - receive updates from broker
        //      - this might change the current desired connection
        //  - timeline state changes to something that does not allow walreceiver to run concurrently
+        //  - if there's no connection and no candidates, try to send a discovery request

        // NB: make sure each of the select expressions are cancellation-safe
        // (no need for arms to be cancellation-safe).
@@ -214,6 +227,65 @@ pub(super) async fn connection_manager_loop_step(
                    }
                }
            } => debug!("Waking up for the next retry after waiting for {time_until_next_retry:?}"),
+
+            Some(()) = async {
+                // Reminder: the future of this select! branch needs to be cancellation-safe.
+                // It always resolves to None, so the `Some(())` pattern never matches: the branch
+                // exists only to (conservatively) send discovery requests when nothing else is active.
+
+                if any_activity {
+                    // No need to send discovery requests if there is an active connection or candidates.
+                    return None;
+                }
+
+                // Waiting for an active wait_lsn request.
+                while wait_lsn_status.borrow().is_none() {
+                    if wait_lsn_status.changed().await.is_err() {
+                        // wait_lsn_status channel was closed, exiting
+                        warn!("wait_lsn_status channel was closed in connection_manager_loop_step");
+                        return None;
+                    }
+                }
+
+                // All preconditions met, preparing to send a discovery request.
+                let now = std::time::Instant::now();
+                let next_discovery_ts = last_discovery_ts
+                    .map(|ts| ts + discovery_request_interval)
+                    .unwrap_or(now);
+
+                if next_discovery_ts > now {
+                    // Prevent sending discovery requests too frequently.
+                    tokio::time::sleep(next_discovery_ts - now).await;
+                }
+
+                let tenant_timeline_id = Some(ProtoTenantTimelineId {
+                    tenant_id: id.tenant_id.as_ref().to_owned(),
+                    timeline_id: id.timeline_id.as_ref().to_owned(),
+                });
+                let request = SafekeeperDiscoveryRequest { tenant_timeline_id };
+                let msg = TypedMessage {
+                    r#type: MessageType::SafekeeperDiscoveryRequest as i32,
+                    safekeeper_timeline_info: None,
+                    safekeeper_discovery_request: Some(request),
+                    safekeeper_discovery_response: None,
+                };
+
+                last_discovery_ts = Some(std::time::Instant::now());
+                debug!("No active connection and no candidates, sending discovery request to the broker");
+
+                // Cancellation safety: we want to send a message to the broker, but publish_one()
+                // function can get cancelled by the other select! arm. This is absolutely fine, because
+                // we just want to receive broker updates and discovery is not important if we already
+                // receive updates.
+                //
+                // It is possible that `last_discovery_ts` will be updated, but the message will not be sent.
+                // This is totally fine because of the reason above.
+
+                // This is a fire-and-forget request, we don't care about the response
+                let _ = broker_client.publish_one(msg).await;
+                debug!("Discovery request sent to the broker");
+                None
+            } => {}
        }

        if let Some(new_candidate) = connection_manager_state.next_connection_candidate() {
@@ -231,7 +303,7 @@ async fn subscribe_for_timeline_updates(
    broker_client: &mut BrokerClientChannel,
    id: TenantTimelineId,
    cancel: &CancellationToken,
-) -> Result<Streaming<SafekeeperTimelineInfo>, Cancelled> {
+) -> Result<Streaming<TypedMessage>, Cancelled> {
    let mut attempt = 0;
    loop {
        exponential_backoff(
@@ -244,17 +316,27 @@ async fn subscribe_for_timeline_updates(
        attempt += 1;

        // subscribe to the specific timeline
-        let key = SubscriptionKey::TenantTimelineId(ProtoTenantTimelineId {
-            tenant_id: id.tenant_id.as_ref().to_owned(),
-            timeline_id: id.timeline_id.as_ref().to_owned(),
-        });
-        let request = SubscribeSafekeeperInfoRequest {
-            subscription_key: Some(key),
+        let request = SubscribeByFilterRequest {
+            types: vec![
+                TypeSubscription {
+                    r#type: MessageType::SafekeeperTimelineInfo as i32,
+                },
+                TypeSubscription {
+                    r#type: MessageType::SafekeeperDiscoveryResponse as i32,
+                },
+            ],
+            tenant_timeline_id: Some(FilterTenantTimelineId {
+                enabled: true,
+                tenant_timeline_id: Some(ProtoTenantTimelineId {
+                    tenant_id: id.tenant_id.as_ref().to_owned(),
+                    timeline_id: id.timeline_id.as_ref().to_owned(),
+                }),
+            }),
        };

        match {
            tokio::select! {
-                r = broker_client.subscribe_safekeeper_info(request) => { r }
+                r = broker_client.subscribe_by_filter(request) => { r }
                _ = cancel.cancelled() => { return Err(Cancelled); }
            }
        } {
@@ -398,7 +480,7 @@ struct RetryInfo {
 /// Data about the timeline to connect to, received from the broker.
 #[derive(Debug, Clone)]
 struct BrokerSkTimeline {
-    timeline: SafekeeperTimelineInfo,
+    timeline: SafekeeperDiscoveryResponse,
    /// Time at which the data was fetched from the broker last time, to track the stale data.
    latest_update: NaiveDateTime,
 }
@@ -606,7 +688,41 @@ impl ConnectionManagerState {
    }

    /// Adds another broker timeline into the state, if its more recent than the one already added there for the same key.
-    fn register_timeline_update(&mut self, timeline_update: SafekeeperTimelineInfo) {
+    fn register_timeline_update(&mut self, typed_msg: TypedMessage) {
+        let mut is_discovery = false;
+        let timeline_update = match typed_msg.r#type() {
+            MessageType::SafekeeperTimelineInfo => {
+                let info = match typed_msg.safekeeper_timeline_info {
+                    Some(info) => info,
+                    None => {
+                        warn!("bad proto message from broker: no safekeeper_timeline_info");
+                        return;
+                    }
+                };
+                SafekeeperDiscoveryResponse {
+                    safekeeper_id: info.safekeeper_id,
+                    tenant_timeline_id: info.tenant_timeline_id,
+                    commit_lsn: info.commit_lsn,
+                    safekeeper_connstr: info.safekeeper_connstr,
+                    availability_zone: info.availability_zone,
+                }
+            }
+            MessageType::SafekeeperDiscoveryResponse => {
+                is_discovery = true;
+                match typed_msg.safekeeper_discovery_response {
+                    Some(response) => response,
+                    None => {
+                        warn!("bad proto message from broker: no safekeeper_discovery_response");
+                        return;
+                    }
+                }
+            }
+            other => {
+                warn!("unexpected message type from broker: {other:?}");
+                return;
+            }
+        };
+
        WALRECEIVER_BROKER_UPDATES.inc();

        let new_safekeeper_id = NodeId(timeline_update.safekeeper_id);
@@ -619,7 +735,11 @@ impl ConnectionManagerState {
        );

        if old_entry.is_none() {
-            info!("New SK node was added: {new_safekeeper_id}");
+            info!(
+                ?is_discovery,
+                %new_safekeeper_id,
+                "New SK node was added",
+            );
            WALRECEIVER_CANDIDATES_ADDED.inc();
        }
    }
@@ -818,7 +938,7 @@ impl ConnectionManagerState {
    fn select_connection_candidate(
        &self,
        node_to_omit: Option<NodeId>,
-    ) -> Option<(NodeId, &SafekeeperTimelineInfo, PgConnectionConfig)> {
+    ) -> Option<(NodeId, &SafekeeperDiscoveryResponse, PgConnectionConfig)> {
        self.applicable_connection_candidates()
            .filter(|&(sk_id, _, _)| Some(sk_id) != node_to_omit)
            .max_by_key(|(_, info, _)| info.commit_lsn)
@@ -828,7 +948,7 @@ impl ConnectionManagerState {
    /// Some safekeepers are filtered by the retry cooldown.
    fn applicable_connection_candidates(
        &self,
-    ) -> impl Iterator<Item = (NodeId, &SafekeeperTimelineInfo, PgConnectionConfig)> {
+    ) -> impl Iterator<Item = (NodeId, &SafekeeperDiscoveryResponse, PgConnectionConfig)> {
        let now = Utc::now().naive_utc();

        self.wal_stream_candidates
@@ -968,19 +1088,11 @@ mod tests {
        latest_update: NaiveDateTime,
    ) -> BrokerSkTimeline {
        BrokerSkTimeline {
-            timeline: SafekeeperTimelineInfo {
+            timeline: SafekeeperDiscoveryResponse {
                safekeeper_id: 0,
                tenant_timeline_id: None,
-                term: 0,
-                last_log_term: 0,
-                flush_lsn: 0,
                commit_lsn,
-                backup_lsn: 0,
-                remote_consistent_lsn: 0,
-                peer_horizon_lsn: 0,
-                local_start_lsn: 0,
                safekeeper_connstr: safekeeper_connstr.to_owned(),
-                http_connstr: safekeeper_connstr.to_owned(),
                availability_zone: None,
            },
            latest_update,
--- a/proxy/src/auth/backend/link.rs
+++ b/proxy/src/auth/backend/link.rs
@@ -79,19 +79,15 @@ pub(super) async fn authenticate(

    // Give user a URL to spawn a new database.
    info!(parent: &span, "sending the auth URL to the user");
-    let pause = ctx.latency_timer.pause(crate::metrics::Waiting::Client);
    client
        .write_message_noflush(&Be::AuthenticationOk)?
        .write_message_noflush(&Be::CLIENT_ENCODING)?
        .write_message(&Be::NoticeResponse(&greeting))
        .await?;
-    drop(pause);

    // Wait for web console response (see `mgmt`).
    info!(parent: &span, "waiting for console's reply...");
-    let pause = ctx.latency_timer.pause(crate::metrics::Waiting::Cplane);
    let db_info = waiter.await.map_err(LinkAuthError::from)?;
-    drop(pause);

    client.write_message_noflush(&Be::NoticeResponse("Connecting to database."))?;

--- a/proxy/src/metrics.rs
+++ b/proxy/src/metrics.rs
@@ -277,16 +277,15 @@ pub struct ComputeConnectionLatencyGroup {
    protocol: Protocol,
    cold_start_info: ColdStartInfo,
    outcome: ConnectOutcome,
-    component: LatencyComponents,
+    excluded: LatencyExclusions,
 }

 #[derive(FixedCardinalityLabel, Copy, Clone)]
-pub enum LatencyComponents {
+pub enum LatencyExclusions {
    Client,
-    Cplane,
-    Compute,
-    ComputeRetry,
-    Proxy,
+    ClientAndCplane,
+    ClientCplaneCompute,
+    ClientCplaneComputeRetry,
 }

 #[derive(FixedCardinalityLabel, Copy, Clone)]
@@ -446,52 +445,46 @@ impl Drop for LatencyTimer {

        let metric = &Metrics::get().proxy.compute_connection_latency_seconds;

-        // client only latency
+        // Excluding client communication from the accumulated time.
        metric.observe(
            ComputeConnectionLatencyGroup {
                protocol: self.protocol,
                cold_start_info: self.cold_start_info,
                outcome: self.outcome,
-                component: LatencyComponents::Client,
+                excluded: LatencyExclusions::Client,
            },
-            self.accumulated.client.as_secs_f64(),
+            duration
+                .saturating_sub(self.accumulated.client)
+                .as_secs_f64(),
        );

-        // cplane only latency
+        // Exclude client and cplane communication from the accumulated time.
+        let accumulated_total = self.accumulated.client + self.accumulated.cplane;
        metric.observe(
            ComputeConnectionLatencyGroup {
                protocol: self.protocol,
                cold_start_info: self.cold_start_info,
                outcome: self.outcome,
-                component: LatencyComponents::Cplane,
+                excluded: LatencyExclusions::ClientAndCplane,
            },
-            self.accumulated.cplane.as_secs_f64(),
+            duration.saturating_sub(accumulated_total).as_secs_f64(),
        );

-        // compute connect only latency
+        // Exclude client, cplane, and compute communication from the accumulated time.
+        let accumulated_total =
+            self.accumulated.client + self.accumulated.cplane + self.accumulated.compute;
        metric.observe(
            ComputeConnectionLatencyGroup {
                protocol: self.protocol,
                cold_start_info: self.cold_start_info,
                outcome: self.outcome,
-                component: LatencyComponents::Compute,
+                excluded: LatencyExclusions::ClientCplaneCompute,
            },
-            self.accumulated.compute.as_secs_f64(),
+            duration.saturating_sub(accumulated_total).as_secs_f64(),
        );

-        // compute failure retry latency
-        metric.observe(
-            ComputeConnectionLatencyGroup {
-                protocol: self.protocol,
-                cold_start_info: self.cold_start_info,
-                outcome: self.outcome,
-                component: LatencyComponents::ComputeRetry,
-            },
-            self.accumulated.retry.as_secs_f64(),
-        );
-
-        // proxy only latency, removing client+cplane+compute+retry from the total
-        let accumulated = self.accumulated.client
+        // Exclude client, cplane, compute, and retry communication from the accumulated time.
+        let accumulated_total = self.accumulated.client
            + self.accumulated.cplane
            + self.accumulated.compute
            + self.accumulated.retry;
@@ -500,9 +493,9 @@ impl Drop for LatencyTimer {
                protocol: self.protocol,
                cold_start_info: self.cold_start_info,
                outcome: self.outcome,
-                component: LatencyComponents::Proxy,
+                excluded: LatencyExclusions::ClientCplaneComputeRetry,
            },
-            duration.saturating_sub(accumulated).as_secs_f64(),
+            duration.saturating_sub(accumulated_total).as_secs_f64(),
        );
    }
 }
--- a/s3_scrubber/Cargo.toml
+++ b/s3_scrubber/Cargo.toml
@@ -22,7 +22,12 @@ serde_with.workspace = true
 workspace_hack.workspace = true
 utils.workspace = true
 async-stream.workspace = true
+native-tls.workspace = true
+postgres-native-tls.workspace = true
+postgres_ffi.workspace = true
 tokio-stream.workspace = true
+tokio-postgres.workspace = true
+tokio-util = { workspace = true }
 futures-util.workspace = true
 itertools.workspace = true
 camino.workspace = true
--- a/s3_scrubber/README.md
+++ b/s3_scrubber/README.md
@@ -67,10 +67,12 @@ the purge command will log all the keys that it would have deleted.

 #### `scan-metadata`

-Walk objects in a pageserver S3 bucket, and report statistics on the contents.
+Walk objects in a pageserver or safekeeper S3 bucket, report statistics on the contents, and check consistency.
+Errors are logged to stderr and the summary is printed to stdout.

+For pageserver:
 ```
-env SSO_ACCOUNT_ID=123456 REGION=eu-west-1 BUCKET=my-dev-bucket CLOUD_ADMIN_API_TOKEN=${NEON_CLOUD_ADMIN_API_STAGING_KEY} CLOUD_ADMIN_API_URL=[url] cargo run --release -- scan-metadata
+env SSO_ACCOUNT_ID=123456 REGION=eu-west-1 BUCKET=my-dev-bucket CLOUD_ADMIN_API_TOKEN=${NEON_CLOUD_ADMIN_API_STAGING_KEY} CLOUD_ADMIN_API_URL=[url] cargo run --release -- scan-metadata --node-kind pageserver

 Timelines: 31106
 With errors: 3
@@ -82,6 +84,10 @@ Layer size bytes: min 24576, 1% 36879, 10% 36879, 50% 61471, 90% 44695551, 99% 2
 Timeline layer count: min 1, 1% 3, 10% 6, 50% 16, 90% 25, 99% 39, max 1053
 ```

+For safekeepers, dump_db_connstr and dump_db_table must be
+specified; they should point to a table containing a debug dump, which is used
+to list timelines and find their backup and start LSNs.
+
 ## Cleaning up running pageservers

 If S3 state is altered first manually, pageserver in-memory state will contain wrong data about S3 state, and tenants/timelines may get recreated on S3 (due to any layer upload due to compaction, pageserver restart, etc.). So before proceeding, for tenants/timelines which are already deleted in the console, we must remove these from pageservers.
--- a/s3_scrubber/src/cloud_admin_api.rs
+++ b/s3_scrubber/src/cloud_admin_api.rs
@@ -1,11 +1,13 @@
-use std::time::Duration;
-
 use chrono::{DateTime, Utc};
+use futures::Future;
 use hex::FromHex;
+
 use reqwest::{header, Client, StatusCode, Url};
 use serde::Deserialize;
 use tokio::sync::Semaphore;

+use tokio_util::sync::CancellationToken;
+use utils::backoff;
 use utils::id::{TenantId, TimelineId};
 use utils::lsn::Lsn;

@@ -137,7 +139,7 @@ pub struct ProjectData {
    pub region_id: String,
    pub platform_id: String,
    pub user_id: String,
-    pub pageserver_id: u64,
+    pub pageserver_id: Option<u64>,
    #[serde(deserialize_with = "from_nullable_id")]
    pub tenant: TenantId,
    pub safekeepers: Vec<SafekeeperData>,
@@ -155,7 +157,7 @@ pub struct ProjectData {
    pub maintenance_set: Option<String>,
 }

-#[derive(Debug, serde::Deserialize)]
+#[derive(Debug, Clone, serde::Deserialize)]
 pub struct BranchData {
    pub id: BranchId,
    pub created_at: DateTime<Utc>,
@@ -210,30 +212,39 @@ impl CloudAdminApiClient {
            .await
            .expect("Semaphore is not closed");

-        let response = self
-            .http_client
-            .get(self.append_url("/projects"))
-            .query(&[
-                ("tenant_id", tenant_id.to_string()),
-                ("show_deleted", "true".to_string()),
-            ])
-            .header(header::ACCEPT, "application/json")
-            .bearer_auth(&self.token)
-            .send()
-            .await
-            .map_err(|e| {
-                Error::new(
-                    "Find project for tenant".to_string(),
-                    ErrorKind::RequestSend(e),
-                )
-            })?;
+        let response = CloudAdminApiClient::with_retries(
+            || async {
+                let response = self
+                    .http_client
+                    .get(self.append_url("/projects"))
+                    .query(&[
+                        ("tenant_id", tenant_id.to_string()),
+                        ("show_deleted", "true".to_string()),
+                    ])
+                    .header(header::ACCEPT, "application/json")
+                    .bearer_auth(&self.token)
+                    .send()
+                    .await
+                    .map_err(|e| {
+                        Error::new(
+                            "Find project for tenant".to_string(),
+                            ErrorKind::RequestSend(e),
+                        )
+                    })?;
+
+                let response: AdminApiResponse<Vec<ProjectData>> =
+                    response.json().await.map_err(|e| {
+                        Error::new(
+                            "Find project for tenant".to_string(),
+                            ErrorKind::BodyRead(e),
+                        )
+                    })?;
+                Ok(response)
+            },
+            "find_tenant_project",
+        )
+        .await?;

-        let response: AdminApiResponse<Vec<ProjectData>> = response.json().await.map_err(|e| {
-            Error::new(
-                "Find project for tenant".to_string(),
-                ErrorKind::BodyRead(e),
-            )
-        })?;
        match response.data.len() {
            0 => Ok(None),
            1 => Ok(Some(
@@ -261,42 +272,34 @@ impl CloudAdminApiClient {
        const PAGINATION_LIMIT: usize = 512;
        let mut result: Vec<ProjectData> = Vec::with_capacity(PAGINATION_LIMIT);
        loop {
-            let response = self
-                .http_client
-                .get(self.append_url("/projects"))
-                .query(&[
-                    ("show_deleted", "false".to_string()),
-                    ("limit", format!("{PAGINATION_LIMIT}")),
-                    ("offset", format!("{pagination_offset}")),
-                ])
-                .header(header::ACCEPT, "application/json")
-                .bearer_auth(&self.token)
-                .send()
-                .await
-                .map_err(|e| {
-                    Error::new(
-                        "List active projects".to_string(),
-                        ErrorKind::RequestSend(e),
-                    )
-                })?;
+            let response_bytes = CloudAdminApiClient::with_retries(
+                || async {
+                    let response = self
+                        .http_client
+                        .get(self.append_url("/projects"))
+                        .query(&[
+                            ("show_deleted", "false".to_string()),
+                            ("limit", format!("{PAGINATION_LIMIT}")),
+                            ("offset", format!("{pagination_offset}")),
+                        ])
+                        .header(header::ACCEPT, "application/json")
+                        .bearer_auth(&self.token)
+                        .send()
+                        .await
+                        .map_err(|e| {
+                            Error::new(
+                                "List active projects".to_string(),
+                                ErrorKind::RequestSend(e),
+                            )
+                        })?;

-            match response.status() {
-                StatusCode::OK => {}
-                StatusCode::SERVICE_UNAVAILABLE | StatusCode::TOO_MANY_REQUESTS => {
-                    tokio::time::sleep(Duration::from_millis(500)).await;
-                    continue;
-                }
-                _status => {
-                    return Err(Error::new(
-                        "List active projects".to_string(),
-                        ErrorKind::ResponseStatus(response.status()),
-                    ))
-                }
-            }
-
-            let response_bytes = response.bytes().await.map_err(|e| {
-                Error::new("List active projects".to_string(), ErrorKind::BodyRead(e))
-            })?;
+                    response.bytes().await.map_err(|e| {
+                        Error::new("List active projects".to_string(), ErrorKind::BodyRead(e))
+                    })
+                },
+                "list_projects",
+            )
+            .await?;

            let decode_result =
                serde_json::from_slice::<AdminApiResponse<Vec<ProjectData>>>(&response_bytes);
@@ -327,6 +330,7 @@ impl CloudAdminApiClient {

    pub async fn find_timeline_branch(
        &self,
+        tenant_id: TenantId,
        timeline_id: TimelineId,
    ) -> Result<Option<BranchData>, Error> {
        let _permit = self
@@ -335,43 +339,61 @@ impl CloudAdminApiClient {
            .await
            .expect("Semaphore is not closed");

-        let response = self
-            .http_client
-            .get(self.append_url("/branches"))
-            .query(&[
-                ("timeline_id", timeline_id.to_string()),
-                ("show_deleted", "true".to_string()),
-            ])
-            .header(header::ACCEPT, "application/json")
-            .bearer_auth(&self.token)
-            .send()
-            .await
-            .map_err(|e| {
-                Error::new(
-                    "Find branch for timeline".to_string(),
-                    ErrorKind::RequestSend(e),
-                )
-            })?;
+        let response = CloudAdminApiClient::with_retries(
+            || async {
+                let response = self
+                    .http_client
+                    .get(self.append_url("/branches"))
+                    .query(&[
+                        ("timeline_id", timeline_id.to_string()),
+                        ("show_deleted", "true".to_string()),
+                    ])
+                    .header(header::ACCEPT, "application/json")
+                    .bearer_auth(&self.token)
+                    .send()
+                    .await
+                    .map_err(|e| {
+                        Error::new(
+                            "Find branch for timeline".to_string(),
+                            ErrorKind::RequestSend(e),
+                        )
+                    })?;

-        let response: AdminApiResponse<Vec<BranchData>> = response.json().await.map_err(|e| {
-            Error::new(
-                "Find branch for timeline".to_string(),
-                ErrorKind::BodyRead(e),
-            )
-        })?;
-        match response.data.len() {
-            0 => Ok(None),
-            1 => Ok(Some(
-                response
-                    .data
-                    .into_iter()
-                    .next()
-                    .expect("Should have exactly one element"),
-            )),
-            too_many => Err(Error::new(
-                format!("Find branch for timeline returned {too_many} branches instead of 0 or 1"),
+                let response: AdminApiResponse<Vec<BranchData>> =
+                    response.json().await.map_err(|e| {
+                        Error::new(
+                            "Find branch for timeline".to_string(),
+                            ErrorKind::BodyRead(e),
+                        )
+                    })?;
+                Ok(response)
+            },
+            "find_timeline_branch",
+        )
+        .await?;
+
+        let mut branches: Vec<BranchData> = response.data.into_iter().collect();
+        // Normally timeline_id is unique. However, we do have at least one case
+        // of the same timeline_id in two different projects, apparently after
+        // manual recovery. So always recheck project_id (discovered through
+        // tenant_id).
+        let project_data = match self.find_tenant_project(tenant_id).await? {
+            Some(pd) => pd,
+            None => return Ok(None),
+        };
+        branches.retain(|b| b.project_id == project_data.id);
+        if branches.len() < 2 {
+            Ok(branches.first().cloned())
+        } else {
+            Err(Error::new(
+                format!(
+                    "Find branch for timeline {}/{} returned {} branches instead of 0 or 1",
+                    tenant_id,
+                    timeline_id,
+                    branches.len()
+                ),
                ErrorKind::UnexpectedState,
-            )),
+            ))
        }
    }

@@ -532,4 +554,15 @@ impl CloudAdminApiClient {
            .parse()
            .unwrap_or_else(|e| panic!("Could not append {subpath} to base url: {e}"))
    }
+
+    async fn with_retries<T, O, F>(op: O, description: &str) -> Result<T, Error>
+    where
+        O: FnMut() -> F,
+        F: Future<Output = Result<T, Error>>,
+    {
+        let cancel = CancellationToken::new(); // not really used
+        backoff::retry(op, |_| false, 1, 20, description, &cancel)
+            .await
+            .expect("cancellations are disabled")
+    }
 }
--- a/s3_scrubber/src/garbage.rs
+++ b/s3_scrubber/src/garbage.rs
@@ -60,6 +60,7 @@ pub struct GarbageList {
    /// see garbage, we saw some active tenants too.  This protects against classes of bugs
    /// in the scrubber that might otherwise generate a "deleted all" result.
    active_tenant_count: usize,
+    active_timeline_count: usize,
 }

 impl GarbageList {
@@ -67,6 +68,7 @@ impl GarbageList {
        Self {
            items: Vec::new(),
            active_tenant_count: 0,
+            active_timeline_count: 0,
            node_kind,
            bucket_config,
        }
@@ -119,7 +121,10 @@ pub async fn find_garbage(
 const S3_CONCURRENCY: usize = 32;

 // How many concurrent API requests to make to the console API.
-const CONSOLE_CONCURRENCY: usize = 128;
+//
+// Be careful increasing this; roughly we shouldn't have more than ~100 rps. It
+// would be better to implement a real rps limiter.
+const CONSOLE_CONCURRENCY: usize = 16;

 struct ConsoleCache {
    /// Set of tenants found in the control plane API
@@ -221,6 +226,7 @@ async fn find_garbage_inner(
        } else {
            tracing::debug!("Tenant {tenant_shard_id} is active");
            active_tenants.push(tenant_shard_id);
+            garbage.active_tenant_count = active_tenants.len();
        }

        counter += 1;
@@ -261,7 +267,7 @@ async fn find_garbage_inner(
        let api_client = cloud_admin_api_client.clone();
        async move {
            api_client
-                .find_timeline_branch(ttid.timeline_id)
+                .find_timeline_branch(ttid.tenant_shard_id.tenant_id, ttid.timeline_id)
                .await
                .map_err(|e| anyhow::anyhow!(e))
                .map(|r| (ttid, r))
@@ -271,15 +277,29 @@ async fn find_garbage_inner(
        std::pin::pin!(timelines_checked.try_buffer_unordered(CONSOLE_CONCURRENCY));

    // Update the GarbageList with any timelines which appear not to exist.
+    let mut active_timelines: Vec<TenantShardTimelineId> = vec![];
    while let Some(result) = timelines_checked.next().await {
        let (ttid, console_result) = result?;
        if garbage.maybe_append(GarbageEntity::Timeline(ttid), console_result) {
            tracing::debug!("Timeline {ttid} is garbage");
        } else {
            tracing::debug!("Timeline {ttid} is active");
+            active_timelines.push(ttid);
+            garbage.active_timeline_count = active_timelines.len();
        }
    }

+    let num_garbage_timelines = garbage
+        .items
+        .iter()
+        .filter(|g| matches!(g.entity, GarbageEntity::Timeline(_)))
+        .count();
+    tracing::info!(
+        "Found {}/{} garbage timelines in active tenants",
+        num_garbage_timelines,
+        active_timelines.len(),
+    );
+
    Ok(garbage)
 }

@@ -344,16 +364,22 @@ pub async fn get_timeline_objects(
 const MAX_KEYS_PER_DELETE: usize = 1000;

 /// Drain a buffer of keys into DeleteObjects requests
+///
+/// If `drain` is true, drains keys completely; otherwise stops when <
+/// MAX_KEYS_PER_DELETE keys are left.
+/// `progress_tracker` accumulates the number of deleted keys.
 async fn do_delete(
    s3_client: &Arc<Client>,
    bucket_name: &str,
    keys: &mut Vec<ObjectIdentifier>,
    dry_run: bool,
    drain: bool,
+    progress_tracker: &mut DeletionProgressTracker,
 ) -> anyhow::Result<()> {
    while (!keys.is_empty() && drain) || (keys.len() >= MAX_KEYS_PER_DELETE) {
        let request_keys =
            keys.split_off(keys.len() - (std::cmp::min(MAX_KEYS_PER_DELETE, keys.len())));
+        let num_deleted = request_keys.len();
        if dry_run {
            tracing::info!("Dry-run deletion of objects: ");
            for k in request_keys {
@@ -368,12 +394,30 @@ async fn do_delete(
                .send()
                .await
                .context("DeleteObjects request")?;
+            progress_tracker.register(num_deleted);
        }
    }

    Ok(())
 }

+/// Simple tracker that logs progress each time more than 10k further keys have been deleted.
+#[derive(Default)]
+struct DeletionProgressTracker {
+    num_deleted: usize,
+    last_reported_num_deleted: usize,
+}
+
+impl DeletionProgressTracker {
+    fn register(&mut self, n: usize) {
+        self.num_deleted += n;
+        if self.num_deleted - self.last_reported_num_deleted > 10000 {
+            tracing::info!("progress: deleted {} keys", self.num_deleted);
+            self.last_reported_num_deleted = self.num_deleted;
+        }
+    }
+}
+
 pub async fn purge_garbage(
    input_path: String,
    mode: PurgeMode,
@@ -394,6 +438,14 @@ pub async fn purge_garbage(
    if garbage_list.active_tenant_count == 0 {
        anyhow::bail!("Refusing to purge a garbage list that reports 0 active tenants");
    }
+    if garbage_list
+        .items
+        .iter()
+        .any(|g| matches!(g.entity, GarbageEntity::Timeline(_)))
+        && garbage_list.active_timeline_count == 0
+    {
+        anyhow::bail!("Refusing to purge a garbage list containing garbage timelines that reports 0 active timelines");
+    }

    let filtered_items = garbage_list
        .items
@@ -429,6 +481,7 @@ pub async fn purge_garbage(
        std::pin::pin!(get_objects_results.try_buffer_unordered(S3_CONCURRENCY));

    let mut objects_to_delete = Vec::new();
+    let mut progress_tracker = DeletionProgressTracker::default();
    while let Some(result) = get_objects_results.next().await {
        let mut object_list = result?;
        objects_to_delete.append(&mut object_list);
@@ -439,6 +492,7 @@ pub async fn purge_garbage(
                &mut objects_to_delete,
                dry_run,
                false,
+                &mut progress_tracker,
            )
            .await?;
        }
@@ -450,10 +504,11 @@ pub async fn purge_garbage(
        &mut objects_to_delete,
        dry_run,
        true,
+        &mut progress_tracker,
    )
    .await?;

-    tracing::info!("Fell through");
+    tracing::info!("{} keys deleted in total", progress_tracker.num_deleted);

    Ok(())
 }
--- a/s3_scrubber/src/lib.rs
+++ b/s3_scrubber/src/lib.rs
@@ -4,7 +4,8 @@ pub mod checks;
 pub mod cloud_admin_api;
 pub mod garbage;
 pub mod metadata_stream;
-pub mod scan_metadata;
+pub mod scan_pageserver_metadata;
+pub mod scan_safekeeper_metadata;
 pub mod tenant_snapshot;

 use std::env;
@@ -141,12 +142,17 @@ impl RootTarget {
    pub fn tenants_root(&self) -> S3Target {
        match self {
            Self::Pageserver(root) => root.with_sub_segment(TENANTS_SEGMENT_NAME),
-            Self::Safekeeper(root) => root.with_sub_segment("wal"),
+            Self::Safekeeper(root) => root.clone(),
        }
    }

    pub fn tenant_root(&self, tenant_id: &TenantShardId) -> S3Target {
-        self.tenants_root().with_sub_segment(&tenant_id.to_string())
+        match self {
+            Self::Pageserver(_) => self.tenants_root().with_sub_segment(&tenant_id.to_string()),
+            Self::Safekeeper(_) => self
+                .tenants_root()
+                .with_sub_segment(&tenant_id.tenant_id.to_string()),
+        }
    }

    pub(crate) fn tenant_shards_prefix(&self, tenant_id: &TenantId) -> S3Target {
@@ -337,9 +343,7 @@ fn init_remote(
        }),
        NodeKind::Safekeeper => RootTarget::Safekeeper(S3Target {
            bucket_name: bucket_config.bucket,
-            prefix_in_bucket: bucket_config
-                .prefix_in_bucket
-                .unwrap_or("safekeeper/v1".to_string()),
+            prefix_in_bucket: bucket_config.prefix_in_bucket.unwrap_or("wal/".to_string()),
            delimiter,
        }),
    };
@@ -364,7 +368,10 @@ async fn list_objects_with_retries(
        {
            Ok(response) => return Ok(response),
            Err(e) => {
-                error!("list_objects_v2 query failed: {e}");
+                error!(
+                    "list_objects_v2 query failed: {e}, bucket_name={}, prefix={}, delimiter={}",
+                    s3_target.bucket_name, s3_target.prefix_in_bucket, s3_target.delimiter
+                );
                tokio::time::sleep(Duration::from_secs(1)).await;
            }
        }
--- a/s3_scrubber/src/main.rs
+++ b/s3_scrubber/src/main.rs
@@ -1,9 +1,13 @@
+use anyhow::bail;
 use camino::Utf8PathBuf;
 use pageserver_api::shard::TenantShardId;
 use s3_scrubber::garbage::{find_garbage, purge_garbage, PurgeMode};
-use s3_scrubber::scan_metadata::scan_metadata;
+use s3_scrubber::scan_pageserver_metadata::scan_metadata;
 use s3_scrubber::tenant_snapshot::SnapshotDownloader;
-use s3_scrubber::{init_logging, BucketConfig, ConsoleConfig, NodeKind, TraversingDepth};
+use s3_scrubber::{
+    init_logging, scan_safekeeper_metadata::scan_safekeeper_metadata, BucketConfig, ConsoleConfig,
+    NodeKind, TraversingDepth,
+};

 use clap::{Parser, Subcommand};
 use utils::id::TenantId;
@@ -35,11 +39,20 @@ enum Command {
        #[arg(short, long, default_value_t = PurgeMode::DeletedOnly)]
        mode: PurgeMode,
    },
+    #[command(verbatim_doc_comment)]
    ScanMetadata {
+        #[arg(short, long)]
+        node_kind: NodeKind,
        #[arg(short, long, default_value_t = false)]
        json: bool,
        #[arg(long = "tenant-id", num_args = 0..)]
        tenant_ids: Vec<TenantShardId>,
+        #[arg(long, default_value = None)]
+        /// For safekeeper node_kind only, points to db with debug dump
+        dump_db_connstr: Option<String>,
+        /// For safekeeper node_kind only, table in the db with debug dump
+        #[arg(long, default_value = None)]
+        dump_db_table: Option<String>,
    },
    TenantSnapshot {
        #[arg(long = "tenant-id")]
@@ -72,33 +85,75 @@ async fn main() -> anyhow::Result<()> {
    ));

    match cli.command {
-        Command::ScanMetadata { json, tenant_ids } => {
-            match scan_metadata(bucket_config.clone(), tenant_ids).await {
-                Err(e) => {
-                    tracing::error!("Failed: {e}");
-                    Err(e)
+        Command::ScanMetadata {
+            json,
+            tenant_ids,
+            node_kind,
+            dump_db_connstr,
+            dump_db_table,
+        } => {
+            if let NodeKind::Safekeeper = node_kind {
+                let dump_db_connstr =
+                    dump_db_connstr.ok_or(anyhow::anyhow!("dump_db_connstr not specified"))?;
+                let dump_db_table =
+                    dump_db_table.ok_or(anyhow::anyhow!("dump_db_table not specified"))?;
+
+                let summary = scan_safekeeper_metadata(
+                    bucket_config.clone(),
+                    tenant_ids.iter().map(|tshid| tshid.tenant_id).collect(),
+                    dump_db_connstr,
+                    dump_db_table,
+                )
+                .await?;
+                if json {
+                    println!("{}", serde_json::to_string(&summary).unwrap())
+                } else {
+                    println!("{}", summary.summary_string());
                }
-                Ok(summary) => {
-                    if json {
-                        println!("{}", serde_json::to_string(&summary).unwrap())
-                    } else {
-                        println!("{}", summary.summary_string());
+                if summary.is_fatal() {
+                    bail!("Fatal scrub errors detected");
+                }
+                if summary.is_empty() {
+                    // Strictly speaking an empty bucket is a valid bucket, but if someone ran the
+                    // scrubber they were likely expecting to scan something, and if we see no timelines
+                    // at all then it's likely due to some configuration issues like a bad prefix
+                    bail!(
+                        "No timelines found in bucket {} prefix {}",
+                        bucket_config.bucket,
+                        bucket_config
+                            .prefix_in_bucket
+                            .unwrap_or("<none>".to_string())
+                    );
+                }
+                Ok(())
+            } else {
+                match scan_metadata(bucket_config.clone(), tenant_ids).await {
+                    Err(e) => {
+                        tracing::error!("Failed: {e}");
+                        Err(e)
                    }
-                    if summary.is_fatal() {
-                        Err(anyhow::anyhow!("Fatal scrub errors detected"))
-                    } else if summary.is_empty() {
-                        // Strictly speaking an empty bucket is a valid bucket, but if someone ran the
-                        // scrubber they were likely expecting to scan something, and if we see no timelines
-                        // at all then it's likely due to some configuration issues like a bad prefix
-                        Err(anyhow::anyhow!(
-                            "No timelines found in bucket {} prefix {}",
-                            bucket_config.bucket,
-                            bucket_config
-                                .prefix_in_bucket
-                                .unwrap_or("<none>".to_string())
-                        ))
-                    } else {
-                        Ok(())
+                    Ok(summary) => {
+                        if json {
+                            println!("{}", serde_json::to_string(&summary).unwrap())
+                        } else {
+                            println!("{}", summary.summary_string());
+                        }
+                        if summary.is_fatal() {
+                            Err(anyhow::anyhow!("Fatal scrub errors detected"))
+                        } else if summary.is_empty() {
+                            // Strictly speaking an empty bucket is a valid bucket, but if someone ran the
+                            // scrubber they were likely expecting to scan something, and if we see no timelines
+                            // at all then it's likely due to some configuration issues like a bad prefix
+                            Err(anyhow::anyhow!(
+                                "No timelines found in bucket {} prefix {}",
+                                bucket_config.bucket,
+                                bucket_config
+                                    .prefix_in_bucket
+                                    .unwrap_or("<none>".to_string())
+                            ))
+                        } else {
+                            Ok(())
+                        }
                    }
                }
            }
--- a/s3_scrubber/src/metadata_stream.rs
+++ b/s3_scrubber/src/metadata_stream.rs
@@ -114,7 +114,7 @@ pub async fn stream_tenant_timelines<'a>(
    let timelines_target = target.timelines_root(&tenant);

    loop {
-        tracing::info!("Listing in {}", tenant);
+        tracing::debug!("Listing in {}", tenant);
        let fetch_response =
            list_objects_with_retries(s3_client, &timelines_target, continuation_token.clone())
                .await;
@@ -151,7 +151,7 @@ pub async fn stream_tenant_timelines<'a>(
        }
    }

-    tracing::info!("Yielding for {}", tenant);
+    tracing::debug!("Yielding for {}", tenant);
    Ok(stream! {
        for i in timeline_ids {
            let id = i?;
--- a/s3_scrubber/src/scan_pageserver_metadata.rs
+++ b/s3_scrubber/src/scan_pageserver_metadata.rs
--- a/s3_scrubber/src/scan_safekeeper_metadata.rs
+++ b/s3_scrubber/src/scan_safekeeper_metadata.rs
@@ -0,0 +1,236 @@
+use std::{collections::HashSet, str::FromStr};
+
+use aws_sdk_s3::Client;
+use futures::stream::{StreamExt, TryStreamExt};
+use pageserver_api::shard::TenantShardId;
+use postgres_ffi::{XLogFileName, PG_TLI};
+use serde::Serialize;
+use tokio_postgres::types::PgLsn;
+use tracing::{error, info, trace};
+use utils::{
+    id::{TenantId, TenantTimelineId, TimelineId},
+    lsn::Lsn,
+};
+
+use crate::{
+    cloud_admin_api::CloudAdminApiClient, init_remote, metadata_stream::stream_listing,
+    BucketConfig, ConsoleConfig, NodeKind, RootTarget, TenantShardTimelineId,
+};
+
+/// Ideally we should ask the safekeepers, but so far the default of 16MB is used everywhere.
+const WAL_SEGSIZE: usize = 16 * 1024 * 1024;
+
+#[derive(Serialize)]
+pub struct MetadataSummary {
+    timeline_count: usize,
+    with_errors: HashSet<TenantTimelineId>,
+    deleted_count: usize,
+}
+
+impl MetadataSummary {
+    fn new() -> Self {
+        Self {
+            timeline_count: 0,
+            with_errors: HashSet::new(),
+            deleted_count: 0,
+        }
+    }
+
+    pub fn summary_string(&self) -> String {
+        format!(
+            "timeline_count: {}, with_errors: {}",
+            self.timeline_count,
+            self.with_errors.len()
+        )
+    }
+
+    pub fn is_empty(&self) -> bool {
+        self.timeline_count == 0
+    }
+
+    pub fn is_fatal(&self) -> bool {
+        !self.with_errors.is_empty()
+    }
+}
+
+/// Scan the safekeeper metadata in an S3 bucket, reporting errors and
+/// statistics.
+///
+/// Timelines are read, together with their timeline_start_lsn and backup_lsn,
+/// from the debug dump stored in `dump_db_table`; each timeline's S3 contents
+/// are then verified. If WAL segments are missing, the control plane is queried
+/// first to check whether the timeline was deleted in the meantime.
+pub async fn scan_safekeeper_metadata(
+    bucket_config: BucketConfig,
+    tenant_ids: Vec<TenantId>,
+    dump_db_connstr: String,
+    dump_db_table: String,
+) -> anyhow::Result<MetadataSummary> {
+    info!(
+        "checking bucket {}, region {}, dump_db_table {}",
+        bucket_config.bucket, bucket_config.region, dump_db_table
+    );
+    // Use the native TLS implementation (Neon requires TLS)
+    let tls_connector =
+        postgres_native_tls::MakeTlsConnector::new(native_tls::TlsConnector::new()?);
+    let (client, connection) = tokio_postgres::connect(&dump_db_connstr, tls_connector).await?;
+    // The connection object performs the actual communication with the database,
+    // so spawn it off to run on its own.
+    tokio::spawn(async move {
+        if let Err(e) = connection.await {
+            error!("connection error: {}", e);
+        }
+    });
+
+    let tenant_filter_clause = if !tenant_ids.is_empty() {
+        format!(
+            "and tenant_id in ({})",
+            tenant_ids
+                .iter()
+                .map(|t| format!("'{}'", t))
+                .collect::<Vec<_>>()
+                .join(", ")
+        )
+    } else {
+        "".to_owned()
+    };
+    let query = format!(
+        "select tenant_id, timeline_id, min(timeline_start_lsn), max(backup_lsn) from \"{}\" where not is_cancelled {} group by tenant_id, timeline_id;",
+        dump_db_table, tenant_filter_clause,
+    );
+    info!("query is {}", query);
+    let timelines = client.query(&query, &[]).await?;
+    info!("loaded {} timelines", timelines.len());
+
+    let (s3_client, target) = init_remote(bucket_config, NodeKind::Safekeeper)?;
+    let console_config = ConsoleConfig::from_env()?;
+    let cloud_admin_api_client = CloudAdminApiClient::new(console_config);
+
+    let checks = futures::stream::iter(timelines.iter().map(Ok)).map_ok(|row| {
+        let tenant_id = TenantId::from_str(row.get(0)).expect("failed to parse tenant_id");
+        let timeline_id = TimelineId::from_str(row.get(1)).expect("failed to parse timeline_id");
+        let timeline_start_lsn_pg: PgLsn = row.get(2);
+        let timeline_start_lsn: Lsn = Lsn(u64::from(timeline_start_lsn_pg));
+        let backup_lsn_pg: PgLsn = row.get(3);
+        let backup_lsn: Lsn = Lsn(u64::from(backup_lsn_pg));
+        let ttid = TenantTimelineId::new(tenant_id, timeline_id);
+        check_timeline(
+            &s3_client,
+            &target,
+            &cloud_admin_api_client,
+            ttid,
+            timeline_start_lsn,
+            backup_lsn,
+        )
+    });
+    // Run multiple check_timeline's concurrently.
+    const CONCURRENCY: usize = 32;
+    let mut timelines = checks.try_buffered(CONCURRENCY);
+
+    let mut summary = MetadataSummary::new();
+    while let Some(r) = timelines.next().await {
+        let res = r?;
+        summary.timeline_count += 1;
+        if !res.is_ok {
+            summary.with_errors.insert(res.ttid);
+        }
+        if res.is_deleted {
+            summary.deleted_count += 1;
+        }
+    }
+
+    Ok(summary)
+}
+
+struct TimelineCheckResult {
+    ttid: TenantTimelineId,
+    is_ok: bool,
+    is_deleted: bool, // timeline is deleted in cplane
+}
+
+/// List s3 and check that it has all expected WAL for the ttid. Returns Err if
+/// the check itself failed; otherwise `is_ok` is false (and an error is logged)
+/// only when segments are missing and the timeline is not deleted in cplane.
+async fn check_timeline(
+    s3_client: &Client,
+    root: &RootTarget,
+    api_client: &CloudAdminApiClient,
+    ttid: TenantTimelineId,
+    timeline_start_lsn: Lsn,
+    backup_lsn: Lsn,
+) -> anyhow::Result<TimelineCheckResult> {
+    trace!(
+        "checking ttid {}, should contain WAL [{}-{}]",
+        ttid,
+        timeline_start_lsn,
+        backup_lsn
+    );
+    // expected segfiles: half-open range, the segment containing backup_lsn is not included
+    let expected_first_segno = timeline_start_lsn.segment_number(WAL_SEGSIZE);
+    let expected_last_segno = backup_lsn.segment_number(WAL_SEGSIZE);
+    let mut expected_segfiles: HashSet<String> = HashSet::from_iter(
+        (expected_first_segno..expected_last_segno)
+            .map(|segno| XLogFileName(PG_TLI, segno, WAL_SEGSIZE)),
+    );
+    let expected_files_num = expected_segfiles.len();
+    trace!("expecting {} files", expected_segfiles.len(),);
+
+    // now list s3, removing every listed segment from the expected set
+    let ttshid =
+        TenantShardTimelineId::new(TenantShardId::unsharded(ttid.tenant_id), ttid.timeline_id);
+    let mut timeline_dir_target = root.timeline_root(&ttshid);
+    // stream_listing yields only common_prefixes if delimiter is not empty, but
+    // we need files, so unset it.
+    timeline_dir_target.delimiter = String::new();
+
+    let mut stream = std::pin::pin!(stream_listing(s3_client, &timeline_dir_target));
+    while let Some(obj) = stream.next().await {
+        let obj = obj?;
+        let key = obj.key();
+
+        let seg_name = key
+            .strip_prefix(&timeline_dir_target.prefix_in_bucket)
+            .expect("failed to extract segment name");
+        expected_segfiles.remove(seg_name);
+    }
+    if !expected_segfiles.is_empty() {
+        // Before complaining check cplane: the timeline may already be deleted.
+        let bdata = api_client
+            .find_timeline_branch(ttid.tenant_id, ttid.timeline_id)
+            .await?;
+        let deleted = match bdata {
+            Some(bdata) => bdata.deleted,
+            None => {
+                // note: should be careful with selecting proper cplane address
+                info!("ttid {} not found, assuming it is deleted", ttid);
+                true
+            }
+        };
+        if deleted {
+            // ok, branch is deleted
+            return Ok(TimelineCheckResult {
+                ttid,
+                is_ok: true,
+                is_deleted: true,
+            });
+        }
+        error!(
+            "ttid {}: missing {} files out of {}, timeline_start_lsn {}, wal_backup_lsn {}",
+            ttid,
+            expected_segfiles.len(),
+            expected_files_num,
+            timeline_start_lsn,
+            backup_lsn,
+        );
+        return Ok(TimelineCheckResult {
+            ttid,
+            is_ok: false,
+            is_deleted: false,
+        });
+    }
+    Ok(TimelineCheckResult {
+        ttid,
+        is_ok: true,
+        is_deleted: false,
+    })
+}
--- a/safekeeper/src/bin/safekeeper.rs
+++ b/safekeeper/src/bin/safekeeper.rs
@@ -177,6 +177,10 @@ struct Args {
    /// Controls how long backup will wait until uploading the partial segment.
    #[arg(long, value_parser = humantime::parse_duration, default_value = DEFAULT_PARTIAL_BACKUP_TIMEOUT, verbatim_doc_comment)]
    partial_backup_timeout: Duration,
+    /// Disable task to push messages to broker every second. Supposed to
+    /// be used in tests.
+    #[arg(long)]
+    disable_periodic_broker_push: bool,
 }

 // Like PathBufValueParser, but allows empty string.
@@ -309,6 +313,7 @@ async fn main() -> anyhow::Result<()> {
        walsenders_keep_horizon: args.walsenders_keep_horizon,
        partial_backup_enabled: args.partial_backup_enabled,
        partial_backup_timeout: args.partial_backup_timeout,
+        disable_periodic_broker_push: args.disable_periodic_broker_push,
    };

    // initialize sentry if SENTRY_DSN is provided
--- a/safekeeper/src/broker.rs
+++ b/safekeeper/src/broker.rs
@@ -10,11 +10,20 @@ use anyhow::Result;
 use storage_broker::parse_proto_ttid;

 use storage_broker::proto::subscribe_safekeeper_info_request::SubscriptionKey as ProtoSubscriptionKey;
+use storage_broker::proto::FilterTenantTimelineId;
+use storage_broker::proto::MessageType;
+use storage_broker::proto::SafekeeperDiscoveryResponse;
+use storage_broker::proto::SubscribeByFilterRequest;
 use storage_broker::proto::SubscribeSafekeeperInfoRequest;
+use storage_broker::proto::TypeSubscription;
+use storage_broker::proto::TypedMessage;
 use storage_broker::Request;

+use std::sync::atomic::AtomicU64;
+use std::sync::Arc;
 use std::time::Duration;
 use std::time::Instant;
+use std::time::UNIX_EPOCH;
 use tokio::task::JoinHandle;
 use tokio::time::sleep;
 use tracing::*;
@@ -31,6 +40,12 @@ const PUSH_INTERVAL_MSEC: u64 = 1000;

 /// Push once in a while data about all active timelines to the broker.
 async fn push_loop(conf: SafeKeeperConf) -> anyhow::Result<()> {
+    if conf.disable_periodic_broker_push {
+        info!("broker push_loop is disabled, doing nothing...");
+        futures::future::pending::<()>().await; // sleep forever
+        return Ok(());
+    }
+
    let mut client =
        storage_broker::connect(conf.broker_endpoint.clone(), conf.broker_keepalive_interval)?;
    let push_interval = Duration::from_millis(PUSH_INTERVAL_MSEC);
@@ -75,7 +90,7 @@ async fn push_loop(conf: SafeKeeperConf) -> anyhow::Result<()> {
 }

 /// Subscribe and fetch all the interesting data from the broker.
-async fn pull_loop(conf: SafeKeeperConf) -> Result<()> {
+async fn pull_loop(conf: SafeKeeperConf, stats: Arc<BrokerStats>) -> Result<()> {
    let mut client = storage_broker::connect(conf.broker_endpoint, conf.broker_keepalive_interval)?;

    // TODO: subscribe only to local timelines instead of all
@@ -94,6 +109,8 @@ async fn pull_loop(conf: SafeKeeperConf) -> Result<()> {
    let err_counter = BROKER_PULLED_UPDATES.with_label_values(&["error"]);

    while let Some(msg) = stream.message().await? {
+        stats.update_pulled();
+
        let proto_ttid = msg
            .tenant_timeline_id
            .as_ref()
@@ -119,12 +136,93 @@ async fn pull_loop(conf: SafeKeeperConf) -> Result<()> {
    bail!("end of stream");
 }

+/// Process incoming discover requests. This is done in a separate task to avoid
+/// interfering with the normal pull/push loops.
+async fn discover_loop(conf: SafeKeeperConf, stats: Arc<BrokerStats>) -> Result<()> {
+    let mut client =
+        storage_broker::connect(conf.broker_endpoint.clone(), conf.broker_keepalive_interval)?;
+
+    let request = SubscribeByFilterRequest {
+        types: vec![TypeSubscription {
+            r#type: MessageType::SafekeeperDiscoveryRequest as i32,
+        }],
+        tenant_timeline_id: Some(FilterTenantTimelineId {
+            enabled: false,
+            tenant_timeline_id: None,
+        }),
+    };
+
+    let mut stream = client
+        .subscribe_by_filter(request)
+        .await
+        .context("subscribe_by_filter request failed")?
+        .into_inner();
+
+    let discover_counter = BROKER_PULLED_UPDATES.with_label_values(&["discover"]);
+
+    while let Some(typed_msg) = stream.message().await? {
+        stats.update_pulled();
+
+        match typed_msg.r#type() {
+            MessageType::SafekeeperDiscoveryRequest => {
+                let msg = typed_msg
+                    .safekeeper_discovery_request
+                    .ok_or_else(|| anyhow!("proto type mismatch from broker message"))?;
+
+                let proto_ttid = msg
+                    .tenant_timeline_id
+                    .as_ref()
+                    .ok_or_else(|| anyhow!("missing tenant_timeline_id"))?;
+                let ttid = parse_proto_ttid(proto_ttid)?;
+                if let Ok(tli) = GlobalTimelines::get(ttid) {
+                    // we received a discovery request for a timeline we know about
+                    discover_counter.inc();
+
+                    // create and reply with discovery response
+                    let sk_info = tli.get_safekeeper_info(&conf).await;
+                    let response = SafekeeperDiscoveryResponse {
+                        safekeeper_id: sk_info.safekeeper_id,
+                        tenant_timeline_id: sk_info.tenant_timeline_id,
+                        commit_lsn: sk_info.commit_lsn,
+                        safekeeper_connstr: sk_info.safekeeper_connstr,
+                        availability_zone: sk_info.availability_zone,
+                    };
+
+                    // note: the next request is not read until this publish completes
+                    client
+                        .publish_one(TypedMessage {
+                            r#type: MessageType::SafekeeperDiscoveryResponse as i32,
+                            safekeeper_timeline_info: None,
+                            safekeeper_discovery_request: None,
+                            safekeeper_discovery_response: Some(response),
+                        })
+                        .await?;
+                }
+            }
+
+            _ => {
+                warn!(
+                    "unexpected message type i32 {}, {:?}",
+                    typed_msg.r#type,
+                    typed_msg.r#type()
+                );
+            }
+        }
+    }
+    bail!("end of stream");
+}
+
 pub async fn task_main(conf: SafeKeeperConf) -> anyhow::Result<()> {
    info!("started, broker endpoint {:?}", conf.broker_endpoint);

    let mut ticker = tokio::time::interval(Duration::from_millis(RETRY_INTERVAL_MSEC));
    let mut push_handle: Option<JoinHandle<Result<(), Error>>> = None;
    let mut pull_handle: Option<JoinHandle<Result<(), Error>>> = None;
+    let mut discover_handle: Option<JoinHandle<Result<(), Error>>> = None;
+
+    let stats = Arc::new(BrokerStats::new());
+    let stats_task = task_stats(stats.clone());
+    tokio::pin!(stats_task);

    // Selecting on JoinHandles requires some squats; is there a better way to
    // reap tasks individually?
@@ -153,13 +251,77 @@ pub async fn task_main(conf: SafeKeeperConf) -> anyhow::Result<()> {
                    };
                    pull_handle = None;
                },
+                res = async { discover_handle.as_mut().unwrap().await }, if discover_handle.is_some() => {
+                    // was it panic or normal error?
+                    match res {
+                        Ok(res_internal) => if let Err(err_inner) = res_internal {
+                            warn!("discover task failed: {:?}", err_inner);
+                        }
+                        Err(err_outer) => { warn!("discover task panicked: {:?}", err_outer) }
+                    };
+                    discover_handle = None;
+                },
                _ = ticker.tick() => {
                    if push_handle.is_none() {
                        push_handle = Some(tokio::spawn(push_loop(conf.clone())));
                    }
                    if pull_handle.is_none() {
-                        pull_handle = Some(tokio::spawn(pull_loop(conf.clone())));
+                        pull_handle = Some(tokio::spawn(pull_loop(conf.clone(), stats.clone())));
                    }
+                    if discover_handle.is_none() {
+                        discover_handle = Some(tokio::spawn(discover_loop(conf.clone(), stats.clone())));
+                    }
+                },
+                _ = &mut stats_task => {}
+        }
+    }
+}
+
+struct BrokerStats {
+    /// Unix timestamp in milliseconds of the last message received from the broker, 0 if none yet.
+    last_pulled_ts: AtomicU64,
+}
+
+impl BrokerStats {
+    fn new() -> Self {
+        BrokerStats {
+            last_pulled_ts: AtomicU64::new(0),
+        }
+    }
+
+    fn now_millis() -> u64 {
+        std::time::SystemTime::now()
+            .duration_since(UNIX_EPOCH)
+            .expect("time is before epoch")
+            .as_millis() as u64
+    }
+
+    /// Record the current system time as the moment of the last pulled message.
+    fn update_pulled(&self) {
+        self.last_pulled_ts
+            .store(Self::now_millis(), std::sync::atomic::Ordering::Relaxed);
+    }
+}
+
+/// Periodically write to logs if there are issues with receiving data from the broker.
+async fn task_stats(stats: Arc<BrokerStats>) {
+    let warn_duration = Duration::from_secs(10);
+    let mut ticker = tokio::time::interval(warn_duration);
+
+    loop {
+        tokio::select! {
+            _ = ticker.tick() => {
+                let last_pulled = stats.last_pulled_ts.load(std::sync::atomic::Ordering::SeqCst);
+                if last_pulled == 0 {
+                    // no broker updates yet
+                    continue;
+                }
+
+                let now = BrokerStats::now_millis();
+                if now > last_pulled && now - last_pulled > warn_duration.as_millis() as u64 {
+                    let ts = chrono::NaiveDateTime::from_timestamp_millis(last_pulled as i64).expect("invalid timestamp");
+                    info!("no broker updates for some time, last update: {:?}", ts);
+                }
            }
        }
    }
--- a/safekeeper/src/lib.rs
+++ b/safekeeper/src/lib.rs
@@ -83,6 +83,7 @@ pub struct SafeKeeperConf {
    pub walsenders_keep_horizon: bool,
    pub partial_backup_enabled: bool,
    pub partial_backup_timeout: Duration,
+    pub disable_periodic_broker_push: bool,
 }

 impl SafeKeeperConf {
@@ -129,6 +130,7 @@ impl SafeKeeperConf {
            walsenders_keep_horizon: false,
            partial_backup_enabled: false,
            partial_backup_timeout: Duration::from_secs(0),
+            disable_periodic_broker_push: false,
        }
    }
 }
--- a/safekeeper/tests/walproposer_sim/safekeeper.rs
+++ b/safekeeper/tests/walproposer_sim/safekeeper.rs
@@ -178,6 +178,7 @@ pub fn run_server(os: NodeOs, disk: Arc<SafekeeperDisk>) -> Result<()> {
        walsenders_keep_horizon: false,
        partial_backup_enabled: false,
        partial_backup_timeout: Duration::from_secs(0),
+        disable_periodic_broker_push: false,
    };

    let mut global = GlobalMap::new(disk, conf.clone())?;
--- a/storage_broker/src/bin/storage_broker.rs
+++ b/storage_broker/src/bin/storage_broker.rs
@@ -196,8 +196,13 @@ impl SubscriptionKey {

    /// Parse from FilterTenantTimelineId
    pub fn from_proto_filter_tenant_timeline_id(
-        f: &FilterTenantTimelineId,
+        opt: Option<&FilterTenantTimelineId>,
    ) -> Result<Self, Status> {
+        if opt.is_none() {
+            return Ok(SubscriptionKey::All);
+        }
+
+        let f = opt.unwrap();
        if !f.enabled {
            return Ok(SubscriptionKey::All);
        }
@@ -534,10 +539,7 @@ impl BrokerService for Broker {
            .remote_addr()
            .expect("TCPConnectInfo inserted by handler");
        let proto_filter = request.into_inner();
-        let ttid_filter = proto_filter
-            .tenant_timeline_id
-            .as_ref()
-            .ok_or_else(|| Status::new(Code::InvalidArgument, "missing tenant_timeline_id"))?;
+        let ttid_filter = proto_filter.tenant_timeline_id.as_ref();

        let sub_key = SubscriptionKey::from_proto_filter_tenant_timeline_id(ttid_filter)?;
        let types_set = proto_filter
--- a/storage_controller/src/service.rs
+++ b/storage_controller/src/service.rs
@@ -90,7 +90,11 @@ const INITIAL_GENERATION: Generation = Generation::new(0);
 /// up on unresponsive pageservers and proceed.
 pub(crate) const STARTUP_RECONCILE_TIMEOUT: Duration = Duration::from_secs(30);

-pub const MAX_UNAVAILABLE_INTERVAL_DEFAULT: Duration = Duration::from_secs(30);
+/// How long a node may be unresponsive to heartbeats before we declare it offline.
+/// This must be long enough to cover node restarts as well as normal operations: in future
+/// it should be separated into distinct timeouts for startup vs. normal operation
+/// (`<https://github.com/neondatabase/neon/issues/7552>`)
+pub const MAX_UNAVAILABLE_INTERVAL_DEFAULT: Duration = Duration::from_secs(300);

 pub const RECONCILER_CONCURRENCY_DEFAULT: usize = 128;

@@ -4251,7 +4255,9 @@ impl Service {
    /// Check all tenants for pending reconciliation work, and reconcile those in need.
    /// Additionally, reschedule tenants that require it.
    ///
-    /// Returns how many reconciliation tasks were started
+    /// Returns how many reconciliation tasks were started, or `1` if no reconciles were
+    /// spawned but some _would_ have been spawned if `reconciler_concurrency` units were
+    /// available.  A return value of 0 indicates that everything is fully reconciled already.
    fn reconcile_all(&self) -> usize {
        let mut locked = self.inner.write().unwrap();
        let (nodes, tenants, _scheduler) = locked.parts_mut();
@@ -4266,7 +4272,11 @@ impl Service {
            }

            // Skip checking if this shard is already enqueued for reconciliation
-            if shard.delayed_reconcile {
+            if shard.delayed_reconcile && self.reconciler_concurrency.available_permits() == 0 {
+                // If there is something delayed, then return a nonzero count so that
+                // callers like reconcile_all_now do not incorrectly get the impression
+                // that the system is in a quiescent state.
+                reconciles_spawned = std::cmp::max(1, reconciles_spawned);
                continue;
            }

@@ -4451,7 +4461,7 @@ impl Service {
            waiter_count
        );

-        Ok(waiter_count)
+        Ok(std::cmp::max(waiter_count, reconciles_spawned))
    }

    pub async fn shutdown(&self) {
--- a/storage_controller/src/tenant_shard.rs
+++ b/storage_controller/src/tenant_shard.rs
@@ -952,8 +952,8 @@ impl TenantShard {

    /// Create a waiter that will wait for some future Reconciler that hasn't been spawned yet.
    ///
-    /// This is appropriate when you can't spawn a recociler (e.g. due to resource limits), but
-    /// you would like to wait until one gets spawned in the background.
+    /// This is appropriate when you can't spawn a reconciler (e.g. due to resource limits), but
+    /// you would like to wait on the next reconciler that gets spawned in the background.
    pub(crate) fn future_reconcile_waiter(&mut self) -> ReconcilerWaiter {
        self.ensure_sequence_ahead();

--- a/test_runner/fixtures/compute_reconfigure.py
+++ b/test_runner/fixtures/compute_reconfigure.py
@@ -14,10 +14,18 @@ class ComputeReconfigure:
        self.server = server
        self.control_plane_compute_hook_api = f"http://{server.host}:{server.port}/notify-attach"
        self.workloads = {}
+        self.on_notify = None

    def register_workload(self, workload):
        self.workloads[workload.tenant_id] = workload

+    def register_on_notify(self, fn):
+        """
+        Add some extra work during a notification, like sleeping to slow things down, or
+        logging what was notified.
+        """
+        self.on_notify = fn
+

@pytest.fixture(scope="function")
 def compute_reconfigure_listener(make_httpserver):
@@ -43,6 +51,9 @@ def compute_reconfigure_listener(make_httpserver):
        body: dict[str, Any] = request.json
        log.info(f"notify-attach request: {body}")

+        if self.on_notify is not None:
+            self.on_notify(body)
+
        try:
            workload = self.workloads[TenantId(body["tenant_id"])]
        except KeyError:
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -499,6 +499,7 @@ class NeonEnvBuilder:
        self.config_init_force: Optional[str] = None
        self.top_output_dir = top_output_dir
        self.control_plane_compute_hook_api: Optional[str] = None
+        self.storage_controller_config: Optional[dict[Any, Any]] = None

        self.pageserver_virtual_file_io_engine: Optional[str] = pageserver_virtual_file_io_engine

@@ -1021,6 +1022,7 @@ class NeonEnv:
        self.pg_distrib_dir = config.pg_distrib_dir
        self.endpoint_counter = 0
        self.pageserver_config_override = config.pageserver_config_override
+        self.storage_controller_config = config.storage_controller_config

        # generate initial tenant ID here instead of letting 'neon init' generate it,
        # so that we don't need to dig it out of the config file afterwards.
@@ -1066,6 +1068,9 @@ class NeonEnv:
        if self.control_plane_compute_hook_api is not None:
            cfg["control_plane_compute_hook_api"] = self.control_plane_compute_hook_api

+        if self.storage_controller_config is not None:
+            cfg["storage_controller"] = self.storage_controller_config
+
        # Create config for pageserver
        http_auth_type = "NeonJWT" if config.auth_enabled else "Trust"
        pg_auth_type = "NeonJWT" if config.auth_enabled else "Trust"
@@ -1134,12 +1139,9 @@ class NeonEnv:
        # bounce through retries on startup
        self.storage_controller.start()

-        def storage_controller_ready():
-            assert self.storage_controller.ready() is True
-
        # Wait for storage controller readiness to prevent unnecessary post start-up
        # reconcile.
-        wait_until(30, 1, storage_controller_ready)
+        self.storage_controller.wait_until_ready()

        # Start up broker, pageserver and all safekeepers
        futs = []
@@ -2043,6 +2045,15 @@ class NeonStorageController(MetricsGetter):
        else:
            raise RuntimeError(f"Unexpected status {status} from readiness endpoint")

+    def wait_until_ready(self):
+        t1 = time.time()
+
+        def storage_controller_ready():
+            assert self.ready() is True
+
+        wait_until(30, 1, storage_controller_ready)
+        return time.time() - t1
+
    def attach_hook_issue(
        self, tenant_shard_id: Union[TenantId, TenantShardId], pageserver_id: int
    ) -> int:
@@ -2130,7 +2141,7 @@ class NeonStorageController(MetricsGetter):
        shard_count: Optional[int] = None,
        shard_stripe_size: Optional[int] = None,
        tenant_config: Optional[Dict[Any, Any]] = None,
-        placement_policy: Optional[str] = None,
+        placement_policy: Optional[Union[Dict[Any, Any] | str]] = None,
    ):
        """
        Use this rather than pageserver_api() when you need to include shard parameters
@@ -2240,10 +2251,21 @@ class NeonStorageController(MetricsGetter):
    def reconcile_until_idle(self, timeout_secs=30):
        start_at = time.time()
        n = 1
+        delay_sec = 0.5
+        delay_max = 5
        while n > 0:
            n = self.reconcile_all()
-            if time.time() - start_at > timeout_secs:
+            if n == 0:
+                break
+            elif time.time() - start_at > timeout_secs:
                raise RuntimeError("Timeout in reconcile_until_idle")
+            else:
+                # Don't call again right away: if we're waiting for many reconciles that
+                # are blocked on the concurrency limit, it slows things down to call
+                # reconcile_all frequently.
+                time.sleep(delay_sec)
+                delay_sec *= 2
+                delay_sec = min(delay_sec, delay_max)

    def consistency_check(self):
        """
@@ -3734,7 +3756,9 @@ class S3Scrubber:
        return stdout

    def scan_metadata(self) -> Any:
-        stdout = self.scrubber_cli(["scan-metadata", "--json"], timeout=30)
+        stdout = self.scrubber_cli(
+            ["scan-metadata", "--node-kind", "pageserver", "--json"], timeout=30
+        )

        try:
            return json.loads(stdout)
--- a/test_runner/performance/test_storage_controller_scale.py
+++ b/test_runner/performance/test_storage_controller_scale.py
@@ -0,0 +1,198 @@
+import concurrent.futures
+import random
+import time
+
+import pytest
+from fixtures.compute_reconfigure import ComputeReconfigure
+from fixtures.log_helper import log
+from fixtures.neon_fixtures import (
+    NeonEnvBuilder,
+)
+from fixtures.pageserver.http import PageserverHttpClient
+from fixtures.pg_version import PgVersion
+from fixtures.types import TenantId, TenantShardId, TimelineId
+
+
+@pytest.mark.timeout(3600)  # super long running test: should go down as we optimize
+def test_storage_controller_many_tenants(
+    neon_env_builder: NeonEnvBuilder, compute_reconfigure_listener: ComputeReconfigure
+):
+    """
+    Check that we cope well with a not-totally-trivial number of tenants.
+
+    This is checking for:
+    - Obvious concurrency bugs from issuing many tenant creations/modifications
+      concurrently.
+    - Obvious scaling bugs like O(N^2) scaling that would be so slow that even
+      a basic test starts failing from slowness.
+
+    This is _not_ a comprehensive scale test: just a basic sanity check that
+    we don't fall over for a thousand shards.
+    """
+
+    neon_env_builder.num_pageservers = 5
+    neon_env_builder.storage_controller_config = {
+        # Default neon_local uses a small timeout: use a longer one to tolerate longer pageserver restarts.
+        # TODO: tune this down as restarts get faster (https://github.com/neondatabase/neon/pull/7553), to
+        # guard against regressions in restart time.
+        "max_unavailable": "300s"
+    }
+    neon_env_builder.control_plane_compute_hook_api = (
+        compute_reconfigure_listener.control_plane_compute_hook_api
+    )
+
+    # A small sleep on each call into the notify hook, to simulate the latency of doing a database write
+    compute_reconfigure_listener.register_on_notify(lambda body: time.sleep(0.01))
+
+    env = neon_env_builder.init_start()
+
+    # We will intentionally stress reconciler concurrency, which triggers a warning when lots
+    # of shards are hitting the delayed path.
+    env.storage_controller.allowed_errors.append(".*Many shards are waiting to reconcile")
+
+    for ps in env.pageservers:
+        # This can happen because when we do a loop over all pageservers and mark them offline/active,
+        # reconcilers might get cancelled, and the next reconcile can follow a not-so-elegant path of
+        # bumping generation before other attachments are detached.
+        #
+        # We could clean this up by making reconcilers respect the .observed of their predecessor, if
+        # we spawn with a wait for the predecessor.
+        ps.allowed_errors.append(".*Dropped remote consistent LSN updates.*")
+
+        # Storage controller is allowed to drop pageserver requests when the cancellation token
+        # for a Reconciler fires.
+        ps.allowed_errors.append(".*request was dropped before completing.*")
+
+    # Total tenants
+    tenant_count = 4000
+
+    # Shards per tenant
+    shard_count = 2
+    stripe_size = 1024
+
+    tenants = set(TenantId.generate() for _i in range(0, tenant_count))
+
+    virtual_ps_http = PageserverHttpClient(env.storage_controller_port, lambda: True)
+
+    def check_memory():
+        # Shards should be _cheap_ in memory, as we will have very many of them
+        expect_memory_per_shard = 128 * 1024
+
+        rss = env.storage_controller.get_metric_value("process_resident_memory_bytes")
+        assert rss is not None
+        log.info(f"Resident memory: {rss} ({ rss / (shard_count * tenant_count)} per shard)")
+        assert rss < expect_memory_per_shard * shard_count * tenant_count
+
+    # We use a fixed seed to make the test somewhat reproducible: we want a randomly
+    # chosen order in the sense that it's arbitrary, but not in the sense that it should change every run.
+    rng = random.Random(1234)
+
+    # Issue more concurrent operations than the storage controller's reconciler concurrency semaphore
+    # permits, to ensure that we are actually stressing that limit.
+    api_concurrency = 135
+
+    # We will create tenants directly via API, not via neon_local, to avoid any false
+    # serialization of operations in neon_local (it e.g. loads/saves a config file on each call)
+    with concurrent.futures.ThreadPoolExecutor(max_workers=api_concurrency) as executor:
+        futs = []
+        t1 = time.time()
+        for tenant_id in tenants:
+            f = executor.submit(
+                env.storage_controller.tenant_create,
+                tenant_id,
+                shard_count,
+                stripe_size,
+                placement_policy={"Attached": 1},
+            )
+            futs.append(f)
+
+        # Wait for creations to finish
+        for f in futs:
+            f.result()
+        log.info(
+            f"Created {len(tenants)} tenants in {time.time() - t1}, {len(tenants) / (time.time() - t1)}/s"
+        )
+
+        run_ops = api_concurrency * 4
+        assert run_ops < len(tenants)
+        op_tenants = list(tenants)[0:run_ops]
+
+        # Generate a mixture of operations and dispatch them all concurrently
+        futs = []
+        for tenant_id in op_tenants:
+            op = rng.choice([0, 1, 2])
+            if op == 0:
+                # A fan-out write operation to all shards in a tenant (timeline creation)
+                f = executor.submit(
+                    virtual_ps_http.timeline_create,
+                    PgVersion.NOT_SET,
+                    tenant_id,
+                    TimelineId.generate(),
+                )
+            elif op == 1:
+                # A reconciler operation: migrate a shard.
+                shard_number = rng.randint(0, shard_count - 1)
+                tenant_shard_id = TenantShardId(tenant_id, shard_number, shard_count)
+                dest_ps_id = rng.choice([ps.id for ps in env.pageservers])
+                f = executor.submit(
+                    env.storage_controller.tenant_shard_migrate, tenant_shard_id, dest_ps_id
+                )
+            elif op == 2:
+                # A passthrough read to shard zero
+                f = executor.submit(virtual_ps_http.tenant_status, tenant_id)
+
+            futs.append(f)
+
+        # Wait for mixed ops to finish
+        for f in futs:
+            f.result()
+
+    # Consistency check is safe here: all the previous operations waited for reconcile before completing
+    env.storage_controller.consistency_check()
+    check_memory()
+
+    # This loop waits for reconcile_all to indicate no pending work, and then calls it once more to time
+    # how long the call takes when idle: this iterates over shards while doing no I/O and should be reliably fast: if
+    # it isn't, that's a sign that we have made some algorithmic mistake (e.g. O(N**2) scheduling)
+    #
+    # We do not require that the system is already quiescent here, although at this point in the test
+    # that may currently be the case.
+    while True:
+        t1 = time.time()
+        reconcilers = env.storage_controller.reconcile_all()
+        if reconcilers == 0:
+            # Time how long a no-op background reconcile takes: this measures how long it takes to
+            # loop over all the shards looking for work to do.
+            runtime = time.time() - t1
+            log.info(f"No-op call to reconcile_all took {runtime}s")
+            assert runtime < 1
+            break
+
+    # Restart the storage controller
+    env.storage_controller.stop()
+    env.storage_controller.start()
+
+    # See how long the controller takes to pass its readiness check.  This should be fast because
+    # all the nodes are online: offline pageservers are the only thing that's allowed to delay
+    # startup.
+    readiness_period = env.storage_controller.wait_until_ready()
+    assert readiness_period < 5
+
+    # Consistency check is safe here: the storage controller's restart should not have caused any reconcilers
+    # to run, as it was in a stable state before restart.  If it did, that's a bug.
+    env.storage_controller.consistency_check()
+    check_memory()
+
+    # Restart pageservers: this exercises the /re-attach API
+    for pageserver in env.pageservers:
+        pageserver.stop()
+        pageserver.start()
+
+    # Consistency check is safe here: restarting pageservers should not have caused any Reconcilers to spawn,
+    # as they were not offline long enough to trigger any scheduling changes.
+    env.storage_controller.consistency_check()
+    check_memory()
+
+    # Stop the storage controller before tearing down fixtures, because it otherwise might log
+    # errors trying to call our `ComputeReconfigure`.
+    env.storage_controller.stop()
--- a/test_runner/regress/test_pg_waldump.py
+++ b/test_runner/regress/test_pg_waldump.py
@@ -0,0 +1,46 @@
+import os
+
+from fixtures.neon_fixtures import NeonEnv, PgBin
+from fixtures.utils import subprocess_capture
+
+
+# Simple test to check that pg_waldump works with neon WAL files
+def test_pg_waldump(neon_simple_env: NeonEnv, test_output_dir, pg_bin: PgBin):
+    env = neon_simple_env
+    env.neon_cli.create_branch("test_pg_waldump", "empty")
+    endpoint = env.endpoints.create_start("test_pg_waldump")
+
+    cur = endpoint.connect().cursor()
+    cur.execute(
+        """
+        BEGIN;
+        CREATE TABLE t1(i int primary key, n_updated int);
+        INSERT INTO t1 select g, 0 from generate_series(1, 50) g;
+        ROLLBACK;
+    """
+    )
+
+    cur.execute(
+        """
+        BEGIN;
+        CREATE TABLE t1(i int primary key, n_updated int);
+        INSERT INTO t1 select g, 0 from generate_series(1, 50) g;
+        COMMIT;
+    """
+    )
+
+    # stop the endpoint to make sure that WAL files are flushed and won't change
+    endpoint.stop()
+
+    assert endpoint.pgdata_dir
+    wal_path = os.path.join(endpoint.pgdata_dir, "pg_wal/000000010000000000000001")
+    pg_waldump_path = os.path.join(pg_bin.pg_bin_path, "pg_waldump")
+
+    # use special --ignore option to ignore the validation checks in pg_waldump
+    # this is necessary because neon WAL files contain a gap at the beginning
+    output_path, _, _ = subprocess_capture(test_output_dir, [pg_waldump_path, "--ignore", wal_path])
+
+    with open(f"{output_path}.stdout", "r") as f:
+        stdout = f.read()
+        assert "ABORT" in stdout
+        assert "COMMIT" in stdout
--- a/test_runner/regress/test_sharding.py
+++ b/test_runner/regress/test_sharding.py
@@ -928,6 +928,8 @@ def test_sharding_split_failures(
            ".*Reconcile error: receive body: error sending request for url.*",
            # Node offline cases will fail inside reconciler when detaching secondaries
            ".*Reconcile error on shard.*: receive body: error sending request for url.*",
+            # Node offline cases may eventually cancel reconcilers when the heartbeater realizes nodes are offline
+            ".*Reconcile error.*Cancelled.*",
            # While parent shard's client is stopped during split, flush loop updating LSNs will emit this warning
            ".*Failed to schedule metadata upload after updating disk_consistent_lsn.*",
        ]
--- a/test_runner/regress/test_wal_acceptor.py
+++ b/test_runner/regress/test_wal_acceptor.py
@@ -1828,7 +1828,7 @@ def test_idle_reconnections(neon_env_builder: NeonEnvBuilder):
    env = neon_env_builder.init_start()

    tenant_id = env.initial_tenant
-    timeline_id = env.neon_cli.create_branch("test_sk_auth_restart_endpoint")
+    timeline_id = env.neon_cli.create_branch("test_idle_reconnections")

    def collect_stats() -> Dict[str, float]:
        # we need to collect safekeeper_pg_queries_received_total metric from all safekeepers
@@ -1859,7 +1859,7 @@ def test_idle_reconnections(neon_env_builder: NeonEnvBuilder):

    collect_stats()

-    endpoint = env.endpoints.create_start("test_sk_auth_restart_endpoint")
+    endpoint = env.endpoints.create_start("test_idle_reconnections")
    # just write something to the timeline
    endpoint.safe_psql("create table t(i int)")
    collect_stats()
@@ -2007,3 +2007,47 @@ def test_patch_control_file(neon_env_builder: NeonEnvBuilder):
    )
    log.info(f"dump_control_file response: {res}")
    assert res["timelines"][0]["control_file"]["timeline_start_lsn"] == "0/1"
+
+
+# Test disables periodic pushes from safekeeper to the broker and checks that
+# pageserver can still discover safekeepers with discovery requests.
+def test_broker_discovery(neon_env_builder: NeonEnvBuilder):
+    neon_env_builder.num_safekeepers = 3
+    neon_env_builder.enable_safekeeper_remote_storage(RemoteStorageKind.LOCAL_FS)
+    env = neon_env_builder.init_start()
+
+    env.neon_cli.create_branch("test_broker_discovery")
+
+    endpoint = env.endpoints.create_start(
+        "test_broker_discovery",
+        config_lines=["shared_buffers=1MB"],
+    )
+    endpoint.safe_psql("create table t(i int, payload text)")
+    # Install extension containing function needed to clear buffer
+    endpoint.safe_psql("CREATE EXTENSION neon_test_utils")
+
+    def do_something():
+        time.sleep(1)
+        # generate some data to commit WAL on safekeepers
+        endpoint.safe_psql("insert into t select generate_series(1,100), 'action'")
+        # clear the buffers
+        endpoint.safe_psql("select clear_buffer_cache()")
+        # read data to fetch pages from pageserver
+        endpoint.safe_psql("select sum(i) from t")
+
+    do_something()
+    do_something()
+
+    for sk in env.safekeepers:
+        # Disable periodic broker push, so pageserver won't be able to discover
+        # safekeepers without sending a discovery request
+        sk.stop().start(extra_opts=["--disable-periodic-broker-push"])
+
+    do_something()
+    do_something()
+
+    # restart pageserver and check how everything works
+    env.pageserver.stop().start()
+
+    do_something()
+    do_something()
--- a/vendor/postgres-v14
+++ b/vendor/postgres-v14
--- a/vendor/postgres-v15
+++ b/vendor/postgres-v15
--- a/vendor/postgres-v16
+++ b/vendor/postgres-v16
--- a/vendor/revisions.json
+++ b/vendor/revisions.json
@@ -1,5 +1,5 @@
 {
-  "postgres-v16": "261497dd63ace434045058b1453bcbaaa83f23e5",
-  "postgres-v15": "85d809c124a898847a97d66a211f7d5ef4f8e0cb",
-  "postgres-v14": "d9149dc59abcbeeb26293707509aef51752db28f"
+  "postgres-v16": "10fb158d0ec04f17e4076e5e1ce0bae6ba9acb3b",
+  "postgres-v15": "d6c949db38c932d9f29413a185f4dcf0d0b116b9",
+  "postgres-v14": "574f0e509e98a171d11b2a252cf4763b18fc80d6"
 }
Author	SHA1	Message	Date
Anastasia Lubennikova	5ccf32b756	fix vendor/revisions.json	2024-05-01 19:50:14 +01:00
Anastasia Lubennikova	120bd1972f	Bump vendor/postrges	2024-05-01 19:50:14 +01:00
Anastasia Lubennikova	434eea7d11	Add test_pg_waldump.py Simple test to ensure that pg_waldump works with neon WAL files	2024-05-01 19:50:14 +01:00
Alex Chi Z	5558457c84	chore(pageserver): categorize basebackup errors (#7523 ) close https://github.com/neondatabase/neon/issues/7391 ## Summary of changes Categorize basebackup error into two types: server error and client error. This makes it easier to set up alerts. --------- Signed-off-by: Alex Chi Z <chi@neon.tech>	2024-05-01 16:31:59 +00:00
Alex Chi Z	26e6ff8ba6	chore(pageserver): concise error message for layer traversal (#7565 ) Instead of showing the full path of layer traversal, we now only show tenant (in tracing context)+timeline+filename. --------- Signed-off-by: Alex Chi Z <chi@neon.tech>	2024-05-01 11:44:42 -04:00
Arthur Petukhovsky	50a45e67dc	Discover safekeepers via broker request (#7279 ) We had an incident where pageserver requests timed out because pageserver couldn't fetch WAL from safekeepers. This incident was caused by a bug in safekeeper logic for timeline activation, which prevented pageserver from finding safekeepers. This bug was since fixed, but there is still a chance of a similar bug in the future due to overall complexity. We add a new broker message to "signal interest" for timeline. This signal will be sent by pageservers `wait_lsn`, and safekeepers will receive this signal to start broadcasting broker messages. Then every broker subscriber will be able to find the safekeepers and connect to them (to start fetching WAL). This feature is not limited to pageservers and any service that wants to download WAL from safekeepers will be able to use this discovery request. This commit changes pageserver's connection_manager (walreceiver) to send a SafekeeperDiscoveryRequest when there is no information about safekeepers present in memory. Current implementation will send these requests only if there is an active wait_lsn() call and no more often than once per 10 seconds. Add `test_broker_discovery` to test this: safekeepers started with `--disable-periodic-broker-push` will not push info to broker so that pageserver must use a discovery to start fetching WAL. Add task_stats in safekeepers broker module to log a warning if there is no message received from the broker for the last 10 seconds. Closes #5471 --------- Co-authored-by: Christian Schwarz <christian@neon.tech>	2024-04-30 18:50:03 +00:00
Andrew Rudenko	fcbe60f436	Makefile: DISABLE_HOMEBREW variable (#7556 ) ## Problem The current Makefile assumes that homebrew is used on macos. There are other ways to install dependencies on MacOS (nix, macports, "manually"). It would be great to allow the one who wants to use other options to disable homebrew integration. ## Summary of changes It adds DISABLE_HOMEBREW variable that if set skips extra homebrew-specific configuration steps.	2024-04-30 19:44:02 +02:00
John Spray	e018cac1f7	tests: tweak log allow list in test_sharding_split_failures (#7549 ) ## Problem This test became flaky recently with failures like: ``` AssertionError: Log errors on storage_controller: (129, '2024-04-29T16:41:03.591506Z ERROR request{method=PUT path=/control/v1/tenant/b38c0447fbdbcf4e1c023f00b0f7c221/shard_split request_id=34df4975-2ef3-4ed8-b167-2956650e365c}: Error processing HTTP request: InternalServerError(Reconcile error on shard b38c0447fbdbcf4e1c023f00b0f7c221-0002: Cancelled\n') ``` Likely due to #7508 changing how errors are reported from Reconcilers. ## Summary of changes - Tolerate `Reconcile error.*Cancelled` log errors	2024-04-30 18:00:24 +01:00
John Spray	a74b60066c	storage controller: test for large shard counts (#7475 ) ## Problem Storage controller was observed to have unexpectedly large memory consumption when loaded with many thousands of shards. This was recently fixed: - https://github.com/neondatabase/neon/pull/7493 ...but we need a general test that the controller is well behaved with thousands of shards. Closes: https://github.com/neondatabase/neon/issues/7460 Closes: https://github.com/neondatabase/neon/issues/7463 ## Summary of changes - Add test test_storage_controller_many_tenants to exercise the system's behaviour with a more substantial workload. This test measures memory consumption and reproduces #7460 before the other changes in this PR. - Tweak reconcile_all's return value to make it nonzero if it spawns no reconcilers, but _would_ have spawned some reconcilers if they weren't blocked by the reconcile concurrency limit. This makes the test's reconcile_until_idle behave as expected (i.e. not complete until the system is nice and calm). - Fix an issue where tenant migrations would leave a spurious secondary location when migrated to some location that was not already their secondary (this was an existing low-impact bug that tripped up the test's consistency checks). On the test with 8000 shards, the resident memory per shard is about 20KiB. This is not really per-shard memory: the primary source of memory growth is the number of concurrent network/db clients we create. With 8000 shards, the test takes 125s to run on my workstation.	2024-04-30 15:21:54 +00:00
Arseny Sher	3a2f10712a	Add more context to s3 listing error.	2024-04-30 18:19:52 +03:00
Arseny Sher	4ac4b21598	Add retries to cloud_admin client.	2024-04-30 18:19:52 +03:00
Arseny Sher	9f792f9c0b	Recheck tenant_id in find_timeline_branch. As it turns out we have at least one case of the same timeline_id in different projects.	2024-04-30 18:19:52 +03:00
Arseny Sher	7434674d86	Decrease CONSOLE_CONCURRENCY. Last run with 128 created too much load on cplane.	2024-04-30 18:19:52 +03:00
Arseny Sher	ea37234ccc	s3_scrubber: revive garbage collection for safekeepers. - pageserver_id in project details is now is optional, fix it - add active_timeline_count guard/stat similar to active_tenant_count - fix safekeeper prefix - count and log deleted keys	2024-04-30 18:19:52 +03:00
Arseny Sher	3da54e6d90	s3_scrubber: implement scan-metadata for safekeepers. It works by listing postgres table with memory dump of safekeepers state. s3 contents for each timeline are checked then against timeline_start_lsn and backup_lsn. If inconsistency is found, before complaining timeline (branch) is checked at control plane; it might have been deleted between the dump take and s3 check.	2024-04-30 18:19:52 +03:00
Arpad Müller	010f0a310a	Make test_random_updates and test_read_at_max_lsn compatible with new compaction (#7551 ) Makes two of the tests work with the tiered compaction that I had to ignore in #7283. The issue was that tiered compaction actually created image layers, but the keys didn't appear in them as `collect_keyspace` didn't include them. Not a compaction problem, but due to how the test is structured. Fixes #7287	2024-04-30 16:52:54 +02:00
John Spray	eb53345d48	pageserver: reduce runtime of init_tenant_mgr (#7553 ) ## Problem `init_tenant_mgr` blocks the rest of pageserver startup, including starting the admin API. This was noticeable in #7475 , where the init_tenant_mgr runtime could be long enough to trip the controller's 30 second heartbeat timeout. ## Summary of changes - When detaching tenants during startup, spawn the background deletes as background tasks instead of doing them inline - Write all configs before spawning any tenants, so that the config writes aren't fighting tenants for system resources - Write configs with some concurrency (16) rather than writing them all sequentially.	2024-04-30 15:16:15 +01:00
Alex Chi Z	45c625fb34	feat(pageserver): separate sparse and dense keyspace (#7503 ) extracted (and tested) from https://github.com/neondatabase/neon/pull/7468, part of https://github.com/neondatabase/neon/issues/7462. The current codebase assumes the keyspace is dense -- which means that if we have a keyspace of 0x00-0x100, we assume every key (e.g., 0x00, 0x01, 0x02, ...) exists in the storage engine. However, the assumption does not hold any more in metadata keyspace. The metadata keyspace is sparse. It is impossible to do per-key check. Ideally, we should not have the assumption of dense keyspace at all, but this would incur a lot of refactors. Therefore, we split the keyspaces we have to dense/sparse and handle them differently in the code for now. At some point in the future, we should assume all keyspaces are sparse. ## Summary of changes * Split collect_keyspace to return dense+sparse keyspace. * Do not allow generating image layers for sparse keyspace (for now -- will fix this next week, we need image layers anyways). * Generate delta layers for sparse keyspace. --------- Signed-off-by: Alex Chi Z <chi@neon.tech>	2024-04-30 09:39:10 -04:00
Cihan Demirci	84b6b95783	docs: fix unintentional file link (#7506 ) Not sure if this should actually be a link pointing to the `persistence.rs` file but following the conventions of the rest of the file, change `persistence.rs` reference to simply be a file name mention.	2024-04-30 14:17:01 +01:00
John Spray	577982b778	pageserver: remove workarounds from #7454 (#7550 ) PR #7454 included a workaround that let any existing bugged databases start up. Having used that already, we may now Closes: https://github.com/neondatabase/neon/issues/7480	2024-04-30 11:04:54 +01:00