pageserver: drop layers after shard split in GC

2026-07-04 12:40:37 +00:00 · 2024-04-30 11:29:14 +01:00
45 changed files with 513 additions and 2069 deletions
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1319,7 +1319,6 @@ dependencies = [
 "git-version",
 "hex",
 "humantime",
- "humantime-serde",
 "hyper 0.14.26",
 "nix 0.27.1",
 "once_cell",
@@ -3185,16 +3184,6 @@ dependencies = [
 "winapi",
 ]

-[[package]]
-name = "nu-ansi-term"
-version = "0.46.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "77a8165726e8236064dbb45459242600304b42a5ea24ee2948e18e023bf7ba84"
-dependencies = [
- "overload",
- "winapi",
-]
-
 [[package]]
 name = "num"
 version = "0.4.1"
@@ -3531,12 +3520,6 @@ version = "0.5.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "4030760ffd992bef45b0ae3f10ce1aba99e33464c90d14dd7c039884963ddc7a"

-[[package]]
-name = "overload"
-version = "0.1.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b15813163c1d831bf4a13c3610c05c0d03b39feb07f7e09fa234dac9b15aaf39"
-
 [[package]]
 name = "p256"
 version = "0.11.1"
@@ -5112,11 +5095,8 @@ dependencies = [
 "hex",
 "histogram",
 "itertools",
- "native-tls",
 "pageserver",
 "pageserver_api",
- "postgres-native-tls",
- "postgres_ffi",
 "rand 0.8.5",
 "remote_storage",
 "reqwest",
@@ -5125,10 +5105,8 @@ dependencies = [
 "serde_with",
 "thiserror",
 "tokio",
- "tokio-postgres",
 "tokio-rustls 0.25.0",
 "tokio-stream",
- "tokio-util",
 "tracing",
 "tracing-appender",
 "tracing-subscriber",
@@ -6529,7 +6507,6 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "30a651bc37f915e81f087d86e62a18eec5f79550c7faff886f7090b4ea757c77"
 dependencies = [
 "matchers",
- "nu-ansi-term",
 "once_cell",
 "regex",
 "serde",
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -180,7 +180,7 @@ tonic = {version = "0.9", features = ["tls", "tls-roots"]}
 tracing = "0.1"
 tracing-error = "0.2.0"
 tracing-opentelemetry = "0.20.0"
-tracing-subscriber = { version = "0.3", default_features = false, features = ["smallvec", "fmt", "tracing-log", "std", "env-filter", "json", "ansi"] }
+tracing-subscriber = { version = "0.3", default_features = false, features = ["smallvec", "fmt", "tracing-log", "std", "env-filter", "json"] }
 twox-hash = { version = "1.6.3", default-features = false }
 url = "2.2"
 urlencoding = "2.1"
--- a/18
+++ b/18
@@ -25,16 +25,14 @@ ifeq ($(UNAME_S),Linux)
 	# Seccomp BPF is only available for Linux
 	PG_CONFIGURE_OPTS += --with-libseccomp
 else ifeq ($(UNAME_S),Darwin)
-	ifndef DISABLE_HOMEBREW
-		# macOS with brew-installed openssl requires explicit paths
-		# It can be configured with OPENSSL_PREFIX variable
-		OPENSSL_PREFIX ?= $(shell brew --prefix openssl@3)
-		PG_CONFIGURE_OPTS += --with-includes=$(OPENSSL_PREFIX)/include --with-libraries=$(OPENSSL_PREFIX)/lib
-		PG_CONFIGURE_OPTS += PKG_CONFIG_PATH=$(shell brew --prefix icu4c)/lib/pkgconfig
-		# macOS already has bison and flex in the system, but they are old and result in postgres-v14 target failure
-		# brew formulae are keg-only and not symlinked into HOMEBREW_PREFIX, force their usage
-		EXTRA_PATH_OVERRIDES += $(shell brew --prefix bison)/bin/:$(shell brew --prefix flex)/bin/:
-	endif
+	# macOS with brew-installed openssl requires explicit paths
+	# It can be configured with OPENSSL_PREFIX variable
+	OPENSSL_PREFIX ?= $(shell brew --prefix openssl@3)
+	PG_CONFIGURE_OPTS += --with-includes=$(OPENSSL_PREFIX)/include --with-libraries=$(OPENSSL_PREFIX)/lib
+	PG_CONFIGURE_OPTS += PKG_CONFIG_PATH=$(shell brew --prefix icu4c)/lib/pkgconfig
+	# macOS already has bison and flex in the system, but they are old and result in postgres-v14 target failure
+	# brew formulae are keg-only and not symlinked into HOMEBREW_PREFIX, force their usage
+	EXTRA_PATH_OVERRIDES += $(shell brew --prefix bison)/bin/:$(shell brew --prefix flex)/bin/:
 endif

 # Use -C option so that when PostgreSQL "make install" installs the
--- a/compute_tools/src/bin/compute_ctl.rs
+++ b/compute_tools/src/bin/compute_ctl.rs
@@ -51,7 +51,6 @@ use tracing::{error, info};
 use url::Url;

 use compute_api::responses::ComputeStatus;
-use compute_api::spec::ComputeSpec;

 use compute_tools::compute::{
    forward_termination_signal, ComputeNode, ComputeState, ParsedSpec, PG_PID,
@@ -69,29 +68,6 @@ use compute_tools::spec::*;
 const BUILD_TAG_DEFAULT: &str = "latest";

 fn main() -> Result<()> {
-    let (build_tag, clap_args) = init()?;
-
-    let (pg_handle, start_pg_result) =
-    {
-        // Enter startup tracing context
-        let _startup_context_guard = startup_context_from_env();
-
-        let cli_result = process_cli(&clap_args)?;
-
-        let wait_spec_result = wait_spec(build_tag, cli_result)?;
-
-        start_postgres(&clap_args, wait_spec_result)?
-
-        // Startup is finished, exit the startup tracing context
-    };
-
-    // PostgreSQL is now running, if startup was successful. Wait until it exits.
-    let wait_pg_result = wait_postgres(pg_handle)?;
-
-    cleanup_and_exit(start_pg_result, wait_pg_result)
-}
-
-fn init() -> Result<(String, clap::ArgMatches)> {
    init_tracing_and_logging(DEFAULT_LOG_LEVEL)?;

    let mut signals = Signals::new([SIGINT, SIGTERM, SIGQUIT])?;
@@ -106,11 +82,35 @@ fn init() -> Result<(String, clap::ArgMatches)> {
        .to_string();
    info!("build_tag: {build_tag}");

-    Ok((build_tag, cli().get_matches()))
-}
+    let matches = cli().get_matches();
+    let pgbin_default = String::from("postgres");
+    let pgbin = matches.get_one::<String>("pgbin").unwrap_or(&pgbin_default);
+
+    let ext_remote_storage = matches
+        .get_one::<String>("remote-ext-config")
+        // Compatibility hack: if the control plane specified any remote-ext-config
+        // use the default value for extension storage proxy gateway.
+        // Remove this once the control plane is updated to pass the gateway URL
+        .map(|conf| {
+            if conf.starts_with("http") {
+                conf.trim_end_matches('/')
+            } else {
+                "http://pg-ext-s3-gateway"
+            }
+        });
+
+    let http_port = *matches
+        .get_one::<u16>("http-port")
+        .expect("http-port is required");
+    let pgdata = matches
+        .get_one::<String>("pgdata")
+        .expect("PGDATA path is required");
+    let connstr = matches
+        .get_one::<String>("connstr")
+        .expect("Postgres connection string is required");
+    let spec_json = matches.get_one::<String>("spec");
+    let spec_path = matches.get_one::<String>("spec-path");

-fn startup_context_from_env() -> Option<opentelemetry::ContextGuard>
-{
    // Extract OpenTelemetry context for the startup actions from the
    // TRACEPARENT and TRACESTATE env variables, and attach it to the current
    // tracing context.
@@ -147,7 +147,7 @@ fn startup_context_from_env() -> Option<opentelemetry::ContextGuard>
    if let Ok(val) = std::env::var("TRACESTATE") {
        startup_tracing_carrier.insert("tracestate".to_string(), val);
    }
-    if !startup_tracing_carrier.is_empty() {
+    let startup_context_guard = if !startup_tracing_carrier.is_empty() {
        use opentelemetry::propagation::TextMapPropagator;
        use opentelemetry::sdk::propagation::TraceContextPropagator;
        let guard = TraceContextPropagator::new()
@@ -157,42 +157,8 @@ fn startup_context_from_env() -> Option<opentelemetry::ContextGuard>
        Some(guard)
    } else {
        None
-    }
-}
+    };

-fn process_cli(
-    matches: &clap::ArgMatches,
-) -> Result<ProcessCliResult> {
-    let pgbin_default = "postgres";
-    let pgbin = matches
-        .get_one::<String>("pgbin")
-        .map(|s| s.as_str())
-        .unwrap_or(pgbin_default);
-
-    let ext_remote_storage = matches
-        .get_one::<String>("remote-ext-config")
-        // Compatibility hack: if the control plane specified any remote-ext-config
-        // use the default value for extension storage proxy gateway.
-        // Remove this once the control plane is updated to pass the gateway URL
-        .map(|conf| {
-            if conf.starts_with("http") {
-                conf.trim_end_matches('/')
-            } else {
-                "http://pg-ext-s3-gateway"
-            }
-        });
-
-    let http_port = *matches
-        .get_one::<u16>("http-port")
-        .expect("http-port is required");
-    let pgdata = matches
-        .get_one::<String>("pgdata")
-        .expect("PGDATA path is required");
-    let connstr = matches
-        .get_one::<String>("connstr")
-        .expect("Postgres connection string is required");
-    let spec_json = matches.get_one::<String>("spec");
-    let spec_path = matches.get_one::<String>("spec-path");
    let compute_id = matches.get_one::<String>("compute-id");
    let control_plane_uri = matches.get_one::<String>("control-plane-uri");

@@ -233,45 +199,6 @@ fn process_cli(
        }
    };

-    let result = ProcessCliResult {
-        // directly from CLI:
-        connstr,
-        pgdata,
-        pgbin,
-        ext_remote_storage,
-        http_port,
-        // others:
-        spec,
-        live_config_allowed,
-    };
-
-    Ok(result)
-}
-
-struct ProcessCliResult<'clap> {
-    connstr: &'clap str,
-    pgdata: &'clap str,
-    pgbin: &'clap str,
-    ext_remote_storage: Option<&'clap str>,
-    http_port: u16,
-
-    /// If a spec was provided via CLI or file, the [`ComputeSpec`]
-    spec: Option<ComputeSpec>,
-    live_config_allowed: bool,
-}
-
-fn wait_spec(
-    build_tag: String,
-    ProcessCliResult {
-        connstr,
-        pgdata,
-        pgbin,
-        ext_remote_storage,
-        http_port,
-        spec,
-        live_config_allowed,
-    }: ProcessCliResult,
-) -> Result<WaitSpecResult> {
    let mut new_state = ComputeState::new();
    let spec_set;

@@ -310,6 +237,8 @@ fn wait_spec(
    let _http_handle =
        launch_http_server(http_port, &compute).expect("cannot launch http endpoint thread");

+    let extension_server_port: u16 = http_port;
+
    if !spec_set {
        // No spec provided, hang waiting for it.
        info!("no compute spec provided, waiting");
@@ -326,19 +255,6 @@ fn wait_spec(
        }
    }

-    Ok(WaitSpecResult { compute, http_port })
-}
-
-struct WaitSpecResult {
-    compute: Arc<ComputeNode>,
-    // passed through from ProcessCliResult
-    http_port: u16,
-}
-
-fn start_postgres(
-    matches: &clap::ArgMatches,
-    WaitSpecResult { compute, http_port }: WaitSpecResult,
-) -> Result<(Option<PostgresHandle>, StartPostgresResult)> {
    // We got all we need, update the state.
    let mut state = compute.state.lock().unwrap();

@@ -365,10 +281,9 @@ fn start_postgres(
    let _monitor_handle = launch_monitor(&compute);
    let _configurator_handle = launch_configurator(&compute);

-    let extension_server_port: u16 = http_port;
-
    // Start Postgres
    let mut delay_exit = false;
+    let mut exit_code = None;
    let pg = match compute.start_compute(extension_server_port) {
        Ok(pg) => Some(pg),
        Err(err) => {
@@ -419,7 +334,7 @@ fn start_postgres(
            // This token is used internally by the monitor to clean up all threads
            let token = CancellationToken::new();

-            let vm_monitor = rt.as_ref().map(|rt| {
+            let vm_monitor = &rt.as_ref().map(|rt| {
                rt.spawn(vm_monitor::start(
                    Box::leak(Box::new(vm_monitor::Args {
                        cgroup: cgroup.cloned(),
@@ -432,43 +347,12 @@ fn start_postgres(
        }
    }

-    Ok((
-        pg,
-        StartPostgresResult {
-            delay_exit,
-            compute,
-            #[cfg(target_os = "linux")]
-            rt,
-            #[cfg(target_os = "linux")]
-            token,
-            #[cfg(target_os = "linux")]
-            vm_monitor,
-        },
-    ))
-}
-
-type PostgresHandle = (std::process::Child, std::thread::JoinHandle<()>);
-
-struct StartPostgresResult {
-    delay_exit: bool,
-    // passed through from WaitSpecResult
-    compute: Arc<ComputeNode>,
-
-    #[cfg(target_os = "linux")]
-    rt: Option<tokio::runtime::Runtime>,
-    #[cfg(target_os = "linux")]
-    token: tokio_util::sync::CancellationToken,
-    #[cfg(target_os = "linux")]
-    vm_monitor: Option<tokio::task::JoinHandle<Result<()>>>,
-}
-
-fn wait_postgres(
-    pg: Option<PostgresHandle>,
-) -> Result<WaitPostgresResult> {
    // Wait for the child Postgres process forever. In this state Ctrl+C will
    // propagate to Postgres and it will be shut down as well.
-    let mut exit_code = None;
    if let Some((mut pg, logs_handle)) = pg {
+        // Startup is finished, exit the startup tracing span
+        drop(startup_context_guard);
+
        let ecode = pg
            .wait()
            .expect("failed to start waiting on Postgres process");
@@ -483,26 +367,6 @@ fn wait_postgres(
        exit_code = ecode.code()
    }

-    Ok(WaitPostgresResult { exit_code })
-}
-
-struct WaitPostgresResult {
-    exit_code: Option<i32>,
-}
-
-fn cleanup_and_exit(
-    StartPostgresResult {
-        mut delay_exit,
-        compute,
-        #[cfg(target_os = "linux")]
-        vm_monitor,
-        #[cfg(target_os = "linux")]
-        token,
-        #[cfg(target_os = "linux")]
-        rt,
-    }: StartPostgresResult,
-    WaitPostgresResult { exit_code }: WaitPostgresResult,
-) -> Result<()> {
    // Terminate the vm_monitor so it releases the file watcher on
    // /sys/fs/cgroup/neon-postgres.
    // Note: the vm-monitor only runs on linux because it requires cgroups.
--- a/control_plane/Cargo.toml
+++ b/control_plane/Cargo.toml
@@ -17,7 +17,6 @@ nix.workspace = true
 once_cell.workspace = true
 postgres.workspace = true
 hex.workspace = true
-humantime-serde.workspace = true
 hyper.workspace = true
 regex.workspace = true
 reqwest = { workspace = true, features = ["blocking", "json"] }
--- a/control_plane/src/bin/neon_local.rs
+++ b/control_plane/src/bin/neon_local.rs
@@ -1554,8 +1554,8 @@ fn cli() -> Command {
            Command::new("storage_controller")
                .arg_required_else_help(true)
                .about("Manage storage_controller")
-                .subcommand(Command::new("start").about("Start storage controller"))
-                .subcommand(Command::new("stop").about("Stop storage controller")
+                .subcommand(Command::new("start").about("Start local pageserver").arg(pageserver_config_args.clone()))
+                .subcommand(Command::new("stop").about("Stop local pageserver")
                            .arg(stop_mode_arg.clone()))
        )
        .subcommand(
--- a/control_plane/src/local_env.rs
+++ b/control_plane/src/local_env.rs
@@ -17,7 +17,6 @@ use std::net::Ipv4Addr;
 use std::net::SocketAddr;
 use std::path::{Path, PathBuf};
 use std::process::{Command, Stdio};
-use std::time::Duration;
 use utils::{
    auth::{encode_from_key_file, Claims},
    id::{NodeId, TenantId, TenantTimelineId, TimelineId},
@@ -67,10 +66,6 @@ pub struct LocalEnv {

    pub broker: NeonBroker,

-    // Configuration for the storage controller (1 per neon_local environment)
-    #[serde(default)]
-    pub storage_controller: NeonStorageControllerConf,
-
    /// This Vec must always contain at least one pageserver
    pub pageservers: Vec<PageServerConf>,

@@ -103,29 +98,6 @@ pub struct NeonBroker {
    pub listen_addr: SocketAddr,
 }

-/// Broker config for cluster internal communication.
-#[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)]
-#[serde(default)]
-pub struct NeonStorageControllerConf {
-    /// Heartbeat timeout before marking a node offline
-    #[serde(with = "humantime_serde")]
-    pub max_unavailable: Duration,
-}
-
-impl NeonStorageControllerConf {
-    // Use a shorter pageserver unavailability interval than the default to speed up tests.
-    const DEFAULT_MAX_UNAVAILABLE_INTERVAL: std::time::Duration =
-        std::time::Duration::from_secs(10);
-}
-
-impl Default for NeonStorageControllerConf {
-    fn default() -> Self {
-        Self {
-            max_unavailable: Self::DEFAULT_MAX_UNAVAILABLE_INTERVAL,
-        }
-    }
-}
-
 // Dummy Default impl to satisfy Deserialize derive.
 impl Default for NeonBroker {
    fn default() -> Self {
--- a/control_plane/src/storage_controller.rs
+++ b/control_plane/src/storage_controller.rs
@@ -1,7 +1,4 @@
-use crate::{
-    background_process,
-    local_env::{LocalEnv, NeonStorageControllerConf},
-};
+use crate::{background_process, local_env::LocalEnv};
 use camino::{Utf8Path, Utf8PathBuf};
 use hyper::Method;
 use pageserver_api::{
@@ -35,13 +32,15 @@ pub struct StorageController {
    public_key: Option<String>,
    postgres_port: u16,
    client: reqwest::Client,
-    config: NeonStorageControllerConf,
 }

 const COMMAND: &str = "storage_controller";

 const STORAGE_CONTROLLER_POSTGRES_VERSION: u32 = 16;

+// Use a shorter pageserver unavailability interval than the default to speed up tests.
+const NEON_LOCAL_MAX_UNAVAILABLE_INTERVAL: std::time::Duration = std::time::Duration::from_secs(10);
+
 #[derive(Serialize, Deserialize)]
 pub struct AttachHookRequest {
    pub tenant_shard_id: TenantShardId,
@@ -136,7 +135,6 @@ impl StorageController {
            client: reqwest::ClientBuilder::new()
                .build()
                .expect("Failed to construct http client"),
-            config: env.storage_controller.clone(),
        }
    }

@@ -274,6 +272,8 @@ impl StorageController {
        // Run migrations on every startup, in case something changed.
        let database_url = self.setup_database().await?;

+        let max_unavailable: humantime::Duration = NEON_LOCAL_MAX_UNAVAILABLE_INTERVAL.into();
+
        let mut args = vec![
            "-l",
            &self.listen,
@@ -283,7 +283,7 @@ impl StorageController {
            "--database-url",
            &database_url,
            "--max-unavailable-interval",
-            &humantime::Duration::from(self.config.max_unavailable).to_string(),
+            &max_unavailable.to_string(),
        ]
        .into_iter()
        .map(|s| s.to_string())
--- a/docs/storage_controller.md
+++ b/docs/storage_controller.md
@@ -30,7 +30,7 @@ The storage controller uses a postgres database to persist a subset of its state
 persist the _relationships_ between them: the attachment state of a tenant's shards to nodes is kept in memory and
 rebuilt on startup.

-The file `persistence.rs` contains all the code for accessing the database, and has a large doc comment that goes into more detail about exactly what we persist and why.
+The file `persistence.rs` contains all the code for accessing the database, and has a large doc comment that goes into more detail about exactly what we persist and why.

 The `diesel` crate is used for defining models & migrations.

--- a/libs/pageserver_api/src/keyspace.rs
+++ b/libs/pageserver_api/src/keyspace.rs
@@ -17,10 +17,6 @@ pub struct KeySpace {
    pub ranges: Vec<Range<Key>>,
 }

-/// A wrapper type for sparse keyspaces.
-#[derive(Clone, Debug, Default, PartialEq, Eq)]
-pub struct SparseKeySpace(pub KeySpace);
-
 /// Represents a contiguous half-open range of the keyspace, masked according to a particular
 /// ShardNumber's stripes: within this range of keys, only some "belong" to the current
 /// shard.
@@ -439,33 +435,10 @@ pub struct KeyPartitioning {
    pub parts: Vec<KeySpace>,
 }

-/// Represents a partitioning of the sparse key space.
-#[derive(Clone, Debug, Default)]
-pub struct SparseKeyPartitioning {
-    pub parts: Vec<SparseKeySpace>,
-}
-
 impl KeyPartitioning {
    pub fn new() -> Self {
        KeyPartitioning { parts: Vec::new() }
    }
-
-    /// Convert a key partitioning to a sparse partition.
-    pub fn into_sparse(self) -> SparseKeyPartitioning {
-        SparseKeyPartitioning {
-            parts: self.parts.into_iter().map(SparseKeySpace).collect(),
-        }
-    }
-}
-
-impl SparseKeyPartitioning {
-    /// Note: use this function with caution. Attempt to handle a sparse keyspace in the same way as a dense keyspace will
-    /// cause long/dead loops.
-    pub fn into_dense(self) -> KeyPartitioning {
-        KeyPartitioning {
-            parts: self.parts.into_iter().map(|x| x.0).collect(),
-        }
-    }
 }

 ///
--- a/libs/pageserver_api/src/models/partitioning.rs
+++ b/libs/pageserver_api/src/models/partitioning.rs
@@ -1,11 +1,9 @@
 use utils::lsn::Lsn;

-use crate::keyspace::SparseKeySpace;
-
 #[derive(Debug, PartialEq, Eq)]
 pub struct Partitioning {
    pub keys: crate::keyspace::KeySpace,
-    pub sparse_keys: crate::keyspace::SparseKeySpace,
+
    pub at_lsn: Lsn,
 }

@@ -34,8 +32,6 @@ impl serde::Serialize for Partitioning {
        let mut map = serializer.serialize_map(Some(2))?;
        map.serialize_key("keys")?;
        map.serialize_value(&KeySpace(&self.keys))?;
-        map.serialize_key("sparse_keys")?;
-        map.serialize_value(&KeySpace(&self.sparse_keys.0))?;
        map.serialize_key("at_lsn")?;
        map.serialize_value(&WithDisplay(&self.at_lsn))?;
        map.end()
@@ -103,7 +99,6 @@ impl<'a> serde::Deserialize<'a> for Partitioning {
        #[derive(serde::Deserialize)]
        struct De {
            keys: KeySpace,
-            sparse_keys: KeySpace,
            #[serde_as(as = "serde_with::DisplayFromStr")]
            at_lsn: Lsn,
        }
@@ -112,7 +107,6 @@ impl<'a> serde::Deserialize<'a> for Partitioning {
        Ok(Self {
            at_lsn: de.at_lsn,
            keys: de.keys.0,
-            sparse_keys: SparseKeySpace(de.sparse_keys.0),
        })
    }
 }
@@ -139,12 +133,6 @@ mod tests {
                "030000000000000000000000000000000003"
              ]
            ],
-            "sparse_keys": [
-              [
-                "620000000000000000000000000000000000",
-                "620000000000000000000000000000000003"
-              ]
-            ],
            "at_lsn": "0/2240160"
        }
        "#;
--- a/libs/utils/src/seqwait.rs
+++ b/libs/utils/src/seqwait.rs
@@ -2,10 +2,11 @@

 use std::cmp::{Eq, Ordering};
 use std::collections::BinaryHeap;
+use std::fmt::Debug;
 use std::mem;
 use std::sync::Mutex;
 use std::time::Duration;
-use tokio::sync::watch::{self, channel};
+use tokio::sync::watch::{channel, Receiver, Sender};
 use tokio::time::timeout;

 /// An error happened while waiting for a number
@@ -34,73 +35,23 @@ pub trait MonotonicCounter<V> {
    fn cnt_value(&self) -> V;
 }

-/// Heap of waiters, lowest numbers pop first.
-struct Waiters<V>
+/// Internal components of a `SeqWait`
+struct SeqWaitInt<S, V>
 where
+    S: MonotonicCounter<V>,
    V: Ord,
 {
-    heap: BinaryHeap<Waiter<V>>,
-    /// Number of the first waiter in the heap, or None if there are no waiters.
-    status_channel: watch::Sender<Option<V>>,
-}
-
-impl<V> Waiters<V>
-where
-    V: Ord + Copy,
-{
-    fn new() -> Self {
-        Waiters {
-            heap: BinaryHeap::new(),
-            status_channel: channel(None).0,
-        }
-    }
-
-    /// `status_channel` contains the number of the first waiter in the heap.
-    /// This function should be called whenever waiters heap changes.
-    fn update_status(&self) {
-        let first_waiter = self.heap.peek().map(|w| w.wake_num);
-        let _ = self.status_channel.send_replace(first_waiter);
-    }
-
-    /// Add new waiter to the heap, return a channel that will be notified when the number arrives.
-    fn add(&mut self, num: V) -> watch::Receiver<()> {
-        let (tx, rx) = channel(());
-        self.heap.push(Waiter {
-            wake_num: num,
-            wake_channel: tx,
-        });
-        self.update_status();
-        rx
-    }
-
-    /// Pop all waiters <= num from the heap. Collect channels in a vector,
-    /// so that caller can wake them up.
-    fn pop_leq(&mut self, num: V) -> Vec<watch::Sender<()>> {
-        let mut wake_these = Vec::new();
-        while let Some(n) = self.heap.peek() {
-            if n.wake_num > num {
-                break;
-            }
-            wake_these.push(self.heap.pop().unwrap().wake_channel);
-        }
-        self.update_status();
-        wake_these
-    }
-
-    /// Used on shutdown to efficiently drop all waiters.
-    fn take_all(&mut self) -> BinaryHeap<Waiter<V>> {
-        let heap = mem::take(&mut self.heap);
-        self.update_status();
-        heap
-    }
+    waiters: BinaryHeap<Waiter<V>>,
+    current: S,
+    shutdown: bool,
 }

 struct Waiter<T>
 where
    T: Ord,
 {
-    wake_num: T,                     // wake me when this number arrives ...
-    wake_channel: watch::Sender<()>, // ... by sending a message to this channel
+    wake_num: T,              // wake me when this number arrives ...
+    wake_channel: Sender<()>, // ... by sending a message to this channel
 }

 // BinaryHeap is a max-heap, and we want a min-heap. Reverse the ordering here
@@ -125,17 +76,6 @@ impl<T: Ord> PartialEq for Waiter<T> {

 impl<T: Ord> Eq for Waiter<T> {}

-/// Internal components of a `SeqWait`
-struct SeqWaitInt<S, V>
-where
-    S: MonotonicCounter<V>,
-    V: Ord,
-{
-    waiters: Waiters<V>,
-    current: S,
-    shutdown: bool,
-}
-
 /// A tool for waiting on a sequence number
 ///
 /// This provides a way to wait the arrival of a number.
@@ -168,7 +108,7 @@ where
    /// Create a new `SeqWait`, initialized to a particular number
    pub fn new(starting_num: S) -> Self {
        let internal = SeqWaitInt {
-            waiters: Waiters::new(),
+            waiters: BinaryHeap::new(),
            current: starting_num,
            shutdown: false,
        };
@@ -188,8 +128,9 @@ where
            // Block any future waiters from starting
            internal.shutdown = true;

-            // Take all waiters to drop them later.
-            internal.waiters.take_all()
+            // This will steal the entire waiters heap.
+            // When we drop it, all waiters will be woken.
+            mem::take(&mut internal.waiters)

            // Drop the lock as we exit this scope.
        };
@@ -255,7 +196,7 @@ where

    /// Register and return a channel that will be notified when a number arrives,
    /// or None, if it has already arrived.
-    fn queue_for_wait(&self, num: V) -> Result<Option<watch::Receiver<()>>, SeqWaitError> {
+    fn queue_for_wait(&self, num: V) -> Result<Option<Receiver<()>>, SeqWaitError> {
        let mut internal = self.internal.lock().unwrap();
        if internal.current.cnt_value() >= num {
            return Ok(None);
@@ -264,8 +205,12 @@ where
            return Err(SeqWaitError::Shutdown);
        }

-        // Add waiter channel to the queue.
-        let rx = internal.waiters.add(num);
+        // Create a new channel.
+        let (tx, rx) = channel(());
+        internal.waiters.push(Waiter {
+            wake_num: num,
+            wake_channel: tx,
+        });
        // Drop the lock as we exit this scope.
        Ok(Some(rx))
    }
@@ -286,8 +231,16 @@ where
            }
            internal.current.cnt_advance(num);

-            // Pop all waiters <= num from the heap.
-            internal.waiters.pop_leq(num)
+            // Pop all waiters <= num from the heap. Collect them in a vector, and
+            // wake them up after releasing the lock.
+            let mut wake_these = Vec::new();
+            while let Some(n) = internal.waiters.peek() {
+                if n.wake_num > num {
+                    break;
+                }
+                wake_these.push(internal.waiters.pop().unwrap().wake_channel);
+            }
+            wake_these
        };

        for tx in wake_these {
@@ -302,23 +255,6 @@ where
    pub fn load(&self) -> S {
        self.internal.lock().unwrap().current
    }
-
-    /// Get a Receiver for the current status.
-    ///
-    /// The current status is the number of the first waiter in the queue,
-    /// or None if there are no waiters.
-    ///
-    /// This receiver will be notified whenever the status changes.
-    /// It is useful for receiving notifications when the first waiter
-    /// starts waiting for a number, or when there are no more waiters left.
-    pub fn status_receiver(&self) -> watch::Receiver<Option<V>> {
-        self.internal
-            .lock()
-            .unwrap()
-            .waiters
-            .status_channel
-            .subscribe()
-    }
 }

 #[cfg(test)]
--- a/pageserver/src/basebackup.rs
+++ b/pageserver/src/basebackup.rs
@@ -10,7 +10,7 @@
 //! This module is responsible for creation of such tarball
 //! from data stored in object storage.
 //!
-use anyhow::{anyhow, Context};
+use anyhow::{anyhow, bail, ensure, Context};
 use bytes::{BufMut, Bytes, BytesMut};
 use fail::fail_point;
 use pageserver_api::key::{key_to_slru_block, Key};
@@ -38,14 +38,6 @@ use postgres_ffi::PG_TLI;
 use postgres_ffi::{BLCKSZ, RELSEG_SIZE, WAL_SEGMENT_SIZE};
 use utils::lsn::Lsn;

-#[derive(Debug, thiserror::Error)]
-pub enum BasebackupError {
-    #[error("basebackup pageserver error {0:#}")]
-    Server(#[from] anyhow::Error),
-    #[error("basebackup client error {0:#}")]
-    Client(#[source] io::Error),
-}
-
 /// Create basebackup with non-rel data in it.
 /// Only include relational data if 'full_backup' is true.
 ///
@@ -61,7 +53,7 @@ pub async fn send_basebackup_tarball<'a, W>(
    prev_lsn: Option<Lsn>,
    full_backup: bool,
    ctx: &'a RequestContext,
-) -> Result<(), BasebackupError>
+) -> anyhow::Result<()>
 where
    W: AsyncWrite + Send + Sync + Unpin,
 {
@@ -100,10 +92,8 @@ where

    // Consolidate the derived and the provided prev_lsn values
    let prev_lsn = if let Some(provided_prev_lsn) = prev_lsn {
-        if backup_prev != Lsn(0) && backup_prev != provided_prev_lsn {
-            return Err(BasebackupError::Server(anyhow!(
-                "backup_prev {backup_prev} != provided_prev_lsn {provided_prev_lsn}"
-            )));
+        if backup_prev != Lsn(0) {
+            ensure!(backup_prev == provided_prev_lsn);
        }
        provided_prev_lsn
    } else {
@@ -169,26 +159,15 @@ where
        }
    }

-    async fn add_block(&mut self, key: &Key, block: Bytes) -> Result<(), BasebackupError> {
+    async fn add_block(&mut self, key: &Key, block: Bytes) -> anyhow::Result<()> {
        let (kind, segno, _) = key_to_slru_block(*key)?;

        match kind {
            SlruKind::Clog => {
-                if !(block.len() == BLCKSZ as usize || block.len() == BLCKSZ as usize + 8) {
-                    return Err(BasebackupError::Server(anyhow!(
-                        "invalid SlruKind::Clog record: block.len()={}",
-                        block.len()
-                    )));
-                }
+                ensure!(block.len() == BLCKSZ as usize || block.len() == BLCKSZ as usize + 8);
            }
            SlruKind::MultiXactMembers | SlruKind::MultiXactOffsets => {
-                if block.len() != BLCKSZ as usize {
-                    return Err(BasebackupError::Server(anyhow!(
-                        "invalid {:?} record: block.len()={}",
-                        kind,
-                        block.len()
-                    )));
-                }
+                ensure!(block.len() == BLCKSZ as usize);
            }
        }

@@ -215,15 +194,12 @@ where
        Ok(())
    }

-    async fn flush(&mut self) -> Result<(), BasebackupError> {
+    async fn flush(&mut self) -> anyhow::Result<()> {
        let nblocks = self.buf.len() / BLCKSZ as usize;
        let (kind, segno) = self.current_segment.take().unwrap();
        let segname = format!("{}/{:>04X}", kind.to_str(), segno);
        let header = new_tar_header(&segname, self.buf.len() as u64)?;
-        self.ar
-            .append(&header, self.buf.as_slice())
-            .await
-            .map_err(BasebackupError::Client)?;
+        self.ar.append(&header, self.buf.as_slice()).await?;

        self.total_blocks += nblocks;
        debug!("Added to basebackup slru {} relsize {}", segname, nblocks);
@@ -233,7 +209,7 @@ where
        Ok(())
    }

-    async fn finish(mut self) -> Result<(), BasebackupError> {
+    async fn finish(mut self) -> anyhow::Result<()> {
        let res = if self.current_segment.is_none() || self.buf.is_empty() {
            Ok(())
        } else {
@@ -250,7 +226,7 @@ impl<'a, W> Basebackup<'a, W>
 where
    W: AsyncWrite + Send + Sync + Unpin,
 {
-    async fn send_tarball(mut self) -> Result<(), BasebackupError> {
+    async fn send_tarball(mut self) -> anyhow::Result<()> {
        // TODO include checksum

        let lazy_slru_download = self.timeline.get_lazy_slru_download() && !self.full_backup;
@@ -286,8 +262,7 @@ where
            let slru_partitions = self
                .timeline
                .get_slru_keyspace(Version::Lsn(self.lsn), self.ctx)
-                .await
-                .map_err(|e| BasebackupError::Server(e.into()))?
+                .await?
                .partition(
                    self.timeline.get_shard_identity(),
                    Timeline::MAX_GET_VECTORED_KEYS * BLCKSZ as u64,
@@ -296,15 +271,10 @@ where
            let mut slru_builder = SlruSegmentsBuilder::new(&mut self.ar);

            for part in slru_partitions.parts {
-                let blocks = self
-                    .timeline
-                    .get_vectored(part, self.lsn, self.ctx)
-                    .await
-                    .map_err(|e| BasebackupError::Server(e.into()))?;
+                let blocks = self.timeline.get_vectored(part, self.lsn, self.ctx).await?;

                for (key, block) in blocks {
-                    let block = block.map_err(|e| BasebackupError::Server(e.into()))?;
-                    slru_builder.add_block(&key, block).await?;
+                    slru_builder.add_block(&key, block?).await?;
                }
            }
            slru_builder.finish().await?;
@@ -312,11 +282,8 @@ where

        let mut min_restart_lsn: Lsn = Lsn::MAX;
        // Create tablespace directories
-        for ((spcnode, dbnode), has_relmap_file) in self
-            .timeline
-            .list_dbdirs(self.lsn, self.ctx)
-            .await
-            .map_err(|e| BasebackupError::Server(e.into()))?
+        for ((spcnode, dbnode), has_relmap_file) in
+            self.timeline.list_dbdirs(self.lsn, self.ctx).await?
        {
            self.add_dbdir(spcnode, dbnode, has_relmap_file).await?;

@@ -325,8 +292,7 @@ where
            let rels = self
                .timeline
                .list_rels(spcnode, dbnode, Version::Lsn(self.lsn), self.ctx)
-                .await
-                .map_err(|e| BasebackupError::Server(e.into()))?;
+                .await?;
            for &rel in rels.iter() {
                // Send init fork as main fork to provide well formed empty
                // contents of UNLOGGED relations. Postgres copies it in
@@ -349,12 +315,7 @@ where
                }
            }

-            for (path, content) in self
-                .timeline
-                .list_aux_files(self.lsn, self.ctx)
-                .await
-                .map_err(|e| BasebackupError::Server(e.into()))?
-            {
+            for (path, content) in self.timeline.list_aux_files(self.lsn, self.ctx).await? {
                if path.starts_with("pg_replslot") {
                    let offs = pg_constants::REPL_SLOT_ON_DISK_OFFSETOF_RESTART_LSN;
                    let restart_lsn = Lsn(u64::from_le_bytes(
@@ -385,41 +346,34 @@ where
        for xid in self
            .timeline
            .list_twophase_files(self.lsn, self.ctx)
-            .await
-            .map_err(|e| BasebackupError::Server(e.into()))?
+            .await?
        {
            self.add_twophase_file(xid).await?;
        }

        fail_point!("basebackup-before-control-file", |_| {
-            Err(BasebackupError::Server(anyhow!(
-                "failpoint basebackup-before-control-file"
-            )))
+            bail!("failpoint basebackup-before-control-file")
        });

        // Generate pg_control and bootstrap WAL segment.
        self.add_pgcontrol_file().await?;
-        self.ar.finish().await.map_err(BasebackupError::Client)?;
+        self.ar.finish().await?;
        debug!("all tarred up!");
        Ok(())
    }

    /// Add contents of relfilenode `src`, naming it as `dst`.
-    async fn add_rel(&mut self, src: RelTag, dst: RelTag) -> Result<(), BasebackupError> {
+    async fn add_rel(&mut self, src: RelTag, dst: RelTag) -> anyhow::Result<()> {
        let nblocks = self
            .timeline
            .get_rel_size(src, Version::Lsn(self.lsn), self.ctx)
-            .await
-            .map_err(|e| BasebackupError::Server(e.into()))?;
+            .await?;

        // If the relation is empty, create an empty file
        if nblocks == 0 {
            let file_name = dst.to_segfile_name(0);
            let header = new_tar_header(&file_name, 0)?;
-            self.ar
-                .append(&header, &mut io::empty())
-                .await
-                .map_err(BasebackupError::Client)?;
+            self.ar.append(&header, &mut io::empty()).await?;
            return Ok(());
        }

@@ -434,17 +388,13 @@ where
                let img = self
                    .timeline
                    .get_rel_page_at_lsn(src, blknum, Version::Lsn(self.lsn), self.ctx)
-                    .await
-                    .map_err(|e| BasebackupError::Server(e.into()))?;
+                    .await?;
                segment_data.extend_from_slice(&img[..]);
            }

            let file_name = dst.to_segfile_name(seg as u32);
            let header = new_tar_header(&file_name, segment_data.len() as u64)?;
-            self.ar
-                .append(&header, segment_data.as_slice())
-                .await
-                .map_err(BasebackupError::Client)?;
+            self.ar.append(&header, segment_data.as_slice()).await?;

            seg += 1;
            startblk = endblk;
@@ -464,22 +414,20 @@ where
        spcnode: u32,
        dbnode: u32,
        has_relmap_file: bool,
-    ) -> Result<(), BasebackupError> {
+    ) -> anyhow::Result<()> {
        let relmap_img = if has_relmap_file {
            let img = self
                .timeline
                .get_relmap_file(spcnode, dbnode, Version::Lsn(self.lsn), self.ctx)
-                .await
-                .map_err(|e| BasebackupError::Server(e.into()))?;
+                .await?;

-            if img.len()
-                != dispatch_pgversion!(self.timeline.pg_version, pgv::bindings::SIZEOF_RELMAPFILE)
-            {
-                return Err(BasebackupError::Server(anyhow!(
-                    "img.len() != SIZE_OF_RELMAPFILE, img.len()={}",
-                    img.len(),
-                )));
-            }
+            ensure!(
+                img.len()
+                    == dispatch_pgversion!(
+                        self.timeline.pg_version,
+                        pgv::bindings::SIZEOF_RELMAPFILE
+                    )
+            );

            Some(img)
        } else {
@@ -492,20 +440,14 @@ where
                ver => format!("{ver}\x0A"),
            };
            let header = new_tar_header("PG_VERSION", pg_version_str.len() as u64)?;
-            self.ar
-                .append(&header, pg_version_str.as_bytes())
-                .await
-                .map_err(BasebackupError::Client)?;
+            self.ar.append(&header, pg_version_str.as_bytes()).await?;

            info!("timeline.pg_version {}", self.timeline.pg_version);

            if let Some(img) = relmap_img {
                // filenode map for global tablespace
                let header = new_tar_header("global/pg_filenode.map", img.len() as u64)?;
-                self.ar
-                    .append(&header, &img[..])
-                    .await
-                    .map_err(BasebackupError::Client)?;
+                self.ar.append(&header, &img[..]).await?;
            } else {
                warn!("global/pg_filenode.map is missing");
            }
@@ -524,26 +466,18 @@ where
                && self
                    .timeline
                    .list_rels(spcnode, dbnode, Version::Lsn(self.lsn), self.ctx)
-                    .await
-                    .map_err(|e| BasebackupError::Server(e.into()))?
+                    .await?
                    .is_empty()
            {
                return Ok(());
            }
            // User defined tablespaces are not supported
-            if spcnode != DEFAULTTABLESPACE_OID {
-                return Err(BasebackupError::Server(anyhow!(
-                    "spcnode != DEFAULTTABLESPACE_OID, spcnode={spcnode}"
-                )));
-            }
+            ensure!(spcnode == DEFAULTTABLESPACE_OID);

            // Append dir path for each database
            let path = format!("base/{}", dbnode);
            let header = new_tar_header_dir(&path)?;
-            self.ar
-                .append(&header, &mut io::empty())
-                .await
-                .map_err(BasebackupError::Client)?;
+            self.ar.append(&header, &mut io::empty()).await?;

            if let Some(img) = relmap_img {
                let dst_path = format!("base/{}/PG_VERSION", dbnode);
@@ -553,17 +487,11 @@ where
                    ver => format!("{ver}\x0A"),
                };
                let header = new_tar_header(&dst_path, pg_version_str.len() as u64)?;
-                self.ar
-                    .append(&header, pg_version_str.as_bytes())
-                    .await
-                    .map_err(BasebackupError::Client)?;
+                self.ar.append(&header, pg_version_str.as_bytes()).await?;

                let relmap_path = format!("base/{}/pg_filenode.map", dbnode);
                let header = new_tar_header(&relmap_path, img.len() as u64)?;
-                self.ar
-                    .append(&header, &img[..])
-                    .await
-                    .map_err(BasebackupError::Client)?;
+                self.ar.append(&header, &img[..]).await?;
            }
        };
        Ok(())
@@ -572,12 +500,11 @@ where
    //
    // Extract twophase state files
    //
-    async fn add_twophase_file(&mut self, xid: TransactionId) -> Result<(), BasebackupError> {
+    async fn add_twophase_file(&mut self, xid: TransactionId) -> anyhow::Result<()> {
        let img = self
            .timeline
            .get_twophase_file(xid, self.lsn, self.ctx)
-            .await
-            .map_err(|e| BasebackupError::Server(e.into()))?;
+            .await?;

        let mut buf = BytesMut::new();
        buf.extend_from_slice(&img[..]);
@@ -585,10 +512,7 @@ where
        buf.put_u32_le(crc);
        let path = format!("pg_twophase/{:>08X}", xid);
        let header = new_tar_header(&path, buf.len() as u64)?;
-        self.ar
-            .append(&header, &buf[..])
-            .await
-            .map_err(BasebackupError::Client)?;
+        self.ar.append(&header, &buf[..]).await?;

        Ok(())
    }
@@ -597,28 +521,24 @@ where
    // Add generated pg_control file and bootstrap WAL segment.
    // Also send zenith.signal file with extra bootstrap data.
    //
-    async fn add_pgcontrol_file(&mut self) -> Result<(), BasebackupError> {
+    async fn add_pgcontrol_file(&mut self) -> anyhow::Result<()> {
        // add zenith.signal file
        let mut zenith_signal = String::new();
        if self.prev_record_lsn == Lsn(0) {
            if self.lsn == self.timeline.get_ancestor_lsn() {
-                write!(zenith_signal, "PREV LSN: none")
-                    .map_err(|e| BasebackupError::Server(e.into()))?;
+                write!(zenith_signal, "PREV LSN: none")?;
            } else {
-                write!(zenith_signal, "PREV LSN: invalid")
-                    .map_err(|e| BasebackupError::Server(e.into()))?;
+                write!(zenith_signal, "PREV LSN: invalid")?;
            }
        } else {
-            write!(zenith_signal, "PREV LSN: {}", self.prev_record_lsn)
-                .map_err(|e| BasebackupError::Server(e.into()))?;
+            write!(zenith_signal, "PREV LSN: {}", self.prev_record_lsn)?;
        }
        self.ar
            .append(
                &new_tar_header("zenith.signal", zenith_signal.len() as u64)?,
                zenith_signal.as_bytes(),
            )
-            .await
-            .map_err(BasebackupError::Client)?;
+            .await?;

        let checkpoint_bytes = self
            .timeline
@@ -640,10 +560,7 @@ where

        //send pg_control
        let header = new_tar_header("global/pg_control", pg_control_bytes.len() as u64)?;
-        self.ar
-            .append(&header, &pg_control_bytes[..])
-            .await
-            .map_err(BasebackupError::Client)?;
+        self.ar.append(&header, &pg_control_bytes[..]).await?;

        //send wal segment
        let segno = self.lsn.segment_number(WAL_SEGMENT_SIZE);
@@ -658,16 +575,8 @@ where
            self.lsn,
        )
        .map_err(|e| anyhow!(e).context("Failed generating wal segment"))?;
-        if wal_seg.len() != WAL_SEGMENT_SIZE {
-            return Err(BasebackupError::Server(anyhow!(
-                "wal_seg.len() != WAL_SEGMENT_SIZE, wal_seg.len()={}",
-                wal_seg.len()
-            )));
-        }
-        self.ar
-            .append(&header, &wal_seg[..])
-            .await
-            .map_err(BasebackupError::Client)?;
+        ensure!(wal_seg.len() == WAL_SEGMENT_SIZE);
+        self.ar.append(&header, &wal_seg[..]).await?;
        Ok(())
    }
 }
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -1918,14 +1918,12 @@ async fn timeline_collect_keyspace(
        let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
        let timeline = active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id).await?;
        let at_lsn = at_lsn.unwrap_or_else(|| timeline.get_last_record_lsn());
-        let (dense_ks, sparse_ks) = timeline
+        let keys = timeline
            .collect_keyspace(at_lsn, &ctx)
            .await
            .map_err(|e| ApiError::InternalServerError(e.into()))?;

-        // This API is currently used by pagebench. Pagebench will iterate all keys within the keyspace.
-        // Therefore, we split dense/sparse keys in this API.
-        let res = pageserver_api::models::partitioning::Partitioning { keys: dense_ks, sparse_keys: sparse_ks, at_lsn };
+        let res = pageserver_api::models::partitioning::Partitioning { keys, at_lsn };

        json_response(StatusCode::OK, res)
    }
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
@@ -48,7 +48,6 @@ use utils::{

 use crate::auth::check_permission;
 use crate::basebackup;
-use crate::basebackup::BasebackupError;
 use crate::config::PageServerConf;
 use crate::context::{DownloadBehavior, RequestContext};
 use crate::import_datadir::import_wal_from_tar;
@@ -1237,13 +1236,6 @@ impl PageServerHandler {
    where
        IO: AsyncRead + AsyncWrite + Send + Sync + Unpin,
    {
-        fn map_basebackup_error(err: BasebackupError) -> QueryError {
-            match err {
-                BasebackupError::Client(e) => QueryError::Disconnected(ConnectionError::Io(e)),
-                BasebackupError::Server(e) => QueryError::Other(e),
-            }
-        }
-
        let started = std::time::Instant::now();

        // check that the timeline exists
@@ -1269,8 +1261,7 @@ impl PageServerHandler {
        let lsn_awaited_after = started.elapsed();

        // switch client to COPYOUT
-        pgb.write_message_noflush(&BeMessage::CopyOutResponse)
-            .map_err(QueryError::Disconnected)?;
+        pgb.write_message_noflush(&BeMessage::CopyOutResponse)?;
        self.flush_cancellable(pgb, &timeline.cancel).await?;

        // Send a tarball of the latest layer on the timeline. Compress if not
@@ -1285,8 +1276,7 @@ impl PageServerHandler {
                full_backup,
                ctx,
            )
-            .await
-            .map_err(map_basebackup_error)?;
+            .await?;
        } else {
            let mut writer = pgb.copyout_writer();
            if gzip {
@@ -1307,13 +1297,9 @@ impl PageServerHandler {
                    full_backup,
                    ctx,
                )
-                .await
-                .map_err(map_basebackup_error)?;
+                .await?;
                // shutdown the encoder to ensure the gzip footer is written
-                encoder
-                    .shutdown()
-                    .await
-                    .map_err(|e| QueryError::Disconnected(ConnectionError::Io(e)))?;
+                encoder.shutdown().await?;
            } else {
                basebackup::send_basebackup_tarball(
                    &mut writer,
@@ -1323,13 +1309,11 @@ impl PageServerHandler {
                    full_backup,
                    ctx,
                )
-                .await
-                .map_err(map_basebackup_error)?;
+                .await?;
            }
        }

-        pgb.write_message_noflush(&BeMessage::CopyDone)
-            .map_err(QueryError::Disconnected)?;
+        pgb.write_message_noflush(&BeMessage::CopyDone)?;
        self.flush_cancellable(pgb, &timeline.cancel).await?;

        let basebackup_after = started
--- a/pageserver/src/pgdatadir_mapping.rs
+++ b/pageserver/src/pgdatadir_mapping.rs
@@ -23,7 +23,6 @@ use pageserver_api::key::{
    slru_segment_key_range, slru_segment_size_to_key, twophase_file_key, twophase_key_range,
    AUX_FILES_KEY, CHECKPOINT_KEY, CONTROLFILE_KEY, DBDIR_KEY, TWOPHASEDIR_KEY,
 };
-use pageserver_api::keyspace::SparseKeySpace;
 use pageserver_api::reltag::{BlockNumber, RelTag, SlruKind};
 use postgres_ffi::relfile_utils::{FSM_FORKNUM, VISIBILITYMAP_FORKNUM};
 use postgres_ffi::BLCKSZ;
@@ -731,13 +730,11 @@ impl Timeline {
    /// Get a KeySpace that covers all the Keys that are in use at the given LSN.
    /// Anything that's not listed maybe removed from the underlying storage (from
    /// that LSN forwards).
-    ///
-    /// The return value is (dense keyspace, sparse keyspace).
    pub(crate) async fn collect_keyspace(
        &self,
        lsn: Lsn,
        ctx: &RequestContext,
-    ) -> Result<(KeySpace, SparseKeySpace), CollectKeySpaceError> {
+    ) -> Result<KeySpace, CollectKeySpaceError> {
        // Iterate through key ranges, greedily packing them into partitions
        let mut result = KeySpaceAccum::new();

@@ -809,12 +806,7 @@ impl Timeline {
        if self.get(AUX_FILES_KEY, lsn, ctx).await.is_ok() {
            result.add_key(AUX_FILES_KEY);
        }
-
-        Ok((
-            result.to_keyspace(),
-            /* AUX sparse key space */
-            SparseKeySpace(KeySpace::single(Key::metadata_aux_key_range())),
-        ))
+        Ok(result.to_keyspace())
    }

    /// Get cached size of relation if it not updated after specified LSN
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -3873,7 +3873,6 @@ mod tests {
    use hex_literal::hex;
    use pageserver_api::key::NON_INHERITED_RANGE;
    use pageserver_api::keyspace::KeySpace;
-    use pageserver_api::models::CompactionAlgorithm;
    use rand::{thread_rng, Rng};
    use tests::storage_layer::ValuesReconstructState;
    use tests::timeline::{GetVectoredError, ShutdownMode};
@@ -4513,23 +4512,11 @@ mod tests {
    }

    async fn bulk_insert_compact_gc(
-        timeline: Arc<Timeline>,
-        ctx: &RequestContext,
-        lsn: Lsn,
-        repeat: usize,
-        key_count: usize,
-    ) -> anyhow::Result<()> {
-        let compact = true;
-        bulk_insert_maybe_compact_gc(timeline, ctx, lsn, repeat, key_count, compact).await
-    }
-
-    async fn bulk_insert_maybe_compact_gc(
        timeline: Arc<Timeline>,
        ctx: &RequestContext,
        mut lsn: Lsn,
        repeat: usize,
        key_count: usize,
-        compact: bool,
    ) -> anyhow::Result<()> {
        let mut test_key = Key::from_hex("010000000033333333444444445500000000").unwrap();
        let mut blknum = 0;
@@ -4570,11 +4557,9 @@ mod tests {
                )
                .await?;
            timeline.freeze_and_flush().await?;
-            if compact {
-                timeline
-                    .compact(&CancellationToken::new(), EnumSet::empty(), ctx)
-                    .await?;
-            }
+            timeline
+                .compact(&CancellationToken::new(), EnumSet::empty(), ctx)
+                .await?;
            timeline.gc().await?;
        }

@@ -5057,22 +5042,7 @@ mod tests {

    #[tokio::test]
    async fn test_random_updates() -> anyhow::Result<()> {
-        let names_algorithms = [
-            ("test_random_updates_legacy", CompactionAlgorithm::Legacy),
-            ("test_random_updates_tiered", CompactionAlgorithm::Tiered),
-        ];
-        for (name, algorithm) in names_algorithms {
-            test_random_updates_algorithm(name, algorithm).await?;
-        }
-        Ok(())
-    }
-
-    async fn test_random_updates_algorithm(
-        name: &'static str,
-        compaction_algorithm: CompactionAlgorithm,
-    ) -> anyhow::Result<()> {
-        let mut harness = TenantHarness::create(name)?;
-        harness.tenant_conf.compaction_algorithm = compaction_algorithm;
+        let harness = TenantHarness::create("test_random_updates")?;
        let (tenant, ctx) = harness.load().await;
        let tline = tenant
            .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
@@ -5137,7 +5107,7 @@ mod tests {
                );
            }

-            // Perform a cycle of flush, and GC
+            // Perform a cycle of flush, compact, and GC
            let cutoff = tline.get_last_record_lsn();
            tline
                .update_gc_info(
@@ -5149,6 +5119,9 @@ mod tests {
                )
                .await?;
            tline.freeze_and_flush().await?;
+            tline
+                .compact(&CancellationToken::new(), EnumSet::empty(), &ctx)
+                .await?;
            tline.gc().await?;
        }

@@ -5429,36 +5402,19 @@ mod tests {

    #[tokio::test]
    async fn test_read_at_max_lsn() -> anyhow::Result<()> {
-        let names_algorithms = [
-            ("test_read_at_max_lsn_legacy", CompactionAlgorithm::Legacy),
-            ("test_read_at_max_lsn_tiered", CompactionAlgorithm::Tiered),
-        ];
-        for (name, algorithm) in names_algorithms {
-            test_read_at_max_lsn_algorithm(name, algorithm).await?;
-        }
-        Ok(())
-    }
-
-    async fn test_read_at_max_lsn_algorithm(
-        name: &'static str,
-        compaction_algorithm: CompactionAlgorithm,
-    ) -> anyhow::Result<()> {
-        let mut harness = TenantHarness::create(name)?;
-        harness.tenant_conf.compaction_algorithm = compaction_algorithm;
+        let harness = TenantHarness::create("test_read_at_max_lsn")?;
        let (tenant, ctx) = harness.load().await;
        let tline = tenant
            .create_test_timeline(TIMELINE_ID, Lsn(0x08), DEFAULT_PG_VERSION, &ctx)
            .await?;

        let lsn = Lsn(0x10);
-        let compact = false;
-        bulk_insert_maybe_compact_gc(tline.clone(), &ctx, lsn, 50, 10000, compact).await?;
+        bulk_insert_compact_gc(tline.clone(), &ctx, lsn, 50, 10000).await?;

        let test_key = Key::from_hex("010000000033333333444444445500000000").unwrap();
        let read_lsn = Lsn(u64::MAX - 1);

-        let result = tline.get(test_key, read_lsn, &ctx).await;
-        assert!(result.is_ok(), "result is not Ok: {}", result.unwrap_err());
+        assert!(tline.get(test_key, read_lsn, &ctx).await.is_ok());

        Ok(())
    }
--- a/pageserver/src/tenant/layer_map.rs
+++ b/pageserver/src/tenant/layer_map.rs
@@ -916,7 +916,6 @@ mod tests {
        assert_eq!(lhs, rhs);
    }

-    #[cfg(test)]
    fn brute_force_range_search(
        layer_map: &LayerMap,
        key_range: Range<Key>,
--- a/pageserver/src/tenant/mgr.rs
+++ b/pageserver/src/tenant/mgr.rs
@@ -2,7 +2,6 @@
 //! page server.

 use camino::{Utf8DirEntry, Utf8Path, Utf8PathBuf};
-use futures::StreamExt;
 use itertools::Itertools;
 use pageserver_api::key::Key;
 use pageserver_api::models::LocationConfigMode;
@@ -254,15 +253,17 @@ impl TenantsMap {
    }
 }

-/// Precursor to deletion of a tenant dir: we do a fast rename to a tmp path, and then
-/// the slower actual deletion in the background.
-///
 /// This is "safe" in that that it won't leave behind a partially deleted directory
 /// at the original path, because we rename with TEMP_FILE_SUFFIX before starting deleting
 /// the contents.
 ///
 /// This is pageserver-specific, as it relies on future processes after a crash to check
 /// for TEMP_FILE_SUFFIX when loading things.
+async fn safe_remove_tenant_dir_all(path: impl AsRef<Utf8Path>) -> std::io::Result<()> {
+    let tmp_path = safe_rename_tenant_dir(path).await?;
+    fs::remove_dir_all(tmp_path).await
+}
+
 async fn safe_rename_tenant_dir(path: impl AsRef<Utf8Path>) -> std::io::Result<Utf8PathBuf> {
    let parent = path
        .as_ref()
@@ -285,28 +286,6 @@ async fn safe_rename_tenant_dir(path: impl AsRef<Utf8Path>) -> std::io::Result<U
    Ok(tmp_path)
 }

-/// When we have moved a tenant's content to a temporary directory, we may delete it lazily in
-/// the background, and thereby avoid blocking any API requests on this deletion completing.
-fn spawn_background_purge(tmp_path: Utf8PathBuf) {
-    // Although we are cleaning up the tenant, this task is not meant to be bound by the lifetime of the tenant in memory.
-    // After a tenant is detached, there are no more task_mgr tasks for that tenant_id.
-    let task_tenant_id = None;
-
-    task_mgr::spawn(
-        task_mgr::BACKGROUND_RUNTIME.handle(),
-        TaskKind::MgmtRequest,
-        task_tenant_id,
-        None,
-        "tenant_files_delete",
-        false,
-        async move {
-            fs::remove_dir_all(tmp_path.as_path())
-                .await
-                .with_context(|| format!("tenant directory {:?} deletion", tmp_path))
-        },
-    );
-}
-
 static TENANTS: Lazy<std::sync::RwLock<TenantsMap>> =
    Lazy::new(|| std::sync::RwLock::new(TenantsMap::Initializing));

@@ -591,11 +570,7 @@ pub async fn init_tenant_mgr(
    );
    TENANT.startup_scheduled.inc_by(tenant_configs.len() as u64);

-    // Accumulate futures for writing tenant configs, so that we can execute in parallel
-    let mut config_write_futs = Vec::new();
-
-    // Update the location configs according to the re-attach response and persist them to disk
-    tracing::info!("Updating {} location configs", tenant_configs.len());
+    // Construct `Tenant` objects and start them running
    for (tenant_shard_id, location_conf) in tenant_configs {
        let tenant_dir_path = conf.tenant_path(&tenant_shard_id);

@@ -622,22 +597,18 @@ pub async fn init_tenant_mgr(
        const DEFAULT_SECONDARY_CONF: SecondaryLocationConfig =
            SecondaryLocationConfig { warm: true };

+        // Update the location config according to the re-attach response
        if let Some(tenant_modes) = &tenant_modes {
            // We have a generation map: treat it as the authority for whether
            // this tenant is really attached.
            match tenant_modes.get(&tenant_shard_id) {
                None => {
                    info!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), "Detaching tenant, control plane omitted it in re-attach response");
-
-                    match safe_rename_tenant_dir(&tenant_dir_path).await {
-                        Ok(tmp_path) => {
-                            spawn_background_purge(tmp_path);
-                        }
-                        Err(e) => {
-                            error!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(),
-                            "Failed to move detached tenant directory '{tenant_dir_path}': {e:?}");
-                        }
-                    };
+                    if let Err(e) = safe_remove_tenant_dir_all(&tenant_dir_path).await {
+                        error!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(),
+                            "Failed to remove detached tenant directory '{tenant_dir_path}': {e:?}",
+                        );
+                    }

                    // We deleted local content: move on to next tenant, don't try and spawn this one.
                    continue;
@@ -683,32 +654,8 @@ pub async fn init_tenant_mgr(

        // Presence of a generation number implies attachment: attach the tenant
        // if it wasn't already, and apply the generation number.
-        config_write_futs.push(async move {
-            let r = Tenant::persist_tenant_config(conf, &tenant_shard_id, &location_conf).await;
-            (tenant_shard_id, location_conf, r)
-        });
-    }
+        Tenant::persist_tenant_config(conf, &tenant_shard_id, &location_conf).await?;

-    // Execute config writes with concurrency, to avoid bottlenecking on local FS write latency
-    tracing::info!(
-        "Writing {} location config files...",
-        config_write_futs.len()
-    );
-    let config_write_results = futures::stream::iter(config_write_futs)
-        .buffer_unordered(16)
-        .collect::<Vec<_>>()
-        .await;
-
-    tracing::info!(
-        "Spawning {} tenant shard locations...",
-        config_write_results.len()
-    );
-    // For those shards that have live configurations, construct `Tenant` or `SecondaryTenant` objects and start them running
-    for (tenant_shard_id, location_conf, config_write_result) in config_write_results {
-        // Errors writing configs are fatal
-        config_write_result?;
-
-        let tenant_dir_path = conf.tenant_path(&tenant_shard_id);
        let shard_identity = location_conf.shard;
        let slot = match location_conf.mode {
            LocationMode::Attached(attached_conf) => {
@@ -1752,7 +1699,7 @@ impl TenantManager {
        let tmp_path = safe_rename_tenant_dir(&local_tenant_directory)
            .await
            .with_context(|| format!("local tenant directory {local_tenant_directory:?} rename"))?;
-        spawn_background_purge(tmp_path);
+        self.spawn_background_purge(tmp_path);

        fail::fail_point!("shard-split-pre-finish", |_| Err(anyhow::anyhow!(
            "failpoint"
@@ -1907,6 +1854,28 @@ impl TenantManager {
        shutdown_all_tenants0(self.tenants).await
    }

+    /// When we have moved a tenant's content to a temporary directory, we may delete it lazily in
+    /// the background, and thereby avoid blocking any API requests on this deletion completing.
+    fn spawn_background_purge(&self, tmp_path: Utf8PathBuf) {
+        // Although we are cleaning up the tenant, this task is not meant to be bound by the lifetime of the tenant in memory.
+        // After a tenant is detached, there are no more task_mgr tasks for that tenant_id.
+        let task_tenant_id = None;
+
+        task_mgr::spawn(
+            task_mgr::BACKGROUND_RUNTIME.handle(),
+            TaskKind::MgmtRequest,
+            task_tenant_id,
+            None,
+            "tenant_files_delete",
+            false,
+            async move {
+                fs::remove_dir_all(tmp_path.as_path())
+                    .await
+                    .with_context(|| format!("tenant directory {:?} deletion", tmp_path))
+            },
+        );
+    }
+
    pub(crate) async fn detach_tenant(
        &self,
        conf: &'static PageServerConf,
@@ -1923,7 +1892,7 @@ impl TenantManager {
                deletion_queue_client,
            )
            .await?;
-        spawn_background_purge(tmp_path);
+        self.spawn_background_purge(tmp_path);

        Ok(())
    }
--- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs
+++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs
@@ -597,17 +597,14 @@ impl InMemoryLayer {
        }
    }

-    /// Write this frozen in-memory layer to disk. If `key_range` is set, the delta
-    /// layer will only contain the key range the user specifies, and may return `None`
-    /// if there are no matching keys.
+    /// Write this frozen in-memory layer to disk.
    ///
    /// Returns a new delta layer with all the same data as this in-memory layer
    pub(crate) async fn write_to_disk(
        &self,
        timeline: &Arc<Timeline>,
        ctx: &RequestContext,
-        key_range: Option<Range<Key>>,
-    ) -> Result<Option<ResidentLayer>> {
+    ) -> Result<ResidentLayer> {
        // Grab the lock in read-mode. We hold it over the I/O, but because this
        // layer is not writeable anymore, no one should be trying to acquire the
        // write lock on it, so we shouldn't block anyone. There's one exception
@@ -621,21 +618,6 @@ impl InMemoryLayer {

        let end_lsn = *self.end_lsn.get().unwrap();

-        let keys: Vec<_> = if let Some(key_range) = key_range {
-            inner
-                .index
-                .iter()
-                .filter(|(k, _)| key_range.contains(k))
-                .map(|(k, m)| (k.to_i128(), m))
-                .collect()
-        } else {
-            inner.index.iter().map(|(k, m)| (k.to_i128(), m)).collect()
-        };
-
-        if keys.is_empty() {
-            return Ok(None);
-        }
-
        let mut delta_layer_writer = DeltaLayerWriter::new(
            self.conf,
            self.timeline_id,
@@ -667,6 +649,6 @@ impl InMemoryLayer {

        // MAX is used here because we identify L0 layers by full key range
        let delta_layer = delta_layer_writer.finish(Key::MAX, timeline).await?;
-        Ok(Some(delta_layer))
+        Ok(delta_layer)
    }
 }
--- a/pageserver/src/tenant/storage_layer/layer.rs
+++ b/pageserver/src/tenant/storage_layer/layer.rs
@@ -401,8 +401,8 @@ impl Layer {
        &self.0.path
    }

-    pub(crate) fn debug_str(&self) -> &Arc<str> {
-        &self.0.debug_str
+    pub(crate) fn local_path_str(&self) -> &Arc<str> {
+        &self.0.path_str
    }

    pub(crate) fn metadata(&self) -> LayerFileMetadata {
@@ -527,8 +527,8 @@ struct LayerInner {
    /// Full path to the file; unclear if this should exist anymore.
    path: Utf8PathBuf,

-    /// String representation of the layer, used for traversal id.
-    debug_str: Arc<str>,
+    /// String representation of the full path, used for traversal id.
+    path_str: Arc<str>,

    desc: PersistentLayerDesc,

@@ -735,7 +735,7 @@ impl LayerInner {

        LayerInner {
            conf,
-            debug_str: { format!("timelines/{}/{}", timeline.timeline_id, desc.filename()).into() },
+            path_str: path.to_string().into(),
            path,
            desc,
            timeline: Arc::downgrade(timeline),
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -17,13 +17,13 @@ use fail::fail_point;
 use once_cell::sync::Lazy;
 use pageserver_api::{
    key::{AUX_FILES_KEY, NON_INHERITED_RANGE},
-    keyspace::{KeySpaceAccum, SparseKeyPartitioning},
+    keyspace::{KeySpaceAccum, ShardedRange},
    models::{
        CompactionAlgorithm, DownloadRemoteLayersTaskInfo, DownloadRemoteLayersTaskSpawnRequest,
        EvictionPolicy, InMemoryLayerInfo, LayerMapInfo, TimelineState,
    },
    reltag::BlockNumber,
-    shard::{ShardIdentity, ShardNumber, TenantShardId},
+    shard::{ShardCount, ShardIdentity, ShardNumber, TenantShardId},
 };
 use rand::Rng;
 use serde_with::serde_as;
@@ -55,6 +55,7 @@ use std::{
    ops::ControlFlow,
 };

+use crate::deletion_queue::DeletionQueueClient;
 use crate::tenant::timeline::logical_size::CurrentLogicalSize;
 use crate::tenant::{
    layer_map::{LayerMap, SearchResult},
@@ -65,7 +66,6 @@ use crate::{
    disk_usage_eviction_task::DiskUsageEvictionInfo,
    pgdatadir_mapping::CollectKeySpaceError,
 };
-use crate::{deletion_queue::DeletionQueueClient, metrics::GetKind};
 use crate::{
    disk_usage_eviction_task::finite_f32,
    tenant::storage_layer::{
@@ -86,7 +86,7 @@ use crate::{
 use crate::config::PageServerConf;
 use crate::keyspace::{KeyPartitioning, KeySpace};
 use crate::metrics::{
-    TimelineMetrics, MATERIALIZED_PAGE_CACHE_HIT, MATERIALIZED_PAGE_CACHE_HIT_DIRECT,
+    GetKind, TimelineMetrics, MATERIALIZED_PAGE_CACHE_HIT, MATERIALIZED_PAGE_CACHE_HIT_DIRECT,
 };
 use crate::pgdatadir_mapping::CalculateLogicalSizeError;
 use crate::tenant::config::TenantConfOpt;
@@ -137,25 +137,6 @@ pub(super) enum FlushLoopState {
    Exited,
 }

-#[derive(Debug, Copy, Clone, PartialEq, Eq)]
-pub enum ImageLayerCreationMode {
-    /// Try to create image layers based on `time_for_new_image_layer`. Used in compaction code path.
-    Try,
-    /// Force creating the image layers if possible. For now, no image layers will be created
-    /// for metadata keys. Used in compaction code path with force flag enabled.
-    Force,
-    /// Initial ingestion of the data, and no data should be dropped in this function. This
-    /// means that no metadata keys should be included in the partitions. Used in flush frozen layer
-    /// code path.
-    Initial,
-}
-
-impl std::fmt::Display for ImageLayerCreationMode {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        write!(f, "{:?}", self)
-    }
-}
-
 /// Wrapper for key range to provide reverse ordering by range length for BinaryHeap
 #[derive(Debug, Clone, PartialEq, Eq)]
 pub(crate) struct Hole {
@@ -336,7 +317,7 @@ pub struct Timeline {
    pub initdb_lsn: Lsn,

    /// When did we last calculate the partitioning?
-    partitioning: tokio::sync::Mutex<((KeyPartitioning, SparseKeyPartitioning), Lsn)>,
+    partitioning: tokio::sync::Mutex<(KeyPartitioning, Lsn)>,

    /// Configuration: how often should the partitioning be recalculated.
    repartition_threshold: u64,
@@ -1253,12 +1234,6 @@ impl Timeline {
        self.last_record_lsn.load()
    }

-    /// Subscribe to callers of wait_lsn(). The value of the channel is None if there are no
-    /// wait_lsn() calls in progress, and Some(Lsn) if there is an active waiter for wait_lsn().
-    pub(crate) fn subscribe_for_wait_lsn_updates(&self) -> watch::Receiver<Option<Lsn>> {
-        self.last_record_lsn.status_receiver()
-    }
-
    pub(crate) fn get_disk_consistent_lsn(&self) -> Lsn {
        self.disk_consistent_lsn.load()
    }
@@ -2129,10 +2104,7 @@ impl Timeline {
                    // initial logical size is 0.
                    LogicalSize::empty_initial()
                },
-                partitioning: tokio::sync::Mutex::new((
-                    (KeyPartitioning::new(), KeyPartitioning::new().into_sparse()),
-                    Lsn(0),
-                )),
+                partitioning: tokio::sync::Mutex::new((KeyPartitioning::new(), Lsn(0))),
                repartition_threshold: 0,
                last_image_layer_creation_check_at: AtomicLsn::new(0),

@@ -2948,7 +2920,7 @@ trait TraversalLayerExt {

 impl TraversalLayerExt for Layer {
    fn traversal_id(&self) -> TraversalId {
-        Arc::clone(self.debug_str())
+        Arc::clone(self.local_path_str())
    }
 }

@@ -3134,6 +3106,7 @@ impl Timeline {
            if let Some(SearchResult { lsn_floor, layer }) = layers.search(key, cont_lsn) {
                let layer = guard.get_from_desc(&layer);
                drop(guard);
+
                // Get all the data needed to reconstruct the page version from this layer.
                // But if we have an older cached page image, no need to go past that.
                let lsn_floor = max(cached_lsn + 1, lsn_floor);
@@ -3254,7 +3227,7 @@ impl Timeline {
        Ok(())
    }

-    /// Collect the reconstruct data for a keyspace from the specified timeline.
+    /// Collect the reconstruct data for a keyspace from the specified timeline.
    ///
    /// Maintain a fringe [`LayerFringe`] which tracks all the layers that intersect
    /// the current keyspace. The current keyspace of the search at any given timeline
@@ -3683,103 +3656,66 @@ impl Timeline {
        // files instead. This is possible as long as *all* the data imported into the
        // repository have the same LSN.
        let lsn_range = frozen_layer.get_lsn_range();
-
-        // Whether to directly create image layers for this flush, or flush them as delta layers
-        let create_image_layer =
-            lsn_range.start == self.initdb_lsn && lsn_range.end == Lsn(self.initdb_lsn.0 + 1);
-
-        #[cfg(test)]
-        {
-            match &mut *self.flush_loop_state.lock().unwrap() {
-                FlushLoopState::NotStarted | FlushLoopState::Exited => {
-                    panic!("flush loop not running")
-                }
-                FlushLoopState::Running {
-                    expect_initdb_optimization,
-                    initdb_optimization_count,
-                    ..
-                } => {
-                    if create_image_layer {
+        let (layers_to_upload, delta_layer_to_add) =
+            if lsn_range.start == self.initdb_lsn && lsn_range.end == Lsn(self.initdb_lsn.0 + 1) {
+                #[cfg(test)]
+                match &mut *self.flush_loop_state.lock().unwrap() {
+                    FlushLoopState::NotStarted | FlushLoopState::Exited => {
+                        panic!("flush loop not running")
+                    }
+                    FlushLoopState::Running {
+                        initdb_optimization_count,
+                        ..
+                    } => {
                        *initdb_optimization_count += 1;
-                    } else {
+                    }
+                }
+                // Note: The 'ctx' in use here has DownloadBehavior::Error. We should not
+                // require downloading anything during initial import.
+                let (partitioning, _lsn) = self
+                    .repartition(
+                        self.initdb_lsn,
+                        self.get_compaction_target_size(),
+                        EnumSet::empty(),
+                        ctx,
+                    )
+                    .await?;
+
+                if self.cancel.is_cancelled() {
+                    return Err(FlushLayerError::Cancelled);
+                }
+
+                // For image layers, we add them immediately into the layer map.
+                (
+                    self.create_image_layers(&partitioning, self.initdb_lsn, true, ctx)
+                        .await?,
+                    None,
+                )
+            } else {
+                #[cfg(test)]
+                match &mut *self.flush_loop_state.lock().unwrap() {
+                    FlushLoopState::NotStarted | FlushLoopState::Exited => {
+                        panic!("flush loop not running")
+                    }
+                    FlushLoopState::Running {
+                        expect_initdb_optimization,
+                        ..
+                    } => {
                        assert!(!*expect_initdb_optimization, "expected initdb optimization");
                    }
                }
-            }
-        }
-
-        let (layers_to_upload, delta_layer_to_add) = if create_image_layer {
-            // Note: The 'ctx' in use here has DownloadBehavior::Error. We should not
-            // require downloading anything during initial import.
-            let ((rel_partition, metadata_partition), _lsn) = self
-                .repartition(
-                    self.initdb_lsn,
-                    self.get_compaction_target_size(),
-                    EnumSet::empty(),
-                    ctx,
+                // Normal case, write out a L0 delta layer file.
+                // `create_delta_layer` will not modify the layer map.
+                // We will remove frozen layer and add delta layer in one atomic operation later.
+                let layer = self.create_delta_layer(&frozen_layer, ctx).await?;
+                (
+                    // FIXME: even though we have a single image and single delta layer assumption
+                    // we push them to vec
+                    vec![layer.clone()],
+                    Some(layer),
                )
-                .await?;
-
-            if self.cancel.is_cancelled() {
-                return Err(FlushLayerError::Cancelled);
-            }
-
-            // For metadata, always create delta layers.
-            let delta_layer = if !metadata_partition.parts.is_empty() {
-                assert_eq!(
-                    metadata_partition.parts.len(),
-                    1,
-                    "currently sparse keyspace should only contain a single aux file keyspace"
-                );
-                let metadata_keyspace = &metadata_partition.parts[0];
-                assert_eq!(
-                    metadata_keyspace.0.ranges.len(),
-                    1,
-                    "aux file keyspace should be a single range"
-                );
-                self.create_delta_layer(
-                    &frozen_layer,
-                    ctx,
-                    Some(metadata_keyspace.0.ranges[0].clone()),
-                )
-                .await?
-            } else {
-                None
            };

-            // For image layers, we add them immediately into the layer map.
-            let mut layers_to_upload = Vec::new();
-            layers_to_upload.extend(
-                self.create_image_layers(
-                    &rel_partition,
-                    self.initdb_lsn,
-                    ImageLayerCreationMode::Initial,
-                    ctx,
-                )
-                .await?,
-            );
-
-            if let Some(delta_layer) = delta_layer {
-                layers_to_upload.push(delta_layer.clone());
-                (layers_to_upload, Some(delta_layer))
-            } else {
-                (layers_to_upload, None)
-            }
-        } else {
-            // Normal case, write out a L0 delta layer file.
-            // `create_delta_layer` will not modify the layer map.
-            // We will remove frozen layer and add delta layer in one atomic operation later.
-            let Some(layer) = self.create_delta_layer(&frozen_layer, ctx, None).await? else {
-                panic!("delta layer cannot be empty if no filter is applied");
-            };
-            (
-                // FIXME: even though we have a single image and single delta layer assumption
-                // we push them to vec
-                vec![layer.clone()],
-                Some(layer),
-            )
-        };
-
        pausable_failpoint!("flush-layer-cancel-after-writing-layer-out-pausable");

        if self.cancel.is_cancelled() {
@@ -3899,18 +3835,12 @@ impl Timeline {
        self: &Arc<Self>,
        frozen_layer: &Arc<InMemoryLayer>,
        ctx: &RequestContext,
-        key_range: Option<Range<Key>>,
-    ) -> anyhow::Result<Option<ResidentLayer>> {
+    ) -> anyhow::Result<ResidentLayer> {
        let self_clone = Arc::clone(self);
        let frozen_layer = Arc::clone(frozen_layer);
        let ctx = ctx.attached_child();
        let work = async move {
-            let Some(new_delta) = frozen_layer
-                .write_to_disk(&self_clone, &ctx, key_range)
-                .await?
-            else {
-                return Ok(None);
-            };
+            let new_delta = frozen_layer.write_to_disk(&self_clone, &ctx).await?;
            // The write_to_disk() above calls writer.finish() which already did the fsync of the inodes.
            // We just need to fsync the directory in which these inodes are linked,
            // which we know to be the timeline directory.
@@ -3929,7 +3859,7 @@ impl Timeline {
                .sync_all()
                .await
                .fatal_err("VirtualFile::sync_all timeline dir");
-            anyhow::Ok(Some(new_delta))
+            anyhow::Ok(new_delta)
        };
        // Before tokio-epoll-uring, we ran write_to_disk & the sync_all inside spawn_blocking.
        // Preserve that behavior to maintain the same behavior for `virtual_file_io_engine=std-fs`.
@@ -3956,20 +3886,19 @@ impl Timeline {
        partition_size: u64,
        flags: EnumSet<CompactFlags>,
        ctx: &RequestContext,
-    ) -> anyhow::Result<((KeyPartitioning, SparseKeyPartitioning), Lsn)> {
+    ) -> anyhow::Result<(KeyPartitioning, Lsn)> {
        let Ok(mut partitioning_guard) = self.partitioning.try_lock() else {
            // NB: there are two callers, one is the compaction task, of which there is only one per struct Tenant and hence Timeline.
            // The other is the initdb optimization in flush_frozen_layer, used by `boostrap_timeline`, which runs before `.activate()`
            // and hence before the compaction task starts.
            anyhow::bail!("repartition() called concurrently, this should not happen");
        };
-        let ((dense_partition, sparse_partition), partition_lsn) = &*partitioning_guard;
-        if lsn < *partition_lsn {
+        if lsn < partitioning_guard.1 {
            anyhow::bail!("repartition() called with LSN going backwards, this should not happen");
        }

-        let distance = lsn.0 - partition_lsn.0;
-        if *partition_lsn != Lsn(0)
+        let distance = lsn.0 - partitioning_guard.1 .0;
+        if partitioning_guard.1 != Lsn(0)
            && distance <= self.repartition_threshold
            && !flags.contains(CompactFlags::ForceRepartition)
        {
@@ -3978,18 +3907,13 @@ impl Timeline {
                threshold = self.repartition_threshold,
                "no repartitioning needed"
            );
-            return Ok((
-                (dense_partition.clone(), sparse_partition.clone()),
-                *partition_lsn,
-            ));
+            return Ok((partitioning_guard.0.clone(), partitioning_guard.1));
        }

-        let (dense_ks, sparse_ks) = self.collect_keyspace(lsn, ctx).await?;
-        let dense_partitioning = dense_ks.partition(&self.shard_identity, partition_size);
-        let sparse_partitioning = SparseKeyPartitioning {
-            parts: vec![sparse_ks],
-        }; // no partitioning for metadata keys for now
-        *partitioning_guard = ((dense_partitioning, sparse_partitioning), lsn);
+        let keyspace = self.collect_keyspace(lsn, ctx).await?;
+        let partitioning = keyspace.partition(&self.shard_identity, partition_size);
+
+        *partitioning_guard = (partitioning, lsn);

        Ok((partitioning_guard.0.clone(), partitioning_guard.1))
    }
@@ -4045,12 +3969,12 @@ impl Timeline {
        false
    }

-    #[tracing::instrument(skip_all, fields(%lsn, %mode))]
+    #[tracing::instrument(skip_all, fields(%lsn, %force))]
    async fn create_image_layers(
        self: &Arc<Timeline>,
        partitioning: &KeyPartitioning,
        lsn: Lsn,
-        mode: ImageLayerCreationMode,
+        force: bool,
        ctx: &RequestContext,
    ) -> Result<Vec<ResidentLayer>, CreateImageLayersError> {
        let timer = self.metrics.create_images_time_histo.start_timer();
@@ -4087,26 +4011,19 @@ impl Timeline {
        for partition in partitioning.parts.iter() {
            let img_range = start..partition.ranges.last().unwrap().end;

-            if partition.overlaps(&Key::metadata_key_range()) {
-                // TODO(chi): The next patch will correctly create image layers for metadata keys, and it would be a
-                // rather big change. Keep this patch small for now.
-                match mode {
-                    ImageLayerCreationMode::Force | ImageLayerCreationMode::Try => {
-                        // skip image layer creation anyways for metadata keys.
-                        start = img_range.end;
-                        continue;
-                    }
-                    ImageLayerCreationMode::Initial => {
-                        return Err(CreateImageLayersError::Other(anyhow::anyhow!("no image layer should be created for metadata keys when flushing frozen layers")));
-                    }
-                }
-            } else if let ImageLayerCreationMode::Try = mode {
-                // check_for_image_layers = false -> skip
-                // check_for_image_layers = true -> check time_for_new_image_layer -> skip/generate
-                if !check_for_image_layers || !self.time_for_new_image_layer(partition, lsn).await {
-                    start = img_range.end;
-                    continue;
-                }
+            let do_it = if force {
+                true
+            } else if check_for_image_layers {
+                // [`Self::time_for_new_image_layer`] is CPU expensive,
+                // so skip if we've not collected enough WAL since the last time
+                self.time_for_new_image_layer(partition, lsn).await
+            } else {
+                false
+            };
+
+            if !do_it {
+                start = img_range.end;
+                continue;
            }

            let mut image_layer_writer = ImageLayerWriter::new(
@@ -4601,6 +4518,28 @@ impl Timeline {
        'outer: for l in layers.iter_historic_layers() {
            result.layers_total += 1;

+            // 0. Is this layer a relic from a shard split?
+            //    (Do this check first because irrespective of later logic regarding LSNs, this
+            //    layer should be dropped.)
+            if self.shard_identity.count >= ShardCount::new(2) {
+                // We are a sharded tenant
+                let layer = guard.get_from_desc(&l);
+                if layer.metadata().shard != self.tenant_shard_id.to_index() {
+                    // This is an ancestral layer
+                    let sharded_range = ShardedRange::new(l.get_key_range(), &self.shard_identity);
+                    if sharded_range.page_count() == 0 {
+                        // This ancestral layer only covers keys that belong to other shards
+                        info!(
+                            "garbage collecting layer {} ({:?}) after shard split",
+                            l.filename(),
+                            l.get_key_range()
+                        );
+                        layers_to_remove.push(l);
+                        continue;
+                    }
+                }
+            }
+
            // 1. Is it newer than GC horizon cutoff point?
            if l.get_lsn_range().end > horizon_cutoff {
                debug!(
--- a/pageserver/src/tenant/timeline/compaction.rs
+++ b/pageserver/src/tenant/timeline/compaction.rs
@@ -9,7 +9,7 @@ use std::ops::{Deref, Range};
 use std::sync::Arc;

 use super::layer_manager::LayerManager;
-use super::{CompactFlags, DurationRecorder, ImageLayerCreationMode, RecordedDuration, Timeline};
+use super::{CompactFlags, DurationRecorder, RecordedDuration, Timeline};

 use anyhow::{anyhow, Context};
 use enumset::EnumSet;
@@ -102,7 +102,7 @@ impl Timeline {
            )
            .await
        {
-            Ok(((dense_partitioning, sparse_partitioning), lsn)) => {
+            Ok((partitioning, lsn)) => {
                // Disables access_stats updates, so that the files we read remain candidates for eviction after we're done with them
                let image_ctx = RequestContextBuilder::extend(ctx)
                    .access_stats_behavior(AccessStatsBehavior::Skip)
@@ -115,37 +115,17 @@ impl Timeline {

                // 3. Create new image layers for partitions that have been modified
                // "enough".
-                let dense_layers = self
+                let layers = self
                    .create_image_layers(
-                        &dense_partitioning,
+                        &partitioning,
                        lsn,
-                        if flags.contains(CompactFlags::ForceImageLayerCreation) {
-                            ImageLayerCreationMode::Force
-                        } else {
-                            ImageLayerCreationMode::Try
-                        },
+                        flags.contains(CompactFlags::ForceImageLayerCreation),
                        &image_ctx,
                    )
                    .await
                    .map_err(anyhow::Error::from)?;

-                // For now, nothing will be produced...
-                let sparse_layers = self
-                    .create_image_layers(
-                        &sparse_partitioning.clone().into_dense(),
-                        lsn,
-                        if flags.contains(CompactFlags::ForceImageLayerCreation) {
-                            ImageLayerCreationMode::Force
-                        } else {
-                            ImageLayerCreationMode::Try
-                        },
-                        &image_ctx,
-                    )
-                    .await
-                    .map_err(anyhow::Error::from)?;
-                assert!(sparse_layers.is_empty());
-
-                self.upload_new_image_layers(dense_layers)?;
+                self.upload_new_image_layers(layers)?;
            }
            Err(err) => {
                // no partitioning? This is normal, if the timeline was just created
@@ -778,9 +758,8 @@ impl Timeline {
            return Err(CompactionError::ShuttingDown);
        }

-        let (dense_ks, _sparse_ks) = self.collect_keyspace(end_lsn, ctx).await?;
-        // TODO(chi): ignore sparse_keyspace for now, compact it in the future.
-        let mut adaptor = TimelineAdaptor::new(self, (end_lsn, dense_ks));
+        let keyspace = self.collect_keyspace(end_lsn, ctx).await?;
+        let mut adaptor = TimelineAdaptor::new(self, (end_lsn, keyspace));

        pageserver_compaction::compact_tiered::compact_tiered(
            &mut adaptor,
--- a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs
+++ b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs
@@ -22,12 +22,10 @@ use crate::tenant::{debug_assert_current_span_has_tenant_and_timeline_id, Timeli
 use anyhow::Context;
 use chrono::{NaiveDateTime, Utc};
 use pageserver_api::models::TimelineState;
-
+use storage_broker::proto::subscribe_safekeeper_info_request::SubscriptionKey;
+use storage_broker::proto::SafekeeperTimelineInfo;
+use storage_broker::proto::SubscribeSafekeeperInfoRequest;
 use storage_broker::proto::TenantTimelineId as ProtoTenantTimelineId;
-use storage_broker::proto::{
-    FilterTenantTimelineId, MessageType, SafekeeperDiscoveryRequest, SafekeeperDiscoveryResponse,
-    SubscribeByFilterRequest, TypeSubscription, TypedMessage,
-};
 use storage_broker::{BrokerClientChannel, Code, Streaming};
 use tokio_util::sync::CancellationToken;
 use tracing::*;
@@ -91,14 +89,6 @@ pub(super) async fn connection_manager_loop_step(
        .timeline
        .subscribe_for_state_updates();

-    let mut wait_lsn_status = connection_manager_state
-        .timeline
-        .subscribe_for_wait_lsn_updates();
-
-    // TODO: create a separate config option for discovery request interval
-    let discovery_request_interval = connection_manager_state.conf.lagging_wal_timeout;
-    let mut last_discovery_ts: Option<std::time::Instant> = None;
-
    // Subscribe to the broker updates. Stream shares underlying TCP connection
    // with other streams on this client (other connection managers). When
    // object goes out of scope, stream finishes in drop() automatically.
@@ -107,12 +97,10 @@ pub(super) async fn connection_manager_loop_step(

    loop {
        let time_until_next_retry = connection_manager_state.time_until_next_retry();
-        let any_activity = connection_manager_state.wal_connection.is_some()
-            || !connection_manager_state.wal_stream_candidates.is_empty();

        // These things are happening concurrently:
        //
-        //  - cancellation request
+        //  - cancellation request
        //  - keep receiving WAL on the current connection
        //      - if the shared state says we need to change connection, disconnect and return
        //      - this runs in a separate task and we receive updates via a watch channel
@@ -120,7 +108,6 @@ pub(super) async fn connection_manager_loop_step(
        //  - receive updates from broker
        //      - this might change the current desired connection
        //  - timeline state changes to something that does not allow walreceiver to run concurrently
-        //  - if there's no connection and no candidates, try to send a discovery request

        // NB: make sure each of the select expressions are cancellation-safe
        // (no need for arms to be cancellation-safe).
@@ -227,65 +214,6 @@ pub(super) async fn connection_manager_loop_step(
                    }
                }
            } => debug!("Waking up for the next retry after waiting for {time_until_next_retry:?}"),
-
-            Some(()) = async {
-                // Reminder: this match arm needs to be cancellation-safe.
-                // Calculating time needed to wait until sending the next discovery request.
-                // Current implementation is conservative and sends discovery requests only when there are no candidates.
-
-                if any_activity {
-                    // No need to send discovery requests if there is an active connection or candidates.
-                    return None;
-                }
-
-                // Waiting for an active wait_lsn request.
-                while wait_lsn_status.borrow().is_none() {
-                    if wait_lsn_status.changed().await.is_err() {
-                        // wait_lsn_status channel was closed, exiting
-                        warn!("wait_lsn_status channel was closed in connection_manager_loop_step");
-                        return None;
-                    }
-                }
-
-                // All preconditions met, preparing to send a discovery request.
-                let now = std::time::Instant::now();
-                let next_discovery_ts = last_discovery_ts
-                    .map(|ts| ts + discovery_request_interval)
-                    .unwrap_or_else(|| now);
-
-                if next_discovery_ts > now {
-                    // Prevent sending discovery requests too frequently.
-                    tokio::time::sleep(next_discovery_ts - now).await;
-                }
-
-                let tenant_timeline_id = Some(ProtoTenantTimelineId {
-                    tenant_id: id.tenant_id.as_ref().to_owned(),
-                    timeline_id: id.timeline_id.as_ref().to_owned(),
-                });
-                let request = SafekeeperDiscoveryRequest { tenant_timeline_id };
-                let msg = TypedMessage {
-                    r#type: MessageType::SafekeeperDiscoveryRequest as i32,
-                    safekeeper_timeline_info: None,
-                    safekeeper_discovery_request: Some(request),
-                    safekeeper_discovery_response: None,
-                    };
-
-                last_discovery_ts = Some(std::time::Instant::now());
-                debug!("No active connection and no candidates, sending discovery request to the broker");
-
-                // Cancellation safety: we want to send a message to the broker, but publish_one()
-                // function can get cancelled by the other select! arm. This is absolutely fine, because
-                // we just want to receive broker updates and discovery is not important if we already
-                // receive updates.
-                //
-                // It is possible that `last_discovery_ts` will be updated, but the message will not be sent.
-                // This is totally fine because of the reason above.
-
-                // This is a fire-and-forget request, we don't care about the response
-                let _ = broker_client.publish_one(msg).await;
-                debug!("Discovery request sent to the broker");
-                None
-            } => {}
        }

        if let Some(new_candidate) = connection_manager_state.next_connection_candidate() {
@@ -303,7 +231,7 @@ async fn subscribe_for_timeline_updates(
    broker_client: &mut BrokerClientChannel,
    id: TenantTimelineId,
    cancel: &CancellationToken,
-) -> Result<Streaming<TypedMessage>, Cancelled> {
+) -> Result<Streaming<SafekeeperTimelineInfo>, Cancelled> {
    let mut attempt = 0;
    loop {
        exponential_backoff(
@@ -316,27 +244,17 @@ async fn subscribe_for_timeline_updates(
        attempt += 1;

        // subscribe to the specific timeline
-        let request = SubscribeByFilterRequest {
-            types: vec![
-                TypeSubscription {
-                    r#type: MessageType::SafekeeperTimelineInfo as i32,
-                },
-                TypeSubscription {
-                    r#type: MessageType::SafekeeperDiscoveryResponse as i32,
-                },
-            ],
-            tenant_timeline_id: Some(FilterTenantTimelineId {
-                enabled: true,
-                tenant_timeline_id: Some(ProtoTenantTimelineId {
-                    tenant_id: id.tenant_id.as_ref().to_owned(),
-                    timeline_id: id.timeline_id.as_ref().to_owned(),
-                }),
-            }),
+        let key = SubscriptionKey::TenantTimelineId(ProtoTenantTimelineId {
+            tenant_id: id.tenant_id.as_ref().to_owned(),
+            timeline_id: id.timeline_id.as_ref().to_owned(),
+        });
+        let request = SubscribeSafekeeperInfoRequest {
+            subscription_key: Some(key),
        };

        match {
            tokio::select! {
-                r = broker_client.subscribe_by_filter(request) => { r }
+                r = broker_client.subscribe_safekeeper_info(request) => { r }
                _ = cancel.cancelled() => { return Err(Cancelled); }
            }
        } {
@@ -480,7 +398,7 @@ struct RetryInfo {
 /// Data about the timeline to connect to, received from the broker.
 #[derive(Debug, Clone)]
 struct BrokerSkTimeline {
-    timeline: SafekeeperDiscoveryResponse,
+    timeline: SafekeeperTimelineInfo,
    /// Time at which the data was fetched from the broker last time, to track the stale data.
    latest_update: NaiveDateTime,
 }
@@ -688,41 +606,7 @@ impl ConnectionManagerState {
    }

    /// Adds another broker timeline into the state, if its more recent than the one already added there for the same key.
-    fn register_timeline_update(&mut self, typed_msg: TypedMessage) {
-        let mut is_discovery = false;
-        let timeline_update = match typed_msg.r#type() {
-            MessageType::SafekeeperTimelineInfo => {
-                let info = match typed_msg.safekeeper_timeline_info {
-                    Some(info) => info,
-                    None => {
-                        warn!("bad proto message from broker: no safekeeper_timeline_info");
-                        return;
-                    }
-                };
-                SafekeeperDiscoveryResponse {
-                    safekeeper_id: info.safekeeper_id,
-                    tenant_timeline_id: info.tenant_timeline_id,
-                    commit_lsn: info.commit_lsn,
-                    safekeeper_connstr: info.safekeeper_connstr,
-                    availability_zone: info.availability_zone,
-                }
-            }
-            MessageType::SafekeeperDiscoveryResponse => {
-                is_discovery = true;
-                match typed_msg.safekeeper_discovery_response {
-                    Some(response) => response,
-                    None => {
-                        warn!("bad proto message from broker: no safekeeper_discovery_response");
-                        return;
-                    }
-                }
-            }
-            _ => {
-                // unexpected message
-                return;
-            }
-        };
-
+    fn register_timeline_update(&mut self, timeline_update: SafekeeperTimelineInfo) {
        WALRECEIVER_BROKER_UPDATES.inc();

        let new_safekeeper_id = NodeId(timeline_update.safekeeper_id);
@@ -735,11 +619,7 @@ impl ConnectionManagerState {
        );

        if old_entry.is_none() {
-            info!(
-                ?is_discovery,
-                %new_safekeeper_id,
-                "New SK node was added",
-            );
+            info!("New SK node was added: {new_safekeeper_id}");
            WALRECEIVER_CANDIDATES_ADDED.inc();
        }
    }
@@ -938,7 +818,7 @@ impl ConnectionManagerState {
    fn select_connection_candidate(
        &self,
        node_to_omit: Option<NodeId>,
-    ) -> Option<(NodeId, &SafekeeperDiscoveryResponse, PgConnectionConfig)> {
+    ) -> Option<(NodeId, &SafekeeperTimelineInfo, PgConnectionConfig)> {
        self.applicable_connection_candidates()
            .filter(|&(sk_id, _, _)| Some(sk_id) != node_to_omit)
            .max_by_key(|(_, info, _)| info.commit_lsn)
@@ -948,7 +828,7 @@ impl ConnectionManagerState {
    /// Some safekeepers are filtered by the retry cooldown.
    fn applicable_connection_candidates(
        &self,
-    ) -> impl Iterator<Item = (NodeId, &SafekeeperDiscoveryResponse, PgConnectionConfig)> {
+    ) -> impl Iterator<Item = (NodeId, &SafekeeperTimelineInfo, PgConnectionConfig)> {
        let now = Utc::now().naive_utc();

        self.wal_stream_candidates
@@ -1088,11 +968,19 @@ mod tests {
        latest_update: NaiveDateTime,
    ) -> BrokerSkTimeline {
        BrokerSkTimeline {
-            timeline: SafekeeperDiscoveryResponse {
+            timeline: SafekeeperTimelineInfo {
                safekeeper_id: 0,
                tenant_timeline_id: None,
+                term: 0,
+                last_log_term: 0,
+                flush_lsn: 0,
                commit_lsn,
+                backup_lsn: 0,
+                remote_consistent_lsn: 0,
+                peer_horizon_lsn: 0,
+                local_start_lsn: 0,
                safekeeper_connstr: safekeeper_connstr.to_owned(),
+                http_connstr: safekeeper_connstr.to_owned(),
                availability_zone: None,
            },
            latest_update,
--- a/s3_scrubber/Cargo.toml
+++ b/s3_scrubber/Cargo.toml
@@ -22,12 +22,7 @@ serde_with.workspace = true
 workspace_hack.workspace = true
 utils.workspace = true
 async-stream.workspace = true
-native-tls.workspace = true
-postgres-native-tls.workspace = true
-postgres_ffi.workspace = true
 tokio-stream.workspace = true
-tokio-postgres.workspace = true
-tokio-util = { workspace = true }
 futures-util.workspace = true
 itertools.workspace = true
 camino.workspace = true
--- a/s3_scrubber/README.md
+++ b/s3_scrubber/README.md
@@ -67,12 +67,10 @@ the purge command will log all the keys that it would have deleted.

 #### `scan-metadata`

-Walk objects in a pageserver or safekeeper S3 bucket, and report statistics on the contents and checking consistency.
-Errors are logged to stderr and summary to stdout.
+Walk objects in a pageserver S3 bucket, and report statistics on the contents.

-For pageserver:
 ```
-env SSO_ACCOUNT_ID=123456 REGION=eu-west-1 BUCKET=my-dev-bucket CLOUD_ADMIN_API_TOKEN=${NEON_CLOUD_ADMIN_API_STAGING_KEY} CLOUD_ADMIN_API_URL=[url] cargo run --release -- scan-metadata --node-kind pageserver
+env SSO_ACCOUNT_ID=123456 REGION=eu-west-1 BUCKET=my-dev-bucket CLOUD_ADMIN_API_TOKEN=${NEON_CLOUD_ADMIN_API_STAGING_KEY} CLOUD_ADMIN_API_URL=[url] cargo run --release -- scan-metadata

 Timelines: 31106
 With errors: 3
@@ -84,10 +82,6 @@ Layer size bytes: min 24576, 1% 36879, 10% 36879, 50% 61471, 90% 44695551, 99% 2
 Timeline layer count: min 1, 1% 3, 10% 6, 50% 16, 90% 25, 99% 39, max 1053
 ```

-For safekeepers, dump_db_connstr and dump_db_table must be
-specified; they should point to table with debug dump which will be used
-to list timelines and find their backup and start LSNs.
-
 ## Cleaning up running pageservers

 If S3 state is altered first manually, pageserver in-memory state will contain wrong data about S3 state, and tenants/timelines may get recreated on S3 (due to any layer upload due to compaction, pageserver restart, etc.). So before proceeding, for tenants/timelines which are already deleted in the console, we must remove these from pageservers.
--- a/s3_scrubber/src/cloud_admin_api.rs
+++ b/s3_scrubber/src/cloud_admin_api.rs
@@ -1,13 +1,11 @@
-use chrono::{DateTime, Utc};
-use futures::Future;
-use hex::FromHex;
+use std::time::Duration;

+use chrono::{DateTime, Utc};
+use hex::FromHex;
 use reqwest::{header, Client, StatusCode, Url};
 use serde::Deserialize;
 use tokio::sync::Semaphore;

-use tokio_util::sync::CancellationToken;
-use utils::backoff;
 use utils::id::{TenantId, TimelineId};
 use utils::lsn::Lsn;

@@ -139,7 +137,7 @@ pub struct ProjectData {
    pub region_id: String,
    pub platform_id: String,
    pub user_id: String,
-    pub pageserver_id: Option<u64>,
+    pub pageserver_id: u64,
    #[serde(deserialize_with = "from_nullable_id")]
    pub tenant: TenantId,
    pub safekeepers: Vec<SafekeeperData>,
@@ -157,7 +155,7 @@ pub struct ProjectData {
    pub maintenance_set: Option<String>,
 }

-#[derive(Debug, Clone, serde::Deserialize)]
+#[derive(Debug, serde::Deserialize)]
 pub struct BranchData {
    pub id: BranchId,
    pub created_at: DateTime<Utc>,
@@ -212,39 +210,30 @@ impl CloudAdminApiClient {
            .await
            .expect("Semaphore is not closed");

-        let response = CloudAdminApiClient::with_retries(
-            || async {
-                let response = self
-                    .http_client
-                    .get(self.append_url("/projects"))
-                    .query(&[
-                        ("tenant_id", tenant_id.to_string()),
-                        ("show_deleted", "true".to_string()),
-                    ])
-                    .header(header::ACCEPT, "application/json")
-                    .bearer_auth(&self.token)
-                    .send()
-                    .await
-                    .map_err(|e| {
-                        Error::new(
-                            "Find project for tenant".to_string(),
-                            ErrorKind::RequestSend(e),
-                        )
-                    })?;
-
-                let response: AdminApiResponse<Vec<ProjectData>> =
-                    response.json().await.map_err(|e| {
-                        Error::new(
-                            "Find project for tenant".to_string(),
-                            ErrorKind::BodyRead(e),
-                        )
-                    })?;
-                Ok(response)
-            },
-            "find_tenant_project",
-        )
-        .await?;
+        let response = self
+            .http_client
+            .get(self.append_url("/projects"))
+            .query(&[
+                ("tenant_id", tenant_id.to_string()),
+                ("show_deleted", "true".to_string()),
+            ])
+            .header(header::ACCEPT, "application/json")
+            .bearer_auth(&self.token)
+            .send()
+            .await
+            .map_err(|e| {
+                Error::new(
+                    "Find project for tenant".to_string(),
+                    ErrorKind::RequestSend(e),
+                )
+            })?;

+        let response: AdminApiResponse<Vec<ProjectData>> = response.json().await.map_err(|e| {
+            Error::new(
+                "Find project for tenant".to_string(),
+                ErrorKind::BodyRead(e),
+            )
+        })?;
        match response.data.len() {
            0 => Ok(None),
            1 => Ok(Some(
@@ -272,34 +261,42 @@ impl CloudAdminApiClient {
        const PAGINATION_LIMIT: usize = 512;
        let mut result: Vec<ProjectData> = Vec::with_capacity(PAGINATION_LIMIT);
        loop {
-            let response_bytes = CloudAdminApiClient::with_retries(
-                || async {
-                    let response = self
-                        .http_client
-                        .get(self.append_url("/projects"))
-                        .query(&[
-                            ("show_deleted", "false".to_string()),
-                            ("limit", format!("{PAGINATION_LIMIT}")),
-                            ("offset", format!("{pagination_offset}")),
-                        ])
-                        .header(header::ACCEPT, "application/json")
-                        .bearer_auth(&self.token)
-                        .send()
-                        .await
-                        .map_err(|e| {
-                            Error::new(
-                                "List active projects".to_string(),
-                                ErrorKind::RequestSend(e),
-                            )
-                        })?;
+            let response = self
+                .http_client
+                .get(self.append_url("/projects"))
+                .query(&[
+                    ("show_deleted", "false".to_string()),
+                    ("limit", format!("{PAGINATION_LIMIT}")),
+                    ("offset", format!("{pagination_offset}")),
+                ])
+                .header(header::ACCEPT, "application/json")
+                .bearer_auth(&self.token)
+                .send()
+                .await
+                .map_err(|e| {
+                    Error::new(
+                        "List active projects".to_string(),
+                        ErrorKind::RequestSend(e),
+                    )
+                })?;

-                    response.bytes().await.map_err(|e| {
-                        Error::new("List active projects".to_string(), ErrorKind::BodyRead(e))
-                    })
-                },
-                "list_projects",
-            )
-            .await?;
+            match response.status() {
+                StatusCode::OK => {}
+                StatusCode::SERVICE_UNAVAILABLE | StatusCode::TOO_MANY_REQUESTS => {
+                    tokio::time::sleep(Duration::from_millis(500)).await;
+                    continue;
+                }
+                _status => {
+                    return Err(Error::new(
+                        "List active projects".to_string(),
+                        ErrorKind::ResponseStatus(response.status()),
+                    ))
+                }
+            }
+
+            let response_bytes = response.bytes().await.map_err(|e| {
+                Error::new("List active projects".to_string(), ErrorKind::BodyRead(e))
+            })?;

            let decode_result =
                serde_json::from_slice::<AdminApiResponse<Vec<ProjectData>>>(&response_bytes);
@@ -330,7 +327,6 @@ impl CloudAdminApiClient {

    pub async fn find_timeline_branch(
        &self,
-        tenant_id: TenantId,
        timeline_id: TimelineId,
    ) -> Result<Option<BranchData>, Error> {
        let _permit = self
@@ -339,61 +335,43 @@ impl CloudAdminApiClient {
            .await
            .expect("Semaphore is not closed");

-        let response = CloudAdminApiClient::with_retries(
-            || async {
-                let response = self
-                    .http_client
-                    .get(self.append_url("/branches"))
-                    .query(&[
-                        ("timeline_id", timeline_id.to_string()),
-                        ("show_deleted", "true".to_string()),
-                    ])
-                    .header(header::ACCEPT, "application/json")
-                    .bearer_auth(&self.token)
-                    .send()
-                    .await
-                    .map_err(|e| {
-                        Error::new(
-                            "Find branch for timeline".to_string(),
-                            ErrorKind::RequestSend(e),
-                        )
-                    })?;
+        let response = self
+            .http_client
+            .get(self.append_url("/branches"))
+            .query(&[
+                ("timeline_id", timeline_id.to_string()),
+                ("show_deleted", "true".to_string()),
+            ])
+            .header(header::ACCEPT, "application/json")
+            .bearer_auth(&self.token)
+            .send()
+            .await
+            .map_err(|e| {
+                Error::new(
+                    "Find branch for timeline".to_string(),
+                    ErrorKind::RequestSend(e),
+                )
+            })?;

-                let response: AdminApiResponse<Vec<BranchData>> =
-                    response.json().await.map_err(|e| {
-                        Error::new(
-                            "Find branch for timeline".to_string(),
-                            ErrorKind::BodyRead(e),
-                        )
-                    })?;
-                Ok(response)
-            },
-            "find_timeline_branch",
-        )
-        .await?;
-
-        let mut branches: Vec<BranchData> = response.data.into_iter().collect();
-        // Normally timeline_id is unique. However, we do have at least one case
-        // of the same timeline_id in two different projects, apparently after
-        // manual recovery. So always recheck project_id (discovered through
-        // tenant_id).
-        let project_data = match self.find_tenant_project(tenant_id).await? {
-            Some(pd) => pd,
-            None => return Ok(None),
-        };
-        branches.retain(|b| b.project_id == project_data.id);
-        if branches.len() < 2 {
-            Ok(branches.first().cloned())
-        } else {
-            Err(Error::new(
-                format!(
-                    "Find branch for timeline {}/{} returned {} branches instead of 0 or 1",
-                    tenant_id,
-                    timeline_id,
-                    branches.len()
-                ),
+        let response: AdminApiResponse<Vec<BranchData>> = response.json().await.map_err(|e| {
+            Error::new(
+                "Find branch for timeline".to_string(),
+                ErrorKind::BodyRead(e),
+            )
+        })?;
+        match response.data.len() {
+            0 => Ok(None),
+            1 => Ok(Some(
+                response
+                    .data
+                    .into_iter()
+                    .next()
+                    .expect("Should have exactly one element"),
+            )),
+            too_many => Err(Error::new(
+                format!("Find branch for timeline returned {too_many} branches instead of 0 or 1"),
                ErrorKind::UnexpectedState,
-            ))
+            )),
        }
    }

@@ -554,15 +532,4 @@ impl CloudAdminApiClient {
            .parse()
            .unwrap_or_else(|e| panic!("Could not append {subpath} to base url: {e}"))
    }
-
-    async fn with_retries<T, O, F>(op: O, description: &str) -> Result<T, Error>
-    where
-        O: FnMut() -> F,
-        F: Future<Output = Result<T, Error>>,
-    {
-        let cancel = CancellationToken::new(); // not really used
-        backoff::retry(op, |_| false, 1, 20, description, &cancel)
-            .await
-            .expect("cancellations are disabled")
-    }
 }
--- a/s3_scrubber/src/garbage.rs
+++ b/s3_scrubber/src/garbage.rs
@@ -60,7 +60,6 @@ pub struct GarbageList {
    /// see garbage, we saw some active tenants too.  This protects against classes of bugs
    /// in the scrubber that might otherwise generate a "deleted all" result.
    active_tenant_count: usize,
-    active_timeline_count: usize,
 }

 impl GarbageList {
@@ -68,7 +67,6 @@ impl GarbageList {
        Self {
            items: Vec::new(),
            active_tenant_count: 0,
-            active_timeline_count: 0,
            node_kind,
            bucket_config,
        }
@@ -121,10 +119,7 @@ pub async fn find_garbage(
 const S3_CONCURRENCY: usize = 32;

 // How many concurrent API requests to make to the console API.
-//
-// Be careful increasing this; roughly we shouldn't have more than ~100 rps. It
-// would be better to implement real rsp limiter.
-const CONSOLE_CONCURRENCY: usize = 16;
+const CONSOLE_CONCURRENCY: usize = 128;

 struct ConsoleCache {
    /// Set of tenants found in the control plane API
@@ -226,7 +221,6 @@ async fn find_garbage_inner(
        } else {
            tracing::debug!("Tenant {tenant_shard_id} is active");
            active_tenants.push(tenant_shard_id);
-            garbage.active_tenant_count = active_tenants.len();
        }

        counter += 1;
@@ -267,7 +261,7 @@ async fn find_garbage_inner(
        let api_client = cloud_admin_api_client.clone();
        async move {
            api_client
-                .find_timeline_branch(ttid.tenant_shard_id.tenant_id, ttid.timeline_id)
+                .find_timeline_branch(ttid.timeline_id)
                .await
                .map_err(|e| anyhow::anyhow!(e))
                .map(|r| (ttid, r))
@@ -277,29 +271,15 @@ async fn find_garbage_inner(
        std::pin::pin!(timelines_checked.try_buffer_unordered(CONSOLE_CONCURRENCY));

    // Update the GarbageList with any timelines which appear not to exist.
-    let mut active_timelines: Vec<TenantShardTimelineId> = vec![];
    while let Some(result) = timelines_checked.next().await {
        let (ttid, console_result) = result?;
        if garbage.maybe_append(GarbageEntity::Timeline(ttid), console_result) {
            tracing::debug!("Timeline {ttid} is garbage");
        } else {
            tracing::debug!("Timeline {ttid} is active");
-            active_timelines.push(ttid);
-            garbage.active_timeline_count = active_timelines.len();
        }
    }

-    let num_garbage_timelines = garbage
-        .items
-        .iter()
-        .filter(|g| matches!(g.entity, GarbageEntity::Timeline(_)))
-        .count();
-    tracing::info!(
-        "Found {}/{} garbage timelines in active tenants",
-        num_garbage_timelines,
-        active_timelines.len(),
-    );
-
    Ok(garbage)
 }

@@ -364,22 +344,16 @@ pub async fn get_timeline_objects(
 const MAX_KEYS_PER_DELETE: usize = 1000;

 /// Drain a buffer of keys into DeleteObjects requests
-///
-/// If `drain` is true, drains keys completely; otherwise stops when <
-/// MAX_KEYS_PER_DELETE keys are left.
-/// `num_deleted` returns number of deleted keys.
 async fn do_delete(
    s3_client: &Arc<Client>,
    bucket_name: &str,
    keys: &mut Vec<ObjectIdentifier>,
    dry_run: bool,
    drain: bool,
-    progress_tracker: &mut DeletionProgressTracker,
 ) -> anyhow::Result<()> {
    while (!keys.is_empty() && drain) || (keys.len() >= MAX_KEYS_PER_DELETE) {
        let request_keys =
            keys.split_off(keys.len() - (std::cmp::min(MAX_KEYS_PER_DELETE, keys.len())));
-        let num_deleted = request_keys.len();
        if dry_run {
            tracing::info!("Dry-run deletion of objects: ");
            for k in request_keys {
@@ -394,30 +368,12 @@ async fn do_delete(
                .send()
                .await
                .context("DeleteObjects request")?;
-            progress_tracker.register(num_deleted);
        }
    }

    Ok(())
 }

-/// Simple tracker reporting each 10k deleted keys.
-#[derive(Default)]
-struct DeletionProgressTracker {
-    num_deleted: usize,
-    last_reported_num_deleted: usize,
-}
-
-impl DeletionProgressTracker {
-    fn register(&mut self, n: usize) {
-        self.num_deleted += n;
-        if self.num_deleted - self.last_reported_num_deleted > 10000 {
-            tracing::info!("progress: deleted {} keys", self.num_deleted);
-            self.last_reported_num_deleted = self.num_deleted;
-        }
-    }
-}
-
 pub async fn purge_garbage(
    input_path: String,
    mode: PurgeMode,
@@ -438,14 +394,6 @@ pub async fn purge_garbage(
    if garbage_list.active_tenant_count == 0 {
        anyhow::bail!("Refusing to purge a garbage list that reports 0 active tenants");
    }
-    if garbage_list
-        .items
-        .iter()
-        .any(|g| matches!(g.entity, GarbageEntity::Timeline(_)))
-        && garbage_list.active_timeline_count == 0
-    {
-        anyhow::bail!("Refusing to purge a garbage list containing garbage timelines that reports 0 active timelines");
-    }

    let filtered_items = garbage_list
        .items
@@ -481,7 +429,6 @@ pub async fn purge_garbage(
        std::pin::pin!(get_objects_results.try_buffer_unordered(S3_CONCURRENCY));

    let mut objects_to_delete = Vec::new();
-    let mut progress_tracker = DeletionProgressTracker::default();
    while let Some(result) = get_objects_results.next().await {
        let mut object_list = result?;
        objects_to_delete.append(&mut object_list);
@@ -492,7 +439,6 @@ pub async fn purge_garbage(
                &mut objects_to_delete,
                dry_run,
                false,
-                &mut progress_tracker,
            )
            .await?;
        }
@@ -504,11 +450,10 @@ pub async fn purge_garbage(
        &mut objects_to_delete,
        dry_run,
        true,
-        &mut progress_tracker,
    )
    .await?;

-    tracing::info!("{} keys deleted in total", progress_tracker.num_deleted);
+    tracing::info!("Fell through");

    Ok(())
 }
--- a/s3_scrubber/src/lib.rs
+++ b/s3_scrubber/src/lib.rs
@@ -4,8 +4,7 @@ pub mod checks;
 pub mod cloud_admin_api;
 pub mod garbage;
 pub mod metadata_stream;
-pub mod scan_pageserver_metadata;
-pub mod scan_safekeeper_metadata;
+pub mod scan_metadata;
 pub mod tenant_snapshot;

 use std::env;
@@ -142,17 +141,12 @@ impl RootTarget {
    pub fn tenants_root(&self) -> S3Target {
        match self {
            Self::Pageserver(root) => root.with_sub_segment(TENANTS_SEGMENT_NAME),
-            Self::Safekeeper(root) => root.clone(),
+            Self::Safekeeper(root) => root.with_sub_segment("wal"),
        }
    }

    pub fn tenant_root(&self, tenant_id: &TenantShardId) -> S3Target {
-        match self {
-            Self::Pageserver(_) => self.tenants_root().with_sub_segment(&tenant_id.to_string()),
-            Self::Safekeeper(_) => self
-                .tenants_root()
-                .with_sub_segment(&tenant_id.tenant_id.to_string()),
-        }
+        self.tenants_root().with_sub_segment(&tenant_id.to_string())
    }

    pub(crate) fn tenant_shards_prefix(&self, tenant_id: &TenantId) -> S3Target {
@@ -343,7 +337,9 @@ fn init_remote(
        }),
        NodeKind::Safekeeper => RootTarget::Safekeeper(S3Target {
            bucket_name: bucket_config.bucket,
-            prefix_in_bucket: bucket_config.prefix_in_bucket.unwrap_or("wal/".to_string()),
+            prefix_in_bucket: bucket_config
+                .prefix_in_bucket
+                .unwrap_or("safekeeper/v1".to_string()),
            delimiter,
        }),
    };
@@ -368,10 +364,7 @@ async fn list_objects_with_retries(
        {
            Ok(response) => return Ok(response),
            Err(e) => {
-                error!(
-                    "list_objects_v2 query failed: {e}, bucket_name={}, prefix={}, delimiter={}",
-                    s3_target.bucket_name, s3_target.prefix_in_bucket, s3_target.delimiter
-                );
+                error!("list_objects_v2 query failed: {e}");
                tokio::time::sleep(Duration::from_secs(1)).await;
            }
        }
--- a/s3_scrubber/src/main.rs
+++ b/s3_scrubber/src/main.rs
@@ -1,13 +1,9 @@
-use anyhow::bail;
 use camino::Utf8PathBuf;
 use pageserver_api::shard::TenantShardId;
 use s3_scrubber::garbage::{find_garbage, purge_garbage, PurgeMode};
-use s3_scrubber::scan_pageserver_metadata::scan_metadata;
+use s3_scrubber::scan_metadata::scan_metadata;
 use s3_scrubber::tenant_snapshot::SnapshotDownloader;
-use s3_scrubber::{
-    init_logging, scan_safekeeper_metadata::scan_safekeeper_metadata, BucketConfig, ConsoleConfig,
-    NodeKind, TraversingDepth,
-};
+use s3_scrubber::{init_logging, BucketConfig, ConsoleConfig, NodeKind, TraversingDepth};

 use clap::{Parser, Subcommand};
 use utils::id::TenantId;
@@ -39,20 +35,11 @@ enum Command {
        #[arg(short, long, default_value_t = PurgeMode::DeletedOnly)]
        mode: PurgeMode,
    },
-    #[command(verbatim_doc_comment)]
    ScanMetadata {
-        #[arg(short, long)]
-        node_kind: NodeKind,
        #[arg(short, long, default_value_t = false)]
        json: bool,
        #[arg(long = "tenant-id", num_args = 0..)]
        tenant_ids: Vec<TenantShardId>,
-        #[arg(long, default_value = None)]
-        /// For safekeeper node_kind only, points to db with debug dump
-        dump_db_connstr: Option<String>,
-        /// For safekeeper node_kind only, table in the db with debug dump
-        #[arg(long, default_value = None)]
-        dump_db_table: Option<String>,
    },
    TenantSnapshot {
        #[arg(long = "tenant-id")]
@@ -85,75 +72,33 @@ async fn main() -> anyhow::Result<()> {
    ));

    match cli.command {
-        Command::ScanMetadata {
-            json,
-            tenant_ids,
-            node_kind,
-            dump_db_connstr,
-            dump_db_table,
-        } => {
-            if let NodeKind::Safekeeper = node_kind {
-                let dump_db_connstr =
-                    dump_db_connstr.ok_or(anyhow::anyhow!("dump_db_connstr not specified"))?;
-                let dump_db_table =
-                    dump_db_table.ok_or(anyhow::anyhow!("dump_db_table not specified"))?;
-
-                let summary = scan_safekeeper_metadata(
-                    bucket_config.clone(),
-                    tenant_ids.iter().map(|tshid| tshid.tenant_id).collect(),
-                    dump_db_connstr,
-                    dump_db_table,
-                )
-                .await?;
-                if json {
-                    println!("{}", serde_json::to_string(&summary).unwrap())
-                } else {
-                    println!("{}", summary.summary_string());
+        Command::ScanMetadata { json, tenant_ids } => {
+            match scan_metadata(bucket_config.clone(), tenant_ids).await {
+                Err(e) => {
+                    tracing::error!("Failed: {e}");
+                    Err(e)
                }
-                if summary.is_fatal() {
-                    bail!("Fatal scrub errors detected");
-                }
-                if summary.is_empty() {
-                    // Strictly speaking an empty bucket is a valid bucket, but if someone ran the
-                    // scrubber they were likely expecting to scan something, and if we see no timelines
-                    // at all then it's likely due to some configuration issues like a bad prefix
-                    bail!(
-                        "No timelines found in bucket {} prefix {}",
-                        bucket_config.bucket,
-                        bucket_config
-                            .prefix_in_bucket
-                            .unwrap_or("<none>".to_string())
-                    );
-                }
-                Ok(())
-            } else {
-                match scan_metadata(bucket_config.clone(), tenant_ids).await {
-                    Err(e) => {
-                        tracing::error!("Failed: {e}");
-                        Err(e)
+                Ok(summary) => {
+                    if json {
+                        println!("{}", serde_json::to_string(&summary).unwrap())
+                    } else {
+                        println!("{}", summary.summary_string());
                    }
-                    Ok(summary) => {
-                        if json {
-                            println!("{}", serde_json::to_string(&summary).unwrap())
-                        } else {
-                            println!("{}", summary.summary_string());
-                        }
-                        if summary.is_fatal() {
-                            Err(anyhow::anyhow!("Fatal scrub errors detected"))
-                        } else if summary.is_empty() {
-                            // Strictly speaking an empty bucket is a valid bucket, but if someone ran the
-                            // scrubber they were likely expecting to scan something, and if we see no timelines
-                            // at all then it's likely due to some configuration issues like a bad prefix
-                            Err(anyhow::anyhow!(
-                                "No timelines found in bucket {} prefix {}",
-                                bucket_config.bucket,
-                                bucket_config
-                                    .prefix_in_bucket
-                                    .unwrap_or("<none>".to_string())
-                            ))
-                        } else {
-                            Ok(())
-                        }
+                    if summary.is_fatal() {
+                        Err(anyhow::anyhow!("Fatal scrub errors detected"))
+                    } else if summary.is_empty() {
+                        // Strictly speaking an empty bucket is a valid bucket, but if someone ran the
+                        // scrubber they were likely expecting to scan something, and if we see no timelines
+                        // at all then it's likely due to some configuration issues like a bad prefix
+                        Err(anyhow::anyhow!(
+                            "No timelines found in bucket {} prefix {}",
+                            bucket_config.bucket,
+                            bucket_config
+                                .prefix_in_bucket
+                                .unwrap_or("<none>".to_string())
+                        ))
+                    } else {
+                        Ok(())
                    }
                }
            }
--- a/s3_scrubber/src/metadata_stream.rs
+++ b/s3_scrubber/src/metadata_stream.rs
@@ -114,7 +114,7 @@ pub async fn stream_tenant_timelines<'a>(
    let timelines_target = target.timelines_root(&tenant);

    loop {
-        tracing::debug!("Listing in {}", tenant);
+        tracing::info!("Listing in {}", tenant);
        let fetch_response =
            list_objects_with_retries(s3_client, &timelines_target, continuation_token.clone())
                .await;
@@ -151,7 +151,7 @@ pub async fn stream_tenant_timelines<'a>(
        }
    }

-    tracing::debug!("Yielding for {}", tenant);
+    tracing::info!("Yielding for {}", tenant);
    Ok(stream! {
        for i in timeline_ids {
            let id = i?;
--- a/s3_scrubber/src/scan_pageserver_metadata.rs
+++ b/s3_scrubber/src/scan_pageserver_metadata.rs
--- a/s3_scrubber/src/scan_safekeeper_metadata.rs
+++ b/s3_scrubber/src/scan_safekeeper_metadata.rs
@@ -1,236 +0,0 @@
-use std::{collections::HashSet, str::FromStr};
-
-use aws_sdk_s3::Client;
-use futures::stream::{StreamExt, TryStreamExt};
-use pageserver_api::shard::TenantShardId;
-use postgres_ffi::{XLogFileName, PG_TLI};
-use serde::Serialize;
-use tokio_postgres::types::PgLsn;
-use tracing::{error, info, trace};
-use utils::{
-    id::{TenantId, TenantTimelineId, TimelineId},
-    lsn::Lsn,
-};
-
-use crate::{
-    cloud_admin_api::CloudAdminApiClient, init_remote, metadata_stream::stream_listing,
-    BucketConfig, ConsoleConfig, NodeKind, RootTarget, TenantShardTimelineId,
-};
-
-/// Generally we should ask safekeepers, but so far we use everywhere default 16MB.
-const WAL_SEGSIZE: usize = 16 * 1024 * 1024;
-
-#[derive(Serialize)]
-pub struct MetadataSummary {
-    timeline_count: usize,
-    with_errors: HashSet<TenantTimelineId>,
-    deleted_count: usize,
-}
-
-impl MetadataSummary {
-    fn new() -> Self {
-        Self {
-            timeline_count: 0,
-            with_errors: HashSet::new(),
-            deleted_count: 0,
-        }
-    }
-
-    pub fn summary_string(&self) -> String {
-        format!(
-            "timeline_count: {}, with_errors: {}",
-            self.timeline_count,
-            self.with_errors.len()
-        )
-    }
-
-    pub fn is_empty(&self) -> bool {
-        self.timeline_count == 0
-    }
-
-    pub fn is_fatal(&self) -> bool {
-        !self.with_errors.is_empty()
-    }
-}
-
-/// Scan the safekeeper metadata in an S3 bucket, reporting errors and
-/// statistics.
-///
-/// It works by listing timelines along with timeline_start_lsn and backup_lsn
-/// in debug dump in dump_db_table and verifying its s3 contents. If some WAL
-/// segments are missing, before complaining control plane is queried to check if
-/// the project wasn't deleted in the meanwhile.
-pub async fn scan_safekeeper_metadata(
-    bucket_config: BucketConfig,
-    tenant_ids: Vec<TenantId>,
-    dump_db_connstr: String,
-    dump_db_table: String,
-) -> anyhow::Result<MetadataSummary> {
-    info!(
-        "checking bucket {}, region {}, dump_db_table {}",
-        bucket_config.bucket, bucket_config.region, dump_db_table
-    );
-    // Use the native TLS implementation (Neon requires TLS)
-    let tls_connector =
-        postgres_native_tls::MakeTlsConnector::new(native_tls::TlsConnector::new().unwrap());
-    let (client, connection) = tokio_postgres::connect(&dump_db_connstr, tls_connector).await?;
-    // The connection object performs the actual communication with the database,
-    // so spawn it off to run on its own.
-    tokio::spawn(async move {
-        if let Err(e) = connection.await {
-            eprintln!("connection error: {}", e);
-        }
-    });
-
-    let tenant_filter_clause = if !tenant_ids.is_empty() {
-        format!(
-            "and tenant_id in ({})",
-            tenant_ids
-                .iter()
-                .map(|t| format!("'{}'", t))
-                .collect::<Vec<_>>()
-                .join(", ")
-        )
-    } else {
-        "".to_owned()
-    };
-    let query = format!(
-        "select tenant_id, timeline_id, min(timeline_start_lsn), max(backup_lsn) from \"{}\" where not is_cancelled {} group by tenant_id, timeline_id;",
-        dump_db_table, tenant_filter_clause,
-    );
-    info!("query is {}", query);
-    let timelines = client.query(&query, &[]).await?;
-    info!("loaded {} timelines", timelines.len());
-
-    let (s3_client, target) = init_remote(bucket_config, NodeKind::Safekeeper)?;
-    let console_config = ConsoleConfig::from_env()?;
-    let cloud_admin_api_client = CloudAdminApiClient::new(console_config);
-
-    let checks = futures::stream::iter(timelines.iter().map(Ok)).map_ok(|row| {
-        let tenant_id = TenantId::from_str(row.get(0)).expect("failed to parse tenant_id");
-        let timeline_id = TimelineId::from_str(row.get(1)).expect("failed to parse tenant_id");
-        let timeline_start_lsn_pg: PgLsn = row.get(2);
-        let timeline_start_lsn: Lsn = Lsn(u64::from(timeline_start_lsn_pg));
-        let backup_lsn_pg: PgLsn = row.get(3);
-        let backup_lsn: Lsn = Lsn(u64::from(backup_lsn_pg));
-        let ttid = TenantTimelineId::new(tenant_id, timeline_id);
-        check_timeline(
-            &s3_client,
-            &target,
-            &cloud_admin_api_client,
-            ttid,
-            timeline_start_lsn,
-            backup_lsn,
-        )
-    });
-    // Run multiple check_timeline's concurrently.
-    const CONCURRENCY: usize = 32;
-    let mut timelines = checks.try_buffered(CONCURRENCY);
-
-    let mut summary = MetadataSummary::new();
-    while let Some(r) = timelines.next().await {
-        let res = r?;
-        summary.timeline_count += 1;
-        if !res.is_ok {
-            summary.with_errors.insert(res.ttid);
-        }
-        if res.is_deleted {
-            summary.deleted_count += 1;
-        }
-    }
-
-    Ok(summary)
-}
-
-struct TimelineCheckResult {
-    ttid: TenantTimelineId,
-    is_ok: bool,
-    is_deleted: bool, // timeline is deleted in cplane
-}
-
-/// List s3 and check that is has all expected WAL for the ttid. Consistency
-/// errors are logged to stderr; returns Ok(true) if timeline is consistent,
-/// Ok(false) if not, Err if failed to check.
-async fn check_timeline(
-    s3_client: &Client,
-    root: &RootTarget,
-    api_client: &CloudAdminApiClient,
-    ttid: TenantTimelineId,
-    timeline_start_lsn: Lsn,
-    backup_lsn: Lsn,
-) -> anyhow::Result<TimelineCheckResult> {
-    trace!(
-        "checking ttid {}, should contain WAL [{}-{}]",
-        ttid,
-        timeline_start_lsn,
-        backup_lsn
-    );
-    // calculate expected segfiles
-    let expected_first_segno = timeline_start_lsn.segment_number(WAL_SEGSIZE);
-    let expected_last_segno = backup_lsn.segment_number(WAL_SEGSIZE);
-    let mut expected_segfiles: HashSet<String> = HashSet::from_iter(
-        (expected_first_segno..expected_last_segno)
-            .map(|segno| XLogFileName(PG_TLI, segno, WAL_SEGSIZE)),
-    );
-    let expected_files_num = expected_segfiles.len();
-    trace!("expecting {} files", expected_segfiles.len(),);
-
-    // now list s3 and check if it misses something
-    let ttshid =
-        TenantShardTimelineId::new(TenantShardId::unsharded(ttid.tenant_id), ttid.timeline_id);
-    let mut timeline_dir_target = root.timeline_root(&ttshid);
-    // stream_listing yields only common_prefixes if delimiter is not empty, but
-    // we need files, so unset it.
-    timeline_dir_target.delimiter = String::new();
-
-    let mut stream = std::pin::pin!(stream_listing(s3_client, &timeline_dir_target));
-    while let Some(obj) = stream.next().await {
-        let obj = obj?;
-        let key = obj.key();
-
-        let seg_name = key
-            .strip_prefix(&timeline_dir_target.prefix_in_bucket)
-            .expect("failed to extract segment name");
-        expected_segfiles.remove(seg_name);
-    }
-    if !expected_segfiles.is_empty() {
-        // Before complaining check cplane, probably timeline is already deleted.
-        let bdata = api_client
-            .find_timeline_branch(ttid.tenant_id, ttid.timeline_id)
-            .await?;
-        let deleted = match bdata {
-            Some(bdata) => bdata.deleted,
-            None => {
-                // note: should be careful with selecting proper cplane address
-                info!("ttid {} not found, assuming it is deleted", ttid);
-                true
-            }
-        };
-        if deleted {
-            // ok, branch is deleted
-            return Ok(TimelineCheckResult {
-                ttid,
-                is_ok: true,
-                is_deleted: true,
-            });
-        }
-        error!(
-            "ttid {}: missing {} files out of {}, timeline_start_lsn {}, wal_backup_lsn {}",
-            ttid,
-            expected_segfiles.len(),
-            expected_files_num,
-            timeline_start_lsn,
-            backup_lsn,
-        );
-        return Ok(TimelineCheckResult {
-            ttid,
-            is_ok: false,
-            is_deleted: false,
-        });
-    }
-    Ok(TimelineCheckResult {
-        ttid,
-        is_ok: true,
-        is_deleted: false,
-    })
-}
--- a/safekeeper/src/bin/safekeeper.rs
+++ b/safekeeper/src/bin/safekeeper.rs
@@ -177,10 +177,6 @@ struct Args {
    /// Controls how long backup will wait until uploading the partial segment.
    #[arg(long, value_parser = humantime::parse_duration, default_value = DEFAULT_PARTIAL_BACKUP_TIMEOUT, verbatim_doc_comment)]
    partial_backup_timeout: Duration,
-    /// Disable task to push messages to broker every second. Supposed to
-    /// be used in tests.
-    #[arg(long)]
-    disable_periodic_broker_push: bool,
 }

 // Like PathBufValueParser, but allows empty string.
@@ -313,7 +309,6 @@ async fn main() -> anyhow::Result<()> {
        walsenders_keep_horizon: args.walsenders_keep_horizon,
        partial_backup_enabled: args.partial_backup_enabled,
        partial_backup_timeout: args.partial_backup_timeout,
-        disable_periodic_broker_push: args.disable_periodic_broker_push,
    };

    // initialize sentry if SENTRY_DSN is provided
--- a/safekeeper/src/broker.rs
+++ b/safekeeper/src/broker.rs
@@ -10,20 +10,11 @@ use anyhow::Result;
 use storage_broker::parse_proto_ttid;

 use storage_broker::proto::subscribe_safekeeper_info_request::SubscriptionKey as ProtoSubscriptionKey;
-use storage_broker::proto::FilterTenantTimelineId;
-use storage_broker::proto::MessageType;
-use storage_broker::proto::SafekeeperDiscoveryResponse;
-use storage_broker::proto::SubscribeByFilterRequest;
 use storage_broker::proto::SubscribeSafekeeperInfoRequest;
-use storage_broker::proto::TypeSubscription;
-use storage_broker::proto::TypedMessage;
 use storage_broker::Request;

-use std::sync::atomic::AtomicU64;
-use std::sync::Arc;
 use std::time::Duration;
 use std::time::Instant;
-use std::time::UNIX_EPOCH;
 use tokio::task::JoinHandle;
 use tokio::time::sleep;
 use tracing::*;
@@ -40,12 +31,6 @@ const PUSH_INTERVAL_MSEC: u64 = 1000;

 /// Push once in a while data about all active timelines to the broker.
 async fn push_loop(conf: SafeKeeperConf) -> anyhow::Result<()> {
-    if conf.disable_periodic_broker_push {
-        info!("broker push_loop is disabled, doing nothing...");
-        futures::future::pending::<()>().await; // sleep forever
-        return Ok(());
-    }
-
    let mut client =
        storage_broker::connect(conf.broker_endpoint.clone(), conf.broker_keepalive_interval)?;
    let push_interval = Duration::from_millis(PUSH_INTERVAL_MSEC);
@@ -90,7 +75,7 @@ async fn push_loop(conf: SafeKeeperConf) -> anyhow::Result<()> {
 }

 /// Subscribe and fetch all the interesting data from the broker.
-async fn pull_loop(conf: SafeKeeperConf, stats: Arc<BrokerStats>) -> Result<()> {
+async fn pull_loop(conf: SafeKeeperConf) -> Result<()> {
    let mut client = storage_broker::connect(conf.broker_endpoint, conf.broker_keepalive_interval)?;

    // TODO: subscribe only to local timelines instead of all
@@ -109,8 +94,6 @@ async fn pull_loop(conf: SafeKeeperConf, stats: Arc<BrokerStats>) -> Result<()>
    let err_counter = BROKER_PULLED_UPDATES.with_label_values(&["error"]);

    while let Some(msg) = stream.message().await? {
-        stats.update_pulled();
-
        let proto_ttid = msg
            .tenant_timeline_id
            .as_ref()
@@ -136,93 +119,12 @@ async fn pull_loop(conf: SafeKeeperConf, stats: Arc<BrokerStats>) -> Result<()>
    bail!("end of stream");
 }

-/// Process incoming discover requests. This is done in a separate task to avoid
-/// interfering with the normal pull/push loops.
-async fn discover_loop(conf: SafeKeeperConf, stats: Arc<BrokerStats>) -> Result<()> {
-    let mut client =
-        storage_broker::connect(conf.broker_endpoint.clone(), conf.broker_keepalive_interval)?;
-
-    let request = SubscribeByFilterRequest {
-        types: vec![TypeSubscription {
-            r#type: MessageType::SafekeeperDiscoveryRequest as i32,
-        }],
-        tenant_timeline_id: Some(FilterTenantTimelineId {
-            enabled: false,
-            tenant_timeline_id: None,
-        }),
-    };
-
-    let mut stream = client
-        .subscribe_by_filter(request)
-        .await
-        .context("subscribe_by_filter request failed")?
-        .into_inner();
-
-    let discover_counter = BROKER_PULLED_UPDATES.with_label_values(&["discover"]);
-
-    while let Some(typed_msg) = stream.message().await? {
-        stats.update_pulled();
-
-        match typed_msg.r#type() {
-            MessageType::SafekeeperDiscoveryRequest => {
-                let msg = typed_msg
-                    .safekeeper_discovery_request
-                    .expect("proto type mismatch from broker message");
-
-                let proto_ttid = msg
-                    .tenant_timeline_id
-                    .as_ref()
-                    .ok_or_else(|| anyhow!("missing tenant_timeline_id"))?;
-                let ttid = parse_proto_ttid(proto_ttid)?;
-                if let Ok(tli) = GlobalTimelines::get(ttid) {
-                    // we received a discovery request for a timeline we know about
-                    discover_counter.inc();
-
-                    // create and reply with discovery response
-                    let sk_info = tli.get_safekeeper_info(&conf).await;
-                    let response = SafekeeperDiscoveryResponse {
-                        safekeeper_id: sk_info.safekeeper_id,
-                        tenant_timeline_id: sk_info.tenant_timeline_id,
-                        commit_lsn: sk_info.commit_lsn,
-                        safekeeper_connstr: sk_info.safekeeper_connstr,
-                        availability_zone: sk_info.availability_zone,
-                    };
-
-                    // note this is a blocking call
-                    client
-                        .publish_one(TypedMessage {
-                            r#type: MessageType::SafekeeperDiscoveryResponse as i32,
-                            safekeeper_timeline_info: None,
-                            safekeeper_discovery_request: None,
-                            safekeeper_discovery_response: Some(response),
-                        })
-                        .await?;
-                }
-            }
-
-            _ => {
-                warn!(
-                    "unexpected message type i32 {}, {:?}",
-                    typed_msg.r#type,
-                    typed_msg.r#type()
-                );
-            }
-        }
-    }
-    bail!("end of stream");
-}
-
 pub async fn task_main(conf: SafeKeeperConf) -> anyhow::Result<()> {
    info!("started, broker endpoint {:?}", conf.broker_endpoint);

    let mut ticker = tokio::time::interval(Duration::from_millis(RETRY_INTERVAL_MSEC));
    let mut push_handle: Option<JoinHandle<Result<(), Error>>> = None;
    let mut pull_handle: Option<JoinHandle<Result<(), Error>>> = None;
-    let mut discover_handle: Option<JoinHandle<Result<(), Error>>> = None;
-
-    let stats = Arc::new(BrokerStats::new());
-    let stats_task = task_stats(stats.clone());
-    tokio::pin!(stats_task);

    // Selecting on JoinHandles requires some squats; is there a better way to
    // reap tasks individually?
@@ -251,77 +153,13 @@ pub async fn task_main(conf: SafeKeeperConf) -> anyhow::Result<()> {
                    };
                    pull_handle = None;
                },
-                res = async { discover_handle.as_mut().unwrap().await }, if discover_handle.is_some() => {
-                    // was it panic or normal error?
-                    match res {
-                        Ok(res_internal) => if let Err(err_inner) = res_internal {
-                            warn!("discover task failed: {:?}", err_inner);
-                        }
-                        Err(err_outer) => { warn!("discover task panicked: {:?}", err_outer) }
-                    };
-                    discover_handle = None;
-                },
                _ = ticker.tick() => {
                    if push_handle.is_none() {
                        push_handle = Some(tokio::spawn(push_loop(conf.clone())));
                    }
                    if pull_handle.is_none() {
-                        pull_handle = Some(tokio::spawn(pull_loop(conf.clone(), stats.clone())));
+                        pull_handle = Some(tokio::spawn(pull_loop(conf.clone())));
                    }
-                    if discover_handle.is_none() {
-                        discover_handle = Some(tokio::spawn(discover_loop(conf.clone(), stats.clone())));
-                    }
-                },
-                _ = &mut stats_task => {}
-        }
-    }
-}
-
-struct BrokerStats {
-    /// Timestamp of the last received message from the broker.
-    last_pulled_ts: AtomicU64,
-}
-
-impl BrokerStats {
-    fn new() -> Self {
-        BrokerStats {
-            last_pulled_ts: AtomicU64::new(0),
-        }
-    }
-
-    fn now_millis() -> u64 {
-        std::time::SystemTime::now()
-            .duration_since(UNIX_EPOCH)
-            .expect("time is before epoch")
-            .as_millis() as u64
-    }
-
-    /// Update last_pulled timestamp to current time.
-    fn update_pulled(&self) {
-        self.last_pulled_ts
-            .store(Self::now_millis(), std::sync::atomic::Ordering::Relaxed);
-    }
-}
-
-/// Periodically write to logs if there are issues with receiving data from the broker.
-async fn task_stats(stats: Arc<BrokerStats>) {
-    let warn_duration = Duration::from_secs(10);
-    let mut ticker = tokio::time::interval(warn_duration);
-
-    loop {
-        tokio::select! {
-            _ = ticker.tick() => {
-                let last_pulled = stats.last_pulled_ts.load(std::sync::atomic::Ordering::SeqCst);
-                if last_pulled == 0 {
-                    // no broker updates yet
-                    continue;
-                }
-
-                let now = BrokerStats::now_millis();
-                if now > last_pulled && now - last_pulled > warn_duration.as_millis() as u64 {
-                    let ts = chrono::NaiveDateTime::from_timestamp_millis(last_pulled as i64).expect("invalid timestamp");
-                    info!("no broker updates for some time, last update: {:?}", ts);
-                }
            }
        }
    }
--- a/safekeeper/src/lib.rs
+++ b/safekeeper/src/lib.rs
@@ -83,7 +83,6 @@ pub struct SafeKeeperConf {
    pub walsenders_keep_horizon: bool,
    pub partial_backup_enabled: bool,
    pub partial_backup_timeout: Duration,
-    pub disable_periodic_broker_push: bool,
 }

 impl SafeKeeperConf {
@@ -130,7 +129,6 @@ impl SafeKeeperConf {
            walsenders_keep_horizon: false,
            partial_backup_enabled: false,
            partial_backup_timeout: Duration::from_secs(0),
-            disable_periodic_broker_push: false,
        }
    }
 }
--- a/safekeeper/tests/walproposer_sim/safekeeper.rs
+++ b/safekeeper/tests/walproposer_sim/safekeeper.rs
@@ -178,7 +178,6 @@ pub fn run_server(os: NodeOs, disk: Arc<SafekeeperDisk>) -> Result<()> {
        walsenders_keep_horizon: false,
        partial_backup_enabled: false,
        partial_backup_timeout: Duration::from_secs(0),
-        disable_periodic_broker_push: false,
    };

    let mut global = GlobalMap::new(disk, conf.clone())?;
--- a/storage_broker/src/bin/storage_broker.rs
+++ b/storage_broker/src/bin/storage_broker.rs
@@ -196,13 +196,8 @@ impl SubscriptionKey {

    /// Parse from FilterTenantTimelineId
    pub fn from_proto_filter_tenant_timeline_id(
-        opt: Option<&FilterTenantTimelineId>,
+        f: &FilterTenantTimelineId,
    ) -> Result<Self, Status> {
-        if opt.is_none() {
-            return Ok(SubscriptionKey::All);
-        }
-
-        let f = opt.unwrap();
        if !f.enabled {
            return Ok(SubscriptionKey::All);
        }
@@ -539,7 +534,10 @@ impl BrokerService for Broker {
            .remote_addr()
            .expect("TCPConnectInfo inserted by handler");
        let proto_filter = request.into_inner();
-        let ttid_filter = proto_filter.tenant_timeline_id.as_ref();
+        let ttid_filter = proto_filter
+            .tenant_timeline_id
+            .as_ref()
+            .ok_or_else(|| Status::new(Code::InvalidArgument, "missing tenant_timeline_id"))?;

        let sub_key = SubscriptionKey::from_proto_filter_tenant_timeline_id(ttid_filter)?;
        let types_set = proto_filter
--- a/storage_controller/src/service.rs
+++ b/storage_controller/src/service.rs
@@ -90,11 +90,7 @@ const INITIAL_GENERATION: Generation = Generation::new(0);
 /// up on unresponsive pageservers and proceed.
 pub(crate) const STARTUP_RECONCILE_TIMEOUT: Duration = Duration::from_secs(30);

-/// How long a node may be unresponsive to heartbeats before we declare it offline.
-/// This must be long enough to cover node restarts as well as normal operations: in future
-/// it should be separated into distinct timeouts for startup vs. normal operation
-/// (`<https://github.com/neondatabase/neon/issues/7552>`)
-pub const MAX_UNAVAILABLE_INTERVAL_DEFAULT: Duration = Duration::from_secs(300);
+pub const MAX_UNAVAILABLE_INTERVAL_DEFAULT: Duration = Duration::from_secs(30);

 pub const RECONCILER_CONCURRENCY_DEFAULT: usize = 128;

@@ -4255,9 +4251,7 @@ impl Service {
    /// Check all tenants for pending reconciliation work, and reconcile those in need.
    /// Additionally, reschedule tenants that require it.
    ///
-    /// Returns how many reconciliation tasks were started, or `1` if no reconciles were
-    /// spawned but some _would_ have been spawned if `reconciler_concurrency` units where
-    /// available.  A return value of 0 indicates that everything is fully reconciled already.
+    /// Returns how many reconciliation tasks were started.
    fn reconcile_all(&self) -> usize {
        let mut locked = self.inner.write().unwrap();
        let (nodes, tenants, _scheduler) = locked.parts_mut();
@@ -4272,11 +4266,7 @@ impl Service {
            }

            // Skip checking if this shard is already enqueued for reconciliation
-            if shard.delayed_reconcile && self.reconciler_concurrency.available_permits() == 0 {
-                // If there is something delayed, then return a nonzero count so that
-                // callers like reconcile_all_now do not incorrectly get the impression
-                // that the system is in a quiescent state.
-                reconciles_spawned = std::cmp::max(1, reconciles_spawned);
+            if shard.delayed_reconcile {
                continue;
            }

@@ -4461,7 +4451,7 @@ impl Service {
            waiter_count
        );

-        Ok(std::cmp::max(waiter_count, reconciles_spawned))
+        Ok(waiter_count)
    }

    pub async fn shutdown(&self) {
--- a/storage_controller/src/tenant_shard.rs
+++ b/storage_controller/src/tenant_shard.rs
@@ -952,8 +952,8 @@ impl TenantShard {

    /// Create a waiter that will wait for some future Reconciler that hasn't been spawned yet.
    ///
-    /// This is appropriate when you can't spawn a reconciler (e.g. due to resource limits), but
-    /// you would like to wait on the next reconciler that gets spawned in the background.
+    /// This is appropriate when you can't spawn a reconciler (e.g. due to resource limits), but
+    /// you would like to wait until one gets spawned in the background.
    pub(crate) fn future_reconcile_waiter(&mut self) -> ReconcilerWaiter {
        self.ensure_sequence_ahead();

--- a/test_runner/fixtures/compute_reconfigure.py
+++ b/test_runner/fixtures/compute_reconfigure.py
@@ -14,18 +14,10 @@ class ComputeReconfigure:
        self.server = server
        self.control_plane_compute_hook_api = f"http://{server.host}:{server.port}/notify-attach"
        self.workloads = {}
-        self.on_notify = None

    def register_workload(self, workload):
        self.workloads[workload.tenant_id] = workload

-    def register_on_notify(self, fn):
-        """
-        Add some extra work during a notification, like sleeping to slow things down, or
-        logging what was notified.
-        """
-        self.on_notify = fn
-

@pytest.fixture(scope="function")
 def compute_reconfigure_listener(make_httpserver):
@@ -51,9 +43,6 @@ def compute_reconfigure_listener(make_httpserver):
        body: dict[str, Any] = request.json
        log.info(f"notify-attach request: {body}")

-        if self.on_notify is not None:
-            self.on_notify(body)
-
        try:
            workload = self.workloads[TenantId(body["tenant_id"])]
        except KeyError:
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -499,7 +499,6 @@ class NeonEnvBuilder:
        self.config_init_force: Optional[str] = None
        self.top_output_dir = top_output_dir
        self.control_plane_compute_hook_api: Optional[str] = None
-        self.storage_controller_config: Optional[dict[Any, Any]] = None

        self.pageserver_virtual_file_io_engine: Optional[str] = pageserver_virtual_file_io_engine

@@ -1022,7 +1021,6 @@ class NeonEnv:
        self.pg_distrib_dir = config.pg_distrib_dir
        self.endpoint_counter = 0
        self.pageserver_config_override = config.pageserver_config_override
-        self.storage_controller_config = config.storage_controller_config

        # generate initial tenant ID here instead of letting 'neon init' generate it,
        # so that we don't need to dig it out of the config file afterwards.
@@ -1068,9 +1066,6 @@ class NeonEnv:
        if self.control_plane_compute_hook_api is not None:
            cfg["control_plane_compute_hook_api"] = self.control_plane_compute_hook_api

-        if self.storage_controller_config is not None:
-            cfg["storage_controller"] = self.storage_controller_config
-
        # Create config for pageserver
        http_auth_type = "NeonJWT" if config.auth_enabled else "Trust"
        pg_auth_type = "NeonJWT" if config.auth_enabled else "Trust"
@@ -1139,9 +1134,12 @@ class NeonEnv:
        # bounce through retries on startup
        self.storage_controller.start()

+        def storage_controller_ready():
+            assert self.storage_controller.ready() is True
+
        # Wait for storage controller readiness to prevent unnecessary post start-up
        # reconcile.
-        self.storage_controller.wait_until_ready()
+        wait_until(30, 1, storage_controller_ready)

        # Start up broker, pageserver and all safekeepers
        futs = []
@@ -2045,15 +2043,6 @@ class NeonStorageController(MetricsGetter):
        else:
            raise RuntimeError(f"Unexpected status {status} from readiness endpoint")

-    def wait_until_ready(self):
-        t1 = time.time()
-
-        def storage_controller_ready():
-            assert self.ready() is True
-
-        wait_until(30, 1, storage_controller_ready)
-        return time.time() - t1
-
    def attach_hook_issue(
        self, tenant_shard_id: Union[TenantId, TenantShardId], pageserver_id: int
    ) -> int:
@@ -2141,7 +2130,7 @@ class NeonStorageController(MetricsGetter):
        shard_count: Optional[int] = None,
        shard_stripe_size: Optional[int] = None,
        tenant_config: Optional[Dict[Any, Any]] = None,
-        placement_policy: Optional[Union[Dict[Any, Any] | str]] = None,
+        placement_policy: Optional[str] = None,
    ):
        """
        Use this rather than pageserver_api() when you need to include shard parameters
@@ -2251,21 +2240,10 @@ class NeonStorageController(MetricsGetter):
    def reconcile_until_idle(self, timeout_secs=30):
        start_at = time.time()
        n = 1
-        delay_sec = 0.5
-        delay_max = 5
        while n > 0:
            n = self.reconcile_all()
-            if n == 0:
-                break
-            elif time.time() - start_at > timeout_secs:
+            if time.time() - start_at > timeout_secs:
                raise RuntimeError("Timeout in reconcile_until_idle")
-            else:
-                # Don't call again right away: if we're waiting for many reconciles that
-                # are blocked on the concurrency limit, it slows things down to call
-                # reconcile_all frequently.
-                time.sleep(delay_sec)
-                delay_sec *= 2
-                delay_sec = min(delay_sec, delay_max)

    def consistency_check(self):
        """
@@ -3756,9 +3734,7 @@ class S3Scrubber:
        return stdout

    def scan_metadata(self) -> Any:
-        stdout = self.scrubber_cli(
-            ["scan-metadata", "--node-kind", "pageserver", "--json"], timeout=30
-        )
+        stdout = self.scrubber_cli(["scan-metadata", "--json"], timeout=30)

        try:
            return json.loads(stdout)
--- a/test_runner/performance/test_storage_controller_scale.py
+++ b/test_runner/performance/test_storage_controller_scale.py
@@ -1,198 +0,0 @@
-import concurrent.futures
-import random
-import time
-
-import pytest
-from fixtures.compute_reconfigure import ComputeReconfigure
-from fixtures.log_helper import log
-from fixtures.neon_fixtures import (
-    NeonEnvBuilder,
-)
-from fixtures.pageserver.http import PageserverHttpClient
-from fixtures.pg_version import PgVersion
-from fixtures.types import TenantId, TenantShardId, TimelineId
-
-
-@pytest.mark.timeout(3600)  # super long running test: should go down as we optimize
-def test_storage_controller_many_tenants(
-    neon_env_builder: NeonEnvBuilder, compute_reconfigure_listener: ComputeReconfigure
-):
-    """
-    Check that we cope well with a not-totally-trivial number of tenants.
-
-    This is checking for:
-    - Obvious concurrency bugs from issuing many tenant creations/modifications
-      concurrently.
-    - Obvious scaling bugs like O(N^2) scaling that would be so slow that even
-      a basic test starts failing from slowness.
-
-    This is _not_ a comprehensive scale test: just a basic sanity check that
-    we don't fall over for a thousand shards.
-    """
-
-    neon_env_builder.num_pageservers = 5
-    neon_env_builder.storage_controller_config = {
-        # Default neon_local uses a small timeout: use a longer one to tolerate longer pageserver restarts.
-        # TODO: tune this down as restarts get faster (https://github.com/neondatabase/neon/pull/7553), to
-        # guard against regressions in restart time.
-        "max_unavailable": "300s"
-    }
-    neon_env_builder.control_plane_compute_hook_api = (
-        compute_reconfigure_listener.control_plane_compute_hook_api
-    )
-
-    # A small sleep on each call into the notify hook, to simulate the latency of doing a database write
-    compute_reconfigure_listener.register_on_notify(lambda body: time.sleep(0.01))
-
-    env = neon_env_builder.init_start()
-
-    # We will intentionally stress reconciler concurrrency, which triggers a warning when lots
-    # of shards are hitting the delayed path.
-    env.storage_controller.allowed_errors.append(".*Many shards are waiting to reconcile")
-
-    for ps in env.pageservers:
-        # This can happen because when we do a loop over all pageservers and mark them offline/active,
-        # reconcilers might get cancelled, and the next reconcile can follow a not-so-elegant path of
-        # bumping generation before other attachments are detached.
-        #
-        # We could clean this up by making reconcilers respect the .observed of their predecessor, if
-        # we spawn with a wait for the predecessor.
-        ps.allowed_errors.append(".*Dropped remote consistent LSN updates.*")
-
-        # Storage controller is allowed to drop pageserver requests when the cancellation token
-        # for a Reconciler fires.
-        ps.allowed_errors.append(".*request was dropped before completing.*")
-
-    # Total tenants
-    tenant_count = 4000
-
-    # Shards per tenant
-    shard_count = 2
-    stripe_size = 1024
-
-    tenants = set(TenantId.generate() for _i in range(0, tenant_count))
-
-    virtual_ps_http = PageserverHttpClient(env.storage_controller_port, lambda: True)
-
-    def check_memory():
-        # Shards should be cheap_ in memory, as we will have very many of them
-        expect_memory_per_shard = 128 * 1024
-
-        rss = env.storage_controller.get_metric_value("process_resident_memory_bytes")
-        assert rss is not None
-        log.info(f"Resident memory: {rss} ({ rss / (shard_count * tenant_count)} per shard)")
-        assert rss < expect_memory_per_shard * shard_count * tenant_count
-
-    # We use a fixed seed to make the test somewhat reproducible: we want a randomly
-    # chosen order in the sense that it's arbitrary, but not in the sense that it should change every run.
-    rng = random.Random(1234)
-
-    # Issue more concurrent operations than the storage controller's reconciler concurrency semaphore
-    # permits, to ensure that we are exercising stressing that.
-    api_concurrency = 135
-
-    # We will create tenants directly via API, not via neon_local, to avoid any false
-    # serialization of operations in neon_local (it e.g. loads/saves a config file on each call)
-    with concurrent.futures.ThreadPoolExecutor(max_workers=api_concurrency) as executor:
-        futs = []
-        t1 = time.time()
-        for tenant_id in tenants:
-            f = executor.submit(
-                env.storage_controller.tenant_create,
-                tenant_id,
-                shard_count,
-                stripe_size,
-                placement_policy={"Attached": 1},
-            )
-            futs.append(f)
-
-        # Wait for creations to finish
-        for f in futs:
-            f.result()
-        log.info(
-            f"Created {len(tenants)} tenants in {time.time() - t1}, {len(tenants) / (time.time() - t1)}/s"
-        )
-
-        run_ops = api_concurrency * 4
-        assert run_ops < len(tenants)
-        op_tenants = list(tenants)[0:run_ops]
-
-        # Generate a mixture of operations and dispatch them all concurrently
-        futs = []
-        for tenant_id in op_tenants:
-            op = rng.choice([0, 1, 2])
-            if op == 0:
-                # A fan-out write operation to all shards in a tenant (timeline creation)
-                f = executor.submit(
-                    virtual_ps_http.timeline_create,
-                    PgVersion.NOT_SET,
-                    tenant_id,
-                    TimelineId.generate(),
-                )
-            elif op == 1:
-                # A reconciler operation: migrate a shard.
-                shard_number = rng.randint(0, shard_count - 1)
-                tenant_shard_id = TenantShardId(tenant_id, shard_number, shard_count)
-                dest_ps_id = rng.choice([ps.id for ps in env.pageservers])
-                f = executor.submit(
-                    env.storage_controller.tenant_shard_migrate, tenant_shard_id, dest_ps_id
-                )
-            elif op == 2:
-                # A passthrough read to shard zero
-                f = executor.submit(virtual_ps_http.tenant_status, tenant_id)
-
-            futs.append(f)
-
-        # Wait for mixed ops to finish
-        for f in futs:
-            f.result()
-
-    # Consistency check is safe here: all the previous operations waited for reconcile before completing
-    env.storage_controller.consistency_check()
-    check_memory()
-
-    # This loop waits for reconcile_all to indicate no pending work, and then calls it once more to time
-    # how long the call takes when idle: this iterates over shards while doing no I/O and should be reliably fast: if
-    # it isn't, that's a sign that we have made some algorithmic mistake (e.g. O(N**2) scheduling)
-    #
-    # We do not require that the system is quiescent already here, although at present in this point in the test
-    # that may be the case.
-    while True:
-        t1 = time.time()
-        reconcilers = env.storage_controller.reconcile_all()
-        if reconcilers == 0:
-            # Time how long a no-op background reconcile takes: this measures how long it takes to
-            # loop over all the shards looking for work to do.
-            runtime = time.time() - t1
-            log.info(f"No-op call to reconcile_all took {runtime}s")
-            assert runtime < 1
-            break
-
-    # Restart the storage controller
-    env.storage_controller.stop()
-    env.storage_controller.start()
-
-    # See how long the controller takes to pass its readiness check.  This should be fast because
-    # all the nodes are online: offline pageservers are the only thing that's allowed to delay
-    # startup.
-    readiness_period = env.storage_controller.wait_until_ready()
-    assert readiness_period < 5
-
-    # Consistency check is safe here: the storage controller's restart should not have caused any reconcilers
-    # to run, as it was in a stable state before restart.  If it did, that's a bug.
-    env.storage_controller.consistency_check()
-    check_memory()
-
-    # Restart pageservers: this exercises the /re-attach API
-    for pageserver in env.pageservers:
-        pageserver.stop()
-        pageserver.start()
-
-    # Consistency check is safe here: restarting pageservers should not have caused any Reconcilers to spawn,
-    # as they were not offline long enough to trigger any scheduling changes.
-    env.storage_controller.consistency_check()
-    check_memory()
-
-    # Stop the storage controller before tearing down fixtures, because it otherwise might log
-    # errors trying to call our `ComputeReconfigure`.
-    env.storage_controller.stop()
--- a/test_runner/regress/test_sharding.py
+++ b/test_runner/regress/test_sharding.py
@@ -928,8 +928,6 @@ def test_sharding_split_failures(
            ".*Reconcile error: receive body: error sending request for url.*",
            # Node offline cases will fail inside reconciler when detaching secondaries
            ".*Reconcile error on shard.*: receive body: error sending request for url.*",
-            # Node offline cases may eventually cancel reconcilers when the heartbeater realizes nodes are offline
-            ".*Reconcile error.*Cancelled.*",
            # While parent shard's client is stopped during split, flush loop updating LSNs will emit this warning
            ".*Failed to schedule metadata upload after updating disk_consistent_lsn.*",
        ]
--- a/test_runner/regress/test_wal_acceptor.py
+++ b/test_runner/regress/test_wal_acceptor.py
@@ -1828,7 +1828,7 @@ def test_idle_reconnections(neon_env_builder: NeonEnvBuilder):
    env = neon_env_builder.init_start()

    tenant_id = env.initial_tenant
-    timeline_id = env.neon_cli.create_branch("test_idle_reconnections")
+    timeline_id = env.neon_cli.create_branch("test_sk_auth_restart_endpoint")

    def collect_stats() -> Dict[str, float]:
        # we need to collect safekeeper_pg_queries_received_total metric from all safekeepers
@@ -1859,7 +1859,7 @@ def test_idle_reconnections(neon_env_builder: NeonEnvBuilder):

    collect_stats()

-    endpoint = env.endpoints.create_start("test_idle_reconnections")
+    endpoint = env.endpoints.create_start("test_sk_auth_restart_endpoint")
    # just write something to the timeline
    endpoint.safe_psql("create table t(i int)")
    collect_stats()
@@ -2007,47 +2007,3 @@ def test_patch_control_file(neon_env_builder: NeonEnvBuilder):
    )
    log.info(f"dump_control_file response: {res}")
    assert res["timelines"][0]["control_file"]["timeline_start_lsn"] == "0/1"
-
-
-# Test disables periodic pushes from safekeeper to the broker and checks that
-# pageserver can still discover safekeepers with discovery requests.
-def test_broker_discovery(neon_env_builder: NeonEnvBuilder):
-    neon_env_builder.num_safekeepers = 3
-    neon_env_builder.enable_safekeeper_remote_storage(RemoteStorageKind.LOCAL_FS)
-    env = neon_env_builder.init_start()
-
-    env.neon_cli.create_branch("test_broker_discovery")
-
-    endpoint = env.endpoints.create_start(
-        "test_broker_discovery",
-        config_lines=["shared_buffers=1MB"],
-    )
-    endpoint.safe_psql("create table t(i int, payload text)")
-    # Install extension containing function needed to clear buffer
-    endpoint.safe_psql("CREATE EXTENSION neon_test_utils")
-
-    def do_something():
-        time.sleep(1)
-        # generate some data to commit WAL on safekeepers
-        endpoint.safe_psql("insert into t select generate_series(1,100), 'action'")
-        # clear the buffers
-        endpoint.safe_psql("select clear_buffer_cache()")
-        # read data to fetch pages from pageserver
-        endpoint.safe_psql("select sum(i) from t")
-
-    do_something()
-    do_something()
-
-    for sk in env.safekeepers:
-        # Disable periodic broker push, so pageserver won't be able to discover
-        # safekeepers without sending a discovery request
-        sk.stop().start(extra_opts=["--disable-periodic-broker-push"])
-
-    do_something()
-    do_something()
-
-    # restart pageserver and check how everything works
-    env.pageserver.stop().start()
-
-    do_something()
-    do_something()