Revised fragmentation logic

pull key diff into helper
tests: add test_sharding_compaction
2026-05-22 23:50:39 +00:00 · 2024-04-25 18:15:26 +01:00 · 2024-04-25 18:05:59 +01:00 · 2024-04-25 18:05:59 +01:00 · 2024-04-25 18:05:59 +01:00 · 2024-04-25 18:04:23 +01:00
107 changed files with 1802 additions and 5615 deletions
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -478,7 +478,6 @@ jobs:
          PAGESERVER_VIRTUAL_FILE_IO_ENGINE: tokio-epoll-uring
          PAGESERVER_GET_VECTORED_IMPL: vectored
          PAGESERVER_GET_IMPL: vectored
-          PAGESERVER_VALIDATE_VEC_GET: true

      # Temporary disable this step until we figure out why it's so flaky
      # Ref https://github.com/neondatabase/neon/issues/4540
@@ -558,9 +557,6 @@ jobs:
          PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
          TEST_RESULT_CONNSTR: "${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}"
          PAGESERVER_VIRTUAL_FILE_IO_ENGINE: tokio-epoll-uring
-          PAGESERVER_GET_VECTORED_IMPL: vectored
-          PAGESERVER_GET_IMPL: vectored
-          PAGESERVER_VALIDATE_VEC_GET: false
      # XXX: no coverage data handling here, since benchmarks are run on release builds,
      # while coverage is currently collected for the debug ones

--- a/Cargo.lock
+++ b/Cargo.lock
@@ -722,9 +722,9 @@ dependencies = [

 [[package]]
 name = "azure_core"
-version = "0.19.0"
+version = "0.18.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "70fd680c0d0424a518229b1150922f92653ba2ac933aa000abc8bf1ca08105f7"
+checksum = "a6218987c374650fdad0b476bfc675729762c28dfb35f58608a38a2b1ea337dd"
 dependencies = [
 "async-trait",
 "base64 0.21.1",
@@ -752,9 +752,9 @@ dependencies = [

 [[package]]
 name = "azure_identity"
-version = "0.19.0"
+version = "0.18.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a6d2060f5b2e1c664026ca4edd561306c473be887c1f7a81f10bf06f9b71c63f"
+checksum = "9e1eacc4f7fb2a73d57c39139d0fc3aed78435606055779ddaef4b43cdf919a8"
 dependencies = [
 "async-lock",
 "async-trait",
@@ -772,9 +772,9 @@ dependencies = [

 [[package]]
 name = "azure_storage"
-version = "0.19.0"
+version = "0.18.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "15d3da73bfa09350e1bd6ae2a260806fcf90048c7e78cd2d8f88be60b19a7266"
+checksum = "ade8f2653e408de88b9eafec9f48c3c26b94026375e88adbd34523a7dd9795a1"
 dependencies = [
 "RustyXML",
 "async-lock",
@@ -791,9 +791,9 @@ dependencies = [

 [[package]]
 name = "azure_storage_blobs"
-version = "0.19.0"
+version = "0.18.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "149c21834a4105d761e3dd33d91c2a3064acc05a3c978848ea8089102ae45c94"
+checksum = "025701c7cc5b523100f0f3b2b01723564ec5a86c03236521c06826337047e872"
 dependencies = [
 "RustyXML",
 "azure_core",
@@ -812,9 +812,9 @@ dependencies = [

 [[package]]
 name = "azure_svc_blobstorage"
-version = "0.19.0"
+version = "0.18.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "88c888b7bf522d5405218b8613bf0fae7ddaae6ef3bf4ad42ae005993c96ab8b"
+checksum = "76051e5bb67cea1055abe5e530a0878feac7e0ab4cbbcb4a6adc953a58993389"
 dependencies = [
 "azure_core",
 "bytes",
@@ -1319,7 +1319,6 @@ dependencies = [
 "git-version",
 "hex",
 "humantime",
- "humantime-serde",
 "hyper 0.14.26",
 "nix 0.27.1",
 "once_cell",
@@ -2764,9 +2763,9 @@ dependencies = [

 [[package]]
 name = "js-sys"
-version = "0.3.69"
+version = "0.3.63"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "29c15563dc2726973df627357ce0c9ddddbea194836909d655df6a75d2cf296d"
+checksum = "2f37a4a5928311ac501dee68b3c7613a1037d0edb30c8e5427bd832d55d1b790"
 dependencies = [
 "wasm-bindgen",
 ]
@@ -3185,16 +3184,6 @@ dependencies = [
 "winapi",
 ]

-[[package]]
-name = "nu-ansi-term"
-version = "0.46.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "77a8165726e8236064dbb45459242600304b42a5ea24ee2948e18e023bf7ba84"
-dependencies = [
- "overload",
- "winapi",
-]
-
 [[package]]
 name = "num"
 version = "0.4.1"
@@ -3531,12 +3520,6 @@ version = "0.5.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "4030760ffd992bef45b0ae3f10ce1aba99e33464c90d14dd7c039884963ddc7a"

-[[package]]
-name = "overload"
-version = "0.1.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b15813163c1d831bf4a13c3610c05c0d03b39feb07f7e09fa234dac9b15aaf39"
-
 [[package]]
 name = "p256"
 version = "0.11.1"
@@ -5102,7 +5085,6 @@ dependencies = [
 "aws-smithy-async",
 "bincode",
 "bytes",
- "camino",
 "chrono",
 "clap",
 "crc32c",
@@ -5112,11 +5094,8 @@ dependencies = [
 "hex",
 "histogram",
 "itertools",
- "native-tls",
 "pageserver",
 "pageserver_api",
- "postgres-native-tls",
- "postgres_ffi",
 "rand 0.8.5",
 "remote_storage",
 "reqwest",
@@ -5125,10 +5104,8 @@ dependencies = [
 "serde_with",
 "thiserror",
 "tokio",
- "tokio-postgres",
 "tokio-rustls 0.25.0",
 "tokio-stream",
- "tokio-util",
 "tracing",
 "tracing-appender",
 "tracing-subscriber",
@@ -6435,10 +6412,11 @@ dependencies = [

 [[package]]
 name = "tracing"
-version = "0.1.40"
+version = "0.1.37"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c3523ab5a71916ccf420eebdf5521fcef02141234bbc0b8a49f2fdc4544364ef"
+checksum = "8ce8c33a8d48bd45d624a6e523445fd21ec13d3653cd51f681abf67418f54eb8"
 dependencies = [
+ "cfg-if",
 "log",
 "pin-project-lite",
 "tracing-attributes",
@@ -6458,9 +6436,9 @@ dependencies = [

 [[package]]
 name = "tracing-attributes"
-version = "0.1.27"
+version = "0.1.24"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "34704c8d6ebcbc939824180af020566b01a7c01f80641264eba0999f6c2b6be7"
+checksum = "0f57e3ca2a01450b1a921183a9c9cbfda207fd822cef4ccb00a65402cbba7a74"
 dependencies = [
 "proc-macro2",
 "quote",
@@ -6469,9 +6447,9 @@ dependencies = [

 [[package]]
 name = "tracing-core"
-version = "0.1.32"
+version = "0.1.31"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c06d3da6113f116aaee68e4d601191614c9053067f9ab7f6edbcb161237daa54"
+checksum = "0955b8137a1df6f1a2e9a37d8a6656291ff0297c1a97c24e0d8425fe2312f79a"
 dependencies = [
 "once_cell",
 "valuable",
@@ -6529,7 +6507,6 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "30a651bc37f915e81f087d86e62a18eec5f79550c7faff886f7090b4ea757c77"
 dependencies = [
 "matchers",
- "nu-ansi-term",
 "once_cell",
 "regex",
 "serde",
@@ -6927,9 +6904,9 @@ checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423"

 [[package]]
 name = "wasm-bindgen"
-version = "0.2.92"
+version = "0.2.86"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "4be2531df63900aeb2bca0daaaddec08491ee64ceecbee5076636a3b026795a8"
+checksum = "5bba0e8cb82ba49ff4e229459ff22a191bbe9a1cb3a341610c9c33efc27ddf73"
 dependencies = [
 "cfg-if",
 "wasm-bindgen-macro",
@@ -6937,9 +6914,9 @@ dependencies = [

 [[package]]
 name = "wasm-bindgen-backend"
-version = "0.2.92"
+version = "0.2.86"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "614d787b966d3989fa7bb98a654e369c762374fd3213d212cfc0251257e747da"
+checksum = "19b04bc93f9d6bdee709f6bd2118f57dd6679cf1176a1af464fca3ab0d66d8fb"
 dependencies = [
 "bumpalo",
 "log",
@@ -6952,9 +6929,9 @@ dependencies = [

 [[package]]
 name = "wasm-bindgen-futures"
-version = "0.4.42"
+version = "0.4.36"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "76bc14366121efc8dbb487ab05bcc9d346b3b5ec0eaa76e46594cabbe51762c0"
+checksum = "2d1985d03709c53167ce907ff394f5316aa22cb4e12761295c5dc57dacb6297e"
 dependencies = [
 "cfg-if",
 "js-sys",
@@ -6964,9 +6941,9 @@ dependencies = [

 [[package]]
 name = "wasm-bindgen-macro"
-version = "0.2.92"
+version = "0.2.86"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a1f8823de937b71b9460c0c34e25f3da88250760bec0ebac694b49997550d726"
+checksum = "14d6b024f1a526bb0234f52840389927257beb670610081360e5a03c5df9c258"
 dependencies = [
 "quote",
 "wasm-bindgen-macro-support",
@@ -6974,9 +6951,9 @@ dependencies = [

 [[package]]
 name = "wasm-bindgen-macro-support"
-version = "0.2.92"
+version = "0.2.86"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e94f17b526d0a461a191c78ea52bbce64071ed5c04c9ffe424dcb38f74171bb7"
+checksum = "e128beba882dd1eb6200e1dc92ae6c5dbaa4311aa7bb211ca035779e5efc39f8"
 dependencies = [
 "proc-macro2",
 "quote",
@@ -6987,9 +6964,9 @@ dependencies = [

 [[package]]
 name = "wasm-bindgen-shared"
-version = "0.2.92"
+version = "0.2.86"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "af190c94f2773fdb3729c55b007a722abb5384da03bc0986df4c289bf5567e96"
+checksum = "ed9d5b4305409d1fc9482fee2d7f9bcbf24b3972bf59817ef757e23982242a93"

 [[package]]
 name = "wasm-streams"
@@ -7021,9 +6998,9 @@ dependencies = [

 [[package]]
 name = "web-sys"
-version = "0.3.69"
+version = "0.3.63"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "77afa9a11836342370f4817622a2f0f418b134426d91a82dfb48f532d2ec13ef"
+checksum = "3bdd9ef4e984da1187bf8110c5cf5b845fbc87a23602cdf912386a76fcd3a7c2"
 dependencies = [
 "js-sys",
 "wasm-bindgen",
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -45,10 +45,10 @@ anyhow = { version = "1.0", features = ["backtrace"] }
 arc-swap = "1.6"
 async-compression = { version = "0.4.0", features = ["tokio", "gzip", "zstd"] }
 atomic-take = "1.1.0"
-azure_core = "0.19"
-azure_identity = "0.19"
-azure_storage = "0.19"
-azure_storage_blobs = "0.19"
+azure_core = "0.18"
+azure_identity = "0.18"
+azure_storage = "0.18"
+azure_storage_blobs = "0.18"
 flate2 = "1.0.26"
 async-stream = "0.3"
 async-trait = "0.1"
@@ -180,7 +180,7 @@ tonic = {version = "0.9", features = ["tls", "tls-roots"]}
 tracing = "0.1"
 tracing-error = "0.2.0"
 tracing-opentelemetry = "0.20.0"
-tracing-subscriber = { version = "0.3", default_features = false, features = ["smallvec", "fmt", "tracing-log", "std", "env-filter", "json", "ansi"] }
+tracing-subscriber = { version = "0.3", default_features = false, features = ["smallvec", "fmt", "tracing-log", "std", "env-filter", "json"] }
 twox-hash = { version = "1.6.3", default-features = false }
 url = "2.2"
 urlencoding = "2.1"
--- a/Makefile
+++ b/Makefile
@@ -25,16 +25,14 @@ ifeq ($(UNAME_S),Linux)
 	# Seccomp BPF is only available for Linux
 	PG_CONFIGURE_OPTS += --with-libseccomp
 else ifeq ($(UNAME_S),Darwin)
-	ifndef DISABLE_HOMEBREW
-		# macOS with brew-installed openssl requires explicit paths
-		# It can be configured with OPENSSL_PREFIX variable
-		OPENSSL_PREFIX ?= $(shell brew --prefix openssl@3)
-		PG_CONFIGURE_OPTS += --with-includes=$(OPENSSL_PREFIX)/include --with-libraries=$(OPENSSL_PREFIX)/lib
-		PG_CONFIGURE_OPTS += PKG_CONFIG_PATH=$(shell brew --prefix icu4c)/lib/pkgconfig
-		# macOS already has bison and flex in the system, but they are old and result in postgres-v14 target failure
-		# brew formulae are keg-only and not symlinked into HOMEBREW_PREFIX, force their usage
-		EXTRA_PATH_OVERRIDES += $(shell brew --prefix bison)/bin/:$(shell brew --prefix flex)/bin/:
-	endif
+	# macOS with brew-installed openssl requires explicit paths
+	# It can be configured with OPENSSL_PREFIX variable
+	OPENSSL_PREFIX ?= $(shell brew --prefix openssl@3)
+	PG_CONFIGURE_OPTS += --with-includes=$(OPENSSL_PREFIX)/include --with-libraries=$(OPENSSL_PREFIX)/lib
+	PG_CONFIGURE_OPTS += PKG_CONFIG_PATH=$(shell brew --prefix icu4c)/lib/pkgconfig
+	# macOS already has bison and flex in the system, but they are old and result in postgres-v14 target failure
+	# brew formulae are keg-only and not symlinked into HOMEBREW_PREFIX, force their usage
+	EXTRA_PATH_OVERRIDES += $(shell brew --prefix bison)/bin/:$(shell brew --prefix flex)/bin/:
 endif

 # Use -C option so that when PostgreSQL "make install" installs the
--- a/control_plane/Cargo.toml
+++ b/control_plane/Cargo.toml
@@ -17,7 +17,6 @@ nix.workspace = true
 once_cell.workspace = true
 postgres.workspace = true
 hex.workspace = true
-humantime-serde.workspace = true
 hyper.workspace = true
 regex.workspace = true
 reqwest = { workspace = true, features = ["blocking", "json"] }
--- a/control_plane/src/bin/neon_local.rs
+++ b/control_plane/src/bin/neon_local.rs
@@ -417,54 +417,6 @@ async fn handle_tenant(
                println!("{} {:?}", t.id, t.state);
            }
        }
-        Some(("import", import_match)) => {
-            let tenant_id = parse_tenant_id(import_match)?.unwrap_or_else(TenantId::generate);
-
-            let storage_controller = StorageController::from_env(env);
-            let create_response = storage_controller.tenant_import(tenant_id).await?;
-
-            let shard_zero = create_response
-                .shards
-                .first()
-                .expect("Import response omitted shards");
-
-            let attached_pageserver_id = shard_zero.node_id;
-            let pageserver =
-                PageServerNode::from_env(env, env.get_pageserver_conf(attached_pageserver_id)?);
-
-            println!(
-                "Imported tenant {tenant_id}, attached to pageserver {attached_pageserver_id}"
-            );
-
-            let timelines = pageserver
-                .http_client
-                .list_timelines(shard_zero.shard_id)
-                .await?;
-
-            // Pick a 'main' timeline that has no ancestors, the rest will get arbitrary names
-            let main_timeline = timelines
-                .iter()
-                .find(|t| t.ancestor_timeline_id.is_none())
-                .expect("No timelines found")
-                .timeline_id;
-
-            let mut branch_i = 0;
-            for timeline in timelines.iter() {
-                let branch_name = if timeline.timeline_id == main_timeline {
-                    "main".to_string()
-                } else {
-                    branch_i += 1;
-                    format!("branch_{branch_i}")
-                };
-
-                println!(
-                    "Importing timeline {tenant_id}/{} as branch {branch_name}",
-                    timeline.timeline_id
-                );
-
-                env.register_branch_mapping(branch_name, tenant_id, timeline.timeline_id)?;
-            }
-        }
        Some(("create", create_match)) => {
            let tenant_conf: HashMap<_, _> = create_match
                .get_many::<String>("config")
@@ -1528,8 +1480,6 @@ fn cli() -> Command {
            .subcommand(Command::new("config")
                .arg(tenant_id_arg.clone())
                .arg(Arg::new("config").short('c').num_args(1).action(ArgAction::Append).required(false)))
-            .subcommand(Command::new("import").arg(tenant_id_arg.clone().required(true))
-                .about("Import a tenant that is present in remote storage, and create branches for its timelines"))
        )
        .subcommand(
            Command::new("pageserver")
@@ -1554,8 +1504,8 @@ fn cli() -> Command {
            Command::new("storage_controller")
                .arg_required_else_help(true)
                .about("Manage storage_controller")
-                .subcommand(Command::new("start").about("Start storage controller"))
-                .subcommand(Command::new("stop").about("Stop storage controller")
+                .subcommand(Command::new("start").about("Start local pageserver").arg(pageserver_config_args.clone()))
+                .subcommand(Command::new("stop").about("Stop local pageserver")
                            .arg(stop_mode_arg.clone()))
        )
        .subcommand(
--- a/control_plane/src/local_env.rs
+++ b/control_plane/src/local_env.rs
@@ -17,7 +17,6 @@ use std::net::Ipv4Addr;
 use std::net::SocketAddr;
 use std::path::{Path, PathBuf};
 use std::process::{Command, Stdio};
-use std::time::Duration;
 use utils::{
    auth::{encode_from_key_file, Claims},
    id::{NodeId, TenantId, TenantTimelineId, TimelineId},
@@ -67,10 +66,6 @@ pub struct LocalEnv {

    pub broker: NeonBroker,

-    // Configuration for the storage controller (1 per neon_local environment)
-    #[serde(default)]
-    pub storage_controller: NeonStorageControllerConf,
-
    /// This Vec must always contain at least one pageserver
    pub pageservers: Vec<PageServerConf>,

@@ -103,29 +98,6 @@ pub struct NeonBroker {
    pub listen_addr: SocketAddr,
 }

-/// Broker config for cluster internal communication.
-#[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)]
-#[serde(default)]
-pub struct NeonStorageControllerConf {
-    /// Heartbeat timeout before marking a node offline
-    #[serde(with = "humantime_serde")]
-    pub max_unavailable: Duration,
-}
-
-impl NeonStorageControllerConf {
-    // Use a shorter pageserver unavailability interval than the default to speed up tests.
-    const DEFAULT_MAX_UNAVAILABLE_INTERVAL: std::time::Duration =
-        std::time::Duration::from_secs(10);
-}
-
-impl Default for NeonStorageControllerConf {
-    fn default() -> Self {
-        Self {
-            max_unavailable: Self::DEFAULT_MAX_UNAVAILABLE_INTERVAL,
-        }
-    }
-}
-
 // Dummy Default impl to satisfy Deserialize derive.
 impl Default for NeonBroker {
    fn default() -> Self {
@@ -158,7 +130,6 @@ pub struct PageServerConf {
    pub(crate) virtual_file_io_engine: Option<String>,
    pub(crate) get_vectored_impl: Option<String>,
    pub(crate) get_impl: Option<String>,
-    pub(crate) validate_vectored_get: Option<bool>,
 }

 impl Default for PageServerConf {
@@ -172,7 +143,6 @@ impl Default for PageServerConf {
            virtual_file_io_engine: None,
            get_vectored_impl: None,
            get_impl: None,
-            validate_vectored_get: None,
        }
    }
 }
--- a/control_plane/src/pageserver.rs
+++ b/control_plane/src/pageserver.rs
@@ -93,7 +93,6 @@ impl PageServerNode {
            virtual_file_io_engine,
            get_vectored_impl,
            get_impl,
-            validate_vectored_get,
        } = &self.conf;

        let id = format!("id={}", id);
@@ -118,11 +117,6 @@ impl PageServerNode {
        } else {
            String::new()
        };
-        let validate_vectored_get = if let Some(validate_vectored_get) = validate_vectored_get {
-            format!("validate_vectored_get={validate_vectored_get}")
-        } else {
-            String::new()
-        };

        let broker_endpoint_param = format!("broker_endpoint='{}'", self.env.broker.client_url());

@@ -137,7 +131,6 @@ impl PageServerNode {
            virtual_file_io_engine,
            get_vectored_impl,
            get_impl,
-            validate_vectored_get,
        ];

        if let Some(control_plane_api) = &self.env.control_plane_api {
@@ -448,11 +441,6 @@ impl PageServerNode {
                .map(serde_json::from_str)
                .transpose()
                .context("parse `timeline_get_throttle` from json")?,
-            switch_to_aux_file_v2: settings
-                .remove("switch_to_aux_file_v2")
-                .map(|x| x.parse::<bool>())
-                .transpose()
-                .context("Failed to parse 'switch_to_aux_file_v2' as bool")?,
        };
        if !settings.is_empty() {
            bail!("Unrecognized tenant settings: {settings:?}")
@@ -571,11 +559,6 @@ impl PageServerNode {
                    .map(serde_json::from_str)
                    .transpose()
                    .context("parse `timeline_get_throttle` from json")?,
-                switch_to_aux_file_v2: settings
-                    .remove("switch_to_aux_file_v2")
-                    .map(|x| x.parse::<bool>())
-                    .transpose()
-                    .context("Failed to parse 'switch_to_aux_file_v2' as bool")?,
            }
        };

--- a/control_plane/src/storage_controller.rs
+++ b/control_plane/src/storage_controller.rs
@@ -1,7 +1,4 @@
-use crate::{
-    background_process,
-    local_env::{LocalEnv, NeonStorageControllerConf},
-};
+use crate::{background_process, local_env::LocalEnv};
 use camino::{Utf8Path, Utf8PathBuf};
 use hyper::Method;
 use pageserver_api::{
@@ -35,13 +32,15 @@ pub struct StorageController {
    public_key: Option<String>,
    postgres_port: u16,
    client: reqwest::Client,
-    config: NeonStorageControllerConf,
 }

 const COMMAND: &str = "storage_controller";

 const STORAGE_CONTROLLER_POSTGRES_VERSION: u32 = 16;

+// Use a shorter pageserver unavailability interval than the default to speed up tests.
+const NEON_LOCAL_MAX_UNAVAILABLE_INTERVAL: std::time::Duration = std::time::Duration::from_secs(10);
+
 #[derive(Serialize, Deserialize)]
 pub struct AttachHookRequest {
    pub tenant_shard_id: TenantShardId,
@@ -136,7 +135,6 @@ impl StorageController {
            client: reqwest::ClientBuilder::new()
                .build()
                .expect("Failed to construct http client"),
-            config: env.storage_controller.clone(),
        }
    }

@@ -274,6 +272,8 @@ impl StorageController {
        // Run migrations on every startup, in case something changed.
        let database_url = self.setup_database().await?;

+        let max_unavailable: humantime::Duration = NEON_LOCAL_MAX_UNAVAILABLE_INTERVAL.into();
+
        let mut args = vec![
            "-l",
            &self.listen,
@@ -283,7 +283,7 @@ impl StorageController {
            "--database-url",
            &database_url,
            "--max-unavailable-interval",
-            &humantime::Duration::from(self.config.max_unavailable).to_string(),
+            &max_unavailable.to_string(),
        ]
        .into_iter()
        .map(|s| s.to_string())
@@ -472,16 +472,6 @@ impl StorageController {
            .await
    }

-    #[instrument(skip(self))]
-    pub async fn tenant_import(&self, tenant_id: TenantId) -> anyhow::Result<TenantCreateResponse> {
-        self.dispatch::<(), TenantCreateResponse>(
-            Method::POST,
-            format!("debug/v1/tenant/{tenant_id}/import"),
-            None,
-        )
-        .await
-    }
-
    #[instrument(skip(self))]
    pub async fn tenant_locate(&self, tenant_id: TenantId) -> anyhow::Result<TenantLocateResponse> {
        self.dispatch::<(), _>(
--- a/docs/storage_controller.md
+++ b/docs/storage_controller.md
@@ -30,7 +30,7 @@ The storage controller uses a postgres database to persist a subset of its state
 persist the _relationships_ between them: the attachment state of a tenant's shards to nodes is kept in memory and
 rebuilt on startup.

-The file `persistence.rs` contains all the code for accessing the database, and has a large doc comment that goes into more detail about exactly what we persist and why.
+The file `persistence.rs` contains all the code for accessing the database, and has a large doc comment that goes into more detail about exactly what we persist and why.

 The `diesel` crate is used for defining models & migrations.

--- a/libs/pageserver_api/src/key.rs
+++ b/libs/pageserver_api/src/key.rs
@@ -4,6 +4,7 @@ use bytes::BufMut;
 use postgres_ffi::relfile_utils::{FSM_FORKNUM, VISIBILITYMAP_FORKNUM};
 use postgres_ffi::{Oid, TransactionId};
 use serde::{Deserialize, Serialize};
+use std::ops::RangeInclusive;
 use std::{fmt, ops::Range};

 use crate::reltag::{BlockNumber, RelTag, SlruKind};
@@ -29,25 +30,24 @@ pub const KEY_SIZE: usize = 18;
 /// See [`Key::to_i128`] for more information on the encoding.
 pub const METADATA_KEY_SIZE: usize = 16;

-/// The key prefix start range for the metadata keys. All keys with the first byte >= 0x40 is a metadata key.
-pub const METADATA_KEY_BEGIN_PREFIX: u8 = 0x60;
-pub const METADATA_KEY_END_PREFIX: u8 = 0x7F;
+/// The first prefix of the metadata key range. Every key whose first byte is >= 0x80 is a metadata key.
+pub const METADATA_KEY_BEGIN_PREFIX: u8 = 0x80;

 /// The (reserved) key prefix of relation sizes.
-pub const RELATION_SIZE_PREFIX: u8 = 0x61;
+pub const RELATION_SIZE_PREFIX: u8 = 0x81;

 /// The key prefix of AUX file keys.
-pub const AUX_KEY_PREFIX: u8 = 0x62;
+pub const AUX_KEY_PREFIX: u8 = 0x82;

 /// Check if the key falls in the range of metadata keys.
 pub const fn is_metadata_key_slice(key: &[u8]) -> bool {
-    key[0] >= METADATA_KEY_BEGIN_PREFIX && key[0] < METADATA_KEY_END_PREFIX
+    key[0] >= METADATA_KEY_BEGIN_PREFIX
 }

 impl Key {
    /// Check if the key falls in the range of metadata keys.
    pub const fn is_metadata_key(&self) -> bool {
-        self.field1 >= METADATA_KEY_BEGIN_PREFIX && self.field1 < METADATA_KEY_END_PREFIX
+        self.field1 >= METADATA_KEY_BEGIN_PREFIX
    }

    /// Encode a metadata key to a storage key.
@@ -80,7 +80,7 @@ impl Key {
    }

    /// Get the range of metadata keys.
-    pub fn metadata_key_range() -> Range<Self> {
+    pub fn metadata_key_range() -> RangeInclusive<Self> {
        Key {
            field1: METADATA_KEY_BEGIN_PREFIX,
            field2: 0,
@@ -88,32 +88,13 @@ impl Key {
            field4: 0,
            field5: 0,
            field6: 0,
-        }..Key {
-            field1: METADATA_KEY_END_PREFIX,
-            field2: 0,
-            field3: 0,
-            field4: 0,
-            field5: 0,
-            field6: 0,
-        }
-    }
-
-    /// Get the range of aux keys.
-    pub fn metadata_aux_key_range() -> Range<Self> {
-        Key {
-            field1: AUX_KEY_PREFIX,
-            field2: 0,
-            field3: 0,
-            field4: 0,
-            field5: 0,
-            field6: 0,
-        }..Key {
-            field1: AUX_KEY_PREFIX + 1,
-            field2: 0,
-            field3: 0,
-            field4: 0,
-            field5: 0,
-            field6: 0,
+        }..=Key {
+            field1: u8::MAX,
+            field2: u16::MAX as u32,
+            field3: u32::MAX,
+            field4: u32::MAX,
+            field5: u8::MAX,
+            field6: u32::MAX,
        }
    }

@@ -122,7 +103,7 @@ impl Key {
    /// we can assume that only some predefined namespace OIDs are used which can fit in u16
    pub fn to_i128(&self) -> i128 {
        assert!(self.field2 < 0xFFFF || self.field2 == 0xFFFFFFFF || self.field2 == 0x22222222);
-        (((self.field1 & 0x7F) as i128) << 120)
+        (((self.field1 & 0xf) as i128) << 120)
            | (((self.field2 & 0xFFFF) as i128) << 104)
            | ((self.field3 as i128) << 72)
            | ((self.field4 as i128) << 40)
@@ -132,7 +113,7 @@ impl Key {

    pub const fn from_i128(x: i128) -> Self {
        Key {
-            field1: ((x >> 120) & 0x7F) as u8,
+            field1: ((x >> 120) & 0xf) as u8,
            field2: ((x >> 104) & 0xFFFF) as u32,
            field3: (x >> 72) as u32,
            field4: (x >> 40) as u32,
--- a/libs/pageserver_api/src/keyspace.rs
+++ b/libs/pageserver_api/src/keyspace.rs
@@ -17,10 +17,6 @@ pub struct KeySpace {
    pub ranges: Vec<Range<Key>>,
 }

-/// A wrapper type for sparse keyspaces.
-#[derive(Clone, Debug, Default, PartialEq, Eq)]
-pub struct SparseKeySpace(pub KeySpace);
-
 /// Represents a contiguous half-open range of the keyspace, masked according to a particular
 /// ShardNumber's stripes: within this range of keys, only some "belong" to the current
 /// shard.
@@ -36,34 +32,12 @@ pub struct ShardedRange<'a> {
    pub range: Range<Key>,
 }

-// Calculate the size of a range within the blocks of the same relation, or spanning only the
-// top page in the previous relation's space.
-fn contiguous_range_len(range: &Range<Key>) -> u32 {
-    debug_assert!(is_contiguous_range(range));
-    if range.start.field6 == 0xffffffff {
-        range.end.field6 + 1
-    } else {
-        range.end.field6 - range.start.field6
-    }
-}
-
-/// Return true if this key range includes only keys in the same relation's data blocks, or
-/// just spanning one relation and the logical size (0xffffffff) block of the relation before it.
-///
-/// Contiguous in this context means we know the keys are in use _somewhere_, but it might not
-/// be on our shard.  Later in ShardedRange we do the extra work to figure out how much
-/// of a given contiguous range is present on one shard.
-///
-/// This matters, because:
-/// - Within such ranges, keys are used contiguously.  Outside such ranges it is sparse.
-/// - Within such ranges, we may calculate distances using simple subtraction of field6.
-fn is_contiguous_range(range: &Range<Key>) -> bool {
-    range.start.field1 == range.end.field1
-        && range.start.field2 == range.end.field2
-        && range.start.field3 == range.end.field3
-        && range.start.field4 == range.end.field4
-        && (range.start.field5 == range.end.field5
-            || (range.start.field6 == 0xffffffff && range.start.field5 + 1 == range.end.field5))
+// Calculate the distance between two keys, assuming that they are somewhat close
+// together (i.e. we only account for field5 and field6)
+fn nearby_key_delta(start: &Key, end: &Key) -> u64 {
+    let start = (start.field5 as u64) << 32 | start.field6 as u64;
+    let end = (end.field5 as u64) << 32 | end.field6 as u64;
+    end - start
 }

 impl<'a> ShardedRange<'a> {
@@ -89,7 +63,11 @@ impl<'a> ShardedRange<'a> {
            )];
        }

-        if !is_contiguous_range(&self.range) {
+        if self.range.end.field1 != self.range.start.field1
+            || self.range.end.field2 != self.range.start.field2
+            || self.range.end.field3 != self.range.start.field3
+            || self.range.end.field4 != self.range.start.field4
+        {
            // Ranges that span relations are not fragmented.  We only get these ranges as a result
            // of operations that act on existing layers, so we trust that the existing range is
            // reasonably small.
@@ -100,7 +78,7 @@ impl<'a> ShardedRange<'a> {

        let mut cursor = self.range.start;
        while cursor < self.range.end {
-            let advance_by = self.distance_to_next_boundary(cursor);
+            let advance_by = self.advance_to_next_boundary(cursor);
            let is_fragment_disposable = self.shard_identity.is_key_disposable(&cursor);

            // If the previous fragment is undersized, then we seek to consume enough
@@ -158,23 +136,21 @@ impl<'a> ShardedRange<'a> {
    /// Estimate the physical pages that are within this range, on this shard.  This returns
    /// u32::MAX if the range spans relations: this return value should be interpreted as "large".
    pub fn page_count(&self) -> u32 {
-        // Special cases for single keys like logical sizes
-        if self.range.end == self.range.start.add(1) {
-            return if self.shard_identity.is_key_disposable(&self.range.start) {
-                0
-            } else {
-                1
-            };
-        }
-
-        // We can only do an authentic calculation of contiguous key ranges
-        if !is_contiguous_range(&self.range) {
+        let raw_size = Self::raw_size(&self.range);
+        if raw_size == u32::MAX {
            return u32::MAX;
        }

        // Special case for single sharded tenants: our logical and physical sizes are the same
        if self.shard_identity.count < ShardCount::new(2) {
-            return contiguous_range_len(&self.range);
+            return raw_size;
+        }
+
+        // Special cases for single keys like logical sizes
+        if self.range.end == self.range.start.add(1)
+            && self.shard_identity.is_key_local(&self.range.start)
+        {
+            return 1;
        }

        // Normal path: step through stripes and part-stripes in the range, evaluate whether each one belongs
@@ -183,16 +159,18 @@ impl<'a> ShardedRange<'a> {
        let mut cursor = self.range.start;
        while cursor < self.range.end {
            // Count up to the next stripe_size boundary or end of range
-            let advance_by = self.distance_to_next_boundary(cursor);
+            let advance_by = self.advance_to_next_boundary(cursor);
+            cursor = cursor.add(advance_by);

            // If the blocks in this stripe belong to us, add them to our count
            if !self.shard_identity.is_key_disposable(&cursor) {
                result += advance_by as u64;
            }
-
-            cursor = cursor.add(advance_by);
        }

+        // Sharding should always decrease the number of pages we estimate, never increase it
+        debug_assert!(result <= raw_size as u64);
+
        if result > u32::MAX as u64 {
            u32::MAX
        } else {
@@ -202,37 +180,19 @@ impl<'a> ShardedRange<'a> {

    /// Advance the cursor to the next potential fragment boundary: this is either
    /// a stripe boundary, or the end of the range.
-    fn distance_to_next_boundary(&self, cursor: Key) -> u32 {
-        let distance_to_range_end = contiguous_range_len(&(cursor..self.range.end));
+    fn advance_to_next_boundary(&self, cursor: Key) -> u32 {
+        let distance_to_range_end = nearby_key_delta(&cursor, &self.range.end);

        if self.shard_identity.count < ShardCount::new(2) {
            // Optimization: don't bother stepping through stripes if the tenant isn't sharded.
-            return distance_to_range_end;
-        }
-
-        if cursor.field6 == 0xffffffff {
-            // We are wrapping from one relation's logical size to the next relation's first data block
-            return 1;
+            return Self::raw_size(&self.range);
        }

        let stripe_index = cursor.field6 / self.shard_identity.stripe_size.0;
        let stripe_remainder = self.shard_identity.stripe_size.0
            - (cursor.field6 - stripe_index * self.shard_identity.stripe_size.0);

-        if cfg!(debug_assertions) {
-            // We should never overflow field5 and field6 -- our callers check this earlier
-            // and would have returned their u32::MAX cases if the input range violated this.
-            let next_cursor = cursor.add(stripe_remainder);
-            debug_assert!(
-                next_cursor.field1 == cursor.field1
-                    && next_cursor.field2 == cursor.field2
-                    && next_cursor.field3 == cursor.field3
-                    && next_cursor.field4 == cursor.field4
-                    && next_cursor.field5 == cursor.field5
-            )
-        }
-
-        std::cmp::min(stripe_remainder, distance_to_range_end)
+        std::cmp::min(stripe_remainder as u64, distance_to_range_end) as u32
    }

    /// Whereas `page_count` estimates the number of pages physically in this range on this shard,
@@ -241,22 +201,29 @@ impl<'a> ShardedRange<'a> {
    ///
    /// Don't use this function in code that works with physical entities like layer files.
    fn raw_size(range: &Range<Key>) -> u32 {
-        if is_contiguous_range(range) {
-            contiguous_range_len(range)
-        } else {
+        let start = range.start;
+        let end = range.end;
+
+        if end.field1 != start.field1
+            || end.field2 != start.field2
+            || end.field3 != start.field3
+            || end.field4 != start.field4
+        {
+            return u32::MAX;
+        }
+
+        // The check above ensures that keys only differ in low fields (i.e. are nearby)
+        let diff = nearby_key_delta(&start, &end);
+        if diff > u32::MAX as u64 {
            u32::MAX
+        } else {
+            diff as u32
        }
    }
 }

 impl KeySpace {
-    /// Create a key space with a single range.
-    pub fn single(key_range: Range<Key>) -> Self {
-        Self {
-            ranges: vec![key_range],
-        }
-    }
-
+    ///
    /// Partition a key space into chunks of roughly 'target_size' bytes
    /// in each partition.
    ///
@@ -273,12 +240,12 @@ impl KeySpace {
            let range = ShardedRange::new(range.clone(), shard_identity);

            // Chunk up the range into parts that each contain up to target_size local blocks
-            for (frag_on_shard_size, frag_range) in range.fragment(target_nblocks) {
+            for (range_size, range) in range.fragment(target_nblocks) {
                // If appending the next contiguous range in the keyspace to the current
                // partition would cause it to be too large, and our current partition
                // covers at least one block that is physically present in this shard,
                // then start a new partition
-                if current_part_size + frag_on_shard_size as usize > target_nblocks as usize
+                if current_part_size + range_size as usize > target_nblocks as usize
                    && current_part_size > 0
                {
                    parts.push(KeySpace {
@@ -287,8 +254,8 @@ impl KeySpace {
                    current_part = Vec::new();
                    current_part_size = 0;
                }
-                current_part.push(frag_range.start..frag_range.end);
-                current_part_size += frag_on_shard_size as usize;
+                current_part.push(range.start..range.end);
+                current_part_size += range_size as usize;
            }
        }

@@ -302,10 +269,6 @@ impl KeySpace {
        KeyPartitioning { parts }
    }

-    pub fn is_empty(&self) -> bool {
-        self.total_raw_size() == 0
-    }
-
    /// Merge another keyspace into the current one.
    /// Note: the keyspaces must not overlap (enforced via assertions)
    pub fn merge(&mut self, other: &KeySpace) {
@@ -404,6 +367,10 @@ impl KeySpace {
            .sum()
    }

+    pub fn is_empty(&self) -> bool {
+        self.total_raw_size() == 0
+    }
+
    fn overlaps_at(&self, range: &Range<Key>) -> Option<usize> {
        match self.ranges.binary_search_by_key(&range.end, |r| r.start) {
            Ok(0) => None,
@@ -439,33 +406,10 @@ pub struct KeyPartitioning {
    pub parts: Vec<KeySpace>,
 }

-/// Represents a partitioning of the sparse key space.
-#[derive(Clone, Debug, Default)]
-pub struct SparseKeyPartitioning {
-    pub parts: Vec<SparseKeySpace>,
-}
-
 impl KeyPartitioning {
    pub fn new() -> Self {
        KeyPartitioning { parts: Vec::new() }
    }
-
-    /// Convert a key partitioning to a sparse partition.
-    pub fn into_sparse(self) -> SparseKeyPartitioning {
-        SparseKeyPartitioning {
-            parts: self.parts.into_iter().map(SparseKeySpace).collect(),
-        }
-    }
-}
-
-impl SparseKeyPartitioning {
-    /// Note: use this function with caution. Attempt to handle a sparse keyspace in the same way as a dense keyspace will
-    /// cause long/dead loops.
-    pub fn into_dense(self) -> KeyPartitioning {
-        KeyPartitioning {
-            parts: self.parts.into_iter().map(|x| x.0).collect(),
-        }
-    }
 }

 ///
@@ -593,8 +537,6 @@ pub fn singleton_range(key: Key) -> Range<Key> {

 #[cfg(test)]
 mod tests {
-    use rand::{RngCore, SeedableRng};
-
    use crate::{
        models::ShardParameters,
        shard::{ShardCount, ShardNumber},
@@ -990,35 +932,6 @@ mod tests {
        assert_eq!(range.page_count(), 1);
    }

-    /// Test the helper that we use to identify ranges which go outside the data blocks of a single relation
-    #[test]
-    fn contiguous_range_check() {
-        assert!(!is_contiguous_range(
-            &(Key::from_hex("000000067f00000001000004df00fffffffe").unwrap()
-                ..Key::from_hex("000000067f00000001000004df0100000003").unwrap())
-        ),);
-
-        // The ranges goes all the way up to the 0xffffffff, including it: this is
-        // not considered a rel block range because 0xffffffff stores logical sizes,
-        // not blocks.
-        assert!(!is_contiguous_range(
-            &(Key::from_hex("000000067f00000001000004df00fffffffe").unwrap()
-                ..Key::from_hex("000000067f00000001000004df0100000000").unwrap())
-        ),);
-
-        // Keys within the normal data region of a relation
-        assert!(is_contiguous_range(
-            &(Key::from_hex("000000067f00000001000004df0000000000").unwrap()
-                ..Key::from_hex("000000067f00000001000004df0000000080").unwrap())
-        ),);
-
-        // The logical size key of one forkno, then some blocks in the next
-        assert!(is_contiguous_range(
-            &(Key::from_hex("000000067f00000001000004df00ffffffff").unwrap()
-                ..Key::from_hex("000000067f00000001000004df0100000080").unwrap())
-        ),);
-    }
-
    #[test]
    fn shard_identity_keyspaces_forkno_gap() {
        let shard_identity = ShardIdentity::new(
@@ -1036,10 +949,10 @@ mod tests {
            &shard_identity,
        );

-        // Range spanning the end of one forkno and the start of the next: we do not attempt to
-        // calculate a valid size, because we have no way to know if they keys between start
-        // and end are actually in use.
-        assert_eq!(range.page_count(), u32::MAX);
+        // Range spanning the end of one forkno and the start of the next, but not intersecting this shard's stripes
+        // This is technically an under-count, as the logical size key would be stored on this shard, but that's okay
+        // because page_count is allowed to under-count: it just mustn't over-count.
+        assert_eq!(range.page_count(), 0);
    }

    #[test]
@@ -1092,10 +1005,6 @@ mod tests {
        // Invariant: we always get at least one fragment
        assert!(!fragments.is_empty());

-        // Invariant: the first/last fragment start/end should equal the input start/end
-        assert_eq!(fragments.first().unwrap().1.start, range_start);
-        assert_eq!(fragments.last().unwrap().1.end, range_end);
-
        if page_count > 0 {
            // Invariant: every fragment must contain at least one shard-local page, if the
            // total range contains at least one shard-local page
@@ -1109,21 +1018,6 @@ mod tests {
            assert_eq!(fragments, vec![(0, range_start..range_end)]);
        }

-        // Invariant: fragments must be ordered and non-overlapping
-        let mut last: Option<Range<Key>> = None;
-        for frag in &fragments {
-            if let Some(last) = last {
-                assert!(frag.1.start >= last.end);
-                assert!(frag.1.start > last.start);
-            }
-            last = Some(frag.1.clone())
-        }
-
-        // Invariant: fragments respect target_nblocks
-        for frag in &fragments {
-            assert!(frag.0 == u32::MAX || frag.0 <= target_nblocks);
-        }
-
        (page_count, fragments)
    }

@@ -1206,39 +1100,6 @@ mod tests {
        );
    }

-    /// Test our calculations work correctly when we start a range from the logical size key of
-    /// a previous relation.
-    #[test]
-    fn sharded_range_fragment_starting_from_logical_size() {
-        let input_start = Key::from_hex("000000067f00000001000000ae00ffffffff").unwrap();
-        let input_end = Key::from_hex("000000067f00000001000000ae0100008000").unwrap();
-
-        // Shard 0 owns the first stripe in the relation, and the preceding logical size is shard local too
-        let shard_identity = ShardIdentity::new(
-            ShardNumber(0),
-            ShardCount::new(4),
-            ShardParameters::DEFAULT_STRIPE_SIZE,
-        )
-        .unwrap();
-        assert_eq!(
-            do_fragment(input_start, input_end, &shard_identity, 0x10000),
-            (0x8001, vec![(0x8001, input_start..input_end)])
-        );
-
-        // Shard 1 does not own the first stripe in the relation, but it does own the logical size (all shards
-        // store all logical sizes)
-        let shard_identity = ShardIdentity::new(
-            ShardNumber(1),
-            ShardCount::new(4),
-            ShardParameters::DEFAULT_STRIPE_SIZE,
-        )
-        .unwrap();
-        assert_eq!(
-            do_fragment(input_start, input_end, &shard_identity, 0x10000),
-            (0x1, vec![(0x1, input_start..input_end)])
-        );
-    }
-
    /// Test that ShardedRange behaves properly when used on un-sharded data
    #[test]
    fn sharded_range_fragment_unsharded() {
@@ -1282,79 +1143,4 @@ mod tests {
            (u32::MAX, vec![(u32::MAX, input_start..input_end),])
        );
    }
-
-    #[test]
-    fn sharded_range_fragment_tiny_nblocks() {
-        let shard_identity = ShardIdentity::unsharded();
-
-        // A range that spans relations: expect fragmentation to give up and return a u32::MAX size
-        let input_start = Key::from_hex("000000067F00000001000004E10000000000").unwrap();
-        let input_end = Key::from_hex("000000067F00000001000004E10000000038").unwrap();
-        assert_eq!(
-            do_fragment(input_start, input_end, &shard_identity, 16),
-            (
-                0x38,
-                vec![
-                    (16, input_start..input_start.add(16)),
-                    (16, input_start.add(16)..input_start.add(32)),
-                    (16, input_start.add(32)..input_start.add(48)),
-                    (8, input_start.add(48)..input_end),
-                ]
-            )
-        );
-    }
-
-    #[test]
-    fn sharded_range_fragment_fuzz() {
-        // Use a fixed seed: we don't want to explicitly pick values, but we do want
-        // the test to be reproducible.
-        let mut prng = rand::rngs::StdRng::seed_from_u64(0xdeadbeef);
-
-        for _i in 0..1000 {
-            let shard_identity = if prng.next_u32() % 2 == 0 {
-                ShardIdentity::unsharded()
-            } else {
-                let shard_count = prng.next_u32() % 127 + 1;
-                ShardIdentity::new(
-                    ShardNumber((prng.next_u32() % shard_count) as u8),
-                    ShardCount::new(shard_count as u8),
-                    ShardParameters::DEFAULT_STRIPE_SIZE,
-                )
-                .unwrap()
-            };
-
-            let target_nblocks = prng.next_u32() % 65536 + 1;
-
-            let start_offset = prng.next_u32() % 16384;
-
-            // Try ranges up to 4GiB in size, that are always at least 1
-            let range_size = prng.next_u32() % 8192 + 1;
-
-            // A range that spans relations: expect fragmentation to give up and return a u32::MAX size
-            let input_start = Key::from_hex("000000067F00000001000004E10000000000")
-                .unwrap()
-                .add(start_offset);
-            let input_end = input_start.add(range_size);
-
-            // This test's main success conditions are the invariants baked into do_fragment
-            let (_total_size, fragments) =
-                do_fragment(input_start, input_end, &shard_identity, target_nblocks);
-
-            // Pick a random key within the range and check it appears in the output
-            let example_key = input_start.add(prng.next_u32() % range_size);
-
-            // Panic on unwrap if it isn't found
-            let example_key_frag = fragments
-                .iter()
-                .find(|f| f.1.contains(&example_key))
-                .unwrap();
-
-            // Check that the fragment containing our random key has a nonzero size if
-            // that key is shard-local
-            let example_key_local = !shard_identity.is_key_disposable(&example_key);
-            if example_key_local {
-                assert!(example_key_frag.0 > 0);
-            }
-        }
-    }
 }
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -303,7 +303,6 @@ pub struct TenantConfig {
    pub lazy_slru_download: Option<bool>,
    pub timeline_get_throttle: Option<ThrottleConfig>,
    pub image_layer_creation_check_threshold: Option<u8>,
-    pub switch_to_aux_file_v2: Option<bool>,
 }

 #[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
@@ -782,17 +781,6 @@ pub struct SecondaryProgress {
    pub bytes_total: u64,
 }

-#[derive(Serialize, Deserialize, Debug)]
-pub struct TenantScanRemoteStorageShard {
-    pub tenant_shard_id: TenantShardId,
-    pub generation: Option<u32>,
-}
-
-#[derive(Serialize, Deserialize, Debug, Default)]
-pub struct TenantScanRemoteStorageResponse {
-    pub shards: Vec<TenantScanRemoteStorageShard>,
-}
-
 pub mod virtual_file {
    #[derive(
        Copy,
@@ -860,72 +848,39 @@ impl TryFrom<u8> for PagestreamBeMessageTag {
    }
 }

-// In the V2 protocol version, a GetPage request contains two LSN values:
-//
-// request_lsn: Get the page version at this point in time.  Lsn::Max is a special value that means
-// "get the latest version present". It's used by the primary server, which knows that no one else
-// is writing WAL. 'not_modified_since' must be set to a proper value even if request_lsn is
-// Lsn::Max. Standby servers use the current replay LSN as the request LSN.
-//
-// not_modified_since: Hint to the pageserver that the client knows that the page has not been
-// modified between 'not_modified_since' and the request LSN. It's always correct to set
-// 'not_modified_since equal' to 'request_lsn' (unless Lsn::Max is used as the 'request_lsn'), but
-// passing an earlier LSN can speed up the request, by allowing the pageserver to process the
-// request without waiting for 'request_lsn' to arrive.
-//
-// The legacy V1 interface contained only one LSN, and a boolean 'latest' flag. The V1 interface was
-// sufficient for the primary; the 'lsn' was equivalent to the 'not_modified_since' value, and
-// 'latest' was set to true. The V2 interface was added because there was no correct way for a
-// standby to request a page at a particular non-latest LSN, and also include the
-// 'not_modified_since' hint. That led to an awkward choice of either using an old LSN in the
-// request, if the standby knows that the page hasn't been modified since, and risk getting an error
-// if that LSN has fallen behind the GC horizon, or requesting the current replay LSN, which could
-// require the pageserver unnecessarily to wait for the WAL to arrive up to that point. The new V2
-// interface allows sending both LSNs, and let the pageserver do the right thing. There is no
-// difference in the responses between V1 and V2.
-//
-// The Request structs below reflect the V2 interface. If V1 is used, the parse function
-// maps the old format requests to the new format.
-//
-#[derive(Clone, Copy)]
-pub enum PagestreamProtocolVersion {
-    V1,
-    V2,
-}
-
 #[derive(Debug, PartialEq, Eq)]
 pub struct PagestreamExistsRequest {
-    pub request_lsn: Lsn,
-    pub not_modified_since: Lsn,
+    pub latest: bool,
+    pub lsn: Lsn,
    pub rel: RelTag,
 }

 #[derive(Debug, PartialEq, Eq)]
 pub struct PagestreamNblocksRequest {
-    pub request_lsn: Lsn,
-    pub not_modified_since: Lsn,
+    pub latest: bool,
+    pub lsn: Lsn,
    pub rel: RelTag,
 }

 #[derive(Debug, PartialEq, Eq)]
 pub struct PagestreamGetPageRequest {
-    pub request_lsn: Lsn,
-    pub not_modified_since: Lsn,
+    pub latest: bool,
+    pub lsn: Lsn,
    pub rel: RelTag,
    pub blkno: u32,
 }

 #[derive(Debug, PartialEq, Eq)]
 pub struct PagestreamDbSizeRequest {
-    pub request_lsn: Lsn,
-    pub not_modified_since: Lsn,
+    pub latest: bool,
+    pub lsn: Lsn,
    pub dbnode: u32,
 }

 #[derive(Debug, PartialEq, Eq)]
 pub struct PagestreamGetSlruSegmentRequest {
-    pub request_lsn: Lsn,
-    pub not_modified_since: Lsn,
+    pub latest: bool,
+    pub lsn: Lsn,
    pub kind: u8,
    pub segno: u32,
 }
@@ -972,16 +927,14 @@ pub struct TenantHistorySize {
 }

 impl PagestreamFeMessage {
-    /// Serialize a compute -> pageserver message. This is currently only used in testing
-    /// tools. Always uses protocol version 2.
    pub fn serialize(&self) -> Bytes {
        let mut bytes = BytesMut::new();

        match self {
            Self::Exists(req) => {
                bytes.put_u8(0);
-                bytes.put_u64(req.request_lsn.0);
-                bytes.put_u64(req.not_modified_since.0);
+                bytes.put_u8(u8::from(req.latest));
+                bytes.put_u64(req.lsn.0);
                bytes.put_u32(req.rel.spcnode);
                bytes.put_u32(req.rel.dbnode);
                bytes.put_u32(req.rel.relnode);
@@ -990,8 +943,8 @@ impl PagestreamFeMessage {

            Self::Nblocks(req) => {
                bytes.put_u8(1);
-                bytes.put_u64(req.request_lsn.0);
-                bytes.put_u64(req.not_modified_since.0);
+                bytes.put_u8(u8::from(req.latest));
+                bytes.put_u64(req.lsn.0);
                bytes.put_u32(req.rel.spcnode);
                bytes.put_u32(req.rel.dbnode);
                bytes.put_u32(req.rel.relnode);
@@ -1000,8 +953,8 @@ impl PagestreamFeMessage {

            Self::GetPage(req) => {
                bytes.put_u8(2);
-                bytes.put_u64(req.request_lsn.0);
-                bytes.put_u64(req.not_modified_since.0);
+                bytes.put_u8(u8::from(req.latest));
+                bytes.put_u64(req.lsn.0);
                bytes.put_u32(req.rel.spcnode);
                bytes.put_u32(req.rel.dbnode);
                bytes.put_u32(req.rel.relnode);
@@ -1011,15 +964,15 @@ impl PagestreamFeMessage {

            Self::DbSize(req) => {
                bytes.put_u8(3);
-                bytes.put_u64(req.request_lsn.0);
-                bytes.put_u64(req.not_modified_since.0);
+                bytes.put_u8(u8::from(req.latest));
+                bytes.put_u64(req.lsn.0);
                bytes.put_u32(req.dbnode);
            }

            Self::GetSlruSegment(req) => {
                bytes.put_u8(4);
-                bytes.put_u64(req.request_lsn.0);
-                bytes.put_u64(req.not_modified_since.0);
+                bytes.put_u8(u8::from(req.latest));
+                bytes.put_u64(req.lsn.0);
                bytes.put_u8(req.kind);
                bytes.put_u32(req.segno);
            }
@@ -1028,40 +981,18 @@ impl PagestreamFeMessage {
        bytes.into()
    }

-    pub fn parse<R: std::io::Read>(
-        body: &mut R,
-        protocol_version: PagestreamProtocolVersion,
-    ) -> anyhow::Result<PagestreamFeMessage> {
+    pub fn parse<R: std::io::Read>(body: &mut R) -> anyhow::Result<PagestreamFeMessage> {
+        // TODO these gets can fail
+
        // these correspond to the NeonMessageTag enum in pagestore_client.h
        //
        // TODO: consider using protobuf or serde bincode for less error prone
        // serialization.
        let msg_tag = body.read_u8()?;
-
-        let (request_lsn, not_modified_since) = match protocol_version {
-            PagestreamProtocolVersion::V2 => (
-                Lsn::from(body.read_u64::<BigEndian>()?),
-                Lsn::from(body.read_u64::<BigEndian>()?),
-            ),
-            PagestreamProtocolVersion::V1 => {
-                // In the old protocol, each message starts with a boolean 'latest' flag,
-                // followed by 'lsn'. Convert that to the two LSNs, 'request_lsn' and
-                // 'not_modified_since', used in the new protocol version.
-                let latest = body.read_u8()? != 0;
-                let request_lsn = Lsn::from(body.read_u64::<BigEndian>()?);
-                if latest {
-                    (Lsn::MAX, request_lsn) // get latest version
-                } else {
-                    (request_lsn, request_lsn) // get version at specified LSN
-                }
-            }
-        };
-
-        // The rest of the messages are the same between V1 and V2
        match msg_tag {
            0 => Ok(PagestreamFeMessage::Exists(PagestreamExistsRequest {
-                request_lsn,
-                not_modified_since,
+                latest: body.read_u8()? != 0,
+                lsn: Lsn::from(body.read_u64::<BigEndian>()?),
                rel: RelTag {
                    spcnode: body.read_u32::<BigEndian>()?,
                    dbnode: body.read_u32::<BigEndian>()?,
@@ -1070,8 +1001,8 @@ impl PagestreamFeMessage {
                },
            })),
            1 => Ok(PagestreamFeMessage::Nblocks(PagestreamNblocksRequest {
-                request_lsn,
-                not_modified_since,
+                latest: body.read_u8()? != 0,
+                lsn: Lsn::from(body.read_u64::<BigEndian>()?),
                rel: RelTag {
                    spcnode: body.read_u32::<BigEndian>()?,
                    dbnode: body.read_u32::<BigEndian>()?,
@@ -1080,8 +1011,8 @@ impl PagestreamFeMessage {
                },
            })),
            2 => Ok(PagestreamFeMessage::GetPage(PagestreamGetPageRequest {
-                request_lsn,
-                not_modified_since,
+                latest: body.read_u8()? != 0,
+                lsn: Lsn::from(body.read_u64::<BigEndian>()?),
                rel: RelTag {
                    spcnode: body.read_u32::<BigEndian>()?,
                    dbnode: body.read_u32::<BigEndian>()?,
@@ -1091,14 +1022,14 @@ impl PagestreamFeMessage {
                blkno: body.read_u32::<BigEndian>()?,
            })),
            3 => Ok(PagestreamFeMessage::DbSize(PagestreamDbSizeRequest {
-                request_lsn,
-                not_modified_since,
+                latest: body.read_u8()? != 0,
+                lsn: Lsn::from(body.read_u64::<BigEndian>()?),
                dbnode: body.read_u32::<BigEndian>()?,
            })),
            4 => Ok(PagestreamFeMessage::GetSlruSegment(
                PagestreamGetSlruSegmentRequest {
-                    request_lsn,
-                    not_modified_since,
+                    latest: body.read_u8()? != 0,
+                    lsn: Lsn::from(body.read_u64::<BigEndian>()?),
                    kind: body.read_u8()?,
                    segno: body.read_u32::<BigEndian>()?,
                },
@@ -1226,8 +1157,8 @@ mod tests {
        // Test serialization/deserialization of PagestreamFeMessage
        let messages = vec![
            PagestreamFeMessage::Exists(PagestreamExistsRequest {
-                request_lsn: Lsn(4),
-                not_modified_since: Lsn(3),
+                latest: true,
+                lsn: Lsn(4),
                rel: RelTag {
                    forknum: 1,
                    spcnode: 2,
@@ -1236,8 +1167,8 @@ mod tests {
                },
            }),
            PagestreamFeMessage::Nblocks(PagestreamNblocksRequest {
-                request_lsn: Lsn(4),
-                not_modified_since: Lsn(4),
+                latest: false,
+                lsn: Lsn(4),
                rel: RelTag {
                    forknum: 1,
                    spcnode: 2,
@@ -1246,8 +1177,8 @@ mod tests {
                },
            }),
            PagestreamFeMessage::GetPage(PagestreamGetPageRequest {
-                request_lsn: Lsn(4),
-                not_modified_since: Lsn(3),
+                latest: true,
+                lsn: Lsn(4),
                rel: RelTag {
                    forknum: 1,
                    spcnode: 2,
@@ -1257,16 +1188,14 @@ mod tests {
                blkno: 7,
            }),
            PagestreamFeMessage::DbSize(PagestreamDbSizeRequest {
-                request_lsn: Lsn(4),
-                not_modified_since: Lsn(3),
+                latest: true,
+                lsn: Lsn(4),
                dbnode: 7,
            }),
        ];
        for msg in messages {
            let bytes = msg.serialize();
-            let reconstructed =
-                PagestreamFeMessage::parse(&mut bytes.reader(), PagestreamProtocolVersion::V2)
-                    .unwrap();
+            let reconstructed = PagestreamFeMessage::parse(&mut bytes.reader()).unwrap();
            assert!(msg == reconstructed);
        }
    }
--- a/libs/pageserver_api/src/models/partitioning.rs
+++ b/libs/pageserver_api/src/models/partitioning.rs
@@ -1,11 +1,9 @@
 use utils::lsn::Lsn;

-use crate::keyspace::SparseKeySpace;
-
 #[derive(Debug, PartialEq, Eq)]
 pub struct Partitioning {
    pub keys: crate::keyspace::KeySpace,
-    pub sparse_keys: crate::keyspace::SparseKeySpace,
+
    pub at_lsn: Lsn,
 }

@@ -34,8 +32,6 @@ impl serde::Serialize for Partitioning {
        let mut map = serializer.serialize_map(Some(2))?;
        map.serialize_key("keys")?;
        map.serialize_value(&KeySpace(&self.keys))?;
-        map.serialize_key("sparse_keys")?;
-        map.serialize_value(&KeySpace(&self.sparse_keys.0))?;
        map.serialize_key("at_lsn")?;
        map.serialize_value(&WithDisplay(&self.at_lsn))?;
        map.end()
@@ -103,7 +99,6 @@ impl<'a> serde::Deserialize<'a> for Partitioning {
        #[derive(serde::Deserialize)]
        struct De {
            keys: KeySpace,
-            sparse_keys: KeySpace,
            #[serde_as(as = "serde_with::DisplayFromStr")]
            at_lsn: Lsn,
        }
@@ -112,7 +107,6 @@ impl<'a> serde::Deserialize<'a> for Partitioning {
        Ok(Self {
            at_lsn: de.at_lsn,
            keys: de.keys.0,
-            sparse_keys: SparseKeySpace(de.sparse_keys.0),
        })
    }
 }
@@ -139,12 +133,6 @@ mod tests {
                "030000000000000000000000000000000003"
              ]
            ],
-            "sparse_keys": [
-              [
-                "620000000000000000000000000000000000",
-                "620000000000000000000000000000000003"
-              ]
-            ],
            "at_lsn": "0/2240160"
        }
        "#;
--- a/libs/pageserver_api/src/shard.rs
+++ b/libs/pageserver_api/src/shard.rs
@@ -538,6 +538,24 @@ impl ShardIdentity {
        }
    }

+    /// Special case for issue `<https://github.com/neondatabase/neon/issues/7451>`
+    ///
+    /// When we fail to read a forknum block, this function tells us whether we may ignore the error
+    /// as a symptom of that issue.
+    pub fn is_key_buggy_forknum(&self, key: &Key) -> bool {
+        if !is_rel_block_key(key) || key.field5 != INIT_FORKNUM {
+            return false;
+        }
+
+        let mut hash = murmurhash32(key.field4);
+        hash = hash_combine(hash, murmurhash32(key.field6 / self.stripe_size.0));
+        let mapped_shard = ShardNumber((hash % self.count.0 as u32) as u8);
+
+        // The key may be affected by issue #7454: it is an initfork and it would not
+        // have mapped to shard 0 until we fixed that issue.
+        mapped_shard != ShardNumber(0)
+    }
+
    /// Return true if the key should be discarded if found in this shard's
    /// data store, e.g. during compaction after a split.
    ///
--- a/libs/utils/src/generation.rs
+++ b/libs/utils/src/generation.rs
@@ -34,8 +34,6 @@ pub enum Generation {
 /// scenarios where pageservers might otherwise issue conflicting writes to
 /// remote storage
 impl Generation {
-    pub const MAX: Self = Self::Valid(u32::MAX);
-
    /// Create a new Generation that represents a legacy key format with
    /// no generation suffix
    pub fn none() -> Self {
--- a/libs/utils/src/seqwait.rs
+++ b/libs/utils/src/seqwait.rs
@@ -2,10 +2,11 @@

 use std::cmp::{Eq, Ordering};
 use std::collections::BinaryHeap;
+use std::fmt::Debug;
 use std::mem;
 use std::sync::Mutex;
 use std::time::Duration;
-use tokio::sync::watch::{self, channel};
+use tokio::sync::watch::{channel, Receiver, Sender};
 use tokio::time::timeout;

 /// An error happened while waiting for a number
@@ -34,73 +35,23 @@ pub trait MonotonicCounter<V> {
    fn cnt_value(&self) -> V;
 }

-/// Heap of waiters, lowest numbers pop first.
-struct Waiters<V>
+/// Internal components of a `SeqWait`
+struct SeqWaitInt<S, V>
 where
+    S: MonotonicCounter<V>,
    V: Ord,
 {
-    heap: BinaryHeap<Waiter<V>>,
-    /// Number of the first waiter in the heap, or None if there are no waiters.
-    status_channel: watch::Sender<Option<V>>,
-}
-
-impl<V> Waiters<V>
-where
-    V: Ord + Copy,
-{
-    fn new() -> Self {
-        Waiters {
-            heap: BinaryHeap::new(),
-            status_channel: channel(None).0,
-        }
-    }
-
-    /// `status_channel` contains the number of the first waiter in the heap.
-    /// This function should be called whenever waiters heap changes.
-    fn update_status(&self) {
-        let first_waiter = self.heap.peek().map(|w| w.wake_num);
-        let _ = self.status_channel.send_replace(first_waiter);
-    }
-
-    /// Add new waiter to the heap, return a channel that will be notified when the number arrives.
-    fn add(&mut self, num: V) -> watch::Receiver<()> {
-        let (tx, rx) = channel(());
-        self.heap.push(Waiter {
-            wake_num: num,
-            wake_channel: tx,
-        });
-        self.update_status();
-        rx
-    }
-
-    /// Pop all waiters <= num from the heap. Collect channels in a vector,
-    /// so that caller can wake them up.
-    fn pop_leq(&mut self, num: V) -> Vec<watch::Sender<()>> {
-        let mut wake_these = Vec::new();
-        while let Some(n) = self.heap.peek() {
-            if n.wake_num > num {
-                break;
-            }
-            wake_these.push(self.heap.pop().unwrap().wake_channel);
-        }
-        self.update_status();
-        wake_these
-    }
-
-    /// Used on shutdown to efficiently drop all waiters.
-    fn take_all(&mut self) -> BinaryHeap<Waiter<V>> {
-        let heap = mem::take(&mut self.heap);
-        self.update_status();
-        heap
-    }
+    waiters: BinaryHeap<Waiter<V>>,
+    current: S,
+    shutdown: bool,
 }

 struct Waiter<T>
 where
    T: Ord,
 {
-    wake_num: T,                     // wake me when this number arrives ...
-    wake_channel: watch::Sender<()>, // ... by sending a message to this channel
+    wake_num: T,              // wake me when this number arrives ...
+    wake_channel: Sender<()>, // ... by sending a message to this channel
 }

 // BinaryHeap is a max-heap, and we want a min-heap. Reverse the ordering here
@@ -125,17 +76,6 @@ impl<T: Ord> PartialEq for Waiter<T> {

 impl<T: Ord> Eq for Waiter<T> {}

-/// Internal components of a `SeqWait`
-struct SeqWaitInt<S, V>
-where
-    S: MonotonicCounter<V>,
-    V: Ord,
-{
-    waiters: Waiters<V>,
-    current: S,
-    shutdown: bool,
-}
-
 /// A tool for waiting on a sequence number
 ///
 /// This provides a way to wait the arrival of a number.
@@ -168,7 +108,7 @@ where
    /// Create a new `SeqWait`, initialized to a particular number
    pub fn new(starting_num: S) -> Self {
        let internal = SeqWaitInt {
-            waiters: Waiters::new(),
+            waiters: BinaryHeap::new(),
            current: starting_num,
            shutdown: false,
        };
@@ -188,8 +128,9 @@ where
            // Block any future waiters from starting
            internal.shutdown = true;

-            // Take all waiters to drop them later.
-            internal.waiters.take_all()
+            // This will steal the entire waiters heap.
+            // When we drop it, all waiters will be woken.
+            mem::take(&mut internal.waiters)

            // Drop the lock as we exit this scope.
        };
@@ -255,7 +196,7 @@ where

    /// Register and return a channel that will be notified when a number arrives,
    /// or None, if it has already arrived.
-    fn queue_for_wait(&self, num: V) -> Result<Option<watch::Receiver<()>>, SeqWaitError> {
+    fn queue_for_wait(&self, num: V) -> Result<Option<Receiver<()>>, SeqWaitError> {
        let mut internal = self.internal.lock().unwrap();
        if internal.current.cnt_value() >= num {
            return Ok(None);
@@ -264,8 +205,12 @@ where
            return Err(SeqWaitError::Shutdown);
        }

-        // Add waiter channel to the queue.
-        let rx = internal.waiters.add(num);
+        // Create a new channel.
+        let (tx, rx) = channel(());
+        internal.waiters.push(Waiter {
+            wake_num: num,
+            wake_channel: tx,
+        });
        // Drop the lock as we exit this scope.
        Ok(Some(rx))
    }
@@ -286,8 +231,16 @@ where
            }
            internal.current.cnt_advance(num);

-            // Pop all waiters <= num from the heap.
-            internal.waiters.pop_leq(num)
+            // Pop all waiters <= num from the heap. Collect them in a vector, and
+            // wake them up after releasing the lock.
+            let mut wake_these = Vec::new();
+            while let Some(n) = internal.waiters.peek() {
+                if n.wake_num > num {
+                    break;
+                }
+                wake_these.push(internal.waiters.pop().unwrap().wake_channel);
+            }
+            wake_these
        };

        for tx in wake_these {
@@ -302,23 +255,6 @@ where
    pub fn load(&self) -> S {
        self.internal.lock().unwrap().current
    }
-
-    /// Get a Receiver for the current status.
-    ///
-    /// The current status is the number of the first waiter in the queue,
-    /// or None if there are no waiters.
-    ///
-    /// This receiver will be notified whenever the status changes.
-    /// It is useful for receiving notifications when the first waiter
-    /// starts waiting for a number, or when there are no more waiters left.
-    pub fn status_receiver(&self) -> watch::Receiver<Option<V>> {
-        self.internal
-            .lock()
-            .unwrap()
-            .waiters
-            .status_channel
-            .subscribe()
-    }
 }

 #[cfg(test)]
--- a/pageserver/client/src/mgmt_api.rs
+++ b/pageserver/client/src/mgmt_api.rs
@@ -243,19 +243,6 @@ impl Client {
        Ok(())
    }

-    pub async fn tenant_scan_remote_storage(
-        &self,
-        tenant_id: TenantId,
-    ) -> Result<TenantScanRemoteStorageResponse> {
-        let uri = format!(
-            "{}/v1/tenant/{tenant_id}/scan_remote_storage",
-            self.mgmt_api_endpoint
-        );
-        let response = self.request(Method::GET, &uri, ()).await?;
-        let body = response.json().await.map_err(Error::ReceiveBody)?;
-        Ok(body)
-    }
-
    pub async fn tenant_config(&self, req: &TenantConfigRequest) -> Result<()> {
        let uri = format!("{}/v1/tenant/config", self.mgmt_api_endpoint);
        self.request(Method::PUT, &uri, req).await?;
--- a/pageserver/client/src/page_service.rs
+++ b/pageserver/client/src/page_service.rs
@@ -60,7 +60,7 @@ impl Client {
    ) -> anyhow::Result<PagestreamClient> {
        let copy_both: tokio_postgres::CopyBothDuplex<bytes::Bytes> = self
            .client
-            .copy_both_simple(&format!("pagestream_v2 {tenant_id} {timeline_id}"))
+            .copy_both_simple(&format!("pagestream {tenant_id} {timeline_id}"))
            .await?;
        let Client {
            cancel_on_client_drop,
--- a/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs
+++ b/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs
@@ -312,12 +312,8 @@ async fn main_impl(
                    let (rel_tag, block_no) =
                        key_to_rel_block(key).expect("we filter non-rel-block keys out above");
                    PagestreamGetPageRequest {
-                        request_lsn: if rng.gen_bool(args.req_latest_probability) {
-                            Lsn::MAX
-                        } else {
-                            r.timeline_lsn
-                        },
-                        not_modified_since: r.timeline_lsn,
+                        latest: rng.gen_bool(args.req_latest_probability),
+                        lsn: r.timeline_lsn,
                        rel: rel_tag,
                        blkno: block_no,
                    }
--- a/pageserver/src/aux_file.rs
+++ b/pageserver/src/aux_file.rs
@@ -85,27 +85,27 @@ mod tests {
        // To correct retrieve AUX files, the generated keys for the same file must be the same for all versions
        // of the page server.
        assert_eq!(
-            "6200000101E5B20C5F8DD5AA3289D6D9EAFA",
+            "8200000101E5B20C5F8DD5AA3289D6D9EAFA",
            encode_aux_file_key("pg_logical/mappings/test1").to_string()
        );
        assert_eq!(
-            "620000010239AAC544893139B26F501B97E6",
+            "820000010239AAC544893139B26F501B97E6",
            encode_aux_file_key("pg_logical/snapshots/test2").to_string()
        );
        assert_eq!(
-            "620000010300000000000000000000000000",
+            "820000010300000000000000000000000000",
            encode_aux_file_key("pg_logical/replorigin_checkpoint").to_string()
        );
        assert_eq!(
-            "62000001FF8635AF2134B7266EC5B4189FD6",
+            "82000001FF8635AF2134B7266EC5B4189FD6",
            encode_aux_file_key("pg_logical/unsupported").to_string()
        );
        assert_eq!(
-            "6200000201772D0E5D71DE14DA86142A1619",
+            "8200000201772D0E5D71DE14DA86142A1619",
            encode_aux_file_key("pg_replslot/test3").to_string()
        );
        assert_eq!(
-            "620000FFFF1866EBEB53B807B26A2416F317",
+            "820000FFFF1866EBEB53B807B26A2416F317",
            encode_aux_file_key("other_file_not_supported").to_string()
        );
    }
--- a/pageserver/src/basebackup.rs
+++ b/pageserver/src/basebackup.rs
@@ -13,7 +13,7 @@
 use anyhow::{anyhow, bail, ensure, Context};
 use bytes::{BufMut, Bytes, BytesMut};
 use fail::fail_point;
-use pageserver_api::key::{key_to_slru_block, Key};
+use pageserver_api::key::{key_to_slru_block, rel_block_to_key, Key};
 use postgres_ffi::pg_constants;
 use std::fmt::Write as FmtWrite;
 use std::time::SystemTime;
@@ -300,7 +300,20 @@ where
                if rel.forknum == INIT_FORKNUM {
                    // I doubt we need _init fork itself, but having it at least
                    // serves as a marker relation is unlogged.
-                    self.add_rel(rel, rel).await?;
+                    if let Err(_e) = self.add_rel(rel, rel).await {
+                        if self
+                            .timeline
+                            .get_shard_identity()
+                            .is_key_buggy_forknum(&rel_block_to_key(rel, 0x0))
+                        {
+                            // Workaround https://github.com/neondatabase/neon/issues/7451 -- if we have an unlogged relation
+                            // whose INIT_FORKNUM is not correctly on shard zero, then omit it in the basebackup.  This allows
+                            // postgres to start up.  The relation won't work, but it will be possible to DROP TABLE on it and
+                            // recreate.
+                            tracing::warn!("Omitting relation {rel} for issue #7451: drop and recreate this unlogged relation");
+                            continue;
+                        }
+                    };
                    self.add_rel(rel, rel.with_forknum(MAIN_FORKNUM)).await?;
                    continue;
                }
@@ -366,7 +379,7 @@ where
    async fn add_rel(&mut self, src: RelTag, dst: RelTag) -> anyhow::Result<()> {
        let nblocks = self
            .timeline
-            .get_rel_size(src, Version::Lsn(self.lsn), self.ctx)
+            .get_rel_size(src, Version::Lsn(self.lsn), false, self.ctx)
            .await?;

        // If the relation is empty, create an empty file
@@ -387,7 +400,7 @@ where
            for blknum in startblk..endblk {
                let img = self
                    .timeline
-                    .get_rel_page_at_lsn(src, blknum, Version::Lsn(self.lsn), self.ctx)
+                    .get_rel_page_at_lsn(src, blknum, Version::Lsn(self.lsn), false, self.ctx)
                    .await?;
                segment_data.extend_from_slice(&img[..]);
            }
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -19,8 +19,6 @@ use pageserver_api::models::LocationConfigListResponse;
 use pageserver_api::models::ShardParameters;
 use pageserver_api::models::TenantDetails;
 use pageserver_api::models::TenantLocationConfigResponse;
-use pageserver_api::models::TenantScanRemoteStorageResponse;
-use pageserver_api::models::TenantScanRemoteStorageShard;
 use pageserver_api::models::TenantShardLocation;
 use pageserver_api::models::TenantShardSplitRequest;
 use pageserver_api::models::TenantShardSplitResponse;
@@ -31,7 +29,6 @@ use pageserver_api::models::{
 };
 use pageserver_api::shard::ShardCount;
 use pageserver_api::shard::TenantShardId;
-use remote_storage::DownloadError;
 use remote_storage::GenericRemoteStorage;
 use remote_storage::TimeTravelError;
 use tenant_size_model::{SizeResult, StorageModel};
@@ -57,9 +54,6 @@ use crate::tenant::mgr::{
 };
 use crate::tenant::mgr::{TenantSlot, UpsertLocationError};
 use crate::tenant::remote_timeline_client;
-use crate::tenant::remote_timeline_client::download_index_part;
-use crate::tenant::remote_timeline_client::list_remote_tenant_shards;
-use crate::tenant::remote_timeline_client::list_remote_timelines;
 use crate::tenant::secondary::SecondaryController;
 use crate::tenant::size::ModelInputs;
 use crate::tenant::storage_layer::LayerAccessStatsReset;
@@ -1918,14 +1912,12 @@ async fn timeline_collect_keyspace(
        let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
        let timeline = active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id).await?;
        let at_lsn = at_lsn.unwrap_or_else(|| timeline.get_last_record_lsn());
-        let (dense_ks, sparse_ks) = timeline
+        let keys = timeline
            .collect_keyspace(at_lsn, &ctx)
            .await
            .map_err(|e| ApiError::InternalServerError(e.into()))?;

-        // This API is currently used by pagebench. Pagebench will iterate all keys within the keyspace.
-        // Therefore, we split dense/sparse keys in this API.
-        let res = pageserver_api::models::partitioning::Partitioning { keys: dense_ks, sparse_keys: sparse_ks, at_lsn };
+        let res = pageserver_api::models::partitioning::Partitioning { keys, at_lsn };

        json_response(StatusCode::OK, res)
    }
@@ -2043,79 +2035,6 @@ async fn secondary_upload_handler(
    json_response(StatusCode::OK, ())
 }

-async fn tenant_scan_remote_handler(
-    request: Request<Body>,
-    cancel: CancellationToken,
-) -> Result<Response<Body>, ApiError> {
-    let state = get_state(&request);
-    let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
-
-    let Some(remote_storage) = state.remote_storage.as_ref() else {
-        return Err(ApiError::BadRequest(anyhow::anyhow!(
-            "Remote storage not configured"
-        )));
-    };
-
-    let mut response = TenantScanRemoteStorageResponse::default();
-
-    let (shards, _other_keys) =
-        list_remote_tenant_shards(remote_storage, tenant_id, cancel.clone())
-            .await
-            .map_err(|e| ApiError::InternalServerError(anyhow::anyhow!(e)))?;
-
-    for tenant_shard_id in shards {
-        let (timeline_ids, _other_keys) =
-            list_remote_timelines(remote_storage, tenant_shard_id, cancel.clone())
-                .await
-                .map_err(|e| ApiError::InternalServerError(anyhow::anyhow!(e)))?;
-
-        let mut generation = Generation::none();
-        for timeline_id in timeline_ids {
-            match download_index_part(
-                remote_storage,
-                &tenant_shard_id,
-                &timeline_id,
-                Generation::MAX,
-                &cancel,
-            )
-            .instrument(info_span!("download_index_part",
-                         tenant_id=%tenant_shard_id.tenant_id,
-                         shard_id=%tenant_shard_id.shard_slug(),
-                         %timeline_id))
-            .await
-            {
-                Ok((index_part, index_generation)) => {
-                    tracing::info!("Found timeline {tenant_shard_id}/{timeline_id} metadata (gen {index_generation:?}, {} layers, {} consistent LSN)",
-                        index_part.layer_metadata.len(), index_part.get_disk_consistent_lsn());
-                    generation = std::cmp::max(generation, index_generation);
-                }
-                Err(DownloadError::NotFound) => {
-                    // This is normal for tenants that were created with multiple shards: they have an unsharded path
-                    // containing the timeline's initdb tarball but no index.  Otherwise it is a bit strange.
-                    tracing::info!("Timeline path {tenant_shard_id}/{timeline_id} exists in remote storage but has no index, skipping");
-                    continue;
-                }
-                Err(e) => {
-                    return Err(ApiError::InternalServerError(anyhow::anyhow!(e)));
-                }
-            };
-        }
-
-        response.shards.push(TenantScanRemoteStorageShard {
-            tenant_shard_id,
-            generation: generation.into(),
-        });
-    }
-
-    if response.shards.is_empty() {
-        return Err(ApiError::NotFound(
-            anyhow::anyhow!("No shards found for tenant ID {tenant_id}").into(),
-        ));
-    }
-
-    json_response(StatusCode::OK, response)
-}
-
 async fn secondary_download_handler(
    request: Request<Body>,
    _cancel: CancellationToken,
@@ -2512,9 +2431,6 @@ pub fn make_router(
        .post("/v1/tenant/:tenant_shard_id/heatmap_upload", |r| {
            api_handler(r, secondary_upload_handler)
        })
-        .get("/v1/tenant/:tenant_id/scan_remote_storage", |r| {
-            api_handler(r, tenant_scan_remote_handler)
-        })
        .put("/v1/disk_usage_eviction/run", |r| {
            api_handler(r, disk_usage_eviction_run)
        })
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -51,9 +51,6 @@ pub(crate) enum StorageTimeOperation {
    #[strum(serialize = "gc")]
    Gc,

-    #[strum(serialize = "update gc info")]
-    UpdateGcInfo,
-
    #[strum(serialize = "create tenant")]
    CreateTenant,
 }
@@ -1913,22 +1910,6 @@ impl StorageTimeMetricsTimer {
        self.metrics.timeline_count.inc();
        self.metrics.global_histogram.observe(duration);
    }
-
-    /// Turns this timer into a timer, which will always record -- usually this means recording
-    /// regardless an early `?` path was taken in a function.
-    pub(crate) fn record_on_drop(self) -> AlwaysRecordingStorageTimeMetricsTimer {
-        AlwaysRecordingStorageTimeMetricsTimer(Some(self))
-    }
-}
-
-pub(crate) struct AlwaysRecordingStorageTimeMetricsTimer(Option<StorageTimeMetricsTimer>);
-
-impl Drop for AlwaysRecordingStorageTimeMetricsTimer {
-    fn drop(&mut self) {
-        if let Some(inner) = self.0.take() {
-            inner.stop_and_record();
-        }
-    }
 }

 /// Timing facilities for an globally histogrammed metric, which is supported by per tenant and
@@ -1989,7 +1970,6 @@ pub(crate) struct TimelineMetrics {
    pub imitate_logical_size_histo: StorageTimeMetrics,
    pub load_layer_map_histo: StorageTimeMetrics,
    pub garbage_collect_histo: StorageTimeMetrics,
-    pub update_gc_info_histo: StorageTimeMetrics,
    pub last_record_gauge: IntGauge,
    resident_physical_size_gauge: UIntGauge,
    /// copy of LayeredTimeline.current_logical_size
@@ -2050,12 +2030,6 @@ impl TimelineMetrics {
            &shard_id,
            &timeline_id,
        );
-        let update_gc_info_histo = StorageTimeMetrics::new(
-            StorageTimeOperation::UpdateGcInfo,
-            &tenant_id,
-            &shard_id,
-            &timeline_id,
-        );
        let last_record_gauge = LAST_RECORD_LSN
            .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
            .unwrap();
@@ -2098,7 +2072,6 @@ impl TimelineMetrics {
            logical_size_histo,
            imitate_logical_size_histo,
            garbage_collect_histo,
-            update_gc_info_histo,
            load_layer_map_histo,
            last_record_gauge,
            resident_physical_size_gauge,
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
@@ -1,5 +1,13 @@
+//
 //! The Page Service listens for client connections and serves their GetPage@LSN
 //! requests.
+//
+//   It is possible to connect here using the usual psql/pgbench/libpq clients.
+// The following commands are currently supported:
+//     *status* -- show current info about this pageserver,
+//     *pagestream* -- enter a mode where smgr and pageserver talk using their
+//  custom protocol.
+//

 use anyhow::Context;
 use async_compression::tokio::write::GzipEncoder;
@@ -15,7 +23,7 @@ use pageserver_api::models::{
    PagestreamErrorResponse, PagestreamExistsRequest, PagestreamExistsResponse,
    PagestreamFeMessage, PagestreamGetPageRequest, PagestreamGetPageResponse,
    PagestreamGetSlruSegmentRequest, PagestreamGetSlruSegmentResponse, PagestreamNblocksRequest,
-    PagestreamNblocksResponse, PagestreamProtocolVersion,
+    PagestreamNblocksResponse,
 };
 use pageserver_api::shard::ShardIndex;
 use pageserver_api::shard::ShardNumber;
@@ -543,7 +551,6 @@ impl PageServerHandler {
        pgb: &mut PostgresBackend<IO>,
        tenant_id: TenantId,
        timeline_id: TimelineId,
-        protocol_version: PagestreamProtocolVersion,
        ctx: RequestContext,
    ) -> Result<(), QueryError>
    where
@@ -606,15 +613,14 @@ impl PageServerHandler {
                t.trace(&copy_data_bytes)
            }

-            let neon_fe_msg =
-                PagestreamFeMessage::parse(&mut copy_data_bytes.reader(), protocol_version)?;
+            let neon_fe_msg = PagestreamFeMessage::parse(&mut copy_data_bytes.reader())?;

            // TODO: We could create a new per-request context here, with unique ID.
            // Currently we use the same per-timeline context for all requests

            let (response, span) = match neon_fe_msg {
                PagestreamFeMessage::Exists(req) => {
-                    let span = tracing::info_span!("handle_get_rel_exists_request", rel = %req.rel, req_lsn = %req.request_lsn);
+                    let span = tracing::info_span!("handle_get_rel_exists_request", rel = %req.rel, req_lsn = %req.lsn);
                    (
                        self.handle_get_rel_exists_request(tenant_id, timeline_id, &req, &ctx)
                            .instrument(span.clone())
@@ -623,7 +629,7 @@ impl PageServerHandler {
                    )
                }
                PagestreamFeMessage::Nblocks(req) => {
-                    let span = tracing::info_span!("handle_get_nblocks_request", rel = %req.rel, req_lsn = %req.request_lsn);
+                    let span = tracing::info_span!("handle_get_nblocks_request", rel = %req.rel, req_lsn = %req.lsn);
                    (
                        self.handle_get_nblocks_request(tenant_id, timeline_id, &req, &ctx)
                            .instrument(span.clone())
@@ -633,7 +639,7 @@ impl PageServerHandler {
                }
                PagestreamFeMessage::GetPage(req) => {
                    // shard_id is filled in by the handler
-                    let span = tracing::info_span!("handle_get_page_at_lsn_request", rel = %req.rel, blkno = %req.blkno, req_lsn = %req.request_lsn);
+                    let span = tracing::info_span!("handle_get_page_at_lsn_request", rel = %req.rel, blkno = %req.blkno, req_lsn = %req.lsn);
                    (
                        self.handle_get_page_at_lsn_request(tenant_id, timeline_id, &req, &ctx)
                            .instrument(span.clone())
@@ -642,7 +648,7 @@ impl PageServerHandler {
                    )
                }
                PagestreamFeMessage::DbSize(req) => {
-                    let span = tracing::info_span!("handle_db_size_request", dbnode = %req.dbnode, req_lsn = %req.request_lsn);
+                    let span = tracing::info_span!("handle_db_size_request", dbnode = %req.dbnode, req_lsn = %req.lsn);
                    (
                        self.handle_db_size_request(tenant_id, timeline_id, &req, &ctx)
                            .instrument(span.clone())
@@ -651,7 +657,7 @@ impl PageServerHandler {
                    )
                }
                PagestreamFeMessage::GetSlruSegment(req) => {
-                    let span = tracing::info_span!("handle_get_slru_segment_request", kind = %req.kind, segno = %req.segno, req_lsn = %req.request_lsn);
+                    let span = tracing::info_span!("handle_get_slru_segment_request", kind = %req.kind, segno = %req.segno, req_lsn = %req.lsn);
                    (
                        self.handle_get_slru_segment_request(tenant_id, timeline_id, &req, &ctx)
                            .instrument(span.clone())
@@ -832,80 +838,83 @@ impl PageServerHandler {
    /// Helper function to handle the LSN from client request.
    ///
    /// Each GetPage (and Exists and Nblocks) request includes information about
-    /// which version of the page is being requested. The primary compute node
-    /// will always request the latest page version, by setting 'request_lsn' to
-    /// the last inserted or flushed WAL position, while a standby will request
-    /// a version at the LSN that it's currently caught up to.
+    /// which version of the page is being requested. The client can request the
+    /// latest version of the page, or the version that's valid at a particular
+    /// LSN. The primary compute node will always request the latest page
+    /// version, while a standby will request a version at the LSN that it's
+    /// currently caught up to.
    ///
    /// In either case, if the page server hasn't received the WAL up to the
    /// requested LSN yet, we will wait for it to arrive. The return value is
    /// the LSN that should be used to look up the page versions.
-    ///
-    /// In addition to the request LSN, each request carries another LSN,
-    /// 'not_modified_since', which is a hint to the pageserver that the client
-    /// knows that the page has not been modified between 'not_modified_since'
-    /// and the request LSN. This allows skipping the wait, as long as the WAL
-    /// up to 'not_modified_since' has arrived. If the client doesn't have any
-    /// information about when the page was modified, it will use
-    /// not_modified_since == lsn. If the client lies and sends a too low
-    /// not_modified_hint such that there are in fact later page versions, the
-    /// behavior is undefined: the pageserver may return any of the page versions
-    /// or an error.
    async fn wait_or_get_last_lsn(
        timeline: &Timeline,
-        request_lsn: Lsn,
-        not_modified_since: Lsn,
+        mut lsn: Lsn,
+        latest: bool,
        latest_gc_cutoff_lsn: &RcuReadGuard<Lsn>,
        ctx: &RequestContext,
    ) -> Result<Lsn, PageStreamError> {
-        let last_record_lsn = timeline.get_last_record_lsn();
+        if latest {
+            // Latest page version was requested. If LSN is given, it is a hint
+            // to the page server that there have been no modifications to the
+            // page after that LSN. If we haven't received WAL up to that point,
+            // wait until it arrives.
+            let last_record_lsn = timeline.get_last_record_lsn();

-        // Sanity check the request
-        if request_lsn < not_modified_since {
-            return Err(PageStreamError::BadRequest(
-                format!(
-                    "invalid request with request LSN {} and not_modified_since {}",
-                    request_lsn, not_modified_since,
-                )
-                .into(),
-            ));
-        }
-
-        if request_lsn < **latest_gc_cutoff_lsn {
-            // Check explicitly for INVALID just to get a less scary error message if the
-            // request is obviously bogus
-            return Err(if request_lsn == Lsn::INVALID {
-                PageStreamError::BadRequest("invalid LSN(0) in request".into())
+            // Note: this covers the special case that lsn == Lsn(0). That
+            // special case means "return the latest version whatever it is",
+            // and it's used for bootstrapping purposes, when the page server is
+            // connected directly to the compute node. That is needed because
+            // when you connect to the compute node, to receive the WAL, the
+            // walsender process will do a look up in the pg_authid catalog
+            // table for authentication. That poses a deadlock problem: the
+            // catalog table lookup will send a GetPage request, but the GetPage
+            // request will block in the page server because the recent WAL
+            // hasn't been received yet, and it cannot be received until the
+            // walsender completes the authentication and starts streaming the
+            // WAL.
+            if lsn <= last_record_lsn {
+                // It might be better to use max(lsn, latest_gc_cutoff_lsn) instead of
+                // last_record_lsn. That would give the same result, since we know
+                // that there haven't been modifications since 'lsn'. Using an older
+                // LSN might be faster, because that could allow skipping recent
+                // layers when finding the page.
+                lsn = last_record_lsn;
            } else {
-                PageStreamError::BadRequest(format!(
-                        "tried to request a page version that was garbage collected. requested at {} gc cutoff {}",
-                        request_lsn, **latest_gc_cutoff_lsn
-                    ).into())
-            });
-        }
-
-        // Wait for WAL up to 'not_modified_since' to arrive, if necessary
-        if not_modified_since > last_record_lsn {
+                timeline
+                    .wait_lsn(
+                        lsn,
+                        crate::tenant::timeline::WaitLsnWaiter::PageService,
+                        ctx,
+                    )
+                    .await?;
+                // Since we waited for 'lsn' to arrive, that is now the last
+                // record LSN. (Or close enough for our purposes; the
+                // last-record LSN can advance immediately after we return
+                // anyway)
+            }
+        } else {
+            if lsn == Lsn(0) {
+                return Err(PageStreamError::BadRequest(
+                    "invalid LSN(0) in request".into(),
+                ));
+            }
            timeline
                .wait_lsn(
-                    not_modified_since,
+                    lsn,
                    crate::tenant::timeline::WaitLsnWaiter::PageService,
                    ctx,
                )
                .await?;
-            // Since we waited for 'not_modified_since' to arrive, that is now the last
-            // record LSN. (Or close enough for our purposes; the last-record LSN can
-            // advance immediately after we return anyway)
-            Ok(not_modified_since)
-        } else {
-            // It might be better to use max(not_modified_since, latest_gc_cutoff_lsn)
-            // here instead. That would give the same result, since we know that there
-            // haven't been any modifications since 'not_modified_since'. Using an older
-            // LSN might be faster, because that could allow skipping recent layers when
-            // finding the page. However, we have historically used 'last_record_lsn', so
-            // stick to that for now.
-            Ok(std::cmp::min(last_record_lsn, request_lsn))
        }
+
+        if lsn < **latest_gc_cutoff_lsn {
+            return Err(PageStreamError::BadRequest(format!(
+                "tried to request a page version that was garbage collected. requested at {} gc cutoff {}",
+                lsn, **latest_gc_cutoff_lsn
+            ).into()));
+        }
+        Ok(lsn)
    }

    #[instrument(skip_all, fields(shard_id))]
@@ -922,17 +931,12 @@ impl PageServerHandler {
            .start_timer(metrics::SmgrQueryType::GetRelExists, ctx);

        let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
-        let lsn = Self::wait_or_get_last_lsn(
-            timeline,
-            req.request_lsn,
-            req.not_modified_since,
-            &latest_gc_cutoff_lsn,
-            ctx,
-        )
-        .await?;
+        let lsn =
+            Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn, ctx)
+                .await?;

        let exists = timeline
-            .get_rel_exists(req.rel, Version::Lsn(lsn), ctx)
+            .get_rel_exists(req.rel, Version::Lsn(lsn), req.latest, ctx)
            .await?;

        Ok(PagestreamBeMessage::Exists(PagestreamExistsResponse {
@@ -955,17 +959,12 @@ impl PageServerHandler {
            .start_timer(metrics::SmgrQueryType::GetRelSize, ctx);

        let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
-        let lsn = Self::wait_or_get_last_lsn(
-            timeline,
-            req.request_lsn,
-            req.not_modified_since,
-            &latest_gc_cutoff_lsn,
-            ctx,
-        )
-        .await?;
+        let lsn =
+            Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn, ctx)
+                .await?;

        let n_blocks = timeline
-            .get_rel_size(req.rel, Version::Lsn(lsn), ctx)
+            .get_rel_size(req.rel, Version::Lsn(lsn), req.latest, ctx)
            .await?;

        Ok(PagestreamBeMessage::Nblocks(PagestreamNblocksResponse {
@@ -988,17 +987,18 @@ impl PageServerHandler {
            .start_timer(metrics::SmgrQueryType::GetDbSize, ctx);

        let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
-        let lsn = Self::wait_or_get_last_lsn(
-            timeline,
-            req.request_lsn,
-            req.not_modified_since,
-            &latest_gc_cutoff_lsn,
-            ctx,
-        )
-        .await?;
+        let lsn =
+            Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn, ctx)
+                .await?;

        let total_blocks = timeline
-            .get_db_size(DEFAULTTABLESPACE_OID, req.dbnode, Version::Lsn(lsn), ctx)
+            .get_db_size(
+                DEFAULTTABLESPACE_OID,
+                req.dbnode,
+                Version::Lsn(lsn),
+                req.latest,
+                ctx,
+            )
            .await?;
        let db_size = total_blocks as i64 * BLCKSZ as i64;

@@ -1165,17 +1165,12 @@ impl PageServerHandler {
            .start_timer(metrics::SmgrQueryType::GetPageAtLsn, ctx);

        let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
-        let lsn = Self::wait_or_get_last_lsn(
-            timeline,
-            req.request_lsn,
-            req.not_modified_since,
-            &latest_gc_cutoff_lsn,
-            ctx,
-        )
-        .await?;
+        let lsn =
+            Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn, ctx)
+                .await?;

        let page = timeline
-            .get_rel_page_at_lsn(req.rel, req.blkno, Version::Lsn(lsn), ctx)
+            .get_rel_page_at_lsn(req.rel, req.blkno, Version::Lsn(lsn), req.latest, ctx)
            .await?;

        Ok(PagestreamBeMessage::GetPage(PagestreamGetPageResponse {
@@ -1198,14 +1193,9 @@ impl PageServerHandler {
            .start_timer(metrics::SmgrQueryType::GetSlruSegment, ctx);

        let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
-        let lsn = Self::wait_or_get_last_lsn(
-            timeline,
-            req.request_lsn,
-            req.not_modified_since,
-            &latest_gc_cutoff_lsn,
-            ctx,
-        )
-        .await?;
+        let lsn =
+            Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn, ctx)
+                .await?;

        let kind = SlruKind::from_repr(req.kind)
            .ok_or(PageStreamError::BadRequest("invalid SLRU kind".into()))?;
@@ -1423,34 +1413,7 @@ where

        let ctx = self.connection_ctx.attached_child();
        debug!("process query {query_string:?}");
-        if query_string.starts_with("pagestream_v2 ") {
-            let (_, params_raw) = query_string.split_at("pagestream_v2 ".len());
-            let params = params_raw.split(' ').collect::<Vec<_>>();
-            if params.len() != 2 {
-                return Err(QueryError::Other(anyhow::anyhow!(
-                    "invalid param number for pagestream command"
-                )));
-            }
-            let tenant_id = TenantId::from_str(params[0])
-                .with_context(|| format!("Failed to parse tenant id from {}", params[0]))?;
-            let timeline_id = TimelineId::from_str(params[1])
-                .with_context(|| format!("Failed to parse timeline id from {}", params[1]))?;
-
-            tracing::Span::current()
-                .record("tenant_id", field::display(tenant_id))
-                .record("timeline_id", field::display(timeline_id));
-
-            self.check_permission(Some(tenant_id))?;
-
-            self.handle_pagerequests(
-                pgb,
-                tenant_id,
-                timeline_id,
-                PagestreamProtocolVersion::V2,
-                ctx,
-            )
-            .await?;
-        } else if query_string.starts_with("pagestream ") {
+        if query_string.starts_with("pagestream ") {
            let (_, params_raw) = query_string.split_at("pagestream ".len());
            let params = params_raw.split(' ').collect::<Vec<_>>();
            if params.len() != 2 {
@@ -1469,14 +1432,8 @@ where

            self.check_permission(Some(tenant_id))?;

-            self.handle_pagerequests(
-                pgb,
-                tenant_id,
-                timeline_id,
-                PagestreamProtocolVersion::V1,
-                ctx,
-            )
-            .await?;
+            self.handle_pagerequests(pgb, tenant_id, timeline_id, ctx)
+                .await?;
        } else if query_string.starts_with("basebackup ") {
            let (_, params_raw) = query_string.split_at("basebackup ".len());
            let params = params_raw.split_whitespace().collect::<Vec<_>>();
--- a/pageserver/src/pgdatadir_mapping.rs
+++ b/pageserver/src/pgdatadir_mapping.rs
@@ -23,7 +23,6 @@ use pageserver_api::key::{
    slru_segment_key_range, slru_segment_size_to_key, twophase_file_key, twophase_key_range,
    AUX_FILES_KEY, CHECKPOINT_KEY, CONTROLFILE_KEY, DBDIR_KEY, TWOPHASEDIR_KEY,
 };
-use pageserver_api::keyspace::SparseKeySpace;
 use pageserver_api::reltag::{BlockNumber, RelTag, SlruKind};
 use postgres_ffi::relfile_utils::{FSM_FORKNUM, VISIBILITYMAP_FORKNUM};
 use postgres_ffi::BLCKSZ;
@@ -177,6 +176,7 @@ impl Timeline {
        tag: RelTag,
        blknum: BlockNumber,
        version: Version<'_>,
+        latest: bool,
        ctx: &RequestContext,
    ) -> Result<Bytes, PageReconstructError> {
        if tag.relnode == 0 {
@@ -185,7 +185,7 @@ impl Timeline {
            ));
        }

-        let nblocks = self.get_rel_size(tag, version, ctx).await?;
+        let nblocks = self.get_rel_size(tag, version, latest, ctx).await?;
        if blknum >= nblocks {
            debug!(
                "read beyond EOF at {} blk {} at {}, size is {}: returning all-zeros page",
@@ -207,6 +207,7 @@ impl Timeline {
        spcnode: Oid,
        dbnode: Oid,
        version: Version<'_>,
+        latest: bool,
        ctx: &RequestContext,
    ) -> Result<usize, PageReconstructError> {
        let mut total_blocks = 0;
@@ -214,7 +215,7 @@ impl Timeline {
        let rels = self.list_rels(spcnode, dbnode, version, ctx).await?;

        for rel in rels {
-            let n_blocks = self.get_rel_size(rel, version, ctx).await?;
+            let n_blocks = self.get_rel_size(rel, version, latest, ctx).await?;
            total_blocks += n_blocks as usize;
        }
        Ok(total_blocks)
@@ -225,6 +226,7 @@ impl Timeline {
        &self,
        tag: RelTag,
        version: Version<'_>,
+        latest: bool,
        ctx: &RequestContext,
    ) -> Result<BlockNumber, PageReconstructError> {
        if tag.relnode == 0 {
@@ -238,7 +240,7 @@ impl Timeline {
        }

        if (tag.forknum == FSM_FORKNUM || tag.forknum == VISIBILITYMAP_FORKNUM)
-            && !self.get_rel_exists(tag, version, ctx).await?
+            && !self.get_rel_exists(tag, version, latest, ctx).await?
        {
            // FIXME: Postgres sometimes calls smgrcreate() to create
            // FSM, and smgrnblocks() on it immediately afterwards,
@@ -261,6 +263,7 @@ impl Timeline {
        &self,
        tag: RelTag,
        version: Version<'_>,
+        _latest: bool,
        ctx: &RequestContext,
    ) -> Result<bool, PageReconstructError> {
        if tag.relnode == 0 {
@@ -446,6 +449,11 @@ impl Timeline {
        // include physical changes from later commits that will be marked
        // as aborted, and will need to be vacuumed away.
        let commit_lsn = Lsn((low - 1) * 8);
+        // This maxing operation is for the edge case that the search above did
+        // set found_smaller to true but it never increased the lsn. Then, low
+        // is still the old min_lsn, and the subtraction above could possibly give
+        // a value below the ancestor_lsn.
+        let commit_lsn = commit_lsn.max(min_lsn);
        match (found_smaller, found_larger) {
            (false, false) => {
                // This can happen if no commit records have been processed yet, e.g.
@@ -456,12 +464,6 @@ impl Timeline {
                // Didn't find any commit timestamps smaller than the request
                Ok(LsnForTimestamp::Past(min_lsn))
            }
-            (true, _) if commit_lsn < min_lsn => {
-                // the search above did set found_smaller to true but it never increased the lsn.
-                // Then, low is still the old min_lsn, and the subtraction above gave a value
-                // below the min_lsn. We should never do that.
-                Ok(LsnForTimestamp::Past(min_lsn))
-            }
            (true, false) => {
                // Only found commits with timestamps smaller than the request.
                // It's still a valid case for branch creation, return it.
@@ -731,13 +733,11 @@ impl Timeline {
    /// Get a KeySpace that covers all the Keys that are in use at the given LSN.
    /// Anything that's not listed maybe removed from the underlying storage (from
    /// that LSN forwards).
-    ///
-    /// The return value is (dense keyspace, sparse keyspace).
    pub(crate) async fn collect_keyspace(
        &self,
        lsn: Lsn,
        ctx: &RequestContext,
-    ) -> Result<(KeySpace, SparseKeySpace), CollectKeySpaceError> {
+    ) -> Result<KeySpace, CollectKeySpaceError> {
        // Iterate through key ranges, greedily packing them into partitions
        let mut result = KeySpaceAccum::new();

@@ -809,12 +809,7 @@ impl Timeline {
        if self.get(AUX_FILES_KEY, lsn, ctx).await.is_ok() {
            result.add_key(AUX_FILES_KEY);
        }
-
-        Ok((
-            result.to_keyspace(),
-            /* AUX sparse key space */
-            SparseKeySpace(KeySpace::single(Key::metadata_aux_key_range())),
-        ))
+        Ok(result.to_keyspace())
    }

    /// Get cached size of relation if it not updated after specified LSN
@@ -1100,7 +1095,7 @@ impl<'a> DatadirModification<'a> {
    ) -> anyhow::Result<()> {
        let total_blocks = self
            .tline
-            .get_db_size(spcnode, dbnode, Version::Modified(self), ctx)
+            .get_db_size(spcnode, dbnode, Version::Modified(self), true, ctx)
            .await?;

        // Remove entry from dbdir
@@ -1199,7 +1194,7 @@ impl<'a> DatadirModification<'a> {
        anyhow::ensure!(rel.relnode != 0, RelationError::InvalidRelnode);
        if self
            .tline
-            .get_rel_exists(rel, Version::Modified(self), ctx)
+            .get_rel_exists(rel, Version::Modified(self), true, ctx)
            .await?
        {
            let size_key = rel_size_to_key(rel);
--- a/pageserver/src/task_mgr.rs
+++ b/pageserver/src/task_mgr.rs
@@ -361,8 +361,6 @@ pub enum TaskKind {

    DebugTool,

-    EphemeralFilePreWarmPageCache,
-
    #[cfg(test)]
    UnitTest,
 }
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -888,7 +888,7 @@ impl Tenant {

    #[instrument(skip_all)]
    pub(crate) async fn preload(
-        self: &Arc<Self>,
+        self: &Arc<Tenant>,
        remote_storage: &GenericRemoteStorage,
        cancel: CancellationToken,
    ) -> anyhow::Result<TenantPreload> {
@@ -918,13 +918,9 @@ impl Tenant {

        Ok(TenantPreload {
            deleting,
-            timelines: Self::load_timeline_metadata(
-                self,
-                remote_timeline_ids,
-                remote_storage,
-                cancel,
-            )
-            .await?,
+            timelines: self
+                .load_timeline_metadata(remote_timeline_ids, remote_storage, cancel)
+                .await?,
        })
    }

@@ -3406,11 +3402,7 @@ impl Tenant {
        // is in progress (which is not a common case).
        //
        // See more for on the issue #2748 condenced out of the initial PR review.
-        let mut shared_cache = tokio::select! {
-            locked = self.cached_logical_sizes.lock() => locked,
-            _ = cancel.cancelled() => anyhow::bail!("cancelled"),
-            _ = self.cancel.cancelled() => anyhow::bail!("tenant is shutting down"),
-        };
+        let mut shared_cache = self.cached_logical_sizes.lock().await;

        size::gather_inputs(
            self,
@@ -3672,7 +3664,6 @@ pub(crate) mod harness {
                image_layer_creation_check_threshold: Some(
                    tenant_conf.image_layer_creation_check_threshold,
                ),
-                switch_to_aux_file_v2: Some(tenant_conf.switch_to_aux_file_v2),
            }
        }
    }
@@ -3873,7 +3864,6 @@ mod tests {
    use hex_literal::hex;
    use pageserver_api::key::NON_INHERITED_RANGE;
    use pageserver_api::keyspace::KeySpace;
-    use pageserver_api::models::CompactionAlgorithm;
    use rand::{thread_rng, Rng};
    use tests::storage_layer::ValuesReconstructState;
    use tests::timeline::{GetVectoredError, ShutdownMode};
@@ -4513,23 +4503,11 @@ mod tests {
    }

    async fn bulk_insert_compact_gc(
-        timeline: Arc<Timeline>,
-        ctx: &RequestContext,
-        lsn: Lsn,
-        repeat: usize,
-        key_count: usize,
-    ) -> anyhow::Result<()> {
-        let compact = true;
-        bulk_insert_maybe_compact_gc(timeline, ctx, lsn, repeat, key_count, compact).await
-    }
-
-    async fn bulk_insert_maybe_compact_gc(
        timeline: Arc<Timeline>,
        ctx: &RequestContext,
        mut lsn: Lsn,
        repeat: usize,
        key_count: usize,
-        compact: bool,
    ) -> anyhow::Result<()> {
        let mut test_key = Key::from_hex("010000000033333333444444445500000000").unwrap();
        let mut blknum = 0;
@@ -4570,11 +4548,9 @@ mod tests {
                )
                .await?;
            timeline.freeze_and_flush().await?;
-            if compact {
-                timeline
-                    .compact(&CancellationToken::new(), EnumSet::empty(), ctx)
-                    .await?;
-            }
+            timeline
+                .compact(&CancellationToken::new(), EnumSet::empty(), ctx)
+                .await?;
            timeline.gc().await?;
        }

@@ -5057,22 +5033,7 @@ mod tests {

    #[tokio::test]
    async fn test_random_updates() -> anyhow::Result<()> {
-        let names_algorithms = [
-            ("test_random_updates_legacy", CompactionAlgorithm::Legacy),
-            ("test_random_updates_tiered", CompactionAlgorithm::Tiered),
-        ];
-        for (name, algorithm) in names_algorithms {
-            test_random_updates_algorithm(name, algorithm).await?;
-        }
-        Ok(())
-    }
-
-    async fn test_random_updates_algorithm(
-        name: &'static str,
-        compaction_algorithm: CompactionAlgorithm,
-    ) -> anyhow::Result<()> {
-        let mut harness = TenantHarness::create(name)?;
-        harness.tenant_conf.compaction_algorithm = compaction_algorithm;
+        let harness = TenantHarness::create("test_random_updates")?;
        let (tenant, ctx) = harness.load().await;
        let tline = tenant
            .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
@@ -5137,7 +5098,7 @@ mod tests {
                );
            }

-            // Perform a cycle of flush, and GC
+            // Perform a cycle of flush, compact, and GC
            let cutoff = tline.get_last_record_lsn();
            tline
                .update_gc_info(
@@ -5149,6 +5110,9 @@ mod tests {
                )
                .await?;
            tline.freeze_and_flush().await?;
+            tline
+                .compact(&CancellationToken::new(), EnumSet::empty(), &ctx)
+                .await?;
            tline.gc().await?;
        }

@@ -5429,36 +5393,19 @@ mod tests {

    #[tokio::test]
    async fn test_read_at_max_lsn() -> anyhow::Result<()> {
-        let names_algorithms = [
-            ("test_read_at_max_lsn_legacy", CompactionAlgorithm::Legacy),
-            ("test_read_at_max_lsn_tiered", CompactionAlgorithm::Tiered),
-        ];
-        for (name, algorithm) in names_algorithms {
-            test_read_at_max_lsn_algorithm(name, algorithm).await?;
-        }
-        Ok(())
-    }
-
-    async fn test_read_at_max_lsn_algorithm(
-        name: &'static str,
-        compaction_algorithm: CompactionAlgorithm,
-    ) -> anyhow::Result<()> {
-        let mut harness = TenantHarness::create(name)?;
-        harness.tenant_conf.compaction_algorithm = compaction_algorithm;
+        let harness = TenantHarness::create("test_read_at_max_lsn")?;
        let (tenant, ctx) = harness.load().await;
        let tline = tenant
            .create_test_timeline(TIMELINE_ID, Lsn(0x08), DEFAULT_PG_VERSION, &ctx)
            .await?;

        let lsn = Lsn(0x10);
-        let compact = false;
-        bulk_insert_maybe_compact_gc(tline.clone(), &ctx, lsn, 50, 10000, compact).await?;
+        bulk_insert_compact_gc(tline.clone(), &ctx, lsn, 50, 10000).await?;

        let test_key = Key::from_hex("010000000033333333444444445500000000").unwrap();
        let read_lsn = Lsn(u64::MAX - 1);

-        let result = tline.get(test_key, read_lsn, &ctx).await;
-        assert!(result.is_ok(), "result is not Ok: {}", result.unwrap_err());
+        assert!(tline.get(test_key, read_lsn, &ctx).await.is_ok());

        Ok(())
    }
--- a/pageserver/src/tenant/blob_io.rs
+++ b/pageserver/src/tenant/blob_io.rs
@@ -121,7 +121,7 @@ impl<const BUFFERED: bool> BlobWriter<BUFFERED> {
        self.offset
    }

-    const CAPACITY: usize = if BUFFERED { 64 * 1024 } else { 0 };
+    const CAPACITY: usize = if BUFFERED { PAGE_SZ } else { 0 };

    /// Writes the given buffer directly to the underlying `VirtualFile`.
    /// You need to make sure that the internal buffer is empty, otherwise
--- a/pageserver/src/tenant/config.rs
+++ b/pageserver/src/tenant/config.rs
@@ -369,10 +369,6 @@ pub struct TenantConf {
    // How much WAL must be ingested before checking again whether a new image layer is required.
    // Expresed in multiples of checkpoint distance.
    pub image_layer_creation_check_threshold: u8,
-
-    /// Switch to aux file v2. Switching this flag requires the user has not written any aux file into
-    /// the storage before, and this flag cannot be switched back. Otherwise there will be data corruptions.
-    pub switch_to_aux_file_v2: bool,
 }

 /// Same as TenantConf, but this struct preserves the information about
@@ -468,10 +464,6 @@ pub struct TenantConfOpt {

    #[serde(skip_serializing_if = "Option::is_none")]
    pub image_layer_creation_check_threshold: Option<u8>,
-
-    #[serde(skip_serializing_if = "Option::is_none")]
-    #[serde(default)]
-    pub switch_to_aux_file_v2: Option<bool>,
 }

 impl TenantConfOpt {
@@ -529,9 +521,6 @@ impl TenantConfOpt {
            image_layer_creation_check_threshold: self
                .image_layer_creation_check_threshold
                .unwrap_or(global_conf.image_layer_creation_check_threshold),
-            switch_to_aux_file_v2: self
-                .switch_to_aux_file_v2
-                .unwrap_or(global_conf.switch_to_aux_file_v2),
        }
    }
 }
@@ -573,7 +562,6 @@ impl Default for TenantConf {
            lazy_slru_download: false,
            timeline_get_throttle: crate::tenant::throttle::Config::disabled(),
            image_layer_creation_check_threshold: DEFAULT_IMAGE_LAYER_CREATION_CHECK_THRESHOLD,
-            switch_to_aux_file_v2: false,
        }
    }
 }
@@ -648,7 +636,6 @@ impl From<TenantConfOpt> for models::TenantConfig {
            lazy_slru_download: value.lazy_slru_download,
            timeline_get_throttle: value.timeline_get_throttle.map(ThrottleConfig::from),
            image_layer_creation_check_threshold: value.image_layer_creation_check_threshold,
-            switch_to_aux_file_v2: value.switch_to_aux_file_v2,
        }
    }
 }
--- a/pageserver/src/tenant/ephemeral_file.rs
+++ b/pageserver/src/tenant/ephemeral_file.rs
@@ -3,26 +3,36 @@

 use crate::config::PageServerConf;
 use crate::context::RequestContext;
-use crate::page_cache;
+use crate::page_cache::{self, PAGE_SZ};
 use crate::tenant::block_io::{BlockCursor, BlockLease, BlockReader};
 use crate::virtual_file::{self, VirtualFile};
+use bytes::BytesMut;
 use camino::Utf8PathBuf;
 use pageserver_api::shard::TenantShardId;
+use std::cmp::min;

-use std::io;
+use std::io::{self, ErrorKind};
+use std::ops::DerefMut;
 use std::sync::atomic::AtomicU64;
+use tracing::*;
 use utils::id::TimelineId;

 pub struct EphemeralFile {
+    page_cache_file_id: page_cache::FileId,
+
    _tenant_shard_id: TenantShardId,
    _timeline_id: TimelineId,
-
-    rw: page_caching::RW,
+    file: VirtualFile,
+    len: u64,
+    /// An ephemeral file is append-only.
+    /// We keep the last page, which can still be modified, in [`Self::mutable_tail`].
+    /// The other pages, which can no longer be modified, are accessed through the page cache.
+    ///
+    /// None <=> IO is ongoing.
+    /// Size is fixed to PAGE_SZ at creation time and must not be changed.
+    mutable_tail: Option<BytesMut>,
 }

-mod page_caching;
-mod zero_padded_read_write;
-
 impl EphemeralFile {
    pub async fn create(
        conf: &PageServerConf,
@@ -49,18 +59,21 @@ impl EphemeralFile {
        .await?;

        Ok(EphemeralFile {
+            page_cache_file_id: page_cache::next_file_id(),
            _tenant_shard_id: tenant_shard_id,
            _timeline_id: timeline_id,
-            rw: page_caching::RW::new(file),
+            file,
+            len: 0,
+            mutable_tail: Some(BytesMut::zeroed(PAGE_SZ)),
        })
    }

    pub(crate) fn len(&self) -> u64 {
-        self.rw.bytes_written()
+        self.len
    }

-    pub(crate) fn page_cache_file_id(&self) -> page_cache::FileId {
-        self.rw.page_cache_file_id()
+    pub(crate) fn id(&self) -> page_cache::FileId {
+        self.page_cache_file_id
    }

    pub(crate) async fn read_blk(
@@ -68,30 +81,182 @@ impl EphemeralFile {
        blknum: u32,
        ctx: &RequestContext,
    ) -> Result<BlockLease, io::Error> {
-        self.rw.read_blk(blknum, ctx).await
+        let flushed_blknums = 0..self.len / PAGE_SZ as u64;
+        if flushed_blknums.contains(&(blknum as u64)) {
+            let cache = page_cache::get();
+            match cache
+                .read_immutable_buf(self.page_cache_file_id, blknum, ctx)
+                .await
+                .map_err(|e| {
+                    std::io::Error::new(
+                        std::io::ErrorKind::Other,
+                        // order path before error because error is anyhow::Error => might have many contexts
+                        format!(
+                            "ephemeral file: read immutable page #{}: {}: {:#}",
+                            blknum, self.file.path, e,
+                        ),
+                    )
+                })? {
+                page_cache::ReadBufResult::Found(guard) => {
+                    return Ok(BlockLease::PageReadGuard(guard))
+                }
+                page_cache::ReadBufResult::NotFound(write_guard) => {
+                    let write_guard = self
+                        .file
+                        .read_exact_at_page(write_guard, blknum as u64 * PAGE_SZ as u64)
+                        .await?;
+                    let read_guard = write_guard.mark_valid();
+                    return Ok(BlockLease::PageReadGuard(read_guard));
+                }
+            };
+        } else {
+            debug_assert_eq!(blknum as u64, self.len / PAGE_SZ as u64);
+            Ok(BlockLease::EphemeralFileMutableTail(
+                self.mutable_tail
+                    .as_deref()
+                    .expect("we're not doing IO, it must be Some()")
+                    .try_into()
+                    .expect("we ensure that it's always PAGE_SZ"),
+            ))
+        }
    }

    pub(crate) async fn write_blob(
        &mut self,
        srcbuf: &[u8],
-        _ctx: &RequestContext,
+        ctx: &RequestContext,
    ) -> Result<u64, io::Error> {
-        let pos = self.rw.bytes_written();
+        struct Writer<'a> {
+            ephemeral_file: &'a mut EphemeralFile,
+            /// The block to which the next [`push_bytes`] will write.
+            blknum: u32,
+            /// The offset inside the block identified by [`blknum`] to which [`push_bytes`] will write.
+            off: usize,
+        }
+        impl<'a> Writer<'a> {
+            fn new(ephemeral_file: &'a mut EphemeralFile) -> io::Result<Writer<'a>> {
+                Ok(Writer {
+                    blknum: (ephemeral_file.len / PAGE_SZ as u64) as u32,
+                    off: (ephemeral_file.len % PAGE_SZ as u64) as usize,
+                    ephemeral_file,
+                })
+            }
+            #[inline(always)]
+            async fn push_bytes(
+                &mut self,
+                src: &[u8],
+                ctx: &RequestContext,
+            ) -> Result<(), io::Error> {
+                let mut src_remaining = src;
+                while !src_remaining.is_empty() {
+                    let dst_remaining = &mut self
+                        .ephemeral_file
+                        .mutable_tail
+                        .as_deref_mut()
+                        .expect("IO is not yet ongoing")[self.off..];
+                    let n = min(dst_remaining.len(), src_remaining.len());
+                    dst_remaining[..n].copy_from_slice(&src_remaining[..n]);
+                    self.off += n;
+                    src_remaining = &src_remaining[n..];
+                    if self.off == PAGE_SZ {
+                        let mutable_tail = std::mem::take(&mut self.ephemeral_file.mutable_tail)
+                            .expect("IO is not yet ongoing");
+                        let (mutable_tail, res) = self
+                            .ephemeral_file
+                            .file
+                            .write_all_at(mutable_tail, self.blknum as u64 * PAGE_SZ as u64)
+                            .await;
+                        // TODO: If we panic before we can put the mutable_tail back, subsequent calls will fail.
+                        // I.e., the IO isn't retryable if we panic.
+                        self.ephemeral_file.mutable_tail = Some(mutable_tail);
+                        match res {
+                            Ok(_) => {
+                                // Pre-warm the page cache with what we just wrote.
+                                // This isn't necessary for coherency/correctness, but it's how we've always done it.
+                                let cache = page_cache::get();
+                                match cache
+                                    .read_immutable_buf(
+                                        self.ephemeral_file.page_cache_file_id,
+                                        self.blknum,
+                                        ctx,
+                                    )
+                                    .await
+                                {
+                                    Ok(page_cache::ReadBufResult::Found(_guard)) => {
+                                        // This function takes &mut self, so, it shouldn't be possible to reach this point.
+                                        unreachable!("we just wrote blknum {} and this function takes &mut self, so, no concurrent read_blk is possible", self.blknum);
+                                    }
+                                    Ok(page_cache::ReadBufResult::NotFound(mut write_guard)) => {
+                                        let buf: &mut [u8] = write_guard.deref_mut();
+                                        debug_assert_eq!(buf.len(), PAGE_SZ);
+                                        buf.copy_from_slice(
+                                            self.ephemeral_file
+                                                .mutable_tail
+                                                .as_deref()
+                                                .expect("IO is not ongoing"),
+                                        );
+                                        let _ = write_guard.mark_valid();
+                                        // pre-warm successful
+                                    }
+                                    Err(e) => {
+                                        error!("ephemeral_file write_blob failed to get immutable buf to pre-warm page cache: {e:?}");
+                                        // fail gracefully, it's not the end of the world if we can't pre-warm the cache here
+                                    }
+                                }
+                                // Zero the buffer for re-use.
+                                // Zeroing is critical for correctness because the write_blob code below
+                                // and similarly read_blk expect zeroed pages.
+                                self.ephemeral_file
+                                    .mutable_tail
+                                    .as_deref_mut()
+                                    .expect("IO is not ongoing")
+                                    .fill(0);
+                                // This block is done, move to next one.
+                                self.blknum += 1;
+                                self.off = 0;
+                            }
+                            Err(e) => {
+                                return Err(std::io::Error::new(
+                                    ErrorKind::Other,
+                                    // order error before path because path is long and error is short
+                                    format!(
+                                        "ephemeral_file: write_blob: write-back full tail blk #{}: {:#}: {}",
+                                        self.blknum,
+                                        e,
+                                        self.ephemeral_file.file.path,
+                                    ),
+                                ));
+                            }
+                        }
+                    }
+                }
+                Ok(())
+            }
+        }
+
+        let pos = self.len;
+        let mut writer = Writer::new(self)?;

        // Write the length field
        if srcbuf.len() < 0x80 {
            // short one-byte length header
            let len_buf = [srcbuf.len() as u8];
-
-            self.rw.write_all_borrowed(&len_buf).await?;
+            writer.push_bytes(&len_buf, ctx).await?;
        } else {
            let mut len_buf = u32::to_be_bytes(srcbuf.len() as u32);
            len_buf[0] |= 0x80;
-            self.rw.write_all_borrowed(&len_buf).await?;
+            writer.push_bytes(&len_buf, ctx).await?;
        }

        // Write the payload
-        self.rw.write_all_borrowed(srcbuf).await?;
+        writer.push_bytes(srcbuf, ctx).await?;
+
+        if srcbuf.len() < 0x80 {
+            self.len += 1;
+        } else {
+            self.len += 4;
+        }
+        self.len += srcbuf.len() as u64;

        Ok(pos)
    }
@@ -106,6 +271,28 @@ pub fn is_ephemeral_file(filename: &str) -> bool {
    }
 }

+impl Drop for EphemeralFile {
+    fn drop(&mut self) {
+        // There might still be pages in the [`crate::page_cache`] for this file.
+        // We leave them there, [`crate::page_cache::PageCache::find_victim`] will evict them when needed.
+
+        // unlink the file
+        let res = std::fs::remove_file(&self.file.path);
+        if let Err(e) = res {
+            if e.kind() != std::io::ErrorKind::NotFound {
+                // just never log the not found errors, we cannot do anything for them; on detach
+                // the tenant directory is already gone.
+                //
+                // not found files might also be related to https://github.com/neondatabase/neon/issues/2442
+                error!(
+                    "could not remove ephemeral file '{}': {}",
+                    self.file.path, e
+                );
+            }
+        }
+    }
+}
+
 impl BlockReader for EphemeralFile {
    fn block_cursor(&self) -> super::block_io::BlockCursor<'_> {
        BlockCursor::new(super::block_io::BlockReaderRef::EphemeralFile(self))
--- a/pageserver/src/tenant/ephemeral_file/page_caching.rs
+++ b/pageserver/src/tenant/ephemeral_file/page_caching.rs
@@ -1,218 +0,0 @@
-//! Wrapper around [`super::zero_padded_read_write::RW`] that uses the
-//! [`crate::page_cache`] to serve reads that need to go to the underlying [`VirtualFile`].
-
-use crate::context::RequestContext;
-use crate::page_cache::{self, PAGE_SZ};
-use crate::tenant::block_io::BlockLease;
-use crate::virtual_file::VirtualFile;
-
-use once_cell::sync::Lazy;
-use std::io::{self, ErrorKind};
-use tokio_epoll_uring::BoundedBuf;
-use tracing::*;
-
-use super::zero_padded_read_write;
-
-/// See module-level comment.
-pub struct RW {
-    page_cache_file_id: page_cache::FileId,
-    rw: super::zero_padded_read_write::RW<PreWarmingWriter>,
-}
-
-impl RW {
-    pub fn new(file: VirtualFile) -> Self {
-        let page_cache_file_id = page_cache::next_file_id();
-        Self {
-            page_cache_file_id,
-            rw: super::zero_padded_read_write::RW::new(PreWarmingWriter::new(
-                page_cache_file_id,
-                file,
-            )),
-        }
-    }
-
-    pub fn page_cache_file_id(&self) -> page_cache::FileId {
-        self.page_cache_file_id
-    }
-
-    pub(crate) async fn write_all_borrowed(&mut self, srcbuf: &[u8]) -> Result<usize, io::Error> {
-        // It doesn't make sense to proactively fill the page cache on the Pageserver write path
-        // because Compute is unlikely to access recently written data.
-        self.rw.write_all_borrowed(srcbuf).await
-    }
-
-    pub(crate) fn bytes_written(&self) -> u64 {
-        self.rw.bytes_written()
-    }
-
-    pub(crate) async fn read_blk(
-        &self,
-        blknum: u32,
-        ctx: &RequestContext,
-    ) -> Result<BlockLease, io::Error> {
-        match self.rw.read_blk(blknum).await? {
-            zero_padded_read_write::ReadResult::NeedsReadFromWriter { writer } => {
-                let cache = page_cache::get();
-                match cache
-                    .read_immutable_buf(self.page_cache_file_id, blknum, ctx)
-                    .await
-                    .map_err(|e| {
-                        std::io::Error::new(
-                            std::io::ErrorKind::Other,
-                            // order path before error because error is anyhow::Error => might have many contexts
-                            format!(
-                                "ephemeral file: read immutable page #{}: {}: {:#}",
-                                blknum,
-                                self.rw.as_writer().file.path,
-                                e,
-                            ),
-                        )
-                    })? {
-                    page_cache::ReadBufResult::Found(guard) => {
-                        return Ok(BlockLease::PageReadGuard(guard))
-                    }
-                    page_cache::ReadBufResult::NotFound(write_guard) => {
-                        let write_guard = writer
-                            .file
-                            .read_exact_at_page(write_guard, blknum as u64 * PAGE_SZ as u64)
-                            .await?;
-                        let read_guard = write_guard.mark_valid();
-                        return Ok(BlockLease::PageReadGuard(read_guard));
-                    }
-                }
-            }
-            zero_padded_read_write::ReadResult::ServedFromZeroPaddedMutableTail { buffer } => {
-                Ok(BlockLease::EphemeralFileMutableTail(buffer))
-            }
-        }
-    }
-}
-
-impl Drop for RW {
-    fn drop(&mut self) {
-        // There might still be pages in the [`crate::page_cache`] for this file.
-        // We leave them there, [`crate::page_cache::PageCache::find_victim`] will evict them when needed.
-
-        // unlink the file
-        let res = std::fs::remove_file(&self.rw.as_writer().file.path);
-        if let Err(e) = res {
-            if e.kind() != std::io::ErrorKind::NotFound {
-                // just never log the not found errors, we cannot do anything for them; on detach
-                // the tenant directory is already gone.
-                //
-                // not found files might also be related to https://github.com/neondatabase/neon/issues/2442
-                error!(
-                    "could not remove ephemeral file '{}': {}",
-                    self.rw.as_writer().file.path,
-                    e
-                );
-            }
-        }
-    }
-}
-
-struct PreWarmingWriter {
-    nwritten_blocks: u32,
-    page_cache_file_id: page_cache::FileId,
-    file: VirtualFile,
-}
-
-impl PreWarmingWriter {
-    fn new(page_cache_file_id: page_cache::FileId, file: VirtualFile) -> Self {
-        Self {
-            nwritten_blocks: 0,
-            page_cache_file_id,
-            file,
-        }
-    }
-}
-
-impl crate::virtual_file::owned_buffers_io::write::OwnedAsyncWriter for PreWarmingWriter {
-    async fn write_all<
-        B: tokio_epoll_uring::BoundedBuf<Buf = Buf>,
-        Buf: tokio_epoll_uring::IoBuf + Send,
-    >(
-        &mut self,
-        buf: B,
-    ) -> std::io::Result<(usize, B::Buf)> {
-        let buf = buf.slice(..);
-        let saved_bounds = buf.bounds(); // save for reconstructing the Slice from iobuf after the IO is done
-        let check_bounds_stuff_works = if cfg!(test) && cfg!(debug_assertions) {
-            Some(buf.to_vec())
-        } else {
-            None
-        };
-        let buflen = buf.len();
-        assert_eq!(
-            buflen % PAGE_SZ,
-            0,
-            "{buflen} ; we know TAIL_SZ is a PAGE_SZ multiple, and write_buffered_borrowed is used"
-        );
-
-        // Do the IO.
-        let iobuf = match self.file.write_all(buf).await {
-            (iobuf, Ok(nwritten)) => {
-                assert_eq!(nwritten, buflen);
-                iobuf
-            }
-            (_, Err(e)) => {
-                return Err(std::io::Error::new(
-                    ErrorKind::Other,
-                    // order error before path because path is long and error is short
-                    format!(
-                        "ephemeral_file: write_blob: write-back tail self.nwritten_blocks={}, buflen={}, {:#}: {}",
-                        self.nwritten_blocks, buflen, e, self.file.path,
-                    ),
-                ));
-            }
-        };
-
-        // Reconstruct the Slice (the write path consumed the Slice and returned us the underlying IoBuf)
-        let buf = tokio_epoll_uring::Slice::from_buf_bounds(iobuf, saved_bounds);
-        if let Some(check_bounds_stuff_works) = check_bounds_stuff_works {
-            assert_eq!(&check_bounds_stuff_works, &*buf);
-        }
-
-        // Pre-warm page cache with the contents.
-        // At least in isolated bulk ingest benchmarks (test_bulk_insert.py), the pre-warming
-        // benefits the code that writes InMemoryLayer=>L0 layers.
-        let nblocks = buflen / PAGE_SZ;
-        let nblocks32 = u32::try_from(nblocks).unwrap();
-        let cache = page_cache::get();
-        static CTX: Lazy<RequestContext> = Lazy::new(|| {
-            RequestContext::new(
-                crate::task_mgr::TaskKind::EphemeralFilePreWarmPageCache,
-                crate::context::DownloadBehavior::Error,
-            )
-        });
-        for blknum_in_buffer in 0..nblocks {
-            let blk_in_buffer = &buf[blknum_in_buffer * PAGE_SZ..(blknum_in_buffer + 1) * PAGE_SZ];
-            let blknum = self
-                .nwritten_blocks
-                .checked_add(blknum_in_buffer as u32)
-                .unwrap();
-            match cache
-                .read_immutable_buf(self.page_cache_file_id, blknum, &CTX)
-                .await
-            {
-                Err(e) => {
-                    error!("ephemeral_file write_blob failed to get immutable buf to pre-warm page cache: {e:?}");
-                    // fail gracefully, it's not the end of the world if we can't pre-warm the cache here
-                }
-                Ok(v) => match v {
-                    page_cache::ReadBufResult::Found(_guard) => {
-                        // This function takes &mut self, so, it shouldn't be possible to reach this point.
-                        unreachable!("we just wrote block {blknum} to the VirtualFile, which is owned by Self, \
-                                      and this function takes &mut self, so, no concurrent read_blk is possible");
-                    }
-                    page_cache::ReadBufResult::NotFound(mut write_guard) => {
-                        write_guard.copy_from_slice(blk_in_buffer);
-                        let _ = write_guard.mark_valid();
-                    }
-                },
-            }
-        }
-        self.nwritten_blocks = self.nwritten_blocks.checked_add(nblocks32).unwrap();
-        Ok((buflen, buf.into_inner()))
-    }
-}
--- a/pageserver/src/tenant/ephemeral_file/zero_padded_read_write.rs
+++ b/pageserver/src/tenant/ephemeral_file/zero_padded_read_write.rs
@@ -1,125 +0,0 @@
-//! The heart of how [`super::EphemeralFile`] does its reads and writes.
-//!
-//! # Writes
-//!
-//! [`super::EphemeralFile`] writes small, borrowed buffers using [`RW::write_all_borrowed`].
-//! The [`RW`] batches these into [`TAIL_SZ`] bigger writes, using [`owned_buffers_io::write::BufferedWriter`].
-//!
-//! # Reads
-//!
-//! [`super::EphemeralFile`] always reads full [`PAGE_SZ`]ed blocks using [`RW::read_blk`].
-//!
-//! The [`RW`] serves these reads either from the buffered writer's in-memory buffer
-//! or redirects the caller to read from the underlying [`OwnedAsyncWriter`]
-//! if the read is for the prefix that has already been flushed.
-//!
-//! # Current Usage
-//!
-//! The current user of this module is [`super::page_caching::RW`].
-
-mod zero_padded;
-
-use crate::{
-    page_cache::PAGE_SZ,
-    virtual_file::owned_buffers_io::{
-        self,
-        write::{Buffer, OwnedAsyncWriter},
-    },
-};
-
-const TAIL_SZ: usize = 64 * 1024;
-
-/// See module-level comment.
-pub struct RW<W: OwnedAsyncWriter> {
-    buffered_writer: owned_buffers_io::write::BufferedWriter<
-        zero_padded::Buffer<TAIL_SZ>,
-        owned_buffers_io::util::size_tracking_writer::Writer<W>,
-    >,
-}
-
-pub enum ReadResult<'a, W> {
-    NeedsReadFromWriter { writer: &'a W },
-    ServedFromZeroPaddedMutableTail { buffer: &'a [u8; PAGE_SZ] },
-}
-
-impl<W> RW<W>
-where
-    W: OwnedAsyncWriter,
-{
-    pub fn new(writer: W) -> Self {
-        let bytes_flushed_tracker =
-            owned_buffers_io::util::size_tracking_writer::Writer::new(writer);
-        let buffered_writer = owned_buffers_io::write::BufferedWriter::new(
-            bytes_flushed_tracker,
-            zero_padded::Buffer::default(),
-        );
-        Self { buffered_writer }
-    }
-
-    pub(crate) fn as_writer(&self) -> &W {
-        self.buffered_writer.as_inner().as_inner()
-    }
-
-    pub async fn write_all_borrowed(&mut self, buf: &[u8]) -> std::io::Result<usize> {
-        self.buffered_writer.write_buffered_borrowed(buf).await
-    }
-
-    pub fn bytes_written(&self) -> u64 {
-        let flushed_offset = self.buffered_writer.as_inner().bytes_written();
-        let buffer: &zero_padded::Buffer<TAIL_SZ> = self.buffered_writer.inspect_buffer();
-        flushed_offset + u64::try_from(buffer.pending()).unwrap()
-    }
-
-    pub(crate) async fn read_blk(&self, blknum: u32) -> Result<ReadResult<'_, W>, std::io::Error> {
-        let flushed_offset = self.buffered_writer.as_inner().bytes_written();
-        let buffer: &zero_padded::Buffer<TAIL_SZ> = self.buffered_writer.inspect_buffer();
-        let buffered_offset = flushed_offset + u64::try_from(buffer.pending()).unwrap();
-        let read_offset = (blknum as u64) * (PAGE_SZ as u64);
-
-        // The trailing page ("block") might only be partially filled,
-        // yet the blob_io code relies on us to return a full PAGE_SZed slice anyway.
-        // Moreover, it has to be zero-padded, because when we still had
-        // a write-back page cache, it provided pre-zeroed pages, and blob_io came to rely on it.
-        // DeltaLayer probably has the same issue, not sure why it needs no special treatment.
-        // => check here that the read doesn't go beyond this potentially trailing
-        // => the zero-padding is done in the `else` branch below
-        let blocks_written = if buffered_offset % (PAGE_SZ as u64) == 0 {
-            buffered_offset / (PAGE_SZ as u64)
-        } else {
-            (buffered_offset / (PAGE_SZ as u64)) + 1
-        };
-        if (blknum as u64) >= blocks_written {
-            return Err(std::io::Error::new(std::io::ErrorKind::Other, anyhow::anyhow!("read past end of ephemeral_file: read=0x{read_offset:x} buffered=0x{buffered_offset:x} flushed=0x{flushed_offset}")));
-        }
-
-        // assertions for the `if-else` below
-        assert_eq!(
-            flushed_offset % (TAIL_SZ as u64), 0,
-            "we only use write_buffered_borrowed to write to the buffered writer, so it's guaranteed that flushes happen buffer.cap()-sized chunks"
-        );
-        assert_eq!(
-            flushed_offset % (PAGE_SZ as u64),
-            0,
-            "the logic below can't handle if the page is spread across the flushed part and the buffer"
-        );
-
-        if read_offset < flushed_offset {
-            assert!(read_offset + (PAGE_SZ as u64) <= flushed_offset);
-            Ok(ReadResult::NeedsReadFromWriter {
-                writer: self.as_writer(),
-            })
-        } else {
-            let read_offset_in_buffer = read_offset
-                .checked_sub(flushed_offset)
-                .expect("would have taken `if` branch instead of this one");
-            let read_offset_in_buffer = usize::try_from(read_offset_in_buffer).unwrap();
-            let zero_padded_slice = buffer.as_zero_padded_slice();
-            let page = &zero_padded_slice[read_offset_in_buffer..(read_offset_in_buffer + PAGE_SZ)];
-            Ok(ReadResult::ServedFromZeroPaddedMutableTail {
-                buffer: page
-                    .try_into()
-                    .expect("the slice above got it as page-size slice"),
-            })
-        }
-    }
-}
--- a/pageserver/src/tenant/ephemeral_file/zero_padded_read_write/zero_padded.rs
+++ b/pageserver/src/tenant/ephemeral_file/zero_padded_read_write/zero_padded.rs
@@ -1,108 +0,0 @@
-//! A [`crate::virtual_file::owned_buffers_io::write::Buffer`] whose
-//! unwritten range is guaranteed to be zero-initialized.
-//! This is used by [`crate::tenant::ephemeral_file::zero_padded_read_write::RW::read_blk`]
-//! to serve page-sized reads of the trailing page when the trailing page has only been partially filled.
-
-use std::mem::MaybeUninit;
-
-/// See module-level comment.
-pub struct Buffer<const N: usize> {
-    allocation: Box<[u8; N]>,
-    written: usize,
-}
-
-impl<const N: usize> Default for Buffer<N> {
-    fn default() -> Self {
-        Self {
-            allocation: Box::new(
-                // SAFETY: zeroed memory is a valid [u8; N]
-                unsafe { MaybeUninit::zeroed().assume_init() },
-            ),
-            written: 0,
-        }
-    }
-}
-
-impl<const N: usize> Buffer<N> {
-    #[inline(always)]
-    fn invariants(&self) {
-        // don't check by default, unoptimized is too expensive even for debug mode
-        if false {
-            debug_assert!(self.written <= N, "{}", self.written);
-            debug_assert!(self.allocation[self.written..N].iter().all(|v| *v == 0));
-        }
-    }
-
-    pub fn as_zero_padded_slice(&self) -> &[u8; N] {
-        &self.allocation
-    }
-}
-
-impl<const N: usize> crate::virtual_file::owned_buffers_io::write::Buffer for Buffer<N> {
-    type IoBuf = Self;
-
-    fn cap(&self) -> usize {
-        self.allocation.len()
-    }
-
-    fn extend_from_slice(&mut self, other: &[u8]) {
-        self.invariants();
-        let remaining = self.allocation.len() - self.written;
-        if other.len() > remaining {
-            panic!("calling extend_from_slice() with insufficient remaining capacity");
-        }
-        self.allocation[self.written..(self.written + other.len())].copy_from_slice(other);
-        self.written += other.len();
-        self.invariants();
-    }
-
-    fn pending(&self) -> usize {
-        self.written
-    }
-
-    fn flush(self) -> tokio_epoll_uring::Slice<Self> {
-        self.invariants();
-        let written = self.written;
-        tokio_epoll_uring::BoundedBuf::slice(self, 0..written)
-    }
-
-    fn reuse_after_flush(iobuf: Self::IoBuf) -> Self {
-        let Self {
-            mut allocation,
-            written,
-        } = iobuf;
-        allocation[0..written].fill(0);
-        let new = Self {
-            allocation,
-            written: 0,
-        };
-        new.invariants();
-        new
-    }
-}
-
-/// We have this trait impl so that the `flush` method in the `Buffer` impl above can produce a
-/// [`tokio_epoll_uring::BoundedBuf::slice`] of the [`Self::written`] range of the data.
-///
-/// Remember that bytes_init is generally _not_ a tracker of the amount
-/// of valid data in the io buffer; we use `Slice` for that.
-/// The `IoBuf` is _only_ for keeping track of uninitialized memory, a bit like MaybeUninit.
-///
-/// SAFETY:
-///
-/// The [`Self::allocation`] is stable becauses boxes are stable.
-/// The memory is zero-initialized, so, bytes_init is always N.
-unsafe impl<const N: usize> tokio_epoll_uring::IoBuf for Buffer<N> {
-    fn stable_ptr(&self) -> *const u8 {
-        self.allocation.as_ptr()
-    }
-
-    fn bytes_init(&self) -> usize {
-        // Yes, N, not self.written; Read the full comment of this impl block!
-        N
-    }
-
-    fn bytes_total(&self) -> usize {
-        N
-    }
-}
--- a/pageserver/src/tenant/layer_map.rs
+++ b/pageserver/src/tenant/layer_map.rs
@@ -916,7 +916,6 @@ mod tests {
        assert_eq!(lhs, rhs);
    }

-    #[cfg(test)]
    fn brute_force_range_search(
        layer_map: &LayerMap,
        key_range: Range<Key>,
--- a/pageserver/src/tenant/mgr.rs
+++ b/pageserver/src/tenant/mgr.rs
@@ -2,7 +2,6 @@
 //! page server.

 use camino::{Utf8DirEntry, Utf8Path, Utf8PathBuf};
-use futures::StreamExt;
 use itertools::Itertools;
 use pageserver_api::key::Key;
 use pageserver_api::models::LocationConfigMode;
@@ -254,15 +253,17 @@ impl TenantsMap {
    }
 }

-/// Precursor to deletion of a tenant dir: we do a fast rename to a tmp path, and then
-/// the slower actual deletion in the background.
-///
 /// This is "safe" in that that it won't leave behind a partially deleted directory
 /// at the original path, because we rename with TEMP_FILE_SUFFIX before starting deleting
 /// the contents.
 ///
 /// This is pageserver-specific, as it relies on future processes after a crash to check
 /// for TEMP_FILE_SUFFIX when loading things.
+async fn safe_remove_tenant_dir_all(path: impl AsRef<Utf8Path>) -> std::io::Result<()> {
+    let tmp_path = safe_rename_tenant_dir(path).await?;
+    fs::remove_dir_all(tmp_path).await
+}
+
 async fn safe_rename_tenant_dir(path: impl AsRef<Utf8Path>) -> std::io::Result<Utf8PathBuf> {
    let parent = path
        .as_ref()
@@ -285,28 +286,6 @@ async fn safe_rename_tenant_dir(path: impl AsRef<Utf8Path>) -> std::io::Result<U
    Ok(tmp_path)
 }

-/// When we have moved a tenant's content to a temporary directory, we may delete it lazily in
-/// the background, and thereby avoid blocking any API requests on this deletion completing.
-fn spawn_background_purge(tmp_path: Utf8PathBuf) {
-    // Although we are cleaning up the tenant, this task is not meant to be bound by the lifetime of the tenant in memory.
-    // After a tenant is detached, there are no more task_mgr tasks for that tenant_id.
-    let task_tenant_id = None;
-
-    task_mgr::spawn(
-        task_mgr::BACKGROUND_RUNTIME.handle(),
-        TaskKind::MgmtRequest,
-        task_tenant_id,
-        None,
-        "tenant_files_delete",
-        false,
-        async move {
-            fs::remove_dir_all(tmp_path.as_path())
-                .await
-                .with_context(|| format!("tenant directory {:?} deletion", tmp_path))
-        },
-    );
-}
-
 static TENANTS: Lazy<std::sync::RwLock<TenantsMap>> =
    Lazy::new(|| std::sync::RwLock::new(TenantsMap::Initializing));

@@ -591,11 +570,7 @@ pub async fn init_tenant_mgr(
    );
    TENANT.startup_scheduled.inc_by(tenant_configs.len() as u64);

-    // Accumulate futures for writing tenant configs, so that we can execute in parallel
-    let mut config_write_futs = Vec::new();
-
-    // Update the location configs according to the re-attach response and persist them to disk
-    tracing::info!("Updating {} location configs", tenant_configs.len());
+    // Construct `Tenant` objects and start them running
    for (tenant_shard_id, location_conf) in tenant_configs {
        let tenant_dir_path = conf.tenant_path(&tenant_shard_id);

@@ -622,22 +597,18 @@ pub async fn init_tenant_mgr(
        const DEFAULT_SECONDARY_CONF: SecondaryLocationConfig =
            SecondaryLocationConfig { warm: true };

+        // Update the location config according to the re-attach response
        if let Some(tenant_modes) = &tenant_modes {
            // We have a generation map: treat it as the authority for whether
            // this tenant is really attached.
            match tenant_modes.get(&tenant_shard_id) {
                None => {
                    info!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), "Detaching tenant, control plane omitted it in re-attach response");
-
-                    match safe_rename_tenant_dir(&tenant_dir_path).await {
-                        Ok(tmp_path) => {
-                            spawn_background_purge(tmp_path);
-                        }
-                        Err(e) => {
-                            error!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(),
-                            "Failed to move detached tenant directory '{tenant_dir_path}': {e:?}");
-                        }
-                    };
+                    if let Err(e) = safe_remove_tenant_dir_all(&tenant_dir_path).await {
+                        error!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(),
+                            "Failed to remove detached tenant directory '{tenant_dir_path}': {e:?}",
+                        );
+                    }

                    // We deleted local content: move on to next tenant, don't try and spawn this one.
                    continue;
@@ -683,32 +654,8 @@ pub async fn init_tenant_mgr(

        // Presence of a generation number implies attachment: attach the tenant
        // if it wasn't already, and apply the generation number.
-        config_write_futs.push(async move {
-            let r = Tenant::persist_tenant_config(conf, &tenant_shard_id, &location_conf).await;
-            (tenant_shard_id, location_conf, r)
-        });
-    }
+        Tenant::persist_tenant_config(conf, &tenant_shard_id, &location_conf).await?;

-    // Execute config writes with concurrency, to avoid bottlenecking on local FS write latency
-    tracing::info!(
-        "Writing {} location config files...",
-        config_write_futs.len()
-    );
-    let config_write_results = futures::stream::iter(config_write_futs)
-        .buffer_unordered(16)
-        .collect::<Vec<_>>()
-        .await;
-
-    tracing::info!(
-        "Spawning {} tenant shard locations...",
-        config_write_results.len()
-    );
-    // For those shards that have live configurations, construct `Tenant` or `SecondaryTenant` objects and start them running
-    for (tenant_shard_id, location_conf, config_write_result) in config_write_results {
-        // Errors writing configs are fatal
-        config_write_result?;
-
-        let tenant_dir_path = conf.tenant_path(&tenant_shard_id);
        let shard_identity = location_conf.shard;
        let slot = match location_conf.mode {
            LocationMode::Attached(attached_conf) => {
@@ -1752,7 +1699,7 @@ impl TenantManager {
        let tmp_path = safe_rename_tenant_dir(&local_tenant_directory)
            .await
            .with_context(|| format!("local tenant directory {local_tenant_directory:?} rename"))?;
-        spawn_background_purge(tmp_path);
+        self.spawn_background_purge(tmp_path);

        fail::fail_point!("shard-split-pre-finish", |_| Err(anyhow::anyhow!(
            "failpoint"
@@ -1907,6 +1854,28 @@ impl TenantManager {
        shutdown_all_tenants0(self.tenants).await
    }

+    /// When we have moved a tenant's content to a temporary directory, we may delete it lazily in
+    /// the background, and thereby avoid blocking any API requests on this deletion completing.
+    fn spawn_background_purge(&self, tmp_path: Utf8PathBuf) {
+        // Although we are cleaning up the tenant, this task is not meant to be bound by the lifetime of the tenant in memory.
+        // After a tenant is detached, there are no more task_mgr tasks for that tenant_id.
+        let task_tenant_id = None;
+
+        task_mgr::spawn(
+            task_mgr::BACKGROUND_RUNTIME.handle(),
+            TaskKind::MgmtRequest,
+            task_tenant_id,
+            None,
+            "tenant_files_delete",
+            false,
+            async move {
+                fs::remove_dir_all(tmp_path.as_path())
+                    .await
+                    .with_context(|| format!("tenant directory {:?} deletion", tmp_path))
+            },
+        );
+    }
+
    pub(crate) async fn detach_tenant(
        &self,
        conf: &'static PageServerConf,
@@ -1923,7 +1892,7 @@ impl TenantManager {
                deletion_queue_client,
            )
            .await?;
-        spawn_background_purge(tmp_path);
+        self.spawn_background_purge(tmp_path);

        Ok(())
    }
--- a/pageserver/src/tenant/remote_timeline_client.rs
+++ b/pageserver/src/tenant/remote_timeline_client.rs
@@ -243,9 +243,7 @@ use super::storage_layer::{Layer, LayerFileName, ResidentLayer};
 use super::upload_queue::SetDeletedFlagProgress;
 use super::Generation;

-pub(crate) use download::{
-    download_index_part, is_temp_download_file, list_remote_tenant_shards, list_remote_timelines,
-};
+pub(crate) use download::{is_temp_download_file, list_remote_timelines};
 pub(crate) use index::LayerFileMetadata;

 // Occasional network issues and such can cause remote operations to fail, and
@@ -474,7 +472,7 @@ impl RemoteTimelineClient {
            },
        );

-        let (index_part, _index_generation) = download::download_index_part(
+        let index_part = download::download_index_part(
            &self.storage_impl,
            &self.tenant_shard_id,
            &self.timeline_id,
@@ -1718,11 +1716,6 @@ impl RemoteTimelineClient {
    }
 }

-pub fn remote_tenant_path(tenant_shard_id: &TenantShardId) -> RemotePath {
-    let path = format!("tenants/{tenant_shard_id}");
-    RemotePath::from_string(&path).expect("Failed to construct path")
-}
-
 pub fn remote_timelines_path(tenant_shard_id: &TenantShardId) -> RemotePath {
    let path = format!("tenants/{tenant_shard_id}/{TIMELINES_SEGMENT_NAME}");
    RemotePath::from_string(&path).expect("Failed to construct path")
--- a/pageserver/src/tenant/remote_timeline_client/download.rs
+++ b/pageserver/src/tenant/remote_timeline_client/download.rs
@@ -5,7 +5,6 @@

 use std::collections::HashSet;
 use std::future::Future;
-use std::str::FromStr;

 use anyhow::{anyhow, Context};
 use camino::{Utf8Path, Utf8PathBuf};
@@ -26,13 +25,13 @@ use crate::virtual_file::{on_fatal_io_error, MaybeFatalIo, VirtualFile};
 use crate::TEMP_FILE_SUFFIX;
 use remote_storage::{DownloadError, GenericRemoteStorage, ListingMode, RemotePath};
 use utils::crashsafe::path_with_suffix_extension;
-use utils::id::{TenantId, TimelineId};
+use utils::id::TimelineId;

 use super::index::{IndexPart, LayerFileMetadata};
 use super::{
    parse_remote_index_path, remote_index_path, remote_initdb_archive_path,
-    remote_initdb_preserved_archive_path, remote_tenant_path, FAILED_DOWNLOAD_WARN_THRESHOLD,
-    FAILED_REMOTE_OP_RETRIES, INITDB_PATH,
+    remote_initdb_preserved_archive_path, FAILED_DOWNLOAD_WARN_THRESHOLD, FAILED_REMOTE_OP_RETRIES,
+    INITDB_PATH,
 };

 ///
@@ -183,7 +182,6 @@ async fn download_object<'a>(
        #[cfg(target_os = "linux")]
        crate::virtual_file::io_engine::IoEngine::TokioEpollUring => {
            use crate::virtual_file::owned_buffers_io::{self, util::size_tracking_writer};
-            use bytes::BytesMut;
            async {
                let destination_file = VirtualFile::create(dst_path)
                    .await
@@ -196,10 +194,10 @@ async fn download_object<'a>(
                // There's chunks_vectored() on the stream.
                let (bytes_amount, destination_file) = async {
                    let size_tracking = size_tracking_writer::Writer::new(destination_file);
-                    let mut buffered = owned_buffers_io::write::BufferedWriter::<BytesMut, _>::new(
-                        size_tracking,
-                        BytesMut::with_capacity(super::BUFFER_SIZE),
-                    );
+                    let mut buffered = owned_buffers_io::write::BufferedWriter::<
+                        { super::BUFFER_SIZE },
+                        _,
+                    >::new(size_tracking);
                    while let Some(res) =
                        futures::StreamExt::next(&mut download.download_stream).await
                    {
@@ -254,31 +252,42 @@ pub(crate) fn is_temp_download_file(path: &Utf8Path) -> bool {
    }
 }

-async fn list_identifiers<T>(
+/// List timelines of given tenant in remote storage
+pub async fn list_remote_timelines(
    storage: &GenericRemoteStorage,
-    prefix: RemotePath,
+    tenant_shard_id: TenantShardId,
    cancel: CancellationToken,
-) -> anyhow::Result<(HashSet<T>, HashSet<String>)>
-where
-    T: FromStr + Eq + std::hash::Hash,
-{
+) -> anyhow::Result<(HashSet<TimelineId>, HashSet<String>)> {
+    let remote_path = remote_timelines_path(&tenant_shard_id).add_trailing_slash();
+
+    fail::fail_point!("storage-sync-list-remote-timelines", |_| {
+        anyhow::bail!("storage-sync-list-remote-timelines");
+    });
+
    let listing = download_retry_forever(
-        || storage.list(Some(&prefix), ListingMode::WithDelimiter, None, &cancel),
-        &format!("list identifiers in prefix {prefix}"),
+        || {
+            storage.list(
+                Some(&remote_path),
+                ListingMode::WithDelimiter,
+                None,
+                &cancel,
+            )
+        },
+        &format!("list timelines for {tenant_shard_id}"),
        &cancel,
    )
    .await?;

-    let mut parsed_ids = HashSet::new();
+    let mut timeline_ids = HashSet::new();
    let mut other_prefixes = HashSet::new();

-    for id_remote_storage_key in listing.prefixes {
-        let object_name = id_remote_storage_key.object_name().ok_or_else(|| {
-            anyhow::anyhow!("failed to get object name for key {id_remote_storage_key}")
+    for timeline_remote_storage_key in listing.prefixes {
+        let object_name = timeline_remote_storage_key.object_name().ok_or_else(|| {
+            anyhow::anyhow!("failed to get timeline id for remote tenant {tenant_shard_id}")
        })?;

-        match object_name.parse::<T>() {
-            Ok(t) => parsed_ids.insert(t),
+        match object_name.parse::<TimelineId>() {
+            Ok(t) => timeline_ids.insert(t),
            Err(_) => other_prefixes.insert(object_name.to_string()),
        };
    }
@@ -290,31 +299,7 @@ where
        other_prefixes.insert(object_name.to_string());
    }

-    Ok((parsed_ids, other_prefixes))
-}
-
-/// List shards of given tenant in remote storage
-pub(crate) async fn list_remote_tenant_shards(
-    storage: &GenericRemoteStorage,
-    tenant_id: TenantId,
-    cancel: CancellationToken,
-) -> anyhow::Result<(HashSet<TenantShardId>, HashSet<String>)> {
-    let remote_path = remote_tenant_path(&TenantShardId::unsharded(tenant_id));
-    list_identifiers::<TenantShardId>(storage, remote_path, cancel).await
-}
-
-/// List timelines of given tenant shard in remote storage
-pub async fn list_remote_timelines(
-    storage: &GenericRemoteStorage,
-    tenant_shard_id: TenantShardId,
-    cancel: CancellationToken,
-) -> anyhow::Result<(HashSet<TimelineId>, HashSet<String>)> {
-    fail::fail_point!("storage-sync-list-remote-timelines", |_| {
-        anyhow::bail!("storage-sync-list-remote-timelines");
-    });
-
-    let remote_path = remote_timelines_path(&tenant_shard_id).add_trailing_slash();
-    list_identifiers::<TimelineId>(storage, remote_path, cancel).await
+    Ok((timeline_ids, other_prefixes))
 }

 async fn do_download_index_part(
@@ -323,7 +308,7 @@ async fn do_download_index_part(
    timeline_id: &TimelineId,
    index_generation: Generation,
    cancel: &CancellationToken,
-) -> Result<(IndexPart, Generation), DownloadError> {
+) -> Result<IndexPart, DownloadError> {
    let remote_path = remote_index_path(tenant_shard_id, timeline_id, index_generation);

    let index_part_bytes = download_retry_forever(
@@ -348,7 +333,7 @@ async fn do_download_index_part(
        .with_context(|| format!("deserialize index part file at {remote_path:?}"))
        .map_err(DownloadError::Other)?;

-    Ok((index_part, index_generation))
+    Ok(index_part)
 }

 /// index_part.json objects are suffixed with a generation number, so we cannot
@@ -357,13 +342,13 @@ async fn do_download_index_part(
 /// In this function we probe for the most recent index in a generation <= our current generation.
 /// See "Finding the remote indices for timelines" in docs/rfcs/025-generation-numbers.md
 #[tracing::instrument(skip_all, fields(generation=?my_generation))]
-pub(crate) async fn download_index_part(
+pub(super) async fn download_index_part(
    storage: &GenericRemoteStorage,
    tenant_shard_id: &TenantShardId,
    timeline_id: &TimelineId,
    my_generation: Generation,
    cancel: &CancellationToken,
-) -> Result<(IndexPart, Generation), DownloadError> {
+) -> Result<IndexPart, DownloadError> {
    debug_assert_current_span_has_tenant_and_timeline_id();

    if my_generation.is_none() {
--- a/pageserver/src/tenant/size.rs
+++ b/pageserver/src/tenant/size.rs
@@ -118,9 +118,6 @@ pub(super) async fn gather_inputs(
    ctx: &RequestContext,
 ) -> anyhow::Result<ModelInputs> {
    // refresh is needed to update gc related pitr_cutoff and horizon_cutoff
-    //
-    // FIXME: if a single timeline is deleted while refresh gc info is ongoing, we will fail the
-    // whole computation. It does not make sense from the billing perspective.
    tenant
        .refresh_gc_info(cancel, ctx)
        .await
--- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs
+++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs
@@ -17,7 +17,7 @@ use anyhow::{anyhow, ensure, Result};
 use pageserver_api::keyspace::KeySpace;
 use pageserver_api::models::InMemoryLayerInfo;
 use pageserver_api::shard::TenantShardId;
-use std::collections::{BTreeMap, BinaryHeap, HashSet};
+use std::collections::{BinaryHeap, HashMap, HashSet};
 use std::sync::{Arc, OnceLock};
 use std::time::Instant;
 use tracing::*;
@@ -78,10 +78,10 @@ impl std::fmt::Debug for InMemoryLayer {
 }

 pub struct InMemoryLayerInner {
-    /// All versions of all pages in the layer are kept here. Indexed
+    /// All versions of all pages in the layer are kept here.  Indexed
    /// by block number and LSN. The value is an offset into the
    /// ephemeral file where the page version is stored.
-    index: BTreeMap<Key, VecMap<Lsn, u64>>,
+    index: HashMap<Key, VecMap<Lsn, u64>>,

    /// The values are stored in a serialized format in this file.
    /// Each serialized Value is preceded by a 'u32' length field.
@@ -384,20 +384,25 @@ impl InMemoryLayer {
        let mut planned_block_reads = BinaryHeap::new();

        for range in keyspace.ranges.iter() {
-            for (key, vec_map) in inner.index.range(range.start..range.end) {
-                let lsn_range = match reconstruct_state.get_cached_lsn(key) {
-                    Some(cached_lsn) => (cached_lsn + 1)..end_lsn,
-                    None => self.start_lsn..end_lsn,
-                };
+            let mut key = range.start;
+            while key < range.end {
+                if let Some(vec_map) = inner.index.get(&key) {
+                    let lsn_range = match reconstruct_state.get_cached_lsn(&key) {
+                        Some(cached_lsn) => (cached_lsn + 1)..end_lsn,
+                        None => self.start_lsn..end_lsn,
+                    };

-                let slice = vec_map.slice_range(lsn_range);
-                for (entry_lsn, pos) in slice.iter().rev() {
-                    planned_block_reads.push(BlockRead {
-                        key: *key,
-                        lsn: *entry_lsn,
-                        block_offset: *pos,
-                    });
+                    let slice = vec_map.slice_range(lsn_range);
+                    for (entry_lsn, pos) in slice.iter().rev() {
+                        planned_block_reads.push(BlockRead {
+                            key,
+                            lsn: *entry_lsn,
+                            block_offset: *pos,
+                        });
+                    }
                }
+
+                key = key.next();
            }
        }

@@ -477,7 +482,7 @@ impl InMemoryLayer {
        trace!("initializing new empty InMemoryLayer for writing on timeline {timeline_id} at {start_lsn}");

        let file = EphemeralFile::create(conf, tenant_shard_id, timeline_id).await?;
-        let key = InMemoryLayerFileId(file.page_cache_file_id());
+        let key = InMemoryLayerFileId(file.id());

        Ok(InMemoryLayer {
            file_id: key,
@@ -494,7 +499,7 @@ impl InMemoryLayer {
            end_lsn: OnceLock::new(),
            opened_at: Instant::now(),
            inner: RwLock::new(InMemoryLayerInner {
-                index: BTreeMap::new(),
+                index: HashMap::new(),
                file,
                resource_units: GlobalResourceUnits::new(),
            }),
@@ -597,17 +602,14 @@ impl InMemoryLayer {
        }
    }

-    /// Write this frozen in-memory layer to disk. If `key_range` is set, the delta
-    /// layer will only contain the key range the user specifies, and may return `None`
-    /// if there are no matching keys.
+    /// Write this frozen in-memory layer to disk.
    ///
    /// Returns a new delta layer with all the same data as this in-memory layer
    pub(crate) async fn write_to_disk(
        &self,
        timeline: &Arc<Timeline>,
        ctx: &RequestContext,
-        key_range: Option<Range<Key>>,
-    ) -> Result<Option<ResidentLayer>> {
+    ) -> Result<ResidentLayer> {
        // Grab the lock in read-mode. We hold it over the I/O, but because this
        // layer is not writeable anymore, no one should be trying to acquire the
        // write lock on it, so we shouldn't block anyone. There's one exception
@@ -621,21 +623,6 @@ impl InMemoryLayer {

        let end_lsn = *self.end_lsn.get().unwrap();

-        let keys: Vec<_> = if let Some(key_range) = key_range {
-            inner
-                .index
-                .iter()
-                .filter(|(k, _)| key_range.contains(k))
-                .map(|(k, m)| (k.to_i128(), m))
-                .collect()
-        } else {
-            inner.index.iter().map(|(k, m)| (k.to_i128(), m)).collect()
-        };
-
-        if keys.is_empty() {
-            return Ok(None);
-        }
-
        let mut delta_layer_writer = DeltaLayerWriter::new(
            self.conf,
            self.timeline_id,
@@ -649,17 +636,26 @@ impl InMemoryLayer {

        let cursor = inner.file.block_cursor();

+        // Sort the keys because delta layer writer expects them sorted.
+        //
+        // NOTE: this sort can take up significant time if the layer has millions of
+        //       keys. To speed up all the comparisons we convert the key to i128 and
+        //       keep the value as a reference.
+        let mut keys: Vec<_> = inner.index.iter().map(|(k, m)| (k.to_i128(), m)).collect();
+        keys.sort_unstable_by_key(|k| k.0);
+
        let ctx = RequestContextBuilder::extend(ctx)
            .page_content_kind(PageContentKind::InMemoryLayer)
            .build();
-        for (key, vec_map) in inner.index.iter() {
+        for (key, vec_map) in keys.iter() {
+            let key = Key::from_i128(*key);
            // Write all page versions
            for (lsn, pos) in vec_map.as_slice() {
                cursor.read_blob_into_buf(*pos, &mut buf, &ctx).await?;
                let will_init = Value::des(&buf)?.will_init();
                let res;
                (buf, res) = delta_layer_writer
-                    .put_value_bytes(*key, *lsn, buf, will_init)
+                    .put_value_bytes(key, *lsn, buf, will_init)
                    .await;
                res?;
            }
@@ -667,6 +663,6 @@ impl InMemoryLayer {

        // MAX is used here because we identify L0 layers by full key range
        let delta_layer = delta_layer_writer.finish(Key::MAX, timeline).await?;
-        Ok(Some(delta_layer))
+        Ok(delta_layer)
    }
 }
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -17,7 +17,7 @@ use fail::fail_point;
 use once_cell::sync::Lazy;
 use pageserver_api::{
    key::{AUX_FILES_KEY, NON_INHERITED_RANGE},
-    keyspace::{KeySpaceAccum, SparseKeyPartitioning},
+    keyspace::KeySpaceAccum,
    models::{
        CompactionAlgorithm, DownloadRemoteLayersTaskInfo, DownloadRemoteLayersTaskSpawnRequest,
        EvictionPolicy, InMemoryLayerInfo, LayerMapInfo, TimelineState,
@@ -55,6 +55,7 @@ use std::{
    ops::ControlFlow,
 };

+use crate::deletion_queue::DeletionQueueClient;
 use crate::tenant::timeline::logical_size::CurrentLogicalSize;
 use crate::tenant::{
    layer_map::{LayerMap, SearchResult},
@@ -65,7 +66,6 @@ use crate::{
    disk_usage_eviction_task::DiskUsageEvictionInfo,
    pgdatadir_mapping::CollectKeySpaceError,
 };
-use crate::{deletion_queue::DeletionQueueClient, metrics::GetKind};
 use crate::{
    disk_usage_eviction_task::finite_f32,
    tenant::storage_layer::{
@@ -86,7 +86,7 @@ use crate::{
 use crate::config::PageServerConf;
 use crate::keyspace::{KeyPartitioning, KeySpace};
 use crate::metrics::{
-    TimelineMetrics, MATERIALIZED_PAGE_CACHE_HIT, MATERIALIZED_PAGE_CACHE_HIT_DIRECT,
+    GetKind, TimelineMetrics, MATERIALIZED_PAGE_CACHE_HIT, MATERIALIZED_PAGE_CACHE_HIT_DIRECT,
 };
 use crate::pgdatadir_mapping::CalculateLogicalSizeError;
 use crate::tenant::config::TenantConfOpt;
@@ -137,25 +137,6 @@ pub(super) enum FlushLoopState {
    Exited,
 }

-#[derive(Debug, Copy, Clone, PartialEq, Eq)]
-pub enum ImageLayerCreationMode {
-    /// Try to create image layers based on `time_for_new_image_layer`. Used in compaction code path.
-    Try,
-    /// Force creating the image layers if possible. For now, no image layers will be created
-    /// for metadata keys. Used in compaction code path with force flag enabled.
-    Force,
-    /// Initial ingestion of the data, and no data should be dropped in this function. This
-    /// means that no metadata keys should be included in the partitions. Used in flush frozen layer
-    /// code path.
-    Initial,
-}
-
-impl std::fmt::Display for ImageLayerCreationMode {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        write!(f, "{:?}", self)
-    }
-}
-
 /// Wrapper for key range to provide reverse ordering by range length for BinaryHeap
 #[derive(Debug, Clone, PartialEq, Eq)]
 pub(crate) struct Hole {
@@ -336,7 +317,7 @@ pub struct Timeline {
    pub initdb_lsn: Lsn,

    /// When did we last calculate the partitioning?
-    partitioning: tokio::sync::Mutex<((KeyPartitioning, SparseKeyPartitioning), Lsn)>,
+    partitioning: tokio::sync::Mutex<(KeyPartitioning, Lsn)>,

    /// Configuration: how often should the partitioning be recalculated.
    repartition_threshold: u64,
@@ -1168,11 +1149,6 @@ impl Timeline {
                panic!(concat!("Sequential get failed with {}, but vectored get did not",
                               " - keyspace={:?} lsn={}"),
                       seq_err, keyspace, lsn) },
-            (Ok(_), Err(GetVectoredError::GetReadyAncestorError(GetReadyAncestorError::AncestorLsnTimeout(_)))) => {
-                // Sequential get runs after vectored get, so it is possible for the later 
-                // to time out while waiting for its ancestor's Lsn to become ready and for the
-                // former to succeed (it essentially has a doubled wait time).
-            },
            (Ok(_), Err(vec_err)) => {
                panic!(concat!("Vectored get failed with {}, but sequential get did not",
                               " - keyspace={:?} lsn={}"),
@@ -1253,12 +1229,6 @@ impl Timeline {
        self.last_record_lsn.load()
    }

-    /// Subscribe to callers of wait_lsn(). The value of the channel is None if there are no
-    /// wait_lsn() calls in progress, and Some(Lsn) if there is an active waiter for wait_lsn().
-    pub(crate) fn subscribe_for_wait_lsn_updates(&self) -> watch::Receiver<Option<Lsn>> {
-        self.last_record_lsn.status_receiver()
-    }
-
    pub(crate) fn get_disk_consistent_lsn(&self) -> Lsn {
        self.disk_consistent_lsn.load()
    }
@@ -1901,15 +1871,6 @@ const REPARTITION_FREQ_IN_CHECKPOINT_DISTANCE: u64 = 10;

 // Private functions
 impl Timeline {
-    #[allow(dead_code)]
-    pub(crate) fn get_switch_to_aux_file_v2(&self) -> bool {
-        let tenant_conf = self.tenant_conf.load();
-        tenant_conf
-            .tenant_conf
-            .switch_to_aux_file_v2
-            .unwrap_or(self.conf.default_tenant_conf.switch_to_aux_file_v2)
-    }
-
    pub(crate) fn get_lazy_slru_download(&self) -> bool {
        let tenant_conf = self.tenant_conf.load();
        tenant_conf
@@ -2129,10 +2090,7 @@ impl Timeline {
                    // initial logical size is 0.
                    LogicalSize::empty_initial()
                },
-                partitioning: tokio::sync::Mutex::new((
-                    (KeyPartitioning::new(), KeyPartitioning::new().into_sparse()),
-                    Lsn(0),
-                )),
+                partitioning: tokio::sync::Mutex::new((KeyPartitioning::new(), Lsn(0))),
                repartition_threshold: 0,
                last_image_layer_creation_check_at: AtomicLsn::new(0),

@@ -3134,6 +3092,7 @@ impl Timeline {
            if let Some(SearchResult { lsn_floor, layer }) = layers.search(key, cont_lsn) {
                let layer = guard.get_from_desc(&layer);
                drop(guard);
+
                // Get all the data needed to reconstruct the page version from this layer.
                // But if we have an older cached page image, no need to go past that.
                let lsn_floor = max(cached_lsn + 1, lsn_floor);
@@ -3254,7 +3213,7 @@ impl Timeline {
        Ok(())
    }

-    /// Collect the reconstruct data for a keyspace from the specified timeline.
+    /// Collect the reconstruct data for a keyspace from the specified timeline.
    ///
    /// Maintain a fringe [`LayerFringe`] which tracks all the layers that intersect
    /// the current keyspace. The current keyspace of the search at any given timeline
@@ -3683,103 +3642,66 @@ impl Timeline {
        // files instead. This is possible as long as *all* the data imported into the
        // repository have the same LSN.
        let lsn_range = frozen_layer.get_lsn_range();
-
-        // Whether to directly create image layers for this flush, or flush them as delta layers
-        let create_image_layer =
-            lsn_range.start == self.initdb_lsn && lsn_range.end == Lsn(self.initdb_lsn.0 + 1);
-
-        #[cfg(test)]
-        {
-            match &mut *self.flush_loop_state.lock().unwrap() {
-                FlushLoopState::NotStarted | FlushLoopState::Exited => {
-                    panic!("flush loop not running")
-                }
-                FlushLoopState::Running {
-                    expect_initdb_optimization,
-                    initdb_optimization_count,
-                    ..
-                } => {
-                    if create_image_layer {
+        let (layers_to_upload, delta_layer_to_add) =
+            if lsn_range.start == self.initdb_lsn && lsn_range.end == Lsn(self.initdb_lsn.0 + 1) {
+                #[cfg(test)]
+                match &mut *self.flush_loop_state.lock().unwrap() {
+                    FlushLoopState::NotStarted | FlushLoopState::Exited => {
+                        panic!("flush loop not running")
+                    }
+                    FlushLoopState::Running {
+                        initdb_optimization_count,
+                        ..
+                    } => {
                        *initdb_optimization_count += 1;
-                    } else {
+                    }
+                }
+                // Note: The 'ctx' in use here has DownloadBehavior::Error. We should not
+                // require downloading anything during initial import.
+                let (partitioning, _lsn) = self
+                    .repartition(
+                        self.initdb_lsn,
+                        self.get_compaction_target_size(),
+                        EnumSet::empty(),
+                        ctx,
+                    )
+                    .await?;
+
+                if self.cancel.is_cancelled() {
+                    return Err(FlushLayerError::Cancelled);
+                }
+
+                // For image layers, we add them immediately into the layer map.
+                (
+                    self.create_image_layers(&partitioning, self.initdb_lsn, true, ctx)
+                        .await?,
+                    None,
+                )
+            } else {
+                #[cfg(test)]
+                match &mut *self.flush_loop_state.lock().unwrap() {
+                    FlushLoopState::NotStarted | FlushLoopState::Exited => {
+                        panic!("flush loop not running")
+                    }
+                    FlushLoopState::Running {
+                        expect_initdb_optimization,
+                        ..
+                    } => {
                        assert!(!*expect_initdb_optimization, "expected initdb optimization");
                    }
                }
-            }
-        }
-
-        let (layers_to_upload, delta_layer_to_add) = if create_image_layer {
-            // Note: The 'ctx' in use here has DownloadBehavior::Error. We should not
-            // require downloading anything during initial import.
-            let ((rel_partition, metadata_partition), _lsn) = self
-                .repartition(
-                    self.initdb_lsn,
-                    self.get_compaction_target_size(),
-                    EnumSet::empty(),
-                    ctx,
+                // Normal case, write out a L0 delta layer file.
+                // `create_delta_layer` will not modify the layer map.
+                // We will remove frozen layer and add delta layer in one atomic operation later.
+                let layer = self.create_delta_layer(&frozen_layer, ctx).await?;
+                (
+                    // FIXME: even though we have a single image and single delta layer assumption
+                    // we push them to vec
+                    vec![layer.clone()],
+                    Some(layer),
                )
-                .await?;
-
-            if self.cancel.is_cancelled() {
-                return Err(FlushLayerError::Cancelled);
-            }
-
-            // For metadata, always create delta layers.
-            let delta_layer = if !metadata_partition.parts.is_empty() {
-                assert_eq!(
-                    metadata_partition.parts.len(),
-                    1,
-                    "currently sparse keyspace should only contain a single aux file keyspace"
-                );
-                let metadata_keyspace = &metadata_partition.parts[0];
-                assert_eq!(
-                    metadata_keyspace.0.ranges.len(),
-                    1,
-                    "aux file keyspace should be a single range"
-                );
-                self.create_delta_layer(
-                    &frozen_layer,
-                    ctx,
-                    Some(metadata_keyspace.0.ranges[0].clone()),
-                )
-                .await?
-            } else {
-                None
            };

-            // For image layers, we add them immediately into the layer map.
-            let mut layers_to_upload = Vec::new();
-            layers_to_upload.extend(
-                self.create_image_layers(
-                    &rel_partition,
-                    self.initdb_lsn,
-                    ImageLayerCreationMode::Initial,
-                    ctx,
-                )
-                .await?,
-            );
-
-            if let Some(delta_layer) = delta_layer {
-                layers_to_upload.push(delta_layer.clone());
-                (layers_to_upload, Some(delta_layer))
-            } else {
-                (layers_to_upload, None)
-            }
-        } else {
-            // Normal case, write out a L0 delta layer file.
-            // `create_delta_layer` will not modify the layer map.
-            // We will remove frozen layer and add delta layer in one atomic operation later.
-            let Some(layer) = self.create_delta_layer(&frozen_layer, ctx, None).await? else {
-                panic!("delta layer cannot be empty if no filter is applied");
-            };
-            (
-                // FIXME: even though we have a single image and single delta layer assumption
-                // we push them to vec
-                vec![layer.clone()],
-                Some(layer),
-            )
-        };
-
        pausable_failpoint!("flush-layer-cancel-after-writing-layer-out-pausable");

        if self.cancel.is_cancelled() {
@@ -3899,18 +3821,12 @@ impl Timeline {
        self: &Arc<Self>,
        frozen_layer: &Arc<InMemoryLayer>,
        ctx: &RequestContext,
-        key_range: Option<Range<Key>>,
-    ) -> anyhow::Result<Option<ResidentLayer>> {
+    ) -> anyhow::Result<ResidentLayer> {
        let self_clone = Arc::clone(self);
        let frozen_layer = Arc::clone(frozen_layer);
        let ctx = ctx.attached_child();
        let work = async move {
-            let Some(new_delta) = frozen_layer
-                .write_to_disk(&self_clone, &ctx, key_range)
-                .await?
-            else {
-                return Ok(None);
-            };
+            let new_delta = frozen_layer.write_to_disk(&self_clone, &ctx).await?;
            // The write_to_disk() above calls writer.finish() which already did the fsync of the inodes.
            // We just need to fsync the directory in which these inodes are linked,
            // which we know to be the timeline directory.
@@ -3929,7 +3845,7 @@ impl Timeline {
                .sync_all()
                .await
                .fatal_err("VirtualFile::sync_all timeline dir");
-            anyhow::Ok(Some(new_delta))
+            anyhow::Ok(new_delta)
        };
        // Before tokio-epoll-uring, we ran write_to_disk & the sync_all inside spawn_blocking.
        // Preserve that behavior to maintain the same behavior for `virtual_file_io_engine=std-fs`.
@@ -3956,20 +3872,19 @@ impl Timeline {
        partition_size: u64,
        flags: EnumSet<CompactFlags>,
        ctx: &RequestContext,
-    ) -> anyhow::Result<((KeyPartitioning, SparseKeyPartitioning), Lsn)> {
+    ) -> anyhow::Result<(KeyPartitioning, Lsn)> {
        let Ok(mut partitioning_guard) = self.partitioning.try_lock() else {
            // NB: there are two callers, one is the compaction task, of which there is only one per struct Tenant and hence Timeline.
            // The other is the initdb optimization in flush_frozen_layer, used by `boostrap_timeline`, which runs before `.activate()`
            // and hence before the compaction task starts.
            anyhow::bail!("repartition() called concurrently, this should not happen");
        };
-        let ((dense_partition, sparse_partition), partition_lsn) = &*partitioning_guard;
-        if lsn < *partition_lsn {
+        if lsn < partitioning_guard.1 {
            anyhow::bail!("repartition() called with LSN going backwards, this should not happen");
        }

-        let distance = lsn.0 - partition_lsn.0;
-        if *partition_lsn != Lsn(0)
+        let distance = lsn.0 - partitioning_guard.1 .0;
+        if partitioning_guard.1 != Lsn(0)
            && distance <= self.repartition_threshold
            && !flags.contains(CompactFlags::ForceRepartition)
        {
@@ -3978,24 +3893,37 @@ impl Timeline {
                threshold = self.repartition_threshold,
                "no repartitioning needed"
            );
-            return Ok((
-                (dense_partition.clone(), sparse_partition.clone()),
-                *partition_lsn,
-            ));
+            return Ok((partitioning_guard.0.clone(), partitioning_guard.1));
        }

-        let (dense_ks, sparse_ks) = self.collect_keyspace(lsn, ctx).await?;
-        let dense_partitioning = dense_ks.partition(&self.shard_identity, partition_size);
-        let sparse_partitioning = SparseKeyPartitioning {
-            parts: vec![sparse_ks],
-        }; // no partitioning for metadata keys for now
-        *partitioning_guard = ((dense_partitioning, sparse_partitioning), lsn);
+        let keyspace = self.collect_keyspace(lsn, ctx).await?;
+        let partitioning = keyspace.partition(&self.shard_identity, partition_size);
+
+        *partitioning_guard = (partitioning, lsn);

        Ok((partitioning_guard.0.clone(), partitioning_guard.1))
    }

    // Is it time to create a new image layer for the given partition?
    async fn time_for_new_image_layer(&self, partition: &KeySpace, lsn: Lsn) -> bool {
+        let last = self.last_image_layer_creation_check_at.load();
+        if lsn != Lsn(0) {
+            let distance = lsn
+                .checked_sub(last)
+                .expect("Attempt to compact with LSN going backwards");
+
+            let min_distance = self.get_image_layer_creation_check_threshold() as u64
+                * self.get_checkpoint_distance();
+
+            // Skip the expensive delta layer counting below if we've not ingested
+            // sufficient WAL since the last check.
+            if distance.0 < min_distance {
+                return false;
+            }
+        }
+
+        self.last_image_layer_creation_check_at.store(lsn);
+
        let threshold = self.get_image_creation_threshold();

        let guard = self.layers.read().await;
@@ -4045,12 +3973,12 @@ impl Timeline {
        false
    }

-    #[tracing::instrument(skip_all, fields(%lsn, %mode))]
+    #[tracing::instrument(skip_all, fields(%lsn, %force))]
    async fn create_image_layers(
        self: &Arc<Timeline>,
        partitioning: &KeyPartitioning,
        lsn: Lsn,
-        mode: ImageLayerCreationMode,
+        force: bool,
        ctx: &RequestContext,
    ) -> Result<Vec<ResidentLayer>, CreateImageLayersError> {
        let timer = self.metrics.create_images_time_histo.start_timer();
@@ -4067,46 +3995,11 @@ impl Timeline {
        // image layers  <100000000..100000099> and <200000000..200000199> are not completely covering it.
        let mut start = Key::MIN;

-        let check_for_image_layers = {
-            let last_checks_at = self.last_image_layer_creation_check_at.load();
-            let distance = lsn
-                .checked_sub(last_checks_at)
-                .expect("Attempt to compact with LSN going backwards");
-            let min_distance = self.get_image_layer_creation_check_threshold() as u64
-                * self.get_checkpoint_distance();
-
-            // Skip the expensive delta layer counting if this timeline has not ingested sufficient
-            // WAL since the last check.
-            distance.0 >= min_distance
-        };
-
-        if check_for_image_layers {
-            self.last_image_layer_creation_check_at.store(lsn);
-        }
-
        for partition in partitioning.parts.iter() {
            let img_range = start..partition.ranges.last().unwrap().end;
-
-            if partition.overlaps(&Key::metadata_key_range()) {
-                // TODO(chi): The next patch will correctly create image layers for metadata keys, and it would be a
-                // rather big change. Keep this patch small for now.
-                match mode {
-                    ImageLayerCreationMode::Force | ImageLayerCreationMode::Try => {
-                        // skip image layer creation anyways for metadata keys.
-                        start = img_range.end;
-                        continue;
-                    }
-                    ImageLayerCreationMode::Initial => {
-                        return Err(CreateImageLayersError::Other(anyhow::anyhow!("no image layer should be created for metadata keys when flushing frozen layers")));
-                    }
-                }
-            } else if let ImageLayerCreationMode::Try = mode {
-                // check_for_image_layers = false -> skip
-                // check_for_image_layers = true -> check time_for_new_image_layer -> skip/generate
-                if !check_for_image_layers || !self.time_for_new_image_layer(partition, lsn).await {
-                    start = img_range.end;
-                    continue;
-                }
+            if !force && !self.time_for_new_image_layer(partition, lsn).await {
+                start = img_range.end;
+                continue;
            }

            let mut image_layer_writer = ImageLayerWriter::new(
@@ -4433,12 +4326,6 @@ impl Timeline {
        cancel: &CancellationToken,
        ctx: &RequestContext,
    ) -> anyhow::Result<()> {
-        let _timer = self
-            .metrics
-            .update_gc_info_histo
-            .start_timer()
-            .record_on_drop();
-
        // First, calculate pitr_cutoff_timestamp and then convert it to LSN.
        //
        // Some unit tests depend on garbage-collection working even when
--- a/pageserver/src/tenant/timeline/compaction.rs
+++ b/pageserver/src/tenant/timeline/compaction.rs
@@ -9,7 +9,7 @@ use std::ops::{Deref, Range};
 use std::sync::Arc;

 use super::layer_manager::LayerManager;
-use super::{CompactFlags, DurationRecorder, ImageLayerCreationMode, RecordedDuration, Timeline};
+use super::{CompactFlags, DurationRecorder, RecordedDuration, Timeline};

 use anyhow::{anyhow, Context};
 use enumset::EnumSet;
@@ -102,7 +102,7 @@ impl Timeline {
            )
            .await
        {
-            Ok(((dense_partitioning, sparse_partitioning), lsn)) => {
+            Ok((partitioning, lsn)) => {
                // Disables access_stats updates, so that the files we read remain candidates for eviction after we're done with them
                let image_ctx = RequestContextBuilder::extend(ctx)
                    .access_stats_behavior(AccessStatsBehavior::Skip)
@@ -115,37 +115,17 @@ impl Timeline {

                // 3. Create new image layers for partitions that have been modified
                // "enough".
-                let dense_layers = self
+                let layers = self
                    .create_image_layers(
-                        &dense_partitioning,
+                        &partitioning,
                        lsn,
-                        if flags.contains(CompactFlags::ForceImageLayerCreation) {
-                            ImageLayerCreationMode::Force
-                        } else {
-                            ImageLayerCreationMode::Try
-                        },
+                        flags.contains(CompactFlags::ForceImageLayerCreation),
                        &image_ctx,
                    )
                    .await
                    .map_err(anyhow::Error::from)?;

-                // For now, nothing will be produced...
-                let sparse_layers = self
-                    .create_image_layers(
-                        &sparse_partitioning.clone().into_dense(),
-                        lsn,
-                        if flags.contains(CompactFlags::ForceImageLayerCreation) {
-                            ImageLayerCreationMode::Force
-                        } else {
-                            ImageLayerCreationMode::Try
-                        },
-                        &image_ctx,
-                    )
-                    .await
-                    .map_err(anyhow::Error::from)?;
-                assert!(sparse_layers.is_empty());
-
-                self.upload_new_image_layers(dense_layers)?;
+                self.upload_new_image_layers(layers)?;
            }
            Err(err) => {
                // no partitioning? This is normal, if the timeline was just created
@@ -778,9 +758,8 @@ impl Timeline {
            return Err(CompactionError::ShuttingDown);
        }

-        let (dense_ks, _sparse_ks) = self.collect_keyspace(end_lsn, ctx).await?;
-        // TODO(chi): ignore sparse_keyspace for now, compact it in the future.
-        let mut adaptor = TimelineAdaptor::new(self, (end_lsn, dense_ks));
+        let keyspace = self.collect_keyspace(end_lsn, ctx).await?;
+        let mut adaptor = TimelineAdaptor::new(self, (end_lsn, keyspace));

        pageserver_compaction::compact_tiered::compact_tiered(
            &mut adaptor,
--- a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs
+++ b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs
@@ -22,12 +22,10 @@ use crate::tenant::{debug_assert_current_span_has_tenant_and_timeline_id, Timeli
 use anyhow::Context;
 use chrono::{NaiveDateTime, Utc};
 use pageserver_api::models::TimelineState;
-
+use storage_broker::proto::subscribe_safekeeper_info_request::SubscriptionKey;
+use storage_broker::proto::SafekeeperTimelineInfo;
+use storage_broker::proto::SubscribeSafekeeperInfoRequest;
 use storage_broker::proto::TenantTimelineId as ProtoTenantTimelineId;
-use storage_broker::proto::{
-    FilterTenantTimelineId, MessageType, SafekeeperDiscoveryRequest, SafekeeperDiscoveryResponse,
-    SubscribeByFilterRequest, TypeSubscription, TypedMessage,
-};
 use storage_broker::{BrokerClientChannel, Code, Streaming};
 use tokio_util::sync::CancellationToken;
 use tracing::*;
@@ -91,14 +89,6 @@ pub(super) async fn connection_manager_loop_step(
        .timeline
        .subscribe_for_state_updates();

-    let mut wait_lsn_status = connection_manager_state
-        .timeline
-        .subscribe_for_wait_lsn_updates();
-
-    // TODO: create a separate config option for discovery request interval
-    let discovery_request_interval = connection_manager_state.conf.lagging_wal_timeout;
-    let mut last_discovery_ts: Option<std::time::Instant> = None;
-
    // Subscribe to the broker updates. Stream shares underlying TCP connection
    // with other streams on this client (other connection managers). When
    // object goes out of scope, stream finishes in drop() automatically.
@@ -107,12 +97,10 @@ pub(super) async fn connection_manager_loop_step(

    loop {
        let time_until_next_retry = connection_manager_state.time_until_next_retry();
-        let any_activity = connection_manager_state.wal_connection.is_some()
-            || !connection_manager_state.wal_stream_candidates.is_empty();

        // These things are happening concurrently:
        //
-        //  - cancellation request
+        // - cancellation request
        //  - keep receiving WAL on the current connection
        //      - if the shared state says we need to change connection, disconnect and return
        //      - this runs in a separate task and we receive updates via a watch channel
@@ -120,7 +108,6 @@ pub(super) async fn connection_manager_loop_step(
        //  - receive updates from broker
        //      - this might change the current desired connection
        //  - timeline state changes to something that does not allow walreceiver to run concurrently
-        //  - if there's no connection and no candidates, try to send a discovery request

        // NB: make sure each of the select expressions are cancellation-safe
        // (no need for arms to be cancellation-safe).
@@ -227,65 +214,6 @@ pub(super) async fn connection_manager_loop_step(
                    }
                }
            } => debug!("Waking up for the next retry after waiting for {time_until_next_retry:?}"),
-
-            Some(()) = async {
-                // Reminder: this match arm needs to be cancellation-safe.
-                // Calculating time needed to wait until sending the next discovery request.
-                // Current implementation is conservative and sends discovery requests only when there are no candidates.
-
-                if any_activity {
-                    // No need to send discovery requests if there is an active connection or candidates.
-                    return None;
-                }
-
-                // Waiting for an active wait_lsn request.
-                while wait_lsn_status.borrow().is_none() {
-                    if wait_lsn_status.changed().await.is_err() {
-                        // wait_lsn_status channel was closed, exiting
-                        warn!("wait_lsn_status channel was closed in connection_manager_loop_step");
-                        return None;
-                    }
-                }
-
-                // All preconditions met, preparing to send a discovery request.
-                let now = std::time::Instant::now();
-                let next_discovery_ts = last_discovery_ts
-                    .map(|ts| ts + discovery_request_interval)
-                    .unwrap_or_else(|| now);
-
-                if next_discovery_ts > now {
-                    // Prevent sending discovery requests too frequently.
-                    tokio::time::sleep(next_discovery_ts - now).await;
-                }
-
-                let tenant_timeline_id = Some(ProtoTenantTimelineId {
-                    tenant_id: id.tenant_id.as_ref().to_owned(),
-                    timeline_id: id.timeline_id.as_ref().to_owned(),
-                });
-                let request = SafekeeperDiscoveryRequest { tenant_timeline_id };
-                let msg = TypedMessage {
-                    r#type: MessageType::SafekeeperDiscoveryRequest as i32,
-                    safekeeper_timeline_info: None,
-                    safekeeper_discovery_request: Some(request),
-                    safekeeper_discovery_response: None,
-                    };
-
-                last_discovery_ts = Some(std::time::Instant::now());
-                debug!("No active connection and no candidates, sending discovery request to the broker");
-
-                // Cancellation safety: we want to send a message to the broker, but publish_one()
-                // function can get cancelled by the other select! arm. This is absolutely fine, because
-                // we just want to receive broker updates and discovery is not important if we already
-                // receive updates.
-                //
-                // It is possible that `last_discovery_ts` will be updated, but the message will not be sent.
-                // This is totally fine because of the reason above.
-
-                // This is a fire-and-forget request, we don't care about the response
-                let _ = broker_client.publish_one(msg).await;
-                debug!("Discovery request sent to the broker");
-                None
-            } => {}
        }

        if let Some(new_candidate) = connection_manager_state.next_connection_candidate() {
@@ -303,7 +231,7 @@ async fn subscribe_for_timeline_updates(
    broker_client: &mut BrokerClientChannel,
    id: TenantTimelineId,
    cancel: &CancellationToken,
-) -> Result<Streaming<TypedMessage>, Cancelled> {
+) -> Result<Streaming<SafekeeperTimelineInfo>, Cancelled> {
    let mut attempt = 0;
    loop {
        exponential_backoff(
@@ -316,27 +244,17 @@ async fn subscribe_for_timeline_updates(
        attempt += 1;

        // subscribe to the specific timeline
-        let request = SubscribeByFilterRequest {
-            types: vec![
-                TypeSubscription {
-                    r#type: MessageType::SafekeeperTimelineInfo as i32,
-                },
-                TypeSubscription {
-                    r#type: MessageType::SafekeeperDiscoveryResponse as i32,
-                },
-            ],
-            tenant_timeline_id: Some(FilterTenantTimelineId {
-                enabled: true,
-                tenant_timeline_id: Some(ProtoTenantTimelineId {
-                    tenant_id: id.tenant_id.as_ref().to_owned(),
-                    timeline_id: id.timeline_id.as_ref().to_owned(),
-                }),
-            }),
+        let key = SubscriptionKey::TenantTimelineId(ProtoTenantTimelineId {
+            tenant_id: id.tenant_id.as_ref().to_owned(),
+            timeline_id: id.timeline_id.as_ref().to_owned(),
+        });
+        let request = SubscribeSafekeeperInfoRequest {
+            subscription_key: Some(key),
        };

        match {
            tokio::select! {
-                r = broker_client.subscribe_by_filter(request) => { r }
+                r = broker_client.subscribe_safekeeper_info(request) => { r }
                _ = cancel.cancelled() => { return Err(Cancelled); }
            }
        } {
@@ -480,7 +398,7 @@ struct RetryInfo {
 /// Data about the timeline to connect to, received from the broker.
 #[derive(Debug, Clone)]
 struct BrokerSkTimeline {
-    timeline: SafekeeperDiscoveryResponse,
+    timeline: SafekeeperTimelineInfo,
    /// Time at which the data was fetched from the broker last time, to track the stale data.
    latest_update: NaiveDateTime,
 }
@@ -688,41 +606,7 @@ impl ConnectionManagerState {
    }

    /// Adds another broker timeline into the state, if its more recent than the one already added there for the same key.
-    fn register_timeline_update(&mut self, typed_msg: TypedMessage) {
-        let mut is_discovery = false;
-        let timeline_update = match typed_msg.r#type() {
-            MessageType::SafekeeperTimelineInfo => {
-                let info = match typed_msg.safekeeper_timeline_info {
-                    Some(info) => info,
-                    None => {
-                        warn!("bad proto message from broker: no safekeeper_timeline_info");
-                        return;
-                    }
-                };
-                SafekeeperDiscoveryResponse {
-                    safekeeper_id: info.safekeeper_id,
-                    tenant_timeline_id: info.tenant_timeline_id,
-                    commit_lsn: info.commit_lsn,
-                    safekeeper_connstr: info.safekeeper_connstr,
-                    availability_zone: info.availability_zone,
-                }
-            }
-            MessageType::SafekeeperDiscoveryResponse => {
-                is_discovery = true;
-                match typed_msg.safekeeper_discovery_response {
-                    Some(response) => response,
-                    None => {
-                        warn!("bad proto message from broker: no safekeeper_discovery_response");
-                        return;
-                    }
-                }
-            }
-            _ => {
-                // unexpected message
-                return;
-            }
-        };
-
+    fn register_timeline_update(&mut self, timeline_update: SafekeeperTimelineInfo) {
        WALRECEIVER_BROKER_UPDATES.inc();

        let new_safekeeper_id = NodeId(timeline_update.safekeeper_id);
@@ -735,11 +619,7 @@ impl ConnectionManagerState {
        );

        if old_entry.is_none() {
-            info!(
-                ?is_discovery,
-                %new_safekeeper_id,
-                "New SK node was added",
-            );
+            info!("New SK node was added: {new_safekeeper_id}");
            WALRECEIVER_CANDIDATES_ADDED.inc();
        }
    }
@@ -938,7 +818,7 @@ impl ConnectionManagerState {
    fn select_connection_candidate(
        &self,
        node_to_omit: Option<NodeId>,
-    ) -> Option<(NodeId, &SafekeeperDiscoveryResponse, PgConnectionConfig)> {
+    ) -> Option<(NodeId, &SafekeeperTimelineInfo, PgConnectionConfig)> {
        self.applicable_connection_candidates()
            .filter(|&(sk_id, _, _)| Some(sk_id) != node_to_omit)
            .max_by_key(|(_, info, _)| info.commit_lsn)
@@ -948,7 +828,7 @@ impl ConnectionManagerState {
    /// Some safekeepers are filtered by the retry cooldown.
    fn applicable_connection_candidates(
        &self,
-    ) -> impl Iterator<Item = (NodeId, &SafekeeperDiscoveryResponse, PgConnectionConfig)> {
+    ) -> impl Iterator<Item = (NodeId, &SafekeeperTimelineInfo, PgConnectionConfig)> {
        let now = Utc::now().naive_utc();

        self.wal_stream_candidates
@@ -1088,11 +968,19 @@ mod tests {
        latest_update: NaiveDateTime,
    ) -> BrokerSkTimeline {
        BrokerSkTimeline {
-            timeline: SafekeeperDiscoveryResponse {
+            timeline: SafekeeperTimelineInfo {
                safekeeper_id: 0,
                tenant_timeline_id: None,
+                term: 0,
+                last_log_term: 0,
+                flush_lsn: 0,
                commit_lsn,
+                backup_lsn: 0,
+                remote_consistent_lsn: 0,
+                peer_horizon_lsn: 0,
+                local_start_lsn: 0,
                safekeeper_connstr: safekeeper_connstr.to_owned(),
+                http_connstr: safekeeper_connstr.to_owned(),
                availability_zone: None,
            },
            latest_update,
--- a/pageserver/src/virtual_file.rs
+++ b/pageserver/src/virtual_file.rs
@@ -32,11 +32,11 @@ pub use io_engine::feature_test as io_engine_feature_test;
 pub use io_engine::FeatureTestResult as IoEngineFeatureTestResult;
 mod metadata;
 mod open_options;
-use self::owned_buffers_io::write::OwnedAsyncWriter;
 pub(crate) use io_engine::IoEngineKind;
 pub(crate) use metadata::Metadata;
 pub(crate) use open_options::*;

+#[cfg_attr(not(target_os = "linux"), allow(dead_code))]
 pub(crate) mod owned_buffers_io {
    //! Abstractions for IO with owned buffers.
    //!
@@ -1083,17 +1083,6 @@ impl Drop for VirtualFile {
    }
 }

-impl OwnedAsyncWriter for VirtualFile {
-    #[inline(always)]
-    async fn write_all<B: BoundedBuf<Buf = Buf>, Buf: IoBuf + Send>(
-        &mut self,
-        buf: B,
-    ) -> std::io::Result<(usize, B::Buf)> {
-        let (buf, res) = VirtualFile::write_all(self, buf).await;
-        res.map(move |v| (v, buf))
-    }
-}
-
 impl OpenFiles {
    fn new(num_slots: usize) -> OpenFiles {
        let mut slots = Box::new(Vec::with_capacity(num_slots));
--- a/pageserver/src/virtual_file/owned_buffers_io/util/size_tracking_writer.rs
+++ b/pageserver/src/virtual_file/owned_buffers_io/util/size_tracking_writer.rs
@@ -1,45 +1,33 @@
-use crate::virtual_file::owned_buffers_io::write::OwnedAsyncWriter;
+use crate::virtual_file::{owned_buffers_io::write::OwnedAsyncWriter, VirtualFile};
 use tokio_epoll_uring::{BoundedBuf, IoBuf};

-pub struct Writer<W> {
-    dst: W,
+pub struct Writer {
+    dst: VirtualFile,
    bytes_amount: u64,
 }

-impl<W> Writer<W> {
-    pub fn new(dst: W) -> Self {
+impl Writer {
+    pub fn new(dst: VirtualFile) -> Self {
        Self {
            dst,
            bytes_amount: 0,
        }
    }
-
-    pub fn bytes_written(&self) -> u64 {
-        self.bytes_amount
-    }
-
-    pub fn as_inner(&self) -> &W {
-        &self.dst
-    }
-
    /// Returns the wrapped `VirtualFile` object as well as the number
    /// of bytes that were written to it through this object.
-    #[cfg_attr(target_os = "macos", allow(dead_code))]
-    pub fn into_inner(self) -> (u64, W) {
+    pub fn into_inner(self) -> (u64, VirtualFile) {
        (self.bytes_amount, self.dst)
    }
 }

-impl<W> OwnedAsyncWriter for Writer<W>
-where
-    W: OwnedAsyncWriter,
-{
+impl OwnedAsyncWriter for Writer {
    #[inline(always)]
    async fn write_all<B: BoundedBuf<Buf = Buf>, Buf: IoBuf + Send>(
        &mut self,
        buf: B,
    ) -> std::io::Result<(usize, B::Buf)> {
-        let (nwritten, buf) = self.dst.write_all(buf).await?;
+        let (buf, res) = self.dst.write_all(buf).await;
+        let nwritten = res?;
        self.bytes_amount += u64::try_from(nwritten).unwrap();
        Ok((nwritten, buf))
    }
--- a/pageserver/src/virtual_file/owned_buffers_io/write.rs
+++ b/pageserver/src/virtual_file/owned_buffers_io/write.rs
@@ -10,14 +10,14 @@ pub trait OwnedAsyncWriter {
    ) -> std::io::Result<(usize, B::Buf)>;
 }

-/// A wrapper aorund an [`OwnedAsyncWriter`] that uses a [`Buffer`] to batch
-/// small writes into larger writes of size [`Buffer::cap`].
+/// A wrapper around an [`OwnedAsyncWriter`] that batches smaller writes
+/// into `BUFFER_SIZE`-sized writes.
 ///
 /// # Passthrough Of Large Writers
 ///
-/// Calls to [`BufferedWriter::write_buffered`] that are larger than [`Buffer::cap`]
-/// cause the internal buffer to be flushed prematurely so that the large
-/// buffered write is passed through to the underlying [`OwnedAsyncWriter`].
+/// Buffered writes larger than the `BUFFER_SIZE` cause the internal
+/// buffer to be flushed, even if it is not full yet. Then, the large
+/// buffered write is passed through to the underlying [`OwnedAsyncWriter`].
 ///
 /// This pass-through is generally beneficial for throughput, but if
 /// the storage backend of the [`OwnedAsyncWriter`] is a shared resource,
@@ -25,38 +25,27 @@ pub trait OwnedAsyncWriter {
 ///
 /// In such cases, a different implementation that always buffers in memory
 /// may be preferable.
-pub struct BufferedWriter<B, W> {
+pub struct BufferedWriter<const BUFFER_SIZE: usize, W> {
    writer: W,
-    /// invariant: always remains Some(buf) except
-    /// - while IO is ongoing => goes back to Some() once the IO completed successfully
-    /// - after an IO error => stays `None` forever
-    /// In these exceptional cases, it's `None`.
-    buf: Option<B>,
+    // invariant: always remains Some(buf)
+    // with buf.capacity() == BUFFER_SIZE except
+    // - while IO is ongoing => goes back to Some() once the IO completed successfully
+    // - after an IO error => stays `None` forever
+    // In these exceptional cases, it's `None`.
+    buf: Option<BytesMut>,
 }

-impl<B, Buf, W> BufferedWriter<B, W>
+impl<const BUFFER_SIZE: usize, W> BufferedWriter<BUFFER_SIZE, W>
 where
-    B: Buffer<IoBuf = Buf> + Send,
-    Buf: IoBuf + Send,
    W: OwnedAsyncWriter,
 {
-    pub fn new(writer: W, buf: B) -> Self {
+    pub fn new(writer: W) -> Self {
        Self {
            writer,
-            buf: Some(buf),
+            buf: Some(BytesMut::with_capacity(BUFFER_SIZE)),
        }
    }

-    pub fn as_inner(&self) -> &W {
-        &self.writer
-    }
-
-    /// Panics if used after any of the write paths returned an error
-    pub fn inspect_buffer(&self) -> &B {
-        self.buf()
-    }
-
-    #[cfg_attr(target_os = "macos", allow(dead_code))]
    pub async fn flush_and_into_inner(mut self) -> std::io::Result<W> {
        self.flush().await?;
        let Self { buf, writer } = self;
@@ -64,144 +53,61 @@ where
        Ok(writer)
    }

-    #[inline(always)]
-    fn buf(&self) -> &B {
-        self.buf
-            .as_ref()
-            .expect("must not use after we returned an error")
-    }
-
-    #[cfg_attr(target_os = "macos", allow(dead_code))]
-    pub async fn write_buffered<S: IoBuf>(&mut self, chunk: Slice<S>) -> std::io::Result<(usize, S)>
+    pub async fn write_buffered<B: IoBuf>(&mut self, chunk: Slice<B>) -> std::io::Result<()>
    where
-        S: IoBuf + Send,
+        B: IoBuf + Send,
    {
-        let chunk_len = chunk.len();
        // avoid memcpy for the middle of the chunk
-        if chunk.len() >= self.buf().cap() {
+        if chunk.len() >= BUFFER_SIZE {
            self.flush().await?;
            // do a big write, bypassing `buf`
            assert_eq!(
                self.buf
                    .as_ref()
                    .expect("must not use after an error")
-                    .pending(),
+                    .len(),
                0
            );
+            let chunk_len = chunk.len();
            let (nwritten, chunk) = self.writer.write_all(chunk).await?;
            assert_eq!(nwritten, chunk_len);
-            return Ok((nwritten, chunk));
+            drop(chunk);
+            return Ok(());
        }
        // in-memory copy the < BUFFER_SIZED tail of the chunk
-        assert!(chunk.len() < self.buf().cap());
-        let mut slice = &chunk[..];
-        while !slice.is_empty() {
-            let buf = self.buf.as_mut().expect("must not use after an error");
-            let need = buf.cap() - buf.pending();
-            let have = slice.len();
-            let n = std::cmp::min(need, have);
-            buf.extend_from_slice(&slice[..n]);
-            slice = &slice[n..];
-            if buf.pending() >= buf.cap() {
-                assert_eq!(buf.pending(), buf.cap());
-                self.flush().await?;
-            }
-        }
-        assert!(slice.is_empty(), "by now we should have drained the chunk");
-        Ok((chunk_len, chunk.into_inner()))
-    }
-
-    /// Strictly less performant variant of [`Self::write_buffered`] that allows writing borrowed data.
-    ///
-    /// It is less performant because we always have to copy the borrowed data into the internal buffer
-    /// before we can do the IO. The [`Self::write_buffered`] can avoid this, which is more performant
-    /// for large writes.
-    pub async fn write_buffered_borrowed(&mut self, mut chunk: &[u8]) -> std::io::Result<usize> {
-        let chunk_len = chunk.len();
+        assert!(chunk.len() < BUFFER_SIZE);
+        let mut chunk = &chunk[..];
        while !chunk.is_empty() {
            let buf = self.buf.as_mut().expect("must not use after an error");
-            let need = buf.cap() - buf.pending();
+            let need = BUFFER_SIZE - buf.len();
            let have = chunk.len();
            let n = std::cmp::min(need, have);
            buf.extend_from_slice(&chunk[..n]);
            chunk = &chunk[n..];
-            if buf.pending() >= buf.cap() {
-                assert_eq!(buf.pending(), buf.cap());
+            if buf.len() >= BUFFER_SIZE {
+                assert_eq!(buf.len(), BUFFER_SIZE);
                self.flush().await?;
            }
        }
-        Ok(chunk_len)
+        assert!(chunk.is_empty(), "by now we should have drained the chunk");
+        Ok(())
    }

    async fn flush(&mut self) -> std::io::Result<()> {
        let buf = self.buf.take().expect("must not use after an error");
-        let buf_len = buf.pending();
-        if buf_len == 0 {
+        if buf.is_empty() {
            self.buf = Some(buf);
-            return Ok(());
+            return std::io::Result::Ok(());
        }
-        let (nwritten, io_buf) = self.writer.write_all(buf.flush()).await?;
+        let buf_len = buf.len();
+        let (nwritten, mut buf) = self.writer.write_all(buf).await?;
        assert_eq!(nwritten, buf_len);
-        self.buf = Some(Buffer::reuse_after_flush(io_buf));
+        buf.clear();
+        self.buf = Some(buf);
        Ok(())
    }
 }

-/// A [`Buffer`] is used by [`BufferedWriter`] to batch smaller writes into larger ones.
-pub trait Buffer {
-    type IoBuf: IoBuf;
-
-    /// Capacity of the buffer. Must not change over the lifetime `self`.`
-    fn cap(&self) -> usize;
-
-    /// Add data to the buffer.
-    /// Panics if there is not enough room to accomodate `other`'s content, i.e.,
-    /// panics if `other.len() > self.cap() - self.pending()`.
-    fn extend_from_slice(&mut self, other: &[u8]);
-
-    /// Number of bytes in the buffer.
-    fn pending(&self) -> usize;
-
-    /// Turns `self` into a [`tokio_epoll_uring::Slice`] of the pending data
-    /// so we can use [`tokio_epoll_uring`] to write it to disk.
-    fn flush(self) -> Slice<Self::IoBuf>;
-
-    /// After the write to disk is done and we have gotten back the slice,
-    /// [`BufferedWriter`] uses this method to re-use the io buffer.
-    fn reuse_after_flush(iobuf: Self::IoBuf) -> Self;
-}
-
-impl Buffer for BytesMut {
-    type IoBuf = BytesMut;
-
-    #[inline(always)]
-    fn cap(&self) -> usize {
-        self.capacity()
-    }
-
-    fn extend_from_slice(&mut self, other: &[u8]) {
-        BytesMut::extend_from_slice(self, other)
-    }
-
-    #[inline(always)]
-    fn pending(&self) -> usize {
-        self.len()
-    }
-
-    fn flush(self) -> Slice<BytesMut> {
-        if self.is_empty() {
-            return self.slice_full();
-        }
-        let len = self.len();
-        self.slice(0..len)
-    }
-
-    fn reuse_after_flush(mut iobuf: BytesMut) -> Self {
-        iobuf.clear();
-        iobuf
-    }
-}
-
 impl OwnedAsyncWriter for Vec<u8> {
    async fn write_all<B: BoundedBuf<Buf = Buf>, Buf: IoBuf + Send>(
        &mut self,
@@ -219,8 +125,6 @@ impl OwnedAsyncWriter for Vec<u8> {

 #[cfg(test)]
 mod tests {
-    use bytes::BytesMut;
-
    use super::*;

    #[derive(Default)]
@@ -254,7 +158,7 @@ mod tests {
    #[tokio::test]
    async fn test_buffered_writes_only() -> std::io::Result<()> {
        let recorder = RecorderWriter::default();
-        let mut writer = BufferedWriter::new(recorder, BytesMut::with_capacity(2));
+        let mut writer = BufferedWriter::<2, _>::new(recorder);
        write!(writer, b"a");
        write!(writer, b"b");
        write!(writer, b"c");
@@ -271,7 +175,7 @@ mod tests {
    #[tokio::test]
    async fn test_passthrough_writes_only() -> std::io::Result<()> {
        let recorder = RecorderWriter::default();
-        let mut writer = BufferedWriter::new(recorder, BytesMut::with_capacity(2));
+        let mut writer = BufferedWriter::<2, _>::new(recorder);
        write!(writer, b"abc");
        write!(writer, b"de");
        write!(writer, b"");
@@ -287,7 +191,7 @@ mod tests {
    #[tokio::test]
    async fn test_passthrough_write_with_nonempty_buffer() -> std::io::Result<()> {
        let recorder = RecorderWriter::default();
-        let mut writer = BufferedWriter::new(recorder, BytesMut::with_capacity(2));
+        let mut writer = BufferedWriter::<2, _>::new(recorder);
        write!(writer, b"a");
        write!(writer, b"bc");
        write!(writer, b"d");
@@ -299,31 +203,4 @@ mod tests {
        );
        Ok(())
    }
-
-    #[tokio::test]
-    async fn test_write_all_borrowed_always_goes_through_buffer() -> std::io::Result<()> {
-        let recorder = RecorderWriter::default();
-        let mut writer = BufferedWriter::new(recorder, BytesMut::with_capacity(2));
-
-        writer.write_buffered_borrowed(b"abc").await?;
-        writer.write_buffered_borrowed(b"d").await?;
-        writer.write_buffered_borrowed(b"e").await?;
-        writer.write_buffered_borrowed(b"fg").await?;
-        writer.write_buffered_borrowed(b"hi").await?;
-        writer.write_buffered_borrowed(b"j").await?;
-        writer.write_buffered_borrowed(b"klmno").await?;
-
-        let recorder = writer.flush_and_into_inner().await?;
-        assert_eq!(
-            recorder.writes,
-            {
-                let expect: &[&[u8]] = &[b"ab", b"cd", b"ef", b"gh", b"ij", b"kl", b"mn", b"o"];
-                expect
-            }
-            .iter()
-            .map(|v| v[..].to_vec())
-            .collect::<Vec<_>>()
-        );
-        Ok(())
-    }
 }
--- a/pageserver/src/walingest.rs
+++ b/pageserver/src/walingest.rs
@@ -1034,7 +1034,7 @@ impl WalIngest {

            let nblocks = modification
                .tline
-                .get_rel_size(src_rel, Version::Modified(modification), ctx)
+                .get_rel_size(src_rel, Version::Modified(modification), true, ctx)
                .await?;
            let dst_rel = RelTag {
                spcnode: tablespace_id,
@@ -1068,7 +1068,13 @@ impl WalIngest {

                let content = modification
                    .tline
-                    .get_rel_page_at_lsn(src_rel, blknum, Version::Modified(modification), ctx)
+                    .get_rel_page_at_lsn(
+                        src_rel,
+                        blknum,
+                        Version::Modified(modification),
+                        true,
+                        ctx,
+                    )
                    .await?;
                modification.put_rel_page_image(dst_rel, blknum, content)?;
                num_blocks_copied += 1;
@@ -1236,7 +1242,7 @@ impl WalIngest {
                };
                if modification
                    .tline
-                    .get_rel_exists(rel, Version::Modified(modification), ctx)
+                    .get_rel_exists(rel, Version::Modified(modification), true, ctx)
                    .await?
                {
                    self.put_rel_drop(modification, rel, ctx).await?;
@@ -1535,7 +1541,7 @@ impl WalIngest {
            nblocks
        } else if !modification
            .tline
-            .get_rel_exists(rel, Version::Modified(modification), ctx)
+            .get_rel_exists(rel, Version::Modified(modification), true, ctx)
            .await?
        {
            // create it with 0 size initially, the logic below will extend it
@@ -1547,7 +1553,7 @@ impl WalIngest {
        } else {
            modification
                .tline
-                .get_rel_size(rel, Version::Modified(modification), ctx)
+                .get_rel_size(rel, Version::Modified(modification), true, ctx)
                .await?
        };

@@ -1644,14 +1650,14 @@ async fn get_relsize(
 ) -> anyhow::Result<BlockNumber> {
    let nblocks = if !modification
        .tline
-        .get_rel_exists(rel, Version::Modified(modification), ctx)
+        .get_rel_exists(rel, Version::Modified(modification), true, ctx)
        .await?
    {
        0
    } else {
        modification
            .tline
-            .get_rel_size(rel, Version::Modified(modification), ctx)
+            .get_rel_size(rel, Version::Modified(modification), true, ctx)
            .await?
    };
    Ok(nblocks)
@@ -1726,29 +1732,29 @@ mod tests {
        // The relation was created at LSN 2, not visible at LSN 1 yet.
        assert_eq!(
            tline
-                .get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x10)), &ctx)
+                .get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x10)), false, &ctx)
                .await?,
            false
        );
        assert!(tline
-            .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x10)), &ctx)
+            .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x10)), false, &ctx)
            .await
            .is_err());
        assert_eq!(
            tline
-                .get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x20)), &ctx)
+                .get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x20)), false, &ctx)
                .await?,
            true
        );
        assert_eq!(
            tline
-                .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x20)), &ctx)
+                .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x20)), false, &ctx)
                .await?,
            1
        );
        assert_eq!(
            tline
-                .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x50)), &ctx)
+                .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x50)), false, &ctx)
                .await?,
            3
        );
@@ -1756,46 +1762,46 @@ mod tests {
        // Check page contents at each LSN
        assert_eq!(
            tline
-                .get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x20)), &ctx)
+                .get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x20)), false, &ctx)
                .await?,
            test_img("foo blk 0 at 2")
        );

        assert_eq!(
            tline
-                .get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x30)), &ctx)
+                .get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x30)), false, &ctx)
                .await?,
            test_img("foo blk 0 at 3")
        );

        assert_eq!(
            tline
-                .get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x40)), &ctx)
+                .get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x40)), false, &ctx)
                .await?,
            test_img("foo blk 0 at 3")
        );
        assert_eq!(
            tline
-                .get_rel_page_at_lsn(TESTREL_A, 1, Version::Lsn(Lsn(0x40)), &ctx)
+                .get_rel_page_at_lsn(TESTREL_A, 1, Version::Lsn(Lsn(0x40)), false, &ctx)
                .await?,
            test_img("foo blk 1 at 4")
        );

        assert_eq!(
            tline
-                .get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x50)), &ctx)
+                .get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x50)), false, &ctx)
                .await?,
            test_img("foo blk 0 at 3")
        );
        assert_eq!(
            tline
-                .get_rel_page_at_lsn(TESTREL_A, 1, Version::Lsn(Lsn(0x50)), &ctx)
+                .get_rel_page_at_lsn(TESTREL_A, 1, Version::Lsn(Lsn(0x50)), false, &ctx)
                .await?,
            test_img("foo blk 1 at 4")
        );
        assert_eq!(
            tline
-                .get_rel_page_at_lsn(TESTREL_A, 2, Version::Lsn(Lsn(0x50)), &ctx)
+                .get_rel_page_at_lsn(TESTREL_A, 2, Version::Lsn(Lsn(0x50)), false, &ctx)
                .await?,
            test_img("foo blk 2 at 5")
        );
@@ -1811,19 +1817,19 @@ mod tests {
        // Check reported size and contents after truncation
        assert_eq!(
            tline
-                .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x60)), &ctx)
+                .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x60)), false, &ctx)
                .await?,
            2
        );
        assert_eq!(
            tline
-                .get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x60)), &ctx)
+                .get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x60)), false, &ctx)
                .await?,
            test_img("foo blk 0 at 3")
        );
        assert_eq!(
            tline
-                .get_rel_page_at_lsn(TESTREL_A, 1, Version::Lsn(Lsn(0x60)), &ctx)
+                .get_rel_page_at_lsn(TESTREL_A, 1, Version::Lsn(Lsn(0x60)), false, &ctx)
                .await?,
            test_img("foo blk 1 at 4")
        );
@@ -1831,13 +1837,13 @@ mod tests {
        // should still see the truncated block with older LSN
        assert_eq!(
            tline
-                .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x50)), &ctx)
+                .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x50)), false, &ctx)
                .await?,
            3
        );
        assert_eq!(
            tline
-                .get_rel_page_at_lsn(TESTREL_A, 2, Version::Lsn(Lsn(0x50)), &ctx)
+                .get_rel_page_at_lsn(TESTREL_A, 2, Version::Lsn(Lsn(0x50)), false, &ctx)
                .await?,
            test_img("foo blk 2 at 5")
        );
@@ -1850,7 +1856,7 @@ mod tests {
        m.commit(&ctx).await?;
        assert_eq!(
            tline
-                .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x68)), &ctx)
+                .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x68)), false, &ctx)
                .await?,
            0
        );
@@ -1863,19 +1869,19 @@ mod tests {
        m.commit(&ctx).await?;
        assert_eq!(
            tline
-                .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x70)), &ctx)
+                .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x70)), false, &ctx)
                .await?,
            2
        );
        assert_eq!(
            tline
-                .get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x70)), &ctx)
+                .get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x70)), false, &ctx)
                .await?,
            ZERO_PAGE
        );
        assert_eq!(
            tline
-                .get_rel_page_at_lsn(TESTREL_A, 1, Version::Lsn(Lsn(0x70)), &ctx)
+                .get_rel_page_at_lsn(TESTREL_A, 1, Version::Lsn(Lsn(0x70)), false, &ctx)
                .await?,
            test_img("foo blk 1")
        );
@@ -1888,21 +1894,21 @@ mod tests {
        m.commit(&ctx).await?;
        assert_eq!(
            tline
-                .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x80)), &ctx)
+                .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x80)), false, &ctx)
                .await?,
            1501
        );
        for blk in 2..1500 {
            assert_eq!(
                tline
-                    .get_rel_page_at_lsn(TESTREL_A, blk, Version::Lsn(Lsn(0x80)), &ctx)
+                    .get_rel_page_at_lsn(TESTREL_A, blk, Version::Lsn(Lsn(0x80)), false, &ctx)
                    .await?,
                ZERO_PAGE
            );
        }
        assert_eq!(
            tline
-                .get_rel_page_at_lsn(TESTREL_A, 1500, Version::Lsn(Lsn(0x80)), &ctx)
+                .get_rel_page_at_lsn(TESTREL_A, 1500, Version::Lsn(Lsn(0x80)), false, &ctx)
                .await?,
            test_img("foo blk 1500")
        );
@@ -1929,13 +1935,13 @@ mod tests {
        // Check that rel exists and size is correct
        assert_eq!(
            tline
-                .get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x20)), &ctx)
+                .get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x20)), false, &ctx)
                .await?,
            true
        );
        assert_eq!(
            tline
-                .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x20)), &ctx)
+                .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x20)), false, &ctx)
                .await?,
            1
        );
@@ -1948,7 +1954,7 @@ mod tests {
        // Check that rel is not visible anymore
        assert_eq!(
            tline
-                .get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x30)), &ctx)
+                .get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x30)), false, &ctx)
                .await?,
            false
        );
@@ -1966,13 +1972,13 @@ mod tests {
        // Check that rel exists and size is correct
        assert_eq!(
            tline
-                .get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x40)), &ctx)
+                .get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x40)), false, &ctx)
                .await?,
            true
        );
        assert_eq!(
            tline
-                .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x40)), &ctx)
+                .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x40)), false, &ctx)
                .await?,
            1
        );
@@ -2005,24 +2011,24 @@ mod tests {
        // The relation was created at LSN 20, not visible at LSN 1 yet.
        assert_eq!(
            tline
-                .get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x10)), &ctx)
+                .get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x10)), false, &ctx)
                .await?,
            false
        );
        assert!(tline
-            .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x10)), &ctx)
+            .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x10)), false, &ctx)
            .await
            .is_err());

        assert_eq!(
            tline
-                .get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x20)), &ctx)
+                .get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x20)), false, &ctx)
                .await?,
            true
        );
        assert_eq!(
            tline
-                .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x20)), &ctx)
+                .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x20)), false, &ctx)
                .await?,
            relsize
        );
@@ -2033,7 +2039,7 @@ mod tests {
            let data = format!("foo blk {} at {}", blkno, lsn);
            assert_eq!(
                tline
-                    .get_rel_page_at_lsn(TESTREL_A, blkno, Version::Lsn(lsn), &ctx)
+                    .get_rel_page_at_lsn(TESTREL_A, blkno, Version::Lsn(lsn), false, &ctx)
                    .await?,
                test_img(&data)
            );
@@ -2050,7 +2056,7 @@ mod tests {
        // Check reported size and contents after truncation
        assert_eq!(
            tline
-                .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x60)), &ctx)
+                .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x60)), false, &ctx)
                .await?,
            1
        );
@@ -2060,7 +2066,7 @@ mod tests {
            let data = format!("foo blk {} at {}", blkno, lsn);
            assert_eq!(
                tline
-                    .get_rel_page_at_lsn(TESTREL_A, blkno, Version::Lsn(Lsn(0x60)), &ctx)
+                    .get_rel_page_at_lsn(TESTREL_A, blkno, Version::Lsn(Lsn(0x60)), false, &ctx)
                    .await?,
                test_img(&data)
            );
@@ -2069,7 +2075,7 @@ mod tests {
        // should still see all blocks with older LSN
        assert_eq!(
            tline
-                .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x50)), &ctx)
+                .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x50)), false, &ctx)
                .await?,
            relsize
        );
@@ -2078,7 +2084,7 @@ mod tests {
            let data = format!("foo blk {} at {}", blkno, lsn);
            assert_eq!(
                tline
-                    .get_rel_page_at_lsn(TESTREL_A, blkno, Version::Lsn(Lsn(0x50)), &ctx)
+                    .get_rel_page_at_lsn(TESTREL_A, blkno, Version::Lsn(Lsn(0x50)), false, &ctx)
                    .await?,
                test_img(&data)
            );
@@ -2098,13 +2104,13 @@ mod tests {

        assert_eq!(
            tline
-                .get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x80)), &ctx)
+                .get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x80)), false, &ctx)
                .await?,
            true
        );
        assert_eq!(
            tline
-                .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x80)), &ctx)
+                .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x80)), false, &ctx)
                .await?,
            relsize
        );
@@ -2114,7 +2120,7 @@ mod tests {
            let data = format!("foo blk {} at {}", blkno, lsn);
            assert_eq!(
                tline
-                    .get_rel_page_at_lsn(TESTREL_A, blkno, Version::Lsn(Lsn(0x80)), &ctx)
+                    .get_rel_page_at_lsn(TESTREL_A, blkno, Version::Lsn(Lsn(0x80)), false, &ctx)
                    .await?,
                test_img(&data)
            );
@@ -2148,7 +2154,7 @@ mod tests {

        assert_eq!(
            tline
-                .get_rel_size(TESTREL_A, Version::Lsn(Lsn(lsn)), &ctx)
+                .get_rel_size(TESTREL_A, Version::Lsn(Lsn(lsn)), false, &ctx)
                .await?,
            RELSEG_SIZE + 1
        );
@@ -2162,7 +2168,7 @@ mod tests {
        m.commit(&ctx).await?;
        assert_eq!(
            tline
-                .get_rel_size(TESTREL_A, Version::Lsn(Lsn(lsn)), &ctx)
+                .get_rel_size(TESTREL_A, Version::Lsn(Lsn(lsn)), false, &ctx)
                .await?,
            RELSEG_SIZE
        );
@@ -2177,7 +2183,7 @@ mod tests {
        m.commit(&ctx).await?;
        assert_eq!(
            tline
-                .get_rel_size(TESTREL_A, Version::Lsn(Lsn(lsn)), &ctx)
+                .get_rel_size(TESTREL_A, Version::Lsn(Lsn(lsn)), false, &ctx)
                .await?,
            RELSEG_SIZE - 1
        );
@@ -2195,7 +2201,7 @@ mod tests {
            m.commit(&ctx).await?;
            assert_eq!(
                tline
-                    .get_rel_size(TESTREL_A, Version::Lsn(Lsn(lsn)), &ctx)
+                    .get_rel_size(TESTREL_A, Version::Lsn(Lsn(lsn)), false, &ctx)
                    .await?,
                size as BlockNumber
            );
--- a/pgxn/neon/libpagestore.c
+++ b/pgxn/neon/libpagestore.c
@@ -49,8 +49,6 @@ char	   *neon_auth_token;
 int			readahead_buffer_size = 128;
 int			flush_every_n_requests = 8;

-int         neon_protocol_version = 1;
-
 static int	n_reconnect_attempts = 0;
 static int	max_reconnect_attempts = 60;
 static int	stripe_size;
@@ -381,17 +379,7 @@ pageserver_connect(shardno_t shard_no, int elevel)
 		pfree(msg);
 		return false;
 	}
-	switch (neon_protocol_version)
-	{
-		case 2:
-			query = psprintf("pagestream_v2 %s %s", neon_tenant, neon_timeline);
-			break;
-		case 1:
-			query = psprintf("pagestream %s %s", neon_tenant, neon_timeline);
-			break;
-		default:
-			elog(ERROR, "unexpected neon_protocol_version %d", neon_protocol_version);
-	}
+	query = psprintf("pagestream %s %s", neon_tenant, neon_timeline);
 	ret = PQsendQuery(conn, query);
 	pfree(query);
 	if (ret != 1)
@@ -452,7 +440,7 @@ pageserver_connect(shardno_t shard_no, int elevel)
 		return false;
 	}

-	neon_shard_log(shard_no, LOG, "libpagestore: connected to '%s' with protocol version %d", connstr, neon_protocol_version);
+	neon_shard_log(shard_no, LOG, "libpagestore: connected to '%s'", connstr);
 	page_servers[shard_no].conn = conn;
 	page_servers[shard_no].wes = wes;

@@ -856,16 +844,6 @@ pg_init_libpagestore(void)
 							PGC_USERSET,
 							0,	/* no flags required */
 							NULL, (GucIntAssignHook) &readahead_buffer_resize, NULL);
-	DefineCustomIntVariable("neon.protocol_version",
-							"Version of compute<->page server protocol",
-							NULL,
-							&neon_protocol_version,
-							1, /* default to old protocol for now */
-							1, /* min */
-							2, /* max */
-							PGC_SU_BACKEND,
-							0,	/* no flags required */
-							NULL, NULL, NULL);

 	relsize_hash_init();

--- a/pgxn/neon/pagestore_client.h
+++ b/pgxn/neon/pagestore_client.h
@@ -69,33 +69,18 @@ typedef enum {
 	SLRU_MULTIXACT_OFFSETS
 } SlruKind;

-/*--
- * supertype of all the Neon*Request structs below.
+/*
+ * supertype of all the Neon*Request structs below
 *
- * All requests contain two LSNs:
- *
- * lsn:                request page (or relation size, etc) at this LSN
- * not_modified_since: Hint that the page hasn't been modified between
- *                     this LSN and the request LSN (`lsn`).
- *
- * To request the latest version of a page, you can use MAX_LSN as the request
- * LSN.
- *
- * If you don't know any better, you can always set 'not_modified_since' equal
- * to 'lsn', but providing a lower value can speed up processing the request
- * in the pageserver, as it doesn't need to wait for the WAL to arrive, and it
- * can skip traversing through recent layers which we know to not contain any
- * versions for the requested page.
- *
- * These structs describe the V2 of these requests. The old V1 protocol contained
- * just one LSN and a boolean 'latest' flag. If the neon_protocol_version GUC is
- * set to 1, we will convert these to the V1 requests before sending.
+ * If 'latest' is true, we are requesting the latest page version, and 'lsn'
+ * is just a hint to the server that we know there are no versions of the page
+ * (or relation size, for exists/nblocks requests) later than the 'lsn'.
 */
 typedef struct
 {
 	NeonMessageTag tag;
-	XLogRecPtr	lsn;
-	XLogRecPtr	not_modified_since;
+	bool		latest;			/* if true, request latest page version */
+	XLogRecPtr	lsn;			/* request page version @ this LSN */
 } NeonRequest;

 typedef struct
@@ -208,7 +193,6 @@ extern int	readahead_buffer_size;
 extern char *neon_timeline;
 extern char *neon_tenant;
 extern int32 max_cluster_size;
-extern int  neon_protocol_version;

 extern shardno_t get_shard_number(BufferTag* tag);

@@ -241,14 +225,14 @@ extern bool neon_prefetch(SMgrRelation reln, ForkNumber forknum,
 extern void neon_read(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
 					  char *buffer);
 extern PGDLLEXPORT void neon_read_at_lsn(NRelFileInfo rnode, ForkNumber forkNum, BlockNumber blkno,
-										 XLogRecPtr request_lsn, XLogRecPtr not_modified_since, char *buffer);
+										 XLogRecPtr request_lsn, bool request_latest, char *buffer);
 extern void neon_write(SMgrRelation reln, ForkNumber forknum,
 					   BlockNumber blocknum, char *buffer, bool skipFsync);
 #else
 extern void neon_read(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
 					  void *buffer);
 extern PGDLLEXPORT void neon_read_at_lsn(NRelFileInfo rnode, ForkNumber forkNum, BlockNumber blkno,
-										 XLogRecPtr request_lsn, XLogRecPtr not_modified_since, void *buffer);
+										 XLogRecPtr request_lsn, bool request_latest, void *buffer);
 extern void neon_write(SMgrRelation reln, ForkNumber forknum,
 					   BlockNumber blocknum, const void *buffer, bool skipFsync);
 #endif
--- a/pgxn/neon/pagestore_smgr.c
+++ b/pgxn/neon/pagestore_smgr.c
@@ -168,8 +168,8 @@ typedef enum PrefetchStatus
 typedef struct PrefetchRequest
 {
 	BufferTag	buftag;			/* must be first entry in the struct */
-	XLogRecPtr	request_lsn;
-	XLogRecPtr	not_modified_since;
+	XLogRecPtr	effective_request_lsn;
+	XLogRecPtr	actual_request_lsn;
 	NeonResponse *response;		/* may be null */
 	PrefetchStatus status;
 	shardno_t   shard_no;
@@ -269,19 +269,19 @@ static PrefetchState *MyPState;
 	) \
 )

+static XLogRecPtr prefetch_lsn = 0;
+
 static bool compact_prefetch_buffers(void);
 static void consume_prefetch_responses(void);
-static uint64 prefetch_register_buffer(BufferTag tag, XLogRecPtr *force_request_lsn, XLogRecPtr *force_not_modified_since);
+static uint64 prefetch_register_buffer(BufferTag tag, bool *force_latest, XLogRecPtr *force_lsn);
 static bool prefetch_read(PrefetchRequest *slot);
-static void prefetch_do_request(PrefetchRequest *slot, XLogRecPtr *force_request_lsn, XLogRecPtr *force_not_modified_since);
+static void prefetch_do_request(PrefetchRequest *slot, bool *force_latest, XLogRecPtr *force_lsn);
 static bool prefetch_wait_for(uint64 ring_index);
 static void prefetch_cleanup_trailing_unused(void);
 static inline void prefetch_set_unused(uint64 ring_index);

-static void neon_get_request_lsn(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno,
-								 XLogRecPtr *request_lsn, XLogRecPtr *not_modified_since);
-static bool neon_prefetch_response_usable(XLogRecPtr request_lsn, XLogRecPtr not_modified_since,
-										  PrefetchRequest *slot);
+static XLogRecPtr neon_get_request_lsn(bool *latest, NRelFileInfo rinfo,
+									   ForkNumber forknum, BlockNumber blkno);

 static bool
 compact_prefetch_buffers(void)
@@ -338,8 +338,8 @@ compact_prefetch_buffers(void)
 		target_slot->shard_no = source_slot->shard_no;
 		target_slot->status = source_slot->status;
 		target_slot->response = source_slot->response;
-		target_slot->request_lsn = source_slot->request_lsn;
-		target_slot->not_modified_since = source_slot->not_modified_since;
+		target_slot->effective_request_lsn = source_slot->effective_request_lsn;
+		target_slot->actual_request_lsn = source_slot->actual_request_lsn;
 		target_slot->my_ring_index = empty_ring_index;

 		prfh_delete(MyPState->prf_hash, source_slot);
@@ -358,8 +358,7 @@ compact_prefetch_buffers(void)
 		};
 		source_slot->response = NULL;
 		source_slot->my_ring_index = 0;
-		source_slot->request_lsn = InvalidXLogRecPtr;
-		source_slot->not_modified_since = InvalidXLogRecPtr;
+		source_slot->effective_request_lsn = 0;

 		/* update bookkeeping */
 		n_moved++;
@@ -684,39 +683,56 @@ prefetch_set_unused(uint64 ring_index)
 		compact_prefetch_buffers();
 }

-/*
- * Send one prefetch request to the pageserver. To wait for the response, call
- * prefetch_wait_for().
- */
 static void
-prefetch_do_request(PrefetchRequest *slot, XLogRecPtr *force_request_lsn, XLogRecPtr *force_not_modified_since)
+prefetch_do_request(PrefetchRequest *slot, bool *force_latest, XLogRecPtr *force_lsn)
 {
 	bool		found;
 	NeonGetPageRequest request = {
 		.req.tag = T_NeonGetPageRequest,
-		/* lsn and not_modified_since are filled in below */
+		.req.latest = false,
+		.req.lsn = 0,
 		.rinfo = BufTagGetNRelFileInfo(slot->buftag),
 		.forknum = slot->buftag.forkNum,
 		.blkno = slot->buftag.blockNum,
 	};

-	Assert(((force_request_lsn != NULL) == (force_not_modified_since != NULL)));
-
-	if (force_request_lsn)
+	if (force_lsn && force_latest)
 	{
-		request.req.lsn = *force_request_lsn;
-		request.req.not_modified_since = *force_not_modified_since;
+		request.req.lsn = *force_lsn;
+		request.req.latest = *force_latest;
+		slot->actual_request_lsn = slot->effective_request_lsn = *force_lsn;
 	}
 	else
 	{
-		neon_get_request_lsn(BufTagGetNRelFileInfo(slot->buftag),
-							 slot->buftag.forkNum,
-							 slot->buftag.blockNum,
-							 &request.req.lsn,
-							 &request.req.not_modified_since);
+		XLogRecPtr	lsn = neon_get_request_lsn(
+											   &request.req.latest,
+											   BufTagGetNRelFileInfo(slot->buftag),
+											   slot->buftag.forkNum,
+											   slot->buftag.blockNum
+			);
+
+		/*
+		 * Note: effective_request_lsn is potentially higher than the
+		 * requested LSN, but still correct:
+		 *
+		 * We know there are no changes between the actual requested LSN and
+		 * the value of effective_request_lsn: If there were, the page would
+		 * have been in cache and evicted between those LSN values, which then
+		 * would have had to result in a larger request LSN for this page.
+		 *
+		 * It is possible that a concurrent backend loads the page, modifies
+		 * it and then evicts it again, but the LSN of that eviction cannot be
+		 * smaller than the current WAL insert/redo pointer, which is already
+		 * larger than this prefetch_lsn. So in any case, that would
+		 * invalidate this cache.
+		 *
+		 * The best LSN to use for effective_request_lsn would be
+		 * XLogCtl->Insert.RedoRecPtr, but that's expensive to access.
+		 */
+		slot->actual_request_lsn = request.req.lsn = lsn;
+		prefetch_lsn = Max(prefetch_lsn, lsn);
+		slot->effective_request_lsn = prefetch_lsn;
 	}
-	slot->request_lsn = request.req.lsn;
-	slot->not_modified_since = request.req.not_modified_since;

 	Assert(slot->response == NULL);
 	Assert(slot->my_ring_index == MyPState->ring_unused);
@@ -733,6 +749,7 @@ prefetch_do_request(PrefetchRequest *slot, XLogRecPtr *force_request_lsn, XLogRe
 	/* update slot state */
 	slot->status = PRFS_REQUESTED;

+
 	prfh_insert(MyPState->prf_hash, slot, &found);
 	Assert(!found);
 }
@@ -742,25 +759,22 @@ prefetch_do_request(PrefetchRequest *slot, XLogRecPtr *force_request_lsn, XLogRe
 *
 * Register that we may want the contents of BufferTag in the near future.
 *
- * If force_request_lsn and force_not_modified_since are not NULL, those
- * values are sent to the pageserver. If they are NULL, we utilize the
- * lastWrittenLsn -infrastructure to fill them in.
+ * If force_latest and force_lsn are not NULL, those values are sent to the
+ * pageserver. If they are NULL, we utilize the lastWrittenLsn infrastructure
+ * to fill in these values manually.
 *
 * NOTE: this function may indirectly update MyPState->pfs_hash; which
 * invalidates any active pointers into the hash table.
 */

 static uint64
-prefetch_register_buffer(BufferTag tag, XLogRecPtr *force_request_lsn,
-						 XLogRecPtr *force_not_modified_since)
+prefetch_register_buffer(BufferTag tag, bool *force_latest, XLogRecPtr *force_lsn)
 {
 	uint64		ring_index;
 	PrefetchRequest req;
 	PrefetchRequest *slot;
 	PrfHashEntry *entry;

-	Assert(((force_request_lsn != NULL) == (force_not_modified_since != NULL)));
-
 	/* use an intermediate PrefetchRequest struct to ensure correct alignment */
 	req.buftag = tag;
 Retry:
@@ -778,19 +792,40 @@ Retry:
 		Assert(BUFFERTAGS_EQUAL(slot->buftag, tag));

 		/*
-		 * If the caller specified a request LSN to use, only accept prefetch
-		 * responses that satisfy that request.
+		 * If we want a specific lsn, we do not accept requests that were made
+		 * with a potentially different LSN.
 		 */
-		if (force_request_lsn)
+		if (force_latest && force_lsn)
 		{
-			if (!neon_prefetch_response_usable(*force_request_lsn,
-											   *force_not_modified_since, slot))
+			/*
+			 * if we want the latest version, any effective_request_lsn >=
+			 * request lsn is OK; an older prefetch is discarded below
+			 */
+			if (*force_latest)
 			{
-				/* Wait for the old request to finish and discard it */
-				if (!prefetch_wait_for(ring_index))
-					goto Retry;
-				prefetch_set_unused(ring_index);
-				entry = NULL;
+				if (*force_lsn > slot->effective_request_lsn)
+				{
+					if (!prefetch_wait_for(ring_index))
+						goto Retry;
+					prefetch_set_unused(ring_index);
+					entry = NULL;
+				}
+
+			}
+
+			/*
+			 * if we don't want the latest version, only accept requests with
+			 * the exact same LSN
+			 */
+			else
+			{
+				if (*force_lsn != slot->effective_request_lsn)
+				{
+					if (!prefetch_wait_for(ring_index))
+						goto Retry;
+					prefetch_set_unused(ring_index);
+					entry = NULL;
+				}
 			}
 		}

@@ -886,7 +921,7 @@ Retry:
 	slot->shard_no = get_shard_number(&tag);
 	slot->my_ring_index = ring_index;

-	prefetch_do_request(slot, force_request_lsn, force_not_modified_since);
+	prefetch_do_request(slot, force_latest, force_lsn);
 	Assert(slot->status == PRFS_REQUESTED);
 	Assert(MyPState->ring_last <= ring_index &&
 		   ring_index < MyPState->ring_unused);
@@ -915,7 +950,7 @@ page_server_request(void const *req)
 	BufferTag tag = {0};
 	shardno_t shard_no;

-	switch (messageTag(req))
+	switch (((NeonRequest *) req)->tag)
 	{
 		case T_NeonExistsRequest:
 			CopyNRelFileInfoToBufTag(tag, ((NeonExistsRequest *) req)->rinfo);
@@ -931,10 +966,11 @@ page_server_request(void const *req)
 			tag.blockNum = ((NeonGetPageRequest *) req)->blkno;
 			break;
 		default:
-			neon_log(ERROR, "Unexpected request tag: %d", messageTag(req));
+			neon_log(ERROR, "Unexpected request tag: %d", ((NeonRequest *) req)->tag);
 	}
 	shard_no = get_shard_number(&tag);

+
 	/*
 	 * Current sharding model assumes that all metadata is present only at shard 0.
 	 * We still need to call get_shard_no() to check if shard map is up-to-date.
@@ -961,52 +997,8 @@ nm_pack_request(NeonRequest *msg)
 	StringInfoData s;

 	initStringInfo(&s);
+	pq_sendbyte(&s, msg->tag);

-	if (neon_protocol_version >= 2)
-	{
-		pq_sendbyte(&s, msg->tag);
-		pq_sendint64(&s, msg->lsn);
-		pq_sendint64(&s, msg->not_modified_since);
-	}
-	else
-	{
-		bool		latest;
-		XLogRecPtr	lsn;
-
-		/*
-		 * In primary, we always request the latest page version.
-		 */
-		if (!RecoveryInProgress())
-		{
-			latest = true;
-			lsn = msg->not_modified_since;
-		}
-		else
-		{
-			/*
-			 * In the protocol V1, we cannot represent that we want to read
-			 * page at LSN X, and we know that it hasn't been modified since
-			 * Y. We can either use 'not_modified_lsn' as the request LSN, and
-			 * risk getting an error if that LSN is too old and has already
-			 * fallen out of the pageserver's GC horizon, or we can send
-			 * 'request_lsn', causing the pageserver to possibly wait for the
-			 * recent WAL to arrive unnecessarily. Or something in between. We
-			 * choose to use the old LSN and risk GC errors, because that's
-			 * what we've done historically.
-			 */
-			latest = false;
-			lsn = msg->not_modified_since;
-		}
-
-		pq_sendbyte(&s, msg->tag);
-		pq_sendbyte(&s, latest);
-		pq_sendint64(&s, lsn);
-	}
-
-	/*
-	 * The rest of the request messages are the same between protocol V1 and
-	 * V2
-	 */
 	switch (messageTag(msg))
 	{
 			/* pagestore_client -> pagestore */
@@ -1014,6 +1006,8 @@ nm_pack_request(NeonRequest *msg)
 			{
 				NeonExistsRequest *msg_req = (NeonExistsRequest *) msg;

+				pq_sendbyte(&s, msg_req->req.latest);
+				pq_sendint64(&s, msg_req->req.lsn);
 				pq_sendint32(&s, NInfoGetSpcOid(msg_req->rinfo));
 				pq_sendint32(&s, NInfoGetDbOid(msg_req->rinfo));
 				pq_sendint32(&s, NInfoGetRelNumber(msg_req->rinfo));
@@ -1025,6 +1019,8 @@ nm_pack_request(NeonRequest *msg)
 			{
 				NeonNblocksRequest *msg_req = (NeonNblocksRequest *) msg;

+				pq_sendbyte(&s, msg_req->req.latest);
+				pq_sendint64(&s, msg_req->req.lsn);
 				pq_sendint32(&s, NInfoGetSpcOid(msg_req->rinfo));
 				pq_sendint32(&s, NInfoGetDbOid(msg_req->rinfo));
 				pq_sendint32(&s, NInfoGetRelNumber(msg_req->rinfo));
@@ -1036,6 +1032,8 @@ nm_pack_request(NeonRequest *msg)
 			{
 				NeonDbSizeRequest *msg_req = (NeonDbSizeRequest *) msg;

+				pq_sendbyte(&s, msg_req->req.latest);
+				pq_sendint64(&s, msg_req->req.lsn);
 				pq_sendint32(&s, msg_req->dbNode);

 				break;
@@ -1044,6 +1042,8 @@ nm_pack_request(NeonRequest *msg)
 			{
 				NeonGetPageRequest *msg_req = (NeonGetPageRequest *) msg;

+				pq_sendbyte(&s, msg_req->req.latest);
+				pq_sendint64(&s, msg_req->req.lsn);
 				pq_sendint32(&s, NInfoGetSpcOid(msg_req->rinfo));
 				pq_sendint32(&s, NInfoGetDbOid(msg_req->rinfo));
 				pq_sendint32(&s, NInfoGetRelNumber(msg_req->rinfo));
@@ -1057,6 +1057,8 @@ nm_pack_request(NeonRequest *msg)
 			{
 				NeonGetSlruSegmentRequest *msg_req = (NeonGetSlruSegmentRequest *) msg;

+				pq_sendbyte(&s, msg_req->req.latest);
+				pq_sendint64(&s, msg_req->req.lsn);
 				pq_sendbyte(&s, msg_req->kind);
 				pq_sendint32(&s, msg_req->segno);

@@ -1207,7 +1209,7 @@ nm_to_string(NeonMessage *msg)
 				appendStringInfo(&s, ", \"rinfo\": \"%u/%u/%u\"", RelFileInfoFmt(msg_req->rinfo));
 				appendStringInfo(&s, ", \"forknum\": %d", msg_req->forknum);
 				appendStringInfo(&s, ", \"lsn\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->req.lsn));
-				appendStringInfo(&s, ", \"not_modified_since\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->req.not_modified_since));
+				appendStringInfo(&s, ", \"latest\": %d", msg_req->req.latest);
 				appendStringInfoChar(&s, '}');
 				break;
 			}
@@ -1220,7 +1222,7 @@ nm_to_string(NeonMessage *msg)
 				appendStringInfo(&s, ", \"rinfo\": \"%u/%u/%u\"", RelFileInfoFmt(msg_req->rinfo));
 				appendStringInfo(&s, ", \"forknum\": %d", msg_req->forknum);
 				appendStringInfo(&s, ", \"lsn\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->req.lsn));
-				appendStringInfo(&s, ", \"not_modified_since\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->req.not_modified_since));
+				appendStringInfo(&s, ", \"latest\": %d", msg_req->req.latest);
 				appendStringInfoChar(&s, '}');
 				break;
 			}
@@ -1234,7 +1236,7 @@ nm_to_string(NeonMessage *msg)
 				appendStringInfo(&s, ", \"forknum\": %d", msg_req->forknum);
 				appendStringInfo(&s, ", \"blkno\": %u", msg_req->blkno);
 				appendStringInfo(&s, ", \"lsn\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->req.lsn));
-				appendStringInfo(&s, ", \"not_modified_since\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->req.not_modified_since));
+				appendStringInfo(&s, ", \"latest\": %d", msg_req->req.latest);
 				appendStringInfoChar(&s, '}');
 				break;
 			}
@@ -1245,7 +1247,7 @@ nm_to_string(NeonMessage *msg)
 				appendStringInfoString(&s, "{\"type\": \"NeonDbSizeRequest\"");
 				appendStringInfo(&s, ", \"dbnode\": \"%u\"", msg_req->dbNode);
 				appendStringInfo(&s, ", \"lsn\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->req.lsn));
-				appendStringInfo(&s, ", \"not_modified_since\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->req.not_modified_since));
+				appendStringInfo(&s, ", \"latest\": %d", msg_req->req.latest);
 				appendStringInfoChar(&s, '}');
 				break;
 			}
@@ -1257,7 +1259,7 @@ nm_to_string(NeonMessage *msg)
 				appendStringInfo(&s, ", \"kind\": %u", msg_req->kind);
 				appendStringInfo(&s, ", \"segno\": %u", msg_req->segno);
 				appendStringInfo(&s, ", \"lsn\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->req.lsn));
-				appendStringInfo(&s, ", \"not_modified_since\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->req.not_modified_since));
+				appendStringInfo(&s, ", \"latest\": %d", msg_req->req.latest);
 				appendStringInfoChar(&s, '}');
 				break;
 			}
@@ -1529,38 +1531,44 @@ nm_adjust_lsn(XLogRecPtr lsn)
 /*
 * Return LSN for requesting pages and number of blocks from page server
 */
-static void
-neon_get_request_lsn(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno,
-					 XLogRecPtr *request_lsn, XLogRecPtr *not_modified_since)
+static XLogRecPtr
+neon_get_request_lsn(bool *latest, NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno)
 {
-	XLogRecPtr	last_written_lsn;
-
-	last_written_lsn = GetLastWrittenLSN(rinfo, forknum, blkno);
-	last_written_lsn = nm_adjust_lsn(last_written_lsn);
-	Assert(last_written_lsn != InvalidXLogRecPtr);
+	XLogRecPtr	lsn;

 	if (RecoveryInProgress())
 	{
-		/* Request the page at the last replayed LSN. */
-		*request_lsn = GetXLogReplayRecPtr(NULL);
-		*not_modified_since = last_written_lsn;
-		Assert(last_written_lsn <= *request_lsn);
+		/*
+		 * We don't know if WAL has been generated but not yet replayed, so
+		 * we're conservative in our estimates about latest pages.
+		 */
+		*latest = false;

-		neon_log(DEBUG1, "neon_get_request_lsn request lsn %X/%X, not_modified_since %X/%X",
-				 LSN_FORMAT_ARGS(*request_lsn), LSN_FORMAT_ARGS(*not_modified_since));
+		/*
+		 * Get the last written LSN of this page.
+		 */
+		lsn = GetLastWrittenLSN(rinfo, forknum, blkno);
+		lsn = nm_adjust_lsn(lsn);
+
+		neon_log(DEBUG1, "neon_get_request_lsn GetXLogReplayRecPtr %X/%X request lsn 0 ",
+			 (uint32) ((lsn) >> 32), (uint32) (lsn));
 	}
 	else
 	{
 		XLogRecPtr	flushlsn;

 		/*
-		 * Use the latest LSN that was evicted from the buffer cache as the
-		 * 'not_modified_since' hint. Any pages modified by later WAL records
-		 * must still in the buffer cache, so our request cannot concern
-		 * those.
+		 * Use the latest LSN that was evicted from the buffer cache. Any
+		 * pages modified by later WAL records must still be in the buffer cache,
+		 * so our request cannot concern those.
 		 */
+		*latest = true;
+		lsn = GetLastWrittenLSN(rinfo, forknum, blkno);
+		Assert(lsn != InvalidXLogRecPtr);
 		neon_log(DEBUG1, "neon_get_request_lsn GetLastWrittenLSN lsn %X/%X ",
-				 LSN_FORMAT_ARGS(last_written_lsn));
+			 (uint32) ((lsn) >> 32), (uint32) (lsn));
+
+		lsn = nm_adjust_lsn(lsn);

 		/*
 		 * Is it possible that the last-written LSN is ahead of last flush
@@ -1575,109 +1583,16 @@ neon_get_request_lsn(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno,
 #else
 		flushlsn = GetFlushRecPtr();
 #endif
-		if (last_written_lsn > flushlsn)
+		if (lsn > flushlsn)
 		{
 			neon_log(DEBUG5, "last-written LSN %X/%X is ahead of last flushed LSN %X/%X",
-					 LSN_FORMAT_ARGS(last_written_lsn),
-					 LSN_FORMAT_ARGS(flushlsn));
-			XLogFlush(last_written_lsn);
-			flushlsn = last_written_lsn;
+				 (uint32) (lsn >> 32), (uint32) lsn,
+				 (uint32) (flushlsn >> 32), (uint32) flushlsn);
+			XLogFlush(lsn);
 		}
-
-		/*
-		 * Request the latest version of the page. The most up-to-date request
-		 * LSN we could use would be the current insert LSN, but to avoid the
-		 * overhead of looking it up, use 'flushlsn' instead. This relies on
-		 * the assumption that if the page was modified since the last WAL
-		 * flush, it should still be in the buffer cache, and we wouldn't be
-		 * requesting it.
-		 */
-		*request_lsn = flushlsn;
-		*not_modified_since = last_written_lsn;
-	}
-}
-
-/*
- *  neon_prefetch_response_usable -- Can a new request be satisfied by old one?
- *
- * This is used to check if the response to a prefetch request can be used to
- * satisfy a page read now.
- */
-static bool
-neon_prefetch_response_usable(XLogRecPtr request_lsn, XLogRecPtr not_modified_since,
-							  PrefetchRequest *slot)
-{
-	/* sanity check the LSN's on the old and the new request */
-	Assert(request_lsn >= not_modified_since);
-	Assert(slot->request_lsn >= slot->not_modified_since);
-	Assert(slot->status != PRFS_UNUSED);
-
-	/*
-	 * The new request's LSN should never be older than the old one.  This
-	 * could be an Assert, except that for testing purposes, we do provide an
-	 * interface in neon_test_utils to fetch pages at arbitary LSNs, which
-	 * violates this.
-	 *
-	 * Similarly, the not_modified_since value calculated for a page should
-	 * never move backwards. This assumption is a bit fragile; if we updated
-	 * the last-written cache when we read in a page, for example, then it
-	 * might. But as the code stands, it should not.
-	 *
-	 * (If two backends issue a request at the same time, they might race and
-	 * calculate LSNs "out of order" with each other, but the prefetch queue
-	 * is backend-private at the moment.)
-	 */
-	if (request_lsn < slot->request_lsn || not_modified_since < slot->not_modified_since)
-	{
-		ereport(LOG,
-				(errcode(ERRCODE_IO_ERROR),
-				 errmsg(NEON_TAG "request with unexpected LSN after prefetch"),
-				 errdetail("Request %X/%X not_modified_since %X/%X, prefetch %X/%X not_modified_since %X/%X)",
-						   LSN_FORMAT_ARGS(request_lsn), LSN_FORMAT_ARGS(not_modified_since),
-						   LSN_FORMAT_ARGS(slot->request_lsn), LSN_FORMAT_ARGS(slot->not_modified_since))));
-		return false;
 	}

-	/*---
-	 * Each request to the pageserver carries two LSN values:
-	 * `not_modified_since` and `request_lsn`. The (not_modified_since,
-	 * request_lsn] range of each request is effectively a claim that the page
-	 * has not been modified between those LSNs.  If the range of the old
-	 * request in the queue overlaps with the new request, we know that the
-	 * page hasn't been modified in the union of the ranges. We can use the
-	 * response to old request to satisfy the new request in that case. For
-	 * example:
-	 *
-	 *              100      500
-	 * Old request:  +--------+
-	 *
-	 *                     400      800
-	 * New request:         +--------+
-	 *
-	 * The old request claims that the page was not modified between LSNs 100
-	 * and 500, and the second claims that it was not modified between 400 and
-	 * 800. Together they mean that the page was not modified between 100 and
-	 * 800. Therefore the response to the old request is also valid for the
-	 * new request.
-	 *
-	 * This logic also holds at the boundary case that the old request's LSN
-	 * matches the new request's not_modified_since LSN exactly:
-	 *
-	 *              100      500
-	 * Old request:  +--------+
-	 *
-	 *                       500      900
-	 * New request:           +--------+
-	 *
-	 * The response to the old request is the page as it was at LSN 500, and
-	 * the page hasn't been changed in the range (500, 900], therefore the
-	 * response is valid also for the new request.
-	 */
-
-	/* this follows from the checks above */
-	Assert(request_lsn >= slot->not_modified_since);
-
-	return not_modified_since <= slot->request_lsn;
+	return lsn;
 }

 /*
@@ -1689,8 +1604,8 @@ neon_exists(SMgrRelation reln, ForkNumber forkNum)
 	bool		exists;
 	NeonResponse *resp;
 	BlockNumber n_blocks;
+	bool		latest;
 	XLogRecPtr	request_lsn;
-	XLogRecPtr	not_modified_since;

 	switch (reln->smgr_relpersistence)
 	{
@@ -1745,13 +1660,12 @@ neon_exists(SMgrRelation reln, ForkNumber forkNum)
 		return false;
 	}

-	neon_get_request_lsn(InfoFromSMgrRel(reln), forkNum, REL_METADATA_PSEUDO_BLOCKNO,
-						 &request_lsn, &not_modified_since);
+	request_lsn = neon_get_request_lsn(&latest, InfoFromSMgrRel(reln), forkNum, REL_METADATA_PSEUDO_BLOCKNO);
 	{
 		NeonExistsRequest request = {
 			.req.tag = T_NeonExistsRequest,
+			.req.latest = latest,
 			.req.lsn = request_lsn,
-			.req.not_modified_since = not_modified_since,
 			.rinfo = InfoFromSMgrRel(reln),
 		.forknum = forkNum};

@@ -2188,10 +2102,10 @@ neon_writeback(SMgrRelation reln, ForkNumber forknum,
 void
 #if PG_MAJORVERSION_NUM < 16
 neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
-				 XLogRecPtr request_lsn, XLogRecPtr not_modified_since, char *buffer)
+				 XLogRecPtr request_lsn, bool request_latest, char *buffer)
 #else
 neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
-				 XLogRecPtr request_lsn, XLogRecPtr not_modified_since, void *buffer)
+				 XLogRecPtr request_lsn, bool request_latest, void *buffer)
 #endif
 {
 	NeonResponse *resp;
@@ -2234,16 +2148,15 @@ neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
 	if (entry != NULL)
 	{
 		slot = entry->slot;
-		if (neon_prefetch_response_usable(request_lsn, not_modified_since, slot))
+		if (slot->effective_request_lsn >= request_lsn)
 		{
 			ring_index = slot->my_ring_index;
 			pgBufferUsage.prefetch.hits += 1;
 		}
-		else
+		else					/* the current prefetch LSN is not large
+								 * enough, so drop the prefetch */
 		{
 			/*
-			 * Cannot use this prefetch, discard it
-			 *
 			 * We can't drop cache for not-yet-received requested items. It is
 			 * unlikely this happens, but it can happen if prefetch distance
 			 * is large enough and a backend didn't consume all prefetch
@@ -2268,8 +2181,8 @@ neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
 		{
 			pgBufferUsage.prefetch.misses += 1;

-			ring_index = prefetch_register_buffer(buftag, &request_lsn,
-												  &not_modified_since);
+			ring_index = prefetch_register_buffer(buftag, &request_latest,
+												  &request_lsn);
 			slot = GetPrfSlot(ring_index);
 		}
 		else
@@ -2333,8 +2246,8 @@ neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, char *buffer
 neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void *buffer)
 #endif
 {
+	bool		latest;
 	XLogRecPtr	request_lsn;
-	XLogRecPtr	not_modified_since;

 	switch (reln->smgr_relpersistence)
 	{
@@ -2359,9 +2272,8 @@ neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void *buffer
 		return;
 	}

-	neon_get_request_lsn(InfoFromSMgrRel(reln), forkNum, blkno,
-						 &request_lsn, &not_modified_since);
-	neon_read_at_lsn(InfoFromSMgrRel(reln), forkNum, blkno, request_lsn, not_modified_since, buffer);
+	request_lsn = neon_get_request_lsn(&latest, InfoFromSMgrRel(reln), forkNum, blkno);
+	neon_read_at_lsn(InfoFromSMgrRel(reln), forkNum, blkno, request_lsn, latest, buffer);

 #ifdef DEBUG_COMPARE_LOCAL
 	if (forkNum == MAIN_FORKNUM && IS_LOCAL_REL(reln))
@@ -2530,8 +2442,8 @@ neon_nblocks(SMgrRelation reln, ForkNumber forknum)
 {
 	NeonResponse *resp;
 	BlockNumber n_blocks;
+	bool		latest;
 	XLogRecPtr	request_lsn;
-	XLogRecPtr	not_modified_since;

 	switch (reln->smgr_relpersistence)
 	{
@@ -2558,13 +2470,12 @@ neon_nblocks(SMgrRelation reln, ForkNumber forknum)
 		return n_blocks;
 	}

-	neon_get_request_lsn(InfoFromSMgrRel(reln), forknum, REL_METADATA_PSEUDO_BLOCKNO,
-						 &request_lsn, &not_modified_since);
+	request_lsn = neon_get_request_lsn(&latest, InfoFromSMgrRel(reln), forknum, REL_METADATA_PSEUDO_BLOCKNO);
 	{
 		NeonNblocksRequest request = {
 			.req.tag = T_NeonNblocksRequest,
+			.req.latest = latest,
 			.req.lsn = request_lsn,
-			.req.not_modified_since = not_modified_since,
 			.rinfo = InfoFromSMgrRel(reln),
 			.forknum = forknum,
 		};
@@ -2612,17 +2523,16 @@ neon_dbsize(Oid dbNode)
 {
 	NeonResponse *resp;
 	int64		db_size;
-	XLogRecPtr	request_lsn,
-				not_modified_since;
+	XLogRecPtr	request_lsn;
+	bool		latest;
 	NRelFileInfo dummy_node = {0};

-	neon_get_request_lsn(dummy_node, MAIN_FORKNUM, REL_METADATA_PSEUDO_BLOCKNO,
-						 &request_lsn, &not_modified_since);
+	request_lsn = neon_get_request_lsn(&latest, dummy_node, MAIN_FORKNUM, REL_METADATA_PSEUDO_BLOCKNO);
 	{
 		NeonDbSizeRequest request = {
 			.req.tag = T_NeonDbSizeRequest,
+			.req.latest = latest,
 			.req.lsn = request_lsn,
-			.req.not_modified_since = not_modified_since,
 			.dbNode = dbNode,
 		};

@@ -2695,6 +2605,7 @@ neon_truncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks)
 	 * the most recently inserted WAL record's LSN.
 	 */
 	lsn = GetXLogInsertRecPtr();
+
 	lsn = nm_adjust_lsn(lsn);

 	/*
@@ -2894,33 +2805,14 @@ neon_end_unlogged_build(SMgrRelation reln)
 static int
 neon_read_slru_segment(SMgrRelation reln, const char* path, int segno, void* buffer)
 {
-	XLogRecPtr request_lsn,
-		not_modified_since;
-
-	if (RecoveryInProgress())
-	{
-		request_lsn = GetXLogReplayRecPtr(NULL);
-		if (request_lsn == InvalidXLogRecPtr)
-		{
-			/*
-			 * This happens in neon startup, we start up without replaying any
-			 * records.
-			 */
-			request_lsn = GetRedoStartLsn();
-		}
-	}
-	else
-		request_lsn = GetXLogInsertRecPtr();
-	request_lsn = nm_adjust_lsn(request_lsn);
-
+	XLogRecPtr request_lsn;
 	/*
-	 * GetRedoStartLsn() returns LSN of basebackup. We know that the SLRU
-	 * segment has not changed since the basebackup, because in order to
-	 * modify it, we would have had to download it already. And once
-	 * downloaded, we never evict SLRU segments from local disk.
+	 * GetRedoStartLsn() returns LSN of basebackup.
+	 * We need to download SLRU segments only once after node startup,
+	 * then SLRUs are maintained locally.
 	 */
-	not_modified_since = GetRedoStartLsn();
-
+	request_lsn = GetRedoStartLsn();
+	request_lsn = nm_adjust_lsn(request_lsn);
 	SlruKind kind;

    if (STRPREFIX(path, "pg_xact"))
@@ -2935,8 +2827,8 @@ neon_read_slru_segment(SMgrRelation reln, const char* path, int segno, void* buf
 	NeonResponse *resp;
 	NeonGetSlruSegmentRequest request = {
 		.req.tag = T_NeonGetSlruSegmentRequest,
+		.req.latest = false,
 		.req.lsn = request_lsn,
-		.req.not_modified_since = not_modified_since,

 		.kind = kind,
 		.segno = segno
@@ -3064,9 +2956,6 @@ neon_extend_rel_size(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno,
 {
 	BlockNumber relsize;

-	/* This is only used in WAL replay */
-	Assert(RecoveryInProgress());
-
 	/* Extend the relation if we know its size */
 	if (get_cached_relsize(rinfo, forknum, &relsize))
 	{
@@ -3085,13 +2974,14 @@ neon_extend_rel_size(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno,
 		 * This length is later reused when we open the smgr to read the
 		 * block, which is fine and expected.
 		 */
+
 		NeonResponse *response;
 		NeonNblocksResponse *nbresponse;
 		NeonNblocksRequest request = {
 			.req = (NeonRequest) {
-				.tag = T_NeonNblocksRequest,
 				.lsn = end_recptr,
-				.not_modified_since = end_recptr,
+				.latest = false,
+				.tag = T_NeonNblocksRequest,
 			},
 			.rinfo = rinfo,
 			.forknum = forknum,
--- a/pgxn/neon_test_utils/Makefile
+++ b/pgxn/neon_test_utils/Makefile
@@ -7,7 +7,7 @@ OBJS = \
 	neontest.o

 EXTENSION = neon_test_utils
-DATA = neon_test_utils--1.1.sql
+DATA = neon_test_utils--1.0.sql
 PGFILEDESC = "neon_test_utils - helpers for neon testing and debugging"

 PG_CONFIG = pg_config
--- a/pgxn/neon_test_utils/neon_test_utils--1.0.sql
+++ b/pgxn/neon_test_utils/neon_test_utils--1.0.sql
@@ -31,12 +31,12 @@ AS 'MODULE_PATHNAME', 'clear_buffer_cache'
 LANGUAGE C STRICT
 PARALLEL UNSAFE;

-CREATE FUNCTION get_raw_page_at_lsn(relname text, forkname text, blocknum int8, request_lsn pg_lsn, not_modified_since pg_lsn)
+CREATE FUNCTION get_raw_page_at_lsn(relname text, forkname text, blocknum int8, lsn pg_lsn)
 RETURNS bytea
 AS 'MODULE_PATHNAME', 'get_raw_page_at_lsn'
 LANGUAGE C PARALLEL UNSAFE;

-CREATE FUNCTION get_raw_page_at_lsn(tbspc oid, db oid, relfilenode oid, forknum int8, blocknum int8, request_lsn pg_lsn, not_modified_since pg_lsn)
+CREATE FUNCTION get_raw_page_at_lsn(tbspc oid, db oid, relfilenode oid, forknum int8, blocknum int8, lsn pg_lsn)
 RETURNS bytea
 AS 'MODULE_PATHNAME', 'get_raw_page_at_lsn_ex'
 LANGUAGE C PARALLEL UNSAFE;
--- a/pgxn/neon_test_utils/neon_test_utils.control
+++ b/pgxn/neon_test_utils/neon_test_utils.control
@@ -1,6 +1,6 @@
 # neon_test_utils extension
 comment = 'helpers for neon testing and debugging'
-default_version = '1.1'
+default_version = '1.0'
 module_pathname = '$libdir/neon_test_utils'
 relocatable = true
 trusted = true
--- a/pgxn/neon_test_utils/neontest.c
+++ b/pgxn/neon_test_utils/neontest.c
@@ -48,10 +48,10 @@ PG_FUNCTION_INFO_V1(neon_xlogflush);
 */
 #if PG_MAJORVERSION_NUM < 16
 typedef void (*neon_read_at_lsn_type) (NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
-									   XLogRecPtr request_lsn, XLogRecPtr not_modified_since, char *buffer);
+									   XLogRecPtr request_lsn, bool request_latest, char *buffer);
 #else
 typedef void (*neon_read_at_lsn_type) (NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
-									   XLogRecPtr request_lsn, XLogRecPtr not_modified_since, void *buffer);
+									   XLogRecPtr request_lsn, bool request_latest, void *buffer);
 #endif

 static neon_read_at_lsn_type neon_read_at_lsn_ptr;
@@ -299,11 +299,8 @@ get_raw_page_at_lsn(PG_FUNCTION_ARGS)
 	text	   *forkname;
 	uint32		blkno;

-	XLogRecPtr	request_lsn;
-	XLogRecPtr	not_modified_since;
-
-	if (PG_NARGS() != 5)
-		elog(ERROR, "unexpected number of arguments in SQL function signature");
+	bool		request_latest = PG_ARGISNULL(3);
+	uint64		read_lsn = request_latest ? GetXLogInsertRecPtr() : PG_GETARG_INT64(3);

 	if (PG_ARGISNULL(0) || PG_ARGISNULL(1) || PG_ARGISNULL(2))
 		PG_RETURN_NULL();
@@ -312,9 +309,6 @@ get_raw_page_at_lsn(PG_FUNCTION_ARGS)
 	forkname = PG_GETARG_TEXT_PP(1);
 	blkno = PG_GETARG_UINT32(2);

-	request_lsn = PG_ARGISNULL(3) ? GetXLogInsertRecPtr() : PG_GETARG_LSN(3);
-	not_modified_since = PG_ARGISNULL(4) ? request_lsn : PG_GETARG_LSN(4);
-
 	if (!superuser())
 		ereport(ERROR,
 				(errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
@@ -367,7 +361,7 @@ get_raw_page_at_lsn(PG_FUNCTION_ARGS)
 	SET_VARSIZE(raw_page, BLCKSZ + VARHDRSZ);
 	raw_page_data = VARDATA(raw_page);

-	neon_read_at_lsn(InfoFromRelation(rel), forknum, blkno, request_lsn, not_modified_since, raw_page_data);
+	neon_read_at_lsn(InfoFromRelation(rel), forknum, blkno, read_lsn, request_latest, raw_page_data);

 	relation_close(rel, AccessShareLock);

@@ -386,9 +380,6 @@ get_raw_page_at_lsn_ex(PG_FUNCTION_ARGS)
 {
 	char	   *raw_page_data;

-	if (PG_NARGS() != 7)
-		elog(ERROR, "unexpected number of arguments in SQL function signature");
-
 	if (!superuser())
 		ereport(ERROR,
 				(errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
@@ -412,20 +403,18 @@ get_raw_page_at_lsn_ex(PG_FUNCTION_ARGS)
 		};

 		ForkNumber	forknum = PG_GETARG_UINT32(3);
+
 		uint32		blkno = PG_GETARG_UINT32(4);
-		XLogRecPtr	request_lsn;
-		XLogRecPtr	not_modified_since;
+		bool		request_latest = PG_ARGISNULL(5);
+		uint64		read_lsn = request_latest ? GetXLogInsertRecPtr() : PG_GETARG_INT64(5);

 		/* Initialize buffer to copy to */
 		bytea	   *raw_page = (bytea *) palloc(BLCKSZ + VARHDRSZ);

-		request_lsn = PG_ARGISNULL(5) ? GetXLogInsertRecPtr() : PG_GETARG_LSN(5);
-		not_modified_since = PG_ARGISNULL(6) ? request_lsn : PG_GETARG_LSN(6);
-
 		SET_VARSIZE(raw_page, BLCKSZ + VARHDRSZ);
 		raw_page_data = VARDATA(raw_page);

-		neon_read_at_lsn(rinfo, forknum, blkno, request_lsn, not_modified_since, raw_page_data);
+		neon_read_at_lsn(rinfo, forknum, blkno, read_lsn, request_latest, raw_page_data);
 		PG_RETURN_BYTEA_P(raw_page);
 	}
 }
--- a/proxy/src/bin/pg_sni_router.rs
+++ b/proxy/src/bin/pg_sni_router.rs
@@ -279,7 +279,7 @@ async fn handle_client(

    // doesn't yet matter as pg-sni-router doesn't report analytics logs
    ctx.set_success();
-    ctx.log_connect();
+    ctx.log();

    // Starting from here we only proxy the client's traffic.
    info!("performing the proxy pass...");
--- a/proxy/src/compute.rs
+++ b/proxy/src/compute.rs
@@ -260,9 +260,7 @@ impl ConnCfg {
        aux: MetricsAuxInfo,
        timeout: Duration,
    ) -> Result<PostgresConnection, ConnectionError> {
-        let pause = ctx.latency_timer.pause(crate::metrics::Waiting::Compute);
        let (socket_addr, stream, host) = self.connect_raw(timeout).await?;
-        drop(pause);

        let tls_connector = native_tls::TlsConnector::builder()
            .danger_accept_invalid_certs(allow_self_signed_compute)
@@ -272,9 +270,7 @@ impl ConnCfg {
        let tls = MakeTlsConnect::<tokio::net::TcpStream>::make_tls_connect(&mut mk_tls, host)?;

        // connect_raw() will not use TLS if sslmode is "disable"
-        let pause = ctx.latency_timer.pause(crate::metrics::Waiting::Compute);
        let (client, connection) = self.0.connect_raw(stream, tls).await?;
-        drop(pause);
        tracing::Span::current().record("pid", &tracing::field::display(client.get_process_id()));
        let stream = connection.stream.into_inner();

--- a/proxy/src/context.rs
+++ b/proxy/src/context.rs
@@ -20,8 +20,7 @@ use self::parquet::RequestData;

 pub mod parquet;

-pub static LOG_CHAN: OnceCell<mpsc::WeakUnboundedSender<RequestData>> = OnceCell::new();
-pub static LOG_CHAN_DISCONNECT: OnceCell<mpsc::WeakUnboundedSender<RequestData>> = OnceCell::new();
+static LOG_CHAN: OnceCell<mpsc::WeakUnboundedSender<RequestData>> = OnceCell::new();

 /// Context data for a single request to connect to a database.
 ///
@@ -50,12 +49,9 @@ pub struct RequestMonitoring {
    // extra
    // This sender is here to keep the request monitoring channel open while requests are taking place.
    sender: Option<mpsc::UnboundedSender<RequestData>>,
-    // This sender is only used to log the length of session in case of success.
-    disconnect_sender: Option<mpsc::UnboundedSender<RequestData>>,
    pub latency_timer: LatencyTimer,
    // Whether proxy decided that it's not a valid endpoint end rejected it before going to cplane.
    rejected: Option<bool>,
-    disconnect_timestamp: Option<chrono::DateTime<Utc>>,
 }

 #[derive(Clone, Debug)]
@@ -104,9 +100,7 @@ impl RequestMonitoring {
            cold_start_info: ColdStartInfo::Unknown,

            sender: LOG_CHAN.get().and_then(|tx| tx.upgrade()),
-            disconnect_sender: LOG_CHAN_DISCONNECT.get().and_then(|tx| tx.upgrade()),
            latency_timer: LatencyTimer::new(protocol),
-            disconnect_timestamp: None,
        }
    }

@@ -196,7 +190,11 @@ impl RequestMonitoring {
        self.success = true;
    }

-    pub fn log_connect(&mut self) {
+    pub fn log(self) {}
+}
+
+impl Drop for RequestMonitoring {
+    fn drop(&mut self) {
        let outcome = if self.success {
            ConnectOutcome::Success
        } else {
@@ -228,23 +226,4 @@ impl RequestMonitoring {
            let _: Result<(), _> = tx.send(RequestData::from(&*self));
        }
    }
-
-    fn log_disconnect(&mut self) {
-        // If we are here, it's guaranteed that the user successfully connected to the endpoint.
-        // Here we log the length of the session.
-        self.disconnect_timestamp = Some(Utc::now());
-        if let Some(tx) = self.disconnect_sender.take() {
-            let _: Result<(), _> = tx.send(RequestData::from(&*self));
-        }
-    }
-}
-
-impl Drop for RequestMonitoring {
-    fn drop(&mut self) {
-        if self.sender.is_some() {
-            self.log_connect();
-        } else {
-            self.log_disconnect();
-        }
-    }
 }
--- a/proxy/src/context/parquet.rs
+++ b/proxy/src/context/parquet.rs
@@ -19,10 +19,7 @@ use tokio_util::sync::CancellationToken;
 use tracing::{debug, info, Span};
 use utils::backoff;

-use crate::{
-    config::{remote_storage_from_toml, OptRemoteStorageConfig},
-    context::LOG_CHAN_DISCONNECT,
-};
+use crate::config::{remote_storage_from_toml, OptRemoteStorageConfig};

 use super::{RequestMonitoring, LOG_CHAN};

@@ -34,9 +31,6 @@ pub struct ParquetUploadArgs {
    #[clap(long, default_value = "{}", value_parser = remote_storage_from_toml)]
    parquet_upload_remote_storage: OptRemoteStorageConfig,

-    #[clap(long, default_value = "{}", value_parser = remote_storage_from_toml)]
-    parquet_upload_disconnect_events_remote_storage: OptRemoteStorageConfig,
-
    /// How many rows to include in a row group
    #[clap(long, default_value_t = 8192)]
    parquet_upload_row_group_size: usize,
@@ -97,8 +91,6 @@ pub struct RequestData {
    /// Tracks time from session start (HTTP request/libpq TCP handshake)
    /// Through to success/failure
    duration_us: u64,
-    /// If the session was successful after the disconnect, will be created one more event with filled `disconnect_timestamp`.
-    disconnect_timestamp: Option<chrono::NaiveDateTime>,
 }

 impl From<&RequestMonitoring> for RequestData {
@@ -128,7 +120,6 @@ impl From<&RequestMonitoring> for RequestData {
                .elapsed()
                .unwrap_or_default()
                .as_micros() as u64, // 584 millenia... good enough
-            disconnect_timestamp: value.disconnect_timestamp.map(|x| x.naive_utc()),
        }
    }
 }
@@ -150,9 +141,8 @@ pub async fn worker(
    LOG_CHAN.set(tx.downgrade()).unwrap();

    // setup row stream that will close on cancellation
-    let cancellation_token2 = cancellation_token.clone();
    tokio::spawn(async move {
-        cancellation_token2.cancelled().await;
+        cancellation_token.cancelled().await;
        // dropping this sender will cause the channel to close only once
        // all the remaining inflight requests have been completed.
        drop(tx);
@@ -177,38 +167,9 @@ pub async fn worker(
        test_remote_failures: 0,
    };

-    // TODO(anna): consider moving this to a separate function.
-    if let Some(disconnect_events_storage_config) =
-        config.parquet_upload_disconnect_events_remote_storage
-    {
-        let (tx_disconnect, mut rx_disconnect) = mpsc::unbounded_channel();
-        LOG_CHAN_DISCONNECT.set(tx_disconnect.downgrade()).unwrap();
-
-        // setup row stream that will close on cancellation
-        tokio::spawn(async move {
-            cancellation_token.cancelled().await;
-            // dropping this sender will cause the channel to close only once
-            // all the remaining inflight requests have been completed.
-            drop(tx_disconnect);
-        });
-        let rx_disconnect = futures::stream::poll_fn(move |cx| rx_disconnect.poll_recv(cx));
-        let rx_disconnect = rx_disconnect.map(RequestData::from);
-
-        let storage_disconnect =
-            GenericRemoteStorage::from_config(&disconnect_events_storage_config)
-                .context("remote storage for disconnect events init")?;
-        let parquet_config_disconnect = parquet_config.clone();
-        tokio::try_join!(
-            worker_inner(storage, rx, parquet_config),
-            worker_inner(storage_disconnect, rx_disconnect, parquet_config_disconnect)
-        )
-        .map(|_| ())
-    } else {
-        worker_inner(storage, rx, parquet_config).await
-    }
+    worker_inner(storage, rx, parquet_config).await
 }

-#[derive(Clone, Debug)]
 struct ParquetConfig {
    propeties: WriterPropertiesPtr,
    rows_per_group: usize,
@@ -491,7 +452,6 @@ mod tests {
            success: rng.gen(),
            cold_start_info: "no",
            duration_us: rng.gen_range(0..30_000_000),
-            disconnect_timestamp: None,
        }
    }

@@ -560,15 +520,15 @@ mod tests {
        assert_eq!(
            file_stats,
            [
-                (1315008, 3, 6000),
-                (1315001, 3, 6000),
-                (1315061, 3, 6000),
-                (1315018, 3, 6000),
-                (1315148, 3, 6000),
-                (1314990, 3, 6000),
-                (1314782, 3, 6000),
-                (1315018, 3, 6000),
-                (438575, 1, 2000)
+                (1314385, 3, 6000),
+                (1314378, 3, 6000),
+                (1314438, 3, 6000),
+                (1314395, 3, 6000),
+                (1314525, 3, 6000),
+                (1314367, 3, 6000),
+                (1314159, 3, 6000),
+                (1314395, 3, 6000),
+                (438352, 1, 2000)
            ]
        );

@@ -598,11 +558,11 @@ mod tests {
        assert_eq!(
            file_stats,
            [
-                (1221738, 5, 10000),
-                (1227888, 5, 10000),
-                (1229682, 5, 10000),
-                (1229044, 5, 10000),
-                (1220322, 5, 10000)
+                (1220633, 5, 10000),
+                (1226783, 5, 10000),
+                (1228577, 5, 10000),
+                (1227939, 5, 10000),
+                (1219217, 5, 10000)
            ]
        );

@@ -634,11 +594,11 @@ mod tests {
        assert_eq!(
            file_stats,
            [
-                (1207385, 5, 10000),
-                (1207116, 5, 10000),
-                (1207409, 5, 10000),
-                (1207397, 5, 10000),
-                (1207652, 5, 10000)
+                (1206280, 5, 10000),
+                (1206011, 5, 10000),
+                (1206304, 5, 10000),
+                (1206292, 5, 10000),
+                (1206547, 5, 10000)
            ]
        );

@@ -663,15 +623,15 @@ mod tests {
        assert_eq!(
            file_stats,
            [
-                (1315008, 3, 6000),
-                (1315001, 3, 6000),
-                (1315061, 3, 6000),
-                (1315018, 3, 6000),
-                (1315148, 3, 6000),
-                (1314990, 3, 6000),
-                (1314782, 3, 6000),
-                (1315018, 3, 6000),
-                (438575, 1, 2000)
+                (1314385, 3, 6000),
+                (1314378, 3, 6000),
+                (1314438, 3, 6000),
+                (1314395, 3, 6000),
+                (1314525, 3, 6000),
+                (1314367, 3, 6000),
+                (1314159, 3, 6000),
+                (1314395, 3, 6000),
+                (438352, 1, 2000)
            ]
        );

@@ -708,7 +668,7 @@ mod tests {
        // files are smaller than the size threshold, but they took too long to fill so were flushed early
        assert_eq!(
            file_stats,
-            [(659240, 2, 3001), (658954, 2, 3000), (658750, 2, 2999)]
+            [(658823, 2, 3001), (658537, 2, 3000), (658333, 2, 2999)]
        );

        tmpdir.close().unwrap();
--- a/proxy/src/metrics.rs
+++ b/proxy/src/metrics.rs
@@ -284,8 +284,6 @@ pub struct ComputeConnectionLatencyGroup {
 pub enum LatencyExclusions {
    Client,
    ClientAndCplane,
-    ClientCplaneCompute,
-    ClientCplaneComputeRetry,
 }

 #[derive(FixedCardinalityLabel, Copy, Clone)]
@@ -354,7 +352,6 @@ pub enum Waiting {
    Cplane,
    Client,
    Compute,
-    RetryTimeout,
 }

 #[derive(Default)]
@@ -362,7 +359,6 @@ struct Accumulated {
    cplane: time::Duration,
    client: time::Duration,
    compute: time::Duration,
-    retry: time::Duration,
 }

 pub struct LatencyTimer {
@@ -425,7 +421,6 @@ impl Drop for LatencyTimerPause<'_> {
            Waiting::Cplane => self.timer.accumulated.cplane += dur,
            Waiting::Client => self.timer.accumulated.client += dur,
            Waiting::Compute => self.timer.accumulated.compute += dur,
-            Waiting::RetryTimeout => self.timer.accumulated.retry += dur,
        }
    }
 }
@@ -469,34 +464,6 @@ impl Drop for LatencyTimer {
            },
            duration.saturating_sub(accumulated_total).as_secs_f64(),
        );
-
-        // Exclude client cplane, compue communication from the accumulated time.
-        let accumulated_total =
-            self.accumulated.client + self.accumulated.cplane + self.accumulated.compute;
-        metric.observe(
-            ComputeConnectionLatencyGroup {
-                protocol: self.protocol,
-                cold_start_info: self.cold_start_info,
-                outcome: self.outcome,
-                excluded: LatencyExclusions::ClientCplaneCompute,
-            },
-            duration.saturating_sub(accumulated_total).as_secs_f64(),
-        );
-
-        // Exclude client cplane, compue, retry communication from the accumulated time.
-        let accumulated_total = self.accumulated.client
-            + self.accumulated.cplane
-            + self.accumulated.compute
-            + self.accumulated.retry;
-        metric.observe(
-            ComputeConnectionLatencyGroup {
-                protocol: self.protocol,
-                cold_start_info: self.cold_start_info,
-                outcome: self.outcome,
-                excluded: LatencyExclusions::ClientCplaneComputeRetry,
-            },
-            duration.saturating_sub(accumulated_total).as_secs_f64(),
-        );
    }
 }

--- a/proxy/src/proxy.rs
+++ b/proxy/src/proxy.rs
@@ -132,14 +132,16 @@ pub async fn task_main(
                Err(e) => {
                    // todo: log and push to ctx the error kind
                    ctx.set_error_kind(e.get_error_kind());
+                    ctx.log();
                    error!(parent: &span, "per-client task finished with an error: {e:#}");
                }
                Ok(None) => {
                    ctx.set_success();
+                    ctx.log();
                }
                Ok(Some(p)) => {
                    ctx.set_success();
-                    ctx.log_connect();
+                    ctx.log();
                    match p.proxy_pass().instrument(span.clone()).await {
                        Ok(()) => {}
                        Err(e) => {
--- a/proxy/src/proxy/connect_compute.rs
+++ b/proxy/src/proxy/connect_compute.rs
@@ -133,17 +133,10 @@ where

    error!(error = ?err, "could not connect to compute node");

-    let node_info = if !node_info.cached() || !err.should_retry_database_address() {
+    let node_info = if !node_info.cached() {
        // If we just received this from cplane and didn't get it from cache, we shouldn't retry.
        // Do not need to retrieve a new node_info, just return the old one.
        if !err.should_retry(num_retries, connect_to_compute_retry_config) {
-            Metrics::get().proxy.retries_metric.observe(
-                RetriesMetricGroup {
-                    outcome: ConnectOutcome::Failed,
-                    retry_type,
-                },
-                num_retries.into(),
-            );
            return Err(err.into());
        }
        node_info
@@ -201,10 +194,6 @@ where
        let wait_duration = retry_after(num_retries, connect_to_compute_retry_config);
        num_retries += 1;

-        let pause = ctx
-            .latency_timer
-            .pause(crate::metrics::Waiting::RetryTimeout);
        time::sleep(wait_duration).await;
-        drop(pause);
    }
 }
--- a/proxy/src/proxy/retry.rs
+++ b/proxy/src/proxy/retry.rs
@@ -10,9 +10,6 @@ pub trait ShouldRetry {
            err => err.could_retry(),
        }
    }
-    fn should_retry_database_address(&self) -> bool {
-        true
-    }
 }

 impl ShouldRetry for io::Error {
@@ -36,21 +33,6 @@ impl ShouldRetry for tokio_postgres::error::DbError {
                | &SqlState::SQLCLIENT_UNABLE_TO_ESTABLISH_SQLCONNECTION,
        )
    }
-    fn should_retry_database_address(&self) -> bool {
-        use tokio_postgres::error::SqlState;
-        // Here are errors that happens after the user successfully authenticated to the database.
-        // TODO: there are pgbouncer errors that should be retried, but they are not listed here.
-        !matches!(
-            self.code(),
-            &SqlState::TOO_MANY_CONNECTIONS
-                | &SqlState::OUT_OF_MEMORY
-                | &SqlState::SYNTAX_ERROR
-                | &SqlState::T_R_SERIALIZATION_FAILURE
-                | &SqlState::INVALID_CATALOG_NAME
-                | &SqlState::INVALID_SCHEMA_NAME
-                | &SqlState::INVALID_PARAMETER_VALUE
-        )
-    }
 }

 impl ShouldRetry for tokio_postgres::Error {
@@ -63,15 +45,6 @@ impl ShouldRetry for tokio_postgres::Error {
            false
        }
    }
-    fn should_retry_database_address(&self) -> bool {
-        if let Some(io_err) = self.source().and_then(|x| x.downcast_ref()) {
-            io::Error::should_retry_database_address(io_err)
-        } else if let Some(db_err) = self.source().and_then(|x| x.downcast_ref()) {
-            tokio_postgres::error::DbError::should_retry_database_address(db_err)
-        } else {
-            true
-        }
-    }
 }

 impl ShouldRetry for compute::ConnectionError {
@@ -82,13 +55,6 @@ impl ShouldRetry for compute::ConnectionError {
            _ => false,
        }
    }
-    fn should_retry_database_address(&self) -> bool {
-        match self {
-            compute::ConnectionError::Postgres(err) => err.should_retry_database_address(),
-            compute::ConnectionError::CouldNotConnect(err) => err.should_retry_database_address(),
-            _ => true,
-        }
-    }
 }

 pub fn retry_after(num_retries: u32, config: RetryConfig) -> time::Duration {
--- a/proxy/src/proxy/wake_compute.rs
+++ b/proxy/src/proxy/wake_compute.rs
@@ -54,11 +54,7 @@ pub async fn wake_compute<B: ComputeConnectBackend>(

        let wait_duration = retry_after(*num_retries, config);
        *num_retries += 1;
-        let pause = ctx
-            .latency_timer
-            .pause(crate::metrics::Waiting::RetryTimeout);
        tokio::time::sleep(wait_duration).await;
-        drop(pause);
    }
 }

--- a/proxy/src/serverless/backend.rs
+++ b/proxy/src/serverless/backend.rs
@@ -16,7 +16,7 @@ use crate::{
    proxy::connect_compute::ConnectMechanism,
 };

-use super::conn_pool::{poll_tokio_client, Client, ConnInfo, GlobalConnPool};
+use super::conn_pool::{poll_client, Client, ConnInfo, GlobalConnPool};

 pub struct PoolingBackend {
    pub pool: Arc<GlobalConnPool<tokio_postgres::Client>>,
@@ -179,12 +179,10 @@ impl ConnectMechanism for TokioMechanism {
            .dbname(&self.conn_info.dbname)
            .connect_timeout(timeout);

-        let pause = ctx.latency_timer.pause(crate::metrics::Waiting::Compute);
        let (client, connection) = config.connect(tokio_postgres::NoTls).await?;
-        drop(pause);

        tracing::Span::current().record("pid", &tracing::field::display(client.get_process_id()));
-        Ok(poll_tokio_client(
+        Ok(poll_client(
            self.pool.clone(),
            ctx,
            self.conn_info.clone(),
--- a/proxy/src/serverless/conn_pool.rs
+++ b/proxy/src/serverless/conn_pool.rs
@@ -1,11 +1,9 @@
 use dashmap::DashMap;
-use futures::Future;
+use futures::{future::poll_fn, Future};
 use parking_lot::RwLock;
-use pin_project_lite::pin_project;
 use rand::Rng;
 use smallvec::SmallVec;
-use std::sync::Weak;
-use std::{collections::HashMap, sync::Arc, time::Duration};
+use std::{collections::HashMap, pin::pin, sync::Arc, sync::Weak, time::Duration};
 use std::{
    fmt,
    task::{ready, Poll},
@@ -14,13 +12,13 @@ use std::{
    ops::Deref,
    sync::atomic::{self, AtomicUsize},
 };
-use tokio::time::{Instant, Sleep};
+use tokio::time::Instant;
 use tokio_postgres::tls::NoTlsStream;
 use tokio_postgres::{AsyncMessage, ReadyForQueryStatus, Socket};
-use tokio_util::sync::{CancellationToken, WaitForCancellationFutureOwned};
+use tokio_util::sync::CancellationToken;

 use crate::console::messages::{ColdStartInfo, MetricsAuxInfo};
-use crate::metrics::{HttpEndpointPoolsGuard, Metrics, NumDbConnectionsGuard};
+use crate::metrics::{HttpEndpointPoolsGuard, Metrics};
 use crate::usage_metrics::{Ids, MetricCounter, USAGE_METRICS};
 use crate::{
    auth::backend::ComputeUserInfo, context::RequestMonitoring, DbName, EndpointCacheKey, RoleName,
@@ -93,7 +91,7 @@ impl<C: ClientInnerExt> EndpointConnPool<C> {
            ..
        } = self;
        pools.get_mut(&db_user).and_then(|pool_entries| {
-            pool_entries.get_conn_entry(total_conns, global_connections_count)
+            pool_entries.get_conn_entry(total_conns, global_connections_count.clone())
        })
    }

@@ -127,16 +125,19 @@ impl<C: ClientInnerExt> EndpointConnPool<C> {
    fn put(pool: &RwLock<Self>, conn_info: &ConnInfo, client: ClientInner<C>) {
        let conn_id = client.conn_id;

+        if client.is_closed() {
+            info!(%conn_id, "pool: throwing away connection '{conn_info}' because connection is closed");
+            return;
+        }
+        let global_max_conn = pool.read().global_pool_size_max_conns;
+        if pool
+            .read()
+            .global_connections_count
+            .load(atomic::Ordering::Relaxed)
+            >= global_max_conn
        {
-            let pool = pool.read();
-            if pool
-                .global_connections_count
-                .load(atomic::Ordering::Relaxed)
-                >= pool.global_pool_size_max_conns
-            {
-                info!(%conn_id, "pool: throwing away connection '{conn_info}' because pool is full");
-                return;
-            }
+            info!(%conn_id, "pool: throwing away connection '{conn_info}' because pool is full");
+            return;
        }

        // return connection to the pool
@@ -216,7 +217,7 @@ impl<C: ClientInnerExt> DbUserConnPool<C> {
    fn get_conn_entry(
        &mut self,
        conns: &mut usize,
-        global_connections_count: &AtomicUsize,
+        global_connections_count: Arc<AtomicUsize>,
    ) -> Option<ConnPoolEntry<C>> {
        let mut removed = self.clear_closed_clients(conns);
        let conn = self.conns.pop();
@@ -462,97 +463,109 @@ impl<C: ClientInnerExt> GlobalConnPool<C> {
    }
 }

-pub fn poll_tokio_client(
-    global_pool: Arc<GlobalConnPool<tokio_postgres::Client>>,
-    ctx: &mut RequestMonitoring,
-    conn_info: ConnInfo,
-    client: tokio_postgres::Client,
-    mut connection: tokio_postgres::Connection<Socket, NoTlsStream>,
-    conn_id: uuid::Uuid,
-    aux: MetricsAuxInfo,
-) -> Client<tokio_postgres::Client> {
-    let connection = std::future::poll_fn(move |cx| {
-        loop {
-            let message = ready!(connection.poll_message(cx));
-            match message {
-                Some(Ok(AsyncMessage::Notice(notice))) => {
-                    info!("notice: {}", notice);
-                }
-                Some(Ok(AsyncMessage::Notification(notif))) => {
-                    warn!(
-                        pid = notif.process_id(),
-                        channel = notif.channel(),
-                        "notification received"
-                    );
-                }
-                Some(Ok(_)) => {
-                    warn!("unknown message");
-                }
-                Some(Err(e)) => {
-                    error!("connection error: {}", e);
-                    break;
-                }
-                None => {
-                    info!("connection closed");
-                    break;
-                }
-            }
-        }
-        Poll::Ready(())
-    });
-    poll_client(
-        global_pool,
-        ctx,
-        conn_info,
-        client,
-        connection,
-        conn_id,
-        aux,
-    )
-}
-
-pub fn poll_client<C: ClientInnerExt, I: Future<Output = ()> + Send + 'static>(
+pub fn poll_client<C: ClientInnerExt>(
    global_pool: Arc<GlobalConnPool<C>>,
    ctx: &mut RequestMonitoring,
    conn_info: ConnInfo,
    client: C,
-    connection: I,
+    mut connection: tokio_postgres::Connection<Socket, NoTlsStream>,
    conn_id: uuid::Uuid,
    aux: MetricsAuxInfo,
 ) -> Client<C> {
    let conn_gauge = Metrics::get().proxy.db_connections.guard(ctx.protocol);
-    let session_id = ctx.session_id;
-    let (tx, rx) = tokio::sync::watch::channel(session_id);
+    let mut session_id = ctx.session_id;
+    let (tx, mut rx) = tokio::sync::watch::channel(session_id);

    let span = info_span!(parent: None, "connection", %conn_id);
    let cold_start_info = ctx.cold_start_info;
-    let session_span = info_span!(parent: span.clone(), "", %session_id);
-    session_span.in_scope(|| {
-        info!(cold_start_info = cold_start_info.as_str(), %conn_info, "new connection");
+    span.in_scope(|| {
+        info!(cold_start_info = cold_start_info.as_str(), %conn_info, %session_id, "new connection");
    });
    let pool = match conn_info.endpoint_cache_key() {
        Some(endpoint) => Arc::downgrade(&global_pool.get_or_create_endpoint_pool(&endpoint)),
        None => Weak::new(),
    };
+    let pool_clone = pool.clone();

+    let db_user = conn_info.db_and_user();
    let idle = global_pool.get_idle_timeout();
    let cancel = CancellationToken::new();
+    let cancelled = cancel.clone().cancelled_owned();

-    let db_conn = DbConnection {
-        cancelled: cancel.clone().cancelled_owned(),
-        idle_timeout: tokio::time::sleep(idle),
-        idle,
-        db_user: conn_info.db_and_user(),
-        pool: pool.clone(),
-        session_span,
-        session_rx: rx,
-        conn_gauge,
-        conn_id,
-        connection,
-    };
+    tokio::spawn(
+    async move {
+        let _conn_gauge = conn_gauge;
+        let mut idle_timeout = pin!(tokio::time::sleep(idle));
+        let mut cancelled = pin!(cancelled);

-    tokio::spawn(db_conn.instrument(span));
+        poll_fn(move |cx| {
+            if cancelled.as_mut().poll(cx).is_ready() {
+                info!("connection dropped");
+                return Poll::Ready(())
+            }

+            match rx.has_changed() {
+                Ok(true) => {
+                    session_id = *rx.borrow_and_update();
+                    info!(%session_id, "changed session");
+                    idle_timeout.as_mut().reset(Instant::now() + idle);
+                }
+                Err(_) => {
+                    info!("connection dropped");
+                    return Poll::Ready(())
+                }
+                _ => {}
+            }
+
+            // 5 minute idle connection timeout
+            if idle_timeout.as_mut().poll(cx).is_ready() {
+                idle_timeout.as_mut().reset(Instant::now() + idle);
+                info!("connection idle");
+                if let Some(pool) = pool.clone().upgrade() {
+                    // remove client from pool - should close the connection if it's idle.
+                    // does nothing if the client is currently checked-out and in-use
+                    if pool.write().remove_client(db_user.clone(), conn_id) {
+                        info!("idle connection removed");
+                    }
+                }
+            }
+
+            loop {
+                let message = ready!(connection.poll_message(cx));
+
+                match message {
+                    Some(Ok(AsyncMessage::Notice(notice))) => {
+                        info!(%session_id, "notice: {}", notice);
+                    }
+                    Some(Ok(AsyncMessage::Notification(notif))) => {
+                        warn!(%session_id, pid = notif.process_id(), channel = notif.channel(), "notification received");
+                    }
+                    Some(Ok(_)) => {
+                        warn!(%session_id, "unknown message");
+                    }
+                    Some(Err(e)) => {
+                        error!(%session_id, "connection error: {}", e);
+                        break
+                    }
+                    None => {
+                        info!("connection closed");
+                        break
+                    }
+                }
+            }
+
+            // remove from connection pool
+            if let Some(pool) = pool.clone().upgrade() {
+                if pool.write().remove_client(db_user.clone(), conn_id) {
+                    info!("closed connection removed");
+                }
+            }
+
+            Poll::Ready(())
+        }).await;
+
+    }
+    .instrument(span));
    let inner = ClientInner {
        inner: client,
        session: tx,
@@ -560,94 +573,7 @@ pub fn poll_client<C: ClientInnerExt, I: Future<Output = ()> + Send + 'static>(
        aux,
        conn_id,
    };
-    Client::new(inner, conn_info, pool)
-}
-
-pin_project! {
-    struct DbConnection<C: ClientInnerExt, Inner> {
-        #[pin]
-        cancelled: WaitForCancellationFutureOwned,
-
-        #[pin]
-        idle_timeout: Sleep,
-        idle: tokio::time::Duration,
-
-        db_user: (DbName, RoleName),
-        pool: Weak<RwLock<EndpointConnPool<C>>>,
-
-        session_span: tracing::Span,
-        session_rx: tokio::sync::watch::Receiver<uuid::Uuid>,
-
-        conn_gauge: NumDbConnectionsGuard<'static>,
-        conn_id: uuid::Uuid,
-        #[pin]
-        connection: Inner,
-    }
-}
-
-impl<C: ClientInnerExt, I: Future<Output = ()>> Future for DbConnection<C, I> {
-    type Output = ();
-
-    fn poll(self: std::pin::Pin<&mut Self>, cx: &mut std::task::Context<'_>) -> Poll<Self::Output> {
-        let mut this = self.project();
-        if this.cancelled.as_mut().poll(cx).is_ready() {
-            let _span = this.session_span.enter();
-            info!("connection dropped");
-            return Poll::Ready(());
-        }
-
-        match this.session_rx.has_changed() {
-            Ok(true) => {
-                let session_id = *this.session_rx.borrow_and_update();
-                *this.session_span = info_span!("", %session_id);
-                let _span = this.session_span.enter();
-                info!("changed session");
-                this.idle_timeout
-                    .as_mut()
-                    .reset(Instant::now() + *this.idle);
-            }
-            Err(_) => {
-                let _span = this.session_span.enter();
-                info!("connection dropped");
-                return Poll::Ready(());
-            }
-            _ => {}
-        }
-
-        let _span = this.session_span.enter();
-
-        // 5 minute idle connection timeout
-        if this.idle_timeout.as_mut().poll(cx).is_ready() {
-            this.idle_timeout
-                .as_mut()
-                .reset(Instant::now() + *this.idle);
-            info!("connection idle");
-            if let Some(pool) = this.pool.upgrade() {
-                // remove client from pool - should close the connection if it's idle.
-                // does nothing if the client is currently checked-out and in-use
-                if pool
-                    .write()
-                    .remove_client(this.db_user.clone(), *this.conn_id)
-                {
-                    info!("idle connection removed");
-                }
-            }
-        }
-
-        ready!(this.connection.poll(cx));
-
-        // remove from connection pool
-        if let Some(pool) = this.pool.upgrade() {
-            if pool
-                .write()
-                .remove_client(this.db_user.clone(), *this.conn_id)
-            {
-                info!("closed connection removed");
-            }
-        }
-
-        Poll::Ready(())
-    }
+    Client::new(inner, conn_info, pool_clone)
 }

 struct ClientInner<C: ClientInnerExt> {
@@ -760,70 +686,72 @@ impl<C: ClientInnerExt> Deref for Client<C> {
 }

 impl<C: ClientInnerExt> Client<C> {
-    fn do_drop(&mut self) {
+    fn do_drop(&mut self) -> Option<impl FnOnce()> {
        let conn_info = self.conn_info.clone();
        let client = self
            .inner
            .take()
            .expect("client inner should not be removed");
-
-        let conn_id = client.conn_id;
-
-        if client.is_closed() {
-            info!(%conn_id, "pool: throwing away connection '{conn_info}' because connection is closed");
-            return;
-        }
-
-        if let Some(conn_pool) = self.pool.upgrade() {
+        if let Some(conn_pool) = std::mem::take(&mut self.pool).upgrade() {
+            let current_span = self.span.clone();
            // return connection to the pool
-            let _span = self.span.enter();
-            EndpointConnPool::put(&conn_pool, &conn_info, client);
+            return Some(move || {
+                let _span = current_span.enter();
+                EndpointConnPool::put(&conn_pool, &conn_info, client);
+            });
        }
+        None
    }
 }

 impl<C: ClientInnerExt> Drop for Client<C> {
    fn drop(&mut self) {
-        self.do_drop();
+        if let Some(drop) = self.do_drop() {
+            tokio::task::spawn_blocking(drop);
+        }
    }
 }

 #[cfg(test)]
 mod tests {
+    use std::{mem, sync::atomic::AtomicBool};
+
    use crate::{BranchId, EndpointId, ProjectId};

    use super::*;

-    struct MockClient(CancellationToken);
+    struct MockClient(Arc<AtomicBool>);
+    impl MockClient {
+        fn new(is_closed: bool) -> Self {
+            MockClient(Arc::new(is_closed.into()))
+        }
+    }
    impl ClientInnerExt for MockClient {
        fn is_closed(&self) -> bool {
-            self.0.is_cancelled()
+            self.0.load(atomic::Ordering::Relaxed)
        }
        fn get_process_id(&self) -> i32 {
            0
        }
    }

-    fn create_inner(
-        global_pool: Arc<GlobalConnPool<MockClient>>,
-        conn_info: ConnInfo,
-    ) -> (Client<MockClient>, CancellationToken) {
-        let cancelled = CancellationToken::new();
-        let client = poll_client(
-            global_pool,
-            &mut RequestMonitoring::test(),
-            conn_info,
-            MockClient(cancelled.clone()),
-            cancelled.clone().cancelled_owned(),
-            uuid::Uuid::new_v4(),
-            MetricsAuxInfo {
+    fn create_inner() -> ClientInner<MockClient> {
+        create_inner_with(MockClient::new(false))
+    }
+
+    fn create_inner_with(client: MockClient) -> ClientInner<MockClient> {
+        ClientInner {
+            inner: client,
+            session: tokio::sync::watch::Sender::new(uuid::Uuid::new_v4()),
+            cancel: CancellationToken::new(),
+            aux: MetricsAuxInfo {
                endpoint_id: (&EndpointId::from("endpoint")).into(),
                project_id: (&ProjectId::from("project")).into(),
                branch_id: (&BranchId::from("branch")).into(),
                cold_start_info: crate::console::messages::ColdStartInfo::Warm,
            },
-        );
-        (client, cancelled)
+            conn_id: uuid::Uuid::new_v4(),
+        }
    }

    #[tokio::test]
@@ -850,36 +778,51 @@ mod tests {
            dbname: "dbname".into(),
            password: "password".as_bytes().into(),
        };
+        let ep_pool = Arc::downgrade(
+            &pool.get_or_create_endpoint_pool(&conn_info.endpoint_cache_key().unwrap()),
+        );
        {
-            let (mut client, _) = create_inner(pool.clone(), conn_info.clone());
+            let mut client = Client::new(create_inner(), conn_info.clone(), ep_pool.clone());
            assert_eq!(0, pool.get_global_connections_count());
            client.inner().1.discard();
-            drop(client);
            // Discard should not add the connection to the pool.
            assert_eq!(0, pool.get_global_connections_count());
        }
        {
-            let (client, _) = create_inner(pool.clone(), conn_info.clone());
-            drop(client);
+            let mut client = Client::new(create_inner(), conn_info.clone(), ep_pool.clone());
+            client.do_drop().unwrap()();
+            mem::forget(client); // skip Drop: do_drop() has already taken the inner client
            assert_eq!(1, pool.get_global_connections_count());
        }
        {
-            let (client, cancel) = create_inner(pool.clone(), conn_info.clone());
-            cancel.cancel();
-            drop(client);
-            // The closed client shouldn't be added to the pool.
+            let mut closed_client = Client::new(
+                create_inner_with(MockClient::new(true)),
+                conn_info.clone(),
+                ep_pool.clone(),
+            );
+            closed_client.do_drop().unwrap()();
+            mem::forget(closed_client); // skip Drop: do_drop() has already taken the inner client
+            // The closed client shouldn't be added to the pool.
            assert_eq!(1, pool.get_global_connections_count());
        }
-        let cancel = {
-            let (client, cancel) = create_inner(pool.clone(), conn_info.clone());
-            drop(client);
+        let is_closed: Arc<AtomicBool> = Arc::new(false.into());
+        {
+            let mut client = Client::new(
+                create_inner_with(MockClient(is_closed.clone())),
+                conn_info.clone(),
+                ep_pool.clone(),
+            );
+            client.do_drop().unwrap()();
+            mem::forget(client); // skip Drop: do_drop() has already taken the inner client
+
            // The client should be added to the pool.
            assert_eq!(2, pool.get_global_connections_count());
-            cancel
-        };
+        }
        {
-            let client = create_inner(pool.clone(), conn_info.clone());
-            drop(client);
+            let mut client = Client::new(create_inner(), conn_info, ep_pool);
+            client.do_drop().unwrap()();
+            mem::forget(client); // skip Drop: do_drop() has already taken the inner client
+
            // The client shouldn't be added to the pool. Because the ep-pool is full.
            assert_eq!(2, pool.get_global_connections_count());
        }
@@ -893,19 +836,25 @@ mod tests {
            dbname: "dbname".into(),
            password: "password".as_bytes().into(),
        };
+        let ep_pool = Arc::downgrade(
+            &pool.get_or_create_endpoint_pool(&conn_info.endpoint_cache_key().unwrap()),
+        );
        {
-            let client = create_inner(pool.clone(), conn_info.clone());
-            drop(client);
+            let mut client = Client::new(create_inner(), conn_info.clone(), ep_pool.clone());
+            client.do_drop().unwrap()();
+            mem::forget(client); // skip Drop: do_drop() has already taken the inner client
            assert_eq!(3, pool.get_global_connections_count());
        }
        {
-            let client = create_inner(pool.clone(), conn_info.clone());
-            drop(client);
+            let mut client = Client::new(create_inner(), conn_info.clone(), ep_pool.clone());
+            client.do_drop().unwrap()();
+            mem::forget(client); // skip Drop: do_drop() has already taken the inner client
+
            // The client shouldn't be added to the pool. Because the global pool is full.
            assert_eq!(3, pool.get_global_connections_count());
        }

-        cancel.cancel();
+        is_closed.store(true, atomic::Ordering::Relaxed);
        // Do gc for all shards.
        pool.gc(0);
        pool.gc(1);
--- a/proxy/src/serverless/websocket.rs
+++ b/proxy/src/serverless/websocket.rs
@@ -156,15 +156,17 @@ pub async fn serve_websocket(
        Err(e) => {
            // todo: log and push to ctx the error kind
            ctx.set_error_kind(e.get_error_kind());
+            ctx.log();
            Err(e.into())
        }
        Ok(None) => {
            ctx.set_success();
+            ctx.log();
            Ok(())
        }
        Ok(Some(p)) => {
            ctx.set_success();
-            ctx.log_connect();
+            ctx.log();
            p.proxy_pass().await
        }
    }
--- a/s3_scrubber/Cargo.toml
+++ b/s3_scrubber/Cargo.toml
@@ -22,15 +22,9 @@ serde_with.workspace = true
 workspace_hack.workspace = true
 utils.workspace = true
 async-stream.workspace = true
-native-tls.workspace = true
-postgres-native-tls.workspace = true
-postgres_ffi.workspace = true
 tokio-stream.workspace = true
-tokio-postgres.workspace = true
-tokio-util = { workspace = true }
 futures-util.workspace = true
 itertools.workspace = true
-camino.workspace = true

 tokio = { workspace = true, features = ["macros", "rt-multi-thread"] }
 chrono = { workspace = true, default-features = false, features = ["clock", "serde"] }
--- a/s3_scrubber/README.md
+++ b/s3_scrubber/README.md
@@ -67,12 +67,10 @@ the purge command will log all the keys that it would have deleted.

 #### `scan-metadata`

-Walk objects in a pageserver or safekeeper S3 bucket, and report statistics on the contents and checking consistency.
-Errors are logged to stderr and summary to stdout.
+Walk objects in a pageserver S3 bucket, and report statistics on the contents.

-For pageserver:
 ```
-env SSO_ACCOUNT_ID=123456 REGION=eu-west-1 BUCKET=my-dev-bucket CLOUD_ADMIN_API_TOKEN=${NEON_CLOUD_ADMIN_API_STAGING_KEY} CLOUD_ADMIN_API_URL=[url] cargo run --release -- scan-metadata --node-kind pageserver
+env SSO_ACCOUNT_ID=123456 REGION=eu-west-1 BUCKET=my-dev-bucket CLOUD_ADMIN_API_TOKEN=${NEON_CLOUD_ADMIN_API_STAGING_KEY} CLOUD_ADMIN_API_URL=[url] cargo run --release -- scan-metadata

 Timelines: 31106
 With errors: 3
@@ -84,10 +82,6 @@ Layer size bytes: min 24576, 1% 36879, 10% 36879, 50% 61471, 90% 44695551, 99% 2
 Timeline layer count: min 1, 1% 3, 10% 6, 50% 16, 90% 25, 99% 39, max 1053
 ```

-For safekeepers, dump_db_connstr and dump_db_table must be
-specified; they should point to table with debug dump which will be used
-to list timelines and find their backup and start LSNs.
-
 ## Cleaning up running pageservers

 If S3 state is altered manually first, the pageserver's in-memory state will contain wrong data about S3 state, and tenants/timelines may get recreated on S3 (by any layer upload caused by compaction, pageserver restart, etc.). So before proceeding, for tenants/timelines which are already deleted in the console, we must remove these from pageservers.
--- a/s3_scrubber/src/cloud_admin_api.rs
+++ b/s3_scrubber/src/cloud_admin_api.rs
@@ -1,13 +1,11 @@
-use chrono::{DateTime, Utc};
-use futures::Future;
-use hex::FromHex;
+use std::time::Duration;

+use chrono::{DateTime, Utc};
+use hex::FromHex;
 use reqwest::{header, Client, StatusCode, Url};
 use serde::Deserialize;
 use tokio::sync::Semaphore;

-use tokio_util::sync::CancellationToken;
-use utils::backoff;
 use utils::id::{TenantId, TimelineId};
 use utils::lsn::Lsn;

@@ -139,7 +137,7 @@ pub struct ProjectData {
    pub region_id: String,
    pub platform_id: String,
    pub user_id: String,
-    pub pageserver_id: Option<u64>,
+    pub pageserver_id: u64,
    #[serde(deserialize_with = "from_nullable_id")]
    pub tenant: TenantId,
    pub safekeepers: Vec<SafekeeperData>,
@@ -157,7 +155,7 @@ pub struct ProjectData {
    pub maintenance_set: Option<String>,
 }

-#[derive(Debug, Clone, serde::Deserialize)]
+#[derive(Debug, serde::Deserialize)]
 pub struct BranchData {
    pub id: BranchId,
    pub created_at: DateTime<Utc>,
@@ -212,39 +210,30 @@ impl CloudAdminApiClient {
            .await
            .expect("Semaphore is not closed");

-        let response = CloudAdminApiClient::with_retries(
-            || async {
-                let response = self
-                    .http_client
-                    .get(self.append_url("/projects"))
-                    .query(&[
-                        ("tenant_id", tenant_id.to_string()),
-                        ("show_deleted", "true".to_string()),
-                    ])
-                    .header(header::ACCEPT, "application/json")
-                    .bearer_auth(&self.token)
-                    .send()
-                    .await
-                    .map_err(|e| {
-                        Error::new(
-                            "Find project for tenant".to_string(),
-                            ErrorKind::RequestSend(e),
-                        )
-                    })?;
-
-                let response: AdminApiResponse<Vec<ProjectData>> =
-                    response.json().await.map_err(|e| {
-                        Error::new(
-                            "Find project for tenant".to_string(),
-                            ErrorKind::BodyRead(e),
-                        )
-                    })?;
-                Ok(response)
-            },
-            "find_tenant_project",
-        )
-        .await?;
+        let response = self
+            .http_client
+            .get(self.append_url("/projects"))
+            .query(&[
+                ("tenant_id", tenant_id.to_string()),
+                ("show_deleted", "true".to_string()),
+            ])
+            .header(header::ACCEPT, "application/json")
+            .bearer_auth(&self.token)
+            .send()
+            .await
+            .map_err(|e| {
+                Error::new(
+                    "Find project for tenant".to_string(),
+                    ErrorKind::RequestSend(e),
+                )
+            })?;

+        let response: AdminApiResponse<Vec<ProjectData>> = response.json().await.map_err(|e| {
+            Error::new(
+                "Find project for tenant".to_string(),
+                ErrorKind::BodyRead(e),
+            )
+        })?;
        match response.data.len() {
            0 => Ok(None),
            1 => Ok(Some(
@@ -272,34 +261,42 @@ impl CloudAdminApiClient {
        const PAGINATION_LIMIT: usize = 512;
        let mut result: Vec<ProjectData> = Vec::with_capacity(PAGINATION_LIMIT);
        loop {
-            let response_bytes = CloudAdminApiClient::with_retries(
-                || async {
-                    let response = self
-                        .http_client
-                        .get(self.append_url("/projects"))
-                        .query(&[
-                            ("show_deleted", "false".to_string()),
-                            ("limit", format!("{PAGINATION_LIMIT}")),
-                            ("offset", format!("{pagination_offset}")),
-                        ])
-                        .header(header::ACCEPT, "application/json")
-                        .bearer_auth(&self.token)
-                        .send()
-                        .await
-                        .map_err(|e| {
-                            Error::new(
-                                "List active projects".to_string(),
-                                ErrorKind::RequestSend(e),
-                            )
-                        })?;
+            let response = self
+                .http_client
+                .get(self.append_url("/projects"))
+                .query(&[
+                    ("show_deleted", "false".to_string()),
+                    ("limit", format!("{PAGINATION_LIMIT}")),
+                    ("offset", format!("{pagination_offset}")),
+                ])
+                .header(header::ACCEPT, "application/json")
+                .bearer_auth(&self.token)
+                .send()
+                .await
+                .map_err(|e| {
+                    Error::new(
+                        "List active projects".to_string(),
+                        ErrorKind::RequestSend(e),
+                    )
+                })?;

-                    response.bytes().await.map_err(|e| {
-                        Error::new("List active projects".to_string(), ErrorKind::BodyRead(e))
-                    })
-                },
-                "list_projects",
-            )
-            .await?;
+            match response.status() {
+                StatusCode::OK => {}
+                StatusCode::SERVICE_UNAVAILABLE | StatusCode::TOO_MANY_REQUESTS => {
+                    tokio::time::sleep(Duration::from_millis(500)).await;
+                    continue;
+                }
+                _status => {
+                    return Err(Error::new(
+                        "List active projects".to_string(),
+                        ErrorKind::ResponseStatus(response.status()),
+                    ))
+                }
+            }
+
+            let response_bytes = response.bytes().await.map_err(|e| {
+                Error::new("List active projects".to_string(), ErrorKind::BodyRead(e))
+            })?;

            let decode_result =
                serde_json::from_slice::<AdminApiResponse<Vec<ProjectData>>>(&response_bytes);
@@ -330,7 +327,6 @@ impl CloudAdminApiClient {

    pub async fn find_timeline_branch(
        &self,
-        tenant_id: TenantId,
        timeline_id: TimelineId,
    ) -> Result<Option<BranchData>, Error> {
        let _permit = self
@@ -339,61 +335,43 @@ impl CloudAdminApiClient {
            .await
            .expect("Semaphore is not closed");

-        let response = CloudAdminApiClient::with_retries(
-            || async {
-                let response = self
-                    .http_client
-                    .get(self.append_url("/branches"))
-                    .query(&[
-                        ("timeline_id", timeline_id.to_string()),
-                        ("show_deleted", "true".to_string()),
-                    ])
-                    .header(header::ACCEPT, "application/json")
-                    .bearer_auth(&self.token)
-                    .send()
-                    .await
-                    .map_err(|e| {
-                        Error::new(
-                            "Find branch for timeline".to_string(),
-                            ErrorKind::RequestSend(e),
-                        )
-                    })?;
+        let response = self
+            .http_client
+            .get(self.append_url("/branches"))
+            .query(&[
+                ("timeline_id", timeline_id.to_string()),
+                ("show_deleted", "true".to_string()),
+            ])
+            .header(header::ACCEPT, "application/json")
+            .bearer_auth(&self.token)
+            .send()
+            .await
+            .map_err(|e| {
+                Error::new(
+                    "Find branch for timeline".to_string(),
+                    ErrorKind::RequestSend(e),
+                )
+            })?;

-                let response: AdminApiResponse<Vec<BranchData>> =
-                    response.json().await.map_err(|e| {
-                        Error::new(
-                            "Find branch for timeline".to_string(),
-                            ErrorKind::BodyRead(e),
-                        )
-                    })?;
-                Ok(response)
-            },
-            "find_timeline_branch",
-        )
-        .await?;
-
-        let mut branches: Vec<BranchData> = response.data.into_iter().collect();
-        // Normally timeline_id is unique. However, we do have at least one case
-        // of the same timeline_id in two different projects, apparently after
-        // manual recovery. So always recheck project_id (discovered through
-        // tenant_id).
-        let project_data = match self.find_tenant_project(tenant_id).await? {
-            Some(pd) => pd,
-            None => return Ok(None),
-        };
-        branches.retain(|b| b.project_id == project_data.id);
-        if branches.len() < 2 {
-            Ok(branches.first().cloned())
-        } else {
-            Err(Error::new(
-                format!(
-                    "Find branch for timeline {}/{} returned {} branches instead of 0 or 1",
-                    tenant_id,
-                    timeline_id,
-                    branches.len()
-                ),
+        let response: AdminApiResponse<Vec<BranchData>> = response.json().await.map_err(|e| {
+            Error::new(
+                "Find branch for timeline".to_string(),
+                ErrorKind::BodyRead(e),
+            )
+        })?;
+        match response.data.len() {
+            0 => Ok(None),
+            1 => Ok(Some(
+                response
+                    .data
+                    .into_iter()
+                    .next()
+                    .expect("Should have exactly one element"),
+            )),
+            too_many => Err(Error::new(
+                format!("Find branch for timeline returned {too_many} branches instead of 0 or 1"),
                ErrorKind::UnexpectedState,
-            ))
+            )),
        }
    }

@@ -554,15 +532,4 @@ impl CloudAdminApiClient {
            .parse()
            .unwrap_or_else(|e| panic!("Could not append {subpath} to base url: {e}"))
    }
-
-    async fn with_retries<T, O, F>(op: O, description: &str) -> Result<T, Error>
-    where
-        O: FnMut() -> F,
-        F: Future<Output = Result<T, Error>>,
-    {
-        let cancel = CancellationToken::new(); // not really used
-        backoff::retry(op, |_| false, 1, 20, description, &cancel)
-            .await
-            .expect("cancellations are disabled")
-    }
 }
--- a/s3_scrubber/src/garbage.rs
+++ b/s3_scrubber/src/garbage.rs
@@ -60,7 +60,6 @@ pub struct GarbageList {
    /// see garbage, we saw some active tenants too.  This protects against classes of bugs
    /// in the scrubber that might otherwise generate a "deleted all" result.
    active_tenant_count: usize,
-    active_timeline_count: usize,
 }

 impl GarbageList {
@@ -68,7 +67,6 @@ impl GarbageList {
        Self {
            items: Vec::new(),
            active_tenant_count: 0,
-            active_timeline_count: 0,
            node_kind,
            bucket_config,
        }
@@ -121,10 +119,7 @@ pub async fn find_garbage(
 const S3_CONCURRENCY: usize = 32;

 // How many concurrent API requests to make to the console API.
-//
-// Be careful increasing this; roughly we shouldn't have more than ~100 rps. It
-// would be better to implement real rsp limiter.
-const CONSOLE_CONCURRENCY: usize = 16;
+const CONSOLE_CONCURRENCY: usize = 128;

 struct ConsoleCache {
    /// Set of tenants found in the control plane API
@@ -226,7 +221,6 @@ async fn find_garbage_inner(
        } else {
            tracing::debug!("Tenant {tenant_shard_id} is active");
            active_tenants.push(tenant_shard_id);
-            garbage.active_tenant_count = active_tenants.len();
        }

        counter += 1;
@@ -267,7 +261,7 @@ async fn find_garbage_inner(
        let api_client = cloud_admin_api_client.clone();
        async move {
            api_client
-                .find_timeline_branch(ttid.tenant_shard_id.tenant_id, ttid.timeline_id)
+                .find_timeline_branch(ttid.timeline_id)
                .await
                .map_err(|e| anyhow::anyhow!(e))
                .map(|r| (ttid, r))
@@ -277,29 +271,15 @@ async fn find_garbage_inner(
        std::pin::pin!(timelines_checked.try_buffer_unordered(CONSOLE_CONCURRENCY));

    // Update the GarbageList with any timelines which appear not to exist.
-    let mut active_timelines: Vec<TenantShardTimelineId> = vec![];
    while let Some(result) = timelines_checked.next().await {
        let (ttid, console_result) = result?;
        if garbage.maybe_append(GarbageEntity::Timeline(ttid), console_result) {
            tracing::debug!("Timeline {ttid} is garbage");
        } else {
            tracing::debug!("Timeline {ttid} is active");
-            active_timelines.push(ttid);
-            garbage.active_timeline_count = active_timelines.len();
        }
    }

-    let num_garbage_timelines = garbage
-        .items
-        .iter()
-        .filter(|g| matches!(g.entity, GarbageEntity::Timeline(_)))
-        .count();
-    tracing::info!(
-        "Found {}/{} garbage timelines in active tenants",
-        num_garbage_timelines,
-        active_timelines.len(),
-    );
-
    Ok(garbage)
 }

@@ -364,22 +344,16 @@ pub async fn get_timeline_objects(
 const MAX_KEYS_PER_DELETE: usize = 1000;

 /// Drain a buffer of keys into DeleteObjects requests
-///
-/// If `drain` is true, drains keys completely; otherwise stops when <
-/// MAX_KEYS_PER_DELETE keys are left.
-/// `num_deleted` returns number of deleted keys.
 async fn do_delete(
    s3_client: &Arc<Client>,
    bucket_name: &str,
    keys: &mut Vec<ObjectIdentifier>,
    dry_run: bool,
    drain: bool,
-    progress_tracker: &mut DeletionProgressTracker,
 ) -> anyhow::Result<()> {
    while (!keys.is_empty() && drain) || (keys.len() >= MAX_KEYS_PER_DELETE) {
        let request_keys =
            keys.split_off(keys.len() - (std::cmp::min(MAX_KEYS_PER_DELETE, keys.len())));
-        let num_deleted = request_keys.len();
        if dry_run {
            tracing::info!("Dry-run deletion of objects: ");
            for k in request_keys {
@@ -394,30 +368,12 @@ async fn do_delete(
                .send()
                .await
                .context("DeleteObjects request")?;
-            progress_tracker.register(num_deleted);
        }
    }

    Ok(())
 }

-/// Simple tracker reporting each 10k deleted keys.
-#[derive(Default)]
-struct DeletionProgressTracker {
-    num_deleted: usize,
-    last_reported_num_deleted: usize,
-}
-
-impl DeletionProgressTracker {
-    fn register(&mut self, n: usize) {
-        self.num_deleted += n;
-        if self.num_deleted - self.last_reported_num_deleted > 10000 {
-            tracing::info!("progress: deleted {} keys", self.num_deleted);
-            self.last_reported_num_deleted = self.num_deleted;
-        }
-    }
-}
-
 pub async fn purge_garbage(
    input_path: String,
    mode: PurgeMode,
@@ -438,14 +394,6 @@ pub async fn purge_garbage(
    if garbage_list.active_tenant_count == 0 {
        anyhow::bail!("Refusing to purge a garbage list that reports 0 active tenants");
    }
-    if garbage_list
-        .items
-        .iter()
-        .any(|g| matches!(g.entity, GarbageEntity::Timeline(_)))
-        && garbage_list.active_timeline_count == 0
-    {
-        anyhow::bail!("Refusing to purge a garbage list containing garbage timelines that reports 0 active timelines");
-    }

    let filtered_items = garbage_list
        .items
@@ -481,7 +429,6 @@ pub async fn purge_garbage(
        std::pin::pin!(get_objects_results.try_buffer_unordered(S3_CONCURRENCY));

    let mut objects_to_delete = Vec::new();
-    let mut progress_tracker = DeletionProgressTracker::default();
    while let Some(result) = get_objects_results.next().await {
        let mut object_list = result?;
        objects_to_delete.append(&mut object_list);
@@ -492,7 +439,6 @@ pub async fn purge_garbage(
                &mut objects_to_delete,
                dry_run,
                false,
-                &mut progress_tracker,
            )
            .await?;
        }
@@ -504,11 +450,10 @@ pub async fn purge_garbage(
        &mut objects_to_delete,
        dry_run,
        true,
-        &mut progress_tracker,
    )
    .await?;

-    tracing::info!("{} keys deleted in total", progress_tracker.num_deleted);
+    tracing::info!("Fell through");

    Ok(())
 }
--- a/s3_scrubber/src/lib.rs
+++ b/s3_scrubber/src/lib.rs
@@ -4,9 +4,7 @@ pub mod checks;
 pub mod cloud_admin_api;
 pub mod garbage;
 pub mod metadata_stream;
-pub mod scan_pageserver_metadata;
-pub mod scan_safekeeper_metadata;
-pub mod tenant_snapshot;
+pub mod scan_metadata;

 use std::env;
 use std::fmt::Display;
@@ -25,18 +23,17 @@ use aws_sdk_s3::config::{AsyncSleep, Region, SharedAsyncSleep};
 use aws_sdk_s3::{Client, Config};
 use aws_smithy_async::rt::sleep::TokioSleep;

-use camino::{Utf8Path, Utf8PathBuf};
 use clap::ValueEnum;
 use pageserver::tenant::TENANTS_SEGMENT_NAME;
 use pageserver_api::shard::TenantShardId;
 use reqwest::Url;
 use serde::{Deserialize, Serialize};
+use std::io::IsTerminal;
 use tokio::io::AsyncReadExt;
 use tracing::error;
 use tracing_appender::non_blocking::WorkerGuard;
 use tracing_subscriber::{fmt, prelude::*, EnvFilter};
-use utils::fs_ext;
-use utils::id::{TenantId, TimelineId};
+use utils::id::TimelineId;

 const MAX_RETRIES: usize = 20;
 const CLOUD_ADMIN_API_TOKEN_ENV_VAR: &str = "CLOUD_ADMIN_API_TOKEN";
@@ -142,34 +139,12 @@ impl RootTarget {
    pub fn tenants_root(&self) -> S3Target {
        match self {
            Self::Pageserver(root) => root.with_sub_segment(TENANTS_SEGMENT_NAME),
-            Self::Safekeeper(root) => root.clone(),
+            Self::Safekeeper(root) => root.with_sub_segment("wal"),
        }
    }

    pub fn tenant_root(&self, tenant_id: &TenantShardId) -> S3Target {
-        match self {
-            Self::Pageserver(_) => self.tenants_root().with_sub_segment(&tenant_id.to_string()),
-            Self::Safekeeper(_) => self
-                .tenants_root()
-                .with_sub_segment(&tenant_id.tenant_id.to_string()),
-        }
-    }
-
-    pub(crate) fn tenant_shards_prefix(&self, tenant_id: &TenantId) -> S3Target {
-        // Only pageserver remote storage contains tenant-shards
-        assert!(matches!(self, Self::Pageserver(_)));
-        let Self::Pageserver(root) = self else {
-            panic!();
-        };
-
-        S3Target {
-            bucket_name: root.bucket_name.clone(),
-            prefix_in_bucket: format!(
-                "{}/{TENANTS_SEGMENT_NAME}/{tenant_id}",
-                root.prefix_in_bucket
-            ),
-            delimiter: root.delimiter.clone(),
-        }
+        self.tenants_root().with_sub_segment(&tenant_id.to_string())
    }

    pub fn timelines_root(&self, tenant_id: &TenantShardId) -> S3Target {
@@ -265,6 +240,7 @@ pub fn init_logging(file_name: &str) -> WorkerGuard {
        .with_ansi(false)
        .with_writer(file_writer);
    let stderr_logs = fmt::Layer::new()
+        .with_ansi(std::io::stderr().is_terminal())
        .with_target(false)
        .with_writer(std::io::stderr);
    tracing_subscriber::registry()
@@ -343,7 +319,9 @@ fn init_remote(
        }),
        NodeKind::Safekeeper => RootTarget::Safekeeper(S3Target {
            bucket_name: bucket_config.bucket,
-            prefix_in_bucket: bucket_config.prefix_in_bucket.unwrap_or("wal/".to_string()),
+            prefix_in_bucket: bucket_config
+                .prefix_in_bucket
+                .unwrap_or("safekeeper/v1".to_string()),
            delimiter,
        }),
    };
@@ -368,10 +346,7 @@ async fn list_objects_with_retries(
        {
            Ok(response) => return Ok(response),
            Err(e) => {
-                error!(
-                    "list_objects_v2 query failed: {e}, bucket_name={}, prefix={}, delimiter={}",
-                    s3_target.bucket_name, s3_target.prefix_in_bucket, s3_target.delimiter
-                );
+                error!("list_objects_v2 query failed: {e}");
                tokio::time::sleep(Duration::from_secs(1)).await;
            }
        }
@@ -421,50 +396,3 @@ async fn download_object_with_retries(

    anyhow::bail!("Failed to download objects with key {key} {MAX_RETRIES} times")
 }
-
-async fn download_object_to_file(
-    s3_client: &Client,
-    bucket_name: &str,
-    key: &str,
-    version_id: Option<&str>,
-    local_path: &Utf8Path,
-) -> anyhow::Result<()> {
-    let tmp_path = Utf8PathBuf::from(format!("{local_path}.tmp"));
-    for _ in 0..MAX_RETRIES {
-        tokio::fs::remove_file(&tmp_path)
-            .await
-            .or_else(fs_ext::ignore_not_found)?;
-
-        let mut file = tokio::fs::File::create(&tmp_path)
-            .await
-            .context("Opening output file")?;
-
-        let request = s3_client.get_object().bucket(bucket_name).key(key);
-
-        let request = match version_id {
-            Some(version_id) => request.version_id(version_id),
-            None => request,
-        };
-
-        let response_stream = match request.send().await {
-            Ok(response) => response,
-            Err(e) => {
-                error!(
-                    "Failed to download object for key {key} version {}: {e:#}",
-                    version_id.unwrap_or("")
-                );
-                tokio::time::sleep(Duration::from_secs(1)).await;
-                continue;
-            }
-        };
-
-        let mut read_stream = response_stream.body.into_async_read();
-
-        tokio::io::copy(&mut read_stream, &mut file).await?;
-
-        tokio::fs::rename(&tmp_path, local_path).await?;
-        return Ok(());
-    }
-
-    anyhow::bail!("Failed to download objects with key {key} {MAX_RETRIES} times")
-}
--- a/s3_scrubber/src/main.rs
+++ b/s3_scrubber/src/main.rs
@@ -1,16 +1,9 @@
-use anyhow::bail;
-use camino::Utf8PathBuf;
 use pageserver_api::shard::TenantShardId;
 use s3_scrubber::garbage::{find_garbage, purge_garbage, PurgeMode};
-use s3_scrubber::scan_pageserver_metadata::scan_metadata;
-use s3_scrubber::tenant_snapshot::SnapshotDownloader;
-use s3_scrubber::{
-    init_logging, scan_safekeeper_metadata::scan_safekeeper_metadata, BucketConfig, ConsoleConfig,
-    NodeKind, TraversingDepth,
-};
+use s3_scrubber::scan_metadata::scan_metadata;
+use s3_scrubber::{init_logging, BucketConfig, ConsoleConfig, NodeKind, TraversingDepth};

 use clap::{Parser, Subcommand};
-use utils::id::TenantId;

 #[derive(Parser)]
 #[command(author, version, about, long_about = None)]
@@ -39,28 +32,11 @@ enum Command {
        #[arg(short, long, default_value_t = PurgeMode::DeletedOnly)]
        mode: PurgeMode,
    },
-    #[command(verbatim_doc_comment)]
    ScanMetadata {
-        #[arg(short, long)]
-        node_kind: NodeKind,
        #[arg(short, long, default_value_t = false)]
        json: bool,
        #[arg(long = "tenant-id", num_args = 0..)]
        tenant_ids: Vec<TenantShardId>,
-        #[arg(long, default_value = None)]
-        /// For safekeeper node_kind only, points to db with debug dump
-        dump_db_connstr: Option<String>,
-        /// For safekeeper node_kind only, table in the db with debug dump
-        #[arg(long, default_value = None)]
-        dump_db_table: Option<String>,
-    },
-    TenantSnapshot {
-        #[arg(long = "tenant-id")]
-        tenant_id: TenantId,
-        #[arg(long = "concurrency", short = 'j', default_value_t = 8)]
-        concurrency: usize,
-        #[arg(short, long)]
-        output_path: Utf8PathBuf,
    },
 }

@@ -74,7 +50,6 @@ async fn main() -> anyhow::Result<()> {
        Command::ScanMetadata { .. } => "scan",
        Command::FindGarbage { .. } => "find-garbage",
        Command::PurgeGarbage { .. } => "purge-garbage",
-        Command::TenantSnapshot { .. } => "tenant-snapshot",
    };
    let _guard = init_logging(&format!(
        "{}_{}_{}_{}.log",
@@ -85,75 +60,33 @@ async fn main() -> anyhow::Result<()> {
    ));

    match cli.command {
-        Command::ScanMetadata {
-            json,
-            tenant_ids,
-            node_kind,
-            dump_db_connstr,
-            dump_db_table,
-        } => {
-            if let NodeKind::Safekeeper = node_kind {
-                let dump_db_connstr =
-                    dump_db_connstr.ok_or(anyhow::anyhow!("dump_db_connstr not specified"))?;
-                let dump_db_table =
-                    dump_db_table.ok_or(anyhow::anyhow!("dump_db_table not specified"))?;
-
-                let summary = scan_safekeeper_metadata(
-                    bucket_config.clone(),
-                    tenant_ids.iter().map(|tshid| tshid.tenant_id).collect(),
-                    dump_db_connstr,
-                    dump_db_table,
-                )
-                .await?;
-                if json {
-                    println!("{}", serde_json::to_string(&summary).unwrap())
-                } else {
-                    println!("{}", summary.summary_string());
+        Command::ScanMetadata { json, tenant_ids } => {
+            match scan_metadata(bucket_config.clone(), tenant_ids).await {
+                Err(e) => {
+                    tracing::error!("Failed: {e}");
+                    Err(e)
                }
-                if summary.is_fatal() {
-                    bail!("Fatal scrub errors detected");
-                }
-                if summary.is_empty() {
-                    // Strictly speaking an empty bucket is a valid bucket, but if someone ran the
-                    // scrubber they were likely expecting to scan something, and if we see no timelines
-                    // at all then it's likely due to some configuration issues like a bad prefix
-                    bail!(
-                        "No timelines found in bucket {} prefix {}",
-                        bucket_config.bucket,
-                        bucket_config
-                            .prefix_in_bucket
-                            .unwrap_or("<none>".to_string())
-                    );
-                }
-                Ok(())
-            } else {
-                match scan_metadata(bucket_config.clone(), tenant_ids).await {
-                    Err(e) => {
-                        tracing::error!("Failed: {e}");
-                        Err(e)
+                Ok(summary) => {
+                    if json {
+                        println!("{}", serde_json::to_string(&summary).unwrap())
+                    } else {
+                        println!("{}", summary.summary_string());
                    }
-                    Ok(summary) => {
-                        if json {
-                            println!("{}", serde_json::to_string(&summary).unwrap())
-                        } else {
-                            println!("{}", summary.summary_string());
-                        }
-                        if summary.is_fatal() {
-                            Err(anyhow::anyhow!("Fatal scrub errors detected"))
-                        } else if summary.is_empty() {
-                            // Strictly speaking an empty bucket is a valid bucket, but if someone ran the
-                            // scrubber they were likely expecting to scan something, and if we see no timelines
-                            // at all then it's likely due to some configuration issues like a bad prefix
-                            Err(anyhow::anyhow!(
-                                "No timelines found in bucket {} prefix {}",
-                                bucket_config.bucket,
-                                bucket_config
-                                    .prefix_in_bucket
-                                    .unwrap_or("<none>".to_string())
-                            ))
-                        } else {
-                            Ok(())
-                        }
+                    if summary.is_fatal() {
+                        Err(anyhow::anyhow!("Fatal scrub errors detected"))
+                    } else if summary.is_empty() {
+                        // Strictly speaking an empty bucket is a valid bucket, but if someone ran the
+                        // scrubber they were likely expecting to scan something, and if we see no timelines
+                        // at all then it's likely due to some configuration issues like a bad prefix
+                        Err(anyhow::anyhow!(
+                            "No timelines found in bucket {} prefix {}",
+                            bucket_config.bucket,
+                            bucket_config
+                                .prefix_in_bucket
+                                .unwrap_or("<none>".to_string())
+                        ))
+                    } else {
+                        Ok(())
                    }
                }
            }
@@ -169,14 +102,5 @@ async fn main() -> anyhow::Result<()> {
        Command::PurgeGarbage { input_path, mode } => {
            purge_garbage(input_path, mode, !cli.delete).await
        }
-        Command::TenantSnapshot {
-            tenant_id,
-            output_path,
-            concurrency,
-        } => {
-            let downloader =
-                SnapshotDownloader::new(bucket_config, tenant_id, output_path, concurrency)?;
-            downloader.download().await
-        }
    }
 }
--- a/s3_scrubber/src/metadata_stream.rs
+++ b/s3_scrubber/src/metadata_stream.rs
@@ -5,7 +5,7 @@ use tokio_stream::Stream;

 use crate::{list_objects_with_retries, RootTarget, S3Target, TenantShardTimelineId};
 use pageserver_api::shard::TenantShardId;
-use utils::id::{TenantId, TimelineId};
+use utils::id::TimelineId;

 /// Given an S3 bucket, output a stream of TenantIds discovered via ListObjectsv2
 pub fn stream_tenants<'a>(
@@ -45,62 +45,6 @@ pub fn stream_tenants<'a>(
    }
 }

-pub async fn stream_tenant_shards<'a>(
-    s3_client: &'a Client,
-    target: &'a RootTarget,
-    tenant_id: TenantId,
-) -> anyhow::Result<impl Stream<Item = Result<TenantShardId, anyhow::Error>> + 'a> {
-    let mut tenant_shard_ids: Vec<Result<TenantShardId, anyhow::Error>> = Vec::new();
-    let mut continuation_token = None;
-    let shards_target = target.tenant_shards_prefix(&tenant_id);
-
-    loop {
-        tracing::info!("Listing in {}", shards_target.prefix_in_bucket);
-        let fetch_response =
-            list_objects_with_retries(s3_client, &shards_target, continuation_token.clone()).await;
-        let fetch_response = match fetch_response {
-            Err(e) => {
-                tenant_shard_ids.push(Err(e));
-                break;
-            }
-            Ok(r) => r,
-        };
-
-        let new_entry_ids = fetch_response
-            .common_prefixes()
-            .iter()
-            .filter_map(|prefix| prefix.prefix())
-            .filter_map(|prefix| -> Option<&str> {
-                prefix
-                    .strip_prefix(&target.tenants_root().prefix_in_bucket)?
-                    .strip_suffix('/')
-            })
-            .map(|entry_id_str| {
-                let first_part = entry_id_str.split('/').next().unwrap();
-
-                first_part
-                    .parse::<TenantShardId>()
-                    .with_context(|| format!("Incorrect entry id str: {first_part}"))
-            });
-
-        for i in new_entry_ids {
-            tenant_shard_ids.push(i);
-        }
-
-        match fetch_response.next_continuation_token {
-            Some(new_token) => continuation_token = Some(new_token),
-            None => break,
-        }
-    }
-
-    Ok(stream! {
-        for i in tenant_shard_ids {
-            let id = i?;
-            yield Ok(id);
-        }
-    })
-}
-
 /// Given a TenantShardId, output a stream of the timelines within that tenant, discovered
 /// using ListObjectsv2.  The listing is done before the stream is built, so that this
 /// function can be used to generate concurrency on a stream using buffer_unordered.
@@ -114,7 +58,7 @@ pub async fn stream_tenant_timelines<'a>(
    let timelines_target = target.timelines_root(&tenant);

    loop {
-        tracing::debug!("Listing in {}", tenant);
+        tracing::info!("Listing in {}", tenant);
        let fetch_response =
            list_objects_with_retries(s3_client, &timelines_target, continuation_token.clone())
                .await;
@@ -151,7 +95,7 @@ pub async fn stream_tenant_timelines<'a>(
        }
    }

-    tracing::debug!("Yielding for {}", tenant);
+    tracing::info!("Yielding for {}", tenant);
    Ok(stream! {
        for i in timeline_ids {
            let id = i?;
--- a/s3_scrubber/src/scan_pageserver_metadata.rs
+++ b/s3_scrubber/src/scan_pageserver_metadata.rs
--- a/s3_scrubber/src/scan_safekeeper_metadata.rs
+++ b/s3_scrubber/src/scan_safekeeper_metadata.rs
@@ -1,236 +0,0 @@
-use std::{collections::HashSet, str::FromStr};
-
-use aws_sdk_s3::Client;
-use futures::stream::{StreamExt, TryStreamExt};
-use pageserver_api::shard::TenantShardId;
-use postgres_ffi::{XLogFileName, PG_TLI};
-use serde::Serialize;
-use tokio_postgres::types::PgLsn;
-use tracing::{error, info, trace};
-use utils::{
-    id::{TenantId, TenantTimelineId, TimelineId},
-    lsn::Lsn,
-};
-
-use crate::{
-    cloud_admin_api::CloudAdminApiClient, init_remote, metadata_stream::stream_listing,
-    BucketConfig, ConsoleConfig, NodeKind, RootTarget, TenantShardTimelineId,
-};
-
-/// Generally we should ask safekeepers, but so far we use everywhere default 16MB.
-const WAL_SEGSIZE: usize = 16 * 1024 * 1024;
-
-#[derive(Serialize)]
-pub struct MetadataSummary {
-    timeline_count: usize,
-    with_errors: HashSet<TenantTimelineId>,
-    deleted_count: usize,
-}
-
-impl MetadataSummary {
-    fn new() -> Self {
-        Self {
-            timeline_count: 0,
-            with_errors: HashSet::new(),
-            deleted_count: 0,
-        }
-    }
-
-    pub fn summary_string(&self) -> String {
-        format!(
-            "timeline_count: {}, with_errors: {}",
-            self.timeline_count,
-            self.with_errors.len()
-        )
-    }
-
-    pub fn is_empty(&self) -> bool {
-        self.timeline_count == 0
-    }
-
-    pub fn is_fatal(&self) -> bool {
-        !self.with_errors.is_empty()
-    }
-}
-
-/// Scan the safekeeper metadata in an S3 bucket, reporting errors and
-/// statistics.
-///
-/// It works by listing timelines along with timeline_start_lsn and backup_lsn
-/// in debug dump in dump_db_table and verifying its s3 contents. If some WAL
-/// segments are missing, before complaining control plane is queried to check if
-/// the project wasn't deleted in the meanwhile.
-pub async fn scan_safekeeper_metadata(
-    bucket_config: BucketConfig,
-    tenant_ids: Vec<TenantId>,
-    dump_db_connstr: String,
-    dump_db_table: String,
-) -> anyhow::Result<MetadataSummary> {
-    info!(
-        "checking bucket {}, region {}, dump_db_table {}",
-        bucket_config.bucket, bucket_config.region, dump_db_table
-    );
-    // Use the native TLS implementation (Neon requires TLS)
-    let tls_connector =
-        postgres_native_tls::MakeTlsConnector::new(native_tls::TlsConnector::new().unwrap());
-    let (client, connection) = tokio_postgres::connect(&dump_db_connstr, tls_connector).await?;
-    // The connection object performs the actual communication with the database,
-    // so spawn it off to run on its own.
-    tokio::spawn(async move {
-        if let Err(e) = connection.await {
-            eprintln!("connection error: {}", e);
-        }
-    });
-
-    let tenant_filter_clause = if !tenant_ids.is_empty() {
-        format!(
-            "and tenant_id in ({})",
-            tenant_ids
-                .iter()
-                .map(|t| format!("'{}'", t))
-                .collect::<Vec<_>>()
-                .join(", ")
-        )
-    } else {
-        "".to_owned()
-    };
-    let query = format!(
-        "select tenant_id, timeline_id, min(timeline_start_lsn), max(backup_lsn) from \"{}\" where not is_cancelled {} group by tenant_id, timeline_id;",
-        dump_db_table, tenant_filter_clause,
-    );
-    info!("query is {}", query);
-    let timelines = client.query(&query, &[]).await?;
-    info!("loaded {} timelines", timelines.len());
-
-    let (s3_client, target) = init_remote(bucket_config, NodeKind::Safekeeper)?;
-    let console_config = ConsoleConfig::from_env()?;
-    let cloud_admin_api_client = CloudAdminApiClient::new(console_config);
-
-    let checks = futures::stream::iter(timelines.iter().map(Ok)).map_ok(|row| {
-        let tenant_id = TenantId::from_str(row.get(0)).expect("failed to parse tenant_id");
-        let timeline_id = TimelineId::from_str(row.get(1)).expect("failed to parse tenant_id");
-        let timeline_start_lsn_pg: PgLsn = row.get(2);
-        let timeline_start_lsn: Lsn = Lsn(u64::from(timeline_start_lsn_pg));
-        let backup_lsn_pg: PgLsn = row.get(3);
-        let backup_lsn: Lsn = Lsn(u64::from(backup_lsn_pg));
-        let ttid = TenantTimelineId::new(tenant_id, timeline_id);
-        check_timeline(
-            &s3_client,
-            &target,
-            &cloud_admin_api_client,
-            ttid,
-            timeline_start_lsn,
-            backup_lsn,
-        )
-    });
-    // Run multiple check_timeline's concurrently.
-    const CONCURRENCY: usize = 32;
-    let mut timelines = checks.try_buffered(CONCURRENCY);
-
-    let mut summary = MetadataSummary::new();
-    while let Some(r) = timelines.next().await {
-        let res = r?;
-        summary.timeline_count += 1;
-        if !res.is_ok {
-            summary.with_errors.insert(res.ttid);
-        }
-        if res.is_deleted {
-            summary.deleted_count += 1;
-        }
-    }
-
-    Ok(summary)
-}
-
-struct TimelineCheckResult {
-    ttid: TenantTimelineId,
-    is_ok: bool,
-    is_deleted: bool, // timeline is deleted in cplane
-}
-
-/// List s3 and check that is has all expected WAL for the ttid. Consistency
-/// errors are logged to stderr; returns Ok(true) if timeline is consistent,
-/// Ok(false) if not, Err if failed to check.
-async fn check_timeline(
-    s3_client: &Client,
-    root: &RootTarget,
-    api_client: &CloudAdminApiClient,
-    ttid: TenantTimelineId,
-    timeline_start_lsn: Lsn,
-    backup_lsn: Lsn,
-) -> anyhow::Result<TimelineCheckResult> {
-    trace!(
-        "checking ttid {}, should contain WAL [{}-{}]",
-        ttid,
-        timeline_start_lsn,
-        backup_lsn
-    );
-    // calculate expected segfiles
-    let expected_first_segno = timeline_start_lsn.segment_number(WAL_SEGSIZE);
-    let expected_last_segno = backup_lsn.segment_number(WAL_SEGSIZE);
-    let mut expected_segfiles: HashSet<String> = HashSet::from_iter(
-        (expected_first_segno..expected_last_segno)
-            .map(|segno| XLogFileName(PG_TLI, segno, WAL_SEGSIZE)),
-    );
-    let expected_files_num = expected_segfiles.len();
-    trace!("expecting {} files", expected_segfiles.len(),);
-
-    // now list s3 and check if it misses something
-    let ttshid =
-        TenantShardTimelineId::new(TenantShardId::unsharded(ttid.tenant_id), ttid.timeline_id);
-    let mut timeline_dir_target = root.timeline_root(&ttshid);
-    // stream_listing yields only common_prefixes if delimiter is not empty, but
-    // we need files, so unset it.
-    timeline_dir_target.delimiter = String::new();
-
-    let mut stream = std::pin::pin!(stream_listing(s3_client, &timeline_dir_target));
-    while let Some(obj) = stream.next().await {
-        let obj = obj?;
-        let key = obj.key();
-
-        let seg_name = key
-            .strip_prefix(&timeline_dir_target.prefix_in_bucket)
-            .expect("failed to extract segment name");
-        expected_segfiles.remove(seg_name);
-    }
-    if !expected_segfiles.is_empty() {
-        // Before complaining check cplane, probably timeline is already deleted.
-        let bdata = api_client
-            .find_timeline_branch(ttid.tenant_id, ttid.timeline_id)
-            .await?;
-        let deleted = match bdata {
-            Some(bdata) => bdata.deleted,
-            None => {
-                // note: should be careful with selecting proper cplane address
-                info!("ttid {} not found, assuming it is deleted", ttid);
-                true
-            }
-        };
-        if deleted {
-            // ok, branch is deleted
-            return Ok(TimelineCheckResult {
-                ttid,
-                is_ok: true,
-                is_deleted: true,
-            });
-        }
-        error!(
-            "ttid {}: missing {} files out of {}, timeline_start_lsn {}, wal_backup_lsn {}",
-            ttid,
-            expected_segfiles.len(),
-            expected_files_num,
-            timeline_start_lsn,
-            backup_lsn,
-        );
-        return Ok(TimelineCheckResult {
-            ttid,
-            is_ok: false,
-            is_deleted: false,
-        });
-    }
-    Ok(TimelineCheckResult {
-        ttid,
-        is_ok: true,
-        is_deleted: false,
-    })
-}
--- a/s3_scrubber/src/tenant_snapshot.rs
+++ b/s3_scrubber/src/tenant_snapshot.rs
@@ -1,293 +0,0 @@
-use std::collections::HashMap;
-use std::sync::Arc;
-
-use crate::checks::{list_timeline_blobs, BlobDataParseResult, S3TimelineBlobData};
-use crate::metadata_stream::{stream_tenant_shards, stream_tenant_timelines};
-use crate::{
-    download_object_to_file, init_remote, BucketConfig, NodeKind, RootTarget, TenantShardTimelineId,
-};
-use anyhow::Context;
-use async_stream::stream;
-use aws_sdk_s3::Client;
-use camino::Utf8PathBuf;
-use futures::{StreamExt, TryStreamExt};
-use pageserver::tenant::remote_timeline_client::index::IndexLayerMetadata;
-use pageserver::tenant::storage_layer::LayerFileName;
-use pageserver::tenant::IndexPart;
-use pageserver_api::shard::TenantShardId;
-use utils::generation::Generation;
-use utils::id::TenantId;
-
-pub struct SnapshotDownloader {
-    s3_client: Arc<Client>,
-    s3_root: RootTarget,
-    bucket_config: BucketConfig,
-    tenant_id: TenantId,
-    output_path: Utf8PathBuf,
-    concurrency: usize,
-}
-
-impl SnapshotDownloader {
-    pub fn new(
-        bucket_config: BucketConfig,
-        tenant_id: TenantId,
-        output_path: Utf8PathBuf,
-        concurrency: usize,
-    ) -> anyhow::Result<Self> {
-        let (s3_client, s3_root) = init_remote(bucket_config.clone(), NodeKind::Pageserver)?;
-        Ok(Self {
-            s3_client,
-            s3_root,
-            bucket_config,
-            tenant_id,
-            output_path,
-            concurrency,
-        })
-    }
-
-    async fn download_layer(
-        &self,
-        ttid: TenantShardTimelineId,
-        layer_name: LayerFileName,
-        layer_metadata: IndexLayerMetadata,
-    ) -> anyhow::Result<(LayerFileName, IndexLayerMetadata)> {
-        // Note this is local as in a local copy of S3 data, not local as in the pageserver's local format.  They use
-        // different layer names (remote-style has the generation suffix)
-        let local_path = self.output_path.join(format!(
-            "{}/timelines/{}/{}{}",
-            ttid.tenant_shard_id,
-            ttid.timeline_id,
-            layer_name.file_name(),
-            layer_metadata.generation.get_suffix()
-        ));
-
-        // We should only be called for layers that are owned by the input TTID
-        assert_eq!(layer_metadata.shard, ttid.tenant_shard_id.to_index());
-
-        // Assumption: we always write layer files atomically, and layer files are immutable.  Therefore if the file
-        // already exists on local disk, we assume it is fully correct and skip it.
-        if tokio::fs::try_exists(&local_path).await? {
-            tracing::debug!("{} already exists", local_path);
-            return Ok((layer_name, layer_metadata));
-        } else {
-            tracing::debug!("{} requires download...", local_path);
-
-            let timeline_root = self.s3_root.timeline_root(&ttid);
-            let remote_layer_path = format!(
-                "{}{}{}",
-                timeline_root.prefix_in_bucket,
-                layer_name.file_name(),
-                layer_metadata.generation.get_suffix()
-            );
-
-            // List versions: the object might be deleted.
-            let versions = self
-                .s3_client
-                .list_object_versions()
-                .bucket(self.bucket_config.bucket.clone())
-                .prefix(&remote_layer_path)
-                .send()
-                .await?;
-            let Some(version) = versions.versions.as_ref().and_then(|v| v.first()) else {
-                return Err(anyhow::anyhow!("No versions found for {remote_layer_path}"));
-            };
-            download_object_to_file(
-                &self.s3_client,
-                &self.bucket_config.bucket,
-                &remote_layer_path,
-                version.version_id.as_deref(),
-                &local_path,
-            )
-            .await?;
-
-            tracing::debug!("Downloaded successfully to {local_path}");
-        }
-
-        Ok((layer_name, layer_metadata))
-    }
-
-    /// Download many layers belonging to the same TTID, with some concurrency
-    async fn download_layers(
-        &self,
-        ttid: TenantShardTimelineId,
-        layers: Vec<(LayerFileName, IndexLayerMetadata)>,
-    ) -> anyhow::Result<()> {
-        let layer_count = layers.len();
-        tracing::info!("Downloading {} layers for timeline {ttid}...", layer_count);
-        let layers_stream = stream! {
-            for (layer_name, layer_metadata) in layers {
-                yield self.download_layer(ttid, layer_name, layer_metadata);
-            }
-        };
-
-        tokio::fs::create_dir_all(self.output_path.join(format!(
-            "{}/timelines/{}",
-            ttid.tenant_shard_id, ttid.timeline_id
-        )))
-        .await?;
-
-        let layer_results = layers_stream.buffered(self.concurrency);
-        let mut layer_results = std::pin::pin!(layer_results);
-
-        let mut err = None;
-        let mut download_count = 0;
-        while let Some(i) = layer_results.next().await {
-            download_count += 1;
-            match i {
-                Ok((layer_name, layer_metadata)) => {
-                    tracing::info!(
-                        "[{download_count}/{layer_count}] OK: {} bytes {ttid} {}",
-                        layer_metadata.file_size,
-                        layer_name.file_name()
-                    );
-                }
-                Err(e) => {
-                    // Warn and continue: we will download what we can
-                    tracing::warn!("Download error: {e}");
-                    err = Some(e);
-                }
-            }
-        }
-        if let Some(e) = err {
-            tracing::warn!("Some errors occurred downloading {ttid} layers, last error: {e}");
-            Err(e)
-        } else {
-            Ok(())
-        }
-    }
-
-    async fn download_timeline(
-        &self,
-        ttid: TenantShardTimelineId,
-        index_part: IndexPart,
-        index_part_generation: Generation,
-        ancestor_layers: &mut HashMap<
-            TenantShardTimelineId,
-            HashMap<LayerFileName, IndexLayerMetadata>,
-        >,
-    ) -> anyhow::Result<()> {
-        let index_bytes = serde_json::to_string(&index_part).unwrap();
-
-        let layers = index_part
-            .layer_metadata
-            .into_iter()
-            .filter_map(|(layer_name, layer_metadata)| {
-                if layer_metadata.shard.shard_count != ttid.tenant_shard_id.shard_count {
-                    // Accumulate ancestor layers for later download
-                    let ancestor_ttid = TenantShardTimelineId::new(
-                        TenantShardId {
-                            tenant_id: ttid.tenant_shard_id.tenant_id,
-                            shard_number: layer_metadata.shard.shard_number,
-                            shard_count: layer_metadata.shard.shard_count,
-                        },
-                        ttid.timeline_id,
-                    );
-                    let ancestor_ttid_layers = ancestor_layers.entry(ancestor_ttid).or_default();
-                    use std::collections::hash_map::Entry;
-                    match ancestor_ttid_layers.entry(layer_name) {
-                        Entry::Occupied(entry) => {
-                            // Descendent shards that reference a layer from an ancestor should always have matching metadata,
-                            // as their siblings, because it is read atomically during a shard split.
-                            assert_eq!(entry.get(), &layer_metadata);
-                        }
-                        Entry::Vacant(entry) => {
-                            entry.insert(layer_metadata);
-                        }
-                    }
-                    None
-                } else {
-                    Some((layer_name, layer_metadata))
-                }
-            })
-            .collect();
-
-        let download_result = self.download_layers(ttid, layers).await;
-
-        // Write index last, once all the layers it references are downloaded
-        let local_index_path = self.output_path.join(format!(
-            "{}/timelines/{}/index_part.json{}",
-            ttid.tenant_shard_id,
-            ttid.timeline_id,
-            index_part_generation.get_suffix()
-        ));
-        tokio::fs::write(&local_index_path, index_bytes)
-            .await
-            .context("writing index")?;
-
-        download_result
-    }
-
-    pub async fn download(&self) -> anyhow::Result<()> {
-        let (s3_client, target) = init_remote(self.bucket_config.clone(), NodeKind::Pageserver)?;
-
-        // Generate a stream of TenantShardId
-        let shards = stream_tenant_shards(&s3_client, &target, self.tenant_id).await?;
-        let shards: Vec<TenantShardId> = shards.try_collect().await?;
-
-        // Only read from shards that have the highest count: avoids redundantly downloading
-        // from ancestor shards.
-        let Some(shard_count) = shards.iter().map(|s| s.shard_count).max() else {
-            anyhow::bail!("No shards found");
-        };
-
-        // We will build a collection of layers in anccestor shards to download (this will only
-        // happen if this tenant has been split at some point)
-        let mut ancestor_layers: HashMap<
-            TenantShardTimelineId,
-            HashMap<LayerFileName, IndexLayerMetadata>,
-        > = Default::default();
-
-        for shard in shards.into_iter().filter(|s| s.shard_count == shard_count) {
-            // Generate a stream of TenantTimelineId
-            let timelines = stream_tenant_timelines(&s3_client, &self.s3_root, shard).await?;
-
-            // Generate a stream of S3TimelineBlobData
-            async fn load_timeline_index(
-                s3_client: &Client,
-                target: &RootTarget,
-                ttid: TenantShardTimelineId,
-            ) -> anyhow::Result<(TenantShardTimelineId, S3TimelineBlobData)> {
-                let data = list_timeline_blobs(s3_client, ttid, target).await?;
-                Ok((ttid, data))
-            }
-            let timelines = timelines.map_ok(|ttid| load_timeline_index(&s3_client, &target, ttid));
-            let mut timelines = std::pin::pin!(timelines.try_buffered(8));
-
-            while let Some(i) = timelines.next().await {
-                let (ttid, data) = i?;
-                match data.blob_data {
-                    BlobDataParseResult::Parsed {
-                        index_part,
-                        index_part_generation,
-                        s3_layers: _,
-                    } => {
-                        self.download_timeline(
-                            ttid,
-                            index_part,
-                            index_part_generation,
-                            &mut ancestor_layers,
-                        )
-                        .await
-                        .context("Downloading timeline")?;
-                    }
-                    BlobDataParseResult::Relic => {}
-                    BlobDataParseResult::Incorrect(_) => {
-                        tracing::error!("Bad metadata in timeline {ttid}");
-                    }
-                };
-            }
-        }
-
-        for (ttid, layers) in ancestor_layers.into_iter() {
-            tracing::info!(
-                "Downloading {} layers from ancvestor timeline {ttid}...",
-                layers.len()
-            );
-
-            self.download_layers(ttid, layers.into_iter().collect())
-                .await?;
-        }
-
-        Ok(())
-    }
-}
--- a/safekeeper/src/bin/safekeeper.rs
+++ b/safekeeper/src/bin/safekeeper.rs
@@ -177,10 +177,6 @@ struct Args {
    /// Controls how long backup will wait until uploading the partial segment.
    #[arg(long, value_parser = humantime::parse_duration, default_value = DEFAULT_PARTIAL_BACKUP_TIMEOUT, verbatim_doc_comment)]
    partial_backup_timeout: Duration,
-    /// Disable task to push messages to broker every second. Supposed to
-    /// be used in tests.
-    #[arg(long)]
-    disable_periodic_broker_push: bool,
 }

 // Like PathBufValueParser, but allows empty string.
@@ -313,7 +309,6 @@ async fn main() -> anyhow::Result<()> {
        walsenders_keep_horizon: args.walsenders_keep_horizon,
        partial_backup_enabled: args.partial_backup_enabled,
        partial_backup_timeout: args.partial_backup_timeout,
-        disable_periodic_broker_push: args.disable_periodic_broker_push,
    };

    // initialize sentry if SENTRY_DSN is provided
--- a/safekeeper/src/broker.rs
+++ b/safekeeper/src/broker.rs
@@ -10,20 +10,11 @@ use anyhow::Result;
 use storage_broker::parse_proto_ttid;

 use storage_broker::proto::subscribe_safekeeper_info_request::SubscriptionKey as ProtoSubscriptionKey;
-use storage_broker::proto::FilterTenantTimelineId;
-use storage_broker::proto::MessageType;
-use storage_broker::proto::SafekeeperDiscoveryResponse;
-use storage_broker::proto::SubscribeByFilterRequest;
 use storage_broker::proto::SubscribeSafekeeperInfoRequest;
-use storage_broker::proto::TypeSubscription;
-use storage_broker::proto::TypedMessage;
 use storage_broker::Request;

-use std::sync::atomic::AtomicU64;
-use std::sync::Arc;
 use std::time::Duration;
 use std::time::Instant;
-use std::time::UNIX_EPOCH;
 use tokio::task::JoinHandle;
 use tokio::time::sleep;
 use tracing::*;
@@ -40,12 +31,6 @@ const PUSH_INTERVAL_MSEC: u64 = 1000;

 /// Push once in a while data about all active timelines to the broker.
 async fn push_loop(conf: SafeKeeperConf) -> anyhow::Result<()> {
-    if conf.disable_periodic_broker_push {
-        info!("broker push_loop is disabled, doing nothing...");
-        futures::future::pending::<()>().await; // sleep forever
-        return Ok(());
-    }
-
    let mut client =
        storage_broker::connect(conf.broker_endpoint.clone(), conf.broker_keepalive_interval)?;
    let push_interval = Duration::from_millis(PUSH_INTERVAL_MSEC);
@@ -90,7 +75,7 @@ async fn push_loop(conf: SafeKeeperConf) -> anyhow::Result<()> {
 }

 /// Subscribe and fetch all the interesting data from the broker.
-async fn pull_loop(conf: SafeKeeperConf, stats: Arc<BrokerStats>) -> Result<()> {
+async fn pull_loop(conf: SafeKeeperConf) -> Result<()> {
    let mut client = storage_broker::connect(conf.broker_endpoint, conf.broker_keepalive_interval)?;

    // TODO: subscribe only to local timelines instead of all
@@ -109,8 +94,6 @@ async fn pull_loop(conf: SafeKeeperConf, stats: Arc<BrokerStats>) -> Result<()>
    let err_counter = BROKER_PULLED_UPDATES.with_label_values(&["error"]);

    while let Some(msg) = stream.message().await? {
-        stats.update_pulled();
-
        let proto_ttid = msg
            .tenant_timeline_id
            .as_ref()
@@ -136,93 +119,12 @@ async fn pull_loop(conf: SafeKeeperConf, stats: Arc<BrokerStats>) -> Result<()>
    bail!("end of stream");
 }

-/// Process incoming discover requests. This is done in a separate task to avoid
-/// interfering with the normal pull/push loops.
-async fn discover_loop(conf: SafeKeeperConf, stats: Arc<BrokerStats>) -> Result<()> {
-    let mut client =
-        storage_broker::connect(conf.broker_endpoint.clone(), conf.broker_keepalive_interval)?;
-
-    let request = SubscribeByFilterRequest {
-        types: vec![TypeSubscription {
-            r#type: MessageType::SafekeeperDiscoveryRequest as i32,
-        }],
-        tenant_timeline_id: Some(FilterTenantTimelineId {
-            enabled: false,
-            tenant_timeline_id: None,
-        }),
-    };
-
-    let mut stream = client
-        .subscribe_by_filter(request)
-        .await
-        .context("subscribe_by_filter request failed")?
-        .into_inner();
-
-    let discover_counter = BROKER_PULLED_UPDATES.with_label_values(&["discover"]);
-
-    while let Some(typed_msg) = stream.message().await? {
-        stats.update_pulled();
-
-        match typed_msg.r#type() {
-            MessageType::SafekeeperDiscoveryRequest => {
-                let msg = typed_msg
-                    .safekeeper_discovery_request
-                    .expect("proto type mismatch from broker message");
-
-                let proto_ttid = msg
-                    .tenant_timeline_id
-                    .as_ref()
-                    .ok_or_else(|| anyhow!("missing tenant_timeline_id"))?;
-                let ttid = parse_proto_ttid(proto_ttid)?;
-                if let Ok(tli) = GlobalTimelines::get(ttid) {
-                    // we received a discovery request for a timeline we know about
-                    discover_counter.inc();
-
-                    // create and reply with discovery response
-                    let sk_info = tli.get_safekeeper_info(&conf).await;
-                    let response = SafekeeperDiscoveryResponse {
-                        safekeeper_id: sk_info.safekeeper_id,
-                        tenant_timeline_id: sk_info.tenant_timeline_id,
-                        commit_lsn: sk_info.commit_lsn,
-                        safekeeper_connstr: sk_info.safekeeper_connstr,
-                        availability_zone: sk_info.availability_zone,
-                    };
-
-                    // note this is a blocking call
-                    client
-                        .publish_one(TypedMessage {
-                            r#type: MessageType::SafekeeperDiscoveryResponse as i32,
-                            safekeeper_timeline_info: None,
-                            safekeeper_discovery_request: None,
-                            safekeeper_discovery_response: Some(response),
-                        })
-                        .await?;
-                }
-            }
-
-            _ => {
-                warn!(
-                    "unexpected message type i32 {}, {:?}",
-                    typed_msg.r#type,
-                    typed_msg.r#type()
-                );
-            }
-        }
-    }
-    bail!("end of stream");
-}
-
 pub async fn task_main(conf: SafeKeeperConf) -> anyhow::Result<()> {
    info!("started, broker endpoint {:?}", conf.broker_endpoint);

    let mut ticker = tokio::time::interval(Duration::from_millis(RETRY_INTERVAL_MSEC));
    let mut push_handle: Option<JoinHandle<Result<(), Error>>> = None;
    let mut pull_handle: Option<JoinHandle<Result<(), Error>>> = None;
-    let mut discover_handle: Option<JoinHandle<Result<(), Error>>> = None;
-
-    let stats = Arc::new(BrokerStats::new());
-    let stats_task = task_stats(stats.clone());
-    tokio::pin!(stats_task);

    // Selecting on JoinHandles requires some squats; is there a better way to
    // reap tasks individually?
@@ -251,77 +153,13 @@ pub async fn task_main(conf: SafeKeeperConf) -> anyhow::Result<()> {
                    };
                    pull_handle = None;
                },
-                res = async { discover_handle.as_mut().unwrap().await }, if discover_handle.is_some() => {
-                    // was it panic or normal error?
-                    match res {
-                        Ok(res_internal) => if let Err(err_inner) = res_internal {
-                            warn!("discover task failed: {:?}", err_inner);
-                        }
-                        Err(err_outer) => { warn!("discover task panicked: {:?}", err_outer) }
-                    };
-                    discover_handle = None;
-                },
                _ = ticker.tick() => {
                    if push_handle.is_none() {
                        push_handle = Some(tokio::spawn(push_loop(conf.clone())));
                    }
                    if pull_handle.is_none() {
-                        pull_handle = Some(tokio::spawn(pull_loop(conf.clone(), stats.clone())));
+                        pull_handle = Some(tokio::spawn(pull_loop(conf.clone())));
                    }
-                    if discover_handle.is_none() {
-                        discover_handle = Some(tokio::spawn(discover_loop(conf.clone(), stats.clone())));
-                    }
-                },
-                _ = &mut stats_task => {}
-        }
-    }
-}
-
-struct BrokerStats {
-    /// Timestamp of the last received message from the broker.
-    last_pulled_ts: AtomicU64,
-}
-
-impl BrokerStats {
-    fn new() -> Self {
-        BrokerStats {
-            last_pulled_ts: AtomicU64::new(0),
-        }
-    }
-
-    fn now_millis() -> u64 {
-        std::time::SystemTime::now()
-            .duration_since(UNIX_EPOCH)
-            .expect("time is before epoch")
-            .as_millis() as u64
-    }
-
-    /// Update last_pulled timestamp to current time.
-    fn update_pulled(&self) {
-        self.last_pulled_ts
-            .store(Self::now_millis(), std::sync::atomic::Ordering::Relaxed);
-    }
-}
-
-/// Periodically write to logs if there are issues with receiving data from the broker.
-async fn task_stats(stats: Arc<BrokerStats>) {
-    let warn_duration = Duration::from_secs(10);
-    let mut ticker = tokio::time::interval(warn_duration);
-
-    loop {
-        tokio::select! {
-            _ = ticker.tick() => {
-                let last_pulled = stats.last_pulled_ts.load(std::sync::atomic::Ordering::SeqCst);
-                if last_pulled == 0 {
-                    // no broker updates yet
-                    continue;
-                }
-
-                let now = BrokerStats::now_millis();
-                if now > last_pulled && now - last_pulled > warn_duration.as_millis() as u64 {
-                    let ts = chrono::NaiveDateTime::from_timestamp_millis(last_pulled as i64).expect("invalid timestamp");
-                    info!("no broker updates for some time, last update: {:?}", ts);
-                }
            }
        }
    }
--- a/safekeeper/src/lib.rs
+++ b/safekeeper/src/lib.rs
@@ -83,7 +83,6 @@ pub struct SafeKeeperConf {
    pub walsenders_keep_horizon: bool,
    pub partial_backup_enabled: bool,
    pub partial_backup_timeout: Duration,
-    pub disable_periodic_broker_push: bool,
 }

 impl SafeKeeperConf {
@@ -130,7 +129,6 @@ impl SafeKeeperConf {
            walsenders_keep_horizon: false,
            partial_backup_enabled: false,
            partial_backup_timeout: Duration::from_secs(0),
-            disable_periodic_broker_push: false,
        }
    }
 }
--- a/safekeeper/tests/walproposer_sim/safekeeper.rs
+++ b/safekeeper/tests/walproposer_sim/safekeeper.rs
@@ -178,7 +178,6 @@ pub fn run_server(os: NodeOs, disk: Arc<SafekeeperDisk>) -> Result<()> {
        walsenders_keep_horizon: false,
        partial_backup_enabled: false,
        partial_backup_timeout: Duration::from_secs(0),
-        disable_periodic_broker_push: false,
    };

    let mut global = GlobalMap::new(disk, conf.clone())?;
--- a/storage_broker/src/bin/storage_broker.rs
+++ b/storage_broker/src/bin/storage_broker.rs
@@ -196,13 +196,8 @@ impl SubscriptionKey {

    /// Parse from FilterTenantTimelineId
    pub fn from_proto_filter_tenant_timeline_id(
-        opt: Option<&FilterTenantTimelineId>,
+        f: &FilterTenantTimelineId,
    ) -> Result<Self, Status> {
-        if opt.is_none() {
-            return Ok(SubscriptionKey::All);
-        }
-
-        let f = opt.unwrap();
        if !f.enabled {
            return Ok(SubscriptionKey::All);
        }
@@ -539,7 +534,10 @@ impl BrokerService for Broker {
            .remote_addr()
            .expect("TCPConnectInfo inserted by handler");
        let proto_filter = request.into_inner();
-        let ttid_filter = proto_filter.tenant_timeline_id.as_ref();
+        let ttid_filter = proto_filter
+            .tenant_timeline_id
+            .as_ref()
+            .ok_or_else(|| Status::new(Code::InvalidArgument, "missing tenant_timeline_id"))?;

        let sub_key = SubscriptionKey::from_proto_filter_tenant_timeline_id(ttid_filter)?;
        let types_set = proto_filter
--- a/storage_controller/src/compute_hook.rs
+++ b/storage_controller/src/compute_hook.rs
@@ -3,13 +3,11 @@ use std::{collections::HashMap, time::Duration};

 use control_plane::endpoint::{ComputeControlPlane, EndpointStatus};
 use control_plane::local_env::LocalEnv;
-use futures::StreamExt;
 use hyper::{Method, StatusCode};
 use pageserver_api::shard::{ShardCount, ShardNumber, ShardStripeSize, TenantShardId};
 use postgres_connection::parse_host_port;
 use serde::{Deserialize, Serialize};
 use tokio_util::sync::CancellationToken;
-use tracing::{info_span, Instrument};
 use utils::{
    backoff::{self},
    id::{NodeId, TenantId},
@@ -422,37 +420,48 @@ impl ComputeHook {
        .and_then(|x| x)
    }

-    /// Synchronous phase: update the per-tenant state for the next intended notification
-    fn notify_prepare(
+    /// Call this to notify the compute (postgres) tier of new pageservers to use
+    /// for a tenant.  notify() is called by each shard individually, and this function
+    /// will decide whether an update to the tenant is sent.  An update is sent on the
+    /// condition that:
+    /// - We know a pageserver for every shard.
+    /// - All the shards have the same shard_count (i.e. we are not mid-split)
+    ///
+    /// Cancellation token enables callers to drop out, e.g. if calling from a Reconciler
+    /// that is cancelled.
+    ///
+    /// This function is fallible, including in the case that the control plane is transiently
+    /// unavailable.  A limited number of retries are done internally to efficiently hide short unavailability
+    /// periods, but we don't retry forever.  The **caller** is responsible for handling failures and
+    /// calling again later, so that the compute is eventually notified of
+    /// the proper pageserver nodes for a tenant.
+    #[tracing::instrument(skip_all, fields(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), node_id))]
+    pub(super) async fn notify(
        &self,
        tenant_shard_id: TenantShardId,
        node_id: NodeId,
        stripe_size: ShardStripeSize,
-    ) -> MaybeSendResult {
-        let mut state_locked = self.state.lock().unwrap();
-
-        use std::collections::hash_map::Entry;
-        let tenant = match state_locked.entry(tenant_shard_id.tenant_id) {
-            Entry::Vacant(e) => e.insert(ComputeHookTenant::new(
-                tenant_shard_id,
-                stripe_size,
-                node_id,
-            )),
-            Entry::Occupied(e) => {
-                let tenant = e.into_mut();
-                tenant.update(tenant_shard_id, stripe_size, node_id);
-                tenant
-            }
-        };
-        tenant.maybe_send(tenant_shard_id.tenant_id, None)
-    }
-
-    async fn notify_execute(
-        &self,
-        maybe_send_result: MaybeSendResult,
-        tenant_shard_id: TenantShardId,
        cancel: &CancellationToken,
    ) -> Result<(), NotifyError> {
+        let maybe_send_result = {
+            let mut state_locked = self.state.lock().unwrap();
+
+            use std::collections::hash_map::Entry;
+            let tenant = match state_locked.entry(tenant_shard_id.tenant_id) {
+                Entry::Vacant(e) => e.insert(ComputeHookTenant::new(
+                    tenant_shard_id,
+                    stripe_size,
+                    node_id,
+                )),
+                Entry::Occupied(e) => {
+                    let tenant = e.into_mut();
+                    tenant.update(tenant_shard_id, stripe_size, node_id);
+                    tenant
+                }
+            };
+            tenant.maybe_send(tenant_shard_id.tenant_id, None)
+        };
+
        // Process result: we may get an update to send, or we may have to wait for a lock
        // before trying again.
        let (request, mut send_lock_guard) = match maybe_send_result {
@@ -460,12 +469,7 @@ impl ComputeHook {
                return Ok(());
            }
            MaybeSendResult::AwaitLock(send_lock) => {
-                let send_locked = tokio::select! {
-                    guard = send_lock.lock_owned() => {guard},
-                    _ = cancel.cancelled() => {
-                        return Err(NotifyError::ShuttingDown)
-                    }
-                };
+                let send_locked = send_lock.lock_owned().await;

                // Lock order: maybe_send is called within the `[Self::state]` lock, and takes the send lock, but here
                // we have acquired the send lock and take `[Self::state]` lock.  This is safe because maybe_send only uses
@@ -504,94 +508,6 @@ impl ComputeHook {
        }
        result
    }
-
-    /// Infallible synchronous fire-and-forget version of notify(), that sends its results to
-    /// a channel.  Something should consume the channel and arrange to try notifying again
-    /// if something failed.
-    pub(super) fn notify_background(
-        self: &Arc<Self>,
-        notifications: Vec<(TenantShardId, NodeId, ShardStripeSize)>,
-        result_tx: tokio::sync::mpsc::Sender<Result<(), (TenantShardId, NotifyError)>>,
-        cancel: &CancellationToken,
-    ) {
-        let mut maybe_sends = Vec::new();
-        for (tenant_shard_id, node_id, stripe_size) in notifications {
-            let maybe_send_result = self.notify_prepare(tenant_shard_id, node_id, stripe_size);
-            maybe_sends.push((tenant_shard_id, maybe_send_result))
-        }
-
-        let this = self.clone();
-        let cancel = cancel.clone();
-
-        tokio::task::spawn(async move {
-            // Construct an async stream of futures to invoke the compute notify function: we do this
-            // in order to subsequently use .buffered() on the stream to execute with bounded parallelism.  The
-            // ComputeHook semaphore already limits concurrency, but this way we avoid constructing+polling lots of futures which
-            // would mostly just be waiting on that semaphore.
-            let mut stream = futures::stream::iter(maybe_sends)
-                .map(|(tenant_shard_id, maybe_send_result)| {
-                    let this = this.clone();
-                    let cancel = cancel.clone();
-
-                    async move {
-                        this
-                            .notify_execute(maybe_send_result, tenant_shard_id, &cancel)
-                            .await.map_err(|e| (tenant_shard_id, e))
-                    }.instrument(info_span!(
-                        "notify_background", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug()
-                    ))
-                })
-                .buffered(API_CONCURRENCY);
-
-            loop {
-                tokio::select! {
-                    next = stream.next() => {
-                        match next {
-                            Some(r) => {
-                                result_tx.send(r).await.ok();
-                            },
-                            None => {
-                                tracing::info!("Finished sending background compute notifications");
-                                break;
-                            }
-                        }
-                    },
-                    _ = cancel.cancelled() => {
-                        tracing::info!("Shutdown while running background compute notifications");
-                        break;
-                    }
-                };
-            }
-        });
-    }
-
-    /// Call this to notify the compute (postgres) tier of new pageservers to use
-    /// for a tenant.  notify() is called by each shard individually, and this function
-    /// will decide whether an update to the tenant is sent.  An update is sent on the
-    /// condition that:
-    /// - We know a pageserver for every shard.
-    /// - All the shards have the same shard_count (i.e. we are not mid-split)
-    ///
-    /// Cancellation token enables callers to drop out, e.g. if calling from a Reconciler
-    /// that is cancelled.
-    ///
-    /// This function is fallible, including in the case that the control plane is transiently
-    /// unavailable.  A limited number of retries are done internally to efficiently hide short unavailability
-    /// periods, but we don't retry forever.  The **caller** is responsible for handling failures and
-    /// ensuring that they eventually call again to ensure that the compute is eventually notified of
-    /// the proper pageserver nodes for a tenant.
-    #[tracing::instrument(skip_all, fields(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), node_id))]
-    pub(super) async fn notify(
-        &self,
-        tenant_shard_id: TenantShardId,
-        node_id: NodeId,
-        stripe_size: ShardStripeSize,
-        cancel: &CancellationToken,
-    ) -> Result<(), NotifyError> {
-        let maybe_send_result = self.notify_prepare(tenant_shard_id, node_id, stripe_size);
-        self.notify_execute(maybe_send_result, tenant_shard_id, cancel)
-            .await
-    }
 }

 #[cfg(test)]
--- a/storage_controller/src/heartbeater.rs
+++ b/storage_controller/src/heartbeater.rs
@@ -184,19 +184,6 @@ impl HeartbeaterTask {
                }
            }
        }
-        tracing::info!(
-            "Heartbeat round complete for {} nodes, {} offline",
-            new_state.len(),
-            new_state
-                .values()
-                .filter(|s| match s {
-                    PageserverState::Available { .. } => {
-                        false
-                    }
-                    PageserverState::Offline => true,
-                })
-                .count()
-        );

        let mut deltas = Vec::new();
        let now = Instant::now();
--- a/storage_controller/src/http.rs
+++ b/storage_controller/src/http.rs
@@ -522,18 +522,6 @@ async fn handle_tenant_drop(req: Request<Body>) -> Result<Response<Body>, ApiErr
    json_response(StatusCode::OK, state.service.tenant_drop(tenant_id).await?)
 }

-async fn handle_tenant_import(req: Request<Body>) -> Result<Response<Body>, ApiError> {
-    let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
-    check_permissions(&req, Scope::PageServerApi)?;
-
-    let state = get_state(&req);
-
-    json_response(
-        StatusCode::OK,
-        state.service.tenant_import(tenant_id).await?,
-    )
-}
-
 async fn handle_tenants_dump(req: Request<Body>) -> Result<Response<Body>, ApiError> {
    check_permissions(&req, Scope::Admin)?;

@@ -771,13 +759,6 @@ pub fn make_router(
        .post("/debug/v1/node/:node_id/drop", |r| {
            named_request_span(r, handle_node_drop, RequestName("debug_v1_node_drop"))
        })
-        .post("/debug/v1/tenant/:tenant_id/import", |r| {
-            named_request_span(
-                r,
-                handle_tenant_import,
-                RequestName("debug_v1_tenant_import"),
-            )
-        })
        .get("/debug/v1/tenant", |r| {
            named_request_span(r, handle_tenants_dump, RequestName("debug_v1_tenant"))
        })
--- a/storage_controller/src/pageserver_client.rs
+++ b/storage_controller/src/pageserver_client.rs
@@ -1,14 +1,13 @@
 use pageserver_api::{
    models::{
        LocationConfig, LocationConfigListResponse, PageserverUtilization, SecondaryProgress,
-        TenantScanRemoteStorageResponse, TenantShardSplitRequest, TenantShardSplitResponse,
-        TimelineCreateRequest, TimelineInfo,
+        TenantShardSplitRequest, TenantShardSplitResponse, TimelineCreateRequest, TimelineInfo,
    },
    shard::TenantShardId,
 };
 use pageserver_client::mgmt_api::{Client, Result};
 use reqwest::StatusCode;
-use utils::id::{NodeId, TenantId, TimelineId};
+use utils::id::{NodeId, TimelineId};

 /// Thin wrapper around [`pageserver_client::mgmt_api::Client`]. It allows the storage
 /// controller to collect metrics in a non-intrusive manner.
@@ -89,18 +88,6 @@ impl PageserverClient {
        )
    }

-    pub(crate) async fn tenant_scan_remote_storage(
-        &self,
-        tenant_id: TenantId,
-    ) -> Result<TenantScanRemoteStorageResponse> {
-        measured_request!(
-            "tenant_scan_remote_storage",
-            crate::metrics::Method::Get,
-            &self.node_id_label,
-            self.inner.tenant_scan_remote_storage(tenant_id).await
-        )
-    }
-
    pub(crate) async fn tenant_secondary_download(
        &self,
        tenant_id: TenantShardId,
--- a/storage_controller/src/reconciler.rs
+++ b/storage_controller/src/reconciler.rs
@@ -767,10 +767,7 @@ impl Reconciler {
                // It is up to the caller whether they want to drop out on this error, but they don't have to:
                // in general we should avoid letting unavailability of the cloud control plane stop us from
                // making progress.
-                if !matches!(e, NotifyError::ShuttingDown) {
-                    tracing::warn!("Failed to notify compute of attached pageserver {node}: {e}");
-                }
-
+                tracing::warn!("Failed to notify compute of attached pageserver {node}: {e}");
                // Set this flag so that in our ReconcileResult we will set the flag on the shard that it
                // needs to retry at some point.
                self.compute_notify_failure = true;
--- a/storage_controller/src/service.rs
+++ b/storage_controller/src/service.rs
@@ -8,7 +8,6 @@ use std::{
 };

 use crate::{
-    compute_hook::NotifyError,
    id_lock_map::IdLockMap,
    persistence::{AbortShardSplitStatus, TenantFilter},
    reconciler::{ReconcileError, ReconcileUnits},
@@ -62,7 +61,7 @@ use utils::{
 };

 use crate::{
-    compute_hook::ComputeHook,
+    compute_hook::{self, ComputeHook},
    heartbeater::{Heartbeater, PageserverState},
    node::{AvailabilityTransition, Node},
    persistence::{split_state::SplitState, DatabaseError, Persistence, TenantShardPersistence},
@@ -90,11 +89,7 @@ const INITIAL_GENERATION: Generation = Generation::new(0);
 /// up on unresponsive pageservers and proceed.
 pub(crate) const STARTUP_RECONCILE_TIMEOUT: Duration = Duration::from_secs(30);

-/// How long a node may be unresponsive to heartbeats before we declare it offline.
-/// This must be long enough to cover node restarts as well as normal operations: in future
-/// it should be separated into distinct timeouts for startup vs. normal operation
-/// (`<https://github.com/neondatabase/neon/issues/7552>`)
-pub const MAX_UNAVAILABLE_INTERVAL_DEFAULT: Duration = Duration::from_secs(300);
+pub const MAX_UNAVAILABLE_INTERVAL_DEFAULT: Duration = Duration::from_secs(30);

 pub const RECONCILER_CONCURRENCY_DEFAULT: usize = 128;

@@ -115,42 +110,6 @@ struct ServiceState {
    delayed_reconcile_rx: tokio::sync::mpsc::Receiver<TenantShardId>,
 }

-/// Transform an error from a pageserver into an error to return to callers of a storage
-/// controller API.
-fn passthrough_api_error(node: &Node, e: mgmt_api::Error) -> ApiError {
-    match e {
-        mgmt_api::Error::ReceiveErrorBody(str) => {
-            // Presume errors receiving body are connectivity/availability issues
-            ApiError::ResourceUnavailable(
-                format!("{node} error receiving error body: {str}").into(),
-            )
-        }
-        mgmt_api::Error::ReceiveBody(str) => {
-            // Presume errors receiving body are connectivity/availability issues
-            ApiError::ResourceUnavailable(format!("{node} error receiving body: {str}").into())
-        }
-        mgmt_api::Error::ApiError(StatusCode::NOT_FOUND, msg) => {
-            ApiError::NotFound(anyhow::anyhow!(format!("{node}: {msg}")).into())
-        }
-        mgmt_api::Error::ApiError(StatusCode::SERVICE_UNAVAILABLE, msg) => {
-            ApiError::ResourceUnavailable(format!("{node}: {msg}").into())
-        }
-        mgmt_api::Error::ApiError(status @ StatusCode::UNAUTHORIZED, msg)
-        | mgmt_api::Error::ApiError(status @ StatusCode::FORBIDDEN, msg) => {
-            // Auth errors talking to a pageserver are not auth errors for the caller: they are
-            // internal server errors, showing that something is wrong with the pageserver or
-            // storage controller's auth configuration.
-            ApiError::InternalServerError(anyhow::anyhow!("{node} {status}: {msg}"))
-        }
-        mgmt_api::Error::ApiError(status, msg) => {
-            // Presume general case of pageserver API errors is that we tried to do something
-            // that can't be done right now.
-            ApiError::Conflict(format!("{node} {status}: {status} {msg}"))
-        }
-        mgmt_api::Error::Cancelled => ApiError::ShuttingDown,
-    }
-}
-
 impl ServiceState {
    fn new(
        nodes: HashMap<NodeId, Node>,
@@ -337,12 +296,7 @@ impl Service {
    /// Called once on startup, this function attempts to contact all pageservers to build an up-to-date
    /// view of the world, and determine which pageservers are responsive.
    #[instrument(skip_all)]
-    async fn startup_reconcile(
-        self: &Arc<Service>,
-        bg_compute_notify_result_tx: tokio::sync::mpsc::Sender<
-            Result<(), (TenantShardId, NotifyError)>,
-        >,
-    ) {
+    async fn startup_reconcile(self: &Arc<Service>) {
        // For all tenant shards, a vector of observed states on nodes (where None means
        // indeterminate, same as in [`ObservedStateLocation`])
        let mut observed: HashMap<TenantShardId, Vec<(NodeId, Option<LocationConfig>)>> =
@@ -361,6 +315,10 @@ impl Service {
            .checked_add(STARTUP_RECONCILE_TIMEOUT / 2)
            .expect("Reconcile timeout is a modest constant");

+        let compute_notify_deadline = start_at
+            .checked_add((STARTUP_RECONCILE_TIMEOUT / 4) * 3)
+            .expect("Reconcile timeout is a modest constant");
+
        // Accumulate a list of any tenant locations that ought to be detached
        let mut cleanup = Vec::new();

@@ -386,7 +344,6 @@ impl Service {
        let mut compute_notifications = Vec::new();

        // Populate intent and observed states for all tenants, based on reported state on pageservers
-        tracing::info!("Populating tenant shards' states from initial pageserver scan...");
        let shard_count = {
            let mut locked = self.inner.write().unwrap();
            let (nodes, tenants, scheduler) = locked.parts_mut();
@@ -453,27 +410,28 @@ impl Service {
        // Emit compute hook notifications for all tenants which are already stably attached.  Other tenants
        // will emit compute hook notifications when they reconcile.
        //
-        // Ordering: our calls to notify_background synchronously establish a relative order for these notifications vs. any later
-        // calls into the ComputeHook for the same tenant: we can leave these to run to completion in the background and any later
-        // calls will be correctly ordered wrt these.
-        //
-        // Concurrency: we call notify_background for all tenants, which will create O(N) tokio tasks, but almost all of them
-        // will just wait on the ComputeHook::API_CONCURRENCY semaphore immediately, so very cheap until they get that semaphore
-        // unit and start doing I/O.
-        tracing::info!(
-            "Sending {} compute notifications",
-            compute_notifications.len()
-        );
-        self.compute_hook.notify_background(
-            compute_notifications,
-            bg_compute_notify_result_tx.clone(),
-            &self.cancel,
-        );
+        // Ordering: we must complete these notification attempts before doing any other reconciliation for the
+        // tenants named here, because otherwise our calls to notify() might race with more recent values
+        // generated by reconciliation.
+        let notify_failures = self
+            .compute_notify_many(compute_notifications, compute_notify_deadline)
+            .await;
+
+        // Compute notify is fallible.  If it fails here, do not delay overall startup: set the
+        // flag on these shards that they have a pending notification.
+        // Update tenant state for any that failed to do their initial compute notify, so that they'll retry later.
+        {
+            let mut locked = self.inner.write().unwrap();
+            for tenant_shard_id in notify_failures.into_iter() {
+                if let Some(shard) = locked.tenants.get_mut(&tenant_shard_id) {
+                    shard.pending_compute_notification = true;
+                }
+            }
+        }

        // Finally, now that the service is up and running, launch reconcile operations for any tenants
        // which require it: under normal circumstances this should only include tenants that were in some
        // transient state before we restarted, or any tenants whose compute hooks failed above.
-        tracing::info!("Checking for shards in need of reconciliation...");
        let reconcile_tasks = self.reconcile_all();
        // We will not wait for these reconciliation tasks to run here: we're now done with startup and
        // normal operations may proceed.
@@ -514,7 +472,6 @@ impl Service {
            }
        }

-        tracing::info!("Sending initial heartbeats...");
        let res = self
            .heartbeater
            .heartbeat(Arc::new(nodes_to_heartbeat))
@@ -551,7 +508,6 @@ impl Service {

        let mut node_list_futs = FuturesUnordered::new();

-        tracing::info!("Scanning shards on {} nodes...", nodes.len());
        for node in nodes.values() {
            node_list_futs.push({
                async move {
@@ -671,6 +627,72 @@ impl Service {
        }
    }

+    /// Used during [`Self::startup_reconcile`]: issue many concurrent compute notifications.
+    ///
+    /// Returns the set of shards whose notifications failed or were not acked within the deadline.
+    async fn compute_notify_many(
+        &self,
+        notifications: Vec<(TenantShardId, NodeId, ShardStripeSize)>,
+        deadline: Instant,
+    ) -> HashSet<TenantShardId> {
+        let attempt_shards = notifications.iter().map(|i| i.0).collect::<HashSet<_>>();
+        let mut success_shards = HashSet::new();
+
+        // Construct an async stream of futures to invoke the compute notify function: we do this
+        // in order to subsequently use .buffered() on the stream to execute with bounded parallelism.
+        let mut stream = futures::stream::iter(notifications.into_iter())
+            .map(|(tenant_shard_id, node_id, stripe_size)| {
+                let compute_hook = self.compute_hook.clone();
+                let cancel = self.cancel.clone();
+                async move {
+                    if let Err(e) = compute_hook
+                        .notify(tenant_shard_id, node_id, stripe_size, &cancel)
+                        .await
+                    {
+                        tracing::error!(
+                            %tenant_shard_id,
+                            %node_id,
+                            "Failed to notify compute on startup for shard: {e}"
+                        );
+                        None
+                    } else {
+                        Some(tenant_shard_id)
+                    }
+                }
+            })
+            .buffered(compute_hook::API_CONCURRENCY);
+
+        loop {
+            tokio::select! {
+                next = stream.next() => {
+                    match next {
+                        Some(Some(success_shard)) => {
+                            // A notification succeeded
+                            success_shards.insert(success_shard);
+                            },
+                        Some(None) => {
+                            // A notification that failed
+                        },
+                        None => {
+                            tracing::info!("Successfully sent all compute notifications");
+                            break;
+                        }
+                    }
+                },
+                _ = tokio::time::sleep(deadline.duration_since(Instant::now())) => {
+                    // Give up sending any that didn't succeed yet
+                    tracing::info!("Reached deadline while sending compute notifications");
+                    break;
+                }
+            };
+        }
+
+        attempt_shards
+            .difference(&success_shards)
+            .cloned()
+            .collect()
+    }
+
    /// Long running background task that periodically wakes up and looks for shards that need
    /// reconciliation.  Reconciliation is fallible, so any reconciliation tasks that fail during
    /// e.g. a tenant create/attach/migrate must eventually be retried: this task is responsible
@@ -802,7 +824,8 @@ impl Service {

                // Ordering: populate last_error before advancing error_seq,
                // so that waiters will see the correct error after waiting.
-                tenant.set_last_error(result.sequence, e);
+                *(tenant.last_error.lock().unwrap()) = format!("{e}");
+                tenant.error_waiter.advance(result.sequence);

                for (node_id, o) in result.observed.locations {
                    tenant.observed.locations.insert(node_id, o);
@@ -829,45 +852,23 @@ impl Service {
    async fn process_results(
        &self,
        mut result_rx: tokio::sync::mpsc::UnboundedReceiver<ReconcileResult>,
-        mut bg_compute_hook_result_rx: tokio::sync::mpsc::Receiver<
-            Result<(), (TenantShardId, NotifyError)>,
-        >,
    ) {
        loop {
            // Wait for the next result, or for cancellation
-            tokio::select! {
+            let result = tokio::select! {
                r = result_rx.recv() => {
                    match r {
-                        Some(result) => {self.process_result(result);},
+                        Some(result) => {result},
                        None => {break;}
                    }
                }
-                _ = async{
-                    match bg_compute_hook_result_rx.recv().await {
-                        Some(result) => {
-                            if let Err((tenant_shard_id, notify_error)) = result {
-                                tracing::warn!("Marking shard {tenant_shard_id} for notification retry, due to error {notify_error}");
-                                let mut locked = self.inner.write().unwrap();
-                                if let Some(shard) = locked.tenants.get_mut(&tenant_shard_id) {
-                                    shard.pending_compute_notification = true;
-                                }
-
-                            }
-                        },
-                        None => {
-                            // This channel is dead, but we don't want to terminate the outer loop{}: just wait for shutdown
-                            self.cancel.cancelled().await;
-                        }
-                    }
-                } => {},
                _ = self.cancel.cancelled() => {
                    break;
                }
            };
-        }

-        // We should only fall through on shutdown
-        assert!(self.cancel.is_cancelled());
+            self.process_result(result);
+        }
    }

    async fn process_aborts(
@@ -1028,10 +1029,6 @@ impl Service {

        let (startup_completion, startup_complete) = utils::completion::channel();

-        // This channel is continuously consumed by process_results, so doesn't need to be very large.
-        let (bg_compute_notify_result_tx, bg_compute_notify_result_rx) =
-            tokio::sync::mpsc::channel(512);
-
        let (delayed_reconcile_tx, delayed_reconcile_rx) =
            tokio::sync::mpsc::channel(MAX_DELAYED_RECONCILES);

@@ -1069,9 +1066,7 @@ impl Service {
        tokio::task::spawn(async move {
            // Block shutdown until we're done (we must respect self.cancel)
            if let Ok(_gate) = result_task_this.gate.enter() {
-                result_task_this
-                    .process_results(result_rx, bg_compute_notify_result_rx)
-                    .await
+                result_task_this.process_results(result_rx).await
            }
        });

@@ -1113,7 +1108,7 @@ impl Service {
                    return;
                };

-                this.startup_reconcile(bg_compute_notify_result_tx).await;
+                this.startup_reconcile().await;
                drop(startup_completion);
            }
        });
@@ -2525,7 +2520,17 @@ impl Service {
            client
                .timeline_create(tenant_shard_id, &create_req)
                .await
-                .map_err(|e| passthrough_api_error(&node, e))
+                .map_err(|e| match e {
+                    mgmt_api::Error::ApiError(status, msg)
+                        if status == StatusCode::INTERNAL_SERVER_ERROR
+                            || status == StatusCode::NOT_ACCEPTABLE =>
+                    {
+                        // TODO: handle more error codes, e.g. 503 should be passed through.  Make a general wrapper
+                        // for pass-through API calls.
+                        ApiError::InternalServerError(anyhow::anyhow!(msg))
+                    }
+                    _ => ApiError::Conflict(format!("Failed to create timeline: {e}")),
+                })
        }

        // Because the caller might not provide an explicit LSN, we must do the creation first on a single shard, and then
@@ -2800,14 +2805,7 @@ impl Service {
                tenant_shard_id: shard.tenant_shard_id,
                node_attached: *shard.intent.get_attached(),
                node_secondary: shard.intent.get_secondary().to_vec(),
-                last_error: shard
-                    .last_error
-                    .lock()
-                    .unwrap()
-                    .as_ref()
-                    .map(|e| format!("{e}"))
-                    .unwrap_or("".to_string())
-                    .clone(),
+                last_error: shard.last_error.lock().unwrap().clone(),
                is_reconciling: shard.reconciler.is_some(),
                is_pending_compute_notification: shard.pending_compute_notification,
                is_splitting: matches!(shard.splitting, SplitState::Splitting),
@@ -3650,88 +3648,6 @@ impl Service {
        Ok(())
    }

-    /// This is for debug/support only: assuming tenant data is already present in S3, we "create" a
-    /// tenant with a very high generation number so that it will see the existing data.
-    pub(crate) async fn tenant_import(
-        &self,
-        tenant_id: TenantId,
-    ) -> Result<TenantCreateResponse, ApiError> {
-        // Pick an arbitrary available pageserver to use for scanning the tenant in remote storage
-        let maybe_node = {
-            self.inner
-                .read()
-                .unwrap()
-                .nodes
-                .values()
-                .find(|n| n.is_available())
-                .cloned()
-        };
-        let Some(node) = maybe_node else {
-            return Err(ApiError::BadRequest(anyhow::anyhow!("No nodes available")));
-        };
-
-        let client = PageserverClient::new(
-            node.get_id(),
-            node.base_url(),
-            self.config.jwt_token.as_deref(),
-        );
-
-        let scan_result = client
-            .tenant_scan_remote_storage(tenant_id)
-            .await
-            .map_err(|e| passthrough_api_error(&node, e))?;
-
-        // A post-split tenant may contain a mixture of shard counts in remote storage: pick the highest count.
-        let Some(shard_count) = scan_result
-            .shards
-            .iter()
-            .map(|s| s.tenant_shard_id.shard_count)
-            .max()
-        else {
-            return Err(ApiError::NotFound(
-                anyhow::anyhow!("No shards found").into(),
-            ));
-        };
-
-        // Ideally we would set each newly imported shard's generation independently, but for correctness it is sufficient
-        // to
-        let generation = scan_result
-            .shards
-            .iter()
-            .map(|s| s.generation)
-            .max()
-            .expect("We already validated >0 shards");
-
-        // FIXME: we have no way to recover the shard stripe size from contents of remote storage: this will
-        // only work if they were using the default stripe size.
-        let stripe_size = ShardParameters::DEFAULT_STRIPE_SIZE;
-
-        let (response, waiters) = self
-            .do_tenant_create(TenantCreateRequest {
-                new_tenant_id: TenantShardId::unsharded(tenant_id),
-                generation,
-
-                shard_parameters: ShardParameters {
-                    count: shard_count,
-                    stripe_size,
-                },
-                placement_policy: Some(PlacementPolicy::Attached(0)), // No secondaries, for convenient debug/hacking
-
-                // There is no way to know what the tenant's config was: revert to defaults
-                config: TenantConfig::default(),
-            })
-            .await?;
-
-        if let Err(e) = self.await_waiters(waiters, SHORT_RECONCILE_TIMEOUT).await {
-            // Since this is a debug/support operation, all kinds of weird issues are possible (e.g. this
-            // tenant doesn't exist in the control plane), so don't fail the request if it can't fully
-            // reconcile, as reconciliation includes notifying compute.
-            tracing::warn!(%tenant_id, "Reconcile not done yet while importing tenant ({e})");
-        }
-
-        Ok(response)
-    }
-
    /// For debug/support: a full JSON dump of TenantShards.  Returns a response so that
    /// we don't have to make TenantShard clonable in the return path.
    pub(crate) fn tenants_dump(&self) -> Result<hyper::Response<hyper::Body>, ApiError> {
@@ -4115,7 +4031,7 @@ impl Service {
                // TODO: in the background, we should balance work back onto this pageserver
            }
            AvailabilityTransition::Unchanged => {
-                tracing::debug!("Node {} no change during config", node_id);
+                tracing::info!("Node {} no change during config", node_id);
            }
        }

@@ -4255,9 +4171,7 @@ impl Service {
    /// Check all tenants for pending reconciliation work, and reconcile those in need.
    /// Additionally, reschedule tenants that require it.
    ///
-    /// Returns how many reconciliation tasks were started, or `1` if no reconciles were
-    /// spawned but some _would_ have been spawned if `reconciler_concurrency` units where
-    /// available.  A return value of 0 indicates that everything is fully reconciled already.
+    /// Returns how many reconciliation tasks were started
    fn reconcile_all(&self) -> usize {
        let mut locked = self.inner.write().unwrap();
        let (nodes, tenants, _scheduler) = locked.parts_mut();
@@ -4272,11 +4186,7 @@ impl Service {
            }

            // Skip checking if this shard is already enqueued for reconciliation
-            if shard.delayed_reconcile && self.reconciler_concurrency.available_permits() == 0 {
-                // If there is something delayed, then return a nonzero count so that
-                // callers like reconcile_all_now do not incorrectly get the impression
-                // that the system is in a quiescent state.
-                reconciles_spawned = std::cmp::max(1, reconciles_spawned);
+            if shard.delayed_reconcile {
                continue;
            }

@@ -4441,27 +4351,8 @@ impl Service {
        };

        let waiter_count = waiters.len();
-        match self.await_waiters(waiters, RECONCILE_TIMEOUT).await {
-            Ok(()) => {}
-            Err(ReconcileWaitError::Failed(_, reconcile_error))
-                if matches!(*reconcile_error, ReconcileError::Cancel) =>
-            {
-                // Ignore reconciler cancel errors: this reconciler might have shut down
-                // because some other change superceded it.  We will return a nonzero number,
-                // so the caller knows they might have to call again to quiesce the system.
-            }
-            Err(e) => {
-                return Err(e);
-            }
-        };
-
-        tracing::info!(
-            "{} reconciles in reconcile_all, {} waiters",
-            reconciles_spawned,
-            waiter_count
-        );
-
-        Ok(std::cmp::max(waiter_count, reconciles_spawned))
+        self.await_waiters(waiters, RECONCILE_TIMEOUT).await?;
+        Ok(waiter_count)
    }

    pub async fn shutdown(&self) {
--- a/storage_controller/src/tenant_shard.rs
+++ b/storage_controller/src/tenant_shard.rs
@@ -38,18 +38,12 @@ use crate::{
 };

 /// Serialization helper
-fn read_last_error<S, T>(v: &std::sync::Mutex<Option<T>>, serializer: S) -> Result<S::Ok, S::Error>
+fn read_mutex_content<S, T>(v: &std::sync::Mutex<T>, serializer: S) -> Result<S::Ok, S::Error>
 where
    S: serde::ser::Serializer,
-    T: std::fmt::Display,
+    T: Clone + std::fmt::Display,
 {
-    serializer.collect_str(
-        &v.lock()
-            .unwrap()
-            .as_ref()
-            .map(|e| format!("{e}"))
-            .unwrap_or("".to_string()),
-    )
+    serializer.collect_str(&v.lock().unwrap())
 }

 /// In-memory state for a particular tenant shard.
@@ -117,15 +111,11 @@ pub(crate) struct TenantShard {
    #[serde(skip)]
    pub(crate) error_waiter: std::sync::Arc<SeqWait<Sequence, Sequence>>,

-    /// The most recent error from a reconcile on this tenant.  This is a nested Arc
-    /// because:
-    ///  - ReconcileWaiters need to Arc-clone the overall object to read it later
-    ///  - ReconcileWaitError needs to use an `Arc<ReconcileError>` because we can construct
-    ///    many waiters for one shard, and the underlying error types are not Clone.
+    /// The most recent error from a reconcile on this tenant
    /// TODO: generalize to an array of recent events
    /// TOOD: use a ArcSwap instead of mutex for faster reads?
-    #[serde(serialize_with = "read_last_error")]
-    pub(crate) last_error: std::sync::Arc<std::sync::Mutex<Option<Arc<ReconcileError>>>>,
+    #[serde(serialize_with = "read_mutex_content")]
+    pub(crate) last_error: std::sync::Arc<std::sync::Mutex<String>>,

    /// If we have a pending compute notification that for some reason we weren't able to send,
    /// set this to true. If this is set, calls to [`Self::get_reconcile_needed`] will return Yes
@@ -303,18 +293,18 @@ pub(crate) struct ReconcilerWaiter {

    seq_wait: std::sync::Arc<SeqWait<Sequence, Sequence>>,
    error_seq_wait: std::sync::Arc<SeqWait<Sequence, Sequence>>,
-    error: std::sync::Arc<std::sync::Mutex<Option<Arc<ReconcileError>>>>,
+    error: std::sync::Arc<std::sync::Mutex<String>>,
    seq: Sequence,
 }

 #[derive(thiserror::Error, Debug)]
-pub(crate) enum ReconcileWaitError {
+pub enum ReconcileWaitError {
    #[error("Timeout waiting for shard {0}")]
    Timeout(TenantShardId),
    #[error("shutting down")]
    Shutdown,
    #[error("Reconcile error on shard {0}: {1}")]
-    Failed(TenantShardId, Arc<ReconcileError>),
+    Failed(TenantShardId, String),
 }

 #[derive(Eq, PartialEq, Debug)]
@@ -352,8 +342,7 @@ impl ReconcilerWaiter {
                    SeqWaitError::Timeout => unreachable!()
                })?;

-                return Err(ReconcileWaitError::Failed(self.tenant_shard_id,
-                    self.error.lock().unwrap().clone().expect("If error_seq_wait was advanced error was set").clone()))
+                return Err(ReconcileWaitError::Failed(self.tenant_shard_id, self.error.lock().unwrap().clone()))
            }
        }

@@ -884,7 +873,7 @@ impl TenantShard {
            active_nodes_dirty || dirty_observed || self.pending_compute_notification;

        if !do_reconcile {
-            tracing::debug!("Not dirty, no reconciliation needed.");
+            tracing::info!("Not dirty, no reconciliation needed.");
            return ReconcileNeeded::No;
        }

@@ -952,8 +941,8 @@ impl TenantShard {

    /// Create a waiter that will wait for some future Reconciler that hasn't been spawned yet.
    ///
-    /// This is appropriate when you can't spawn a reconciler (e.g. due to resource limits), but
-    /// you would like to wait on the next reconciler that gets spawned in the background.
+    /// This is appropriate when you can't spawn a reconciler (e.g. due to resource limits), but
+    /// you would like to wait until one gets spawned in the background.
    pub(crate) fn future_reconcile_waiter(&mut self) -> ReconcilerWaiter {
        self.ensure_sequence_ahead();

@@ -1162,13 +1151,6 @@ impl TenantShard {
        &self.scheduling_policy
    }

-    pub(crate) fn set_last_error(&mut self, sequence: Sequence, error: ReconcileError) {
-        // Ordering: always set last_error before advancing sequence, so that sequence
-        // waiters are guaranteed to see a Some value when they see an error.
-        *(self.last_error.lock().unwrap()) = Some(Arc::new(error));
-        self.error_waiter.advance(sequence);
-    }
-
    pub(crate) fn from_persistent(
        tsp: TenantShardPersistence,
        intent: IntentState,
--- a/test_runner/fixtures/compute_reconfigure.py
+++ b/test_runner/fixtures/compute_reconfigure.py
@@ -14,18 +14,10 @@ class ComputeReconfigure:
        self.server = server
        self.control_plane_compute_hook_api = f"http://{server.host}:{server.port}/notify-attach"
        self.workloads = {}
-        self.on_notify = None

    def register_workload(self, workload):
        self.workloads[workload.tenant_id] = workload

-    def register_on_notify(self, fn):
-        """
-        Add some extra work during a notification, like sleeping to slow things down, or
-        logging what was notified.
-        """
-        self.on_notify = fn
-

@pytest.fixture(scope="function")
 def compute_reconfigure_listener(make_httpserver):
@@ -51,9 +43,6 @@ def compute_reconfigure_listener(make_httpserver):
        body: dict[str, Any] = request.json
        log.info(f"notify-attach request: {body}")

-        if self.on_notify is not None:
-            self.on_notify(body)
-
        try:
            workload = self.workloads[TenantId(body["tenant_id"])]
        except KeyError:
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -499,7 +499,6 @@ class NeonEnvBuilder:
        self.config_init_force: Optional[str] = None
        self.top_output_dir = top_output_dir
        self.control_plane_compute_hook_api: Optional[str] = None
-        self.storage_controller_config: Optional[dict[Any, Any]] = None

        self.pageserver_virtual_file_io_engine: Optional[str] = pageserver_virtual_file_io_engine

@@ -513,11 +512,6 @@ class NeonEnvBuilder:
            self.pageserver_get_impl = "vectored"
            log.debug('Overriding pageserver get_impl config to "vectored"')

-        self.pageserver_validate_vectored_get: Optional[bool] = None
-        if (validate := os.getenv("PAGESERVER_VALIDATE_VEC_GET")) is not None:
-            self.pageserver_validate_vectored_get = bool(validate)
-            log.debug(f'Overriding pageserver validate_vectored_get config to "{validate}"')
-
        assert test_name.startswith(
            "test_"
        ), "Unexpectedly instantiated from outside a test function"
@@ -1022,7 +1016,6 @@ class NeonEnv:
        self.pg_distrib_dir = config.pg_distrib_dir
        self.endpoint_counter = 0
        self.pageserver_config_override = config.pageserver_config_override
-        self.storage_controller_config = config.storage_controller_config

        # generate initial tenant ID here instead of letting 'neon init' generate it,
        # so that we don't need to dig it out of the config file afterwards.
@@ -1068,9 +1061,6 @@ class NeonEnv:
        if self.control_plane_compute_hook_api is not None:
            cfg["control_plane_compute_hook_api"] = self.control_plane_compute_hook_api

-        if self.storage_controller_config is not None:
-            cfg["storage_controller"] = self.storage_controller_config
-
        # Create config for pageserver
        http_auth_type = "NeonJWT" if config.auth_enabled else "Trust"
        pg_auth_type = "NeonJWT" if config.auth_enabled else "Trust"
@@ -1095,8 +1085,6 @@ class NeonEnv:
                ps_cfg["get_vectored_impl"] = config.pageserver_get_vectored_impl
            if config.pageserver_get_impl is not None:
                ps_cfg["get_impl"] = config.pageserver_get_impl
-            if config.pageserver_validate_vectored_get is not None:
-                ps_cfg["validate_vectored_get"] = config.pageserver_validate_vectored_get

            # Create a corresponding NeonPageserver object
            self.pageservers.append(
@@ -1139,9 +1127,12 @@ class NeonEnv:
        # bounce through retries on startup
        self.storage_controller.start()

+        def storage_controller_ready():
+            assert self.storage_controller.ready() is True
+
        # Wait for storage controller readiness to prevent unnecessary post start-up
        # reconcile.
-        self.storage_controller.wait_until_ready()
+        wait_until(30, 1, storage_controller_ready)

        # Start up broker, pageserver and all safekeepers
        futs = []
@@ -1584,11 +1575,6 @@ class NeonCli(AbstractNeonCli):
        res.check_returncode()
        return tenant_id, timeline_id

-    def import_tenant(self, tenant_id: TenantId):
-        args = ["tenant", "import", "--tenant-id", str(tenant_id)]
-        res = self.raw_cli(args)
-        res.check_returncode()
-
    def set_default(self, tenant_id: TenantId):
        """
        Update default tenant for future operations that require tenant_id.
@@ -2045,15 +2031,6 @@ class NeonStorageController(MetricsGetter):
        else:
            raise RuntimeError(f"Unexpected status {status} from readiness endpoint")

-    def wait_until_ready(self):
-        t1 = time.time()
-
-        def storage_controller_ready():
-            assert self.ready() is True
-
-        wait_until(30, 1, storage_controller_ready)
-        return time.time() - t1
-
    def attach_hook_issue(
        self, tenant_shard_id: Union[TenantId, TenantShardId], pageserver_id: int
    ) -> int:
@@ -2141,7 +2118,7 @@ class NeonStorageController(MetricsGetter):
        shard_count: Optional[int] = None,
        shard_stripe_size: Optional[int] = None,
        tenant_config: Optional[Dict[Any, Any]] = None,
-        placement_policy: Optional[Union[Dict[Any, Any] | str]] = None,
+        placement_policy: Optional[str] = None,
    ):
        """
        Use this rather than pageserver_api() when you need to include shard parameters
@@ -2230,13 +2207,6 @@ class NeonStorageController(MetricsGetter):
            headers=self.headers(TokenScope.ADMIN),
        )

-    def tenant_import(self, tenant_id: TenantId):
-        self.request(
-            "POST",
-            f"{self.env.storage_controller_api}/debug/v1/tenant/{tenant_id}/import",
-            headers=self.headers(TokenScope.ADMIN),
-        )
-
    def reconcile_all(self):
        r = self.request(
            "POST",
@@ -2251,21 +2221,10 @@ class NeonStorageController(MetricsGetter):
    def reconcile_until_idle(self, timeout_secs=30):
        start_at = time.time()
        n = 1
-        delay_sec = 0.5
-        delay_max = 5
        while n > 0:
            n = self.reconcile_all()
-            if n == 0:
-                break
-            elif time.time() - start_at > timeout_secs:
+            if time.time() - start_at > timeout_secs:
                raise RuntimeError("Timeout in reconcile_until_idle")
-            else:
-                # Don't call again right away: if we're waiting for many reconciles that
-                # are blocked on the concurrency limit, it slows things down to call
-                # reconcile_all frequently.
-                time.sleep(delay_sec)
-                delay_sec *= 2
-                delay_sec = min(delay_sec, delay_max)

    def consistency_check(self):
        """
@@ -2339,24 +2298,20 @@ class NeonPageserver(PgProtocol):
        # The entries in the list are regular experessions.
        self.allowed_errors: List[str] = list(DEFAULT_PAGESERVER_ALLOWED_ERRORS)

-    def timeline_dir(
-        self,
-        tenant_shard_id: Union[TenantId, TenantShardId],
-        timeline_id: Optional[TimelineId] = None,
-    ) -> Path:
+    def timeline_dir(self, tenant_id: TenantId, timeline_id: Optional[TimelineId] = None) -> Path:
        """Get a timeline directory's path based on the repo directory of the test environment"""
        if timeline_id is None:
-            return self.tenant_dir(tenant_shard_id) / "timelines"
-        return self.tenant_dir(tenant_shard_id) / "timelines" / str(timeline_id)
+            return self.tenant_dir(tenant_id) / "timelines"
+        return self.tenant_dir(tenant_id) / "timelines" / str(timeline_id)

    def tenant_dir(
        self,
-        tenant_shard_id: Optional[Union[TenantId, TenantShardId]] = None,
+        tenant_id: Optional[TenantId] = None,
    ) -> Path:
        """Get a tenant directory's path based on the repo directory of the test environment"""
-        if tenant_shard_id is None:
+        if tenant_id is None:
            return self.workdir / "tenants"
-        return self.workdir / "tenants" / str(tenant_shard_id)
+        return self.workdir / "tenants" / str(tenant_id)

    def start(
        self,
@@ -2543,10 +2498,8 @@ class NeonPageserver(PgProtocol):
        client = self.http_client()
        return client.tenant_location_conf(tenant_id, config, **kwargs)

-    def read_tenant_location_conf(
-        self, tenant_shard_id: Union[TenantId, TenantShardId]
-    ) -> dict[str, Any]:
-        path = self.tenant_dir(tenant_shard_id) / "config-v1"
+    def read_tenant_location_conf(self, tenant_id: TenantId) -> dict[str, Any]:
+        path = self.tenant_dir(tenant_id) / "config-v1"
        log.info(f"Reading location conf from {path}")
        bytes = open(path, "r").read()
        try:
@@ -3750,15 +3703,13 @@ class S3Scrubber:
            log.warning(f"Scrub environment: {env}")
            log.warning(f"Output at: {output_path}")

-            raise RuntimeError(f"Scrubber failed while running {args}")
+            raise RuntimeError("Remote storage scrub failed")

        assert stdout is not None
        return stdout

    def scan_metadata(self) -> Any:
-        stdout = self.scrubber_cli(
-            ["scan-metadata", "--node-kind", "pageserver", "--json"], timeout=30
-        )
+        stdout = self.scrubber_cli(["scan-metadata", "--json"], timeout=30)

        try:
            return json.loads(stdout)
@@ -3767,13 +3718,6 @@ class S3Scrubber:
            log.error(stdout)
            raise

-    def tenant_snapshot(self, tenant_id: TenantId, output_path: Path):
-        stdout = self.scrubber_cli(
-            ["tenant-snapshot", "--tenant-id", str(tenant_id), "--output-path", str(output_path)],
-            timeout=30,
-        )
-        log.info(f"tenant-snapshot output: {stdout}")
-

 def _get_test_dir(request: FixtureRequest, top_output_dir: Path, prefix: str) -> Path:
    """Compute the path to a working directory for an individual test."""
--- a/test_runner/fixtures/remote_storage.py
+++ b/test_runner/fixtures/remote_storage.py
@@ -252,11 +252,8 @@ class S3Storage:

        log.info(f"deleted {cnt} objects from remote storage")

-    def tenants_path(self) -> str:
-        return f"{self.prefix_in_bucket}/tenants"
-
    def tenant_path(self, tenant_id: TenantId) -> str:
-        return f"{self.tenants_path()}/{tenant_id}"
+        return f"{self.prefix_in_bucket}/tenants/{tenant_id}"

    def heatmap_key(self, tenant_id: TenantId) -> str:
        return f"{self.tenant_path(tenant_id)}/{TENANT_HEATMAP_FILE_NAME}"
@@ -265,9 +262,6 @@ class S3Storage:
        r = self.client.get_object(Bucket=self.bucket_name, Key=self.heatmap_key(tenant_id))
        return json.loads(r["Body"].read().decode("utf-8"))

-    def mock_remote_tenant_path(self, tenant_id: TenantId):
-        assert self.real is False
-

 RemoteStorage = Union[LocalFsStorage, S3Storage]

--- a/test_runner/fixtures/types.py
+++ b/test_runner/fixtures/types.py
@@ -156,11 +156,7 @@ class TenantShardId:
            raise ValueError(f"Invalid TenantShardId '{input}'")

    def __str__(self):
-        if self.shard_count > 0:
-            return f"{self.tenant_id}-{self.shard_number:02x}{self.shard_count:02x}"
-        else:
-            # Unsharded case: equivalent of Rust TenantShardId::unsharded(tenant_id)
-            return str(self.tenant_id)
+        return f"{self.tenant_id}-{self.shard_number:02x}{self.shard_count:02x}"

    def __repr__(self):
        return self.__str__()
--- a/test_runner/performance/test_storage_controller_scale.py
+++ b/test_runner/performance/test_storage_controller_scale.py
@@ -1,198 +0,0 @@
-import concurrent.futures
-import random
-import time
-
-import pytest
-from fixtures.compute_reconfigure import ComputeReconfigure
-from fixtures.log_helper import log
-from fixtures.neon_fixtures import (
-    NeonEnvBuilder,
-)
-from fixtures.pageserver.http import PageserverHttpClient
-from fixtures.pg_version import PgVersion
-from fixtures.types import TenantId, TenantShardId, TimelineId
-
-
-@pytest.mark.timeout(3600)  # super long running test: should go down as we optimize
-def test_storage_controller_many_tenants(
-    neon_env_builder: NeonEnvBuilder, compute_reconfigure_listener: ComputeReconfigure
-):
-    """
-    Check that we cope well with a not-totally-trivial number of tenants.
-
-    This is checking for:
-    - Obvious concurrency bugs from issuing many tenant creations/modifications
-      concurrently.
-    - Obvious scaling bugs like O(N^2) scaling that would be so slow that even
-      a basic test starts failing from slowness.
-
-    This is _not_ a comprehensive scale test: just a basic sanity check that
-    we don't fall over for a thousand shards.
-    """
-
-    neon_env_builder.num_pageservers = 5
-    neon_env_builder.storage_controller_config = {
-        # Default neon_local uses a small timeout: use a longer one to tolerate longer pageserver restarts.
-        # TODO: tune this down as restarts get faster (https://github.com/neondatabase/neon/pull/7553), to
-        # guard against regressions in restart time.
-        "max_unavailable": "300s"
-    }
-    neon_env_builder.control_plane_compute_hook_api = (
-        compute_reconfigure_listener.control_plane_compute_hook_api
-    )
-
-    # A small sleep on each call into the notify hook, to simulate the latency of doing a database write
-    compute_reconfigure_listener.register_on_notify(lambda body: time.sleep(0.01))
-
-    env = neon_env_builder.init_start()
-
-    # We will intentionally stress reconciler concurrrency, which triggers a warning when lots
-    # of shards are hitting the delayed path.
-    env.storage_controller.allowed_errors.append(".*Many shards are waiting to reconcile")
-
-    for ps in env.pageservers:
-        # This can happen because when we do a loop over all pageservers and mark them offline/active,
-        # reconcilers might get cancelled, and the next reconcile can follow a not-so-elegant path of
-        # bumping generation before other attachments are detached.
-        #
-        # We could clean this up by making reconcilers respect the .observed of their predecessor, if
-        # we spawn with a wait for the predecessor.
-        ps.allowed_errors.append(".*Dropped remote consistent LSN updates.*")
-
-        # Storage controller is allowed to drop pageserver requests when the cancellation token
-        # for a Reconciler fires.
-        ps.allowed_errors.append(".*request was dropped before completing.*")
-
-    # Total tenants
-    tenant_count = 4000
-
-    # Shards per tenant
-    shard_count = 2
-    stripe_size = 1024
-
-    tenants = set(TenantId.generate() for _i in range(0, tenant_count))
-
-    virtual_ps_http = PageserverHttpClient(env.storage_controller_port, lambda: True)
-
-    def check_memory():
-        # Shards should be cheap_ in memory, as we will have very many of them
-        expect_memory_per_shard = 128 * 1024
-
-        rss = env.storage_controller.get_metric_value("process_resident_memory_bytes")
-        assert rss is not None
-        log.info(f"Resident memory: {rss} ({ rss / (shard_count * tenant_count)} per shard)")
-        assert rss < expect_memory_per_shard * shard_count * tenant_count
-
-    # We use a fixed seed to make the test somewhat reproducible: we want a randomly
-    # chosen order in the sense that it's arbitrary, but not in the sense that it should change every run.
-    rng = random.Random(1234)
-
-    # Issue more concurrent operations than the storage controller's reconciler concurrency semaphore
-    # permits, to ensure that we are exercising stressing that.
-    api_concurrency = 135
-
-    # We will create tenants directly via API, not via neon_local, to avoid any false
-    # serialization of operations in neon_local (it e.g. loads/saves a config file on each call)
-    with concurrent.futures.ThreadPoolExecutor(max_workers=api_concurrency) as executor:
-        futs = []
-        t1 = time.time()
-        for tenant_id in tenants:
-            f = executor.submit(
-                env.storage_controller.tenant_create,
-                tenant_id,
-                shard_count,
-                stripe_size,
-                placement_policy={"Attached": 1},
-            )
-            futs.append(f)
-
-        # Wait for creations to finish
-        for f in futs:
-            f.result()
-        log.info(
-            f"Created {len(tenants)} tenants in {time.time() - t1}, {len(tenants) / (time.time() - t1)}/s"
-        )
-
-        run_ops = api_concurrency * 4
-        assert run_ops < len(tenants)
-        op_tenants = list(tenants)[0:run_ops]
-
-        # Generate a mixture of operations and dispatch them all concurrently
-        futs = []
-        for tenant_id in op_tenants:
-            op = rng.choice([0, 1, 2])
-            if op == 0:
-                # A fan-out write operation to all shards in a tenant (timeline creation)
-                f = executor.submit(
-                    virtual_ps_http.timeline_create,
-                    PgVersion.NOT_SET,
-                    tenant_id,
-                    TimelineId.generate(),
-                )
-            elif op == 1:
-                # A reconciler operation: migrate a shard.
-                shard_number = rng.randint(0, shard_count - 1)
-                tenant_shard_id = TenantShardId(tenant_id, shard_number, shard_count)
-                dest_ps_id = rng.choice([ps.id for ps in env.pageservers])
-                f = executor.submit(
-                    env.storage_controller.tenant_shard_migrate, tenant_shard_id, dest_ps_id
-                )
-            elif op == 2:
-                # A passthrough read to shard zero
-                f = executor.submit(virtual_ps_http.tenant_status, tenant_id)
-
-            futs.append(f)
-
-        # Wait for mixed ops to finish
-        for f in futs:
-            f.result()
-
-    # Consistency check is safe here: all the previous operations waited for reconcile before completing
-    env.storage_controller.consistency_check()
-    check_memory()
-
-    # This loop waits for reconcile_all to indicate no pending work, and then calls it once more to time
-    # how long the call takes when idle: this iterates over shards while doing no I/O and should be reliably fast: if
-    # it isn't, that's a sign that we have made some algorithmic mistake (e.g. O(N**2) scheduling)
-    #
-    # We do not require that the system is quiescent already here, although at present in this point in the test
-    # that may be the case.
-    while True:
-        t1 = time.time()
-        reconcilers = env.storage_controller.reconcile_all()
-        if reconcilers == 0:
-            # Time how long a no-op background reconcile takes: this measures how long it takes to
-            # loop over all the shards looking for work to do.
-            runtime = time.time() - t1
-            log.info(f"No-op call to reconcile_all took {runtime}s")
-            assert runtime < 1
-            break
-
-    # Restart the storage controller
-    env.storage_controller.stop()
-    env.storage_controller.start()
-
-    # See how long the controller takes to pass its readiness check.  This should be fast because
-    # all the nodes are online: offline pageservers are the only thing that's allowed to delay
-    # startup.
-    readiness_period = env.storage_controller.wait_until_ready()
-    assert readiness_period < 5
-
-    # Consistency check is safe here: the storage controller's restart should not have caused any reconcilers
-    # to run, as it was in a stable state before restart.  If it did, that's a bug.
-    env.storage_controller.consistency_check()
-    check_memory()
-
-    # Restart pageservers: this exercises the /re-attach API
-    for pageserver in env.pageservers:
-        pageserver.stop()
-        pageserver.start()
-
-    # Consistency check is safe here: restarting pageservers should not have caused any Reconcilers to spawn,
-    # as they were not offline long enough to trigger any scheduling changes.
-    env.storage_controller.consistency_check()
-    check_memory()
-
-    # Stop the storage controller before tearing down fixtures, because it otherwise might log
-    # errors trying to call our `ComputeReconfigure`.
-    env.storage_controller.stop()
--- a/test_runner/regress/test_attach_tenant_config.py
+++ b/test_runner/regress/test_attach_tenant_config.py
@@ -190,7 +190,6 @@ def test_fully_custom_config(positive_env: NeonEnv):
        "trace_read_requests": True,
        "walreceiver_connect_timeout": "13m",
        "image_layer_creation_check_threshold": 1,
-        "switch_to_aux_file_v2": True,
    }

    ps_http = env.pageserver.http_client()
--- a/test_runner/regress/test_compaction.py
+++ b/test_runner/regress/test_compaction.py
@@ -1,6 +1,4 @@
-import json
 import os
-from typing import Optional

 import pytest
 from fixtures.log_helper import log
@@ -16,6 +14,8 @@ AGGRESIVE_COMPACTION_TENANT_CONF = {
    # Compact small layers
    "compaction_target_size": 1024**2,
    "image_creation_threshold": 2,
+    # INC-186: remove when merging the fix
+    "image_layer_creation_check_threshold": 0,
 }


@@ -91,102 +91,3 @@ page_cache_size=10
    # was chosen empirically for this workload.
    assert non_vectored_average < 8
    assert vectored_average < 8
-
-
-# Stripe sizes in number of pages.
-TINY_STRIPES = 16
-LARGE_STRIPES = 32768
-
-
-@pytest.mark.parametrize(
-    "shard_count,stripe_size", [(None, None), (4, TINY_STRIPES), (4, LARGE_STRIPES)]
-)
-def test_sharding_compaction(
-    neon_env_builder: NeonEnvBuilder, stripe_size: int, shard_count: Optional[int]
-):
-    """
-    Use small stripes, small layers, and small compaction thresholds to exercise how compaction
-    and image layer generation interacts with sharding.
-
-    We are looking for bugs that might emerge from the way sharding uses sparse layer files that
-    only contain some of the keys in the key range covered by the layer, such as errors estimating
-    the size of layers that might result in too-small layer files.
-    """
-
-    compaction_target_size = 128 * 1024
-
-    TENANT_CONF = {
-        # small checkpointing and compaction targets to ensure we generate many upload operations
-        "checkpoint_distance": f"{128 * 1024}",
-        "compaction_threshold": "1",
-        "compaction_target_size": f"{compaction_target_size}",
-        # no PITR horizon, we specify the horizon when we request on-demand GC
-        "pitr_interval": "0s",
-        # disable background compaction and GC. We invoke it manually when we want it to happen.
-        "gc_period": "0s",
-        "compaction_period": "0s",
-        # create image layers eagerly: we want to exercise image layer creation in this test.
-        "image_creation_threshold": "1",
-        "image_layer_creation_check_threshold": 0,
-    }
-
-    neon_env_builder.num_pageservers = 1 if shard_count is None else shard_count
-    env = neon_env_builder.init_start(
-        initial_tenant_conf=TENANT_CONF,
-        initial_tenant_shard_count=shard_count,
-        initial_tenant_shard_stripe_size=stripe_size,
-    )
-
-    tenant_id = env.initial_tenant
-    timeline_id = env.initial_timeline
-
-    workload = Workload(env, tenant_id, timeline_id)
-    workload.init()
-    workload.write_rows(64)
-    for _i in range(0, 10):
-        # Each of these does some writes then a checkpoint: because we set image_creation_threshold to 1,
-        # these should result in image layers each time we write some data into a shard, and also shards
-        # recieving less data hitting their "empty image layer" path (wherre they should skip writing the layer,
-        # rather than asserting)
-        workload.churn_rows(64)
-
-    # Assert that we got some image layers: this is important because this test's purpose is to exercise the sharding changes
-    # to Timeline::create_image_layers, so if we weren't creating any image layers we wouldn't be doing our job.
-    shard_has_image_layers = []
-    for shard in env.storage_controller.locate(tenant_id):
-        pageserver = env.get_pageserver(shard["node_id"])
-        shard_id = shard["shard_id"]
-        layer_map = pageserver.http_client().layer_map_info(shard_id, timeline_id)
-        image_layer_sizes = {}
-        for layer in layer_map.historic_layers:
-            if layer.kind == "Image":
-                image_layer_sizes[layer.layer_file_name] = layer.layer_file_size
-
-                # Pageserver should assert rather than emit an empty layer file, but double check here
-                assert layer.layer_file_size is not None
-                assert layer.layer_file_size > 0
-
-        shard_has_image_layers.append(len(image_layer_sizes) > 1)
-        log.info(f"Shard {shard_id} image layer sizes: {json.dumps(image_layer_sizes, indent=2)}")
-
-        if stripe_size == TINY_STRIPES:
-            # Checking the average size validates that our keyspace partitioning is  properly respecting sharding: if
-            # it was not, we would tend to get undersized layers because the partitioning would overestimate the physical
-            # data in a keyrange.
-            #
-            # We only do this check with tiny stripes, because large stripes may not give all shards enough
-            # data to have statistically significant image layers
-            avg_size = sum(v for v in image_layer_sizes.values()) / len(image_layer_sizes)  # type: ignore
-            log.info(f"Shard {shard_id} average image layer size: {avg_size}")
-            assert avg_size > compaction_target_size / 2
-
-    if stripe_size == TINY_STRIPES:
-        # Expect writes were scattered across all pageservers: they should all have compacted some image layers
-        assert all(shard_has_image_layers)
-    else:
-        # With large stripes, it is expected that most of our writes went to one pageserver, so we just require
-        # that at least one of them has some image layers.
-        assert any(shard_has_image_layers)
-
-    # Assert that everything is still readable
-    workload.validate()
--- a/test_runner/regress/test_compatibility.py
+++ b/test_runner/regress/test_compatibility.py
@@ -228,9 +228,8 @@ def test_forward_compatibility(
    try:
        # Previous version neon_local and pageserver are not aware
        # of the new config.
-        # TODO: remove these once the previous version of neon local supports them
+        # TODO: remove this once the code reaches main
        neon_env_builder.pageserver_get_impl = None
-        neon_env_builder.pageserver_validate_vectored_get = None

        neon_env_builder.num_safekeepers = 3
        neon_local_binpath = neon_env_builder.neon_binpath
--- a/test_runner/regress/test_hot_standby.py
+++ b/test_runner/regress/test_hot_standby.py
@@ -3,7 +3,7 @@ import re
 import time

 from fixtures.log_helper import log
-from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder, tenant_get_shards, wait_replica_caughtup
+from fixtures.neon_fixtures import NeonEnv, wait_replica_caughtup


 # Check for corrupted WAL messages which might otherwise go unnoticed if
@@ -102,80 +102,3 @@ def test_2_replicas_start(neon_simple_env: NeonEnv):
            ) as secondary2:
                wait_replica_caughtup(primary, secondary1)
                wait_replica_caughtup(primary, secondary2)
-
-
-# We had an issue that a standby server made GetPage requests with an
-# old LSN, based on the last-written LSN cache, to avoid waits in the
-# pageserver.  However, requesting a page with a very old LSN, such
-# that the GC horizon has already advanced past it, results in an
-# error from the pageserver:
-# "Bad request: tried to request a page version that was garbage collected"
-#
-# To avoid that, the compute<-> pageserver protocol was updated so
-# that that the standby now sends two LSNs, the old last-written LSN
-# and the current replay LSN.
-#
-# https://github.com/neondatabase/neon/issues/6211
-def test_hot_standby_gc(neon_env_builder: NeonEnvBuilder):
-    tenant_conf = {
-        # set PITR interval to be small, so we can do GC
-        "pitr_interval": "0 s",
-    }
-    env = neon_env_builder.init_start(initial_tenant_conf=tenant_conf)
-    timeline_id = env.initial_timeline
-    tenant_id = env.initial_tenant
-
-    with env.endpoints.create_start(
-        branch_name="main",
-        endpoint_id="primary",
-    ) as primary:
-        with env.endpoints.new_replica_start(
-            origin=primary,
-            endpoint_id="secondary",
-            # Protocol version 2 was introduced to fix the issue
-            # that this test exercises. With protocol version 1 it
-            # fails.
-            config_lines=["neon.protocol_version=2"],
-        ) as secondary:
-            p_cur = primary.connect().cursor()
-            p_cur.execute("CREATE EXTENSION neon_test_utils")
-            p_cur.execute("CREATE TABLE test (id int primary key) WITH (autovacuum_enabled=false)")
-            p_cur.execute("INSERT INTO test SELECT generate_series(1, 10000) AS g")
-
-            wait_replica_caughtup(primary, secondary)
-
-            s_cur = secondary.connect().cursor()
-
-            s_cur.execute("SELECT 1 WHERE pg_is_in_recovery()")
-            res = s_cur.fetchone()
-            assert res is not None
-
-            s_cur.execute("SELECT COUNT(*) FROM test")
-            res = s_cur.fetchone()
-            assert res[0] == 10000
-
-            # Clear the cache in the standby, so that when we
-            # re-execute the query, it will make GetPage
-            # requests. This does not clear the last-written LSN cache
-            # so we still remember the LSNs of the pages.
-            s_cur.execute("SELECT clear_buffer_cache()")
-
-            # Do other stuff on the primary, to advance the WAL
-            p_cur.execute("CREATE TABLE test2 AS SELECT generate_series(1, 1000000) AS g")
-
-            # Run GC. The PITR interval is very small, so this advances the GC cutoff LSN
-            # very close to the primary's current insert LSN.
-            shards = tenant_get_shards(env, tenant_id, None)
-            for tenant_shard_id, pageserver in shards:
-                client = pageserver.http_client()
-                client.timeline_checkpoint(tenant_shard_id, timeline_id)
-                client.timeline_compact(tenant_shard_id, timeline_id)
-                client.timeline_gc(tenant_shard_id, timeline_id, 0)
-
-            # Re-execute the query. The GetPage requests that this
-            # generates use old not_modified_since LSNs, older than
-            # the GC cutoff, but new request LSNs. (In protocol
-            # version 1 there was only one LSN, and this failed.)
-            s_cur.execute("SELECT COUNT(*) FROM test")
-            res = s_cur.fetchone()
-            assert res[0] == 10000
--- a/test_runner/regress/test_ondemand_slru_download.py
+++ b/test_runner/regress/test_ondemand_slru_download.py
@@ -1,131 +0,0 @@
-from typing import Optional
-
-import pytest
-from fixtures.log_helper import log
-from fixtures.neon_fixtures import NeonEnvBuilder, tenant_get_shards
-from fixtures.types import Lsn
-from fixtures.utils import query_scalar
-
-
-#
-# Test on-demand download of the pg_xact SLRUs
-#
-@pytest.mark.parametrize("shard_count", [None, 4])
-def test_ondemand_download_pg_xact(neon_env_builder: NeonEnvBuilder, shard_count: Optional[int]):
-    if shard_count is not None:
-        neon_env_builder.num_pageservers = shard_count
-
-    tenant_conf = {
-        "lazy_slru_download": "true",
-        # set PITR interval to be small, so we can do GC
-        "pitr_interval": "0 s",
-    }
-    env = neon_env_builder.init_start(
-        initial_tenant_conf=tenant_conf, initial_tenant_shard_count=shard_count
-    )
-
-    timeline_id = env.initial_timeline
-    tenant_id = env.initial_tenant
-    endpoint = env.endpoints.create_start("main")
-
-    pg_conn = endpoint.connect()
-    cur = pg_conn.cursor()
-
-    cur.execute("CREATE EXTENSION neon_test_utils")
-
-    # Create a test table
-    cur.execute("CREATE TABLE clogtest (id integer)")
-    cur.execute("INSERT INTO clogtest VALUES (1)")
-
-    # Consume a lot of XIDs, to create more pg_xact segments
-    for _ in range(1000):
-        cur.execute("select test_consume_xids(10000);")
-    cur.execute("INSERT INTO clogtest VALUES (2)")
-    for _ in range(1000):
-        cur.execute("select test_consume_xids(10000);")
-    cur.execute("INSERT INTO clogtest VALUES (2)")
-    for _ in range(1000):
-        cur.execute("select test_consume_xids(10000);")
-    cur.execute("INSERT INTO clogtest VALUES (3)")
-
-    # Restart postgres. After restart, the new instance will download the
-    # pg_xact segments lazily.
-    endpoint.stop()
-    endpoint.start()
-    pg_conn = endpoint.connect()
-    cur = pg_conn.cursor()
-
-    # Consume more WAL, so that the pageserver can compact and GC older data,
-    # including the LSN that we started the new endpoint at,
-    cur.execute("CREATE TABLE anothertable (i int, t text)")
-    cur.execute(
-        "INSERT INTO anothertable SELECT g, 'long string to consume some space' || g FROM generate_series(1, 10000) g"
-    )
-
-    # Run GC
-    shards = tenant_get_shards(env, tenant_id, None)
-    for tenant_shard_id, pageserver in shards:
-        client = pageserver.http_client()
-        client.timeline_checkpoint(tenant_shard_id, timeline_id)
-        client.timeline_compact(tenant_shard_id, timeline_id)
-        client.timeline_gc(tenant_shard_id, timeline_id, 0)
-
-    # Test that this can still on-demand download the old pg_xact segments
-    cur.execute("select xmin, xmax, * from clogtest")
-    tup = cur.fetchall()
-    log.info(f"tuples = {tup}")
-
-
-@pytest.mark.parametrize("shard_count", [None, 4])
-def test_ondemand_download_replica(neon_env_builder: NeonEnvBuilder, shard_count: Optional[int]):
-    if shard_count is not None:
-        neon_env_builder.num_pageservers = shard_count
-
-    tenant_conf = {
-        "lazy_slru_download": "true",
-    }
-    env = neon_env_builder.init_start(
-        initial_tenant_conf=tenant_conf, initial_tenant_shard_count=shard_count
-    )
-
-    endpoint = env.endpoints.create_start("main")
-
-    pg_conn = endpoint.connect()
-    cur = pg_conn.cursor()
-
-    cur.execute("CREATE EXTENSION neon_test_utils")
-
-    # Create a test table
-    cur.execute("CREATE TABLE clogtest (id integer)")
-    cur.execute("INSERT INTO clogtest VALUES (1)")
-
-    # Consume a lot of XIDs, to create more pg_xact segments
-    for _ in range(1000):
-        cur.execute("select test_consume_xids(10000);")
-
-    # Open a new connection and insert another row, but leave
-    # the transaction open
-    pg_conn2 = endpoint.connect()
-    cur2 = pg_conn2.cursor()
-    cur2.execute("BEGIN")
-    cur2.execute("INSERT INTO clogtest VALUES (2)")
-
-    # Another insert on the first connection, which is committed.
-    for _ in range(1000):
-        cur.execute("select test_consume_xids(10000);")
-    cur.execute("INSERT INTO clogtest VALUES (3)")
-
-    # Start standby at this point in time
-    lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_insert_lsn()"))
-    endpoint_at_lsn = env.endpoints.create_start(
-        branch_name="main", endpoint_id="ep-at-lsn", lsn=lsn
-    )
-
-    # Commit transaction 2, after the standby was launched.
-    cur2.execute("COMMIT")
-
-    # The replica should not see transaction 2 as committed.
-    conn_replica = endpoint_at_lsn.connect()
-    cur_replica = conn_replica.cursor()
-    cur_replica.execute("SELECT * FROM clogtest")
-    assert cur_replica.fetchall() == [(1,), (3,)]
--- a/test_runner/regress/test_read_validation.py
+++ b/test_runner/regress/test_read_validation.py
@@ -17,14 +17,7 @@ def test_read_validation(neon_simple_env: NeonEnv):
    env = neon_simple_env
    env.neon_cli.create_branch("test_read_validation", "empty")

-    endpoint = env.endpoints.create_start(
-        "test_read_validation",
-        # Use protocol version 2, because the code that constructs the V1 messages
-        # assumes that a primary always wants to read the latest version of a page,
-        # and therefore doesn't work with the test functions below to read an older
-        # page version.
-        config_lines=["neon.protocol_version=2"],
-    )
+    endpoint = env.endpoints.create_start("test_read_validation")

    with closing(endpoint.connect()) as con:
        with con.cursor() as c:
@@ -71,7 +64,7 @@ def test_read_validation(neon_simple_env: NeonEnv):
            log.info("Cache is clear, reading stale page version")

            c.execute(
-                f"select lsn, lower, upper from page_header(get_raw_page_at_lsn('foo', 'main', 0, '{first[0]}', NULL))"
+                f"select lsn, lower, upper from page_header(get_raw_page_at_lsn('foo', 'main', 0, '{first[0]}'))"
            )
            direct_first = c.fetchone()
            assert first == direct_first, "Failed fetch page at historic lsn"
@@ -84,7 +77,7 @@ def test_read_validation(neon_simple_env: NeonEnv):
            log.info("Cache is clear, reading latest page version without cache")

            c.execute(
-                "select lsn, lower, upper from page_header(get_raw_page_at_lsn('foo', 'main', 0, NULL, NULL))"
+                "select lsn, lower, upper from page_header(get_raw_page_at_lsn('foo', 'main', 0, NULL))"
            )
            direct_latest = c.fetchone()
            assert second == direct_latest, "Failed fetch page at latest lsn"
@@ -99,7 +92,7 @@ def test_read_validation(neon_simple_env: NeonEnv):
            )

            c.execute(
-                f"select lsn, lower, upper from page_header(get_raw_page_at_lsn({reln[0]}, {reln[1]}, {reln[2]}, 0, 0, '{first[0]}', NULL))"
+                f"select lsn, lower, upper from page_header(get_raw_page_at_lsn({reln[0]}, {reln[1]}, {reln[2]}, 0, 0, '{first[0]}'))"
            )
            direct_first = c.fetchone()
            assert first == direct_first, "Failed fetch page at historic lsn using oid"
@@ -109,7 +102,7 @@ def test_read_validation(neon_simple_env: NeonEnv):
            )

            c.execute(
-                f"select lsn, lower, upper from page_header(get_raw_page_at_lsn({reln[0]}, {reln[1]}, {reln[2]}, 0, 0, NULL, NULL))"
+                f"select lsn, lower, upper from page_header(get_raw_page_at_lsn({reln[0]}, {reln[1]}, {reln[2]}, 0, 0, NULL))"
            )
            direct_latest = c.fetchone()
            assert second == direct_latest, "Failed fetch page at latest lsn"
@@ -121,7 +114,7 @@ def test_read_validation(neon_simple_env: NeonEnv):
            )

            c.execute(
-                f"select lsn, lower, upper from page_header(get_raw_page_at_lsn({reln[0]}, {reln[1]}, {reln[2]}, 0, 0, '{first[0]}', NULL))"
+                f"select lsn, lower, upper from page_header(get_raw_page_at_lsn({reln[0]}, {reln[1]}, {reln[2]}, 0, 0, '{first[0]}'))"
            )
            direct_first = c.fetchone()
            assert first == direct_first, "Failed fetch page at historic lsn using oid"
@@ -140,14 +133,7 @@ def test_read_validation_neg(neon_simple_env: NeonEnv):

    env.pageserver.allowed_errors.append(".*invalid LSN\\(0\\) in request.*")

-    endpoint = env.endpoints.create_start(
-        "test_read_validation_neg",
-        # Use protocol version 2, because the code that constructs the V1 messages
-        # assumes that a primary always wants to read the latest version of a page,
-        # and therefore doesn't work with the test functions below to read an older
-        # page version.
-        config_lines=["neon.protocol_version=2"],
-    )
+    endpoint = env.endpoints.create_start("test_read_validation_neg")

    with closing(endpoint.connect()) as con:
        with con.cursor() as c:
@@ -157,7 +143,7 @@ def test_read_validation_neg(neon_simple_env: NeonEnv):
            log.info("read a page of a missing relation")
            try:
                c.execute(
-                    "select lsn, lower, upper from page_header(get_raw_page_at_lsn('Unknown', 'main', 0, '0/0', NULL))"
+                    "select lsn, lower, upper from page_header(get_raw_page_at_lsn('Unknown', 'main', 0, '0/0'))"
                )
                raise AssertionError("query should have failed")
            except UndefinedTable as e:
@@ -169,7 +155,7 @@ def test_read_validation_neg(neon_simple_env: NeonEnv):
            log.info("read a page at lsn 0")
            try:
                c.execute(
-                    "select lsn, lower, upper from page_header(get_raw_page_at_lsn('foo', 'main', 0, '0/0', NULL))"
+                    "select lsn, lower, upper from page_header(get_raw_page_at_lsn('foo', 'main', 0, '0/0'))"
                )
                raise AssertionError("query should have failed")
            except IoError as e:
@@ -178,22 +164,22 @@ def test_read_validation_neg(neon_simple_env: NeonEnv):
            log.info("Pass NULL as an input")
            expected = (None, None, None)
            c.execute(
-                "select lsn, lower, upper from page_header(get_raw_page_at_lsn(NULL, 'main', 0, '0/0', NULL))"
+                "select lsn, lower, upper from page_header(get_raw_page_at_lsn(NULL, 'main', 0, '0/0'))"
            )
            assert c.fetchone() == expected, "Expected null output"

            c.execute(
-                "select lsn, lower, upper from page_header(get_raw_page_at_lsn('foo', NULL, 0, '0/0', NULL))"
+                "select lsn, lower, upper from page_header(get_raw_page_at_lsn('foo', NULL, 0, '0/0'))"
            )
            assert c.fetchone() == expected, "Expected null output"

            c.execute(
-                "select lsn, lower, upper from page_header(get_raw_page_at_lsn('foo', 'main', NULL, '0/0', NULL))"
+                "select lsn, lower, upper from page_header(get_raw_page_at_lsn('foo', 'main', NULL, '0/0'))"
            )
            assert c.fetchone() == expected, "Expected null output"

            # This check is currently failing, reading beyond EOF is returning a 0-page
            log.info("Read beyond EOF")
            c.execute(
-                "select lsn, lower, upper from page_header(get_raw_page_at_lsn('foo', 'main', 1, NULL, NULL))"
+                "select lsn, lower, upper from page_header(get_raw_page_at_lsn('foo', 'main', 1, NULL))"
            )
--- a/test_runner/regress/test_s3_scrubber.py
+++ b/test_runner/regress/test_s3_scrubber.py
@@ -1,111 +0,0 @@
-import os
-import shutil
-from typing import Optional
-
-import pytest
-from fixtures.neon_fixtures import (
-    NeonEnvBuilder,
-    S3Scrubber,
-)
-from fixtures.remote_storage import S3Storage, s3_storage
-from fixtures.types import TenantShardId
-from fixtures.workload import Workload
-
-
-@pytest.mark.parametrize("shard_count", [None, 4])
-def test_scrubber_tenant_snapshot(neon_env_builder: NeonEnvBuilder, shard_count: Optional[int]):
-    """
-    Test the `tenant-snapshot` subcommand, which grabs data from remote storage
-
-    This is only a support/debug tool, but worth testing to ensure the tool does not regress.
-    """
-
-    neon_env_builder.enable_pageserver_remote_storage(s3_storage())
-    neon_env_builder.num_pageservers = shard_count if shard_count is not None else 1
-
-    env = neon_env_builder.init_start()
-    tenant_id = env.initial_tenant
-    timeline_id = env.initial_timeline
-    branch = "main"
-
-    # Do some work
-    workload = Workload(env, tenant_id, timeline_id, branch)
-    workload.init()
-
-    # Multiple write/flush passes to generate multiple layers
-    for _n in range(0, 3):
-        workload.write_rows(128)
-
-    # Do some more work after a restart, so that we have multiple generations
-    for pageserver in env.pageservers:
-        pageserver.stop()
-        pageserver.start()
-
-    for _n in range(0, 3):
-        workload.write_rows(128)
-
-    # If we're doing multiple shards, split: this is important to exercise
-    # the scrubber's ability to understand the references from child shards to parent shard's layers
-    if shard_count is not None:
-        tenant_shard_ids = env.storage_controller.tenant_shard_split(
-            tenant_id, shard_count=shard_count
-        )
-
-        # Write after shard split: this will result in shards containing a mixture of owned
-        # and parent layers in their index.
-        workload.write_rows(128)
-    else:
-        tenant_shard_ids = [TenantShardId(tenant_id, 0, 0)]
-
-    output_path = neon_env_builder.test_output_dir / "snapshot"
-    os.makedirs(output_path)
-
-    scrubber = S3Scrubber(neon_env_builder)
-    scrubber.tenant_snapshot(tenant_id, output_path)
-
-    assert len(os.listdir(output_path)) > 0
-
-    workload.stop()
-
-    # Stop pageservers
-    for pageserver in env.pageservers:
-        pageserver.stop()
-
-    # Drop all shards' local storage
-    for tenant_shard_id in tenant_shard_ids:
-        pageserver = env.get_tenant_pageserver(tenant_shard_id)
-        shutil.rmtree(pageserver.timeline_dir(tenant_shard_id, timeline_id))
-
-    # Replace remote storage contents with the snapshot we downloaded
-    assert isinstance(env.pageserver_remote_storage, S3Storage)
-
-    remote_tenant_path = env.pageserver_remote_storage.tenant_path(tenant_id)
-
-    # Delete current remote storage contents
-    bucket = env.pageserver_remote_storage.bucket_name
-    remote_client = env.pageserver_remote_storage.client
-    deleted = 0
-    for object in remote_client.list_objects_v2(Bucket=bucket, Prefix=remote_tenant_path)[
-        "Contents"
-    ]:
-        key = object["Key"]
-        remote_client.delete_object(Key=key, Bucket=bucket)
-        deleted += 1
-    assert deleted > 0
-
-    # Upload from snapshot
-    for root, _dirs, files in os.walk(output_path):
-        for file in files:
-            full_local_path = os.path.join(root, file)
-            full_remote_path = (
-                env.pageserver_remote_storage.tenants_path()
-                + "/"
-                + full_local_path.removeprefix(f"{output_path}/")
-            )
-            remote_client.upload_file(full_local_path, bucket, full_remote_path)
-
-    for pageserver in env.pageservers:
-        pageserver.start()
-
-    # Check we can read everything
-    workload.validate()
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
John Spray	9c097aa75f	Revised fragmentation logic	2024-04-25 18:15:26 +01:00
John Spray	1330a60d27	pull key diff into helper	2024-04-25 18:05:59 +01:00
John Spray	5c1135afcd	tests: add test_sharding_compaction	2024-04-25 18:05:59 +01:00
John Spray	4316f0fab2	Tidy up old size code	2024-04-25 18:05:59 +01:00
John Spray	987bfa23e1	pageserver: use shard-aware partitioning in tiered compaction	2024-04-25 18:04:23 +01:00
John Spray	43ec37adf6	pageserver: shard-aware keyspace partitioning	2024-04-25 18:04:23 +01:00